xref: /aosp_15_r20/external/arm-optimized-routines/networking/test/chksum.c (revision 412f47f9e737e10ed5cc46ec6a8d7fa2264f8a14)
1*412f47f9SXin Li /*
2*412f47f9SXin Li  * Ones' complement checksum test & benchmark
3*412f47f9SXin Li  *
4*412f47f9SXin Li  * Copyright (c) 2016-2020, Arm Limited.
5*412f47f9SXin Li  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*412f47f9SXin Li  */
7*412f47f9SXin Li 
8*412f47f9SXin Li #define _GNU_SOURCE
9*412f47f9SXin Li #include <inttypes.h>
10*412f47f9SXin Li #include <stdbool.h>
11*412f47f9SXin Li #include <stdint.h>
12*412f47f9SXin Li #include <stdio.h>
13*412f47f9SXin Li #include <stdlib.h>
14*412f47f9SXin Li #include <string.h>
15*412f47f9SXin Li #include <sys/mman.h>
16*412f47f9SXin Li #include <time.h>
17*412f47f9SXin Li #include <unistd.h>
18*412f47f9SXin Li #include "../include/networking.h"
19*412f47f9SXin Li 
/* Assert(): a real assert() when built with WANT_ASSERT; otherwise the
   expression is still evaluated and its value discarded.  */
#if WANT_ASSERT
# undef NDEBUG
# include <assert.h>
# define Assert(exp) assert(exp)
#else
# define Assert(exp) (void) (exp)
#endif

/* may_alias expands to the GNU __may_alias__ attribute when the compiler
   defines __GNUC__, and to nothing elsewhere.  */
#ifdef __GNUC__
# define may_alias __attribute__((__may_alias__))
#else
# define may_alias
#endif

/* Granularity (in bytes) the pool size is rounded up to.  */
#define CACHE_LINE 64
/* Round x up to the next multiple of y; y must be a power of two.  */
#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
36*412f47f9SXin Li 
/*
 * Reference implementation - the oracle that verify() compares the selected
 * implementation against.  Do not change the values it computes!
 *
 * Sums the nbytes bytes at ptr as native-endian 16-bit halfwords, adds a
 * trailing odd byte zero-extended, and folds the carries back in until the
 * result fits in 16 bits.  The folded sum is returned as is (not inverted).
 *
 * ptr may have any alignment: halfwords are fetched with memcpy() so the
 * loads stay well defined at odd addresses and for any underlying object
 * type, while yielding the same values as a direct halfword load.
 */
static uint16_t
checksum_simple(const void *ptr, uint32_t nbytes)
{
    const uint8_t *bytes = ptr;
    uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */

    /* Sum all halfwords */
    for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
    {
        uint16_t half;
        memcpy(&half, bytes, sizeof half);
        sum += half;
        bytes += sizeof half;
    }

    /* Add any trailing odd byte */
    if ((nbytes & 0x01) != 0)
    {
        sum += *bytes;
    }

    /* Fold 64-bit sum to 32 bits; the second pass absorbs the carry that
       the first pass may produce */
    sum = (sum & 0xffffffff) + (sum >> 32);
    sum = (sum & 0xffffffff) + (sum >> 32);
    Assert(sum == (uint32_t) sum);

    /* Fold 32-bit sum to 16 bits, again in two passes */
    sum = (sum & 0xffff) + (sum >> 16);
    sum = (sum & 0xffff) + (sum >> 16);
    Assert(sum == (uint16_t) sum);

    return (uint16_t) sum;
}
68*412f47f9SXin Li 
69*412f47f9SXin Li static struct
70*412f47f9SXin Li {
71*412f47f9SXin Li     uint16_t (*cksum_fp)(const void *, uint32_t);
72*412f47f9SXin Li     const char *name;
73*412f47f9SXin Li } implementations[] =
74*412f47f9SXin Li {
75*412f47f9SXin Li     { checksum_simple, "simple"},
76*412f47f9SXin Li     { __chksum, "scalar"},
77*412f47f9SXin Li #if __arm__
78*412f47f9SXin Li     { __chksum_arm_simd, "simd" },
79*412f47f9SXin Li #elif __aarch64__
80*412f47f9SXin Li     { __chksum_aarch64_simd, "simd" },
81*412f47f9SXin Li #endif
82*412f47f9SXin Li     { NULL, NULL}
83*412f47f9SXin Li };
84*412f47f9SXin Li 
85*412f47f9SXin Li static int
find_impl(const char * name)86*412f47f9SXin Li find_impl(const char *name)
87*412f47f9SXin Li {
88*412f47f9SXin Li     for (int i = 0; implementations[i].name != NULL; i++)
89*412f47f9SXin Li     {
90*412f47f9SXin Li 	if (strcmp(implementations[i].name, name) == 0)
91*412f47f9SXin Li 	{
92*412f47f9SXin Li 	    return i;
93*412f47f9SXin Li 	}
94*412f47f9SXin Li     }
95*412f47f9SXin Li     return -1;
96*412f47f9SXin Li }
97*412f47f9SXin Li 
/* Checksum routine under test; main() points it at the selected entry of
   implementations[] before verifying and benchmarking.  */
static uint16_t (*CKSUM_FP)(const void *, uint32_t);
/* benchmark() stores every checksum it computes here; presumably volatile
   so that the compiler cannot discard the benchmarked calls.  */
static volatile uint16_t SINK;
100*412f47f9SXin Li 
/*
 * Compare the implementation under test (CKSUM_FP) with the reference
 * implementation on the size bytes at data.
 *
 * offset is only used in the diagnostic.  Returns true when both checksums
 * agree.  On a mismatch a message is written to stderr; for sizes below
 * 65536 the process then exits with EXIT_FAILURE, for larger sizes false is
 * returned instead.
 */
static bool
verify(const void *data, uint32_t offset, uint32_t size)
{
    const uint16_t want = checksum_simple(data, size);
    const uint16_t got = CKSUM_FP(data, size);

    if (got == want)
    {
        return true;
    }

    fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
            "actual %04x expected %04x (valid)",
            offset, size, got, want);
    if (size < 65536)
    {
        /* Fatal error */
        exit(EXIT_FAILURE);
    }
    /* Else some implementations only support sizes up to 2^16 */
    return false;
}
122*412f47f9SXin Li 
123*412f47f9SXin Li static uint64_t
clock_get_ns(void)124*412f47f9SXin Li clock_get_ns(void)
125*412f47f9SXin Li {
126*412f47f9SXin Li     struct timespec ts;
127*412f47f9SXin Li     clock_gettime(CLOCK_MONOTONIC, &ts);
128*412f47f9SXin Li     return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
129*412f47f9SXin Li }
130*412f47f9SXin Li 
131*412f47f9SXin Li static void
benchmark(const uint8_t * base,size_t poolsize,uint32_t blksize,uint32_t numops,uint64_t cpufreq)132*412f47f9SXin Li benchmark(const uint8_t *base,
133*412f47f9SXin Li 	  size_t poolsize,
134*412f47f9SXin Li 	  uint32_t blksize,
135*412f47f9SXin Li 	  uint32_t numops,
136*412f47f9SXin Li 	  uint64_t cpufreq)
137*412f47f9SXin Li {
138*412f47f9SXin Li     printf("%11u ", (unsigned int) blksize); fflush(stdout);
139*412f47f9SXin Li 
140*412f47f9SXin Li     uint64_t start = clock_get_ns();
141*412f47f9SXin Li     for (uint32_t i = 0; i < numops; i ++)
142*412f47f9SXin Li     {
143*412f47f9SXin Li 	/* Read a random value from the pool */
144*412f47f9SXin Li 	uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
145*412f47f9SXin Li 	/* Generate a random starting address */
146*412f47f9SXin Li 	const void *data = &base[random % (poolsize - blksize)];
147*412f47f9SXin Li 	SINK = CKSUM_FP(data, blksize);
148*412f47f9SXin Li     }
149*412f47f9SXin Li     uint64_t end = clock_get_ns();
150*412f47f9SXin Li 
151*412f47f9SXin Li #define MEGABYTE 1000000 /* Decimal megabyte (MB) */
152*412f47f9SXin Li     uint64_t elapsed_ns = end - start;
153*412f47f9SXin Li     uint64_t elapsed_ms = elapsed_ns / 1000000;
154*412f47f9SXin Li     uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
155*412f47f9SXin Li     uint64_t accbytes = (uint64_t) numops * blksize;
156*412f47f9SXin Li     printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
157*412f47f9SXin Li     unsigned int cyc_per_blk = cpufreq / blks_per_s;
158*412f47f9SXin Li     printf("%11u ", cyc_per_blk);
159*412f47f9SXin Li     if (blksize != 0)
160*412f47f9SXin Li     {
161*412f47f9SXin Li 	unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
162*412f47f9SXin Li 	printf("%7u.%03u ",
163*412f47f9SXin Li 		cyc_per_byte / 1000, cyc_per_byte % 1000);
164*412f47f9SXin Li     }
165*412f47f9SXin Li     printf("\n");
166*412f47f9SXin Li }
167*412f47f9SXin Li 
main(int argc,char * argv[])168*412f47f9SXin Li int main(int argc, char *argv[])
169*412f47f9SXin Li {
170*412f47f9SXin Li     int c;
171*412f47f9SXin Li     bool DUMP = false;
172*412f47f9SXin Li     uint32_t IMPL = 0;/* Simple implementation */
173*412f47f9SXin Li     uint64_t CPUFREQ = 0;
174*412f47f9SXin Li     uint32_t BLKSIZE = 0;
175*412f47f9SXin Li     uint32_t NUMOPS = 1000000;
176*412f47f9SXin Li     uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */
177*412f47f9SXin Li 
178*412f47f9SXin Li     setvbuf(stdout, NULL, _IOLBF, 160);
179*412f47f9SXin Li     while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
180*412f47f9SXin Li     {
181*412f47f9SXin Li 	switch (c)
182*412f47f9SXin Li 	{
183*412f47f9SXin Li 	    case 'b' :
184*412f47f9SXin Li 		{
185*412f47f9SXin Li 		    int blksize = atoi(optarg);
186*412f47f9SXin Li 		    if (blksize < 1 || blksize > POOLSIZE / 2)
187*412f47f9SXin Li 		    {
188*412f47f9SXin Li 			fprintf(stderr, "Invalid block size %d\n", blksize);
189*412f47f9SXin Li 			exit(EXIT_FAILURE);
190*412f47f9SXin Li 		    }
191*412f47f9SXin Li 		    BLKSIZE = (unsigned) blksize;
192*412f47f9SXin Li 		    break;
193*412f47f9SXin Li 		}
194*412f47f9SXin Li 	    case 'd' :
195*412f47f9SXin Li 		DUMP = true;
196*412f47f9SXin Li 		break;
197*412f47f9SXin Li 	    case 'f' :
198*412f47f9SXin Li 		{
199*412f47f9SXin Li 		    int64_t cpufreq = atoll(optarg);
200*412f47f9SXin Li 		    if (cpufreq < 1)
201*412f47f9SXin Li 		    {
202*412f47f9SXin Li 			fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
203*412f47f9SXin Li 				cpufreq);
204*412f47f9SXin Li 			exit(EXIT_FAILURE);
205*412f47f9SXin Li 		    }
206*412f47f9SXin Li 		    CPUFREQ = cpufreq;
207*412f47f9SXin Li 		    break;
208*412f47f9SXin Li 		}
209*412f47f9SXin Li 	    case 'i' :
210*412f47f9SXin Li 		{
211*412f47f9SXin Li 		    int impl = find_impl(optarg);
212*412f47f9SXin Li 		    if (impl < 0)
213*412f47f9SXin Li 		    {
214*412f47f9SXin Li 			fprintf(stderr, "Invalid implementation %s\n", optarg);
215*412f47f9SXin Li 			goto usage;
216*412f47f9SXin Li 		    }
217*412f47f9SXin Li 		    IMPL = (unsigned) impl;
218*412f47f9SXin Li 		    break;
219*412f47f9SXin Li 		}
220*412f47f9SXin Li 	    case 'n' :
221*412f47f9SXin Li 		{
222*412f47f9SXin Li 		    int numops = atoi(optarg);
223*412f47f9SXin Li 		    if (numops < 1)
224*412f47f9SXin Li 		    {
225*412f47f9SXin Li 			fprintf(stderr, "Invalid number of operations %d\n", numops);
226*412f47f9SXin Li 			exit(EXIT_FAILURE);
227*412f47f9SXin Li 		    }
228*412f47f9SXin Li 		    NUMOPS = (unsigned) numops;
229*412f47f9SXin Li 		    break;
230*412f47f9SXin Li 		}
231*412f47f9SXin Li 	    case 'p' :
232*412f47f9SXin Li 		{
233*412f47f9SXin Li 		    int poolsize = atoi(optarg);
234*412f47f9SXin Li 		    if (poolsize < 4096)
235*412f47f9SXin Li 		    {
236*412f47f9SXin Li 			fprintf(stderr, "Invalid pool size %d\n", poolsize);
237*412f47f9SXin Li 			exit(EXIT_FAILURE);
238*412f47f9SXin Li 		    }
239*412f47f9SXin Li 		    char c = optarg[strlen(optarg) - 1];
240*412f47f9SXin Li 		    if (c == 'M')
241*412f47f9SXin Li 		    {
242*412f47f9SXin Li 			POOLSIZE = (unsigned) poolsize * 1024 * 1024;
243*412f47f9SXin Li 		    }
244*412f47f9SXin Li 		    else if (c == 'K')
245*412f47f9SXin Li 		    {
246*412f47f9SXin Li 			POOLSIZE = (unsigned) poolsize * 1024;
247*412f47f9SXin Li 		    }
248*412f47f9SXin Li 		    else
249*412f47f9SXin Li 		    {
250*412f47f9SXin Li 			POOLSIZE = (unsigned) poolsize;
251*412f47f9SXin Li 		    }
252*412f47f9SXin Li 		    break;
253*412f47f9SXin Li 		}
254*412f47f9SXin Li 	    default :
255*412f47f9SXin Li usage :
256*412f47f9SXin Li 		fprintf(stderr, "Usage: checksum <options>\n"
257*412f47f9SXin Li 			"-b <blksize>    Block size\n"
258*412f47f9SXin Li 			"-d              Dump first 96 bytes of data\n"
259*412f47f9SXin Li 			"-f <cpufreq>    CPU frequency (Hz)\n"
260*412f47f9SXin Li 			"-i <impl>       Implementation\n"
261*412f47f9SXin Li 			"-n <numops>     Number of operations\n"
262*412f47f9SXin Li 			"-p <poolsize>   Pool size (K or M suffix)\n"
263*412f47f9SXin Li 		       );
264*412f47f9SXin Li 		printf("Implementations:");
265*412f47f9SXin Li 		for (int i = 0; implementations[i].name != NULL; i++)
266*412f47f9SXin Li 		{
267*412f47f9SXin Li 		    printf(" %s", implementations[i].name);
268*412f47f9SXin Li 		}
269*412f47f9SXin Li 		printf("\n");
270*412f47f9SXin Li 		exit(EXIT_FAILURE);
271*412f47f9SXin Li 	}
272*412f47f9SXin Li     }
273*412f47f9SXin Li     if (optind > argc)
274*412f47f9SXin Li     {
275*412f47f9SXin Li 	goto usage;
276*412f47f9SXin Li     }
277*412f47f9SXin Li 
278*412f47f9SXin Li     CKSUM_FP = implementations[IMPL].cksum_fp;
279*412f47f9SXin Li     POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
280*412f47f9SXin Li     uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
281*412f47f9SXin Li 			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
282*412f47f9SXin Li     if (base == MAP_FAILED)
283*412f47f9SXin Li     {
284*412f47f9SXin Li 	perror("aligned_alloc"), exit(EXIT_FAILURE);
285*412f47f9SXin Li     }
286*412f47f9SXin Li     for (size_t i = 0; i < POOLSIZE / 4; i++)
287*412f47f9SXin Li     {
288*412f47f9SXin Li 	((uint32_t *) base)[i] = rand();
289*412f47f9SXin Li     }
290*412f47f9SXin Li 
291*412f47f9SXin Li     printf("Implementation: %s\n", implementations[IMPL].name);
292*412f47f9SXin Li     printf("numops %u, poolsize ", NUMOPS);
293*412f47f9SXin Li     if (POOLSIZE % (1024 * 1024) == 0)
294*412f47f9SXin Li     {
295*412f47f9SXin Li 	printf("%uMiB", POOLSIZE / (1024 * 1024));
296*412f47f9SXin Li     }
297*412f47f9SXin Li     else if (POOLSIZE % 1024 == 0)
298*412f47f9SXin Li     {
299*412f47f9SXin Li 	printf("%uKiB", POOLSIZE / 1024);
300*412f47f9SXin Li     }
301*412f47f9SXin Li     else
302*412f47f9SXin Li     {
303*412f47f9SXin Li 	printf("%uB", POOLSIZE);
304*412f47f9SXin Li     }
305*412f47f9SXin Li     printf(", blocksize %u, CPU frequency %juMHz\n",
306*412f47f9SXin Li 	   BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
307*412f47f9SXin Li #if WANT_ASSERT
308*412f47f9SXin Li     printf("Warning: assertions are enabled\n");
309*412f47f9SXin Li #endif
310*412f47f9SXin Li 
311*412f47f9SXin Li     if (DUMP)
312*412f47f9SXin Li     {
313*412f47f9SXin Li 	/* Print out first 96 bytes of data for human debugging */
314*412f47f9SXin Li 	for (int i = 0; i < 96; i++)
315*412f47f9SXin Li 	{
316*412f47f9SXin Li 	    if (i % 8 == 0)
317*412f47f9SXin Li 		printf("%2u:", i);
318*412f47f9SXin Li 	    printf(" %02x", base[i]);
319*412f47f9SXin Li 	    if (i % 8 == 7)
320*412f47f9SXin Li 		printf("\n");
321*412f47f9SXin Li 	}
322*412f47f9SXin Li     }
323*412f47f9SXin Li 
324*412f47f9SXin Li     /* Verify that chosen algorithm handles all combinations of offsets and sizes */
325*412f47f9SXin Li     printf("Verifying..."); fflush(stdout);
326*412f47f9SXin Li     bool success = true;
327*412f47f9SXin Li     /* Check all (relevant) combinations of size and offset */
328*412f47f9SXin Li     for (int size = 0; size <= 256; size++)
329*412f47f9SXin Li     {
330*412f47f9SXin Li 	for (int offset = 0; offset < 255; offset++)
331*412f47f9SXin Li 	{
332*412f47f9SXin Li 	    /* Check at start of mapped memory */
333*412f47f9SXin Li 	    success &= verify(&base[offset], offset, size);
334*412f47f9SXin Li 	    /* Check at end of mapped memory */
335*412f47f9SXin Li 	    uint8_t *p = base + POOLSIZE - (size + offset);
336*412f47f9SXin Li 	    success &= verify(p, (uintptr_t) p % 64, size);
337*412f47f9SXin Li 	}
338*412f47f9SXin Li     }
339*412f47f9SXin Li     /* Check increasingly larger sizes */
340*412f47f9SXin Li     for (size_t size = 1; size < POOLSIZE; size *= 2)
341*412f47f9SXin Li     {
342*412f47f9SXin Li 	success &= verify(base, 0, size);
343*412f47f9SXin Li     }
344*412f47f9SXin Li     /* Check the full size, this can detect accumulator overflows */
345*412f47f9SXin Li     success &= verify(base, 0, POOLSIZE);
346*412f47f9SXin Li     printf("%s\n", success ? "OK" : "failure");
347*412f47f9SXin Li 
348*412f47f9SXin Li     /* Print throughput in decimal megabyte (1000000B) per second */
349*412f47f9SXin Li     if (CPUFREQ != 0)
350*412f47f9SXin Li     {
351*412f47f9SXin Li 	printf("%11s %11s %11s %11s\n",
352*412f47f9SXin Li 	       "block size", "MB/s", "cycles/blk", "cycles/byte");
353*412f47f9SXin Li     }
354*412f47f9SXin Li     else
355*412f47f9SXin Li     {
356*412f47f9SXin Li 	printf("%11s %11s %11s %11s\n",
357*412f47f9SXin Li 	       "block size", "MB/s", "ns/blk", "ns/byte");
358*412f47f9SXin Li 	CPUFREQ = 1000000000;
359*412f47f9SXin Li     }
360*412f47f9SXin Li     if (BLKSIZE != 0)
361*412f47f9SXin Li     {
362*412f47f9SXin Li 	benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
363*412f47f9SXin Li     }
364*412f47f9SXin Li     else
365*412f47f9SXin Li     {
366*412f47f9SXin Li 	static const uint16_t sizes[] =
367*412f47f9SXin Li 	    { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
368*412f47f9SXin Li 	for (int i = 0; sizes[i] != 0; i++)
369*412f47f9SXin Li 	{
370*412f47f9SXin Li 	    uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
371*412f47f9SXin Li 	    benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
372*412f47f9SXin Li 	}
373*412f47f9SXin Li     }
374*412f47f9SXin Li 
375*412f47f9SXin Li     if (munmap(base, POOLSIZE) != 0)
376*412f47f9SXin Li     {
377*412f47f9SXin Li 	perror("munmap"), exit(EXIT_FAILURE);
378*412f47f9SXin Li     }
379*412f47f9SXin Li 
380*412f47f9SXin Li     return success ? EXIT_SUCCESS : EXIT_FAILURE;
381*412f47f9SXin Li }
382