1 /*
2 * memcpy benchmark.
3 *
4 * Copyright (c) 2020-2023, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8 #define _GNU_SOURCE
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <string.h>
12 #include <assert.h>
13 #include "stringlib.h"
14 #include "benchlib.h"
15
/* Repeat counts: ITERS for the randomized copy set, ITERS2 for the
   8..512 byte fixed-size runs, ITERS3 for the 1K..64K fixed-size runs.  */
#define ITERS 5000
#define ITERS2 (20 * 1000 * 1000)
#define ITERS3 (200 * 1000)

/* Number of randomized copies generated for each working-set size.  */
#define NUM_TESTS (16 * 1024)

/* The randomized benchmark doubles its working set from MIN_SIZE up to
   MAX_SIZE (both in bytes).  */
#define MIN_SIZE (32 * 1024)
#define MAX_SIZE (1024 * 1024)

/* Buffers used by every benchmark; the randomized and fixed-size runs read
   from a and write to b, the overlapping runs use a for both sides.
   Random offsets are masked to stay below the working-set size, so the
   padding past MAX_SIZE presumably leaves room for the longest copy
   starting at the largest offset.  */
static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__ ((aligned (64)));
static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__ ((aligned (64)));
25
/* Builds one table entry from a function symbol: its spelling becomes the
   printed name and its address the function under test.  */
#define F(x) { .name = #x, .fun = x },

/* Implementations to benchmark.  Which entries exist depends on the target
   architecture and feature macros; the C library memcpy is always present.
   The list ends with an entry whose name is null.  */
static const struct fun
{
  const char *name;
  void *(*fun) (void *, const void *, size_t);
} funtab[] =
{
#if __aarch64__
  F (__memcpy_aarch64)
# if __ARM_NEON
  F (__memcpy_aarch64_simd)
# endif
# if __ARM_FEATURE_SVE
  F (__memcpy_aarch64_sve)
# endif
# if WANT_MOPS
  F (__memcpy_aarch64_mops)
# endif
#elif __arm__
  F (__memcpy_arm)
#endif
  F (memcpy)
#undef F
  { NULL, NULL }
};
52
/* One row of the size distribution: a copy length in bytes and how often
   it occurs.  */
typedef struct
{
  uint16_t size;
  uint16_t freq;
} freq_data_t;

/* One row of an alignment distribution: an alignment in bytes and how
   often it occurs.  */
typedef struct
{
  uint8_t align;
  uint16_t freq;
} align_data_t;

/* Flattened size distribution: indexing with a random value masked by
   SIZE_MASK yields a copy length drawn according to the frequency table.
   SIZE_NUM must stay a power of two for the mask to work.  */
#define SIZE_NUM 65536
#define SIZE_MASK (SIZE_NUM-1)

/* Elements are 16 bits wide because the frequency table contains lengths
   up to 4095; an 8-bit element would silently store them modulo 256.  */
static uint16_t size_arr[SIZE_NUM];
59
60 /* Frequency data for memcpy of less than 4096 bytes based on SPEC2017. */
61 static freq_data_t size_freq[] =
62 {
63 {32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035},
64 { 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721},
65 {120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460},
66 { 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303},
67 { 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185},
68 {192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96},
69 {104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68},
70 { 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47},
71 { 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35},
72 { 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22},
73 { 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15},
74 { 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11},
75 { 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9},
76 {136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8},
77 {273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6},
78 {504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3},
79 {512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2},
80 { 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2},
81 { 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1},
82 {248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1},
83 { 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1},
84 { 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1},
85 { 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1},
86 {2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1},
87 { 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1},
88 {122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1},
89 {984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1},
90 {116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1},
91 {100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1},
92 {488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1},
93 { 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0}
94 };
95
96 #define ALIGN_NUM 1024
97 #define ALIGN_MASK (ALIGN_NUM-1)
98 static uint8_t src_align_arr[ALIGN_NUM];
99 static uint8_t dst_align_arr[ALIGN_NUM];
100
101 /* Source alignment frequency for memcpy based on SPEC2017. */
102 static align_data_t src_align_freq[] =
103 {
104 {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0}
105 };
106
107 static align_data_t dst_align_freq[] =
108 {
109 {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0}
110 };
111
112 typedef struct
113 {
114 uint64_t src : 24;
115 uint64_t dst : 24;
116 uint64_t len : 16;
117 } copy_t;
118
119 static copy_t test_arr[NUM_TESTS];
120
121 typedef char *(*proto_t) (char *, const char *, size_t);
122
123 static void
init_copy_distribution(void)124 init_copy_distribution (void)
125 {
126 int i, j, freq, size, n;
127
128 for (n = i = 0; (freq = size_freq[i].freq) != 0; i++)
129 for (j = 0, size = size_freq[i].size; j < freq; j++)
130 size_arr[n++] = size;
131 assert (n == SIZE_NUM);
132
133 for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++)
134 for (j = 0, size = src_align_freq[i].align; j < freq; j++)
135 src_align_arr[n++] = size - 1;
136 assert (n == ALIGN_NUM);
137
138 for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++)
139 for (j = 0, size = dst_align_freq[i].align; j < freq; j++)
140 dst_align_arr[n++] = size - 1;
141 assert (n == ALIGN_NUM);
142 }
143
144 static size_t
init_copies(size_t max_size)145 init_copies (size_t max_size)
146 {
147 size_t total = 0;
148 /* Create a random set of copies with the given size and alignment
149 distributions. */
150 for (int i = 0; i < NUM_TESTS; i++)
151 {
152 test_arr[i].dst = (rand32 (0) & (max_size - 1));
153 test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
154 test_arr[i].src = (rand32 (0) & (max_size - 1));
155 test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
156 test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK];
157 total += test_arr[i].len;
158 }
159
160 return total;
161 }
162
main(void)163 int main (void)
164 {
165 init_copy_distribution ();
166
167 memset (a, 1, sizeof (a));
168 memset (b, 2, sizeof (b));
169
170 printf("Random memcpy (bytes/ns):\n");
171 for (int f = 0; funtab[f].name != 0; f++)
172 {
173 size_t total = 0;
174 uint64_t tsum = 0;
175 printf ("%22s ", funtab[f].name);
176 rand32 (0x12345678);
177
178 for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
179 {
180 size_t copy_size = init_copies (size) * ITERS;
181
182 for (int c = 0; c < NUM_TESTS; c++)
183 funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
184 test_arr[c].len);
185
186 uint64_t t = clock_get_ns ();
187 for (int i = 0; i < ITERS; i++)
188 for (int c = 0; c < NUM_TESTS; c++)
189 funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src,
190 test_arr[c].len);
191 t = clock_get_ns () - t;
192 total += copy_size;
193 tsum += t;
194 printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
195 }
196 printf( "avg %.2f\n", (double)total / tsum);
197 }
198
199 size_t total = 0;
200 uint64_t tsum = 0;
201 printf ("%22s ", "memcpy_call");
202 rand32 (0x12345678);
203
204 for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
205 {
206 size_t copy_size = init_copies (size) * ITERS;
207
208 for (int c = 0; c < NUM_TESTS; c++)
209 memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
210
211 uint64_t t = clock_get_ns ();
212 for (int i = 0; i < ITERS; i++)
213 for (int c = 0; c < NUM_TESTS; c++)
214 memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
215 t = clock_get_ns () - t;
216 total += copy_size;
217 tsum += t;
218 printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
219 }
220 printf( "avg %.2f\n", (double)total / tsum);
221
222
223 printf ("\nAligned medium memcpy (bytes/ns):\n");
224 for (int f = 0; funtab[f].name != 0; f++)
225 {
226 printf ("%22s ", funtab[f].name);
227
228 for (int size = 8; size <= 512; size *= 2)
229 {
230 uint64_t t = clock_get_ns ();
231 for (int i = 0; i < ITERS2; i++)
232 funtab[f].fun (b, a, size);
233 t = clock_get_ns () - t;
234 printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
235 }
236 printf ("\n");
237 }
238
239 printf ("%22s ", "memcpy_call");
240 for (int size = 8; size <= 512; size *= 2)
241 {
242 uint64_t t = clock_get_ns ();
243 for (int i = 0; i < ITERS2; i++)
244 memcpy (b, a, size);
245 t = clock_get_ns () - t;
246 printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
247 }
248 printf ("\n");
249
250
251 printf ("\nUnaligned medium memcpy (bytes/ns):\n");
252 for (int f = 0; funtab[f].name != 0; f++)
253 {
254 printf ("%22s ", funtab[f].name);
255
256 for (int size = 8; size <= 512; size *= 2)
257 {
258 uint64_t t = clock_get_ns ();
259 for (int i = 0; i < ITERS2; i++)
260 funtab[f].fun (b + 3, a + 1, size);
261 t = clock_get_ns () - t;
262 printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
263 }
264 printf ("\n");
265 }
266
267 printf ("%22s ", "memcpy_call");
268 for (int size = 8; size <= 512; size *= 2)
269 {
270 uint64_t t = clock_get_ns ();
271 for (int i = 0; i < ITERS2; i++)
272 memcpy (b + 3, a + 1, size);
273 t = clock_get_ns () - t;
274 printf ("%dB: %.2f ", size, (double)size * ITERS2 / t);
275 }
276 printf ("\n");
277
278
279 printf ("\nLarge memcpy (bytes/ns):\n");
280 for (int f = 0; funtab[f].name != 0; f++)
281 {
282 printf ("%22s ", funtab[f].name);
283
284 for (int size = 1024; size <= 65536; size *= 2)
285 {
286 uint64_t t = clock_get_ns ();
287 for (int i = 0; i < ITERS3; i++)
288 funtab[f].fun (b, a, size);
289 t = clock_get_ns () - t;
290 printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
291 }
292 printf ("\n");
293 }
294
295 printf ("%22s ", "memcpy_call");
296 for (int size = 1024; size <= 65536; size *= 2)
297 {
298 uint64_t t = clock_get_ns ();
299 for (int i = 0; i < ITERS3; i++)
300 memcpy (b, a, size);
301 t = clock_get_ns () - t;
302 printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
303 }
304 printf ("\n");
305
306
307 printf ("\nUnaligned forwards memmove (bytes/ns):\n");
308 for (int f = 0; funtab[f].name != 0; f++)
309 {
310 printf ("%22s ", funtab[f].name);
311
312 for (int size = 1024; size <= 65536; size *= 2)
313 {
314 uint64_t t = clock_get_ns ();
315 for (int i = 0; i < ITERS3; i++)
316 funtab[f].fun (a, a + 256 + (i & 31), size);
317 t = clock_get_ns () - t;
318 printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
319 }
320 printf ("\n");
321 }
322
323
324 printf ("\nUnaligned backwards memmove (bytes/ns):\n");
325 for (int f = 0; funtab[f].name != 0; f++)
326 {
327 printf ("%22s ", funtab[f].name);
328
329 for (int size = 1024; size <= 65536; size *= 2)
330 {
331 uint64_t t = clock_get_ns ();
332 for (int i = 0; i < ITERS3; i++)
333 funtab[f].fun (a + 256 + (i & 31), a, size);
334 t = clock_get_ns () - t;
335 printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t);
336 }
337 printf ("\n");
338 }
339 printf ("\n");
340
341 return 0;
342 }
343