1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <inttypes.h>
5 #include <sys/types.h>
6 #include <sys/stat.h>
7 #include <sys/time.h>
8 #include <fcntl.h>
9 #include <unistd.h>
10
11 int utf8_naive(const unsigned char *data, int len);
12 int utf8_lookup(const unsigned char *data, int len);
13 int utf8_boost(const unsigned char *data, int len);
14 int utf8_lemire(const unsigned char *data, int len);
15 int utf8_range(const unsigned char *data, int len);
16 int utf8_range2(const unsigned char *data, int len);
17 #ifdef __AVX2__
18 int utf8_lemire_avx2(const unsigned char *data, int len);
19 int utf8_range_avx2(const unsigned char *data, int len);
20 #endif
21
22 static struct ftab {
23 const char *name;
24 int (*func)(const unsigned char *data, int len);
25 } ftab[] = {
26 {
27 .name = "naive",
28 .func = utf8_naive,
29 },
30 {
31 .name = "lookup",
32 .func = utf8_lookup,
33 },
34 {
35 .name = "lemire",
36 .func = utf8_lemire,
37 },
38 {
39 .name = "range",
40 .func = utf8_range,
41 },
42 {
43 .name = "range2",
44 .func = utf8_range2,
45 },
46 #ifdef __AVX2__
47 {
48 .name = "lemire_avx2",
49 .func = utf8_lemire_avx2,
50 },
51 {
52 .name = "range_avx2",
53 .func = utf8_range_avx2,
54 },
55 #endif
56 #ifdef BOOST
57 {
58 .name = "boost",
59 .func = utf8_boost,
60 },
61 #endif
62 };
63
load_test_buf(int len)64 static unsigned char *load_test_buf(int len)
65 {
66 const char utf8[] = "\xF0\x90\xBF\x80";
67 const int utf8_len = sizeof(utf8)/sizeof(utf8[0]) - 1;
68
69 unsigned char *data = malloc(len);
70 unsigned char *p = data;
71
72 while (len >= utf8_len) {
73 memcpy(p, utf8, utf8_len);
74 p += utf8_len;
75 len -= utf8_len;
76 }
77
78 while (len--)
79 *p++ = 0x7F;
80
81 return data;
82 }
83
load_test_file(int * len)84 static unsigned char *load_test_file(int *len)
85 {
86 unsigned char *data;
87 int fd;
88 struct stat stat;
89
90 fd = open("./UTF-8-demo.txt", O_RDONLY);
91 if (fd == -1) {
92 printf("Failed to open UTF-8-demo.txt!\n");
93 exit(1);
94 }
95 if (fstat(fd, &stat) == -1) {
96 printf("Failed to get file size!\n");
97 exit(1);
98 }
99
100 *len = stat.st_size;
101 data = malloc(*len);
102 if (read(fd, data, *len) != *len) {
103 printf("Failed to read file!\n");
104 exit(1);
105 }
106
107 utf8_range(data, *len);
108 #ifdef __AVX2__
109 utf8_range_avx2(data, *len);
110 #endif
111 close(fd);
112
113 return data;
114 }
115
print_test(const unsigned char * data,int len)116 static void print_test(const unsigned char *data, int len)
117 {
118 while (len--)
119 printf("\\x%02X", *data++);
120
121 printf("\n");
122 }
123
124 struct test {
125 const unsigned char *data;
126 int len;
127 };
128
prepare_test_buf(unsigned char * buf,const struct test * pos,int pos_len,int pos_idx)129 static void prepare_test_buf(unsigned char *buf, const struct test *pos,
130 int pos_len, int pos_idx)
131 {
132 /* Round concatenate correct tokens to 1024 bytes */
133 int buf_idx = 0;
134 while (buf_idx < 1024) {
135 int buf_len = 1024 - buf_idx;
136
137 if (buf_len >= pos[pos_idx].len) {
138 memcpy(buf+buf_idx, pos[pos_idx].data, pos[pos_idx].len);
139 buf_idx += pos[pos_idx].len;
140 } else {
141 memset(buf+buf_idx, 0, buf_len);
142 buf_idx += buf_len;
143 }
144
145 if (++pos_idx == pos_len)
146 pos_idx = 0;
147 }
148 }
149
150 /* Return 0 on success, -1 on error */
test_manual(const struct ftab * ftab)151 static int test_manual(const struct ftab *ftab)
152 {
153 #pragma GCC diagnostic push
154 #pragma GCC diagnostic ignored "-Wpointer-sign"
155 /* positive tests */
156 static const struct test pos[] = {
157 {"", 0},
158 {"\x00", 1},
159 {"\x66", 1},
160 {"\x7F", 1},
161 {"\x00\x7F", 2},
162 {"\x7F\x00", 2},
163 {"\xC2\x80", 2},
164 {"\xDF\xBF", 2},
165 {"\xE0\xA0\x80", 3},
166 {"\xE0\xA0\xBF", 3},
167 {"\xED\x9F\x80", 3},
168 {"\xEF\x80\xBF", 3},
169 {"\xF0\x90\xBF\x80", 4},
170 {"\xF2\x81\xBE\x99", 4},
171 {"\xF4\x8F\x88\xAA", 4},
172 };
173
174 /* negative tests */
175 static const struct test neg[] = {
176 {"\x80", 1},
177 {"\xBF", 1},
178 {"\xC0\x80", 2},
179 {"\xC1\x00", 2},
180 {"\xC2\x7F", 2},
181 {"\xDF\xC0", 2},
182 {"\xE0\x9F\x80", 3},
183 {"\xE0\xC2\x80", 3},
184 {"\xED\xA0\x80", 3},
185 {"\xED\x7F\x80", 3},
186 {"\xEF\x80\x00", 3},
187 {"\xF0\x8F\x80\x80", 4},
188 {"\xF0\xEE\x80\x80", 4},
189 {"\xF2\x90\x91\x7F", 4},
190 {"\xF4\x90\x88\xAA", 4},
191 {"\xF4\x00\xBF\xBF", 4},
192 {"\x00\x00\x00\x00\x00\xC2\x80\x00\x00\x00\xE1\x80\x80\x00\x00\xC2" \
193 "\xC2\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
194 32},
195 {"\x00\x00\x00\x00\x00\xC2\xC2\x80\x00\x00\xE1\x80\x80\x00\x00\x00",
196 16},
197 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
198 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80",
199 32},
200 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
201 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1",
202 32},
203 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
204 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \
205 "\x80", 33},
206 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
207 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \
208 "\xC2\x80", 34},
209 {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
210 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF0" \
211 "\x80\x80\x80", 35},
212 };
213 #pragma GCC diagnostic push
214
215 /* Test single token */
216 for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) {
217 if (ftab->func(pos[i].data, pos[i].len) != 0) {
218 printf("FAILED positive test: ");
219 print_test(pos[i].data, pos[i].len);
220 return -1;
221 }
222 }
223 for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) {
224 if (ftab->func(neg[i].data, neg[i].len) == 0) {
225 printf("FAILED negitive test: ");
226 print_test(neg[i].data, neg[i].len);
227 return -1;
228 }
229 }
230
231 /* Test shifted buffer to cover 1k length */
232 /* buffer size must be greater than 1024 + 16 + max(test string length) */
233 const int max_size = 1024*2;
234 uint64_t buf64[max_size/8 + 2];
235 /* Offset 8 bytes by 1 byte */
236 unsigned char *buf = ((unsigned char *)buf64) + 1;
237 int buf_len;
238
239 for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) {
240 /* Positive test: shift 16 bytes, validate each shift */
241 prepare_test_buf(buf, pos, sizeof(pos)/sizeof(pos[0]), i);
242 buf_len = 1024;
243 for (int j = 0; j < 16; ++j) {
244 if (ftab->func(buf, buf_len) != 0) {
245 printf("FAILED positive test: ");
246 print_test(buf, buf_len);
247 return -1;
248 }
249 for (int k = buf_len; k >= 1; --k)
250 buf[k] = buf[k-1];
251 buf[0] = '\x55';
252 ++buf_len;
253 }
254
255 /* Negative test: trunk last non ascii */
256 while (buf_len >= 1 && buf[buf_len-1] <= 0x7F)
257 --buf_len;
258 if (buf_len && ftab->func(buf, buf_len-1) == 0) {
259 printf("FAILED negitive test: ");
260 print_test(buf, buf_len);
261 return -1;
262 }
263 }
264
265 /* Negative test */
266 for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) {
267 /* Append one error token, shift 16 bytes, validate each shift */
268 int pos_idx = i % (sizeof(pos)/sizeof(pos[0]));
269 prepare_test_buf(buf, pos, sizeof(pos)/sizeof(pos[0]), pos_idx);
270 memcpy(buf+1024, neg[i].data, neg[i].len);
271 buf_len = 1024 + neg[i].len;
272 for (int j = 0; j < 16; ++j) {
273 if (ftab->func(buf, buf_len) == 0) {
274 printf("FAILED negative test: ");
275 print_test(buf, buf_len);
276 return -1;
277 }
278 for (int k = buf_len; k >= 1; --k)
279 buf[k] = buf[k-1];
280 buf[0] = '\x66';
281 ++buf_len;
282 }
283 }
284
285 return 0;
286 }
287
test(const unsigned char * data,int len,const struct ftab * ftab)288 static int test(const unsigned char *data, int len, const struct ftab *ftab)
289 {
290 int ret_standard = ftab->func(data, len);
291 int ret_manual = test_manual(ftab);
292 printf("%s\n", ftab->name);
293 printf("standard test: %s\n", ret_standard ? "FAIL" : "pass");
294 printf("manual test: %s\n", ret_manual ? "FAIL" : "pass");
295
296 return ret_standard | ret_manual;
297 }
298
bench(const unsigned char * data,int len,const struct ftab * ftab)299 static int bench(const unsigned char *data, int len, const struct ftab *ftab)
300 {
301 const int loops = 1024*1024*1024/len;
302 int ret = 0;
303 double time, size;
304 struct timeval tv1, tv2;
305
306 fprintf(stderr, "bench %s... ", ftab->name);
307 gettimeofday(&tv1, 0);
308 for (int i = 0; i < loops; ++i)
309 ret |= ftab->func(data, len);
310 gettimeofday(&tv2, 0);
311 printf("%s\n", ret?"FAIL":"pass");
312
313 time = tv2.tv_usec - tv1.tv_usec;
314 time = time / 1000000 + tv2.tv_sec - tv1.tv_sec;
315 size = ((double)len * loops) / (1024*1024);
316 printf("time: %.4f s\n", time);
317 printf("data: %.0f MB\n", size);
318 printf("BW: %.2f MB/s\n", size / time);
319
320 return 0;
321 }
322
usage(const char * bin)323 static void usage(const char *bin)
324 {
325 printf("Usage:\n");
326 printf("%s test [alg] ==> test all or one algorithm\n", bin);
327 printf("%s bench [alg] ==> benchmark all or one algorithm\n", bin);
328 printf("%s bench size NUM ==> benchmark with specific buffer size\n", bin);
329 printf("alg = ");
330 for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i)
331 printf("%s ", ftab[i].name);
332 printf("\nNUM = buffer size in bytes, 1 ~ 67108864(64M)\n");
333 }
334
main(int argc,char * argv[])335 int main(int argc, char *argv[])
336 {
337 int len = 0;
338 unsigned char *data;
339 const char *alg = NULL;
340 int (*tb)(const unsigned char *data, int len, const struct ftab *ftab);
341
342 tb = NULL;
343 if (argc >= 2) {
344 if (strcmp(argv[1], "test") == 0)
345 tb = test;
346 else if (strcmp(argv[1], "bench") == 0)
347 tb = bench;
348 if (argc >= 3) {
349 alg = argv[2];
350 if (strcmp(alg, "size") == 0) {
351 if (argc < 4) {
352 tb = NULL;
353 } else {
354 alg = NULL;
355 len = atoi(argv[3]);
356 if (len <= 0 || len > 67108864) {
357 printf("Buffer size error!\n\n");
358 tb = NULL;
359 }
360 }
361 }
362 }
363 }
364
365 if (tb == NULL) {
366 usage(argv[0]);
367 return 1;
368 }
369
370 /* Load UTF8 test buffer */
371 if (len)
372 data = load_test_buf(len);
373 else
374 data = load_test_file(&len);
375
376 int ret = 0;
377 if (tb == bench)
378 printf("=============== Bench UTF8 (%d bytes) ===============\n", len);
379 for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) {
380 if (alg && strcmp(alg, ftab[i].name) != 0)
381 continue;
382 ret |= tb((const unsigned char *)data, len, &ftab[i]);
383 printf("\n");
384 }
385
386 #if 0
387 if (tb == bench) {
388 printf("==================== Bench ASCII ====================\n");
389 /* Change test buffer to ascii */
390 for (int i = 0; i < len; i++)
391 data[i] &= 0x7F;
392
393 for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) {
394 if (alg && strcmp(alg, ftab[i].name) != 0)
395 continue;
396 tb((const unsigned char *)data, len, &ftab[i]);
397 printf("\n");
398 }
399 }
400 #endif
401
402 free(data);
403
404 return ret;
405 }
406