1 #include <stdio.h>
2
/* http://bjoern.hoehrmann.de/utf-8/decoder/dfa */
/* Optimized version based on Rich Felker's variant. */
/* DFA states are stored pre-multiplied by the number of character classes
 * (12), so a state value is directly the row offset into utf8s[]. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

static const unsigned char utf8d[] = {
    /* Maps each of the 256 byte values to one of 12 character classes.
     * Working with classes instead of raw bytes keeps the transition
     * table below small (12 columns per state instead of 256). */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
};
/* Note: Splitting the table improves performance on ARM due to its simpler
 * addressing modes not being able to encode x[y + 256]. */
static const unsigned char utf8s[] = {
    /* Transition table: utf8s[state + class] is the next state.
     * One row of 12 entries per state; the row's starting offset is the
     * state value itself. */
     0,12,24,36,60,96,84,12,12,12,48,72,   /* state  0 (UTF8_ACCEPT) */
    12,12,12,12,12,12,12,12,12,12,12,12,   /* state 12 (UTF8_REJECT) */
    12, 0,12,12,12,12,12, 0,12, 0,12,12,   /* state 24 */
    12,24,12,12,12,12,12,24,12,24,12,12,   /* state 36 */
    12,12,12,12,12,12,12,24,12,12,12,12,   /* state 48 */
    12,24,12,12,12,12,12,12,12,24,12,12,   /* state 60 */
    12,12,12,12,12,12,12,36,12,36,12,12,   /* state 72 */
    12,36,12,12,12,12,12,36,12,36,12,12,   /* state 84 */
    12,36,12,12,12,12,12,12,12,12,12,12    /* state 96 */
};

/*
 * Run the UTF-8 DFA over the first `len` bytes of `data`.
 *
 * data: bytes to check; may be NULL only when len is 0.
 * len:  number of bytes to examine; must not be negative.
 *
 * Returns 0 when the automaton ends in UTF8_ACCEPT (an empty buffer is
 * accepted), -1 when it hits UTF8_REJECT, when the input ends in the
 * middle of a multi-byte sequence, or when the arguments are invalid
 * (negative len, or NULL data with a positive len).
 */
int utf8_lookup(const unsigned char *data, int len)
{
    int state = UTF8_ACCEPT;

    /* A negative length would otherwise make the loop below run past the
     * end of the buffer, since "len--" is non-zero for negative values. */
    if (len < 0 || (!data && len > 0))
        return -1;

    /* Stop early on rejection: UTF8_REJECT is a sink state. */
    while (len-- > 0 && state != UTF8_REJECT)
        state = utf8s[state + utf8d[*data++]];

    return state == UTF8_ACCEPT ? 0 : -1;
}
42