xref: /aosp_15_r20/external/grpc-grpc/third_party/utf8_range/lookup.c (revision cc02d7e222339f7a4f6ba5f422e6413f4bd931f2)
1 #include <stdio.h>
2 
3 /* http://bjoern.hoehrmann.de/utf-8/decoder/dfa */
4 /* Optimized version based on Rich Felker's variant. */
/*
 * Automaton states that the validator tests for by name.
 *
 * States are stored as row offsets into the transition table utf8s, which
 * has 12 entries per row (one per character class), so consecutive states
 * are 12 apart.
 */
enum {
    UTF8_ACCEPT = 0,    /* start state; input is valid only if it ends here */
    UTF8_REJECT = 12    /* error state; every transition out of it stays here */
};
7 
/*
 * Byte -> character-class map, indexed by the raw input byte (0x00..0xFF).
 *
 * Folding the 256 byte values into 12 classes keeps the transition table
 * small (12 columns per state instead of 256).
 *
 *   class  bytes
 *     0    0x00..0x7F
 *     1    0x80..0x8F
 *     9    0x90..0x9F
 *     7    0xA0..0xBF
 *     8    0xC0, 0xC1, 0xF5..0xFF  (rejected from every state)
 *     2    0xC2..0xDF
 *    10    0xE0
 *     3    0xE1..0xEC, 0xEE, 0xEF
 *     4    0xED
 *    11    0xF0
 *     6    0xF1..0xF3
 *     5    0xF4
 */
static const unsigned char utf8d[] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x20 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x50 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 */  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    /* 0x90 */  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
    /* 0xA0 */  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
    /* 0xB0 */  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
    /* 0xC0 */  8,  8,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    /* 0xD0 */  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    /* 0xE0 */ 10,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  3,  3,
    /* 0xF0 */ 11,  6,  6,  6,  5,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8
};
/*
 * State-transition table: next_state = utf8s[state + class].
 *
 * Note: this is deliberately a separate array rather than a tail of the
 * class map. Splitting the table improves performance on ARM, whose simpler
 * addressing modes cannot encode x[y + 256].
 *
 * One row per state, 12 columns per row (one per character class). A state
 * value is the offset of its own row, so no multiplication is needed at
 * lookup time. Row 0 is the accept/start state and row 12 is the reject
 * state; the remaining rows are the "inside a multi-byte sequence" states.
 */
static const unsigned char utf8s[] = {
    /* class:       0   1   2   3   4   5   6   7   8   9  10  11 */
    /* state  0 */  0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,
    /* state 12 */ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    /* state 24 */ 12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12,
    /* state 36 */ 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
    /* state 48 */ 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,
    /* state 60 */ 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
    /* state 72 */ 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    /* state 84 */ 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    /* state 96 */ 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
};
31 
32 /* Return 0 on success, -1 on error */
utf8_lookup(const unsigned char * data,int len)33 int utf8_lookup(const unsigned char *data, int len)
34 {
35     int state = 0;
36 
37     while (len-- && state != UTF8_REJECT)
38         state = utf8s[state + utf8d[*data++]];
39 
40     return state == UTF8_ACCEPT ? 0 : -1;
41 }
42