1*1b3f573fSAndroid Build Coastguard Worker #include <stdio.h>
2*1b3f573fSAndroid Build Coastguard Worker
/*
 * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
 *
 * Table 3-7. Well-Formed UTF-8 Byte Sequences
 *
 * +--------------------+------------+-------------+------------+-------------+
 * | Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+0000..U+007F     | 00..7F     |             |            |             |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+0080..U+07FF     | C2..DF     | 80..BF      |            |             |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+0800..U+0FFF     | E0         | A0..BF      | 80..BF     |             |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+1000..U+CFFF     | E1..EC     | 80..BF      | 80..BF     |             |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+D000..U+D7FF     | ED         | 80..9F      | 80..BF     |             |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+E000..U+FFFF     | EE..EF     | 80..BF      | 80..BF     |             |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+10000..U+3FFFF   | F0         | 90..BF      | 80..BF     | 80..BF      |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+40000..U+FFFFF   | F1..F3     | 80..BF      | 80..BF     | 80..BF      |
 * +--------------------+------------+-------------+------------+-------------+
 * | U+100000..U+10FFFF | F4         | 80..8F      | 80..BF     | 80..BF      |
 * +--------------------+------------+-------------+------------+-------------+
 */
30*1b3f573fSAndroid Build Coastguard Worker
/* Return non-zero when b is a UTF-8 continuation byte (80..BF). */
static int utf8_naive_is_cont(unsigned char b)
{
    /* Continuation bytes have the bit pattern 10xxxxxx. */
    return (b & 0xC0) == 0x80;
}

/*
 * Check that data[0..len-1] is a well-formed UTF-8 byte sequence as defined
 * by the "Well-Formed UTF-8 Byte Sequences" table (Unicode Table 3-7).
 * Overlong encodings, the surrogate range (ED A0..BF ..) and lead bytes
 * outside 00..7F / C2..F4 are rejected, as are sequences cut short by the
 * end of the buffer.
 *
 * data: bytes to validate; not dereferenced when len <= 0
 * len:  number of bytes available in data; a value <= 0 is treated as an
 *       empty input
 *
 * Return 0 - success, >0 - index(1 based) of the first byte of the first
 * ill-formed sequence.
 */
int utf8_naive(const unsigned char *data, int len)
{
    /*
     * Remember the initial length so the error index can be derived on
     * demand; a running "position + 1" counter would overflow int on a
     * fully valid buffer of INT_MAX bytes.
     */
    const int total = len;

    /* "> 0" instead of "!= 0": a negative len must never enter the loop. */
    while (len > 0) {
        const unsigned char byte1 = data[0];
        int bytes = 0;  /* stays 0 when no row of the table matches */

        if (byte1 <= 0x7F) {
            /* 00..7F */
            bytes = 1;
        } else if (byte1 >= 0xC2 && byte1 <= 0xDF) {
            /* C2..DF, 80..BF */
            if (len >= 2 && utf8_naive_is_cont(data[1])) {
                bytes = 2;
            }
        } else if (byte1 >= 0xE0 && byte1 <= 0xEF) {
            if (len >= 3) {
                /* Allowed range of the second byte for this lead byte. */
                unsigned char lo = 0x80;
                unsigned char hi = 0xBF;

                if (byte1 == 0xE0) {
                    /* E0, A0..BF, 80..BF */
                    lo = 0xA0;
                } else if (byte1 == 0xED) {
                    /* ED, 80..9F, 80..BF */
                    hi = 0x9F;
                }
                /* otherwise E1..EC / EE..EF, 80..BF, 80..BF */

                if (data[1] >= lo && data[1] <= hi &&
                    utf8_naive_is_cont(data[2])) {
                    bytes = 3;
                }
            }
        } else if (byte1 >= 0xF0 && byte1 <= 0xF4) {
            if (len >= 4) {
                /* Allowed range of the second byte for this lead byte. */
                unsigned char lo = 0x80;
                unsigned char hi = 0xBF;

                if (byte1 == 0xF0) {
                    /* F0, 90..BF, 80..BF, 80..BF */
                    lo = 0x90;
                } else if (byte1 == 0xF4) {
                    /* F4, 80..8F, 80..BF, 80..BF */
                    hi = 0x8F;
                }
                /* otherwise F1..F3, 80..BF, 80..BF, 80..BF */

                if (data[1] >= lo && data[1] <= hi &&
                    utf8_naive_is_cont(data[2]) &&
                    utf8_naive_is_cont(data[3])) {
                    bytes = 4;
                }
            }
        }

        if (bytes == 0) {
            /* 1-based index of the lead byte of the offending sequence. */
            return total - len + 1;
        }

        len -= bytes;
        data += bytes;
    }

    return 0;
}
93