1*59bfda1fSAndroid Build Coastguard Worker /*
2*59bfda1fSAndroid Build Coastguard Worker * Copyright (c) 2014 SGI.
3*59bfda1fSAndroid Build Coastguard Worker * Copyright (c) 2018 Collabora Ltd.
4*59bfda1fSAndroid Build Coastguard Worker * All rights reserved.
5*59bfda1fSAndroid Build Coastguard Worker *
6*59bfda1fSAndroid Build Coastguard Worker * This program is free software; you can redistribute it and/or
7*59bfda1fSAndroid Build Coastguard Worker * modify it under the terms of the GNU General Public License as
8*59bfda1fSAndroid Build Coastguard Worker * published by the Free Software Foundation.
9*59bfda1fSAndroid Build Coastguard Worker *
10*59bfda1fSAndroid Build Coastguard Worker * This program is distributed in the hope that it would be useful,
11*59bfda1fSAndroid Build Coastguard Worker * but WITHOUT ANY WARRANTY; without even the implied warranty of
12*59bfda1fSAndroid Build Coastguard Worker * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13*59bfda1fSAndroid Build Coastguard Worker * GNU General Public License for more details.
14*59bfda1fSAndroid Build Coastguard Worker *
15*59bfda1fSAndroid Build Coastguard Worker */
16*59bfda1fSAndroid Build Coastguard Worker
17*59bfda1fSAndroid Build Coastguard Worker /*
18*59bfda1fSAndroid Build Coastguard Worker * This code is adapted from the Linux Kernel. We have a
19*59bfda1fSAndroid Build Coastguard Worker * userspace version here such that the hashes will match that
20*59bfda1fSAndroid Build Coastguard Worker * implementation.
21*59bfda1fSAndroid Build Coastguard Worker */
22*59bfda1fSAndroid Build Coastguard Worker
23*59bfda1fSAndroid Build Coastguard Worker #include <stdint.h>
24*59bfda1fSAndroid Build Coastguard Worker #include <unistd.h>
25*59bfda1fSAndroid Build Coastguard Worker #include <string.h>
26*59bfda1fSAndroid Build Coastguard Worker #include <limits.h>
27*59bfda1fSAndroid Build Coastguard Worker #include <errno.h>
28*59bfda1fSAndroid Build Coastguard Worker
29*59bfda1fSAndroid Build Coastguard Worker #include <f2fs_fs.h>
30*59bfda1fSAndroid Build Coastguard Worker
31*59bfda1fSAndroid Build Coastguard Worker /* Encoding a unicode version number as a single unsigned int. */
/* Left shifts that place the major and minor fields above the revision. */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

/*
 * Pack a MAJ.MIN.REV unicode version into one unsigned int:
 * REV in the low bits, MIN shifted by 8 and MAJ shifted by 16.
 */
#define UNICODE_AGE(MAJ, MIN, REV)				\
	(((unsigned int)(REV)) |				\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |		\
	 ((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT))
39*59bfda1fSAndroid Build Coastguard Worker
/* Byte size of the hangul[] member of struct utf8cursor below. */
#define UTF8HANGULLEAF	(12)

/*
 * Cursor structure used by the normalizer.
 *
 * NOTE(review): the functions that operate on the cursor are not in this
 * part of the file. Field notes marked "presumably" are inferred from
 * the field names only and should be confirmed against that code.
 */
struct utf8cursor {
	/* Presumably the trie descriptor handed to the lookup helpers. */
	const struct utf8data	*data;
	/* Presumably the current string and the current position in it. */
	const char		*s;
	const char		*p;
	/* Presumably saved copies of s and p. */
	const char		*ss;
	const char		*sp;
	/* Presumably remaining length, and a saved copy of it. */
	unsigned int		len;
	unsigned int		slen;
	/* Presumably canonical combining class bookkeeping. */
	short int		ccc;
	short int		nccc;
	/* Scratch space of UTF8HANGULLEAF bytes. */
	unsigned char		hangul[UTF8HANGULLEAF];
};
58*59bfda1fSAndroid Build Coastguard Worker
59*59bfda1fSAndroid Build Coastguard Worker /*
60*59bfda1fSAndroid Build Coastguard Worker * Initialize a utf8cursor to normalize a string.
61*59bfda1fSAndroid Build Coastguard Worker * Returns 0 on success.
62*59bfda1fSAndroid Build Coastguard Worker * Returns -1 on failure.
63*59bfda1fSAndroid Build Coastguard Worker */
64*59bfda1fSAndroid Build Coastguard Worker // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
65*59bfda1fSAndroid Build Coastguard Worker // const char *s);
66*59bfda1fSAndroid Build Coastguard Worker // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
67*59bfda1fSAndroid Build Coastguard Worker // const char *s, size_t len);
68*59bfda1fSAndroid Build Coastguard Worker
69*59bfda1fSAndroid Build Coastguard Worker /*
70*59bfda1fSAndroid Build Coastguard Worker * Get the next byte in the normalization.
71*59bfda1fSAndroid Build Coastguard Worker * Returns a value > 0 && < 256 on success.
72*59bfda1fSAndroid Build Coastguard Worker * Returns 0 when the end of the normalization is reached.
73*59bfda1fSAndroid Build Coastguard Worker * Returns -1 if the string being normalized is not valid UTF-8.
74*59bfda1fSAndroid Build Coastguard Worker */
75*59bfda1fSAndroid Build Coastguard Worker // extern int utf8byte(struct utf8cursor *u8c);
76*59bfda1fSAndroid Build Coastguard Worker
77*59bfda1fSAndroid Build Coastguard Worker
/*
 * Descriptor of one trie stored inside the utf8data[] table.
 */
struct utf8data {
	/* NOTE(review): presumably a UNICODE_AGE() encoded version limit; confirm. */
	unsigned int	maxage;
	/* Offset of the trie root, counted from the start of utf8data[]. */
	unsigned int	offset;
};
82*59bfda1fSAndroid Build Coastguard Worker
83*59bfda1fSAndroid Build Coastguard Worker #define __INCLUDED_FROM_UTF8NORM_C__
84*59bfda1fSAndroid Build Coastguard Worker #include "utf8data.h"
85*59bfda1fSAndroid Build Coastguard Worker #undef __INCLUDED_FROM_UTF8NORM_C__
86*59bfda1fSAndroid Build Coastguard Worker
/*
 * Number of elements in a true array (not a pointer).
 * The argument is parenthesized before indexing so that an expression
 * argument cannot re-associate with the [0] subscript.
 */
#define ARRAY_SIZE(array)			\
	(sizeof(array) / sizeof((array)[0]))
89*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Check whether the unicode version maj.min.rev appears in utf8agetab[].
 * The table is scanned from its last entry downwards; a zero entry ends
 * the scan.
 * Returns 1 if the version is listed, 0 otherwise.
 */
static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
{
	const unsigned int wanted = UNICODE_AGE(maj, min, rev);
	int idx;

	for (idx = ARRAY_SIZE(utf8agetab) - 1; idx >= 0; idx--) {
		if (utf8agetab[idx] == 0)
			break;
		if (utf8agetab[idx] == wanted)
			return 1;
	}

	return 0;
}
#endif
105*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Return the value of utf8vers.
 * NOTE(review): utf8vers is not defined in this part of the file;
 * presumably it comes from utf8data.h and holds the newest unicode
 * version covered by the tables -- confirm.
 */
static int utf8version_latest(void)
{
	return utf8vers;
}
#endif
112*59bfda1fSAndroid Build Coastguard Worker
113*59bfda1fSAndroid Build Coastguard Worker /*
114*59bfda1fSAndroid Build Coastguard Worker * UTF-8 valid ranges.
115*59bfda1fSAndroid Build Coastguard Worker *
116*59bfda1fSAndroid Build Coastguard Worker * The UTF-8 encoding spreads the bits of a 32bit word over several
117*59bfda1fSAndroid Build Coastguard Worker * bytes. This table gives the ranges that can be held and how they'd
118*59bfda1fSAndroid Build Coastguard Worker * be represented.
119*59bfda1fSAndroid Build Coastguard Worker *
120*59bfda1fSAndroid Build Coastguard Worker * 0x00000000 0x0000007F: 0xxxxxxx
121*59bfda1fSAndroid Build Coastguard Worker * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
122*59bfda1fSAndroid Build Coastguard Worker * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
123*59bfda1fSAndroid Build Coastguard Worker * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
124*59bfda1fSAndroid Build Coastguard Worker * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
125*59bfda1fSAndroid Build Coastguard Worker * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
126*59bfda1fSAndroid Build Coastguard Worker *
127*59bfda1fSAndroid Build Coastguard Worker * There is an additional requirement on UTF-8, in that only the
128*59bfda1fSAndroid Build Coastguard Worker * shortest representation of a 32bit value is to be used. A decoder
129*59bfda1fSAndroid Build Coastguard Worker * must not decode sequences that do not satisfy this requirement.
130*59bfda1fSAndroid Build Coastguard Worker * Thus the allowed ranges have a lower bound.
131*59bfda1fSAndroid Build Coastguard Worker *
132*59bfda1fSAndroid Build Coastguard Worker * 0x00000000 0x0000007F: 0xxxxxxx
133*59bfda1fSAndroid Build Coastguard Worker * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
134*59bfda1fSAndroid Build Coastguard Worker * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
135*59bfda1fSAndroid Build Coastguard Worker * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
136*59bfda1fSAndroid Build Coastguard Worker * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
137*59bfda1fSAndroid Build Coastguard Worker * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
138*59bfda1fSAndroid Build Coastguard Worker *
139*59bfda1fSAndroid Build Coastguard Worker * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
140*59bfda1fSAndroid Build Coastguard Worker * 17 planes of 65536 values. This limits the sequences actually seen
141*59bfda1fSAndroid Build Coastguard Worker * even more, to just the following.
142*59bfda1fSAndroid Build Coastguard Worker *
143*59bfda1fSAndroid Build Coastguard Worker * 0 - 0x7F: 0 - 0x7F
144*59bfda1fSAndroid Build Coastguard Worker * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
145*59bfda1fSAndroid Build Coastguard Worker * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
146*59bfda1fSAndroid Build Coastguard Worker * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
147*59bfda1fSAndroid Build Coastguard Worker *
148*59bfda1fSAndroid Build Coastguard Worker * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
149*59bfda1fSAndroid Build Coastguard Worker *
150*59bfda1fSAndroid Build Coastguard Worker * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character.  This makes the UTF-8
152*59bfda1fSAndroid Build Coastguard Worker * representation of Unicode strictly smaller than UTF-32.
153*59bfda1fSAndroid Build Coastguard Worker *
154*59bfda1fSAndroid Build Coastguard Worker * The shortest sequence requirement was introduced by:
155*59bfda1fSAndroid Build Coastguard Worker * Corrigendum #1: UTF-8 Shortest Form
156*59bfda1fSAndroid Build Coastguard Worker * It can be found here:
157*59bfda1fSAndroid Build Coastguard Worker * http://www.unicode.org/versions/corrigendum1.html
158*59bfda1fSAndroid Build Coastguard Worker *
159*59bfda1fSAndroid Build Coastguard Worker */
160*59bfda1fSAndroid Build Coastguard Worker
/*
 * Length in bytes of the UTF-8 sequence whose first byte is *s.
 * Only the lead byte is inspected: 0xF0 and above gives 4, 0xE0 and
 * above gives 3, 0xC0 and above gives 2, anything else gives 1.
 * The caller must pass a pointer to the first byte of a valid sequence.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = *s;

	if (lead >= 0xF0)
		return 4;
	if (lead >= 0xE0)
		return 3;
	if (lead >= 0xC0)
		return 2;
	return 1;
}
172*59bfda1fSAndroid Build Coastguard Worker
/*
 * Decode the 3-byte UTF-8 sequence at str into its code point.
 * The low 4 bits of the first byte and the low 6 bits of each of the
 * next two bytes are concatenated; no validation is performed.
 */
static unsigned int
utf8decode3(const char *str)
{
	const unsigned int top = str[0] & 0x0F;
	const unsigned int mid = str[1] & 0x3F;
	const unsigned int low = str[2] & 0x3F;

	return (top << 12) | (mid << 6) | low;
}
189*59bfda1fSAndroid Build Coastguard Worker
/*
 * Encode val as a 3-byte UTF-8 sequence in str[0..2].
 * No terminator is written and val is not range checked.
 * Returns the number of bytes written, which is always 3.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	/* Lead byte: 1110xxxx carrying the bits above the low twelve. */
	str[0] = (val >> 12) | 0xE0;
	/* Two continuation bytes: 10xxxxxx, six bits each. */
	str[1] = ((val >> 6) & 0x3F) | 0x80;
	str[2] = (val & 0x3F) | 0x80;

	return 3;
}
204*59bfda1fSAndroid Build Coastguard Worker
205*59bfda1fSAndroid Build Coastguard Worker /*
206*59bfda1fSAndroid Build Coastguard Worker * utf8trie_t
207*59bfda1fSAndroid Build Coastguard Worker *
208*59bfda1fSAndroid Build Coastguard Worker * A compact binary tree, used to decode UTF-8 characters.
209*59bfda1fSAndroid Build Coastguard Worker *
210*59bfda1fSAndroid Build Coastguard Worker * Internal nodes are one byte for the node itself, and up to three
211*59bfda1fSAndroid Build Coastguard Worker * bytes for an offset into the tree. The first byte contains the
212*59bfda1fSAndroid Build Coastguard Worker * following information:
213*59bfda1fSAndroid Build Coastguard Worker * NEXTBYTE - flag - advance to next byte if set
 *   BITNUM    - 3 bit field - the bit number to be tested
215*59bfda1fSAndroid Build Coastguard Worker * OFFLEN - 2 bit field - number of bytes in the offset
216*59bfda1fSAndroid Build Coastguard Worker * if offlen == 0 (non-branching node)
217*59bfda1fSAndroid Build Coastguard Worker * RIGHTPATH - 1 bit field - set if the following node is for the
218*59bfda1fSAndroid Build Coastguard Worker * right-hand path (tested bit is set)
219*59bfda1fSAndroid Build Coastguard Worker * TRIENODE - 1 bit field - set if the following node is an internal
220*59bfda1fSAndroid Build Coastguard Worker * node, otherwise it is a leaf node
221*59bfda1fSAndroid Build Coastguard Worker * if offlen != 0 (branching node)
222*59bfda1fSAndroid Build Coastguard Worker * LEFTNODE - 1 bit field - set if the left-hand node is internal
223*59bfda1fSAndroid Build Coastguard Worker * RIGHTNODE - 1 bit field - set if the right-hand node is internal
224*59bfda1fSAndroid Build Coastguard Worker *
225*59bfda1fSAndroid Build Coastguard Worker * Due to the way utf8 works, there cannot be branching nodes with
226*59bfda1fSAndroid Build Coastguard Worker * NEXTBYTE set, and moreover those nodes always have a righthand
227*59bfda1fSAndroid Build Coastguard Worker * descendant.
228*59bfda1fSAndroid Build Coastguard Worker */
/* A trie is addressed as a read-only byte array. */
typedef const unsigned char utf8trie_t;

/* Fields in the first byte of every internal node. */
#define BITNUM		0x07	/* which bit of the input byte to test */
#define NEXTBYTE	0x08	/* move to the next input byte first */
#define OFFLEN		0x30	/* count of offset bytes after the node */
#define OFFLEN_SHIFT	4	/* shift that right-aligns OFFLEN */

/* Top two bits when OFFLEN is zero (non-branching node). */
#define RIGHTPATH	0x40
#define TRIENODE	0x80

/* Top two bits when OFFLEN is non-zero (branching node). */
#define RIGHTNODE	0x40
#define LEFTNODE	0x80
238*59bfda1fSAndroid Build Coastguard Worker
239*59bfda1fSAndroid Build Coastguard Worker /*
240*59bfda1fSAndroid Build Coastguard Worker * utf8leaf_t
241*59bfda1fSAndroid Build Coastguard Worker *
242*59bfda1fSAndroid Build Coastguard Worker * The leaves of the trie are embedded in the trie, and so the same
243*59bfda1fSAndroid Build Coastguard Worker * underlying datatype: unsigned char.
244*59bfda1fSAndroid Build Coastguard Worker *
245*59bfda1fSAndroid Build Coastguard Worker * leaf[0]: The unicode version, stored as a generation number that is
246*59bfda1fSAndroid Build Coastguard Worker * an index into utf8agetab[]. With this we can filter code
247*59bfda1fSAndroid Build Coastguard Worker * points based on the unicode version in which they were
248*59bfda1fSAndroid Build Coastguard Worker * defined. The CCC of a non-defined code point is 0.
249*59bfda1fSAndroid Build Coastguard Worker * leaf[1]: Canonical Combining Class. During normalization, we need
250*59bfda1fSAndroid Build Coastguard Worker * to do a stable sort into ascending order of all characters
251*59bfda1fSAndroid Build Coastguard Worker * with a non-zero CCC that occur between two characters with
 *          a CCC of 0, or at the beginning or end of a string.
253*59bfda1fSAndroid Build Coastguard Worker * The unicode standard guarantees that all CCC values are
254*59bfda1fSAndroid Build Coastguard Worker * between 0 and 254 inclusive, which leaves 255 available as
255*59bfda1fSAndroid Build Coastguard Worker * a special value.
256*59bfda1fSAndroid Build Coastguard Worker * Code points with CCC 0 are known as stoppers.
257*59bfda1fSAndroid Build Coastguard Worker * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
258*59bfda1fSAndroid Build Coastguard Worker * start of a NUL-terminated string that is the decomposition
259*59bfda1fSAndroid Build Coastguard Worker * of the character.
260*59bfda1fSAndroid Build Coastguard Worker * The CCC of a decomposable character is the same as the CCC
261*59bfda1fSAndroid Build Coastguard Worker * of the first character of its decomposition.
262*59bfda1fSAndroid Build Coastguard Worker * Some characters decompose as the empty string: these are
263*59bfda1fSAndroid Build Coastguard Worker * characters with the Default_Ignorable_Code_Point property.
264*59bfda1fSAndroid Build Coastguard Worker * These do affect normalization, as they all have CCC 0.
265*59bfda1fSAndroid Build Coastguard Worker *
266*59bfda1fSAndroid Build Coastguard Worker * The decompositions in the trie have been fully expanded, with the
267*59bfda1fSAndroid Build Coastguard Worker * exception of Hangul syllables, which are decomposed algorithmically.
268*59bfda1fSAndroid Build Coastguard Worker *
269*59bfda1fSAndroid Build Coastguard Worker * Casefolding, if applicable, is also done using decompositions.
270*59bfda1fSAndroid Build Coastguard Worker *
271*59bfda1fSAndroid Build Coastguard Worker * The trie is constructed in such a way that leaves exist for all
272*59bfda1fSAndroid Build Coastguard Worker * UTF-8 sequences that match the criteria from the "UTF-8 valid
273*59bfda1fSAndroid Build Coastguard Worker * ranges" comment above, and only for those sequences. Therefore a
274*59bfda1fSAndroid Build Coastguard Worker * lookup in the trie can be used to validate the UTF-8 input.
275*59bfda1fSAndroid Build Coastguard Worker */
/* Leaves live inside the trie, so they share its read-only byte type. */
typedef const unsigned char utf8leaf_t;

/* Accessors for the three parts of a leaf. */
#define LEAF_GEN(LEAF)	((LEAF)[0])			/* generation number */
#define LEAF_CCC(LEAF)	((LEAF)[1])			/* combining class */
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))	/* decomposition */

/* Range of real CCC values, and the two values with special meaning. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)	/* CCC 0 */
#define DECOMPOSE	(255)	/* leaf carries a decomposition string */

/* First byte of the decomposition string that marks a hangul syllable. */
#define HANGUL		((char)(255))
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
#define UTF8HANGULLEAF	(12)
291*59bfda1fSAndroid Build Coastguard Worker
292*59bfda1fSAndroid Build Coastguard Worker /*
293*59bfda1fSAndroid Build Coastguard Worker * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
294*59bfda1fSAndroid Build Coastguard Worker *
295*59bfda1fSAndroid Build Coastguard Worker * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
296*59bfda1fSAndroid Build Coastguard Worker * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
297*59bfda1fSAndroid Build Coastguard Worker *
298*59bfda1fSAndroid Build Coastguard Worker * SBase = 0xAC00
299*59bfda1fSAndroid Build Coastguard Worker * LBase = 0x1100
300*59bfda1fSAndroid Build Coastguard Worker * VBase = 0x1161
301*59bfda1fSAndroid Build Coastguard Worker * TBase = 0x11A7
302*59bfda1fSAndroid Build Coastguard Worker * LCount = 19
303*59bfda1fSAndroid Build Coastguard Worker * VCount = 21
304*59bfda1fSAndroid Build Coastguard Worker * TCount = 28
305*59bfda1fSAndroid Build Coastguard Worker * NCount = 588 (VCount * TCount)
306*59bfda1fSAndroid Build Coastguard Worker * SCount = 11172 (LCount * NCount)
307*59bfda1fSAndroid Build Coastguard Worker *
308*59bfda1fSAndroid Build Coastguard Worker * Decomposition:
309*59bfda1fSAndroid Build Coastguard Worker * SIndex = s - SBase
310*59bfda1fSAndroid Build Coastguard Worker *
311*59bfda1fSAndroid Build Coastguard Worker * LV (Canonical/Full)
312*59bfda1fSAndroid Build Coastguard Worker * LIndex = SIndex / NCount
313*59bfda1fSAndroid Build Coastguard Worker * VIndex = (Sindex % NCount) / TCount
314*59bfda1fSAndroid Build Coastguard Worker * LPart = LBase + LIndex
315*59bfda1fSAndroid Build Coastguard Worker * VPart = VBase + VIndex
316*59bfda1fSAndroid Build Coastguard Worker *
317*59bfda1fSAndroid Build Coastguard Worker * LVT (Canonical)
318*59bfda1fSAndroid Build Coastguard Worker * LVIndex = (SIndex / TCount) * TCount
319*59bfda1fSAndroid Build Coastguard Worker * TIndex = (Sindex % TCount)
320*59bfda1fSAndroid Build Coastguard Worker * LVPart = SBase + LVIndex
321*59bfda1fSAndroid Build Coastguard Worker * TPart = TBase + TIndex
322*59bfda1fSAndroid Build Coastguard Worker *
323*59bfda1fSAndroid Build Coastguard Worker * LVT (Full)
324*59bfda1fSAndroid Build Coastguard Worker * LIndex = SIndex / NCount
325*59bfda1fSAndroid Build Coastguard Worker * VIndex = (Sindex % NCount) / TCount
326*59bfda1fSAndroid Build Coastguard Worker * TIndex = (Sindex % TCount)
327*59bfda1fSAndroid Build Coastguard Worker * LPart = LBase + LIndex
328*59bfda1fSAndroid Build Coastguard Worker * VPart = VBase + VIndex
329*59bfda1fSAndroid Build Coastguard Worker * if (TIndex == 0) {
330*59bfda1fSAndroid Build Coastguard Worker * d = <LPart, VPart>
331*59bfda1fSAndroid Build Coastguard Worker * } else {
332*59bfda1fSAndroid Build Coastguard Worker * TPart = TBase + TIndex
 *            d = <LPart, VPart, TPart>
334*59bfda1fSAndroid Build Coastguard Worker * }
335*59bfda1fSAndroid Build Coastguard Worker */
336*59bfda1fSAndroid Build Coastguard Worker
/* Hangul decomposition constants, named after the algorithm above. */
#define SB	(0xAC00)	/* SBase */
#define LB	(0x1100)	/* LBase */
#define VB	(0x1161)	/* VBase */
#define TB	(0x11A7)	/* TBase */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount */
#define SC	(LC * NC)	/* SCount */
347*59bfda1fSAndroid Build Coastguard Worker
348*59bfda1fSAndroid Build Coastguard Worker /* Algorithmic decomposition of hangul syllable. */
349*59bfda1fSAndroid Build Coastguard Worker static utf8leaf_t *
utf8hangul(const char * str,unsigned char * hangul)350*59bfda1fSAndroid Build Coastguard Worker utf8hangul(const char *str, unsigned char *hangul)
351*59bfda1fSAndroid Build Coastguard Worker {
352*59bfda1fSAndroid Build Coastguard Worker unsigned int si;
353*59bfda1fSAndroid Build Coastguard Worker unsigned int li;
354*59bfda1fSAndroid Build Coastguard Worker unsigned int vi;
355*59bfda1fSAndroid Build Coastguard Worker unsigned int ti;
356*59bfda1fSAndroid Build Coastguard Worker unsigned char *h;
357*59bfda1fSAndroid Build Coastguard Worker
358*59bfda1fSAndroid Build Coastguard Worker /* Calculate the SI, LI, VI, and TI values. */
359*59bfda1fSAndroid Build Coastguard Worker si = utf8decode3(str) - SB;
360*59bfda1fSAndroid Build Coastguard Worker li = si / NC;
361*59bfda1fSAndroid Build Coastguard Worker vi = (si % NC) / TC;
362*59bfda1fSAndroid Build Coastguard Worker ti = si % TC;
363*59bfda1fSAndroid Build Coastguard Worker
364*59bfda1fSAndroid Build Coastguard Worker /* Fill in base of leaf. */
365*59bfda1fSAndroid Build Coastguard Worker h = hangul;
366*59bfda1fSAndroid Build Coastguard Worker LEAF_GEN(h) = 2;
367*59bfda1fSAndroid Build Coastguard Worker LEAF_CCC(h) = DECOMPOSE;
368*59bfda1fSAndroid Build Coastguard Worker h += 2;
369*59bfda1fSAndroid Build Coastguard Worker
370*59bfda1fSAndroid Build Coastguard Worker /* Add LPart, a 3-byte UTF-8 sequence. */
371*59bfda1fSAndroid Build Coastguard Worker h += utf8encode3((char *)h, li + LB);
372*59bfda1fSAndroid Build Coastguard Worker
373*59bfda1fSAndroid Build Coastguard Worker /* Add VPart, a 3-byte UTF-8 sequence. */
374*59bfda1fSAndroid Build Coastguard Worker h += utf8encode3((char *)h, vi + VB);
375*59bfda1fSAndroid Build Coastguard Worker
376*59bfda1fSAndroid Build Coastguard Worker /* Add TPart if required, also a 3-byte UTF-8 sequence. */
377*59bfda1fSAndroid Build Coastguard Worker if (ti)
378*59bfda1fSAndroid Build Coastguard Worker h += utf8encode3((char *)h, ti + TB);
379*59bfda1fSAndroid Build Coastguard Worker
380*59bfda1fSAndroid Build Coastguard Worker /* Terminate string. */
381*59bfda1fSAndroid Build Coastguard Worker h[0] = '\0';
382*59bfda1fSAndroid Build Coastguard Worker
383*59bfda1fSAndroid Build Coastguard Worker return hangul;
384*59bfda1fSAndroid Build Coastguard Worker }
385*59bfda1fSAndroid Build Coastguard Worker
386*59bfda1fSAndroid Build Coastguard Worker /*
387*59bfda1fSAndroid Build Coastguard Worker * Use trie to scan s, touching at most len bytes.
388*59bfda1fSAndroid Build Coastguard Worker * Returns the leaf if one exists, NULL otherwise.
389*59bfda1fSAndroid Build Coastguard Worker *
390*59bfda1fSAndroid Build Coastguard Worker * A non-NULL return guarantees that the UTF-8 sequence starting at s
391*59bfda1fSAndroid Build Coastguard Worker * is well-formed and corresponds to a known unicode code point. The
392*59bfda1fSAndroid Build Coastguard Worker * shorthand for this will be "is valid UTF-8 unicode".
393*59bfda1fSAndroid Build Coastguard Worker */
utf8nlookup(const struct utf8data * data,unsigned char * hangul,const char * s,size_t len)394*59bfda1fSAndroid Build Coastguard Worker static utf8leaf_t *utf8nlookup(const struct utf8data *data,
395*59bfda1fSAndroid Build Coastguard Worker unsigned char *hangul, const char *s, size_t len)
396*59bfda1fSAndroid Build Coastguard Worker {
397*59bfda1fSAndroid Build Coastguard Worker utf8trie_t *trie;
398*59bfda1fSAndroid Build Coastguard Worker int offlen;
399*59bfda1fSAndroid Build Coastguard Worker int offset;
400*59bfda1fSAndroid Build Coastguard Worker int mask;
401*59bfda1fSAndroid Build Coastguard Worker int node;
402*59bfda1fSAndroid Build Coastguard Worker
403*59bfda1fSAndroid Build Coastguard Worker if (!data)
404*59bfda1fSAndroid Build Coastguard Worker return NULL;
405*59bfda1fSAndroid Build Coastguard Worker if (len == 0)
406*59bfda1fSAndroid Build Coastguard Worker return NULL;
407*59bfda1fSAndroid Build Coastguard Worker
408*59bfda1fSAndroid Build Coastguard Worker trie = utf8data + data->offset;
409*59bfda1fSAndroid Build Coastguard Worker node = 1;
410*59bfda1fSAndroid Build Coastguard Worker while (node) {
411*59bfda1fSAndroid Build Coastguard Worker offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
412*59bfda1fSAndroid Build Coastguard Worker if (*trie & NEXTBYTE) {
413*59bfda1fSAndroid Build Coastguard Worker if (--len == 0)
414*59bfda1fSAndroid Build Coastguard Worker return NULL;
415*59bfda1fSAndroid Build Coastguard Worker s++;
416*59bfda1fSAndroid Build Coastguard Worker }
417*59bfda1fSAndroid Build Coastguard Worker mask = 1 << (*trie & BITNUM);
418*59bfda1fSAndroid Build Coastguard Worker if (*s & mask) {
419*59bfda1fSAndroid Build Coastguard Worker /* Right leg */
420*59bfda1fSAndroid Build Coastguard Worker if (offlen) {
421*59bfda1fSAndroid Build Coastguard Worker /* Right node at offset of trie */
422*59bfda1fSAndroid Build Coastguard Worker node = (*trie & RIGHTNODE);
423*59bfda1fSAndroid Build Coastguard Worker offset = trie[offlen];
424*59bfda1fSAndroid Build Coastguard Worker while (--offlen) {
425*59bfda1fSAndroid Build Coastguard Worker offset <<= 8;
426*59bfda1fSAndroid Build Coastguard Worker offset |= trie[offlen];
427*59bfda1fSAndroid Build Coastguard Worker }
428*59bfda1fSAndroid Build Coastguard Worker trie += offset;
429*59bfda1fSAndroid Build Coastguard Worker } else if (*trie & RIGHTPATH) {
430*59bfda1fSAndroid Build Coastguard Worker /* Right node after this node */
431*59bfda1fSAndroid Build Coastguard Worker node = (*trie & TRIENODE);
432*59bfda1fSAndroid Build Coastguard Worker trie++;
433*59bfda1fSAndroid Build Coastguard Worker } else {
434*59bfda1fSAndroid Build Coastguard Worker /* No right node. */
435*59bfda1fSAndroid Build Coastguard Worker return NULL;
436*59bfda1fSAndroid Build Coastguard Worker }
437*59bfda1fSAndroid Build Coastguard Worker } else {
438*59bfda1fSAndroid Build Coastguard Worker /* Left leg */
439*59bfda1fSAndroid Build Coastguard Worker if (offlen) {
440*59bfda1fSAndroid Build Coastguard Worker /* Left node after this node. */
441*59bfda1fSAndroid Build Coastguard Worker node = (*trie & LEFTNODE);
442*59bfda1fSAndroid Build Coastguard Worker trie += offlen + 1;
443*59bfda1fSAndroid Build Coastguard Worker } else if (*trie & RIGHTPATH) {
444*59bfda1fSAndroid Build Coastguard Worker /* No left node. */
445*59bfda1fSAndroid Build Coastguard Worker return NULL;
446*59bfda1fSAndroid Build Coastguard Worker } else {
447*59bfda1fSAndroid Build Coastguard Worker /* Left node after this node */
448*59bfda1fSAndroid Build Coastguard Worker node = (*trie & TRIENODE);
449*59bfda1fSAndroid Build Coastguard Worker trie++;
450*59bfda1fSAndroid Build Coastguard Worker }
451*59bfda1fSAndroid Build Coastguard Worker }
452*59bfda1fSAndroid Build Coastguard Worker }
453*59bfda1fSAndroid Build Coastguard Worker /*
454*59bfda1fSAndroid Build Coastguard Worker * Hangul decomposition is done algorithmically. These are the
455*59bfda1fSAndroid Build Coastguard Worker * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
456*59bfda1fSAndroid Build Coastguard Worker * always 3 bytes long, so s has been advanced twice, and the
457*59bfda1fSAndroid Build Coastguard Worker * start of the sequence is at s-2.
458*59bfda1fSAndroid Build Coastguard Worker */
459*59bfda1fSAndroid Build Coastguard Worker if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
460*59bfda1fSAndroid Build Coastguard Worker trie = utf8hangul(s - 2, hangul);
461*59bfda1fSAndroid Build Coastguard Worker return trie;
462*59bfda1fSAndroid Build Coastguard Worker }
463*59bfda1fSAndroid Build Coastguard Worker
464*59bfda1fSAndroid Build Coastguard Worker /*
465*59bfda1fSAndroid Build Coastguard Worker * Use trie to scan s.
466*59bfda1fSAndroid Build Coastguard Worker * Returns the leaf if one exists, NULL otherwise.
467*59bfda1fSAndroid Build Coastguard Worker *
468*59bfda1fSAndroid Build Coastguard Worker * Forwards to utf8nlookup().
469*59bfda1fSAndroid Build Coastguard Worker */
utf8lookup(const struct utf8data * data,unsigned char * hangul,const char * s)470*59bfda1fSAndroid Build Coastguard Worker static utf8leaf_t *utf8lookup(const struct utf8data *data,
471*59bfda1fSAndroid Build Coastguard Worker unsigned char *hangul, const char *s)
472*59bfda1fSAndroid Build Coastguard Worker {
473*59bfda1fSAndroid Build Coastguard Worker return utf8nlookup(data, hangul, s, (size_t)-1);
474*59bfda1fSAndroid Build Coastguard Worker }
475*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Highest age found among the characters of s.
 *
 * Ages above data->maxage are ignored, so the result never exceeds
 * data->maxage.
 *
 * Returns -1 if data is NULL or if a character of s has no leaf in the
 * trie (s is not valid UTF-8 unicode).
 * Returns 0 if no character contributed an age, which is the case when
 * only non-assigned code points are used.
 */
static int utf8agemax(const struct utf8data *data, const char *s)
{
	unsigned char hangul[UTF8HANGULLEAF];
	int newest = 0;

	if (data == NULL)
		return -1;

	for (; *s != '\0'; s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8lookup(data, hangul, s);
		int leaf_age;

		if (leaf == NULL)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		/* Characters newer than this table do not count. */
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age > newest)
			newest = leaf_age;
	}
	return newest;
}
#endif
505*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Lowest age found among the characters of s.
 *
 * The search starts from data->maxage; ages above data->maxage are
 * ignored, so an empty string yields data->maxage.
 *
 * Returns -1 if data is NULL or if a character of s has no leaf in the
 * trie (s is not valid UTF-8 unicode).
 * Returns 0 if non-assigned code points are used.
 */
static int utf8agemin(const struct utf8data *data, const char *s)
{
	unsigned char hangul[UTF8HANGULLEAF];
	int oldest;

	if (data == NULL)
		return -1;

	oldest = data->maxage;
	for (; *s != '\0'; s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8lookup(data, hangul, s);
		int leaf_age;

		if (leaf == NULL)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		/* Characters newer than this table do not count. */
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age < oldest)
			oldest = leaf_age;
	}
	return oldest;
}
#endif
534*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Highest age found among the characters of s, examining at most len
 * bytes and stopping early at a NUL byte.
 *
 * Ages above data->maxage are ignored.
 *
 * Returns -1 if data is NULL or if utf8nlookup() finds no leaf for a
 * character (s is not valid UTF-8 unicode).
 */
static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char hangul[UTF8HANGULLEAF];
	int newest = 0;

	if (data == NULL)
		return -1;

	/* Each step consumes one whole character from both s and len. */
	for (; len != 0 && *s != '\0'; len -= utf8clen(s), s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8nlookup(data, hangul, s, len);
		int leaf_age;

		if (leaf == NULL)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age > newest)
			newest = leaf_age;
	}
	return newest;
}
#endif
563*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Minimum age of any character in s, examining at most len bytes and
 * stopping early at a NUL byte.
 *
 * The search starts from data->maxage; ages above data->maxage are
 * ignored.
 *
 * Returns -1 if data is NULL or if utf8nlookup() finds no leaf for a
 * character (s is not valid UTF-8 unicode).
 */
static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char hangul[UTF8HANGULLEAF];
	int oldest;

	if (data == NULL)
		return -1;

	oldest = data->maxage;
	/* Each step consumes one whole character from both s and len. */
	for (; len != 0 && *s != '\0'; len -= utf8clen(s), s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8nlookup(data, hangul, s, len);
		int leaf_age;

		if (leaf == NULL)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age < oldest)
			oldest = leaf_age;
	}
	return oldest;
}
#endif
592*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Length in bytes of the normalized form of s.
 *
 * For every character of s:
 *  - if it is within data->maxage and its leaf is marked DECOMPOSE, the
 *    length of the decomposition string is added;
 *  - otherwise the character's own encoded length is added.
 *
 * A string of Default_Ignorable_Code_Point has length 0 (presumably
 * because their decomposition string is empty).
 *
 * Returns -1 if data is NULL or if a character of s has no leaf in the
 * trie (s is not valid UTF-8 unicode).
 */
static ssize_t utf8len(const struct utf8data *data, const char *s)
{
	unsigned char hangul[UTF8HANGULLEAF];
	size_t total = 0;

	if (data == NULL)
		return -1;

	for (; *s != '\0'; s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8lookup(data, hangul, s);

		if (leaf == NULL)
			return -1;

		if (utf8agetab[LEAF_GEN(leaf)] <= data->maxage &&
		    LEAF_CCC(leaf) == DECOMPOSE)
			total += strlen(LEAF_STR(leaf));
		else
			total += utf8clen(s);
	}
	return total;
}
#endif
623*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Length in bytes of the normalized form of s, examining at most len
 * bytes of s and stopping early at a NUL byte.
 *
 * For every character examined:
 *  - if it is within data->maxage and its leaf is marked DECOMPOSE, the
 *    length of the decomposition string is added;
 *  - otherwise the character's own encoded length is added.
 *
 * Returns -1 if data is NULL or if utf8nlookup() finds no leaf for a
 * character (s is not valid UTF-8 unicode).
 */
static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char hangul[UTF8HANGULLEAF];
	size_t total = 0;

	if (data == NULL)
		return -1;

	/* Each step consumes one whole character from both s and len. */
	for (; len != 0 && *s != '\0'; len -= utf8clen(s), s += utf8clen(s)) {
		utf8leaf_t *leaf = utf8nlookup(data, hangul, s, len);

		if (leaf == NULL)
			return -1;

		if (utf8agetab[LEAF_GEN(leaf)] <= data->maxage &&
		    LEAF_CCC(leaf) == DECOMPOSE)
			total += strlen(LEAF_STR(leaf));
		else
			total += utf8clen(s);
	}
	return total;
}
#endif
653*59bfda1fSAndroid Build Coastguard Worker
654*59bfda1fSAndroid Build Coastguard Worker /*
655*59bfda1fSAndroid Build Coastguard Worker * Set up an utf8cursor for use by utf8byte().
656*59bfda1fSAndroid Build Coastguard Worker *
657*59bfda1fSAndroid Build Coastguard Worker * u8c : pointer to cursor.
658*59bfda1fSAndroid Build Coastguard Worker * data : const struct utf8data to use for normalization.
659*59bfda1fSAndroid Build Coastguard Worker * s : string.
660*59bfda1fSAndroid Build Coastguard Worker * len : length of s.
661*59bfda1fSAndroid Build Coastguard Worker *
662*59bfda1fSAndroid Build Coastguard Worker * Returns -1 on error, 0 on success.
663*59bfda1fSAndroid Build Coastguard Worker */
utf8ncursor(struct utf8cursor * u8c,const struct utf8data * data,const char * s,size_t len)664*59bfda1fSAndroid Build Coastguard Worker static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
665*59bfda1fSAndroid Build Coastguard Worker const char *s, size_t len)
666*59bfda1fSAndroid Build Coastguard Worker {
667*59bfda1fSAndroid Build Coastguard Worker if (!data)
668*59bfda1fSAndroid Build Coastguard Worker return -1;
669*59bfda1fSAndroid Build Coastguard Worker if (!s)
670*59bfda1fSAndroid Build Coastguard Worker return -1;
671*59bfda1fSAndroid Build Coastguard Worker u8c->data = data;
672*59bfda1fSAndroid Build Coastguard Worker u8c->s = s;
673*59bfda1fSAndroid Build Coastguard Worker u8c->p = NULL;
674*59bfda1fSAndroid Build Coastguard Worker u8c->ss = NULL;
675*59bfda1fSAndroid Build Coastguard Worker u8c->sp = NULL;
676*59bfda1fSAndroid Build Coastguard Worker u8c->len = len;
677*59bfda1fSAndroid Build Coastguard Worker u8c->slen = 0;
678*59bfda1fSAndroid Build Coastguard Worker u8c->ccc = STOPPER;
679*59bfda1fSAndroid Build Coastguard Worker u8c->nccc = STOPPER;
680*59bfda1fSAndroid Build Coastguard Worker /* Check we didn't clobber the maximum length. */
681*59bfda1fSAndroid Build Coastguard Worker if (u8c->len != len)
682*59bfda1fSAndroid Build Coastguard Worker return -1;
683*59bfda1fSAndroid Build Coastguard Worker /* The first byte of s may not be an utf8 continuation. */
684*59bfda1fSAndroid Build Coastguard Worker if (len > 0 && (*s & 0xC0) == 0x80)
685*59bfda1fSAndroid Build Coastguard Worker return -1;
686*59bfda1fSAndroid Build Coastguard Worker return 0;
687*59bfda1fSAndroid Build Coastguard Worker }
688*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Prepare an utf8cursor so that utf8byte() can walk a string whose
 * end is marked by a NUL byte rather than by an explicit length.
 *
 * u8c  : cursor to initialize.
 * data : const struct utf8data to use for normalization.
 * s    : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with the largest unsigned int value as the
 * length and returns its result: -1 on error, 0 on success.
 */
static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
		      const char *s)
{
	const unsigned int max_len = UINT_MAX;

	return utf8ncursor(u8c, data, s, max_len);
}
#endif
705*59bfda1fSAndroid Build Coastguard Worker
706*59bfda1fSAndroid Build Coastguard Worker /*
707*59bfda1fSAndroid Build Coastguard Worker * Get one byte from the normalized form of the string described by u8c.
708*59bfda1fSAndroid Build Coastguard Worker *
709*59bfda1fSAndroid Build Coastguard Worker * Returns the byte cast to an unsigned char on succes, and -1 on failure.
710*59bfda1fSAndroid Build Coastguard Worker *
711*59bfda1fSAndroid Build Coastguard Worker * The cursor keeps track of the location in the string in u8c->s.
712*59bfda1fSAndroid Build Coastguard Worker * When a character is decomposed, the current location is stored in
713*59bfda1fSAndroid Build Coastguard Worker * u8c->p, and u8c->s is set to the start of the decomposition. Note
714*59bfda1fSAndroid Build Coastguard Worker * that bytes from a decomposition do not count against u8c->len.
715*59bfda1fSAndroid Build Coastguard Worker *
716*59bfda1fSAndroid Build Coastguard Worker * Characters are emitted if they match the current CCC in u8c->ccc.
717*59bfda1fSAndroid Build Coastguard Worker * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
718*59bfda1fSAndroid Build Coastguard Worker * and the function returns 0 in that case.
719*59bfda1fSAndroid Build Coastguard Worker *
720*59bfda1fSAndroid Build Coastguard Worker * Sorting by CCC is done by repeatedly scanning the string. The
721*59bfda1fSAndroid Build Coastguard Worker * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
722*59bfda1fSAndroid Build Coastguard Worker * the start of the scan. The first pass finds the lowest CCC to be
723*59bfda1fSAndroid Build Coastguard Worker * emitted and stores it in u8c->nccc, the second pass emits the
724*59bfda1fSAndroid Build Coastguard Worker * characters with this CCC and finds the next lowest CCC. This limits
725*59bfda1fSAndroid Build Coastguard Worker * the number of passes to 1 + the number of different CCCs in the
726*59bfda1fSAndroid Build Coastguard Worker * sequence being scanned.
727*59bfda1fSAndroid Build Coastguard Worker *
728*59bfda1fSAndroid Build Coastguard Worker * Therefore:
729*59bfda1fSAndroid Build Coastguard Worker * u8c->p != NULL -> a decomposition is being scanned.
730*59bfda1fSAndroid Build Coastguard Worker * u8c->ss != NULL -> this is a repeating scan.
731*59bfda1fSAndroid Build Coastguard Worker * u8c->ccc == -1 -> this is the first scan of a repeating scan.
732*59bfda1fSAndroid Build Coastguard Worker */
utf8byte(struct utf8cursor * u8c)733*59bfda1fSAndroid Build Coastguard Worker static int utf8byte(struct utf8cursor *u8c)
734*59bfda1fSAndroid Build Coastguard Worker {
735*59bfda1fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
736*59bfda1fSAndroid Build Coastguard Worker int ccc;
737*59bfda1fSAndroid Build Coastguard Worker
738*59bfda1fSAndroid Build Coastguard Worker for (;;) {
739*59bfda1fSAndroid Build Coastguard Worker /* Check for the end of a decomposed character. */
740*59bfda1fSAndroid Build Coastguard Worker if (u8c->p && *u8c->s == '\0') {
741*59bfda1fSAndroid Build Coastguard Worker u8c->s = u8c->p;
742*59bfda1fSAndroid Build Coastguard Worker u8c->p = NULL;
743*59bfda1fSAndroid Build Coastguard Worker }
744*59bfda1fSAndroid Build Coastguard Worker
745*59bfda1fSAndroid Build Coastguard Worker /* Check for end-of-string. */
746*59bfda1fSAndroid Build Coastguard Worker if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
747*59bfda1fSAndroid Build Coastguard Worker /* There is no next byte. */
748*59bfda1fSAndroid Build Coastguard Worker if (u8c->ccc == STOPPER)
749*59bfda1fSAndroid Build Coastguard Worker return 0;
750*59bfda1fSAndroid Build Coastguard Worker /* End-of-string during a scan counts as a stopper. */
751*59bfda1fSAndroid Build Coastguard Worker ccc = STOPPER;
752*59bfda1fSAndroid Build Coastguard Worker goto ccc_mismatch;
753*59bfda1fSAndroid Build Coastguard Worker } else if ((*u8c->s & 0xC0) == 0x80) {
754*59bfda1fSAndroid Build Coastguard Worker /* This is a continuation of the current character. */
755*59bfda1fSAndroid Build Coastguard Worker if (!u8c->p)
756*59bfda1fSAndroid Build Coastguard Worker u8c->len--;
757*59bfda1fSAndroid Build Coastguard Worker return (unsigned char)*u8c->s++;
758*59bfda1fSAndroid Build Coastguard Worker }
759*59bfda1fSAndroid Build Coastguard Worker
760*59bfda1fSAndroid Build Coastguard Worker /* Look up the data for the current character. */
761*59bfda1fSAndroid Build Coastguard Worker if (u8c->p) {
762*59bfda1fSAndroid Build Coastguard Worker leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
763*59bfda1fSAndroid Build Coastguard Worker } else {
764*59bfda1fSAndroid Build Coastguard Worker leaf = utf8nlookup(u8c->data, u8c->hangul,
765*59bfda1fSAndroid Build Coastguard Worker u8c->s, u8c->len);
766*59bfda1fSAndroid Build Coastguard Worker }
767*59bfda1fSAndroid Build Coastguard Worker
768*59bfda1fSAndroid Build Coastguard Worker /* No leaf found implies that the input is a binary blob. */
769*59bfda1fSAndroid Build Coastguard Worker if (!leaf)
770*59bfda1fSAndroid Build Coastguard Worker return -1;
771*59bfda1fSAndroid Build Coastguard Worker
772*59bfda1fSAndroid Build Coastguard Worker ccc = LEAF_CCC(leaf);
773*59bfda1fSAndroid Build Coastguard Worker /* Characters that are too new have CCC 0. */
774*59bfda1fSAndroid Build Coastguard Worker if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
775*59bfda1fSAndroid Build Coastguard Worker ccc = STOPPER;
776*59bfda1fSAndroid Build Coastguard Worker } else if (ccc == DECOMPOSE) {
777*59bfda1fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
778*59bfda1fSAndroid Build Coastguard Worker u8c->p = u8c->s + utf8clen(u8c->s);
779*59bfda1fSAndroid Build Coastguard Worker u8c->s = LEAF_STR(leaf);
780*59bfda1fSAndroid Build Coastguard Worker /* Empty decomposition implies CCC 0. */
781*59bfda1fSAndroid Build Coastguard Worker if (*u8c->s == '\0') {
782*59bfda1fSAndroid Build Coastguard Worker if (u8c->ccc == STOPPER)
783*59bfda1fSAndroid Build Coastguard Worker continue;
784*59bfda1fSAndroid Build Coastguard Worker ccc = STOPPER;
785*59bfda1fSAndroid Build Coastguard Worker goto ccc_mismatch;
786*59bfda1fSAndroid Build Coastguard Worker }
787*59bfda1fSAndroid Build Coastguard Worker
788*59bfda1fSAndroid Build Coastguard Worker leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
789*59bfda1fSAndroid Build Coastguard Worker if (!leaf)
790*59bfda1fSAndroid Build Coastguard Worker return -1;
791*59bfda1fSAndroid Build Coastguard Worker ccc = LEAF_CCC(leaf);
792*59bfda1fSAndroid Build Coastguard Worker }
793*59bfda1fSAndroid Build Coastguard Worker
794*59bfda1fSAndroid Build Coastguard Worker /*
795*59bfda1fSAndroid Build Coastguard Worker * If this is not a stopper, then see if it updates
796*59bfda1fSAndroid Build Coastguard Worker * the next canonical class to be emitted.
797*59bfda1fSAndroid Build Coastguard Worker */
798*59bfda1fSAndroid Build Coastguard Worker if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
799*59bfda1fSAndroid Build Coastguard Worker u8c->nccc = ccc;
800*59bfda1fSAndroid Build Coastguard Worker
801*59bfda1fSAndroid Build Coastguard Worker /*
802*59bfda1fSAndroid Build Coastguard Worker * Return the current byte if this is the current
803*59bfda1fSAndroid Build Coastguard Worker * combining class.
804*59bfda1fSAndroid Build Coastguard Worker */
805*59bfda1fSAndroid Build Coastguard Worker if (ccc == u8c->ccc) {
806*59bfda1fSAndroid Build Coastguard Worker if (!u8c->p)
807*59bfda1fSAndroid Build Coastguard Worker u8c->len--;
808*59bfda1fSAndroid Build Coastguard Worker return (unsigned char)*u8c->s++;
809*59bfda1fSAndroid Build Coastguard Worker }
810*59bfda1fSAndroid Build Coastguard Worker
811*59bfda1fSAndroid Build Coastguard Worker /* Current combining class mismatch. */
812*59bfda1fSAndroid Build Coastguard Worker ccc_mismatch:
813*59bfda1fSAndroid Build Coastguard Worker if (u8c->nccc == STOPPER) {
814*59bfda1fSAndroid Build Coastguard Worker /*
815*59bfda1fSAndroid Build Coastguard Worker * Scan forward for the first canonical class
816*59bfda1fSAndroid Build Coastguard Worker * to be emitted. Save the position from
817*59bfda1fSAndroid Build Coastguard Worker * which to restart.
818*59bfda1fSAndroid Build Coastguard Worker */
819*59bfda1fSAndroid Build Coastguard Worker u8c->ccc = MINCCC - 1;
820*59bfda1fSAndroid Build Coastguard Worker u8c->nccc = ccc;
821*59bfda1fSAndroid Build Coastguard Worker u8c->sp = u8c->p;
822*59bfda1fSAndroid Build Coastguard Worker u8c->ss = u8c->s;
823*59bfda1fSAndroid Build Coastguard Worker u8c->slen = u8c->len;
824*59bfda1fSAndroid Build Coastguard Worker if (!u8c->p)
825*59bfda1fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
826*59bfda1fSAndroid Build Coastguard Worker u8c->s += utf8clen(u8c->s);
827*59bfda1fSAndroid Build Coastguard Worker } else if (ccc != STOPPER) {
828*59bfda1fSAndroid Build Coastguard Worker /* Not a stopper, and not the ccc we're emitting. */
829*59bfda1fSAndroid Build Coastguard Worker if (!u8c->p)
830*59bfda1fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
831*59bfda1fSAndroid Build Coastguard Worker u8c->s += utf8clen(u8c->s);
832*59bfda1fSAndroid Build Coastguard Worker } else if (u8c->nccc != MAXCCC + 1) {
833*59bfda1fSAndroid Build Coastguard Worker /* At a stopper, restart for next ccc. */
834*59bfda1fSAndroid Build Coastguard Worker u8c->ccc = u8c->nccc;
835*59bfda1fSAndroid Build Coastguard Worker u8c->nccc = MAXCCC + 1;
836*59bfda1fSAndroid Build Coastguard Worker u8c->s = u8c->ss;
837*59bfda1fSAndroid Build Coastguard Worker u8c->p = u8c->sp;
838*59bfda1fSAndroid Build Coastguard Worker u8c->len = u8c->slen;
839*59bfda1fSAndroid Build Coastguard Worker } else {
840*59bfda1fSAndroid Build Coastguard Worker /* All done, proceed from here. */
841*59bfda1fSAndroid Build Coastguard Worker u8c->ccc = STOPPER;
842*59bfda1fSAndroid Build Coastguard Worker u8c->nccc = STOPPER;
843*59bfda1fSAndroid Build Coastguard Worker u8c->sp = NULL;
844*59bfda1fSAndroid Build Coastguard Worker u8c->ss = NULL;
845*59bfda1fSAndroid Build Coastguard Worker u8c->slen = 0;
846*59bfda1fSAndroid Build Coastguard Worker }
847*59bfda1fSAndroid Build Coastguard Worker }
848*59bfda1fSAndroid Build Coastguard Worker }
849*59bfda1fSAndroid Build Coastguard Worker
#if 0
/*
 * Look for the correct const struct utf8data for a unicode version.
 * Returns NULL if the version requested is too new, or more generally
 * if no entry of utf8nfdidata has exactly the requested maxage.
 *
 * Two normalization forms are supported: nfdi and nfdicf.
 *
 * nfdi:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *
 * nfdicf:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *  - Apply a full casefold (C + F).
 */
static const struct utf8data *utf8nfdi(unsigned int maxage)
{
	size_t idx = ARRAY_SIZE(utf8nfdidata);

	/*
	 * Walk from the last entry towards the first and stop at the first
	 * one that is not newer than the request. The walk is bounded by
	 * the table size so it cannot run off the front of the array when
	 * every entry is newer than maxage.
	 */
	while (idx-- > 0) {
		const struct utf8data *entry = &utf8nfdidata[idx];

		if (maxage < entry->maxage)
			continue;
		/* Only an exact version match is accepted. */
		return maxage == entry->maxage ? entry : NULL;
	}
	return NULL;
}
#endif
877*59bfda1fSAndroid Build Coastguard Worker
utf8nfdicf(unsigned int maxage)878*59bfda1fSAndroid Build Coastguard Worker static const struct utf8data *utf8nfdicf(unsigned int maxage)
879*59bfda1fSAndroid Build Coastguard Worker {
880*59bfda1fSAndroid Build Coastguard Worker int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
881*59bfda1fSAndroid Build Coastguard Worker
882*59bfda1fSAndroid Build Coastguard Worker while (maxage < utf8nfdicfdata[i].maxage)
883*59bfda1fSAndroid Build Coastguard Worker i--;
884*59bfda1fSAndroid Build Coastguard Worker if (maxage > utf8nfdicfdata[i].maxage)
885*59bfda1fSAndroid Build Coastguard Worker return NULL;
886*59bfda1fSAndroid Build Coastguard Worker return &utf8nfdicfdata[i];
887*59bfda1fSAndroid Build Coastguard Worker }
888*59bfda1fSAndroid Build Coastguard Worker
utf8_casefold(const struct f2fs_nls_table * table,const unsigned char * str,size_t len,unsigned char * dest,size_t dlen)889*59bfda1fSAndroid Build Coastguard Worker static int utf8_casefold(const struct f2fs_nls_table *table,
890*59bfda1fSAndroid Build Coastguard Worker const unsigned char *str, size_t len,
891*59bfda1fSAndroid Build Coastguard Worker unsigned char *dest, size_t dlen)
892*59bfda1fSAndroid Build Coastguard Worker {
893*59bfda1fSAndroid Build Coastguard Worker const struct utf8data *data = utf8nfdicf(table->version);
894*59bfda1fSAndroid Build Coastguard Worker struct utf8cursor cur;
895*59bfda1fSAndroid Build Coastguard Worker size_t nlen = 0;
896*59bfda1fSAndroid Build Coastguard Worker
897*59bfda1fSAndroid Build Coastguard Worker if (utf8ncursor(&cur, data, (const char *) str, len) < 0)
898*59bfda1fSAndroid Build Coastguard Worker goto invalid_seq;
899*59bfda1fSAndroid Build Coastguard Worker
900*59bfda1fSAndroid Build Coastguard Worker for (nlen = 0; nlen < dlen; nlen++) {
901*59bfda1fSAndroid Build Coastguard Worker int c = utf8byte(&cur);
902*59bfda1fSAndroid Build Coastguard Worker
903*59bfda1fSAndroid Build Coastguard Worker dest[nlen] = c;
904*59bfda1fSAndroid Build Coastguard Worker if (!c)
905*59bfda1fSAndroid Build Coastguard Worker return nlen;
906*59bfda1fSAndroid Build Coastguard Worker if (c == -1)
907*59bfda1fSAndroid Build Coastguard Worker break;
908*59bfda1fSAndroid Build Coastguard Worker }
909*59bfda1fSAndroid Build Coastguard Worker
910*59bfda1fSAndroid Build Coastguard Worker return -ENAMETOOLONG;
911*59bfda1fSAndroid Build Coastguard Worker
912*59bfda1fSAndroid Build Coastguard Worker invalid_seq:
913*59bfda1fSAndroid Build Coastguard Worker if (dlen < len)
914*59bfda1fSAndroid Build Coastguard Worker return -ENAMETOOLONG;
915*59bfda1fSAndroid Build Coastguard Worker
916*59bfda1fSAndroid Build Coastguard Worker /* Signal invalid sequence */
917*59bfda1fSAndroid Build Coastguard Worker return -EINVAL;
918*59bfda1fSAndroid Build Coastguard Worker }
919*59bfda1fSAndroid Build Coastguard Worker
920*59bfda1fSAndroid Build Coastguard Worker static const struct f2fs_nls_ops utf8_ops = {
921*59bfda1fSAndroid Build Coastguard Worker .casefold = utf8_casefold,
922*59bfda1fSAndroid Build Coastguard Worker };
923*59bfda1fSAndroid Build Coastguard Worker
924*59bfda1fSAndroid Build Coastguard Worker static const struct f2fs_nls_table nls_utf8 = {
925*59bfda1fSAndroid Build Coastguard Worker .ops = &utf8_ops,
926*59bfda1fSAndroid Build Coastguard Worker .version = UNICODE_AGE(12, 1, 0),
927*59bfda1fSAndroid Build Coastguard Worker };
928*59bfda1fSAndroid Build Coastguard Worker
f2fs_load_nls_table(int encoding)929*59bfda1fSAndroid Build Coastguard Worker const struct f2fs_nls_table *f2fs_load_nls_table(int encoding)
930*59bfda1fSAndroid Build Coastguard Worker {
931*59bfda1fSAndroid Build Coastguard Worker if (encoding == F2FS_ENC_UTF8_12_1)
932*59bfda1fSAndroid Build Coastguard Worker return &nls_utf8;
933*59bfda1fSAndroid Build Coastguard Worker
934*59bfda1fSAndroid Build Coastguard Worker return NULL;
935*59bfda1fSAndroid Build Coastguard Worker }
936