xref: /aosp_15_r20/external/f2fs-tools/lib/nls_utf8.c (revision 59bfda1f02d633cd6b8b69f31eee485d40f6eef6)
1*59bfda1fSAndroid Build Coastguard Worker /*
2*59bfda1fSAndroid Build Coastguard Worker  * Copyright (c) 2014 SGI.
3*59bfda1fSAndroid Build Coastguard Worker  * Copyright (c) 2018 Collabora Ltd.
4*59bfda1fSAndroid Build Coastguard Worker  * All rights reserved.
5*59bfda1fSAndroid Build Coastguard Worker  *
6*59bfda1fSAndroid Build Coastguard Worker  * This program is free software; you can redistribute it and/or
7*59bfda1fSAndroid Build Coastguard Worker  * modify it under the terms of the GNU General Public License as
8*59bfda1fSAndroid Build Coastguard Worker  * published by the Free Software Foundation.
9*59bfda1fSAndroid Build Coastguard Worker  *
10*59bfda1fSAndroid Build Coastguard Worker  * This program is distributed in the hope that it would be useful,
11*59bfda1fSAndroid Build Coastguard Worker  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12*59bfda1fSAndroid Build Coastguard Worker  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13*59bfda1fSAndroid Build Coastguard Worker  * GNU General Public License for more details.
14*59bfda1fSAndroid Build Coastguard Worker  *
15*59bfda1fSAndroid Build Coastguard Worker  */
16*59bfda1fSAndroid Build Coastguard Worker 
17*59bfda1fSAndroid Build Coastguard Worker /*
18*59bfda1fSAndroid Build Coastguard Worker  * This code is adapted from the Linux Kernel.  We have a
19*59bfda1fSAndroid Build Coastguard Worker  * userspace version here such that the hashes will match that
20*59bfda1fSAndroid Build Coastguard Worker  * implementation.
21*59bfda1fSAndroid Build Coastguard Worker  */
22*59bfda1fSAndroid Build Coastguard Worker 
23*59bfda1fSAndroid Build Coastguard Worker #include <stdint.h>
24*59bfda1fSAndroid Build Coastguard Worker #include <unistd.h>
25*59bfda1fSAndroid Build Coastguard Worker #include <string.h>
26*59bfda1fSAndroid Build Coastguard Worker #include <limits.h>
27*59bfda1fSAndroid Build Coastguard Worker #include <errno.h>
28*59bfda1fSAndroid Build Coastguard Worker 
29*59bfda1fSAndroid Build Coastguard Worker #include <f2fs_fs.h>
30*59bfda1fSAndroid Build Coastguard Worker 
/*
 * A Unicode version triple (major, minor, revision) is packed into one
 * unsigned int: the major number is shifted up by UNICODE_MAJ_SHIFT, the
 * minor number by UNICODE_MIN_SHIFT, and the revision is left unshifted.
 */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

#define UNICODE_AGE(MAJ, MIN, REV)				\
	(((unsigned int)(REV)) |				\
	 (((unsigned int)(MIN)) << UNICODE_MIN_SHIFT) |		\
	 (((unsigned int)(MAJ)) << UNICODE_MAJ_SHIFT))
39*59bfda1fSAndroid Build Coastguard Worker 
/* Byte size of the hangul[] member of struct utf8cursor. */
#define UTF8HANGULLEAF	(12)

/*
 * State kept by the normalizer while it walks a string.
 *
 * NOTE(review): the routines that fill in and consume these fields are
 * not part of this section of the file; the field roles should be
 * confirmed against them before relying on any description here.
 */
struct utf8cursor {
	const struct utf8data	*data;
	const char		*s;
	const char		*p;
	const char		*ss;
	const char		*sp;
	unsigned int		len;
	unsigned int		slen;
	short			ccc;
	short			nccc;
	/* UTF8HANGULLEAF bytes, the size of a synthesized Hangul leaf. */
	unsigned char		hangul[UTF8HANGULLEAF];
};
58*59bfda1fSAndroid Build Coastguard Worker 
59*59bfda1fSAndroid Build Coastguard Worker /*
60*59bfda1fSAndroid Build Coastguard Worker  * Initialize a utf8cursor to normalize a string.
61*59bfda1fSAndroid Build Coastguard Worker  * Returns 0 on success.
62*59bfda1fSAndroid Build Coastguard Worker  * Returns -1 on failure.
63*59bfda1fSAndroid Build Coastguard Worker  */
64*59bfda1fSAndroid Build Coastguard Worker // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
65*59bfda1fSAndroid Build Coastguard Worker //		      const char *s);
66*59bfda1fSAndroid Build Coastguard Worker // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
67*59bfda1fSAndroid Build Coastguard Worker //		       const char *s, size_t len);
68*59bfda1fSAndroid Build Coastguard Worker 
69*59bfda1fSAndroid Build Coastguard Worker /*
70*59bfda1fSAndroid Build Coastguard Worker  * Get the next byte in the normalization.
71*59bfda1fSAndroid Build Coastguard Worker  * Returns a value > 0 && < 256 on success.
72*59bfda1fSAndroid Build Coastguard Worker  * Returns 0 when the end of the normalization is reached.
73*59bfda1fSAndroid Build Coastguard Worker  * Returns -1 if the string being normalized is not valid UTF-8.
74*59bfda1fSAndroid Build Coastguard Worker  */
75*59bfda1fSAndroid Build Coastguard Worker // extern int utf8byte(struct utf8cursor *u8c);
76*59bfda1fSAndroid Build Coastguard Worker 
77*59bfda1fSAndroid Build Coastguard Worker 
/*
 * Selects one trie stored inside the utf8data[] byte array.
 */
struct utf8data {
	/* NOTE(review): presumably a UNICODE_AGE() value; confirm with users. */
	unsigned int	maxage;
	/* Added to utf8data[] to locate the first node of the trie. */
	unsigned int	offset;
};
82*59bfda1fSAndroid Build Coastguard Worker 
83*59bfda1fSAndroid Build Coastguard Worker #define __INCLUDED_FROM_UTF8NORM_C__
84*59bfda1fSAndroid Build Coastguard Worker #include "utf8data.h"
85*59bfda1fSAndroid Build Coastguard Worker #undef __INCLUDED_FROM_UTF8NORM_C__
86*59bfda1fSAndroid Build Coastguard Worker 
/*
 * Element count of a true array.  Do not pass a pointer (or an array
 * function parameter, which is a pointer): the result would be
 * meaningless.  The argument is used unparenthesized in the element
 * expression, so pass a plain array name.
 */
#define ARRAY_SIZE(array)	(sizeof(array) / sizeof(array[0]))
89*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Report whether the Unicode version maj.min.rev appears in utf8agetab[].
 *
 * The table is scanned from its last entry towards the first, and the
 * scan stops at the first zero entry.
 * Returns 1 if the version is found, 0 otherwise.
 */
static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
{
	const unsigned int wanted = UNICODE_AGE(maj, min, rev);
	int idx;

	for (idx = ARRAY_SIZE(utf8agetab) - 1; idx >= 0; idx--) {
		if (utf8agetab[idx] == 0)
			break;
		if (utf8agetab[idx] == wanted)
			return 1;
	}
	return 0;
}
#endif
105*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Return utf8vers, the version value provided alongside the data tables.
 */
static int utf8version_latest(void)
{
	return utf8vers;
}
#endif
112*59bfda1fSAndroid Build Coastguard Worker 
113*59bfda1fSAndroid Build Coastguard Worker /*
114*59bfda1fSAndroid Build Coastguard Worker  * UTF-8 valid ranges.
115*59bfda1fSAndroid Build Coastguard Worker  *
116*59bfda1fSAndroid Build Coastguard Worker  * The UTF-8 encoding spreads the bits of a 32bit word over several
117*59bfda1fSAndroid Build Coastguard Worker  * bytes. This table gives the ranges that can be held and how they'd
118*59bfda1fSAndroid Build Coastguard Worker  * be represented.
119*59bfda1fSAndroid Build Coastguard Worker  *
120*59bfda1fSAndroid Build Coastguard Worker  * 0x00000000 0x0000007F: 0xxxxxxx
121*59bfda1fSAndroid Build Coastguard Worker  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
122*59bfda1fSAndroid Build Coastguard Worker  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
123*59bfda1fSAndroid Build Coastguard Worker  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
124*59bfda1fSAndroid Build Coastguard Worker  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
125*59bfda1fSAndroid Build Coastguard Worker  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
126*59bfda1fSAndroid Build Coastguard Worker  *
127*59bfda1fSAndroid Build Coastguard Worker  * There is an additional requirement on UTF-8, in that only the
128*59bfda1fSAndroid Build Coastguard Worker  * shortest representation of a 32bit value is to be used.  A decoder
129*59bfda1fSAndroid Build Coastguard Worker  * must not decode sequences that do not satisfy this requirement.
130*59bfda1fSAndroid Build Coastguard Worker  * Thus the allowed ranges have a lower bound.
131*59bfda1fSAndroid Build Coastguard Worker  *
132*59bfda1fSAndroid Build Coastguard Worker  * 0x00000000 0x0000007F: 0xxxxxxx
133*59bfda1fSAndroid Build Coastguard Worker  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
134*59bfda1fSAndroid Build Coastguard Worker  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
135*59bfda1fSAndroid Build Coastguard Worker  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
136*59bfda1fSAndroid Build Coastguard Worker  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
137*59bfda1fSAndroid Build Coastguard Worker  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
138*59bfda1fSAndroid Build Coastguard Worker  *
139*59bfda1fSAndroid Build Coastguard Worker  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
140*59bfda1fSAndroid Build Coastguard Worker  * 17 planes of 65536 values.  This limits the sequences actually seen
141*59bfda1fSAndroid Build Coastguard Worker  * even more, to just the following.
142*59bfda1fSAndroid Build Coastguard Worker  *
143*59bfda1fSAndroid Build Coastguard Worker  *          0 -     0x7F: 0                   - 0x7F
144*59bfda1fSAndroid Build Coastguard Worker  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
145*59bfda1fSAndroid Build Coastguard Worker  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
146*59bfda1fSAndroid Build Coastguard Worker  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
147*59bfda1fSAndroid Build Coastguard Worker  *
148*59bfda1fSAndroid Build Coastguard Worker  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
149*59bfda1fSAndroid Build Coastguard Worker  *
150*59bfda1fSAndroid Build Coastguard Worker  * Note that the longest sequence seen with valid usage is 4 bytes,
151*59bfda1fSAndroid Build Coastguard Worker  * the same as a single UTF-32 character.  This makes the UTF-8
152*59bfda1fSAndroid Build Coastguard Worker  * representation of Unicode strictly smaller than UTF-32.
153*59bfda1fSAndroid Build Coastguard Worker  *
154*59bfda1fSAndroid Build Coastguard Worker  * The shortest sequence requirement was introduced by:
155*59bfda1fSAndroid Build Coastguard Worker  *    Corrigendum #1: UTF-8 Shortest Form
156*59bfda1fSAndroid Build Coastguard Worker  * It can be found here:
157*59bfda1fSAndroid Build Coastguard Worker  *    http://www.unicode.org/versions/corrigendum1.html
158*59bfda1fSAndroid Build Coastguard Worker  *
159*59bfda1fSAndroid Build Coastguard Worker  */
160*59bfda1fSAndroid Build Coastguard Worker 
/*
 * Length in bytes of the UTF-8 sequence that starts at s.
 *
 * Only the lead byte is examined, so s must point at the first byte of
 * a valid sequence:
 *   below 0xC0   -> 1
 *   0xC0 - 0xDF  -> 2
 *   0xE0 - 0xEF  -> 3
 *   0xF0 and up  -> 4
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)*s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
172*59bfda1fSAndroid Build Coastguard Worker 
/*
 * Decode the 3-byte UTF-8 sequence at str into its code point.
 *
 * The low 4 bits of the first byte and the low 6 bits of each of the
 * next two bytes are concatenated, most significant first.  The bytes
 * are not validated.
 */
static unsigned int
utf8decode3(const char *str)
{
	const unsigned int	lead = (unsigned int)(str[0] & 0x0F);
	const unsigned int	mid = (unsigned int)(str[1] & 0x3F);
	const unsigned int	tail = (unsigned int)(str[2] & 0x3F);

	return (lead << 12) | (mid << 6) | tail;
}
189*59bfda1fSAndroid Build Coastguard Worker 
/*
 * Write val at str as a 3-byte UTF-8 sequence (no NUL terminator).
 *
 * str must have room for three bytes.  val is not range-checked; any
 * bits above the low 12 are OR-ed into the lead byte as they are.
 * Returns the number of bytes written, which is always 3.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	const unsigned int	tail = val & 0x3F;
	const unsigned int	mid = (val >> 6) & 0x3F;
	const unsigned int	lead = val >> 12;

	str[0] = lead | 0xE0;
	str[1] = mid | 0x80;
	str[2] = tail | 0x80;

	return 3;
}
204*59bfda1fSAndroid Build Coastguard Worker 
205*59bfda1fSAndroid Build Coastguard Worker /*
206*59bfda1fSAndroid Build Coastguard Worker  * utf8trie_t
207*59bfda1fSAndroid Build Coastguard Worker  *
208*59bfda1fSAndroid Build Coastguard Worker  * A compact binary tree, used to decode UTF-8 characters.
209*59bfda1fSAndroid Build Coastguard Worker  *
210*59bfda1fSAndroid Build Coastguard Worker  * Internal nodes are one byte for the node itself, and up to three
211*59bfda1fSAndroid Build Coastguard Worker  * bytes for an offset into the tree.  The first byte contains the
212*59bfda1fSAndroid Build Coastguard Worker  * following information:
213*59bfda1fSAndroid Build Coastguard Worker  *  NEXTBYTE  - flag        - advance to next byte if set
214*59bfda1fSAndroid Build Coastguard Worker  *  BITNUM    - 3 bit field - the bit number to be tested
215*59bfda1fSAndroid Build Coastguard Worker  *  OFFLEN    - 2 bit field - number of bytes in the offset
216*59bfda1fSAndroid Build Coastguard Worker  * if offlen == 0 (non-branching node)
217*59bfda1fSAndroid Build Coastguard Worker  *  RIGHTPATH - 1 bit field - set if the following node is for the
218*59bfda1fSAndroid Build Coastguard Worker  *                            right-hand path (tested bit is set)
219*59bfda1fSAndroid Build Coastguard Worker  *  TRIENODE  - 1 bit field - set if the following node is an internal
220*59bfda1fSAndroid Build Coastguard Worker  *                            node, otherwise it is a leaf node
221*59bfda1fSAndroid Build Coastguard Worker  * if offlen != 0 (branching node)
222*59bfda1fSAndroid Build Coastguard Worker  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
223*59bfda1fSAndroid Build Coastguard Worker  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
224*59bfda1fSAndroid Build Coastguard Worker  *
225*59bfda1fSAndroid Build Coastguard Worker  * Due to the way utf8 works, there cannot be branching nodes with
226*59bfda1fSAndroid Build Coastguard Worker  * NEXTBYTE set, and moreover those nodes always have a righthand
227*59bfda1fSAndroid Build Coastguard Worker  * descendant.
228*59bfda1fSAndroid Build Coastguard Worker  */
typedef const unsigned char utf8trie_t;

/* Fields present in the first byte of every internal node. */
#define BITNUM		0x07	/* number of the input bit to test */
#define NEXTBYTE	0x08	/* move to the next input byte first */
#define OFFLEN		0x30	/* count of offset bytes that follow */
#define OFFLEN_SHIFT	4

/* Flags that apply when the OFFLEN field is zero (non-branching node). */
#define RIGHTPATH	0x40	/* the following node is the right-hand one */
#define TRIENODE	0x80	/* the following node is internal, not a leaf */

/* Flags that apply when the OFFLEN field is non-zero (branching node). */
#define RIGHTNODE	0x40	/* the right-hand node is internal */
#define LEFTNODE	0x80	/* the left-hand node is internal */
238*59bfda1fSAndroid Build Coastguard Worker 
239*59bfda1fSAndroid Build Coastguard Worker /*
240*59bfda1fSAndroid Build Coastguard Worker  * utf8leaf_t
241*59bfda1fSAndroid Build Coastguard Worker  *
242*59bfda1fSAndroid Build Coastguard Worker  * The leaves of the trie are embedded in the trie, and so the same
243*59bfda1fSAndroid Build Coastguard Worker  * underlying datatype: unsigned char.
244*59bfda1fSAndroid Build Coastguard Worker  *
245*59bfda1fSAndroid Build Coastguard Worker  * leaf[0]: The unicode version, stored as a generation number that is
246*59bfda1fSAndroid Build Coastguard Worker  *          an index into utf8agetab[].  With this we can filter code
247*59bfda1fSAndroid Build Coastguard Worker  *          points based on the unicode version in which they were
248*59bfda1fSAndroid Build Coastguard Worker  *          defined.  The CCC of a non-defined code point is 0.
249*59bfda1fSAndroid Build Coastguard Worker  * leaf[1]: Canonical Combining Class. During normalization, we need
250*59bfda1fSAndroid Build Coastguard Worker  *          to do a stable sort into ascending order of all characters
251*59bfda1fSAndroid Build Coastguard Worker  *          with a non-zero CCC that occur between two characters with
252*59bfda1fSAndroid Build Coastguard Worker  *          a CCC of 0, or at the beginning or end of a string.
253*59bfda1fSAndroid Build Coastguard Worker  *          The unicode standard guarantees that all CCC values are
254*59bfda1fSAndroid Build Coastguard Worker  *          between 0 and 254 inclusive, which leaves 255 available as
255*59bfda1fSAndroid Build Coastguard Worker  *          a special value.
256*59bfda1fSAndroid Build Coastguard Worker  *          Code points with CCC 0 are known as stoppers.
257*59bfda1fSAndroid Build Coastguard Worker  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
258*59bfda1fSAndroid Build Coastguard Worker  *          start of a NUL-terminated string that is the decomposition
259*59bfda1fSAndroid Build Coastguard Worker  *          of the character.
260*59bfda1fSAndroid Build Coastguard Worker  *          The CCC of a decomposable character is the same as the CCC
261*59bfda1fSAndroid Build Coastguard Worker  *          of the first character of its decomposition.
262*59bfda1fSAndroid Build Coastguard Worker  *          Some characters decompose as the empty string: these are
263*59bfda1fSAndroid Build Coastguard Worker  *          characters with the Default_Ignorable_Code_Point property.
264*59bfda1fSAndroid Build Coastguard Worker  *          These do affect normalization, as they all have CCC 0.
265*59bfda1fSAndroid Build Coastguard Worker  *
266*59bfda1fSAndroid Build Coastguard Worker  * The decompositions in the trie have been fully expanded, with the
267*59bfda1fSAndroid Build Coastguard Worker  * exception of Hangul syllables, which are decomposed algorithmically.
268*59bfda1fSAndroid Build Coastguard Worker  *
269*59bfda1fSAndroid Build Coastguard Worker  * Casefolding, if applicable, is also done using decompositions.
270*59bfda1fSAndroid Build Coastguard Worker  *
271*59bfda1fSAndroid Build Coastguard Worker  * The trie is constructed in such a way that leaves exist for all
272*59bfda1fSAndroid Build Coastguard Worker  * UTF-8 sequences that match the criteria from the "UTF-8 valid
273*59bfda1fSAndroid Build Coastguard Worker  * ranges" comment above, and only for those sequences.  Therefore a
274*59bfda1fSAndroid Build Coastguard Worker  * lookup in the trie can be used to validate the UTF-8 input.
275*59bfda1fSAndroid Build Coastguard Worker  */
typedef const unsigned char utf8leaf_t;

/* Accessors for the three parts of a leaf. */
#define LEAF_GEN(LEAF)	((LEAF)[0])	/* generation number */
#define LEAF_CCC(LEAF)	((LEAF)[1])	/* canonical combining class */
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))	/* decomposition */

/* Range of real CCC values, and the two values with special meaning. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)	/* CCC 0 */
#define	DECOMPOSE	(255)	/* a decomposition string follows */

/* First decomposition byte of a leaf that stands for a Hangul syllable. */
#define HANGUL		((char)(255))
/* Byte size of the leaf synthesized for a Hangul syllable. */
#define UTF8HANGULLEAF	(12)
291*59bfda1fSAndroid Build Coastguard Worker 
292*59bfda1fSAndroid Build Coastguard Worker /*
293*59bfda1fSAndroid Build Coastguard Worker  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
294*59bfda1fSAndroid Build Coastguard Worker  *
295*59bfda1fSAndroid Build Coastguard Worker  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
296*59bfda1fSAndroid Build Coastguard Worker  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
297*59bfda1fSAndroid Build Coastguard Worker  *
298*59bfda1fSAndroid Build Coastguard Worker  * SBase = 0xAC00
299*59bfda1fSAndroid Build Coastguard Worker  * LBase = 0x1100
300*59bfda1fSAndroid Build Coastguard Worker  * VBase = 0x1161
301*59bfda1fSAndroid Build Coastguard Worker  * TBase = 0x11A7
302*59bfda1fSAndroid Build Coastguard Worker  * LCount = 19
303*59bfda1fSAndroid Build Coastguard Worker  * VCount = 21
304*59bfda1fSAndroid Build Coastguard Worker  * TCount = 28
305*59bfda1fSAndroid Build Coastguard Worker  * NCount = 588 (VCount * TCount)
306*59bfda1fSAndroid Build Coastguard Worker  * SCount = 11172 (LCount * NCount)
307*59bfda1fSAndroid Build Coastguard Worker  *
308*59bfda1fSAndroid Build Coastguard Worker  * Decomposition:
309*59bfda1fSAndroid Build Coastguard Worker  *   SIndex = s - SBase
310*59bfda1fSAndroid Build Coastguard Worker  *
311*59bfda1fSAndroid Build Coastguard Worker  * LV (Canonical/Full)
312*59bfda1fSAndroid Build Coastguard Worker  *   LIndex = SIndex / NCount
313*59bfda1fSAndroid Build Coastguard Worker  *   VIndex = (SIndex % NCount) / TCount
314*59bfda1fSAndroid Build Coastguard Worker  *   LPart = LBase + LIndex
315*59bfda1fSAndroid Build Coastguard Worker  *   VPart = VBase + VIndex
316*59bfda1fSAndroid Build Coastguard Worker  *
317*59bfda1fSAndroid Build Coastguard Worker  * LVT (Canonical)
318*59bfda1fSAndroid Build Coastguard Worker  *   LVIndex = (SIndex / TCount) * TCount
319*59bfda1fSAndroid Build Coastguard Worker  *   TIndex = (SIndex % TCount)
320*59bfda1fSAndroid Build Coastguard Worker  *   LVPart = SBase + LVIndex
321*59bfda1fSAndroid Build Coastguard Worker  *   TPart = TBase + TIndex
322*59bfda1fSAndroid Build Coastguard Worker  *
323*59bfda1fSAndroid Build Coastguard Worker  * LVT (Full)
324*59bfda1fSAndroid Build Coastguard Worker  *   LIndex = SIndex / NCount
325*59bfda1fSAndroid Build Coastguard Worker  *   VIndex = (SIndex % NCount) / TCount
326*59bfda1fSAndroid Build Coastguard Worker  *   TIndex = (SIndex % TCount)
327*59bfda1fSAndroid Build Coastguard Worker  *   LPart = LBase + LIndex
328*59bfda1fSAndroid Build Coastguard Worker  *   VPart = VBase + VIndex
329*59bfda1fSAndroid Build Coastguard Worker  *   if (TIndex == 0) {
330*59bfda1fSAndroid Build Coastguard Worker  *          d = <LPart, VPart>
331*59bfda1fSAndroid Build Coastguard Worker  *   } else {
332*59bfda1fSAndroid Build Coastguard Worker  *          TPart = TBase + TIndex
333*59bfda1fSAndroid Build Coastguard Worker  *          d = <LPart, VPart, TPart>
334*59bfda1fSAndroid Build Coastguard Worker  *   }
335*59bfda1fSAndroid Build Coastguard Worker  */
336*59bfda1fSAndroid Build Coastguard Worker 
/* Constants of the Hangul decomposition algorithm. */
#define SB	(0xAC00)	/* SBase */
#define LB	(0x1100)	/* LBase */
#define VB	(0x1161)	/* VBase */
#define TB	(0x11A7)	/* TBase */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount = 588 */
#define SC	(LC * NC)	/* SCount = 11172 */
347*59bfda1fSAndroid Build Coastguard Worker 
348*59bfda1fSAndroid Build Coastguard Worker /* Algorithmic decomposition of hangul syllable. */
349*59bfda1fSAndroid Build Coastguard Worker static utf8leaf_t *
utf8hangul(const char * str,unsigned char * hangul)350*59bfda1fSAndroid Build Coastguard Worker utf8hangul(const char *str, unsigned char *hangul)
351*59bfda1fSAndroid Build Coastguard Worker {
352*59bfda1fSAndroid Build Coastguard Worker 	unsigned int	si;
353*59bfda1fSAndroid Build Coastguard Worker 	unsigned int	li;
354*59bfda1fSAndroid Build Coastguard Worker 	unsigned int	vi;
355*59bfda1fSAndroid Build Coastguard Worker 	unsigned int	ti;
356*59bfda1fSAndroid Build Coastguard Worker 	unsigned char	*h;
357*59bfda1fSAndroid Build Coastguard Worker 
358*59bfda1fSAndroid Build Coastguard Worker 	/* Calculate the SI, LI, VI, and TI values. */
359*59bfda1fSAndroid Build Coastguard Worker 	si = utf8decode3(str) - SB;
360*59bfda1fSAndroid Build Coastguard Worker 	li = si / NC;
361*59bfda1fSAndroid Build Coastguard Worker 	vi = (si % NC) / TC;
362*59bfda1fSAndroid Build Coastguard Worker 	ti = si % TC;
363*59bfda1fSAndroid Build Coastguard Worker 
364*59bfda1fSAndroid Build Coastguard Worker 	/* Fill in base of leaf. */
365*59bfda1fSAndroid Build Coastguard Worker 	h = hangul;
366*59bfda1fSAndroid Build Coastguard Worker 	LEAF_GEN(h) = 2;
367*59bfda1fSAndroid Build Coastguard Worker 	LEAF_CCC(h) = DECOMPOSE;
368*59bfda1fSAndroid Build Coastguard Worker 	h += 2;
369*59bfda1fSAndroid Build Coastguard Worker 
370*59bfda1fSAndroid Build Coastguard Worker 	/* Add LPart, a 3-byte UTF-8 sequence. */
371*59bfda1fSAndroid Build Coastguard Worker 	h += utf8encode3((char *)h, li + LB);
372*59bfda1fSAndroid Build Coastguard Worker 
373*59bfda1fSAndroid Build Coastguard Worker 	/* Add VPart, a 3-byte UTF-8 sequence. */
374*59bfda1fSAndroid Build Coastguard Worker 	h += utf8encode3((char *)h, vi + VB);
375*59bfda1fSAndroid Build Coastguard Worker 
376*59bfda1fSAndroid Build Coastguard Worker 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
377*59bfda1fSAndroid Build Coastguard Worker 	if (ti)
378*59bfda1fSAndroid Build Coastguard Worker 		h += utf8encode3((char *)h, ti + TB);
379*59bfda1fSAndroid Build Coastguard Worker 
380*59bfda1fSAndroid Build Coastguard Worker 	/* Terminate string. */
381*59bfda1fSAndroid Build Coastguard Worker 	h[0] = '\0';
382*59bfda1fSAndroid Build Coastguard Worker 
383*59bfda1fSAndroid Build Coastguard Worker 	return hangul;
384*59bfda1fSAndroid Build Coastguard Worker }
385*59bfda1fSAndroid Build Coastguard Worker 
386*59bfda1fSAndroid Build Coastguard Worker /*
387*59bfda1fSAndroid Build Coastguard Worker  * Use trie to scan s, touching at most len bytes.
388*59bfda1fSAndroid Build Coastguard Worker  * Returns the leaf if one exists, NULL otherwise.
389*59bfda1fSAndroid Build Coastguard Worker  *
390*59bfda1fSAndroid Build Coastguard Worker  * A non-NULL return guarantees that the UTF-8 sequence starting at s
391*59bfda1fSAndroid Build Coastguard Worker  * is well-formed and corresponds to a known unicode code point.  The
392*59bfda1fSAndroid Build Coastguard Worker  * shorthand for this will be "is valid UTF-8 unicode".
393*59bfda1fSAndroid Build Coastguard Worker  */
utf8nlookup(const struct utf8data * data,unsigned char * hangul,const char * s,size_t len)394*59bfda1fSAndroid Build Coastguard Worker static utf8leaf_t *utf8nlookup(const struct utf8data *data,
395*59bfda1fSAndroid Build Coastguard Worker 			       unsigned char *hangul, const char *s, size_t len)
396*59bfda1fSAndroid Build Coastguard Worker {
397*59bfda1fSAndroid Build Coastguard Worker 	utf8trie_t	*trie;
398*59bfda1fSAndroid Build Coastguard Worker 	int		offlen;
399*59bfda1fSAndroid Build Coastguard Worker 	int		offset;
400*59bfda1fSAndroid Build Coastguard Worker 	int		mask;
401*59bfda1fSAndroid Build Coastguard Worker 	int		node;
402*59bfda1fSAndroid Build Coastguard Worker 
403*59bfda1fSAndroid Build Coastguard Worker 	if (!data)
404*59bfda1fSAndroid Build Coastguard Worker 		return NULL;
405*59bfda1fSAndroid Build Coastguard Worker 	if (len == 0)
406*59bfda1fSAndroid Build Coastguard Worker 		return NULL;
407*59bfda1fSAndroid Build Coastguard Worker 
408*59bfda1fSAndroid Build Coastguard Worker 	trie = utf8data + data->offset;
409*59bfda1fSAndroid Build Coastguard Worker 	node = 1;
410*59bfda1fSAndroid Build Coastguard Worker 	while (node) {
411*59bfda1fSAndroid Build Coastguard Worker 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
412*59bfda1fSAndroid Build Coastguard Worker 		if (*trie & NEXTBYTE) {
413*59bfda1fSAndroid Build Coastguard Worker 			if (--len == 0)
414*59bfda1fSAndroid Build Coastguard Worker 				return NULL;
415*59bfda1fSAndroid Build Coastguard Worker 			s++;
416*59bfda1fSAndroid Build Coastguard Worker 		}
417*59bfda1fSAndroid Build Coastguard Worker 		mask = 1 << (*trie & BITNUM);
418*59bfda1fSAndroid Build Coastguard Worker 		if (*s & mask) {
419*59bfda1fSAndroid Build Coastguard Worker 			/* Right leg */
420*59bfda1fSAndroid Build Coastguard Worker 			if (offlen) {
421*59bfda1fSAndroid Build Coastguard Worker 				/* Right node at offset of trie */
422*59bfda1fSAndroid Build Coastguard Worker 				node = (*trie & RIGHTNODE);
423*59bfda1fSAndroid Build Coastguard Worker 				offset = trie[offlen];
424*59bfda1fSAndroid Build Coastguard Worker 				while (--offlen) {
425*59bfda1fSAndroid Build Coastguard Worker 					offset <<= 8;
426*59bfda1fSAndroid Build Coastguard Worker 					offset |= trie[offlen];
427*59bfda1fSAndroid Build Coastguard Worker 				}
428*59bfda1fSAndroid Build Coastguard Worker 				trie += offset;
429*59bfda1fSAndroid Build Coastguard Worker 			} else if (*trie & RIGHTPATH) {
430*59bfda1fSAndroid Build Coastguard Worker 				/* Right node after this node */
431*59bfda1fSAndroid Build Coastguard Worker 				node = (*trie & TRIENODE);
432*59bfda1fSAndroid Build Coastguard Worker 				trie++;
433*59bfda1fSAndroid Build Coastguard Worker 			} else {
434*59bfda1fSAndroid Build Coastguard Worker 				/* No right node. */
435*59bfda1fSAndroid Build Coastguard Worker 				return NULL;
436*59bfda1fSAndroid Build Coastguard Worker 			}
437*59bfda1fSAndroid Build Coastguard Worker 		} else {
438*59bfda1fSAndroid Build Coastguard Worker 			/* Left leg */
439*59bfda1fSAndroid Build Coastguard Worker 			if (offlen) {
440*59bfda1fSAndroid Build Coastguard Worker 				/* Left node after this node. */
441*59bfda1fSAndroid Build Coastguard Worker 				node = (*trie & LEFTNODE);
442*59bfda1fSAndroid Build Coastguard Worker 				trie += offlen + 1;
443*59bfda1fSAndroid Build Coastguard Worker 			} else if (*trie & RIGHTPATH) {
444*59bfda1fSAndroid Build Coastguard Worker 				/* No left node. */
445*59bfda1fSAndroid Build Coastguard Worker 				return NULL;
446*59bfda1fSAndroid Build Coastguard Worker 			} else {
447*59bfda1fSAndroid Build Coastguard Worker 				/* Left node after this node */
448*59bfda1fSAndroid Build Coastguard Worker 				node = (*trie & TRIENODE);
449*59bfda1fSAndroid Build Coastguard Worker 				trie++;
450*59bfda1fSAndroid Build Coastguard Worker 			}
451*59bfda1fSAndroid Build Coastguard Worker 		}
452*59bfda1fSAndroid Build Coastguard Worker 	}
453*59bfda1fSAndroid Build Coastguard Worker 	/*
454*59bfda1fSAndroid Build Coastguard Worker 	 * Hangul decomposition is done algorithmically. These are the
455*59bfda1fSAndroid Build Coastguard Worker 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
456*59bfda1fSAndroid Build Coastguard Worker 	 * always 3 bytes long, so s has been advanced twice, and the
457*59bfda1fSAndroid Build Coastguard Worker 	 * start of the sequence is at s-2.
458*59bfda1fSAndroid Build Coastguard Worker 	 */
459*59bfda1fSAndroid Build Coastguard Worker 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
460*59bfda1fSAndroid Build Coastguard Worker 		trie = utf8hangul(s - 2, hangul);
461*59bfda1fSAndroid Build Coastguard Worker 	return trie;
462*59bfda1fSAndroid Build Coastguard Worker }
463*59bfda1fSAndroid Build Coastguard Worker 
464*59bfda1fSAndroid Build Coastguard Worker /*
465*59bfda1fSAndroid Build Coastguard Worker  * Use trie to scan s.
466*59bfda1fSAndroid Build Coastguard Worker  * Returns the leaf if one exists, NULL otherwise.
467*59bfda1fSAndroid Build Coastguard Worker  *
468*59bfda1fSAndroid Build Coastguard Worker  * Forwards to utf8nlookup().
469*59bfda1fSAndroid Build Coastguard Worker  */
utf8lookup(const struct utf8data * data,unsigned char * hangul,const char * s)470*59bfda1fSAndroid Build Coastguard Worker static utf8leaf_t *utf8lookup(const struct utf8data *data,
471*59bfda1fSAndroid Build Coastguard Worker 			      unsigned char *hangul, const char *s)
472*59bfda1fSAndroid Build Coastguard Worker {
473*59bfda1fSAndroid Build Coastguard Worker 	return utf8nlookup(data, hangul, s, (size_t)-1);
474*59bfda1fSAndroid Build Coastguard Worker }
475*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Maximum age of any character in s.
 *
 * Only leaf ages that do not exceed data->maxage are considered.
 * Returns -1 if data is NULL or a character of s has no leaf (s is not
 * valid UTF-8 unicode).  Returns 0 if no considered age is above zero
 * (per the original note: only non-assigned code points are used).
 */
static int utf8agemax(const struct utf8data *data, const char *s)
{
	unsigned char	hangul[UTF8HANGULLEAF];
	utf8leaf_t	*leaf;
	int		leaf_age;
	int		newest = 0;

	if (!data)
		return -1;

	for (; *s; s += utf8clen(s)) {
		leaf = utf8lookup(data, hangul, s);
		if (!leaf)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		/* Characters newer than the requested version are ignored. */
		if (leaf_age > data->maxage)
			continue;
		if (newest < leaf_age)
			newest = leaf_age;
	}
	return newest;
}
#endif
505*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Minimum age of any character in s.
 *
 * The search starts from data->maxage and only leaf ages that do not
 * exceed data->maxage can lower it.
 * Returns -1 if data is NULL or a character of s has no leaf (s is not
 * valid UTF-8 unicode).  Per the original note, the result is 0 if
 * non-assigned code points are used.
 */
static int utf8agemin(const struct utf8data *data, const char *s)
{
	unsigned char	hangul[UTF8HANGULLEAF];
	utf8leaf_t	*leaf;
	int		leaf_age;
	int		oldest;

	if (!data)
		return -1;

	oldest = data->maxage;
	for (; *s; s += utf8clen(s)) {
		leaf = utf8lookup(data, hangul, s);
		if (!leaf)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		/* Characters newer than the requested version are ignored. */
		if (leaf_age > data->maxage)
			continue;
		if (leaf_age < oldest)
			oldest = leaf_age;
	}
	return oldest;
}
#endif
534*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Maximum age of any character in s, touch at most len bytes.
 *
 * Scanning stops at the first NUL byte or once len is used up.
 * Only leaf ages that do not exceed data->maxage are considered.
 * Returns -1 if data is NULL or a character has no leaf (s is not
 * valid UTF-8 unicode).
 */
static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char	hangul[UTF8HANGULLEAF];
	utf8leaf_t	*leaf;
	int		leaf_age;
	int		newest = 0;

	if (!data)
		return -1;

	while (len && *s) {
		leaf = utf8nlookup(data, hangul, s, len);
		if (!leaf)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		if (leaf_age <= data->maxage && newest < leaf_age)
			newest = leaf_age;

		/* Step over the character that was just examined. */
		len -= utf8clen(s);
		s += utf8clen(s);
	}
	return newest;
}
#endif
563*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Minimum age of any character in s, touch at most len bytes.
 *
 * Scanning stops at the first NUL byte or once len is used up.
 * The search starts from data->maxage and only leaf ages that do not
 * exceed data->maxage can lower it.
 * Returns -1 if data is NULL or a character has no leaf (s is not
 * valid UTF-8 unicode).
 */
static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char	hangul[UTF8HANGULLEAF];
	utf8leaf_t	*leaf;
	int		leaf_age;
	int		oldest;

	if (!data)
		return -1;

	oldest = data->maxage;
	while (len && *s) {
		leaf = utf8nlookup(data, hangul, s, len);
		if (!leaf)
			return -1;

		leaf_age = utf8agetab[LEAF_GEN(leaf)];
		if (leaf_age <= data->maxage && leaf_age < oldest)
			oldest = leaf_age;

		/* Step over the character that was just examined. */
		len -= utf8clen(s);
		s += utf8clen(s);
	}
	return oldest;
}
#endif
592*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Length of the normalization of s.
 *
 * For each character: if its age does not exceed data->maxage and its
 * leaf is marked DECOMPOSE, the length of the decomposition string is
 * counted; otherwise the encoded length of the character itself is
 * counted.
 *
 * Returns -1 if data is NULL or a character has no leaf (s is not
 * valid UTF-8 unicode).
 *
 * A string of Default_Ignorable_Code_Point has length 0.
 */
static ssize_t utf8len(const struct utf8data *data, const char *s)
{
	unsigned char	hangul[UTF8HANGULLEAF];
	utf8leaf_t	*leaf;
	size_t		total = 0;

	if (!data)
		return -1;

	for (; *s; s += utf8clen(s)) {
		leaf = utf8lookup(data, hangul, s);
		if (!leaf)
			return -1;

		if (utf8agetab[LEAF_GEN(leaf)] <= data->maxage &&
		    LEAF_CCC(leaf) == DECOMPOSE)
			total += strlen(LEAF_STR(leaf));
		else
			total += utf8clen(s);
	}
	return total;
}
#endif
623*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Length of the normalization of s, touch at most len bytes.
 *
 * Scanning stops at the first NUL byte or once len is used up.
 * For each character: if its age does not exceed data->maxage and its
 * leaf is marked DECOMPOSE, the length of the decomposition string is
 * counted; otherwise the encoded length of the character itself is
 * counted.
 *
 * Returns -1 if data is NULL or a character has no leaf (s is not
 * valid UTF-8 unicode).
 */
static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
{
	unsigned char	hangul[UTF8HANGULLEAF];
	utf8leaf_t	*leaf;
	size_t		total = 0;

	if (!data)
		return -1;

	while (len && *s) {
		leaf = utf8nlookup(data, hangul, s, len);
		if (!leaf)
			return -1;

		if (utf8agetab[LEAF_GEN(leaf)] <= data->maxage &&
		    LEAF_CCC(leaf) == DECOMPOSE)
			total += strlen(LEAF_STR(leaf));
		else
			total += utf8clen(s);

		/* Step over the character that was just examined. */
		len -= utf8clen(s);
		s += utf8clen(s);
	}
	return total;
}
#endif
653*59bfda1fSAndroid Build Coastguard Worker 
654*59bfda1fSAndroid Build Coastguard Worker /*
655*59bfda1fSAndroid Build Coastguard Worker  * Set up an utf8cursor for use by utf8byte().
656*59bfda1fSAndroid Build Coastguard Worker  *
657*59bfda1fSAndroid Build Coastguard Worker  *   u8c    : pointer to cursor.
658*59bfda1fSAndroid Build Coastguard Worker  *   data   : const struct utf8data to use for normalization.
659*59bfda1fSAndroid Build Coastguard Worker  *   s      : string.
660*59bfda1fSAndroid Build Coastguard Worker  *   len    : length of s.
661*59bfda1fSAndroid Build Coastguard Worker  *
662*59bfda1fSAndroid Build Coastguard Worker  * Returns -1 on error, 0 on success.
663*59bfda1fSAndroid Build Coastguard Worker  */
utf8ncursor(struct utf8cursor * u8c,const struct utf8data * data,const char * s,size_t len)664*59bfda1fSAndroid Build Coastguard Worker static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
665*59bfda1fSAndroid Build Coastguard Worker 		const char *s, size_t len)
666*59bfda1fSAndroid Build Coastguard Worker {
667*59bfda1fSAndroid Build Coastguard Worker 	if (!data)
668*59bfda1fSAndroid Build Coastguard Worker 		return -1;
669*59bfda1fSAndroid Build Coastguard Worker 	if (!s)
670*59bfda1fSAndroid Build Coastguard Worker 		return -1;
671*59bfda1fSAndroid Build Coastguard Worker 	u8c->data = data;
672*59bfda1fSAndroid Build Coastguard Worker 	u8c->s = s;
673*59bfda1fSAndroid Build Coastguard Worker 	u8c->p = NULL;
674*59bfda1fSAndroid Build Coastguard Worker 	u8c->ss = NULL;
675*59bfda1fSAndroid Build Coastguard Worker 	u8c->sp = NULL;
676*59bfda1fSAndroid Build Coastguard Worker 	u8c->len = len;
677*59bfda1fSAndroid Build Coastguard Worker 	u8c->slen = 0;
678*59bfda1fSAndroid Build Coastguard Worker 	u8c->ccc = STOPPER;
679*59bfda1fSAndroid Build Coastguard Worker 	u8c->nccc = STOPPER;
680*59bfda1fSAndroid Build Coastguard Worker 	/* Check we didn't clobber the maximum length. */
681*59bfda1fSAndroid Build Coastguard Worker 	if (u8c->len != len)
682*59bfda1fSAndroid Build Coastguard Worker 		return -1;
683*59bfda1fSAndroid Build Coastguard Worker 	/* The first byte of s may not be an utf8 continuation. */
684*59bfda1fSAndroid Build Coastguard Worker 	if (len > 0 && (*s & 0xC0) == 0x80)
685*59bfda1fSAndroid Build Coastguard Worker 		return -1;
686*59bfda1fSAndroid Build Coastguard Worker 	return 0;
687*59bfda1fSAndroid Build Coastguard Worker }
688*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Set up an utf8cursor for use by utf8byte(), without a caller-supplied
 * length.
 *
 *   u8c    : pointer to cursor.
 *   data   : const struct utf8data to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with (unsigned int)-1 as the length.
 * Returns -1 on error, 0 on success.
 */
static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s)
{
	const unsigned int no_limit = (unsigned int)-1;

	return utf8ncursor(u8c, data, s, no_limit);
}
#endif
705*59bfda1fSAndroid Build Coastguard Worker 
706*59bfda1fSAndroid Build Coastguard Worker /*
707*59bfda1fSAndroid Build Coastguard Worker  * Get one byte from the normalized form of the string described by u8c.
708*59bfda1fSAndroid Build Coastguard Worker  *
709*59bfda1fSAndroid Build Coastguard Worker  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
710*59bfda1fSAndroid Build Coastguard Worker  *
711*59bfda1fSAndroid Build Coastguard Worker  * The cursor keeps track of the location in the string in u8c->s.
712*59bfda1fSAndroid Build Coastguard Worker  * When a character is decomposed, the current location is stored in
713*59bfda1fSAndroid Build Coastguard Worker  * u8c->p, and u8c->s is set to the start of the decomposition. Note
714*59bfda1fSAndroid Build Coastguard Worker  * that bytes from a decomposition do not count against u8c->len.
715*59bfda1fSAndroid Build Coastguard Worker  *
716*59bfda1fSAndroid Build Coastguard Worker  * Characters are emitted if they match the current CCC in u8c->ccc.
717*59bfda1fSAndroid Build Coastguard Worker  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
718*59bfda1fSAndroid Build Coastguard Worker  * and the function returns 0 in that case.
719*59bfda1fSAndroid Build Coastguard Worker  *
720*59bfda1fSAndroid Build Coastguard Worker  * Sorting by CCC is done by repeatedly scanning the string.  The
721*59bfda1fSAndroid Build Coastguard Worker  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
722*59bfda1fSAndroid Build Coastguard Worker  * the start of the scan.  The first pass finds the lowest CCC to be
723*59bfda1fSAndroid Build Coastguard Worker  * emitted and stores it in u8c->nccc, the second pass emits the
724*59bfda1fSAndroid Build Coastguard Worker  * characters with this CCC and finds the next lowest CCC. This limits
725*59bfda1fSAndroid Build Coastguard Worker  * the number of passes to 1 + the number of different CCCs in the
726*59bfda1fSAndroid Build Coastguard Worker  * sequence being scanned.
727*59bfda1fSAndroid Build Coastguard Worker  *
728*59bfda1fSAndroid Build Coastguard Worker  * Therefore:
729*59bfda1fSAndroid Build Coastguard Worker  *  u8c->p  != NULL -> a decomposition is being scanned.
730*59bfda1fSAndroid Build Coastguard Worker  *  u8c->ss != NULL -> this is a repeating scan.
731*59bfda1fSAndroid Build Coastguard Worker  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
732*59bfda1fSAndroid Build Coastguard Worker  */
utf8byte(struct utf8cursor * u8c)733*59bfda1fSAndroid Build Coastguard Worker static int utf8byte(struct utf8cursor *u8c)
734*59bfda1fSAndroid Build Coastguard Worker {
735*59bfda1fSAndroid Build Coastguard Worker 	utf8leaf_t *leaf;
736*59bfda1fSAndroid Build Coastguard Worker 	int ccc;
737*59bfda1fSAndroid Build Coastguard Worker 
738*59bfda1fSAndroid Build Coastguard Worker 	for (;;) {
739*59bfda1fSAndroid Build Coastguard Worker 		/* Check for the end of a decomposed character. */
740*59bfda1fSAndroid Build Coastguard Worker 		if (u8c->p && *u8c->s == '\0') {
741*59bfda1fSAndroid Build Coastguard Worker 			u8c->s = u8c->p;
742*59bfda1fSAndroid Build Coastguard Worker 			u8c->p = NULL;
743*59bfda1fSAndroid Build Coastguard Worker 		}
744*59bfda1fSAndroid Build Coastguard Worker 
745*59bfda1fSAndroid Build Coastguard Worker 		/* Check for end-of-string. */
746*59bfda1fSAndroid Build Coastguard Worker 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
747*59bfda1fSAndroid Build Coastguard Worker 			/* There is no next byte. */
748*59bfda1fSAndroid Build Coastguard Worker 			if (u8c->ccc == STOPPER)
749*59bfda1fSAndroid Build Coastguard Worker 				return 0;
750*59bfda1fSAndroid Build Coastguard Worker 			/* End-of-string during a scan counts as a stopper. */
751*59bfda1fSAndroid Build Coastguard Worker 			ccc = STOPPER;
752*59bfda1fSAndroid Build Coastguard Worker 			goto ccc_mismatch;
753*59bfda1fSAndroid Build Coastguard Worker 		} else if ((*u8c->s & 0xC0) == 0x80) {
754*59bfda1fSAndroid Build Coastguard Worker 			/* This is a continuation of the current character. */
755*59bfda1fSAndroid Build Coastguard Worker 			if (!u8c->p)
756*59bfda1fSAndroid Build Coastguard Worker 				u8c->len--;
757*59bfda1fSAndroid Build Coastguard Worker 			return (unsigned char)*u8c->s++;
758*59bfda1fSAndroid Build Coastguard Worker 		}
759*59bfda1fSAndroid Build Coastguard Worker 
760*59bfda1fSAndroid Build Coastguard Worker 		/* Look up the data for the current character. */
761*59bfda1fSAndroid Build Coastguard Worker 		if (u8c->p) {
762*59bfda1fSAndroid Build Coastguard Worker 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
763*59bfda1fSAndroid Build Coastguard Worker 		} else {
764*59bfda1fSAndroid Build Coastguard Worker 			leaf = utf8nlookup(u8c->data, u8c->hangul,
765*59bfda1fSAndroid Build Coastguard Worker 					   u8c->s, u8c->len);
766*59bfda1fSAndroid Build Coastguard Worker 		}
767*59bfda1fSAndroid Build Coastguard Worker 
768*59bfda1fSAndroid Build Coastguard Worker 		/* No leaf found implies that the input is a binary blob. */
769*59bfda1fSAndroid Build Coastguard Worker 		if (!leaf)
770*59bfda1fSAndroid Build Coastguard Worker 			return -1;
771*59bfda1fSAndroid Build Coastguard Worker 
772*59bfda1fSAndroid Build Coastguard Worker 		ccc = LEAF_CCC(leaf);
773*59bfda1fSAndroid Build Coastguard Worker 		/* Characters that are too new have CCC 0. */
774*59bfda1fSAndroid Build Coastguard Worker 		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
775*59bfda1fSAndroid Build Coastguard Worker 			ccc = STOPPER;
776*59bfda1fSAndroid Build Coastguard Worker 		} else if (ccc == DECOMPOSE) {
777*59bfda1fSAndroid Build Coastguard Worker 			u8c->len -= utf8clen(u8c->s);
778*59bfda1fSAndroid Build Coastguard Worker 			u8c->p = u8c->s + utf8clen(u8c->s);
779*59bfda1fSAndroid Build Coastguard Worker 			u8c->s = LEAF_STR(leaf);
780*59bfda1fSAndroid Build Coastguard Worker 			/* Empty decomposition implies CCC 0. */
781*59bfda1fSAndroid Build Coastguard Worker 			if (*u8c->s == '\0') {
782*59bfda1fSAndroid Build Coastguard Worker 				if (u8c->ccc == STOPPER)
783*59bfda1fSAndroid Build Coastguard Worker 					continue;
784*59bfda1fSAndroid Build Coastguard Worker 				ccc = STOPPER;
785*59bfda1fSAndroid Build Coastguard Worker 				goto ccc_mismatch;
786*59bfda1fSAndroid Build Coastguard Worker 			}
787*59bfda1fSAndroid Build Coastguard Worker 
788*59bfda1fSAndroid Build Coastguard Worker 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
789*59bfda1fSAndroid Build Coastguard Worker 			if (!leaf)
790*59bfda1fSAndroid Build Coastguard Worker 				return -1;
791*59bfda1fSAndroid Build Coastguard Worker 			ccc = LEAF_CCC(leaf);
792*59bfda1fSAndroid Build Coastguard Worker 		}
793*59bfda1fSAndroid Build Coastguard Worker 
794*59bfda1fSAndroid Build Coastguard Worker 		/*
795*59bfda1fSAndroid Build Coastguard Worker 		 * If this is not a stopper, then see if it updates
796*59bfda1fSAndroid Build Coastguard Worker 		 * the next canonical class to be emitted.
797*59bfda1fSAndroid Build Coastguard Worker 		 */
798*59bfda1fSAndroid Build Coastguard Worker 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
799*59bfda1fSAndroid Build Coastguard Worker 			u8c->nccc = ccc;
800*59bfda1fSAndroid Build Coastguard Worker 
801*59bfda1fSAndroid Build Coastguard Worker 		/*
802*59bfda1fSAndroid Build Coastguard Worker 		 * Return the current byte if this is the current
803*59bfda1fSAndroid Build Coastguard Worker 		 * combining class.
804*59bfda1fSAndroid Build Coastguard Worker 		 */
805*59bfda1fSAndroid Build Coastguard Worker 		if (ccc == u8c->ccc) {
806*59bfda1fSAndroid Build Coastguard Worker 			if (!u8c->p)
807*59bfda1fSAndroid Build Coastguard Worker 				u8c->len--;
808*59bfda1fSAndroid Build Coastguard Worker 			return (unsigned char)*u8c->s++;
809*59bfda1fSAndroid Build Coastguard Worker 		}
810*59bfda1fSAndroid Build Coastguard Worker 
811*59bfda1fSAndroid Build Coastguard Worker 		/* Current combining class mismatch. */
812*59bfda1fSAndroid Build Coastguard Worker ccc_mismatch:
813*59bfda1fSAndroid Build Coastguard Worker 		if (u8c->nccc == STOPPER) {
814*59bfda1fSAndroid Build Coastguard Worker 			/*
815*59bfda1fSAndroid Build Coastguard Worker 			 * Scan forward for the first canonical class
816*59bfda1fSAndroid Build Coastguard Worker 			 * to be emitted.  Save the position from
817*59bfda1fSAndroid Build Coastguard Worker 			 * which to restart.
818*59bfda1fSAndroid Build Coastguard Worker 			 */
819*59bfda1fSAndroid Build Coastguard Worker 			u8c->ccc = MINCCC - 1;
820*59bfda1fSAndroid Build Coastguard Worker 			u8c->nccc = ccc;
821*59bfda1fSAndroid Build Coastguard Worker 			u8c->sp = u8c->p;
822*59bfda1fSAndroid Build Coastguard Worker 			u8c->ss = u8c->s;
823*59bfda1fSAndroid Build Coastguard Worker 			u8c->slen = u8c->len;
824*59bfda1fSAndroid Build Coastguard Worker 			if (!u8c->p)
825*59bfda1fSAndroid Build Coastguard Worker 				u8c->len -= utf8clen(u8c->s);
826*59bfda1fSAndroid Build Coastguard Worker 			u8c->s += utf8clen(u8c->s);
827*59bfda1fSAndroid Build Coastguard Worker 		} else if (ccc != STOPPER) {
828*59bfda1fSAndroid Build Coastguard Worker 			/* Not a stopper, and not the ccc we're emitting. */
829*59bfda1fSAndroid Build Coastguard Worker 			if (!u8c->p)
830*59bfda1fSAndroid Build Coastguard Worker 				u8c->len -= utf8clen(u8c->s);
831*59bfda1fSAndroid Build Coastguard Worker 			u8c->s += utf8clen(u8c->s);
832*59bfda1fSAndroid Build Coastguard Worker 		} else if (u8c->nccc != MAXCCC + 1) {
833*59bfda1fSAndroid Build Coastguard Worker 			/* At a stopper, restart for next ccc. */
834*59bfda1fSAndroid Build Coastguard Worker 			u8c->ccc = u8c->nccc;
835*59bfda1fSAndroid Build Coastguard Worker 			u8c->nccc = MAXCCC + 1;
836*59bfda1fSAndroid Build Coastguard Worker 			u8c->s = u8c->ss;
837*59bfda1fSAndroid Build Coastguard Worker 			u8c->p = u8c->sp;
838*59bfda1fSAndroid Build Coastguard Worker 			u8c->len = u8c->slen;
839*59bfda1fSAndroid Build Coastguard Worker 		} else {
840*59bfda1fSAndroid Build Coastguard Worker 			/* All done, proceed from here. */
841*59bfda1fSAndroid Build Coastguard Worker 			u8c->ccc = STOPPER;
842*59bfda1fSAndroid Build Coastguard Worker 			u8c->nccc = STOPPER;
843*59bfda1fSAndroid Build Coastguard Worker 			u8c->sp = NULL;
844*59bfda1fSAndroid Build Coastguard Worker 			u8c->ss = NULL;
845*59bfda1fSAndroid Build Coastguard Worker 			u8c->slen = 0;
846*59bfda1fSAndroid Build Coastguard Worker 		}
847*59bfda1fSAndroid Build Coastguard Worker 	}
848*59bfda1fSAndroid Build Coastguard Worker }
849*59bfda1fSAndroid Build Coastguard Worker 
#if 0
/*
 * Look for the correct const struct utf8data for a unicode version.
 *
 * The table is scanned from its last entry towards its first, skipping
 * entries whose maxage is larger than the request.  The first entry
 * that is not larger is returned if its maxage equals the request
 * exactly; otherwise NULL is returned (e.g. the version requested is
 * too new).  NULL is also returned if every entry is larger than the
 * request, instead of indexing before the start of the table.
 *
 * Two normalization forms are supported: nfdi and nfdicf.
 *
 * nfdi:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *
 * nfdicf:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *  - Apply a full casefold (C + F).
 */
static const struct utf8data *utf8nfdi(unsigned int maxage)
{
	size_t i = ARRAY_SIZE(utf8nfdidata);

	while (i-- > 0) {
		/* Entry describes a newer version than requested. */
		if (maxage < utf8nfdidata[i].maxage)
			continue;
		/* Only an exact version match is usable. */
		if (maxage > utf8nfdidata[i].maxage)
			return NULL;
		return &utf8nfdidata[i];
	}

	/* Every entry is newer than the requested version. */
	return NULL;
}
#endif
877*59bfda1fSAndroid Build Coastguard Worker 
utf8nfdicf(unsigned int maxage)878*59bfda1fSAndroid Build Coastguard Worker static const struct utf8data *utf8nfdicf(unsigned int maxage)
879*59bfda1fSAndroid Build Coastguard Worker {
880*59bfda1fSAndroid Build Coastguard Worker 	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
881*59bfda1fSAndroid Build Coastguard Worker 
882*59bfda1fSAndroid Build Coastguard Worker 	while (maxage < utf8nfdicfdata[i].maxage)
883*59bfda1fSAndroid Build Coastguard Worker 		i--;
884*59bfda1fSAndroid Build Coastguard Worker 	if (maxage > utf8nfdicfdata[i].maxage)
885*59bfda1fSAndroid Build Coastguard Worker 		return NULL;
886*59bfda1fSAndroid Build Coastguard Worker 	return &utf8nfdicfdata[i];
887*59bfda1fSAndroid Build Coastguard Worker }
888*59bfda1fSAndroid Build Coastguard Worker 
utf8_casefold(const struct f2fs_nls_table * table,const unsigned char * str,size_t len,unsigned char * dest,size_t dlen)889*59bfda1fSAndroid Build Coastguard Worker static int utf8_casefold(const struct f2fs_nls_table *table,
890*59bfda1fSAndroid Build Coastguard Worker 			  const unsigned char *str, size_t len,
891*59bfda1fSAndroid Build Coastguard Worker 			  unsigned char *dest, size_t dlen)
892*59bfda1fSAndroid Build Coastguard Worker {
893*59bfda1fSAndroid Build Coastguard Worker 	const struct utf8data *data = utf8nfdicf(table->version);
894*59bfda1fSAndroid Build Coastguard Worker 	struct utf8cursor cur;
895*59bfda1fSAndroid Build Coastguard Worker 	size_t nlen = 0;
896*59bfda1fSAndroid Build Coastguard Worker 
897*59bfda1fSAndroid Build Coastguard Worker 	if (utf8ncursor(&cur, data, (const char *) str, len) < 0)
898*59bfda1fSAndroid Build Coastguard Worker 		goto invalid_seq;
899*59bfda1fSAndroid Build Coastguard Worker 
900*59bfda1fSAndroid Build Coastguard Worker 	for (nlen = 0; nlen < dlen; nlen++) {
901*59bfda1fSAndroid Build Coastguard Worker 		int c = utf8byte(&cur);
902*59bfda1fSAndroid Build Coastguard Worker 
903*59bfda1fSAndroid Build Coastguard Worker 		dest[nlen] = c;
904*59bfda1fSAndroid Build Coastguard Worker 		if (!c)
905*59bfda1fSAndroid Build Coastguard Worker 			return nlen;
906*59bfda1fSAndroid Build Coastguard Worker 		if (c == -1)
907*59bfda1fSAndroid Build Coastguard Worker 			break;
908*59bfda1fSAndroid Build Coastguard Worker 	}
909*59bfda1fSAndroid Build Coastguard Worker 
910*59bfda1fSAndroid Build Coastguard Worker 	return -ENAMETOOLONG;
911*59bfda1fSAndroid Build Coastguard Worker 
912*59bfda1fSAndroid Build Coastguard Worker invalid_seq:
913*59bfda1fSAndroid Build Coastguard Worker 	if (dlen < len)
914*59bfda1fSAndroid Build Coastguard Worker 		return -ENAMETOOLONG;
915*59bfda1fSAndroid Build Coastguard Worker 
916*59bfda1fSAndroid Build Coastguard Worker 	/* Signal invalid sequence */
917*59bfda1fSAndroid Build Coastguard Worker 	return -EINVAL;
918*59bfda1fSAndroid Build Coastguard Worker }
919*59bfda1fSAndroid Build Coastguard Worker 
920*59bfda1fSAndroid Build Coastguard Worker static const struct f2fs_nls_ops utf8_ops = {
921*59bfda1fSAndroid Build Coastguard Worker 	.casefold = utf8_casefold,
922*59bfda1fSAndroid Build Coastguard Worker };
923*59bfda1fSAndroid Build Coastguard Worker 
924*59bfda1fSAndroid Build Coastguard Worker static const struct f2fs_nls_table nls_utf8 = {
925*59bfda1fSAndroid Build Coastguard Worker 	.ops = &utf8_ops,
926*59bfda1fSAndroid Build Coastguard Worker 	.version = UNICODE_AGE(12, 1, 0),
927*59bfda1fSAndroid Build Coastguard Worker };
928*59bfda1fSAndroid Build Coastguard Worker 
f2fs_load_nls_table(int encoding)929*59bfda1fSAndroid Build Coastguard Worker const struct f2fs_nls_table *f2fs_load_nls_table(int encoding)
930*59bfda1fSAndroid Build Coastguard Worker {
931*59bfda1fSAndroid Build Coastguard Worker 	if (encoding == F2FS_ENC_UTF8_12_1)
932*59bfda1fSAndroid Build Coastguard Worker 		return &nls_utf8;
933*59bfda1fSAndroid Build Coastguard Worker 
934*59bfda1fSAndroid Build Coastguard Worker 	return NULL;
935*59bfda1fSAndroid Build Coastguard Worker }
936