xref: /aosp_15_r20/external/e2fsprogs/lib/ext2fs/nls_utf8.c (revision 6a54128f25917bfc36a8a6e9d722c04a0b4641b6)
1*6a54128fSAndroid Build Coastguard Worker /*
2*6a54128fSAndroid Build Coastguard Worker  * Copyright (c) 2014 SGI.
3*6a54128fSAndroid Build Coastguard Worker  * Copyright (c) 2018 Collabora Ltd.
4*6a54128fSAndroid Build Coastguard Worker  * All rights reserved.
5*6a54128fSAndroid Build Coastguard Worker  *
6*6a54128fSAndroid Build Coastguard Worker  * This program is free software; you can redistribute it and/or
7*6a54128fSAndroid Build Coastguard Worker  * modify it under the terms of the GNU General Public License as
8*6a54128fSAndroid Build Coastguard Worker  * published by the Free Software Foundation.
9*6a54128fSAndroid Build Coastguard Worker  *
10*6a54128fSAndroid Build Coastguard Worker  * This program is distributed in the hope that it would be useful,
11*6a54128fSAndroid Build Coastguard Worker  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12*6a54128fSAndroid Build Coastguard Worker  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13*6a54128fSAndroid Build Coastguard Worker  * GNU General Public License for more details.
14*6a54128fSAndroid Build Coastguard Worker  *
15*6a54128fSAndroid Build Coastguard Worker  */
16*6a54128fSAndroid Build Coastguard Worker 
17*6a54128fSAndroid Build Coastguard Worker /*
18*6a54128fSAndroid Build Coastguard Worker  * This code is adapted from the Linux Kernel.  We have a
19*6a54128fSAndroid Build Coastguard Worker  * userspace version here such that the hashes will match that
20*6a54128fSAndroid Build Coastguard Worker  * implementation.
21*6a54128fSAndroid Build Coastguard Worker  */
22*6a54128fSAndroid Build Coastguard Worker 
23*6a54128fSAndroid Build Coastguard Worker #include "config.h"
24*6a54128fSAndroid Build Coastguard Worker #include <stdint.h>
25*6a54128fSAndroid Build Coastguard Worker #include <unistd.h>
26*6a54128fSAndroid Build Coastguard Worker #include <string.h>
27*6a54128fSAndroid Build Coastguard Worker #include <limits.h>
28*6a54128fSAndroid Build Coastguard Worker #include <errno.h>
29*6a54128fSAndroid Build Coastguard Worker 
30*6a54128fSAndroid Build Coastguard Worker #include "ext2_fs.h"
31*6a54128fSAndroid Build Coastguard Worker #include "ext2fs.h"
32*6a54128fSAndroid Build Coastguard Worker #include "ext2fsP.h"
33*6a54128fSAndroid Build Coastguard Worker 
/*
 * A unicode version (major.minor.revision) is packed into a single
 * unsigned int: the major number starts at bit 16, the minor number at
 * bit 8, and the revision occupies the low bits.
 */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

/* Pack MAJ.MIN.REV into one unsigned int using the shifts above. */
#define UNICODE_AGE(MAJ, MIN, REV)				\
	(	((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT)	\
	      | ((unsigned int)(MIN) << UNICODE_MIN_SHIFT)	\
	      | ((unsigned int)(REV))	)
42*6a54128fSAndroid Build Coastguard Worker 
/*
 * Size in bytes of the scratch leaf that utf8hangul() fills in: two
 * header bytes, up to three 3-byte UTF-8 sequences, and a terminating
 * NUL.  Defined this early because struct utf8cursor embeds a buffer
 * of this size.
 */
#define UTF8HANGULLEAF	(12)

/*
 * Cursor structure used by the normalizer.
 *
 * NOTE(review): the code that initializes and advances a cursor
 * (utf8cursor(), utf8ncursor(), utf8byte()) is outside this view, so
 * the per-field notes below are inferred from names only and need to
 * be confirmed against those functions.
 */
struct utf8cursor {
	/* Normalization table selector (see struct utf8data). */
	const struct utf8data	*data;
	/* presumably the current position in the input string -- confirm */
	const char	*s;
	/* presumably the position inside a decomposition string -- confirm */
	const char	*p;
	/* presumably saved copies of s and p -- confirm */
	const char	*ss;
	const char	*sp;
	/* presumably the remaining length, and a saved copy -- confirm */
	unsigned int	len;
	unsigned int	slen;
	/* presumably Canonical Combining Class bookkeeping -- confirm */
	short int	ccc;
	short int	nccc;
	/* Scratch space for a synthesized Hangul leaf. */
	unsigned char	hangul[UTF8HANGULLEAF];
};
61*6a54128fSAndroid Build Coastguard Worker 
62*6a54128fSAndroid Build Coastguard Worker /*
63*6a54128fSAndroid Build Coastguard Worker  * Initialize a utf8cursor to normalize a string.
64*6a54128fSAndroid Build Coastguard Worker  * Returns 0 on success.
65*6a54128fSAndroid Build Coastguard Worker  * Returns -1 on failure.
66*6a54128fSAndroid Build Coastguard Worker  */
67*6a54128fSAndroid Build Coastguard Worker // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
68*6a54128fSAndroid Build Coastguard Worker //		      const char *s);
69*6a54128fSAndroid Build Coastguard Worker // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
70*6a54128fSAndroid Build Coastguard Worker //		       const char *s, size_t len);
71*6a54128fSAndroid Build Coastguard Worker 
72*6a54128fSAndroid Build Coastguard Worker /*
73*6a54128fSAndroid Build Coastguard Worker  * Get the next byte in the normalization.
74*6a54128fSAndroid Build Coastguard Worker  * Returns a value > 0 && < 256 on success.
75*6a54128fSAndroid Build Coastguard Worker  * Returns 0 when the end of the normalization is reached.
76*6a54128fSAndroid Build Coastguard Worker  * Returns -1 if the string being normalized is not valid UTF-8.
77*6a54128fSAndroid Build Coastguard Worker  */
78*6a54128fSAndroid Build Coastguard Worker // extern int utf8byte(struct utf8cursor *u8c);
79*6a54128fSAndroid Build Coastguard Worker 
80*6a54128fSAndroid Build Coastguard Worker 
/*
 * Selects one normalization trie inside the utf8data[] table.
 */
struct utf8data {
	/*
	 * NOTE(review): presumably the highest unicode age (in
	 * UNICODE_AGE() encoding) this trie accepts -- it is not used in
	 * the visible code, so confirm against its users.
	 */
	unsigned int maxage;
	/* Offset of the trie's root node from the start of utf8data[]. */
	unsigned int offset;
};
85*6a54128fSAndroid Build Coastguard Worker 
86*6a54128fSAndroid Build Coastguard Worker #define __INCLUDED_FROM_UTF8NORM_C__
87*6a54128fSAndroid Build Coastguard Worker #include "utf8data.h"
88*6a54128fSAndroid Build Coastguard Worker #undef __INCLUDED_FROM_UTF8NORM_C__
89*6a54128fSAndroid Build Coastguard Worker 
/*
 * Number of elements in a true array (never pass a pointer: the result
 * would be meaningless).  The argument is parenthesized before it is
 * indexed so that an expression argument, e.g. *(int (*)[4])ptr, is
 * subscripted as a whole instead of having [0] bind to its last operand.
 */
#define ARRAY_SIZE(array)			\
	(sizeof(array) / sizeof((array)[0]))
92*6a54128fSAndroid Build Coastguard Worker 
93*6a54128fSAndroid Build Coastguard Worker #if 0
94*6a54128fSAndroid Build Coastguard Worker /* Highest unicode version supported by the data tables. */
95*6a54128fSAndroid Build Coastguard Worker static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
96*6a54128fSAndroid Build Coastguard Worker {
97*6a54128fSAndroid Build Coastguard Worker 	int i = ARRAY_SIZE(utf8agetab) - 1;
98*6a54128fSAndroid Build Coastguard Worker 	unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
99*6a54128fSAndroid Build Coastguard Worker 
100*6a54128fSAndroid Build Coastguard Worker 	while (i >= 0 && utf8agetab[i] != 0) {
101*6a54128fSAndroid Build Coastguard Worker 		if (sb_utf8version == utf8agetab[i])
102*6a54128fSAndroid Build Coastguard Worker 			return 1;
103*6a54128fSAndroid Build Coastguard Worker 		i--;
104*6a54128fSAndroid Build Coastguard Worker 	}
105*6a54128fSAndroid Build Coastguard Worker 	return 0;
106*6a54128fSAndroid Build Coastguard Worker }
107*6a54128fSAndroid Build Coastguard Worker #endif
108*6a54128fSAndroid Build Coastguard Worker 
109*6a54128fSAndroid Build Coastguard Worker #if 0
110*6a54128fSAndroid Build Coastguard Worker static int utf8version_latest(void)
111*6a54128fSAndroid Build Coastguard Worker {
112*6a54128fSAndroid Build Coastguard Worker 	return utf8vers;
113*6a54128fSAndroid Build Coastguard Worker }
114*6a54128fSAndroid Build Coastguard Worker #endif
115*6a54128fSAndroid Build Coastguard Worker 
116*6a54128fSAndroid Build Coastguard Worker /*
117*6a54128fSAndroid Build Coastguard Worker  * UTF-8 valid ranges.
118*6a54128fSAndroid Build Coastguard Worker  *
119*6a54128fSAndroid Build Coastguard Worker  * The UTF-8 encoding spreads the bits of a 32bit word over several
120*6a54128fSAndroid Build Coastguard Worker  * bytes. This table gives the ranges that can be held and how they'd
121*6a54128fSAndroid Build Coastguard Worker  * be represented.
122*6a54128fSAndroid Build Coastguard Worker  *
123*6a54128fSAndroid Build Coastguard Worker  * 0x00000000 0x0000007F: 0xxxxxxx
124*6a54128fSAndroid Build Coastguard Worker  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
125*6a54128fSAndroid Build Coastguard Worker  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
126*6a54128fSAndroid Build Coastguard Worker  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
127*6a54128fSAndroid Build Coastguard Worker  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
128*6a54128fSAndroid Build Coastguard Worker  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
129*6a54128fSAndroid Build Coastguard Worker  *
130*6a54128fSAndroid Build Coastguard Worker  * There is an additional requirement on UTF-8, in that only the
131*6a54128fSAndroid Build Coastguard Worker  * shortest representation of a 32bit value is to be used.  A decoder
132*6a54128fSAndroid Build Coastguard Worker  * must not decode sequences that do not satisfy this requirement.
133*6a54128fSAndroid Build Coastguard Worker  * Thus the allowed ranges have a lower bound.
134*6a54128fSAndroid Build Coastguard Worker  *
135*6a54128fSAndroid Build Coastguard Worker  * 0x00000000 0x0000007F: 0xxxxxxx
136*6a54128fSAndroid Build Coastguard Worker  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
137*6a54128fSAndroid Build Coastguard Worker  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
138*6a54128fSAndroid Build Coastguard Worker  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
139*6a54128fSAndroid Build Coastguard Worker  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
140*6a54128fSAndroid Build Coastguard Worker  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
141*6a54128fSAndroid Build Coastguard Worker  *
142*6a54128fSAndroid Build Coastguard Worker  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
143*6a54128fSAndroid Build Coastguard Worker  * 17 planes of 65536 values.  This limits the sequences actually seen
144*6a54128fSAndroid Build Coastguard Worker  * even more, to just the following.
145*6a54128fSAndroid Build Coastguard Worker  *
146*6a54128fSAndroid Build Coastguard Worker  *          0 -     0x7F: 0                   - 0x7F
147*6a54128fSAndroid Build Coastguard Worker  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
148*6a54128fSAndroid Build Coastguard Worker  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
149*6a54128fSAndroid Build Coastguard Worker  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
150*6a54128fSAndroid Build Coastguard Worker  *
151*6a54128fSAndroid Build Coastguard Worker  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
152*6a54128fSAndroid Build Coastguard Worker  *
 * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character.  This makes the UTF-8
 * representation of Unicode never larger than UTF-32.
156*6a54128fSAndroid Build Coastguard Worker  *
157*6a54128fSAndroid Build Coastguard Worker  * The shortest sequence requirement was introduced by:
158*6a54128fSAndroid Build Coastguard Worker  *    Corrigendum #1: UTF-8 Shortest Form
159*6a54128fSAndroid Build Coastguard Worker  * It can be found here:
160*6a54128fSAndroid Build Coastguard Worker  *    http://www.unicode.org/versions/corrigendum1.html
161*6a54128fSAndroid Build Coastguard Worker  *
162*6a54128fSAndroid Build Coastguard Worker  */
163*6a54128fSAndroid Build Coastguard Worker 
/*
 * Length in bytes (1 to 4) of the UTF-8 sequence that starts at s,
 * derived from the lead byte alone: one extra byte for each of the
 * thresholds 0xC0, 0xE0 and 0xF0 that the lead byte reaches.
 *
 * The caller must pass a pointer to the first byte of a valid UTF-8
 * sequence; nothing is validated here.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = *s;
	int nbytes = 1;

	if (lead >= 0xC0)
		nbytes++;
	if (lead >= 0xE0)
		nbytes++;
	if (lead >= 0xF0)
		nbytes++;

	return nbytes;
}
175*6a54128fSAndroid Build Coastguard Worker 
/*
 * Decode the 3-byte UTF-8 sequence at str into its code point.
 *
 * Only the payload bits are extracted (low 4 bits of the lead byte,
 * low 6 bits of each of the two following bytes); the marker bits are
 * not checked, so the caller must already know this is a 3-byte
 * sequence.
 */
static unsigned int
utf8decode3(const char *str)
{
	const unsigned int high = str[0] & 0x0F;
	const unsigned int mid  = str[1] & 0x3F;
	const unsigned int low  = str[2] & 0x3F;

	return (high << 12) | (mid << 6) | low;
}
192*6a54128fSAndroid Build Coastguard Worker 
/*
 * Write val to str[0..2] as a 3-byte UTF-8 sequence and return the
 * number of bytes written (always 3).  No NUL terminator is added.
 *
 * val is not range-checked: the caller must pass a value that needs
 * exactly three bytes, since any bits above bit 15 would end up in the
 * lead byte unmasked.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	str[0] = (val >> 12) | 0xE0;
	str[1] = ((val >> 6) & 0x3F) | 0x80;
	str[2] = (val & 0x3F) | 0x80;

	return 3;
}
207*6a54128fSAndroid Build Coastguard Worker 
208*6a54128fSAndroid Build Coastguard Worker /*
209*6a54128fSAndroid Build Coastguard Worker  * utf8trie_t
210*6a54128fSAndroid Build Coastguard Worker  *
211*6a54128fSAndroid Build Coastguard Worker  * A compact binary tree, used to decode UTF-8 characters.
212*6a54128fSAndroid Build Coastguard Worker  *
213*6a54128fSAndroid Build Coastguard Worker  * Internal nodes are one byte for the node itself, and up to three
214*6a54128fSAndroid Build Coastguard Worker  * bytes for an offset into the tree.  The first byte contains the
215*6a54128fSAndroid Build Coastguard Worker  * following information:
216*6a54128fSAndroid Build Coastguard Worker  *  NEXTBYTE  - flag        - advance to next byte if set
 *  BITNUM    - 3 bit field - the bit number to be tested
218*6a54128fSAndroid Build Coastguard Worker  *  OFFLEN    - 2 bit field - number of bytes in the offset
219*6a54128fSAndroid Build Coastguard Worker  * if offlen == 0 (non-branching node)
220*6a54128fSAndroid Build Coastguard Worker  *  RIGHTPATH - 1 bit field - set if the following node is for the
221*6a54128fSAndroid Build Coastguard Worker  *                            right-hand path (tested bit is set)
222*6a54128fSAndroid Build Coastguard Worker  *  TRIENODE  - 1 bit field - set if the following node is an internal
223*6a54128fSAndroid Build Coastguard Worker  *                            node, otherwise it is a leaf node
224*6a54128fSAndroid Build Coastguard Worker  * if offlen != 0 (branching node)
225*6a54128fSAndroid Build Coastguard Worker  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
226*6a54128fSAndroid Build Coastguard Worker  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
227*6a54128fSAndroid Build Coastguard Worker  *
228*6a54128fSAndroid Build Coastguard Worker  * Due to the way utf8 works, there cannot be branching nodes with
229*6a54128fSAndroid Build Coastguard Worker  * NEXTBYTE set, and moreover those nodes always have a righthand
230*6a54128fSAndroid Build Coastguard Worker  * descendant.
231*6a54128fSAndroid Build Coastguard Worker  */
/* A trie is addressed as a read-only array of bytes. */
typedef const unsigned char utf8trie_t;

/* Masks for the first byte of an internal trie node. */
#define BITNUM		0x07	/* bit number of the input byte to test */
#define NEXTBYTE	0x08	/* advance to the next input byte if set */
#define OFFLEN		0x30	/* number of bytes in the offset */
#define OFFLEN_SHIFT	4	/* brings the OFFLEN field down to bit 0 */

/* Top two bits when OFFLEN is zero (non-branching node). */
#define RIGHTPATH	0x40	/* following node is for the right-hand path */
#define TRIENODE	0x80	/* following node is internal, not a leaf */

/* The same two bits when OFFLEN is non-zero (branching node). */
#define RIGHTNODE	0x40	/* right-hand node is internal */
#define LEFTNODE	0x80	/* left-hand node is internal */
241*6a54128fSAndroid Build Coastguard Worker 
242*6a54128fSAndroid Build Coastguard Worker /*
243*6a54128fSAndroid Build Coastguard Worker  * utf8leaf_t
244*6a54128fSAndroid Build Coastguard Worker  *
 * The leaves of the trie are embedded in the trie, and so use the same
 * underlying datatype: unsigned char.
247*6a54128fSAndroid Build Coastguard Worker  *
248*6a54128fSAndroid Build Coastguard Worker  * leaf[0]: The unicode version, stored as a generation number that is
249*6a54128fSAndroid Build Coastguard Worker  *          an index into utf8agetab[].  With this we can filter code
250*6a54128fSAndroid Build Coastguard Worker  *          points based on the unicode version in which they were
251*6a54128fSAndroid Build Coastguard Worker  *          defined.  The CCC of a non-defined code point is 0.
 * leaf[1]: Canonical Combining Class. During normalization, we need
 *          to do a stable sort into ascending order of all characters
 *          with a non-zero CCC that occur between two characters with
 *          a CCC of 0, or at the beginning or end of a string.
256*6a54128fSAndroid Build Coastguard Worker  *          The unicode standard guarantees that all CCC values are
257*6a54128fSAndroid Build Coastguard Worker  *          between 0 and 254 inclusive, which leaves 255 available as
258*6a54128fSAndroid Build Coastguard Worker  *          a special value.
259*6a54128fSAndroid Build Coastguard Worker  *          Code points with CCC 0 are known as stoppers.
260*6a54128fSAndroid Build Coastguard Worker  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
261*6a54128fSAndroid Build Coastguard Worker  *          start of a NUL-terminated string that is the decomposition
262*6a54128fSAndroid Build Coastguard Worker  *          of the character.
263*6a54128fSAndroid Build Coastguard Worker  *          The CCC of a decomposable character is the same as the CCC
264*6a54128fSAndroid Build Coastguard Worker  *          of the first character of its decomposition.
265*6a54128fSAndroid Build Coastguard Worker  *          Some characters decompose as the empty string: these are
266*6a54128fSAndroid Build Coastguard Worker  *          characters with the Default_Ignorable_Code_Point property.
267*6a54128fSAndroid Build Coastguard Worker  *          These do affect normalization, as they all have CCC 0.
268*6a54128fSAndroid Build Coastguard Worker  *
269*6a54128fSAndroid Build Coastguard Worker  * The decompositions in the trie have been fully expanded, with the
270*6a54128fSAndroid Build Coastguard Worker  * exception of Hangul syllables, which are decomposed algorithmically.
271*6a54128fSAndroid Build Coastguard Worker  *
272*6a54128fSAndroid Build Coastguard Worker  * Casefolding, if applicable, is also done using decompositions.
273*6a54128fSAndroid Build Coastguard Worker  *
274*6a54128fSAndroid Build Coastguard Worker  * The trie is constructed in such a way that leaves exist for all
275*6a54128fSAndroid Build Coastguard Worker  * UTF-8 sequences that match the criteria from the "UTF-8 valid
276*6a54128fSAndroid Build Coastguard Worker  * ranges" comment above, and only for those sequences.  Therefore a
277*6a54128fSAndroid Build Coastguard Worker  * lookup in the trie can be used to validate the UTF-8 input.
278*6a54128fSAndroid Build Coastguard Worker  */
/* A leaf is addressed as a read-only array of bytes, like the trie. */
typedef const unsigned char utf8leaf_t;

/* Accessors for the three parts of a leaf. */
#define LEAF_GEN(LEAF)	((LEAF)[0])			/* generation number */
#define LEAF_CCC(LEAF)	((LEAF)[1])			/* CCC, or DECOMPOSE */
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))	/* decomposition */

/* Values that can appear in the CCC byte of a leaf. */
#define MINCCC		(0)	/* lowest Canonical Combining Class */
#define MAXCCC		(254)	/* highest Canonical Combining Class */
#define STOPPER		(0)	/* CCC of a stopper code point */
#define	DECOMPOSE	(255)	/* special value: a decomposition follows */

/* Marker for hangul syllable decomposition. */
#define HANGUL		((char)(255))
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
#define UTF8HANGULLEAF	(12)
294*6a54128fSAndroid Build Coastguard Worker 
295*6a54128fSAndroid Build Coastguard Worker /*
296*6a54128fSAndroid Build Coastguard Worker  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
297*6a54128fSAndroid Build Coastguard Worker  *
298*6a54128fSAndroid Build Coastguard Worker  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
299*6a54128fSAndroid Build Coastguard Worker  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
300*6a54128fSAndroid Build Coastguard Worker  *
301*6a54128fSAndroid Build Coastguard Worker  * SBase = 0xAC00
302*6a54128fSAndroid Build Coastguard Worker  * LBase = 0x1100
303*6a54128fSAndroid Build Coastguard Worker  * VBase = 0x1161
304*6a54128fSAndroid Build Coastguard Worker  * TBase = 0x11A7
305*6a54128fSAndroid Build Coastguard Worker  * LCount = 19
306*6a54128fSAndroid Build Coastguard Worker  * VCount = 21
307*6a54128fSAndroid Build Coastguard Worker  * TCount = 28
308*6a54128fSAndroid Build Coastguard Worker  * NCount = 588 (VCount * TCount)
309*6a54128fSAndroid Build Coastguard Worker  * SCount = 11172 (LCount * NCount)
310*6a54128fSAndroid Build Coastguard Worker  *
311*6a54128fSAndroid Build Coastguard Worker  * Decomposition:
312*6a54128fSAndroid Build Coastguard Worker  *   SIndex = s - SBase
313*6a54128fSAndroid Build Coastguard Worker  *
 * LV (Canonical/Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
319*6a54128fSAndroid Build Coastguard Worker  *
 * LVT (Canonical)
 *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
 *   LVPart = SBase + LVIndex
 *   TPart = TBase + TIndex
325*6a54128fSAndroid Build Coastguard Worker  *
 * LVT (Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *   if (TIndex == 0) {
 *          d = <LPart, VPart>
 *   } else {
 *          TPart = TBase + TIndex
 *          d = <LPart, VPart, TPart>
 *   }
338*6a54128fSAndroid Build Coastguard Worker  */
339*6a54128fSAndroid Build Coastguard Worker 
/* Constants of the Hangul decomposition algorithm. */
#define SB	(0xAC00)	/* SBase */
#define LB	(0x1100)	/* LBase */
#define VB	(0x1161)	/* VBase */
#define TB	(0x11A7)	/* TBase */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount = 588 */
#define SC	(LC * NC)	/* SCount = 11172 */
350*6a54128fSAndroid Build Coastguard Worker 
351*6a54128fSAndroid Build Coastguard Worker /* Algorithmic decomposition of hangul syllable. */
352*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *
utf8hangul(const char * str,unsigned char * hangul)353*6a54128fSAndroid Build Coastguard Worker utf8hangul(const char *str, unsigned char *hangul)
354*6a54128fSAndroid Build Coastguard Worker {
355*6a54128fSAndroid Build Coastguard Worker 	unsigned int	si;
356*6a54128fSAndroid Build Coastguard Worker 	unsigned int	li;
357*6a54128fSAndroid Build Coastguard Worker 	unsigned int	vi;
358*6a54128fSAndroid Build Coastguard Worker 	unsigned int	ti;
359*6a54128fSAndroid Build Coastguard Worker 	unsigned char	*h;
360*6a54128fSAndroid Build Coastguard Worker 
361*6a54128fSAndroid Build Coastguard Worker 	/* Calculate the SI, LI, VI, and TI values. */
362*6a54128fSAndroid Build Coastguard Worker 	si = utf8decode3(str) - SB;
363*6a54128fSAndroid Build Coastguard Worker 	li = si / NC;
364*6a54128fSAndroid Build Coastguard Worker 	vi = (si % NC) / TC;
365*6a54128fSAndroid Build Coastguard Worker 	ti = si % TC;
366*6a54128fSAndroid Build Coastguard Worker 
367*6a54128fSAndroid Build Coastguard Worker 	/* Fill in base of leaf. */
368*6a54128fSAndroid Build Coastguard Worker 	h = hangul;
369*6a54128fSAndroid Build Coastguard Worker 	LEAF_GEN(h) = 2;
370*6a54128fSAndroid Build Coastguard Worker 	LEAF_CCC(h) = DECOMPOSE;
371*6a54128fSAndroid Build Coastguard Worker 	h += 2;
372*6a54128fSAndroid Build Coastguard Worker 
373*6a54128fSAndroid Build Coastguard Worker 	/* Add LPart, a 3-byte UTF-8 sequence. */
374*6a54128fSAndroid Build Coastguard Worker 	h += utf8encode3((char *)h, li + LB);
375*6a54128fSAndroid Build Coastguard Worker 
376*6a54128fSAndroid Build Coastguard Worker 	/* Add VPart, a 3-byte UTF-8 sequence. */
377*6a54128fSAndroid Build Coastguard Worker 	h += utf8encode3((char *)h, vi + VB);
378*6a54128fSAndroid Build Coastguard Worker 
379*6a54128fSAndroid Build Coastguard Worker 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
380*6a54128fSAndroid Build Coastguard Worker 	if (ti)
381*6a54128fSAndroid Build Coastguard Worker 		h += utf8encode3((char *)h, ti + TB);
382*6a54128fSAndroid Build Coastguard Worker 
383*6a54128fSAndroid Build Coastguard Worker 	/* Terminate string. */
384*6a54128fSAndroid Build Coastguard Worker 	h[0] = '\0';
385*6a54128fSAndroid Build Coastguard Worker 
386*6a54128fSAndroid Build Coastguard Worker 	return hangul;
387*6a54128fSAndroid Build Coastguard Worker }
388*6a54128fSAndroid Build Coastguard Worker 
389*6a54128fSAndroid Build Coastguard Worker /*
390*6a54128fSAndroid Build Coastguard Worker  * Use trie to scan s, touching at most len bytes.
391*6a54128fSAndroid Build Coastguard Worker  * Returns the leaf if one exists, NULL otherwise.
392*6a54128fSAndroid Build Coastguard Worker  *
393*6a54128fSAndroid Build Coastguard Worker  * A non-NULL return guarantees that the UTF-8 sequence starting at s
394*6a54128fSAndroid Build Coastguard Worker  * is well-formed and corresponds to a known unicode code point.  The
395*6a54128fSAndroid Build Coastguard Worker  * shorthand for this will be "is valid UTF-8 unicode".
396*6a54128fSAndroid Build Coastguard Worker  */
utf8nlookup(const struct utf8data * data,unsigned char * hangul,const char * s,size_t len)397*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8nlookup(const struct utf8data *data,
398*6a54128fSAndroid Build Coastguard Worker 			       unsigned char *hangul, const char *s, size_t len)
399*6a54128fSAndroid Build Coastguard Worker {
400*6a54128fSAndroid Build Coastguard Worker 	utf8trie_t	*trie;
401*6a54128fSAndroid Build Coastguard Worker 	int		offlen;
402*6a54128fSAndroid Build Coastguard Worker 	int		offset;
403*6a54128fSAndroid Build Coastguard Worker 	int		mask;
404*6a54128fSAndroid Build Coastguard Worker 	int		node;
405*6a54128fSAndroid Build Coastguard Worker 
406*6a54128fSAndroid Build Coastguard Worker 	if (!data)
407*6a54128fSAndroid Build Coastguard Worker 		return NULL;
408*6a54128fSAndroid Build Coastguard Worker 	if (len == 0)
409*6a54128fSAndroid Build Coastguard Worker 		return NULL;
410*6a54128fSAndroid Build Coastguard Worker 
411*6a54128fSAndroid Build Coastguard Worker 	trie = utf8data + data->offset;
412*6a54128fSAndroid Build Coastguard Worker 	node = 1;
413*6a54128fSAndroid Build Coastguard Worker 	while (node) {
414*6a54128fSAndroid Build Coastguard Worker 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
415*6a54128fSAndroid Build Coastguard Worker 		if (*trie & NEXTBYTE) {
416*6a54128fSAndroid Build Coastguard Worker 			if (--len == 0)
417*6a54128fSAndroid Build Coastguard Worker 				return NULL;
418*6a54128fSAndroid Build Coastguard Worker 			s++;
419*6a54128fSAndroid Build Coastguard Worker 		}
420*6a54128fSAndroid Build Coastguard Worker 		mask = 1 << (*trie & BITNUM);
421*6a54128fSAndroid Build Coastguard Worker 		if (*s & mask) {
422*6a54128fSAndroid Build Coastguard Worker 			/* Right leg */
423*6a54128fSAndroid Build Coastguard Worker 			if (offlen) {
424*6a54128fSAndroid Build Coastguard Worker 				/* Right node at offset of trie */
425*6a54128fSAndroid Build Coastguard Worker 				node = (*trie & RIGHTNODE);
426*6a54128fSAndroid Build Coastguard Worker 				offset = trie[offlen];
427*6a54128fSAndroid Build Coastguard Worker 				while (--offlen) {
428*6a54128fSAndroid Build Coastguard Worker 					offset <<= 8;
429*6a54128fSAndroid Build Coastguard Worker 					offset |= trie[offlen];
430*6a54128fSAndroid Build Coastguard Worker 				}
431*6a54128fSAndroid Build Coastguard Worker 				trie += offset;
432*6a54128fSAndroid Build Coastguard Worker 			} else if (*trie & RIGHTPATH) {
433*6a54128fSAndroid Build Coastguard Worker 				/* Right node after this node */
434*6a54128fSAndroid Build Coastguard Worker 				node = (*trie & TRIENODE);
435*6a54128fSAndroid Build Coastguard Worker 				trie++;
436*6a54128fSAndroid Build Coastguard Worker 			} else {
437*6a54128fSAndroid Build Coastguard Worker 				/* No right node. */
438*6a54128fSAndroid Build Coastguard Worker 				return NULL;
439*6a54128fSAndroid Build Coastguard Worker 			}
440*6a54128fSAndroid Build Coastguard Worker 		} else {
441*6a54128fSAndroid Build Coastguard Worker 			/* Left leg */
442*6a54128fSAndroid Build Coastguard Worker 			if (offlen) {
443*6a54128fSAndroid Build Coastguard Worker 				/* Left node after this node. */
444*6a54128fSAndroid Build Coastguard Worker 				node = (*trie & LEFTNODE);
445*6a54128fSAndroid Build Coastguard Worker 				trie += offlen + 1;
446*6a54128fSAndroid Build Coastguard Worker 			} else if (*trie & RIGHTPATH) {
447*6a54128fSAndroid Build Coastguard Worker 				/* No left node. */
448*6a54128fSAndroid Build Coastguard Worker 				return NULL;
449*6a54128fSAndroid Build Coastguard Worker 			} else {
450*6a54128fSAndroid Build Coastguard Worker 				/* Left node after this node */
451*6a54128fSAndroid Build Coastguard Worker 				node = (*trie & TRIENODE);
452*6a54128fSAndroid Build Coastguard Worker 				trie++;
453*6a54128fSAndroid Build Coastguard Worker 			}
454*6a54128fSAndroid Build Coastguard Worker 		}
455*6a54128fSAndroid Build Coastguard Worker 	}
456*6a54128fSAndroid Build Coastguard Worker 	/*
457*6a54128fSAndroid Build Coastguard Worker 	 * Hangul decomposition is done algorithmically. These are the
458*6a54128fSAndroid Build Coastguard Worker 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
459*6a54128fSAndroid Build Coastguard Worker 	 * always 3 bytes long, so s has been advanced twice, and the
460*6a54128fSAndroid Build Coastguard Worker 	 * start of the sequence is at s-2.
461*6a54128fSAndroid Build Coastguard Worker 	 */
462*6a54128fSAndroid Build Coastguard Worker 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
463*6a54128fSAndroid Build Coastguard Worker 		trie = utf8hangul(s - 2, hangul);
464*6a54128fSAndroid Build Coastguard Worker 	return trie;
465*6a54128fSAndroid Build Coastguard Worker }
466*6a54128fSAndroid Build Coastguard Worker 
467*6a54128fSAndroid Build Coastguard Worker /*
468*6a54128fSAndroid Build Coastguard Worker  * Use trie to scan s.
469*6a54128fSAndroid Build Coastguard Worker  * Returns the leaf if one exists, NULL otherwise.
470*6a54128fSAndroid Build Coastguard Worker  *
471*6a54128fSAndroid Build Coastguard Worker  * Forwards to utf8nlookup().
472*6a54128fSAndroid Build Coastguard Worker  */
utf8lookup(const struct utf8data * data,unsigned char * hangul,const char * s)473*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8lookup(const struct utf8data *data,
474*6a54128fSAndroid Build Coastguard Worker 			      unsigned char *hangul, const char *s)
475*6a54128fSAndroid Build Coastguard Worker {
476*6a54128fSAndroid Build Coastguard Worker 	return utf8nlookup(data, hangul, s, (size_t)-1);
477*6a54128fSAndroid Build Coastguard Worker }
478*6a54128fSAndroid Build Coastguard Worker 
479*6a54128fSAndroid Build Coastguard Worker #if 0
480*6a54128fSAndroid Build Coastguard Worker /*
481*6a54128fSAndroid Build Coastguard Worker  * Maximum age of any character in s.
482*6a54128fSAndroid Build Coastguard Worker  * Return -1 if s is not valid UTF-8 unicode.
483*6a54128fSAndroid Build Coastguard Worker  * Return 0 if only non-assigned code points are used.
484*6a54128fSAndroid Build Coastguard Worker  */
485*6a54128fSAndroid Build Coastguard Worker static int utf8agemax(const struct utf8data *data, const char *s)
486*6a54128fSAndroid Build Coastguard Worker {
487*6a54128fSAndroid Build Coastguard Worker 	utf8leaf_t	*leaf;
488*6a54128fSAndroid Build Coastguard Worker 	int		age = 0;
489*6a54128fSAndroid Build Coastguard Worker 	int		leaf_age;
490*6a54128fSAndroid Build Coastguard Worker 	unsigned char	hangul[UTF8HANGULLEAF];
491*6a54128fSAndroid Build Coastguard Worker 
492*6a54128fSAndroid Build Coastguard Worker 	if (!data)
493*6a54128fSAndroid Build Coastguard Worker 		return -1;
494*6a54128fSAndroid Build Coastguard Worker 
495*6a54128fSAndroid Build Coastguard Worker 	while (*s) {
496*6a54128fSAndroid Build Coastguard Worker 		leaf = utf8lookup(data, hangul, s);
497*6a54128fSAndroid Build Coastguard Worker 		if (!leaf)
498*6a54128fSAndroid Build Coastguard Worker 			return -1;
499*6a54128fSAndroid Build Coastguard Worker 
500*6a54128fSAndroid Build Coastguard Worker 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
501*6a54128fSAndroid Build Coastguard Worker 		if (leaf_age <= data->maxage && leaf_age > age)
502*6a54128fSAndroid Build Coastguard Worker 			age = leaf_age;
503*6a54128fSAndroid Build Coastguard Worker 		s += utf8clen(s);
504*6a54128fSAndroid Build Coastguard Worker 	}
505*6a54128fSAndroid Build Coastguard Worker 	return age;
506*6a54128fSAndroid Build Coastguard Worker }
507*6a54128fSAndroid Build Coastguard Worker #endif
508*6a54128fSAndroid Build Coastguard Worker 
509*6a54128fSAndroid Build Coastguard Worker #if 0
510*6a54128fSAndroid Build Coastguard Worker /*
511*6a54128fSAndroid Build Coastguard Worker  * Minimum age of any character in s.
512*6a54128fSAndroid Build Coastguard Worker  * Return -1 if s is not valid UTF-8 unicode.
513*6a54128fSAndroid Build Coastguard Worker  * Return 0 if non-assigned code points are used.
514*6a54128fSAndroid Build Coastguard Worker  */
515*6a54128fSAndroid Build Coastguard Worker static int utf8agemin(const struct utf8data *data, const char *s)
516*6a54128fSAndroid Build Coastguard Worker {
517*6a54128fSAndroid Build Coastguard Worker 	utf8leaf_t	*leaf;
518*6a54128fSAndroid Build Coastguard Worker 	int		age;
519*6a54128fSAndroid Build Coastguard Worker 	int		leaf_age;
520*6a54128fSAndroid Build Coastguard Worker 	unsigned char	hangul[UTF8HANGULLEAF];
521*6a54128fSAndroid Build Coastguard Worker 
522*6a54128fSAndroid Build Coastguard Worker 	if (!data)
523*6a54128fSAndroid Build Coastguard Worker 		return -1;
524*6a54128fSAndroid Build Coastguard Worker 	age = data->maxage;
525*6a54128fSAndroid Build Coastguard Worker 	while (*s) {
526*6a54128fSAndroid Build Coastguard Worker 		leaf = utf8lookup(data, hangul, s);
527*6a54128fSAndroid Build Coastguard Worker 		if (!leaf)
528*6a54128fSAndroid Build Coastguard Worker 			return -1;
529*6a54128fSAndroid Build Coastguard Worker 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
530*6a54128fSAndroid Build Coastguard Worker 		if (leaf_age <= data->maxage && leaf_age < age)
531*6a54128fSAndroid Build Coastguard Worker 			age = leaf_age;
532*6a54128fSAndroid Build Coastguard Worker 		s += utf8clen(s);
533*6a54128fSAndroid Build Coastguard Worker 	}
534*6a54128fSAndroid Build Coastguard Worker 	return age;
535*6a54128fSAndroid Build Coastguard Worker }
536*6a54128fSAndroid Build Coastguard Worker #endif
537*6a54128fSAndroid Build Coastguard Worker 
538*6a54128fSAndroid Build Coastguard Worker #if 0
539*6a54128fSAndroid Build Coastguard Worker /*
540*6a54128fSAndroid Build Coastguard Worker  * Maximum age of any character in s, touch at most len bytes.
541*6a54128fSAndroid Build Coastguard Worker  * Return -1 if s is not valid UTF-8 unicode.
542*6a54128fSAndroid Build Coastguard Worker  */
543*6a54128fSAndroid Build Coastguard Worker static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
544*6a54128fSAndroid Build Coastguard Worker {
545*6a54128fSAndroid Build Coastguard Worker 	utf8leaf_t	*leaf;
546*6a54128fSAndroid Build Coastguard Worker 	int		age = 0;
547*6a54128fSAndroid Build Coastguard Worker 	int		leaf_age;
548*6a54128fSAndroid Build Coastguard Worker 	unsigned char	hangul[UTF8HANGULLEAF];
549*6a54128fSAndroid Build Coastguard Worker 
550*6a54128fSAndroid Build Coastguard Worker 	if (!data)
551*6a54128fSAndroid Build Coastguard Worker 		return -1;
552*6a54128fSAndroid Build Coastguard Worker 
553*6a54128fSAndroid Build Coastguard Worker 	while (len && *s) {
554*6a54128fSAndroid Build Coastguard Worker 		leaf = utf8nlookup(data, hangul, s, len);
555*6a54128fSAndroid Build Coastguard Worker 		if (!leaf)
556*6a54128fSAndroid Build Coastguard Worker 			return -1;
557*6a54128fSAndroid Build Coastguard Worker 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
558*6a54128fSAndroid Build Coastguard Worker 		if (leaf_age <= data->maxage && leaf_age > age)
559*6a54128fSAndroid Build Coastguard Worker 			age = leaf_age;
560*6a54128fSAndroid Build Coastguard Worker 		len -= utf8clen(s);
561*6a54128fSAndroid Build Coastguard Worker 		s += utf8clen(s);
562*6a54128fSAndroid Build Coastguard Worker 	}
563*6a54128fSAndroid Build Coastguard Worker 	return age;
564*6a54128fSAndroid Build Coastguard Worker }
565*6a54128fSAndroid Build Coastguard Worker #endif
566*6a54128fSAndroid Build Coastguard Worker 
567*6a54128fSAndroid Build Coastguard Worker #if 0
568*6a54128fSAndroid Build Coastguard Worker /*
569*6a54128fSAndroid Build Coastguard Worker  * Maximum age of any character in s, touch at most len bytes.
570*6a54128fSAndroid Build Coastguard Worker  * Return -1 if s is not valid UTF-8 unicode.
571*6a54128fSAndroid Build Coastguard Worker  */
572*6a54128fSAndroid Build Coastguard Worker static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
573*6a54128fSAndroid Build Coastguard Worker {
574*6a54128fSAndroid Build Coastguard Worker 	utf8leaf_t	*leaf;
575*6a54128fSAndroid Build Coastguard Worker 	int		leaf_age;
576*6a54128fSAndroid Build Coastguard Worker 	int		age;
577*6a54128fSAndroid Build Coastguard Worker 	unsigned char	hangul[UTF8HANGULLEAF];
578*6a54128fSAndroid Build Coastguard Worker 
579*6a54128fSAndroid Build Coastguard Worker 	if (!data)
580*6a54128fSAndroid Build Coastguard Worker 		return -1;
581*6a54128fSAndroid Build Coastguard Worker 	age = data->maxage;
582*6a54128fSAndroid Build Coastguard Worker 	while (len && *s) {
583*6a54128fSAndroid Build Coastguard Worker 		leaf = utf8nlookup(data, hangul, s, len);
584*6a54128fSAndroid Build Coastguard Worker 		if (!leaf)
585*6a54128fSAndroid Build Coastguard Worker 			return -1;
586*6a54128fSAndroid Build Coastguard Worker 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
587*6a54128fSAndroid Build Coastguard Worker 		if (leaf_age <= data->maxage && leaf_age < age)
588*6a54128fSAndroid Build Coastguard Worker 			age = leaf_age;
589*6a54128fSAndroid Build Coastguard Worker 		len -= utf8clen(s);
590*6a54128fSAndroid Build Coastguard Worker 		s += utf8clen(s);
591*6a54128fSAndroid Build Coastguard Worker 	}
592*6a54128fSAndroid Build Coastguard Worker 	return age;
593*6a54128fSAndroid Build Coastguard Worker }
594*6a54128fSAndroid Build Coastguard Worker #endif
595*6a54128fSAndroid Build Coastguard Worker 
596*6a54128fSAndroid Build Coastguard Worker #if 0
597*6a54128fSAndroid Build Coastguard Worker /*
598*6a54128fSAndroid Build Coastguard Worker  * Length of the normalization of s.
599*6a54128fSAndroid Build Coastguard Worker  * Return -1 if s is not valid UTF-8 unicode.
600*6a54128fSAndroid Build Coastguard Worker  *
601*6a54128fSAndroid Build Coastguard Worker  * A string of Default_Ignorable_Code_Point has length 0.
602*6a54128fSAndroid Build Coastguard Worker  */
603*6a54128fSAndroid Build Coastguard Worker static ssize_t utf8len(const struct utf8data *data, const char *s)
604*6a54128fSAndroid Build Coastguard Worker {
605*6a54128fSAndroid Build Coastguard Worker 	utf8leaf_t	*leaf;
606*6a54128fSAndroid Build Coastguard Worker 	size_t		ret = 0;
607*6a54128fSAndroid Build Coastguard Worker 	unsigned char	hangul[UTF8HANGULLEAF];
608*6a54128fSAndroid Build Coastguard Worker 
609*6a54128fSAndroid Build Coastguard Worker 	if (!data)
610*6a54128fSAndroid Build Coastguard Worker 		return -1;
611*6a54128fSAndroid Build Coastguard Worker 	while (*s) {
612*6a54128fSAndroid Build Coastguard Worker 		leaf = utf8lookup(data, hangul, s);
613*6a54128fSAndroid Build Coastguard Worker 		if (!leaf)
614*6a54128fSAndroid Build Coastguard Worker 			return -1;
615*6a54128fSAndroid Build Coastguard Worker 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
616*6a54128fSAndroid Build Coastguard Worker 			ret += utf8clen(s);
617*6a54128fSAndroid Build Coastguard Worker 		else if (LEAF_CCC(leaf) == DECOMPOSE)
618*6a54128fSAndroid Build Coastguard Worker 			ret += strlen(LEAF_STR(leaf));
619*6a54128fSAndroid Build Coastguard Worker 		else
620*6a54128fSAndroid Build Coastguard Worker 			ret += utf8clen(s);
621*6a54128fSAndroid Build Coastguard Worker 		s += utf8clen(s);
622*6a54128fSAndroid Build Coastguard Worker 	}
623*6a54128fSAndroid Build Coastguard Worker 	return ret;
624*6a54128fSAndroid Build Coastguard Worker }
625*6a54128fSAndroid Build Coastguard Worker #endif
626*6a54128fSAndroid Build Coastguard Worker 
627*6a54128fSAndroid Build Coastguard Worker #if 0
628*6a54128fSAndroid Build Coastguard Worker /*
629*6a54128fSAndroid Build Coastguard Worker  * Length of the normalization of s, touch at most len bytes.
630*6a54128fSAndroid Build Coastguard Worker  * Return -1 if s is not valid UTF-8 unicode.
631*6a54128fSAndroid Build Coastguard Worker  */
632*6a54128fSAndroid Build Coastguard Worker static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
633*6a54128fSAndroid Build Coastguard Worker {
634*6a54128fSAndroid Build Coastguard Worker 	utf8leaf_t	*leaf;
635*6a54128fSAndroid Build Coastguard Worker 	size_t		ret = 0;
636*6a54128fSAndroid Build Coastguard Worker 	unsigned char	hangul[UTF8HANGULLEAF];
637*6a54128fSAndroid Build Coastguard Worker 
638*6a54128fSAndroid Build Coastguard Worker 	if (!data)
639*6a54128fSAndroid Build Coastguard Worker 		return -1;
640*6a54128fSAndroid Build Coastguard Worker 	while (len && *s) {
641*6a54128fSAndroid Build Coastguard Worker 		leaf = utf8nlookup(data, hangul, s, len);
642*6a54128fSAndroid Build Coastguard Worker 		if (!leaf)
643*6a54128fSAndroid Build Coastguard Worker 			return -1;
644*6a54128fSAndroid Build Coastguard Worker 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
645*6a54128fSAndroid Build Coastguard Worker 			ret += utf8clen(s);
646*6a54128fSAndroid Build Coastguard Worker 		else if (LEAF_CCC(leaf) == DECOMPOSE)
647*6a54128fSAndroid Build Coastguard Worker 			ret += strlen(LEAF_STR(leaf));
648*6a54128fSAndroid Build Coastguard Worker 		else
649*6a54128fSAndroid Build Coastguard Worker 			ret += utf8clen(s);
650*6a54128fSAndroid Build Coastguard Worker 		len -= utf8clen(s);
651*6a54128fSAndroid Build Coastguard Worker 		s += utf8clen(s);
652*6a54128fSAndroid Build Coastguard Worker 	}
653*6a54128fSAndroid Build Coastguard Worker 	return ret;
654*6a54128fSAndroid Build Coastguard Worker }
655*6a54128fSAndroid Build Coastguard Worker #endif
656*6a54128fSAndroid Build Coastguard Worker 
657*6a54128fSAndroid Build Coastguard Worker /*
658*6a54128fSAndroid Build Coastguard Worker  * Set up an utf8cursor for use by utf8byte().
659*6a54128fSAndroid Build Coastguard Worker  *
660*6a54128fSAndroid Build Coastguard Worker  *   u8c    : pointer to cursor.
661*6a54128fSAndroid Build Coastguard Worker  *   data   : const struct utf8data to use for normalization.
662*6a54128fSAndroid Build Coastguard Worker  *   s      : string.
663*6a54128fSAndroid Build Coastguard Worker  *   len    : length of s.
664*6a54128fSAndroid Build Coastguard Worker  *
665*6a54128fSAndroid Build Coastguard Worker  * Returns -1 on error, 0 on success.
666*6a54128fSAndroid Build Coastguard Worker  */
utf8ncursor(struct utf8cursor * u8c,const struct utf8data * data,const char * s,size_t len)667*6a54128fSAndroid Build Coastguard Worker static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
668*6a54128fSAndroid Build Coastguard Worker 		const char *s, size_t len)
669*6a54128fSAndroid Build Coastguard Worker {
670*6a54128fSAndroid Build Coastguard Worker 	if (!data)
671*6a54128fSAndroid Build Coastguard Worker 		return -1;
672*6a54128fSAndroid Build Coastguard Worker 	if (!s)
673*6a54128fSAndroid Build Coastguard Worker 		return -1;
674*6a54128fSAndroid Build Coastguard Worker 	u8c->data = data;
675*6a54128fSAndroid Build Coastguard Worker 	u8c->s = s;
676*6a54128fSAndroid Build Coastguard Worker 	u8c->p = NULL;
677*6a54128fSAndroid Build Coastguard Worker 	u8c->ss = NULL;
678*6a54128fSAndroid Build Coastguard Worker 	u8c->sp = NULL;
679*6a54128fSAndroid Build Coastguard Worker 	u8c->len = len;
680*6a54128fSAndroid Build Coastguard Worker 	u8c->slen = 0;
681*6a54128fSAndroid Build Coastguard Worker 	u8c->ccc = STOPPER;
682*6a54128fSAndroid Build Coastguard Worker 	u8c->nccc = STOPPER;
683*6a54128fSAndroid Build Coastguard Worker 	/* Check we didn't clobber the maximum length. */
684*6a54128fSAndroid Build Coastguard Worker 	if (u8c->len != len)
685*6a54128fSAndroid Build Coastguard Worker 		return -1;
686*6a54128fSAndroid Build Coastguard Worker 	/* The first byte of s may not be an utf8 continuation. */
687*6a54128fSAndroid Build Coastguard Worker 	if (len > 0 && (*s & 0xC0) == 0x80)
688*6a54128fSAndroid Build Coastguard Worker 		return -1;
689*6a54128fSAndroid Build Coastguard Worker 	return 0;
690*6a54128fSAndroid Build Coastguard Worker }
691*6a54128fSAndroid Build Coastguard Worker 
692*6a54128fSAndroid Build Coastguard Worker #if 0
/*
 * Prepare an utf8cursor for use by utf8byte() on a NUL-terminated
 * string.
 *
 *   u8c    : cursor to initialize.
 *   data   : const struct utf8data to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with the largest unsigned int value as
 * the length and returns its result: -1 on error, 0 on success.
 */
static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s)
{
	const unsigned int no_limit = (unsigned int)-1;

	return utf8ncursor(u8c, data, s, no_limit);
}
707*6a54128fSAndroid Build Coastguard Worker #endif
708*6a54128fSAndroid Build Coastguard Worker 
709*6a54128fSAndroid Build Coastguard Worker /*
710*6a54128fSAndroid Build Coastguard Worker  * Get one byte from the normalized form of the string described by u8c.
711*6a54128fSAndroid Build Coastguard Worker  *
712*6a54128fSAndroid Build Coastguard Worker  * Returns the byte cast to an unsigned char on success, and -1 on failure.
713*6a54128fSAndroid Build Coastguard Worker  *
714*6a54128fSAndroid Build Coastguard Worker  * The cursor keeps track of the location in the string in u8c->s.
715*6a54128fSAndroid Build Coastguard Worker  * When a character is decomposed, the current location is stored in
716*6a54128fSAndroid Build Coastguard Worker  * u8c->p, and u8c->s is set to the start of the decomposition. Note
717*6a54128fSAndroid Build Coastguard Worker  * that bytes from a decomposition do not count against u8c->len.
718*6a54128fSAndroid Build Coastguard Worker  *
719*6a54128fSAndroid Build Coastguard Worker  * Characters are emitted if they match the current CCC in u8c->ccc.
720*6a54128fSAndroid Build Coastguard Worker  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
721*6a54128fSAndroid Build Coastguard Worker  * and the function returns 0 in that case.
722*6a54128fSAndroid Build Coastguard Worker  *
723*6a54128fSAndroid Build Coastguard Worker  * Sorting by CCC is done by repeatedly scanning the string.  The
724*6a54128fSAndroid Build Coastguard Worker  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
725*6a54128fSAndroid Build Coastguard Worker  * the start of the scan.  The first pass finds the lowest CCC to be
726*6a54128fSAndroid Build Coastguard Worker  * emitted and stores it in u8c->nccc, the second pass emits the
727*6a54128fSAndroid Build Coastguard Worker  * characters with this CCC and finds the next lowest CCC. This limits
728*6a54128fSAndroid Build Coastguard Worker  * the number of passes to 1 + the number of different CCCs in the
729*6a54128fSAndroid Build Coastguard Worker  * sequence being scanned.
730*6a54128fSAndroid Build Coastguard Worker  *
731*6a54128fSAndroid Build Coastguard Worker  * Therefore:
732*6a54128fSAndroid Build Coastguard Worker  *  u8c->p  != NULL -> a decomposition is being scanned.
733*6a54128fSAndroid Build Coastguard Worker  *  u8c->ss != NULL -> this is a repeating scan.
734*6a54128fSAndroid Build Coastguard Worker  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
735*6a54128fSAndroid Build Coastguard Worker  */
utf8byte(struct utf8cursor * u8c)736*6a54128fSAndroid Build Coastguard Worker static int utf8byte(struct utf8cursor *u8c)
737*6a54128fSAndroid Build Coastguard Worker {
738*6a54128fSAndroid Build Coastguard Worker 	utf8leaf_t *leaf;
739*6a54128fSAndroid Build Coastguard Worker 	int ccc;
740*6a54128fSAndroid Build Coastguard Worker 
741*6a54128fSAndroid Build Coastguard Worker 	for (;;) {
742*6a54128fSAndroid Build Coastguard Worker 		/* Check for the end of a decomposed character. */
743*6a54128fSAndroid Build Coastguard Worker 		if (u8c->p && *u8c->s == '\0') {
744*6a54128fSAndroid Build Coastguard Worker 			u8c->s = u8c->p;
745*6a54128fSAndroid Build Coastguard Worker 			u8c->p = NULL;
746*6a54128fSAndroid Build Coastguard Worker 		}
747*6a54128fSAndroid Build Coastguard Worker 
748*6a54128fSAndroid Build Coastguard Worker 		/* Check for end-of-string. */
749*6a54128fSAndroid Build Coastguard Worker 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
750*6a54128fSAndroid Build Coastguard Worker 			/* There is no next byte. */
751*6a54128fSAndroid Build Coastguard Worker 			if (u8c->ccc == STOPPER)
752*6a54128fSAndroid Build Coastguard Worker 				return 0;
753*6a54128fSAndroid Build Coastguard Worker 			/* End-of-string during a scan counts as a stopper. */
754*6a54128fSAndroid Build Coastguard Worker 			ccc = STOPPER;
755*6a54128fSAndroid Build Coastguard Worker 			goto ccc_mismatch;
756*6a54128fSAndroid Build Coastguard Worker 		} else if ((*u8c->s & 0xC0) == 0x80) {
757*6a54128fSAndroid Build Coastguard Worker 			/* This is a continuation of the current character. */
758*6a54128fSAndroid Build Coastguard Worker 			if (!u8c->p)
759*6a54128fSAndroid Build Coastguard Worker 				u8c->len--;
760*6a54128fSAndroid Build Coastguard Worker 			return (unsigned char)*u8c->s++;
761*6a54128fSAndroid Build Coastguard Worker 		}
762*6a54128fSAndroid Build Coastguard Worker 
763*6a54128fSAndroid Build Coastguard Worker 		/* Look up the data for the current character. */
764*6a54128fSAndroid Build Coastguard Worker 		if (u8c->p) {
765*6a54128fSAndroid Build Coastguard Worker 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
766*6a54128fSAndroid Build Coastguard Worker 		} else {
767*6a54128fSAndroid Build Coastguard Worker 			leaf = utf8nlookup(u8c->data, u8c->hangul,
768*6a54128fSAndroid Build Coastguard Worker 					   u8c->s, u8c->len);
769*6a54128fSAndroid Build Coastguard Worker 		}
770*6a54128fSAndroid Build Coastguard Worker 
771*6a54128fSAndroid Build Coastguard Worker 		/* No leaf found implies that the input is a binary blob. */
772*6a54128fSAndroid Build Coastguard Worker 		if (!leaf)
773*6a54128fSAndroid Build Coastguard Worker 			return -1;
774*6a54128fSAndroid Build Coastguard Worker 
775*6a54128fSAndroid Build Coastguard Worker 		ccc = LEAF_CCC(leaf);
776*6a54128fSAndroid Build Coastguard Worker 		/* Characters that are too new have CCC 0. */
777*6a54128fSAndroid Build Coastguard Worker 		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
778*6a54128fSAndroid Build Coastguard Worker 			ccc = STOPPER;
779*6a54128fSAndroid Build Coastguard Worker 		} else if (ccc == DECOMPOSE) {
780*6a54128fSAndroid Build Coastguard Worker 			u8c->len -= utf8clen(u8c->s);
781*6a54128fSAndroid Build Coastguard Worker 			u8c->p = u8c->s + utf8clen(u8c->s);
782*6a54128fSAndroid Build Coastguard Worker 			u8c->s = LEAF_STR(leaf);
783*6a54128fSAndroid Build Coastguard Worker 			/* Empty decomposition implies CCC 0. */
784*6a54128fSAndroid Build Coastguard Worker 			if (*u8c->s == '\0') {
785*6a54128fSAndroid Build Coastguard Worker 				if (u8c->ccc == STOPPER)
786*6a54128fSAndroid Build Coastguard Worker 					continue;
787*6a54128fSAndroid Build Coastguard Worker 				ccc = STOPPER;
788*6a54128fSAndroid Build Coastguard Worker 				goto ccc_mismatch;
789*6a54128fSAndroid Build Coastguard Worker 			}
790*6a54128fSAndroid Build Coastguard Worker 
791*6a54128fSAndroid Build Coastguard Worker 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
792*6a54128fSAndroid Build Coastguard Worker 			if (!leaf)
793*6a54128fSAndroid Build Coastguard Worker 				return -1;
794*6a54128fSAndroid Build Coastguard Worker 			ccc = LEAF_CCC(leaf);
795*6a54128fSAndroid Build Coastguard Worker 		}
796*6a54128fSAndroid Build Coastguard Worker 
797*6a54128fSAndroid Build Coastguard Worker 		/*
798*6a54128fSAndroid Build Coastguard Worker 		 * If this is not a stopper, then see if it updates
799*6a54128fSAndroid Build Coastguard Worker 		 * the next canonical class to be emitted.
800*6a54128fSAndroid Build Coastguard Worker 		 */
801*6a54128fSAndroid Build Coastguard Worker 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
802*6a54128fSAndroid Build Coastguard Worker 			u8c->nccc = ccc;
803*6a54128fSAndroid Build Coastguard Worker 
804*6a54128fSAndroid Build Coastguard Worker 		/*
805*6a54128fSAndroid Build Coastguard Worker 		 * Return the current byte if this is the current
806*6a54128fSAndroid Build Coastguard Worker 		 * combining class.
807*6a54128fSAndroid Build Coastguard Worker 		 */
808*6a54128fSAndroid Build Coastguard Worker 		if (ccc == u8c->ccc) {
809*6a54128fSAndroid Build Coastguard Worker 			if (!u8c->p)
810*6a54128fSAndroid Build Coastguard Worker 				u8c->len--;
811*6a54128fSAndroid Build Coastguard Worker 			return (unsigned char)*u8c->s++;
812*6a54128fSAndroid Build Coastguard Worker 		}
813*6a54128fSAndroid Build Coastguard Worker 
814*6a54128fSAndroid Build Coastguard Worker 		/* Current combining class mismatch. */
815*6a54128fSAndroid Build Coastguard Worker ccc_mismatch:
816*6a54128fSAndroid Build Coastguard Worker 		if (u8c->nccc == STOPPER) {
817*6a54128fSAndroid Build Coastguard Worker 			/*
818*6a54128fSAndroid Build Coastguard Worker 			 * Scan forward for the first canonical class
819*6a54128fSAndroid Build Coastguard Worker 			 * to be emitted.  Save the position from
820*6a54128fSAndroid Build Coastguard Worker 			 * which to restart.
821*6a54128fSAndroid Build Coastguard Worker 			 */
822*6a54128fSAndroid Build Coastguard Worker 			u8c->ccc = MINCCC - 1;
823*6a54128fSAndroid Build Coastguard Worker 			u8c->nccc = ccc;
824*6a54128fSAndroid Build Coastguard Worker 			u8c->sp = u8c->p;
825*6a54128fSAndroid Build Coastguard Worker 			u8c->ss = u8c->s;
826*6a54128fSAndroid Build Coastguard Worker 			u8c->slen = u8c->len;
827*6a54128fSAndroid Build Coastguard Worker 			if (!u8c->p)
828*6a54128fSAndroid Build Coastguard Worker 				u8c->len -= utf8clen(u8c->s);
829*6a54128fSAndroid Build Coastguard Worker 			u8c->s += utf8clen(u8c->s);
830*6a54128fSAndroid Build Coastguard Worker 		} else if (ccc != STOPPER) {
831*6a54128fSAndroid Build Coastguard Worker 			/* Not a stopper, and not the ccc we're emitting. */
832*6a54128fSAndroid Build Coastguard Worker 			if (!u8c->p)
833*6a54128fSAndroid Build Coastguard Worker 				u8c->len -= utf8clen(u8c->s);
834*6a54128fSAndroid Build Coastguard Worker 			u8c->s += utf8clen(u8c->s);
835*6a54128fSAndroid Build Coastguard Worker 		} else if (u8c->nccc != MAXCCC + 1) {
836*6a54128fSAndroid Build Coastguard Worker 			/* At a stopper, restart for next ccc. */
837*6a54128fSAndroid Build Coastguard Worker 			u8c->ccc = u8c->nccc;
838*6a54128fSAndroid Build Coastguard Worker 			u8c->nccc = MAXCCC + 1;
839*6a54128fSAndroid Build Coastguard Worker 			u8c->s = u8c->ss;
840*6a54128fSAndroid Build Coastguard Worker 			u8c->p = u8c->sp;
841*6a54128fSAndroid Build Coastguard Worker 			u8c->len = u8c->slen;
842*6a54128fSAndroid Build Coastguard Worker 		} else {
843*6a54128fSAndroid Build Coastguard Worker 			/* All done, proceed from here. */
844*6a54128fSAndroid Build Coastguard Worker 			u8c->ccc = STOPPER;
845*6a54128fSAndroid Build Coastguard Worker 			u8c->nccc = STOPPER;
846*6a54128fSAndroid Build Coastguard Worker 			u8c->sp = NULL;
847*6a54128fSAndroid Build Coastguard Worker 			u8c->ss = NULL;
848*6a54128fSAndroid Build Coastguard Worker 			u8c->slen = 0;
849*6a54128fSAndroid Build Coastguard Worker 		}
850*6a54128fSAndroid Build Coastguard Worker 	}
851*6a54128fSAndroid Build Coastguard Worker }
852*6a54128fSAndroid Build Coastguard Worker 
853*6a54128fSAndroid Build Coastguard Worker #if 0
854*6a54128fSAndroid Build Coastguard Worker /*
855*6a54128fSAndroid Build Coastguard Worker  * Look for the correct const struct utf8data for a unicode version.
856*6a54128fSAndroid Build Coastguard Worker  * Returns NULL if the version requested is too new.
857*6a54128fSAndroid Build Coastguard Worker  *
858*6a54128fSAndroid Build Coastguard Worker  * Two normalization forms are supported: nfdi and nfdicf.
859*6a54128fSAndroid Build Coastguard Worker  *
860*6a54128fSAndroid Build Coastguard Worker  * nfdi:
861*6a54128fSAndroid Build Coastguard Worker  *  - Apply unicode normalization form NFD.
862*6a54128fSAndroid Build Coastguard Worker  *  - Remove any Default_Ignorable_Code_Point.
863*6a54128fSAndroid Build Coastguard Worker  *
864*6a54128fSAndroid Build Coastguard Worker  * nfdicf:
865*6a54128fSAndroid Build Coastguard Worker  *  - Apply unicode normalization form NFD.
866*6a54128fSAndroid Build Coastguard Worker  *  - Remove any Default_Ignorable_Code_Point.
867*6a54128fSAndroid Build Coastguard Worker  *  - Apply a full casefold (C + F).
868*6a54128fSAndroid Build Coastguard Worker  */
869*6a54128fSAndroid Build Coastguard Worker static const struct utf8data *utf8nfdi(unsigned int maxage)
870*6a54128fSAndroid Build Coastguard Worker {
871*6a54128fSAndroid Build Coastguard Worker 	int i = ARRAY_SIZE(utf8nfdidata) - 1;
872*6a54128fSAndroid Build Coastguard Worker 
873*6a54128fSAndroid Build Coastguard Worker 	while (maxage < utf8nfdidata[i].maxage)
874*6a54128fSAndroid Build Coastguard Worker 		i--;
875*6a54128fSAndroid Build Coastguard Worker 	if (maxage > utf8nfdidata[i].maxage)
876*6a54128fSAndroid Build Coastguard Worker 		return NULL;
877*6a54128fSAndroid Build Coastguard Worker 	return &utf8nfdidata[i];
878*6a54128fSAndroid Build Coastguard Worker }
879*6a54128fSAndroid Build Coastguard Worker #endif
880*6a54128fSAndroid Build Coastguard Worker 
utf8nfdicf(unsigned int maxage)881*6a54128fSAndroid Build Coastguard Worker static const struct utf8data *utf8nfdicf(unsigned int maxage)
882*6a54128fSAndroid Build Coastguard Worker {
883*6a54128fSAndroid Build Coastguard Worker 	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
884*6a54128fSAndroid Build Coastguard Worker 
885*6a54128fSAndroid Build Coastguard Worker 	while (maxage < utf8nfdicfdata[i].maxage)
886*6a54128fSAndroid Build Coastguard Worker 		i--;
887*6a54128fSAndroid Build Coastguard Worker 	if (maxage > utf8nfdicfdata[i].maxage)
888*6a54128fSAndroid Build Coastguard Worker 		return NULL;
889*6a54128fSAndroid Build Coastguard Worker 	return &utf8nfdicfdata[i];
890*6a54128fSAndroid Build Coastguard Worker }
891*6a54128fSAndroid Build Coastguard Worker 
utf8_casefold(const struct ext2fs_nls_table * table,const unsigned char * str,size_t len,unsigned char * dest,size_t dlen)892*6a54128fSAndroid Build Coastguard Worker static int utf8_casefold(const struct ext2fs_nls_table *table,
893*6a54128fSAndroid Build Coastguard Worker 			  const unsigned char *str, size_t len,
894*6a54128fSAndroid Build Coastguard Worker 			  unsigned char *dest, size_t dlen)
895*6a54128fSAndroid Build Coastguard Worker {
896*6a54128fSAndroid Build Coastguard Worker 	const struct utf8data *data = utf8nfdicf(table->version);
897*6a54128fSAndroid Build Coastguard Worker 	struct utf8cursor cur;
898*6a54128fSAndroid Build Coastguard Worker 	size_t nlen = 0;
899*6a54128fSAndroid Build Coastguard Worker 
900*6a54128fSAndroid Build Coastguard Worker 	if (utf8ncursor(&cur, data, (const char *) str, len) < 0)
901*6a54128fSAndroid Build Coastguard Worker 		goto invalid_seq;
902*6a54128fSAndroid Build Coastguard Worker 
903*6a54128fSAndroid Build Coastguard Worker 	for (nlen = 0; nlen < dlen; nlen++) {
904*6a54128fSAndroid Build Coastguard Worker 		int c = utf8byte(&cur);
905*6a54128fSAndroid Build Coastguard Worker 
906*6a54128fSAndroid Build Coastguard Worker 		dest[nlen] = c;
907*6a54128fSAndroid Build Coastguard Worker 		if (!c)
908*6a54128fSAndroid Build Coastguard Worker 			return nlen;
909*6a54128fSAndroid Build Coastguard Worker 		if (c == -1)
910*6a54128fSAndroid Build Coastguard Worker 			break;
911*6a54128fSAndroid Build Coastguard Worker 	}
912*6a54128fSAndroid Build Coastguard Worker 
913*6a54128fSAndroid Build Coastguard Worker 	return -ENAMETOOLONG;
914*6a54128fSAndroid Build Coastguard Worker 
915*6a54128fSAndroid Build Coastguard Worker invalid_seq:
916*6a54128fSAndroid Build Coastguard Worker 	if (dlen < len)
917*6a54128fSAndroid Build Coastguard Worker 		return -ENAMETOOLONG;
918*6a54128fSAndroid Build Coastguard Worker 
919*6a54128fSAndroid Build Coastguard Worker 	/* Signal invalid sequence */
920*6a54128fSAndroid Build Coastguard Worker 	return -EINVAL;
921*6a54128fSAndroid Build Coastguard Worker }
922*6a54128fSAndroid Build Coastguard Worker 
utf8_validate(const struct ext2fs_nls_table * table,char * s,size_t len,char ** pos)923*6a54128fSAndroid Build Coastguard Worker static int utf8_validate(const struct ext2fs_nls_table *table,
924*6a54128fSAndroid Build Coastguard Worker 			 char *s, size_t len, char **pos)
925*6a54128fSAndroid Build Coastguard Worker {
926*6a54128fSAndroid Build Coastguard Worker 	const struct utf8data *data = utf8nfdicf(table->version);
927*6a54128fSAndroid Build Coastguard Worker 	utf8leaf_t	*leaf;
928*6a54128fSAndroid Build Coastguard Worker 	unsigned char	hangul[UTF8HANGULLEAF];
929*6a54128fSAndroid Build Coastguard Worker 
930*6a54128fSAndroid Build Coastguard Worker 	if (!data)
931*6a54128fSAndroid Build Coastguard Worker 		return -1;
932*6a54128fSAndroid Build Coastguard Worker 	while (len && *s) {
933*6a54128fSAndroid Build Coastguard Worker 		leaf = utf8nlookup(data, hangul, s, len);
934*6a54128fSAndroid Build Coastguard Worker 		if (!leaf) {
935*6a54128fSAndroid Build Coastguard Worker 			*pos = s;
936*6a54128fSAndroid Build Coastguard Worker 			return 1;
937*6a54128fSAndroid Build Coastguard Worker 		}
938*6a54128fSAndroid Build Coastguard Worker 		len -= utf8clen(s);
939*6a54128fSAndroid Build Coastguard Worker 		s += utf8clen(s);
940*6a54128fSAndroid Build Coastguard Worker 	}
941*6a54128fSAndroid Build Coastguard Worker 	return 0;
942*6a54128fSAndroid Build Coastguard Worker }
943*6a54128fSAndroid Build Coastguard Worker 
utf8_casefold_cmp(const struct ext2fs_nls_table * table,const unsigned char * str1,size_t len1,const unsigned char * str2,size_t len2)944*6a54128fSAndroid Build Coastguard Worker static int utf8_casefold_cmp(const struct ext2fs_nls_table *table,
945*6a54128fSAndroid Build Coastguard Worker 			     const unsigned char *str1, size_t len1,
946*6a54128fSAndroid Build Coastguard Worker 			     const unsigned char *str2, size_t len2)
947*6a54128fSAndroid Build Coastguard Worker {
948*6a54128fSAndroid Build Coastguard Worker 	const struct utf8data *data = utf8nfdicf(table->version);
949*6a54128fSAndroid Build Coastguard Worker 	int c1, c2;
950*6a54128fSAndroid Build Coastguard Worker 	struct utf8cursor cur1, cur2;
951*6a54128fSAndroid Build Coastguard Worker 
952*6a54128fSAndroid Build Coastguard Worker 	if (utf8ncursor(&cur1, data, (const char *) str1, len1) < 0)
953*6a54128fSAndroid Build Coastguard Worker 		return -1;
954*6a54128fSAndroid Build Coastguard Worker 	if (utf8ncursor(&cur2, data, (const char *) str2, len2) < 0)
955*6a54128fSAndroid Build Coastguard Worker 		return -1;
956*6a54128fSAndroid Build Coastguard Worker 
957*6a54128fSAndroid Build Coastguard Worker 	do {
958*6a54128fSAndroid Build Coastguard Worker 		c1 = utf8byte(&cur1);
959*6a54128fSAndroid Build Coastguard Worker 		c2 = utf8byte(&cur2);
960*6a54128fSAndroid Build Coastguard Worker 
961*6a54128fSAndroid Build Coastguard Worker 		if (c1 < 0 || c2 < 0)
962*6a54128fSAndroid Build Coastguard Worker 			return -1;
963*6a54128fSAndroid Build Coastguard Worker 		if (c1 != c2)
964*6a54128fSAndroid Build Coastguard Worker 			return c1 - c2;
965*6a54128fSAndroid Build Coastguard Worker 	} while (c1);
966*6a54128fSAndroid Build Coastguard Worker 
967*6a54128fSAndroid Build Coastguard Worker 	return 0;
968*6a54128fSAndroid Build Coastguard Worker }
969*6a54128fSAndroid Build Coastguard Worker 
970*6a54128fSAndroid Build Coastguard Worker static const struct ext2fs_nls_ops utf8_ops = {
971*6a54128fSAndroid Build Coastguard Worker 	.casefold = utf8_casefold,
972*6a54128fSAndroid Build Coastguard Worker 	.validate = utf8_validate,
973*6a54128fSAndroid Build Coastguard Worker 	.casefold_cmp = utf8_casefold_cmp,
974*6a54128fSAndroid Build Coastguard Worker };
975*6a54128fSAndroid Build Coastguard Worker 
976*6a54128fSAndroid Build Coastguard Worker static const struct ext2fs_nls_table nls_utf8 = {
977*6a54128fSAndroid Build Coastguard Worker 	.ops = &utf8_ops,
978*6a54128fSAndroid Build Coastguard Worker 	.version = UNICODE_AGE(12, 1, 0),
979*6a54128fSAndroid Build Coastguard Worker };
980*6a54128fSAndroid Build Coastguard Worker 
ext2fs_load_nls_table(int encoding)981*6a54128fSAndroid Build Coastguard Worker const struct ext2fs_nls_table *ext2fs_load_nls_table(int encoding)
982*6a54128fSAndroid Build Coastguard Worker {
983*6a54128fSAndroid Build Coastguard Worker 	if (encoding == EXT4_ENC_UTF8_12_1)
984*6a54128fSAndroid Build Coastguard Worker 		return &nls_utf8;
985*6a54128fSAndroid Build Coastguard Worker 
986*6a54128fSAndroid Build Coastguard Worker 	return NULL;
987*6a54128fSAndroid Build Coastguard Worker }
988*6a54128fSAndroid Build Coastguard Worker 
ext2fs_check_encoded_name(const struct ext2fs_nls_table * table,char * name,size_t len,char ** pos)989*6a54128fSAndroid Build Coastguard Worker int ext2fs_check_encoded_name(const struct ext2fs_nls_table *table,
990*6a54128fSAndroid Build Coastguard Worker 			      char *name, size_t len, char **pos)
991*6a54128fSAndroid Build Coastguard Worker {
992*6a54128fSAndroid Build Coastguard Worker 	return table->ops->validate(table, name, len, pos);
993*6a54128fSAndroid Build Coastguard Worker }
994*6a54128fSAndroid Build Coastguard Worker 
ext2fs_casefold_cmp(const struct ext2fs_nls_table * table,const unsigned char * str1,size_t len1,const unsigned char * str2,size_t len2)995*6a54128fSAndroid Build Coastguard Worker int ext2fs_casefold_cmp(const struct ext2fs_nls_table *table,
996*6a54128fSAndroid Build Coastguard Worker 			const unsigned char *str1, size_t len1,
997*6a54128fSAndroid Build Coastguard Worker 			const unsigned char *str2, size_t len2)
998*6a54128fSAndroid Build Coastguard Worker {
999*6a54128fSAndroid Build Coastguard Worker 	return table->ops->casefold_cmp(table, str1, len1, str2, len2);
1000*6a54128fSAndroid Build Coastguard Worker }
1001