1*6a54128fSAndroid Build Coastguard Worker /*
2*6a54128fSAndroid Build Coastguard Worker * Copyright (c) 2014 SGI.
3*6a54128fSAndroid Build Coastguard Worker * All rights reserved.
4*6a54128fSAndroid Build Coastguard Worker *
5*6a54128fSAndroid Build Coastguard Worker * This program is free software; you can redistribute it and/or
6*6a54128fSAndroid Build Coastguard Worker * modify it under the terms of the GNU General Public License as
7*6a54128fSAndroid Build Coastguard Worker * published by the Free Software Foundation.
8*6a54128fSAndroid Build Coastguard Worker *
9*6a54128fSAndroid Build Coastguard Worker * This program is distributed in the hope that it would be useful,
10*6a54128fSAndroid Build Coastguard Worker * but WITHOUT ANY WARRANTY; without even the implied warranty of
11*6a54128fSAndroid Build Coastguard Worker * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12*6a54128fSAndroid Build Coastguard Worker * GNU General Public License for more details.
13*6a54128fSAndroid Build Coastguard Worker *
14*6a54128fSAndroid Build Coastguard Worker * You should have received a copy of the GNU General Public License
15*6a54128fSAndroid Build Coastguard Worker * along with this program; if not, write the Free Software Foundation,
16*6a54128fSAndroid Build Coastguard Worker * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*6a54128fSAndroid Build Coastguard Worker */
18*6a54128fSAndroid Build Coastguard Worker
19*6a54128fSAndroid Build Coastguard Worker /* Generator for a compact trie for unicode normalization */
20*6a54128fSAndroid Build Coastguard Worker
21*6a54128fSAndroid Build Coastguard Worker #include <sys/types.h>
22*6a54128fSAndroid Build Coastguard Worker #include <stddef.h>
23*6a54128fSAndroid Build Coastguard Worker #include <stdlib.h>
24*6a54128fSAndroid Build Coastguard Worker #include <stdio.h>
25*6a54128fSAndroid Build Coastguard Worker #include <assert.h>
26*6a54128fSAndroid Build Coastguard Worker #include <string.h>
27*6a54128fSAndroid Build Coastguard Worker #include <unistd.h>
28*6a54128fSAndroid Build Coastguard Worker #include <errno.h>
29*6a54128fSAndroid Build Coastguard Worker
/* Default names of the in- and output files. */

#define AGE_NAME	"DerivedAge.txt"
#define CCC_NAME	"DerivedCombiningClass.txt"
#define PROP_NAME	"DerivedCoreProperties.txt"
#define DATA_NAME	"UnicodeData.txt"
#define FOLD_NAME	"CaseFolding.txt"
#define NORM_NAME	"NormalizationCorrections.txt"
#define TEST_NAME	"NormalizationTest.txt"
#define UTF8_NAME	"utf8data.h"

/*
 * File names actually used, initialised to the defaults above.
 * NOTE(review): presumably overridden from the command line; the code
 * doing so is not part of this section.
 */
const char *age_name  = AGE_NAME;
const char *ccc_name  = CCC_NAME;
const char *prop_name = PROP_NAME;
const char *data_name = DATA_NAME;
const char *fold_name = FOLD_NAME;
const char *norm_name = NORM_NAME;
const char *test_name = TEST_NAME;
const char *utf8_name = UTF8_NAME;

/* Verbosity level; 0 unless changed elsewhere. */
int verbose = 0;

/* An arbitrary line size limit on input lines. */

#define LINESIZE	1024
/* Buffer for one input line. */
char line[LINESIZE];
/*
 * Four more LINESIZE-byte buffers.
 * NOTE(review): presumably scratch space for fields split out of
 * line[]; their use is not visible in this section.
 */
char buf0[LINESIZE];
char buf1[LINESIZE];
char buf2[LINESIZE];
char buf3[LINESIZE];

/* NOTE(review): presumably the program name taken from argv[0]; set elsewhere. */
const char *argv0;
62*6a54128fSAndroid Build Coastguard Worker
63*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
64*6a54128fSAndroid Build Coastguard Worker
/*
 * Unicode version numbers consist of three parts: major, minor, and a
 * revision.  These numbers are packed into an unsigned int to obtain
 * a single version number.
 *
 * To save space in the generated trie, the unicode version is not
 * stored directly, instead we calculate a generation number from the
 * unicode versions seen in the DerivedAge file, and use that as an
 * index into a table of unicode versions.
 */
/* Bit positions of the major and minor parts; the revision sits at bit 0. */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

/* Largest value accepted for each part (all-ones of the cast type). */
#define UNICODE_MAJ_MAX			((unsigned short)-1)
#define UNICODE_MIN_MAX			((unsigned char)-1)
#define UNICODE_REV_MAX			((unsigned char)-1)

/*
 * Pack a version triple: MAJ from bit 16 upwards, MIN in bits 8-15,
 * REV in bits 0-7.  The arguments are not range checked here.
 */
#define UNICODE_AGE(MAJ,MIN,REV)			\
	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
	 ((unsigned int)(REV)))

/*
 * NOTE(review): presumably the table of packed unicode versions
 * described above and its entry count; both are filled in elsewhere.
 */
unsigned int *ages;
int ages_count;

/* NOTE(review): presumably the newest packed version seen; set elsewhere. */
unsigned int unicode_maxage;
91*6a54128fSAndroid Build Coastguard Worker
age_valid(unsigned int major,unsigned int minor,unsigned int revision)92*6a54128fSAndroid Build Coastguard Worker static int age_valid(unsigned int major, unsigned int minor,
93*6a54128fSAndroid Build Coastguard Worker unsigned int revision)
94*6a54128fSAndroid Build Coastguard Worker {
95*6a54128fSAndroid Build Coastguard Worker if (major > UNICODE_MAJ_MAX)
96*6a54128fSAndroid Build Coastguard Worker return 0;
97*6a54128fSAndroid Build Coastguard Worker if (minor > UNICODE_MIN_MAX)
98*6a54128fSAndroid Build Coastguard Worker return 0;
99*6a54128fSAndroid Build Coastguard Worker if (revision > UNICODE_REV_MAX)
100*6a54128fSAndroid Build Coastguard Worker return 0;
101*6a54128fSAndroid Build Coastguard Worker return 1;
102*6a54128fSAndroid Build Coastguard Worker }
103*6a54128fSAndroid Build Coastguard Worker
104*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
105*6a54128fSAndroid Build Coastguard Worker
/*
 * utf8trie_t
 *
 * A compact binary tree, used to decode UTF-8 characters.
 *
 * Internal nodes are one byte for the node itself, and up to three
 * bytes for an offset into the tree.  The first byte contains the
 * following information:
 *  NEXTBYTE  - flag        - advance to next byte if set
 *  BITNUM    - 3 bit field - the bit number to be tested
 *  OFFLEN    - 2 bit field - number of bytes in the offset
 *  if offlen == 0 (non-branching node)
 *   RIGHTPATH - 1 bit field - set if the following node is for the
 *                             right-hand path (tested bit is set)
 *   TRIENODE  - 1 bit field - set if the following node is an internal
 *                             node, otherwise it is a leaf node
 *  if offlen != 0 (branching node)
 *   LEFTNODE  - 1 bit field - set if the left-hand node is internal
 *   RIGHTNODE - 1 bit field - set if the right-hand node is internal
 *
 * Due to the way utf8 works, there cannot be branching nodes with
 * NEXTBYTE set, and moreover those nodes always have a righthand
 * descendant.
 */
typedef unsigned char utf8trie_t;
#define BITNUM		0x07
#define NEXTBYTE	0x08
#define OFFLEN		0x30
#define OFFLEN_SHIFT	4
/*
 * Bits 0x40 and 0x80 are shared: they read as RIGHTPATH/TRIENODE when
 * OFFLEN is zero and as RIGHTNODE/LEFTNODE otherwise.
 */
#define RIGHTPATH	0x40
#define TRIENODE	0x80
#define RIGHTNODE	0x40
#define LEFTNODE	0x80
/*
 * utf8leaf_t
 *
 * The leaves of the trie are embedded in the trie, and so use the same
 * underlying datatype, unsigned char.
 *
 * leaf[0]: The unicode version, stored as a generation number that is
 *          an index into utf8agetab[].  With this we can filter code
 *          points based on the unicode version in which they were
 *          defined.  The CCC of a non-defined code point is 0.
 * leaf[1]: Canonical Combining Class.  During normalization, we need
 *          to do a stable sort into ascending order of all characters
 *          with a non-zero CCC that occur between two characters with
 *          a CCC of 0, or at the beginning or end of a string.
 *          The unicode standard guarantees that all CCC values are
 *          between 0 and 254 inclusive, which leaves 255 available as
 *          a special value.
 *          Code points with CCC 0 are known as stoppers.
 * leaf[2]: Decomposition.  If leaf[1] == 255, then leaf[2] is the
 *          start of a NUL-terminated string that is the decomposition
 *          of the character.
 *          The CCC of a decomposable character is the same as the CCC
 *          of the first character of its decomposition.
 *          Some characters decompose as the empty string: these are
 *          characters with the Default_Ignorable_Code_Point property.
 *          These do affect normalization, as they all have CCC 0.
 *
 * The decompositions in the trie have been fully expanded.
 *
 * Casefolding, if applicable, is also done using decompositions.
 */
typedef unsigned char utf8leaf_t;

/* Accessors for the leaf layout described above. */
#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))

/* Largest generation number that fits in leaf[0]. */
#define MAXGEN		(255)

/* CCC range, and the two special leaf[1] values named in the comment. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)
#define DECOMPOSE	(255)
/*
 * NOTE(review): presumably a marker byte for Hangul syllables inside
 * decomposition strings; its use is not visible in this section.
 */
#define HANGUL		((char)(255))

/* NOTE(review): presumably the byte size of a Hangul leaf; confirm at the users. */
#define UTF8HANGULLEAF	(12)
186*6a54128fSAndroid Build Coastguard Worker
187*6a54128fSAndroid Build Coastguard Worker struct tree;
188*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
189*6a54128fSAndroid Build Coastguard Worker const char *, size_t);
190*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
191*6a54128fSAndroid Build Coastguard Worker
192*6a54128fSAndroid Build Coastguard Worker unsigned char *utf8data;
193*6a54128fSAndroid Build Coastguard Worker size_t utf8data_size;
194*6a54128fSAndroid Build Coastguard Worker
195*6a54128fSAndroid Build Coastguard Worker utf8trie_t *nfkdi;
196*6a54128fSAndroid Build Coastguard Worker utf8trie_t *nfkdicf;
197*6a54128fSAndroid Build Coastguard Worker
198*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
199*6a54128fSAndroid Build Coastguard Worker
/*
 * UTF8 valid ranges.
 *
 * The UTF-8 encoding spreads the bits of a 32bit word over several
 * bytes.  This table gives the ranges that can be held and how they'd
 * be represented.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * There is an additional requirement on UTF-8, in that only the
 * shortest representation of a 32bit value is to be used.  A decoder
 * must not decode sequences that do not satisfy this requirement.
 * Thus the allowed ranges have a lower bound.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 * 17 planes of 65536 values.  This limits the sequences actually seen
 * even more, to just the following.
 *
 *          0 -     0x7f: 0                     0x7f
 *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
 *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
 *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
 *
 * Even within those ranges not all values are allowed: the surrogates
 * 0xd800 - 0xdfff should never be seen.
 *
 * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character.  This makes the UTF-8
 * representation of Unicode strictly smaller than UTF-32.
 *
 * The shortest sequence requirement was introduced by:
 *    Corrigendum #1: UTF-8 Shortest Form
 * It can be found here:
 *    http://www.unicode.org/versions/corrigendum1.html
 *
 */

/*
 * UTF8_n_BITS is the fixed prefix of the lead byte of an n-byte
 * sequence and UTF8_n_MASK selects that prefix; UTF8_N_BITS/UTF8_N_MASK
 * do the same for continuation bytes (10xxxxxx).
 */
#define UTF8_2_BITS	0xC0
#define UTF8_3_BITS	0xE0
#define UTF8_4_BITS	0xF0
#define UTF8_N_BITS	0x80
#define UTF8_2_MASK	0xE0
#define UTF8_3_MASK	0xF0
#define UTF8_4_MASK	0xF8
#define UTF8_N_MASK	0xC0
/* Payload of a continuation byte: its low six bits. */
#define UTF8_V_MASK	0x3F
#define UTF8_V_SHIFT	6
259*6a54128fSAndroid Build Coastguard Worker
utf8encode(char * str,unsigned int val)260*6a54128fSAndroid Build Coastguard Worker static int utf8encode(char *str, unsigned int val)
261*6a54128fSAndroid Build Coastguard Worker {
262*6a54128fSAndroid Build Coastguard Worker int len;
263*6a54128fSAndroid Build Coastguard Worker
264*6a54128fSAndroid Build Coastguard Worker if (val < 0x80) {
265*6a54128fSAndroid Build Coastguard Worker str[0] = val;
266*6a54128fSAndroid Build Coastguard Worker len = 1;
267*6a54128fSAndroid Build Coastguard Worker } else if (val < 0x800) {
268*6a54128fSAndroid Build Coastguard Worker str[1] = val & UTF8_V_MASK;
269*6a54128fSAndroid Build Coastguard Worker str[1] |= UTF8_N_BITS;
270*6a54128fSAndroid Build Coastguard Worker val >>= UTF8_V_SHIFT;
271*6a54128fSAndroid Build Coastguard Worker str[0] = val;
272*6a54128fSAndroid Build Coastguard Worker str[0] |= UTF8_2_BITS;
273*6a54128fSAndroid Build Coastguard Worker len = 2;
274*6a54128fSAndroid Build Coastguard Worker } else if (val < 0x10000) {
275*6a54128fSAndroid Build Coastguard Worker str[2] = val & UTF8_V_MASK;
276*6a54128fSAndroid Build Coastguard Worker str[2] |= UTF8_N_BITS;
277*6a54128fSAndroid Build Coastguard Worker val >>= UTF8_V_SHIFT;
278*6a54128fSAndroid Build Coastguard Worker str[1] = val & UTF8_V_MASK;
279*6a54128fSAndroid Build Coastguard Worker str[1] |= UTF8_N_BITS;
280*6a54128fSAndroid Build Coastguard Worker val >>= UTF8_V_SHIFT;
281*6a54128fSAndroid Build Coastguard Worker str[0] = val;
282*6a54128fSAndroid Build Coastguard Worker str[0] |= UTF8_3_BITS;
283*6a54128fSAndroid Build Coastguard Worker len = 3;
284*6a54128fSAndroid Build Coastguard Worker } else if (val < 0x110000) {
285*6a54128fSAndroid Build Coastguard Worker str[3] = val & UTF8_V_MASK;
286*6a54128fSAndroid Build Coastguard Worker str[3] |= UTF8_N_BITS;
287*6a54128fSAndroid Build Coastguard Worker val >>= UTF8_V_SHIFT;
288*6a54128fSAndroid Build Coastguard Worker str[2] = val & UTF8_V_MASK;
289*6a54128fSAndroid Build Coastguard Worker str[2] |= UTF8_N_BITS;
290*6a54128fSAndroid Build Coastguard Worker val >>= UTF8_V_SHIFT;
291*6a54128fSAndroid Build Coastguard Worker str[1] = val & UTF8_V_MASK;
292*6a54128fSAndroid Build Coastguard Worker str[1] |= UTF8_N_BITS;
293*6a54128fSAndroid Build Coastguard Worker val >>= UTF8_V_SHIFT;
294*6a54128fSAndroid Build Coastguard Worker str[0] = val;
295*6a54128fSAndroid Build Coastguard Worker str[0] |= UTF8_4_BITS;
296*6a54128fSAndroid Build Coastguard Worker len = 4;
297*6a54128fSAndroid Build Coastguard Worker } else {
298*6a54128fSAndroid Build Coastguard Worker printf("%#x: illegal val\n", val);
299*6a54128fSAndroid Build Coastguard Worker len = 0;
300*6a54128fSAndroid Build Coastguard Worker }
301*6a54128fSAndroid Build Coastguard Worker return len;
302*6a54128fSAndroid Build Coastguard Worker }
303*6a54128fSAndroid Build Coastguard Worker
utf8decode(const char * str)304*6a54128fSAndroid Build Coastguard Worker static unsigned int utf8decode(const char *str)
305*6a54128fSAndroid Build Coastguard Worker {
306*6a54128fSAndroid Build Coastguard Worker const unsigned char *s = (const unsigned char*)str;
307*6a54128fSAndroid Build Coastguard Worker unsigned int unichar = 0;
308*6a54128fSAndroid Build Coastguard Worker
309*6a54128fSAndroid Build Coastguard Worker if (*s < 0x80) {
310*6a54128fSAndroid Build Coastguard Worker unichar = *s;
311*6a54128fSAndroid Build Coastguard Worker } else if (*s < UTF8_3_BITS) {
312*6a54128fSAndroid Build Coastguard Worker unichar = *s++ & 0x1F;
313*6a54128fSAndroid Build Coastguard Worker unichar <<= UTF8_V_SHIFT;
314*6a54128fSAndroid Build Coastguard Worker unichar |= *s & 0x3F;
315*6a54128fSAndroid Build Coastguard Worker } else if (*s < UTF8_4_BITS) {
316*6a54128fSAndroid Build Coastguard Worker unichar = *s++ & 0x0F;
317*6a54128fSAndroid Build Coastguard Worker unichar <<= UTF8_V_SHIFT;
318*6a54128fSAndroid Build Coastguard Worker unichar |= *s++ & 0x3F;
319*6a54128fSAndroid Build Coastguard Worker unichar <<= UTF8_V_SHIFT;
320*6a54128fSAndroid Build Coastguard Worker unichar |= *s & 0x3F;
321*6a54128fSAndroid Build Coastguard Worker } else {
322*6a54128fSAndroid Build Coastguard Worker unichar = *s++ & 0x0F;
323*6a54128fSAndroid Build Coastguard Worker unichar <<= UTF8_V_SHIFT;
324*6a54128fSAndroid Build Coastguard Worker unichar |= *s++ & 0x3F;
325*6a54128fSAndroid Build Coastguard Worker unichar <<= UTF8_V_SHIFT;
326*6a54128fSAndroid Build Coastguard Worker unichar |= *s++ & 0x3F;
327*6a54128fSAndroid Build Coastguard Worker unichar <<= UTF8_V_SHIFT;
328*6a54128fSAndroid Build Coastguard Worker unichar |= *s & 0x3F;
329*6a54128fSAndroid Build Coastguard Worker }
330*6a54128fSAndroid Build Coastguard Worker return unichar;
331*6a54128fSAndroid Build Coastguard Worker }
332*6a54128fSAndroid Build Coastguard Worker
/*
 * Range check for a UTF-32 value: returns 1 for anything below
 * 0x110000 and 0 otherwise.  Surrogates (0xd800 - 0xdfff) pass this
 * check.
 */
static int utf32valid(unsigned int unichar)
{
	return unichar < 0x110000;
}
337*6a54128fSAndroid Build Coastguard Worker
/* True when U lies in 0xAC00 - 0xD7A3 inclusive.  U is evaluated twice. */
#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)

/* Values for the childnode, leftnode and rightnode fields below. */
#define NODE 1
#define LEAF 0
342*6a54128fSAndroid Build Coastguard Worker
/*
 * One in-memory tree together with the callbacks used to handle its
 * leaves.
 */
struct tree {
	void *root;		/* struct node, or a leaf if childnode == LEAF */
	int childnode;		/* NODE or LEAF: what root points at */
	const char *type;	/* tree_walk() prints this as "<type>_<maxage>" */
	unsigned int maxage;
	struct tree *next;	/* NOTE(review): presumably links related trees */
	/*
	 * Leaf callbacks.  Only leaf_print (leaf, indent) is exercised in
	 * this section; the others are named for their apparent purpose.
	 */
	int (*leaf_equal)(void *, void *);
	void (*leaf_print)(void *, int);
	int (*leaf_mark)(void *);
	int (*leaf_size)(void *);
	int *(*leaf_index)(struct tree *, void *);
	unsigned char *(*leaf_emit)(void *, unsigned char *);
	/* 0x110000 entries, i.e. as many as there are unicode code points. */
	int leafindex[0x110000];
	int index;
};
358*6a54128fSAndroid Build Coastguard Worker
/*
 * An internal node of a struct tree.  Each node tests one bit of the
 * key and has up to two children.
 */
struct node {
	/*
	 * NOTE(review): index, offset, mark and size are presumably
	 * bookkeeping for laying out the emitted trie; they are not
	 * used in this section.
	 */
	int index;
	int offset;
	int mark;
	int size;
	struct node *parent;
	void *left;		/* child taken when the tested bit is clear */
	void *right;		/* child taken when the tested bit is set */
	unsigned char bitnum;	/* bit of the key byte that is tested */
	unsigned char nextbyte;	/* non-zero: step to the next key byte first */
	unsigned char leftnode;	/* NODE or LEAF: what left points at */
	unsigned char rightnode; /* NODE or LEAF: what right points at */
	/* tree_walk() prints these as "bits" and "mask". */
	unsigned int keybits;
	unsigned int keymask;
};
374*6a54128fSAndroid Build Coastguard Worker
375*6a54128fSAndroid Build Coastguard Worker /*
376*6a54128fSAndroid Build Coastguard Worker * Example lookup function for a tree.
377*6a54128fSAndroid Build Coastguard Worker */
lookup(struct tree * tree,const char * key)378*6a54128fSAndroid Build Coastguard Worker static void *lookup(struct tree *tree, const char *key)
379*6a54128fSAndroid Build Coastguard Worker {
380*6a54128fSAndroid Build Coastguard Worker struct node *node;
381*6a54128fSAndroid Build Coastguard Worker void *leaf = NULL;
382*6a54128fSAndroid Build Coastguard Worker
383*6a54128fSAndroid Build Coastguard Worker node = tree->root;
384*6a54128fSAndroid Build Coastguard Worker while (!leaf && node) {
385*6a54128fSAndroid Build Coastguard Worker if (node->nextbyte)
386*6a54128fSAndroid Build Coastguard Worker key++;
387*6a54128fSAndroid Build Coastguard Worker if (*key & (1 << (node->bitnum & 7))) {
388*6a54128fSAndroid Build Coastguard Worker /* Right leg */
389*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == NODE) {
390*6a54128fSAndroid Build Coastguard Worker node = node->right;
391*6a54128fSAndroid Build Coastguard Worker } else if (node->rightnode == LEAF) {
392*6a54128fSAndroid Build Coastguard Worker leaf = node->right;
393*6a54128fSAndroid Build Coastguard Worker } else {
394*6a54128fSAndroid Build Coastguard Worker node = NULL;
395*6a54128fSAndroid Build Coastguard Worker }
396*6a54128fSAndroid Build Coastguard Worker } else {
397*6a54128fSAndroid Build Coastguard Worker /* Left leg */
398*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == NODE) {
399*6a54128fSAndroid Build Coastguard Worker node = node->left;
400*6a54128fSAndroid Build Coastguard Worker } else if (node->leftnode == LEAF) {
401*6a54128fSAndroid Build Coastguard Worker leaf = node->left;
402*6a54128fSAndroid Build Coastguard Worker } else {
403*6a54128fSAndroid Build Coastguard Worker node = NULL;
404*6a54128fSAndroid Build Coastguard Worker }
405*6a54128fSAndroid Build Coastguard Worker }
406*6a54128fSAndroid Build Coastguard Worker }
407*6a54128fSAndroid Build Coastguard Worker
408*6a54128fSAndroid Build Coastguard Worker return leaf;
409*6a54128fSAndroid Build Coastguard Worker }
410*6a54128fSAndroid Build Coastguard Worker
411*6a54128fSAndroid Build Coastguard Worker /*
412*6a54128fSAndroid Build Coastguard Worker * A simple non-recursive tree walker: keep track of visits to the
413*6a54128fSAndroid Build Coastguard Worker * left and right branches in the leftmask and rightmask.
414*6a54128fSAndroid Build Coastguard Worker */
tree_walk(struct tree * tree)415*6a54128fSAndroid Build Coastguard Worker static void tree_walk(struct tree *tree)
416*6a54128fSAndroid Build Coastguard Worker {
417*6a54128fSAndroid Build Coastguard Worker struct node *node;
418*6a54128fSAndroid Build Coastguard Worker unsigned int leftmask;
419*6a54128fSAndroid Build Coastguard Worker unsigned int rightmask;
420*6a54128fSAndroid Build Coastguard Worker unsigned int bitmask;
421*6a54128fSAndroid Build Coastguard Worker int indent = 1;
422*6a54128fSAndroid Build Coastguard Worker int nodes, singletons, leaves;
423*6a54128fSAndroid Build Coastguard Worker
424*6a54128fSAndroid Build Coastguard Worker nodes = singletons = leaves = 0;
425*6a54128fSAndroid Build Coastguard Worker
426*6a54128fSAndroid Build Coastguard Worker printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
427*6a54128fSAndroid Build Coastguard Worker if (tree->childnode == LEAF) {
428*6a54128fSAndroid Build Coastguard Worker assert(tree->root);
429*6a54128fSAndroid Build Coastguard Worker tree->leaf_print(tree->root, indent);
430*6a54128fSAndroid Build Coastguard Worker leaves = 1;
431*6a54128fSAndroid Build Coastguard Worker } else {
432*6a54128fSAndroid Build Coastguard Worker assert(tree->childnode == NODE);
433*6a54128fSAndroid Build Coastguard Worker node = tree->root;
434*6a54128fSAndroid Build Coastguard Worker leftmask = rightmask = 0;
435*6a54128fSAndroid Build Coastguard Worker while (node) {
436*6a54128fSAndroid Build Coastguard Worker printf("%*snode @ %p bitnum %d nextbyte %d"
437*6a54128fSAndroid Build Coastguard Worker " left %p right %p mask %x bits %x\n",
438*6a54128fSAndroid Build Coastguard Worker indent, "", node,
439*6a54128fSAndroid Build Coastguard Worker node->bitnum, node->nextbyte,
440*6a54128fSAndroid Build Coastguard Worker node->left, node->right,
441*6a54128fSAndroid Build Coastguard Worker node->keymask, node->keybits);
442*6a54128fSAndroid Build Coastguard Worker nodes += 1;
443*6a54128fSAndroid Build Coastguard Worker if (!(node->left && node->right))
444*6a54128fSAndroid Build Coastguard Worker singletons += 1;
445*6a54128fSAndroid Build Coastguard Worker
446*6a54128fSAndroid Build Coastguard Worker while (node) {
447*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
448*6a54128fSAndroid Build Coastguard Worker if ((leftmask & bitmask) == 0) {
449*6a54128fSAndroid Build Coastguard Worker leftmask |= bitmask;
450*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == LEAF) {
451*6a54128fSAndroid Build Coastguard Worker assert(node->left);
452*6a54128fSAndroid Build Coastguard Worker tree->leaf_print(node->left,
453*6a54128fSAndroid Build Coastguard Worker indent+1);
454*6a54128fSAndroid Build Coastguard Worker leaves += 1;
455*6a54128fSAndroid Build Coastguard Worker } else if (node->left) {
456*6a54128fSAndroid Build Coastguard Worker assert(node->leftnode == NODE);
457*6a54128fSAndroid Build Coastguard Worker indent += 1;
458*6a54128fSAndroid Build Coastguard Worker node = node->left;
459*6a54128fSAndroid Build Coastguard Worker break;
460*6a54128fSAndroid Build Coastguard Worker }
461*6a54128fSAndroid Build Coastguard Worker }
462*6a54128fSAndroid Build Coastguard Worker if ((rightmask & bitmask) == 0) {
463*6a54128fSAndroid Build Coastguard Worker rightmask |= bitmask;
464*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == LEAF) {
465*6a54128fSAndroid Build Coastguard Worker assert(node->right);
466*6a54128fSAndroid Build Coastguard Worker tree->leaf_print(node->right,
467*6a54128fSAndroid Build Coastguard Worker indent+1);
468*6a54128fSAndroid Build Coastguard Worker leaves += 1;
469*6a54128fSAndroid Build Coastguard Worker } else if (node->right) {
470*6a54128fSAndroid Build Coastguard Worker assert(node->rightnode == NODE);
471*6a54128fSAndroid Build Coastguard Worker indent += 1;
472*6a54128fSAndroid Build Coastguard Worker node = node->right;
473*6a54128fSAndroid Build Coastguard Worker break;
474*6a54128fSAndroid Build Coastguard Worker }
475*6a54128fSAndroid Build Coastguard Worker }
476*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
477*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
478*6a54128fSAndroid Build Coastguard Worker node = node->parent;
479*6a54128fSAndroid Build Coastguard Worker indent -= 1;
480*6a54128fSAndroid Build Coastguard Worker }
481*6a54128fSAndroid Build Coastguard Worker }
482*6a54128fSAndroid Build Coastguard Worker }
483*6a54128fSAndroid Build Coastguard Worker printf("nodes %d leaves %d singletons %d\n",
484*6a54128fSAndroid Build Coastguard Worker nodes, leaves, singletons);
485*6a54128fSAndroid Build Coastguard Worker }
486*6a54128fSAndroid Build Coastguard Worker
487*6a54128fSAndroid Build Coastguard Worker /*
488*6a54128fSAndroid Build Coastguard Worker * Allocate an initialize a new internal node.
489*6a54128fSAndroid Build Coastguard Worker */
alloc_node(struct node * parent)490*6a54128fSAndroid Build Coastguard Worker static struct node *alloc_node(struct node *parent)
491*6a54128fSAndroid Build Coastguard Worker {
492*6a54128fSAndroid Build Coastguard Worker struct node *node;
493*6a54128fSAndroid Build Coastguard Worker int bitnum;
494*6a54128fSAndroid Build Coastguard Worker
495*6a54128fSAndroid Build Coastguard Worker node = malloc(sizeof(*node));
496*6a54128fSAndroid Build Coastguard Worker node->left = node->right = NULL;
497*6a54128fSAndroid Build Coastguard Worker node->parent = parent;
498*6a54128fSAndroid Build Coastguard Worker node->leftnode = NODE;
499*6a54128fSAndroid Build Coastguard Worker node->rightnode = NODE;
500*6a54128fSAndroid Build Coastguard Worker node->keybits = 0;
501*6a54128fSAndroid Build Coastguard Worker node->keymask = 0;
502*6a54128fSAndroid Build Coastguard Worker node->mark = 0;
503*6a54128fSAndroid Build Coastguard Worker node->index = 0;
504*6a54128fSAndroid Build Coastguard Worker node->offset = -1;
505*6a54128fSAndroid Build Coastguard Worker node->size = 4;
506*6a54128fSAndroid Build Coastguard Worker
507*6a54128fSAndroid Build Coastguard Worker if (node->parent) {
508*6a54128fSAndroid Build Coastguard Worker bitnum = parent->bitnum;
509*6a54128fSAndroid Build Coastguard Worker if ((bitnum & 7) == 0) {
510*6a54128fSAndroid Build Coastguard Worker node->bitnum = bitnum + 7 + 8;
511*6a54128fSAndroid Build Coastguard Worker node->nextbyte = 1;
512*6a54128fSAndroid Build Coastguard Worker } else {
513*6a54128fSAndroid Build Coastguard Worker node->bitnum = bitnum - 1;
514*6a54128fSAndroid Build Coastguard Worker node->nextbyte = 0;
515*6a54128fSAndroid Build Coastguard Worker }
516*6a54128fSAndroid Build Coastguard Worker } else {
517*6a54128fSAndroid Build Coastguard Worker node->bitnum = 7;
518*6a54128fSAndroid Build Coastguard Worker node->nextbyte = 0;
519*6a54128fSAndroid Build Coastguard Worker }
520*6a54128fSAndroid Build Coastguard Worker
521*6a54128fSAndroid Build Coastguard Worker return node;
522*6a54128fSAndroid Build Coastguard Worker }
523*6a54128fSAndroid Build Coastguard Worker
524*6a54128fSAndroid Build Coastguard Worker /*
525*6a54128fSAndroid Build Coastguard Worker * Insert a new leaf into the tree, and collapse any subtrees that are
526*6a54128fSAndroid Build Coastguard Worker * fully populated and end in identical leaves. A nextbyte tagged
527*6a54128fSAndroid Build Coastguard Worker * internal node will not be removed to preserve the tree's integrity.
528*6a54128fSAndroid Build Coastguard Worker * Note that due to the structure of utf8, no nextbyte tagged node
529*6a54128fSAndroid Build Coastguard Worker * will be a candidate for removal.
530*6a54128fSAndroid Build Coastguard Worker */
insert(struct tree * tree,char * key,int keylen,void * leaf)531*6a54128fSAndroid Build Coastguard Worker static int insert(struct tree *tree, char *key, int keylen, void *leaf)
532*6a54128fSAndroid Build Coastguard Worker {
533*6a54128fSAndroid Build Coastguard Worker struct node *node;
534*6a54128fSAndroid Build Coastguard Worker struct node *parent;
535*6a54128fSAndroid Build Coastguard Worker void **cursor;
536*6a54128fSAndroid Build Coastguard Worker int keybits;
537*6a54128fSAndroid Build Coastguard Worker
538*6a54128fSAndroid Build Coastguard Worker assert(keylen >= 1 && keylen <= 4);
539*6a54128fSAndroid Build Coastguard Worker
540*6a54128fSAndroid Build Coastguard Worker node = NULL;
541*6a54128fSAndroid Build Coastguard Worker cursor = &tree->root;
542*6a54128fSAndroid Build Coastguard Worker keybits = 8 * keylen;
543*6a54128fSAndroid Build Coastguard Worker
544*6a54128fSAndroid Build Coastguard Worker /* Insert, creating path along the way. */
545*6a54128fSAndroid Build Coastguard Worker while (keybits) {
546*6a54128fSAndroid Build Coastguard Worker if (!*cursor)
547*6a54128fSAndroid Build Coastguard Worker *cursor = alloc_node(node);
548*6a54128fSAndroid Build Coastguard Worker node = *cursor;
549*6a54128fSAndroid Build Coastguard Worker if (node->nextbyte)
550*6a54128fSAndroid Build Coastguard Worker key++;
551*6a54128fSAndroid Build Coastguard Worker if (*key & (1 << (node->bitnum & 7)))
552*6a54128fSAndroid Build Coastguard Worker cursor = &node->right;
553*6a54128fSAndroid Build Coastguard Worker else
554*6a54128fSAndroid Build Coastguard Worker cursor = &node->left;
555*6a54128fSAndroid Build Coastguard Worker keybits--;
556*6a54128fSAndroid Build Coastguard Worker }
557*6a54128fSAndroid Build Coastguard Worker *cursor = leaf;
558*6a54128fSAndroid Build Coastguard Worker
559*6a54128fSAndroid Build Coastguard Worker /* Merge subtrees if possible. */
560*6a54128fSAndroid Build Coastguard Worker while (node) {
561*6a54128fSAndroid Build Coastguard Worker if (*key & (1 << (node->bitnum & 7)))
562*6a54128fSAndroid Build Coastguard Worker node->rightnode = LEAF;
563*6a54128fSAndroid Build Coastguard Worker else
564*6a54128fSAndroid Build Coastguard Worker node->leftnode = LEAF;
565*6a54128fSAndroid Build Coastguard Worker if (node->nextbyte)
566*6a54128fSAndroid Build Coastguard Worker break;
567*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == NODE || node->rightnode == NODE)
568*6a54128fSAndroid Build Coastguard Worker break;
569*6a54128fSAndroid Build Coastguard Worker assert(node->left);
570*6a54128fSAndroid Build Coastguard Worker assert(node->right);
571*6a54128fSAndroid Build Coastguard Worker /* Compare */
572*6a54128fSAndroid Build Coastguard Worker if (! tree->leaf_equal(node->left, node->right))
573*6a54128fSAndroid Build Coastguard Worker break;
574*6a54128fSAndroid Build Coastguard Worker /* Keep left, drop right leaf. */
575*6a54128fSAndroid Build Coastguard Worker leaf = node->left;
576*6a54128fSAndroid Build Coastguard Worker /* Check in parent */
577*6a54128fSAndroid Build Coastguard Worker parent = node->parent;
578*6a54128fSAndroid Build Coastguard Worker if (!parent) {
579*6a54128fSAndroid Build Coastguard Worker /* root of tree! */
580*6a54128fSAndroid Build Coastguard Worker tree->root = leaf;
581*6a54128fSAndroid Build Coastguard Worker tree->childnode = LEAF;
582*6a54128fSAndroid Build Coastguard Worker } else if (parent->left == node) {
583*6a54128fSAndroid Build Coastguard Worker parent->left = leaf;
584*6a54128fSAndroid Build Coastguard Worker parent->leftnode = LEAF;
585*6a54128fSAndroid Build Coastguard Worker if (parent->right) {
586*6a54128fSAndroid Build Coastguard Worker parent->keymask = 0;
587*6a54128fSAndroid Build Coastguard Worker parent->keybits = 0;
588*6a54128fSAndroid Build Coastguard Worker } else {
589*6a54128fSAndroid Build Coastguard Worker parent->keymask |= (1 << node->bitnum);
590*6a54128fSAndroid Build Coastguard Worker }
591*6a54128fSAndroid Build Coastguard Worker } else if (parent->right == node) {
592*6a54128fSAndroid Build Coastguard Worker parent->right = leaf;
593*6a54128fSAndroid Build Coastguard Worker parent->rightnode = LEAF;
594*6a54128fSAndroid Build Coastguard Worker if (parent->left) {
595*6a54128fSAndroid Build Coastguard Worker parent->keymask = 0;
596*6a54128fSAndroid Build Coastguard Worker parent->keybits = 0;
597*6a54128fSAndroid Build Coastguard Worker } else {
598*6a54128fSAndroid Build Coastguard Worker parent->keymask |= (1 << node->bitnum);
599*6a54128fSAndroid Build Coastguard Worker parent->keybits |= (1 << node->bitnum);
600*6a54128fSAndroid Build Coastguard Worker }
601*6a54128fSAndroid Build Coastguard Worker } else {
602*6a54128fSAndroid Build Coastguard Worker /* internal tree error */
603*6a54128fSAndroid Build Coastguard Worker assert(0);
604*6a54128fSAndroid Build Coastguard Worker }
605*6a54128fSAndroid Build Coastguard Worker free(node);
606*6a54128fSAndroid Build Coastguard Worker node = parent;
607*6a54128fSAndroid Build Coastguard Worker }
608*6a54128fSAndroid Build Coastguard Worker
609*6a54128fSAndroid Build Coastguard Worker /* Propagate keymasks up along singleton chains. */
610*6a54128fSAndroid Build Coastguard Worker while (node) {
611*6a54128fSAndroid Build Coastguard Worker parent = node->parent;
612*6a54128fSAndroid Build Coastguard Worker if (!parent)
613*6a54128fSAndroid Build Coastguard Worker break;
614*6a54128fSAndroid Build Coastguard Worker /* Nix the mask for parents with two children. */
615*6a54128fSAndroid Build Coastguard Worker if (node->keymask == 0) {
616*6a54128fSAndroid Build Coastguard Worker parent->keymask = 0;
617*6a54128fSAndroid Build Coastguard Worker parent->keybits = 0;
618*6a54128fSAndroid Build Coastguard Worker } else if (parent->left && parent->right) {
619*6a54128fSAndroid Build Coastguard Worker parent->keymask = 0;
620*6a54128fSAndroid Build Coastguard Worker parent->keybits = 0;
621*6a54128fSAndroid Build Coastguard Worker } else {
622*6a54128fSAndroid Build Coastguard Worker assert((parent->keymask & node->keymask) == 0);
623*6a54128fSAndroid Build Coastguard Worker parent->keymask |= node->keymask;
624*6a54128fSAndroid Build Coastguard Worker parent->keymask |= (1 << parent->bitnum);
625*6a54128fSAndroid Build Coastguard Worker parent->keybits |= node->keybits;
626*6a54128fSAndroid Build Coastguard Worker if (parent->right)
627*6a54128fSAndroid Build Coastguard Worker parent->keybits |= (1 << parent->bitnum);
628*6a54128fSAndroid Build Coastguard Worker }
629*6a54128fSAndroid Build Coastguard Worker node = parent;
630*6a54128fSAndroid Build Coastguard Worker }
631*6a54128fSAndroid Build Coastguard Worker
632*6a54128fSAndroid Build Coastguard Worker return 0;
633*6a54128fSAndroid Build Coastguard Worker }
634*6a54128fSAndroid Build Coastguard Worker
635*6a54128fSAndroid Build Coastguard Worker /*
636*6a54128fSAndroid Build Coastguard Worker * Prune internal nodes.
637*6a54128fSAndroid Build Coastguard Worker *
638*6a54128fSAndroid Build Coastguard Worker * Fully populated subtrees that end at the same leaf have already
639*6a54128fSAndroid Build Coastguard Worker * been collapsed. There are still internal nodes that have for both
640*6a54128fSAndroid Build Coastguard Worker * their left and right branches a sequence of singletons that make
641*6a54128fSAndroid Build Coastguard Worker * identical choices and end in identical leaves. The keymask and
642*6a54128fSAndroid Build Coastguard Worker * keybits collected in the nodes describe the choices made in these
643*6a54128fSAndroid Build Coastguard Worker * singleton chains. When they are identical for the left and right
644*6a54128fSAndroid Build Coastguard Worker * branch of a node, and the two leaves comare identical, the node in
645*6a54128fSAndroid Build Coastguard Worker * question can be removed.
646*6a54128fSAndroid Build Coastguard Worker *
647*6a54128fSAndroid Build Coastguard Worker * Note that nodes with the nextbyte tag set will not be removed by
648*6a54128fSAndroid Build Coastguard Worker * this to ensure tree integrity. Note as well that the structure of
649*6a54128fSAndroid Build Coastguard Worker * utf8 ensures that these nodes would not have been candidates for
650*6a54128fSAndroid Build Coastguard Worker * removal in any case.
651*6a54128fSAndroid Build Coastguard Worker */
prune(struct tree * tree)652*6a54128fSAndroid Build Coastguard Worker static void prune(struct tree *tree)
653*6a54128fSAndroid Build Coastguard Worker {
654*6a54128fSAndroid Build Coastguard Worker struct node *node;
655*6a54128fSAndroid Build Coastguard Worker struct node *left;
656*6a54128fSAndroid Build Coastguard Worker struct node *right;
657*6a54128fSAndroid Build Coastguard Worker struct node *parent;
658*6a54128fSAndroid Build Coastguard Worker void *leftleaf;
659*6a54128fSAndroid Build Coastguard Worker void *rightleaf;
660*6a54128fSAndroid Build Coastguard Worker unsigned int leftmask;
661*6a54128fSAndroid Build Coastguard Worker unsigned int rightmask;
662*6a54128fSAndroid Build Coastguard Worker unsigned int bitmask;
663*6a54128fSAndroid Build Coastguard Worker int count;
664*6a54128fSAndroid Build Coastguard Worker
665*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
666*6a54128fSAndroid Build Coastguard Worker printf("Pruning %s_%x\n", tree->type, tree->maxage);
667*6a54128fSAndroid Build Coastguard Worker
668*6a54128fSAndroid Build Coastguard Worker count = 0;
669*6a54128fSAndroid Build Coastguard Worker if (tree->childnode == LEAF)
670*6a54128fSAndroid Build Coastguard Worker return;
671*6a54128fSAndroid Build Coastguard Worker if (!tree->root)
672*6a54128fSAndroid Build Coastguard Worker return;
673*6a54128fSAndroid Build Coastguard Worker
674*6a54128fSAndroid Build Coastguard Worker leftmask = rightmask = 0;
675*6a54128fSAndroid Build Coastguard Worker node = tree->root;
676*6a54128fSAndroid Build Coastguard Worker while (node) {
677*6a54128fSAndroid Build Coastguard Worker if (node->nextbyte)
678*6a54128fSAndroid Build Coastguard Worker goto advance;
679*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == LEAF)
680*6a54128fSAndroid Build Coastguard Worker goto advance;
681*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == LEAF)
682*6a54128fSAndroid Build Coastguard Worker goto advance;
683*6a54128fSAndroid Build Coastguard Worker if (!node->left)
684*6a54128fSAndroid Build Coastguard Worker goto advance;
685*6a54128fSAndroid Build Coastguard Worker if (!node->right)
686*6a54128fSAndroid Build Coastguard Worker goto advance;
687*6a54128fSAndroid Build Coastguard Worker left = node->left;
688*6a54128fSAndroid Build Coastguard Worker right = node->right;
689*6a54128fSAndroid Build Coastguard Worker if (left->keymask == 0)
690*6a54128fSAndroid Build Coastguard Worker goto advance;
691*6a54128fSAndroid Build Coastguard Worker if (right->keymask == 0)
692*6a54128fSAndroid Build Coastguard Worker goto advance;
693*6a54128fSAndroid Build Coastguard Worker if (left->keymask != right->keymask)
694*6a54128fSAndroid Build Coastguard Worker goto advance;
695*6a54128fSAndroid Build Coastguard Worker if (left->keybits != right->keybits)
696*6a54128fSAndroid Build Coastguard Worker goto advance;
697*6a54128fSAndroid Build Coastguard Worker leftleaf = NULL;
698*6a54128fSAndroid Build Coastguard Worker while (!leftleaf) {
699*6a54128fSAndroid Build Coastguard Worker assert(left->left || left->right);
700*6a54128fSAndroid Build Coastguard Worker if (left->leftnode == LEAF)
701*6a54128fSAndroid Build Coastguard Worker leftleaf = left->left;
702*6a54128fSAndroid Build Coastguard Worker else if (left->rightnode == LEAF)
703*6a54128fSAndroid Build Coastguard Worker leftleaf = left->right;
704*6a54128fSAndroid Build Coastguard Worker else if (left->left)
705*6a54128fSAndroid Build Coastguard Worker left = left->left;
706*6a54128fSAndroid Build Coastguard Worker else if (left->right)
707*6a54128fSAndroid Build Coastguard Worker left = left->right;
708*6a54128fSAndroid Build Coastguard Worker else
709*6a54128fSAndroid Build Coastguard Worker assert(0);
710*6a54128fSAndroid Build Coastguard Worker }
711*6a54128fSAndroid Build Coastguard Worker rightleaf = NULL;
712*6a54128fSAndroid Build Coastguard Worker while (!rightleaf) {
713*6a54128fSAndroid Build Coastguard Worker assert(right->left || right->right);
714*6a54128fSAndroid Build Coastguard Worker if (right->leftnode == LEAF)
715*6a54128fSAndroid Build Coastguard Worker rightleaf = right->left;
716*6a54128fSAndroid Build Coastguard Worker else if (right->rightnode == LEAF)
717*6a54128fSAndroid Build Coastguard Worker rightleaf = right->right;
718*6a54128fSAndroid Build Coastguard Worker else if (right->left)
719*6a54128fSAndroid Build Coastguard Worker right = right->left;
720*6a54128fSAndroid Build Coastguard Worker else if (right->right)
721*6a54128fSAndroid Build Coastguard Worker right = right->right;
722*6a54128fSAndroid Build Coastguard Worker else
723*6a54128fSAndroid Build Coastguard Worker assert(0);
724*6a54128fSAndroid Build Coastguard Worker }
725*6a54128fSAndroid Build Coastguard Worker if (! tree->leaf_equal(leftleaf, rightleaf))
726*6a54128fSAndroid Build Coastguard Worker goto advance;
727*6a54128fSAndroid Build Coastguard Worker /*
728*6a54128fSAndroid Build Coastguard Worker * This node has identical singleton-only subtrees.
729*6a54128fSAndroid Build Coastguard Worker * Remove it.
730*6a54128fSAndroid Build Coastguard Worker */
731*6a54128fSAndroid Build Coastguard Worker parent = node->parent;
732*6a54128fSAndroid Build Coastguard Worker left = node->left;
733*6a54128fSAndroid Build Coastguard Worker right = node->right;
734*6a54128fSAndroid Build Coastguard Worker if (parent->left == node)
735*6a54128fSAndroid Build Coastguard Worker parent->left = left;
736*6a54128fSAndroid Build Coastguard Worker else if (parent->right == node)
737*6a54128fSAndroid Build Coastguard Worker parent->right = left;
738*6a54128fSAndroid Build Coastguard Worker else
739*6a54128fSAndroid Build Coastguard Worker assert(0);
740*6a54128fSAndroid Build Coastguard Worker left->parent = parent;
741*6a54128fSAndroid Build Coastguard Worker left->keymask |= (1 << node->bitnum);
742*6a54128fSAndroid Build Coastguard Worker node->left = NULL;
743*6a54128fSAndroid Build Coastguard Worker while (node) {
744*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
745*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
746*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
747*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == NODE && node->left) {
748*6a54128fSAndroid Build Coastguard Worker left = node->left;
749*6a54128fSAndroid Build Coastguard Worker free(node);
750*6a54128fSAndroid Build Coastguard Worker count++;
751*6a54128fSAndroid Build Coastguard Worker node = left;
752*6a54128fSAndroid Build Coastguard Worker } else if (node->rightnode == NODE && node->right) {
753*6a54128fSAndroid Build Coastguard Worker right = node->right;
754*6a54128fSAndroid Build Coastguard Worker free(node);
755*6a54128fSAndroid Build Coastguard Worker count++;
756*6a54128fSAndroid Build Coastguard Worker node = right;
757*6a54128fSAndroid Build Coastguard Worker } else {
758*6a54128fSAndroid Build Coastguard Worker node = NULL;
759*6a54128fSAndroid Build Coastguard Worker }
760*6a54128fSAndroid Build Coastguard Worker }
761*6a54128fSAndroid Build Coastguard Worker /* Propagate keymasks up along singleton chains. */
762*6a54128fSAndroid Build Coastguard Worker node = parent;
763*6a54128fSAndroid Build Coastguard Worker /* Force re-check */
764*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
765*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
766*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
767*6a54128fSAndroid Build Coastguard Worker for (;;) {
768*6a54128fSAndroid Build Coastguard Worker if (node->left && node->right)
769*6a54128fSAndroid Build Coastguard Worker break;
770*6a54128fSAndroid Build Coastguard Worker if (node->left) {
771*6a54128fSAndroid Build Coastguard Worker left = node->left;
772*6a54128fSAndroid Build Coastguard Worker node->keymask |= left->keymask;
773*6a54128fSAndroid Build Coastguard Worker node->keybits |= left->keybits;
774*6a54128fSAndroid Build Coastguard Worker }
775*6a54128fSAndroid Build Coastguard Worker if (node->right) {
776*6a54128fSAndroid Build Coastguard Worker right = node->right;
777*6a54128fSAndroid Build Coastguard Worker node->keymask |= right->keymask;
778*6a54128fSAndroid Build Coastguard Worker node->keybits |= right->keybits;
779*6a54128fSAndroid Build Coastguard Worker }
780*6a54128fSAndroid Build Coastguard Worker node->keymask |= (1 << node->bitnum);
781*6a54128fSAndroid Build Coastguard Worker node = node->parent;
782*6a54128fSAndroid Build Coastguard Worker /* Force re-check */
783*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
784*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
785*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
786*6a54128fSAndroid Build Coastguard Worker }
787*6a54128fSAndroid Build Coastguard Worker advance:
788*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
789*6a54128fSAndroid Build Coastguard Worker if ((leftmask & bitmask) == 0 &&
790*6a54128fSAndroid Build Coastguard Worker node->leftnode == NODE &&
791*6a54128fSAndroid Build Coastguard Worker node->left) {
792*6a54128fSAndroid Build Coastguard Worker leftmask |= bitmask;
793*6a54128fSAndroid Build Coastguard Worker node = node->left;
794*6a54128fSAndroid Build Coastguard Worker } else if ((rightmask & bitmask) == 0 &&
795*6a54128fSAndroid Build Coastguard Worker node->rightnode == NODE &&
796*6a54128fSAndroid Build Coastguard Worker node->right) {
797*6a54128fSAndroid Build Coastguard Worker rightmask |= bitmask;
798*6a54128fSAndroid Build Coastguard Worker node = node->right;
799*6a54128fSAndroid Build Coastguard Worker } else {
800*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
801*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
802*6a54128fSAndroid Build Coastguard Worker node = node->parent;
803*6a54128fSAndroid Build Coastguard Worker }
804*6a54128fSAndroid Build Coastguard Worker }
805*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
806*6a54128fSAndroid Build Coastguard Worker printf("Pruned %d nodes\n", count);
807*6a54128fSAndroid Build Coastguard Worker }
808*6a54128fSAndroid Build Coastguard Worker
809*6a54128fSAndroid Build Coastguard Worker /*
810*6a54128fSAndroid Build Coastguard Worker * Mark the nodes in the tree that lead to leaves that must be
811*6a54128fSAndroid Build Coastguard Worker * emitted.
812*6a54128fSAndroid Build Coastguard Worker */
mark_nodes(struct tree * tree)813*6a54128fSAndroid Build Coastguard Worker static void mark_nodes(struct tree *tree)
814*6a54128fSAndroid Build Coastguard Worker {
815*6a54128fSAndroid Build Coastguard Worker struct node *node;
816*6a54128fSAndroid Build Coastguard Worker struct node *n;
817*6a54128fSAndroid Build Coastguard Worker unsigned int leftmask;
818*6a54128fSAndroid Build Coastguard Worker unsigned int rightmask;
819*6a54128fSAndroid Build Coastguard Worker unsigned int bitmask;
820*6a54128fSAndroid Build Coastguard Worker int marked;
821*6a54128fSAndroid Build Coastguard Worker
822*6a54128fSAndroid Build Coastguard Worker marked = 0;
823*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
824*6a54128fSAndroid Build Coastguard Worker printf("Marking %s_%x\n", tree->type, tree->maxage);
825*6a54128fSAndroid Build Coastguard Worker if (tree->childnode == LEAF)
826*6a54128fSAndroid Build Coastguard Worker goto done;
827*6a54128fSAndroid Build Coastguard Worker
828*6a54128fSAndroid Build Coastguard Worker assert(tree->childnode == NODE);
829*6a54128fSAndroid Build Coastguard Worker node = tree->root;
830*6a54128fSAndroid Build Coastguard Worker leftmask = rightmask = 0;
831*6a54128fSAndroid Build Coastguard Worker while (node) {
832*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
833*6a54128fSAndroid Build Coastguard Worker if ((leftmask & bitmask) == 0) {
834*6a54128fSAndroid Build Coastguard Worker leftmask |= bitmask;
835*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == LEAF) {
836*6a54128fSAndroid Build Coastguard Worker assert(node->left);
837*6a54128fSAndroid Build Coastguard Worker if (tree->leaf_mark(node->left)) {
838*6a54128fSAndroid Build Coastguard Worker n = node;
839*6a54128fSAndroid Build Coastguard Worker while (n && !n->mark) {
840*6a54128fSAndroid Build Coastguard Worker marked++;
841*6a54128fSAndroid Build Coastguard Worker n->mark = 1;
842*6a54128fSAndroid Build Coastguard Worker n = n->parent;
843*6a54128fSAndroid Build Coastguard Worker }
844*6a54128fSAndroid Build Coastguard Worker }
845*6a54128fSAndroid Build Coastguard Worker } else if (node->left) {
846*6a54128fSAndroid Build Coastguard Worker assert(node->leftnode == NODE);
847*6a54128fSAndroid Build Coastguard Worker node = node->left;
848*6a54128fSAndroid Build Coastguard Worker continue;
849*6a54128fSAndroid Build Coastguard Worker }
850*6a54128fSAndroid Build Coastguard Worker }
851*6a54128fSAndroid Build Coastguard Worker if ((rightmask & bitmask) == 0) {
852*6a54128fSAndroid Build Coastguard Worker rightmask |= bitmask;
853*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == LEAF) {
854*6a54128fSAndroid Build Coastguard Worker assert(node->right);
855*6a54128fSAndroid Build Coastguard Worker if (tree->leaf_mark(node->right)) {
856*6a54128fSAndroid Build Coastguard Worker n = node;
857*6a54128fSAndroid Build Coastguard Worker while (n && !n->mark) {
858*6a54128fSAndroid Build Coastguard Worker marked++;
859*6a54128fSAndroid Build Coastguard Worker n->mark = 1;
860*6a54128fSAndroid Build Coastguard Worker n = n->parent;
861*6a54128fSAndroid Build Coastguard Worker }
862*6a54128fSAndroid Build Coastguard Worker }
863*6a54128fSAndroid Build Coastguard Worker } else if (node->right) {
864*6a54128fSAndroid Build Coastguard Worker assert(node->rightnode == NODE);
865*6a54128fSAndroid Build Coastguard Worker node = node->right;
866*6a54128fSAndroid Build Coastguard Worker continue;
867*6a54128fSAndroid Build Coastguard Worker }
868*6a54128fSAndroid Build Coastguard Worker }
869*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
870*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
871*6a54128fSAndroid Build Coastguard Worker node = node->parent;
872*6a54128fSAndroid Build Coastguard Worker }
873*6a54128fSAndroid Build Coastguard Worker
874*6a54128fSAndroid Build Coastguard Worker /* second pass: left siblings and singletons */
875*6a54128fSAndroid Build Coastguard Worker
876*6a54128fSAndroid Build Coastguard Worker assert(tree->childnode == NODE);
877*6a54128fSAndroid Build Coastguard Worker node = tree->root;
878*6a54128fSAndroid Build Coastguard Worker leftmask = rightmask = 0;
879*6a54128fSAndroid Build Coastguard Worker while (node) {
880*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
881*6a54128fSAndroid Build Coastguard Worker if ((leftmask & bitmask) == 0) {
882*6a54128fSAndroid Build Coastguard Worker leftmask |= bitmask;
883*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == LEAF) {
884*6a54128fSAndroid Build Coastguard Worker assert(node->left);
885*6a54128fSAndroid Build Coastguard Worker if (tree->leaf_mark(node->left)) {
886*6a54128fSAndroid Build Coastguard Worker n = node;
887*6a54128fSAndroid Build Coastguard Worker while (n && !n->mark) {
888*6a54128fSAndroid Build Coastguard Worker marked++;
889*6a54128fSAndroid Build Coastguard Worker n->mark = 1;
890*6a54128fSAndroid Build Coastguard Worker n = n->parent;
891*6a54128fSAndroid Build Coastguard Worker }
892*6a54128fSAndroid Build Coastguard Worker }
893*6a54128fSAndroid Build Coastguard Worker } else if (node->left) {
894*6a54128fSAndroid Build Coastguard Worker assert(node->leftnode == NODE);
895*6a54128fSAndroid Build Coastguard Worker node = node->left;
896*6a54128fSAndroid Build Coastguard Worker if (!node->mark && node->parent->mark) {
897*6a54128fSAndroid Build Coastguard Worker marked++;
898*6a54128fSAndroid Build Coastguard Worker node->mark = 1;
899*6a54128fSAndroid Build Coastguard Worker }
900*6a54128fSAndroid Build Coastguard Worker continue;
901*6a54128fSAndroid Build Coastguard Worker }
902*6a54128fSAndroid Build Coastguard Worker }
903*6a54128fSAndroid Build Coastguard Worker if ((rightmask & bitmask) == 0) {
904*6a54128fSAndroid Build Coastguard Worker rightmask |= bitmask;
905*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == LEAF) {
906*6a54128fSAndroid Build Coastguard Worker assert(node->right);
907*6a54128fSAndroid Build Coastguard Worker if (tree->leaf_mark(node->right)) {
908*6a54128fSAndroid Build Coastguard Worker n = node;
909*6a54128fSAndroid Build Coastguard Worker while (n && !n->mark) {
910*6a54128fSAndroid Build Coastguard Worker marked++;
911*6a54128fSAndroid Build Coastguard Worker n->mark = 1;
912*6a54128fSAndroid Build Coastguard Worker n = n->parent;
913*6a54128fSAndroid Build Coastguard Worker }
914*6a54128fSAndroid Build Coastguard Worker }
915*6a54128fSAndroid Build Coastguard Worker } else if (node->right) {
916*6a54128fSAndroid Build Coastguard Worker assert(node->rightnode == NODE);
917*6a54128fSAndroid Build Coastguard Worker node = node->right;
918*6a54128fSAndroid Build Coastguard Worker if (!node->mark && node->parent->mark &&
919*6a54128fSAndroid Build Coastguard Worker !node->parent->left) {
920*6a54128fSAndroid Build Coastguard Worker marked++;
921*6a54128fSAndroid Build Coastguard Worker node->mark = 1;
922*6a54128fSAndroid Build Coastguard Worker }
923*6a54128fSAndroid Build Coastguard Worker continue;
924*6a54128fSAndroid Build Coastguard Worker }
925*6a54128fSAndroid Build Coastguard Worker }
926*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
927*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
928*6a54128fSAndroid Build Coastguard Worker node = node->parent;
929*6a54128fSAndroid Build Coastguard Worker }
930*6a54128fSAndroid Build Coastguard Worker done:
931*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
932*6a54128fSAndroid Build Coastguard Worker printf("Marked %d nodes\n", marked);
933*6a54128fSAndroid Build Coastguard Worker }
934*6a54128fSAndroid Build Coastguard Worker
935*6a54128fSAndroid Build Coastguard Worker /*
936*6a54128fSAndroid Build Coastguard Worker * Compute the index of each node and leaf, which is the offset in the
937*6a54128fSAndroid Build Coastguard Worker * emitted trie. These values must be pre-computed because relative
938*6a54128fSAndroid Build Coastguard Worker * offsets between nodes are used to navigate the tree.
939*6a54128fSAndroid Build Coastguard Worker */
index_nodes(struct tree * tree,int index)940*6a54128fSAndroid Build Coastguard Worker static int index_nodes(struct tree *tree, int index)
941*6a54128fSAndroid Build Coastguard Worker {
942*6a54128fSAndroid Build Coastguard Worker struct node *node;
943*6a54128fSAndroid Build Coastguard Worker unsigned int leftmask;
944*6a54128fSAndroid Build Coastguard Worker unsigned int rightmask;
945*6a54128fSAndroid Build Coastguard Worker unsigned int bitmask;
946*6a54128fSAndroid Build Coastguard Worker int count;
947*6a54128fSAndroid Build Coastguard Worker int indent;
948*6a54128fSAndroid Build Coastguard Worker
949*6a54128fSAndroid Build Coastguard Worker /* Align to a cache line (or half a cache line?). */
950*6a54128fSAndroid Build Coastguard Worker while (index % 64)
951*6a54128fSAndroid Build Coastguard Worker index++;
952*6a54128fSAndroid Build Coastguard Worker tree->index = index;
953*6a54128fSAndroid Build Coastguard Worker indent = 1;
954*6a54128fSAndroid Build Coastguard Worker count = 0;
955*6a54128fSAndroid Build Coastguard Worker
956*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
957*6a54128fSAndroid Build Coastguard Worker printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
958*6a54128fSAndroid Build Coastguard Worker if (tree->childnode == LEAF) {
959*6a54128fSAndroid Build Coastguard Worker index += tree->leaf_size(tree->root);
960*6a54128fSAndroid Build Coastguard Worker goto done;
961*6a54128fSAndroid Build Coastguard Worker }
962*6a54128fSAndroid Build Coastguard Worker
963*6a54128fSAndroid Build Coastguard Worker assert(tree->childnode == NODE);
964*6a54128fSAndroid Build Coastguard Worker node = tree->root;
965*6a54128fSAndroid Build Coastguard Worker leftmask = rightmask = 0;
966*6a54128fSAndroid Build Coastguard Worker while (node) {
967*6a54128fSAndroid Build Coastguard Worker if (!node->mark)
968*6a54128fSAndroid Build Coastguard Worker goto skip;
969*6a54128fSAndroid Build Coastguard Worker count++;
970*6a54128fSAndroid Build Coastguard Worker if (node->index != index)
971*6a54128fSAndroid Build Coastguard Worker node->index = index;
972*6a54128fSAndroid Build Coastguard Worker index += node->size;
973*6a54128fSAndroid Build Coastguard Worker skip:
974*6a54128fSAndroid Build Coastguard Worker while (node) {
975*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
976*6a54128fSAndroid Build Coastguard Worker if (node->mark && (leftmask & bitmask) == 0) {
977*6a54128fSAndroid Build Coastguard Worker leftmask |= bitmask;
978*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == LEAF) {
979*6a54128fSAndroid Build Coastguard Worker assert(node->left);
980*6a54128fSAndroid Build Coastguard Worker *tree->leaf_index(tree, node->left) =
981*6a54128fSAndroid Build Coastguard Worker index;
982*6a54128fSAndroid Build Coastguard Worker index += tree->leaf_size(node->left);
983*6a54128fSAndroid Build Coastguard Worker count++;
984*6a54128fSAndroid Build Coastguard Worker } else if (node->left) {
985*6a54128fSAndroid Build Coastguard Worker assert(node->leftnode == NODE);
986*6a54128fSAndroid Build Coastguard Worker indent += 1;
987*6a54128fSAndroid Build Coastguard Worker node = node->left;
988*6a54128fSAndroid Build Coastguard Worker break;
989*6a54128fSAndroid Build Coastguard Worker }
990*6a54128fSAndroid Build Coastguard Worker }
991*6a54128fSAndroid Build Coastguard Worker if (node->mark && (rightmask & bitmask) == 0) {
992*6a54128fSAndroid Build Coastguard Worker rightmask |= bitmask;
993*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == LEAF) {
994*6a54128fSAndroid Build Coastguard Worker assert(node->right);
995*6a54128fSAndroid Build Coastguard Worker *tree->leaf_index(tree, node->right) = index;
996*6a54128fSAndroid Build Coastguard Worker index += tree->leaf_size(node->right);
997*6a54128fSAndroid Build Coastguard Worker count++;
998*6a54128fSAndroid Build Coastguard Worker } else if (node->right) {
999*6a54128fSAndroid Build Coastguard Worker assert(node->rightnode == NODE);
1000*6a54128fSAndroid Build Coastguard Worker indent += 1;
1001*6a54128fSAndroid Build Coastguard Worker node = node->right;
1002*6a54128fSAndroid Build Coastguard Worker break;
1003*6a54128fSAndroid Build Coastguard Worker }
1004*6a54128fSAndroid Build Coastguard Worker }
1005*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
1006*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
1007*6a54128fSAndroid Build Coastguard Worker node = node->parent;
1008*6a54128fSAndroid Build Coastguard Worker indent -= 1;
1009*6a54128fSAndroid Build Coastguard Worker }
1010*6a54128fSAndroid Build Coastguard Worker }
1011*6a54128fSAndroid Build Coastguard Worker done:
1012*6a54128fSAndroid Build Coastguard Worker /* Round up to a multiple of 16 */
1013*6a54128fSAndroid Build Coastguard Worker while (index % 16)
1014*6a54128fSAndroid Build Coastguard Worker index++;
1015*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
1016*6a54128fSAndroid Build Coastguard Worker printf("Final index %d\n", index);
1017*6a54128fSAndroid Build Coastguard Worker return index;
1018*6a54128fSAndroid Build Coastguard Worker }
1019*6a54128fSAndroid Build Coastguard Worker
1020*6a54128fSAndroid Build Coastguard Worker /*
1021*6a54128fSAndroid Build Coastguard Worker * Mark the nodes in a subtree, helper for size_nodes().
1022*6a54128fSAndroid Build Coastguard Worker */
mark_subtree(struct node * node)1023*6a54128fSAndroid Build Coastguard Worker static int mark_subtree(struct node *node)
1024*6a54128fSAndroid Build Coastguard Worker {
1025*6a54128fSAndroid Build Coastguard Worker int changed;
1026*6a54128fSAndroid Build Coastguard Worker
1027*6a54128fSAndroid Build Coastguard Worker if (!node || node->mark)
1028*6a54128fSAndroid Build Coastguard Worker return 0;
1029*6a54128fSAndroid Build Coastguard Worker node->mark = 1;
1030*6a54128fSAndroid Build Coastguard Worker node->index = node->parent->index;
1031*6a54128fSAndroid Build Coastguard Worker changed = 1;
1032*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == NODE)
1033*6a54128fSAndroid Build Coastguard Worker changed += mark_subtree(node->left);
1034*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == NODE)
1035*6a54128fSAndroid Build Coastguard Worker changed += mark_subtree(node->right);
1036*6a54128fSAndroid Build Coastguard Worker return changed;
1037*6a54128fSAndroid Build Coastguard Worker }
1038*6a54128fSAndroid Build Coastguard Worker
1039*6a54128fSAndroid Build Coastguard Worker /*
1040*6a54128fSAndroid Build Coastguard Worker * Compute the size of nodes and leaves. We start by assuming that
1041*6a54128fSAndroid Build Coastguard Worker * each node needs to store a three-byte offset. The indexes of the
1042*6a54128fSAndroid Build Coastguard Worker * nodes are calculated based on that, and then this function is
1043*6a54128fSAndroid Build Coastguard Worker * called to see if the sizes of some nodes can be reduced. This is
1044*6a54128fSAndroid Build Coastguard Worker * repeated until no more changes are seen.
1045*6a54128fSAndroid Build Coastguard Worker */
size_nodes(struct tree * tree)1046*6a54128fSAndroid Build Coastguard Worker static int size_nodes(struct tree *tree)
1047*6a54128fSAndroid Build Coastguard Worker {
1048*6a54128fSAndroid Build Coastguard Worker struct tree *next;
1049*6a54128fSAndroid Build Coastguard Worker struct node *node;
1050*6a54128fSAndroid Build Coastguard Worker struct node *right;
1051*6a54128fSAndroid Build Coastguard Worker struct node *n;
1052*6a54128fSAndroid Build Coastguard Worker unsigned int leftmask;
1053*6a54128fSAndroid Build Coastguard Worker unsigned int rightmask;
1054*6a54128fSAndroid Build Coastguard Worker unsigned int bitmask;
1055*6a54128fSAndroid Build Coastguard Worker unsigned int pathbits;
1056*6a54128fSAndroid Build Coastguard Worker unsigned int pathmask;
1057*6a54128fSAndroid Build Coastguard Worker unsigned int nbit;
1058*6a54128fSAndroid Build Coastguard Worker int changed;
1059*6a54128fSAndroid Build Coastguard Worker int offset;
1060*6a54128fSAndroid Build Coastguard Worker int size;
1061*6a54128fSAndroid Build Coastguard Worker int indent;
1062*6a54128fSAndroid Build Coastguard Worker
1063*6a54128fSAndroid Build Coastguard Worker indent = 1;
1064*6a54128fSAndroid Build Coastguard Worker changed = 0;
1065*6a54128fSAndroid Build Coastguard Worker size = 0;
1066*6a54128fSAndroid Build Coastguard Worker
1067*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
1068*6a54128fSAndroid Build Coastguard Worker printf("Sizing %s_%x\n", tree->type, tree->maxage);
1069*6a54128fSAndroid Build Coastguard Worker if (tree->childnode == LEAF)
1070*6a54128fSAndroid Build Coastguard Worker goto done;
1071*6a54128fSAndroid Build Coastguard Worker
1072*6a54128fSAndroid Build Coastguard Worker assert(tree->childnode == NODE);
1073*6a54128fSAndroid Build Coastguard Worker pathbits = 0;
1074*6a54128fSAndroid Build Coastguard Worker pathmask = 0;
1075*6a54128fSAndroid Build Coastguard Worker node = tree->root;
1076*6a54128fSAndroid Build Coastguard Worker leftmask = rightmask = 0;
1077*6a54128fSAndroid Build Coastguard Worker while (node) {
1078*6a54128fSAndroid Build Coastguard Worker if (!node->mark)
1079*6a54128fSAndroid Build Coastguard Worker goto skip;
1080*6a54128fSAndroid Build Coastguard Worker offset = 0;
1081*6a54128fSAndroid Build Coastguard Worker if (!node->left || !node->right) {
1082*6a54128fSAndroid Build Coastguard Worker size = 1;
1083*6a54128fSAndroid Build Coastguard Worker } else {
1084*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == NODE) {
1085*6a54128fSAndroid Build Coastguard Worker /*
1086*6a54128fSAndroid Build Coastguard Worker * If the right node is not marked,
1087*6a54128fSAndroid Build Coastguard Worker * look for a corresponding node in
1088*6a54128fSAndroid Build Coastguard Worker * the next tree. Such a node need
1089*6a54128fSAndroid Build Coastguard Worker * not exist.
1090*6a54128fSAndroid Build Coastguard Worker */
1091*6a54128fSAndroid Build Coastguard Worker right = node->right;
1092*6a54128fSAndroid Build Coastguard Worker next = tree->next;
1093*6a54128fSAndroid Build Coastguard Worker while (!right->mark) {
1094*6a54128fSAndroid Build Coastguard Worker assert(next);
1095*6a54128fSAndroid Build Coastguard Worker n = next->root;
1096*6a54128fSAndroid Build Coastguard Worker while (n->bitnum != node->bitnum) {
1097*6a54128fSAndroid Build Coastguard Worker nbit = 1 << n->bitnum;
1098*6a54128fSAndroid Build Coastguard Worker if (!(pathmask & nbit))
1099*6a54128fSAndroid Build Coastguard Worker break;
1100*6a54128fSAndroid Build Coastguard Worker if (pathbits & nbit) {
1101*6a54128fSAndroid Build Coastguard Worker if (n->rightnode == LEAF)
1102*6a54128fSAndroid Build Coastguard Worker break;
1103*6a54128fSAndroid Build Coastguard Worker n = n->right;
1104*6a54128fSAndroid Build Coastguard Worker } else {
1105*6a54128fSAndroid Build Coastguard Worker if (n->leftnode == LEAF)
1106*6a54128fSAndroid Build Coastguard Worker break;
1107*6a54128fSAndroid Build Coastguard Worker n = n->left;
1108*6a54128fSAndroid Build Coastguard Worker }
1109*6a54128fSAndroid Build Coastguard Worker }
1110*6a54128fSAndroid Build Coastguard Worker if (n->bitnum != node->bitnum)
1111*6a54128fSAndroid Build Coastguard Worker break;
1112*6a54128fSAndroid Build Coastguard Worker n = n->right;
1113*6a54128fSAndroid Build Coastguard Worker right = n;
1114*6a54128fSAndroid Build Coastguard Worker next = next->next;
1115*6a54128fSAndroid Build Coastguard Worker }
1116*6a54128fSAndroid Build Coastguard Worker /* Make sure the right node is marked. */
1117*6a54128fSAndroid Build Coastguard Worker if (!right->mark)
1118*6a54128fSAndroid Build Coastguard Worker changed += mark_subtree(right);
1119*6a54128fSAndroid Build Coastguard Worker offset = right->index - node->index;
1120*6a54128fSAndroid Build Coastguard Worker } else {
1121*6a54128fSAndroid Build Coastguard Worker offset = *tree->leaf_index(tree, node->right);
1122*6a54128fSAndroid Build Coastguard Worker offset -= node->index;
1123*6a54128fSAndroid Build Coastguard Worker }
1124*6a54128fSAndroid Build Coastguard Worker assert(offset >= 0);
1125*6a54128fSAndroid Build Coastguard Worker assert(offset <= 0xffffff);
1126*6a54128fSAndroid Build Coastguard Worker if (offset <= 0xff) {
1127*6a54128fSAndroid Build Coastguard Worker size = 2;
1128*6a54128fSAndroid Build Coastguard Worker } else if (offset <= 0xffff) {
1129*6a54128fSAndroid Build Coastguard Worker size = 3;
1130*6a54128fSAndroid Build Coastguard Worker } else { /* offset <= 0xffffff */
1131*6a54128fSAndroid Build Coastguard Worker size = 4;
1132*6a54128fSAndroid Build Coastguard Worker }
1133*6a54128fSAndroid Build Coastguard Worker }
1134*6a54128fSAndroid Build Coastguard Worker if (node->size != size || node->offset != offset) {
1135*6a54128fSAndroid Build Coastguard Worker node->size = size;
1136*6a54128fSAndroid Build Coastguard Worker node->offset = offset;
1137*6a54128fSAndroid Build Coastguard Worker changed++;
1138*6a54128fSAndroid Build Coastguard Worker }
1139*6a54128fSAndroid Build Coastguard Worker skip:
1140*6a54128fSAndroid Build Coastguard Worker while (node) {
1141*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
1142*6a54128fSAndroid Build Coastguard Worker pathmask |= bitmask;
1143*6a54128fSAndroid Build Coastguard Worker if (node->mark && (leftmask & bitmask) == 0) {
1144*6a54128fSAndroid Build Coastguard Worker leftmask |= bitmask;
1145*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == LEAF) {
1146*6a54128fSAndroid Build Coastguard Worker assert(node->left);
1147*6a54128fSAndroid Build Coastguard Worker } else if (node->left) {
1148*6a54128fSAndroid Build Coastguard Worker assert(node->leftnode == NODE);
1149*6a54128fSAndroid Build Coastguard Worker indent += 1;
1150*6a54128fSAndroid Build Coastguard Worker node = node->left;
1151*6a54128fSAndroid Build Coastguard Worker break;
1152*6a54128fSAndroid Build Coastguard Worker }
1153*6a54128fSAndroid Build Coastguard Worker }
1154*6a54128fSAndroid Build Coastguard Worker if (node->mark && (rightmask & bitmask) == 0) {
1155*6a54128fSAndroid Build Coastguard Worker rightmask |= bitmask;
1156*6a54128fSAndroid Build Coastguard Worker pathbits |= bitmask;
1157*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == LEAF) {
1158*6a54128fSAndroid Build Coastguard Worker assert(node->right);
1159*6a54128fSAndroid Build Coastguard Worker } else if (node->right) {
1160*6a54128fSAndroid Build Coastguard Worker assert(node->rightnode == NODE);
1161*6a54128fSAndroid Build Coastguard Worker indent += 1;
1162*6a54128fSAndroid Build Coastguard Worker node = node->right;
1163*6a54128fSAndroid Build Coastguard Worker break;
1164*6a54128fSAndroid Build Coastguard Worker }
1165*6a54128fSAndroid Build Coastguard Worker }
1166*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
1167*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
1168*6a54128fSAndroid Build Coastguard Worker pathmask &= ~bitmask;
1169*6a54128fSAndroid Build Coastguard Worker pathbits &= ~bitmask;
1170*6a54128fSAndroid Build Coastguard Worker node = node->parent;
1171*6a54128fSAndroid Build Coastguard Worker indent -= 1;
1172*6a54128fSAndroid Build Coastguard Worker }
1173*6a54128fSAndroid Build Coastguard Worker }
1174*6a54128fSAndroid Build Coastguard Worker done:
1175*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
1176*6a54128fSAndroid Build Coastguard Worker printf("Found %d changes\n", changed);
1177*6a54128fSAndroid Build Coastguard Worker return changed;
1178*6a54128fSAndroid Build Coastguard Worker }
1179*6a54128fSAndroid Build Coastguard Worker
1180*6a54128fSAndroid Build Coastguard Worker /*
1181*6a54128fSAndroid Build Coastguard Worker * Emit a trie for the given tree into the data array.
1182*6a54128fSAndroid Build Coastguard Worker */
emit(struct tree * tree,unsigned char * data)1183*6a54128fSAndroid Build Coastguard Worker static void emit(struct tree *tree, unsigned char *data)
1184*6a54128fSAndroid Build Coastguard Worker {
1185*6a54128fSAndroid Build Coastguard Worker struct node *node;
1186*6a54128fSAndroid Build Coastguard Worker unsigned int leftmask;
1187*6a54128fSAndroid Build Coastguard Worker unsigned int rightmask;
1188*6a54128fSAndroid Build Coastguard Worker unsigned int bitmask;
1189*6a54128fSAndroid Build Coastguard Worker int offlen;
1190*6a54128fSAndroid Build Coastguard Worker int offset;
1191*6a54128fSAndroid Build Coastguard Worker int index;
1192*6a54128fSAndroid Build Coastguard Worker int indent;
1193*6a54128fSAndroid Build Coastguard Worker int size;
1194*6a54128fSAndroid Build Coastguard Worker int bytes;
1195*6a54128fSAndroid Build Coastguard Worker int leaves;
1196*6a54128fSAndroid Build Coastguard Worker int nodes[4];
1197*6a54128fSAndroid Build Coastguard Worker unsigned char byte;
1198*6a54128fSAndroid Build Coastguard Worker
1199*6a54128fSAndroid Build Coastguard Worker nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1200*6a54128fSAndroid Build Coastguard Worker leaves = 0;
1201*6a54128fSAndroid Build Coastguard Worker bytes = 0;
1202*6a54128fSAndroid Build Coastguard Worker index = tree->index;
1203*6a54128fSAndroid Build Coastguard Worker data += index;
1204*6a54128fSAndroid Build Coastguard Worker indent = 1;
1205*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
1206*6a54128fSAndroid Build Coastguard Worker printf("Emitting %s_%x\n", tree->type, tree->maxage);
1207*6a54128fSAndroid Build Coastguard Worker if (tree->childnode == LEAF) {
1208*6a54128fSAndroid Build Coastguard Worker assert(tree->root);
1209*6a54128fSAndroid Build Coastguard Worker tree->leaf_emit(tree->root, data);
1210*6a54128fSAndroid Build Coastguard Worker size = tree->leaf_size(tree->root);
1211*6a54128fSAndroid Build Coastguard Worker index += size;
1212*6a54128fSAndroid Build Coastguard Worker leaves++;
1213*6a54128fSAndroid Build Coastguard Worker goto done;
1214*6a54128fSAndroid Build Coastguard Worker }
1215*6a54128fSAndroid Build Coastguard Worker
1216*6a54128fSAndroid Build Coastguard Worker assert(tree->childnode == NODE);
1217*6a54128fSAndroid Build Coastguard Worker node = tree->root;
1218*6a54128fSAndroid Build Coastguard Worker leftmask = rightmask = 0;
1219*6a54128fSAndroid Build Coastguard Worker while (node) {
1220*6a54128fSAndroid Build Coastguard Worker if (!node->mark)
1221*6a54128fSAndroid Build Coastguard Worker goto skip;
1222*6a54128fSAndroid Build Coastguard Worker assert(node->offset != -1);
1223*6a54128fSAndroid Build Coastguard Worker assert(node->index == index);
1224*6a54128fSAndroid Build Coastguard Worker
1225*6a54128fSAndroid Build Coastguard Worker byte = 0;
1226*6a54128fSAndroid Build Coastguard Worker if (node->nextbyte)
1227*6a54128fSAndroid Build Coastguard Worker byte |= NEXTBYTE;
1228*6a54128fSAndroid Build Coastguard Worker byte |= (node->bitnum & BITNUM);
1229*6a54128fSAndroid Build Coastguard Worker if (node->left && node->right) {
1230*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == NODE)
1231*6a54128fSAndroid Build Coastguard Worker byte |= LEFTNODE;
1232*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == NODE)
1233*6a54128fSAndroid Build Coastguard Worker byte |= RIGHTNODE;
1234*6a54128fSAndroid Build Coastguard Worker if (node->offset <= 0xff)
1235*6a54128fSAndroid Build Coastguard Worker offlen = 1;
1236*6a54128fSAndroid Build Coastguard Worker else if (node->offset <= 0xffff)
1237*6a54128fSAndroid Build Coastguard Worker offlen = 2;
1238*6a54128fSAndroid Build Coastguard Worker else
1239*6a54128fSAndroid Build Coastguard Worker offlen = 3;
1240*6a54128fSAndroid Build Coastguard Worker nodes[offlen]++;
1241*6a54128fSAndroid Build Coastguard Worker offset = node->offset;
1242*6a54128fSAndroid Build Coastguard Worker byte |= offlen << OFFLEN_SHIFT;
1243*6a54128fSAndroid Build Coastguard Worker *data++ = byte;
1244*6a54128fSAndroid Build Coastguard Worker index++;
1245*6a54128fSAndroid Build Coastguard Worker while (offlen--) {
1246*6a54128fSAndroid Build Coastguard Worker *data++ = offset & 0xff;
1247*6a54128fSAndroid Build Coastguard Worker index++;
1248*6a54128fSAndroid Build Coastguard Worker offset >>= 8;
1249*6a54128fSAndroid Build Coastguard Worker }
1250*6a54128fSAndroid Build Coastguard Worker } else if (node->left) {
1251*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == NODE)
1252*6a54128fSAndroid Build Coastguard Worker byte |= TRIENODE;
1253*6a54128fSAndroid Build Coastguard Worker nodes[0]++;
1254*6a54128fSAndroid Build Coastguard Worker *data++ = byte;
1255*6a54128fSAndroid Build Coastguard Worker index++;
1256*6a54128fSAndroid Build Coastguard Worker } else if (node->right) {
1257*6a54128fSAndroid Build Coastguard Worker byte |= RIGHTNODE;
1258*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == NODE)
1259*6a54128fSAndroid Build Coastguard Worker byte |= TRIENODE;
1260*6a54128fSAndroid Build Coastguard Worker nodes[0]++;
1261*6a54128fSAndroid Build Coastguard Worker *data++ = byte;
1262*6a54128fSAndroid Build Coastguard Worker index++;
1263*6a54128fSAndroid Build Coastguard Worker } else {
1264*6a54128fSAndroid Build Coastguard Worker assert(0);
1265*6a54128fSAndroid Build Coastguard Worker }
1266*6a54128fSAndroid Build Coastguard Worker skip:
1267*6a54128fSAndroid Build Coastguard Worker while (node) {
1268*6a54128fSAndroid Build Coastguard Worker bitmask = 1 << node->bitnum;
1269*6a54128fSAndroid Build Coastguard Worker if (node->mark && (leftmask & bitmask) == 0) {
1270*6a54128fSAndroid Build Coastguard Worker leftmask |= bitmask;
1271*6a54128fSAndroid Build Coastguard Worker if (node->leftnode == LEAF) {
1272*6a54128fSAndroid Build Coastguard Worker assert(node->left);
1273*6a54128fSAndroid Build Coastguard Worker data = tree->leaf_emit(node->left,
1274*6a54128fSAndroid Build Coastguard Worker data);
1275*6a54128fSAndroid Build Coastguard Worker size = tree->leaf_size(node->left);
1276*6a54128fSAndroid Build Coastguard Worker index += size;
1277*6a54128fSAndroid Build Coastguard Worker bytes += size;
1278*6a54128fSAndroid Build Coastguard Worker leaves++;
1279*6a54128fSAndroid Build Coastguard Worker } else if (node->left) {
1280*6a54128fSAndroid Build Coastguard Worker assert(node->leftnode == NODE);
1281*6a54128fSAndroid Build Coastguard Worker indent += 1;
1282*6a54128fSAndroid Build Coastguard Worker node = node->left;
1283*6a54128fSAndroid Build Coastguard Worker break;
1284*6a54128fSAndroid Build Coastguard Worker }
1285*6a54128fSAndroid Build Coastguard Worker }
1286*6a54128fSAndroid Build Coastguard Worker if (node->mark && (rightmask & bitmask) == 0) {
1287*6a54128fSAndroid Build Coastguard Worker rightmask |= bitmask;
1288*6a54128fSAndroid Build Coastguard Worker if (node->rightnode == LEAF) {
1289*6a54128fSAndroid Build Coastguard Worker assert(node->right);
1290*6a54128fSAndroid Build Coastguard Worker data = tree->leaf_emit(node->right,
1291*6a54128fSAndroid Build Coastguard Worker data);
1292*6a54128fSAndroid Build Coastguard Worker size = tree->leaf_size(node->right);
1293*6a54128fSAndroid Build Coastguard Worker index += size;
1294*6a54128fSAndroid Build Coastguard Worker bytes += size;
1295*6a54128fSAndroid Build Coastguard Worker leaves++;
1296*6a54128fSAndroid Build Coastguard Worker } else if (node->right) {
1297*6a54128fSAndroid Build Coastguard Worker assert(node->rightnode == NODE);
1298*6a54128fSAndroid Build Coastguard Worker indent += 1;
1299*6a54128fSAndroid Build Coastguard Worker node = node->right;
1300*6a54128fSAndroid Build Coastguard Worker break;
1301*6a54128fSAndroid Build Coastguard Worker }
1302*6a54128fSAndroid Build Coastguard Worker }
1303*6a54128fSAndroid Build Coastguard Worker leftmask &= ~bitmask;
1304*6a54128fSAndroid Build Coastguard Worker rightmask &= ~bitmask;
1305*6a54128fSAndroid Build Coastguard Worker node = node->parent;
1306*6a54128fSAndroid Build Coastguard Worker indent -= 1;
1307*6a54128fSAndroid Build Coastguard Worker }
1308*6a54128fSAndroid Build Coastguard Worker }
1309*6a54128fSAndroid Build Coastguard Worker done:
1310*6a54128fSAndroid Build Coastguard Worker if (verbose > 0) {
1311*6a54128fSAndroid Build Coastguard Worker printf("Emitted %d (%d) leaves",
1312*6a54128fSAndroid Build Coastguard Worker leaves, bytes);
1313*6a54128fSAndroid Build Coastguard Worker printf(" %d (%d+%d+%d+%d) nodes",
1314*6a54128fSAndroid Build Coastguard Worker nodes[0] + nodes[1] + nodes[2] + nodes[3],
1315*6a54128fSAndroid Build Coastguard Worker nodes[0], nodes[1], nodes[2], nodes[3]);
1316*6a54128fSAndroid Build Coastguard Worker printf(" %d total\n", index - tree->index);
1317*6a54128fSAndroid Build Coastguard Worker }
1318*6a54128fSAndroid Build Coastguard Worker }
1319*6a54128fSAndroid Build Coastguard Worker
1320*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
1321*6a54128fSAndroid Build Coastguard Worker
/*
 * Unicode data.
 *
 * We need to keep track of the Canonical Combining Class, the Age,
 * and decompositions for a code point.
 *
 * For the Age, we store the index into the ages table. Effectively
 * this is a generation number that the table maps to a unicode
 * version.
 *
 * The correction field is used to indicate that this entry is in the
 * corrections array, which contains decompositions that were
 * corrected in later revisions. The value of the correction field is
 * the Unicode version in which the mapping was corrected.
 */
struct unicode_data {
	unsigned int	code;		/* code point */
	int		ccc;		/* Canonical Combining Class */
	int		gen;		/* Age, as index into the ages table */
	int		correction;	/* version that corrected the mapping */
	unsigned int	*utf32nfkdi;	/* decomposition, UTF-32 */
	unsigned int	*utf32nfkdicf;	/* case-folded decomposition, UTF-32 */
	char		*utf8nfkdi;	/* decomposition, UTF-8 */
	char		*utf8nfkdicf;	/* case-folded decomposition, UTF-8 */
};

/* One entry for every code point in 0 .. 0x10FFFF. */
struct unicode_data unicode_data[0x110000];

/* Entries whose decomposition was corrected, and their number. */
struct unicode_data *corrections;
int corrections_count;

/* Trees for the nfkdi and nfkdicf normalization forms. */
struct tree *nfkdi_tree;
struct tree *nfkdicf_tree;

/* All trees, and their number. */
struct tree *trees;
int trees_count;
1357*6a54128fSAndroid Build Coastguard Worker
1358*6a54128fSAndroid Build Coastguard Worker /*
1359*6a54128fSAndroid Build Coastguard Worker * Check the corrections array to see if this entry was corrected at
1360*6a54128fSAndroid Build Coastguard Worker * some point.
1361*6a54128fSAndroid Build Coastguard Worker */
corrections_lookup(struct unicode_data * u)1362*6a54128fSAndroid Build Coastguard Worker static struct unicode_data *corrections_lookup(struct unicode_data *u)
1363*6a54128fSAndroid Build Coastguard Worker {
1364*6a54128fSAndroid Build Coastguard Worker int i;
1365*6a54128fSAndroid Build Coastguard Worker
1366*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != corrections_count; i++)
1367*6a54128fSAndroid Build Coastguard Worker if (u->code == corrections[i].code)
1368*6a54128fSAndroid Build Coastguard Worker return &corrections[i];
1369*6a54128fSAndroid Build Coastguard Worker return u;
1370*6a54128fSAndroid Build Coastguard Worker }
1371*6a54128fSAndroid Build Coastguard Worker
nfkdi_equal(void * l,void * r)1372*6a54128fSAndroid Build Coastguard Worker static int nfkdi_equal(void *l, void *r)
1373*6a54128fSAndroid Build Coastguard Worker {
1374*6a54128fSAndroid Build Coastguard Worker struct unicode_data *left = l;
1375*6a54128fSAndroid Build Coastguard Worker struct unicode_data *right = r;
1376*6a54128fSAndroid Build Coastguard Worker
1377*6a54128fSAndroid Build Coastguard Worker if (left->gen != right->gen)
1378*6a54128fSAndroid Build Coastguard Worker return 0;
1379*6a54128fSAndroid Build Coastguard Worker if (left->ccc != right->ccc)
1380*6a54128fSAndroid Build Coastguard Worker return 0;
1381*6a54128fSAndroid Build Coastguard Worker if (left->utf8nfkdi && right->utf8nfkdi &&
1382*6a54128fSAndroid Build Coastguard Worker strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
1383*6a54128fSAndroid Build Coastguard Worker return 1;
1384*6a54128fSAndroid Build Coastguard Worker if (left->utf8nfkdi || right->utf8nfkdi)
1385*6a54128fSAndroid Build Coastguard Worker return 0;
1386*6a54128fSAndroid Build Coastguard Worker return 1;
1387*6a54128fSAndroid Build Coastguard Worker }
1388*6a54128fSAndroid Build Coastguard Worker
nfkdicf_equal(void * l,void * r)1389*6a54128fSAndroid Build Coastguard Worker static int nfkdicf_equal(void *l, void *r)
1390*6a54128fSAndroid Build Coastguard Worker {
1391*6a54128fSAndroid Build Coastguard Worker struct unicode_data *left = l;
1392*6a54128fSAndroid Build Coastguard Worker struct unicode_data *right = r;
1393*6a54128fSAndroid Build Coastguard Worker
1394*6a54128fSAndroid Build Coastguard Worker if (left->gen != right->gen)
1395*6a54128fSAndroid Build Coastguard Worker return 0;
1396*6a54128fSAndroid Build Coastguard Worker if (left->ccc != right->ccc)
1397*6a54128fSAndroid Build Coastguard Worker return 0;
1398*6a54128fSAndroid Build Coastguard Worker if (left->utf8nfkdicf && right->utf8nfkdicf &&
1399*6a54128fSAndroid Build Coastguard Worker strcmp(left->utf8nfkdicf, right->utf8nfkdicf) == 0)
1400*6a54128fSAndroid Build Coastguard Worker return 1;
1401*6a54128fSAndroid Build Coastguard Worker if (left->utf8nfkdicf && right->utf8nfkdicf)
1402*6a54128fSAndroid Build Coastguard Worker return 0;
1403*6a54128fSAndroid Build Coastguard Worker if (left->utf8nfkdicf || right->utf8nfkdicf)
1404*6a54128fSAndroid Build Coastguard Worker return 0;
1405*6a54128fSAndroid Build Coastguard Worker if (left->utf8nfkdi && right->utf8nfkdi &&
1406*6a54128fSAndroid Build Coastguard Worker strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
1407*6a54128fSAndroid Build Coastguard Worker return 1;
1408*6a54128fSAndroid Build Coastguard Worker if (left->utf8nfkdi || right->utf8nfkdi)
1409*6a54128fSAndroid Build Coastguard Worker return 0;
1410*6a54128fSAndroid Build Coastguard Worker return 1;
1411*6a54128fSAndroid Build Coastguard Worker }
1412*6a54128fSAndroid Build Coastguard Worker
nfkdi_print(void * l,int indent)1413*6a54128fSAndroid Build Coastguard Worker static void nfkdi_print(void *l, int indent)
1414*6a54128fSAndroid Build Coastguard Worker {
1415*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1416*6a54128fSAndroid Build Coastguard Worker
1417*6a54128fSAndroid Build Coastguard Worker printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1418*6a54128fSAndroid Build Coastguard Worker leaf->code, leaf->ccc, leaf->gen);
1419*6a54128fSAndroid Build Coastguard Worker if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
1420*6a54128fSAndroid Build Coastguard Worker printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1421*6a54128fSAndroid Build Coastguard Worker else if (leaf->utf8nfkdi)
1422*6a54128fSAndroid Build Coastguard Worker printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
1423*6a54128fSAndroid Build Coastguard Worker printf("\n");
1424*6a54128fSAndroid Build Coastguard Worker }
1425*6a54128fSAndroid Build Coastguard Worker
nfkdicf_print(void * l,int indent)1426*6a54128fSAndroid Build Coastguard Worker static void nfkdicf_print(void *l, int indent)
1427*6a54128fSAndroid Build Coastguard Worker {
1428*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1429*6a54128fSAndroid Build Coastguard Worker
1430*6a54128fSAndroid Build Coastguard Worker printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1431*6a54128fSAndroid Build Coastguard Worker leaf->code, leaf->ccc, leaf->gen);
1432*6a54128fSAndroid Build Coastguard Worker if (leaf->utf8nfkdicf)
1433*6a54128fSAndroid Build Coastguard Worker printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
1434*6a54128fSAndroid Build Coastguard Worker else if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
1435*6a54128fSAndroid Build Coastguard Worker printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1436*6a54128fSAndroid Build Coastguard Worker else if (leaf->utf8nfkdi)
1437*6a54128fSAndroid Build Coastguard Worker printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
1438*6a54128fSAndroid Build Coastguard Worker printf("\n");
1439*6a54128fSAndroid Build Coastguard Worker }
1440*6a54128fSAndroid Build Coastguard Worker
/*
 * Leaf mark callout installed as leaf_mark on the last "nfkdi" tree.
 *
 * The leaf is not inspected: the answer is 1 for every leaf.
 */
static int nfkdi_mark(void *l)
{
	(void)l;	/* deliberately unused */

	return 1;
}
1445*6a54128fSAndroid Build Coastguard Worker
nfkdicf_mark(void * l)1446*6a54128fSAndroid Build Coastguard Worker static int nfkdicf_mark(void *l)
1447*6a54128fSAndroid Build Coastguard Worker {
1448*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1449*6a54128fSAndroid Build Coastguard Worker
1450*6a54128fSAndroid Build Coastguard Worker if (leaf->utf8nfkdicf)
1451*6a54128fSAndroid Build Coastguard Worker return 1;
1452*6a54128fSAndroid Build Coastguard Worker return 0;
1453*6a54128fSAndroid Build Coastguard Worker }
1454*6a54128fSAndroid Build Coastguard Worker
correction_mark(void * l)1455*6a54128fSAndroid Build Coastguard Worker static int correction_mark(void *l)
1456*6a54128fSAndroid Build Coastguard Worker {
1457*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1458*6a54128fSAndroid Build Coastguard Worker
1459*6a54128fSAndroid Build Coastguard Worker return leaf->correction;
1460*6a54128fSAndroid Build Coastguard Worker }
1461*6a54128fSAndroid Build Coastguard Worker
nfkdi_size(void * l)1462*6a54128fSAndroid Build Coastguard Worker static int nfkdi_size(void *l)
1463*6a54128fSAndroid Build Coastguard Worker {
1464*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1465*6a54128fSAndroid Build Coastguard Worker
1466*6a54128fSAndroid Build Coastguard Worker int size = 2;
1467*6a54128fSAndroid Build Coastguard Worker if (HANGUL_SYLLABLE(leaf->code))
1468*6a54128fSAndroid Build Coastguard Worker size += 1;
1469*6a54128fSAndroid Build Coastguard Worker else if (leaf->utf8nfkdi)
1470*6a54128fSAndroid Build Coastguard Worker size += strlen(leaf->utf8nfkdi) + 1;
1471*6a54128fSAndroid Build Coastguard Worker return size;
1472*6a54128fSAndroid Build Coastguard Worker }
1473*6a54128fSAndroid Build Coastguard Worker
nfkdicf_size(void * l)1474*6a54128fSAndroid Build Coastguard Worker static int nfkdicf_size(void *l)
1475*6a54128fSAndroid Build Coastguard Worker {
1476*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1477*6a54128fSAndroid Build Coastguard Worker
1478*6a54128fSAndroid Build Coastguard Worker int size = 2;
1479*6a54128fSAndroid Build Coastguard Worker if (HANGUL_SYLLABLE(leaf->code))
1480*6a54128fSAndroid Build Coastguard Worker size += 1;
1481*6a54128fSAndroid Build Coastguard Worker else if (leaf->utf8nfkdicf)
1482*6a54128fSAndroid Build Coastguard Worker size += strlen(leaf->utf8nfkdicf) + 1;
1483*6a54128fSAndroid Build Coastguard Worker else if (leaf->utf8nfkdi)
1484*6a54128fSAndroid Build Coastguard Worker size += strlen(leaf->utf8nfkdi) + 1;
1485*6a54128fSAndroid Build Coastguard Worker return size;
1486*6a54128fSAndroid Build Coastguard Worker }
1487*6a54128fSAndroid Build Coastguard Worker
nfkdi_index(struct tree * tree,void * l)1488*6a54128fSAndroid Build Coastguard Worker static int *nfkdi_index(struct tree *tree, void *l)
1489*6a54128fSAndroid Build Coastguard Worker {
1490*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1491*6a54128fSAndroid Build Coastguard Worker
1492*6a54128fSAndroid Build Coastguard Worker return &tree->leafindex[leaf->code];
1493*6a54128fSAndroid Build Coastguard Worker }
1494*6a54128fSAndroid Build Coastguard Worker
nfkdicf_index(struct tree * tree,void * l)1495*6a54128fSAndroid Build Coastguard Worker static int *nfkdicf_index(struct tree *tree, void *l)
1496*6a54128fSAndroid Build Coastguard Worker {
1497*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1498*6a54128fSAndroid Build Coastguard Worker
1499*6a54128fSAndroid Build Coastguard Worker return &tree->leafindex[leaf->code];
1500*6a54128fSAndroid Build Coastguard Worker }
1501*6a54128fSAndroid Build Coastguard Worker
nfkdi_emit(void * l,unsigned char * data)1502*6a54128fSAndroid Build Coastguard Worker static unsigned char *nfkdi_emit(void *l, unsigned char *data)
1503*6a54128fSAndroid Build Coastguard Worker {
1504*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1505*6a54128fSAndroid Build Coastguard Worker unsigned char *s;
1506*6a54128fSAndroid Build Coastguard Worker
1507*6a54128fSAndroid Build Coastguard Worker *data++ = leaf->gen;
1508*6a54128fSAndroid Build Coastguard Worker if (HANGUL_SYLLABLE(leaf->code)) {
1509*6a54128fSAndroid Build Coastguard Worker *data++ = DECOMPOSE;
1510*6a54128fSAndroid Build Coastguard Worker *data++ = HANGUL;
1511*6a54128fSAndroid Build Coastguard Worker } else if (leaf->utf8nfkdi) {
1512*6a54128fSAndroid Build Coastguard Worker *data++ = DECOMPOSE;
1513*6a54128fSAndroid Build Coastguard Worker s = (unsigned char*)leaf->utf8nfkdi;
1514*6a54128fSAndroid Build Coastguard Worker while ((*data++ = *s++) != 0)
1515*6a54128fSAndroid Build Coastguard Worker ;
1516*6a54128fSAndroid Build Coastguard Worker } else {
1517*6a54128fSAndroid Build Coastguard Worker *data++ = leaf->ccc;
1518*6a54128fSAndroid Build Coastguard Worker }
1519*6a54128fSAndroid Build Coastguard Worker return data;
1520*6a54128fSAndroid Build Coastguard Worker }
1521*6a54128fSAndroid Build Coastguard Worker
nfkdicf_emit(void * l,unsigned char * data)1522*6a54128fSAndroid Build Coastguard Worker static unsigned char *nfkdicf_emit(void *l, unsigned char *data)
1523*6a54128fSAndroid Build Coastguard Worker {
1524*6a54128fSAndroid Build Coastguard Worker struct unicode_data *leaf = l;
1525*6a54128fSAndroid Build Coastguard Worker unsigned char *s;
1526*6a54128fSAndroid Build Coastguard Worker
1527*6a54128fSAndroid Build Coastguard Worker *data++ = leaf->gen;
1528*6a54128fSAndroid Build Coastguard Worker if (HANGUL_SYLLABLE(leaf->code)) {
1529*6a54128fSAndroid Build Coastguard Worker *data++ = DECOMPOSE;
1530*6a54128fSAndroid Build Coastguard Worker *data++ = HANGUL;
1531*6a54128fSAndroid Build Coastguard Worker } else if (leaf->utf8nfkdicf) {
1532*6a54128fSAndroid Build Coastguard Worker *data++ = DECOMPOSE;
1533*6a54128fSAndroid Build Coastguard Worker s = (unsigned char*)leaf->utf8nfkdicf;
1534*6a54128fSAndroid Build Coastguard Worker while ((*data++ = *s++) != 0)
1535*6a54128fSAndroid Build Coastguard Worker ;
1536*6a54128fSAndroid Build Coastguard Worker } else if (leaf->utf8nfkdi) {
1537*6a54128fSAndroid Build Coastguard Worker *data++ = DECOMPOSE;
1538*6a54128fSAndroid Build Coastguard Worker s = (unsigned char*)leaf->utf8nfkdi;
1539*6a54128fSAndroid Build Coastguard Worker while ((*data++ = *s++) != 0)
1540*6a54128fSAndroid Build Coastguard Worker ;
1541*6a54128fSAndroid Build Coastguard Worker } else {
1542*6a54128fSAndroid Build Coastguard Worker *data++ = leaf->ccc;
1543*6a54128fSAndroid Build Coastguard Worker }
1544*6a54128fSAndroid Build Coastguard Worker return data;
1545*6a54128fSAndroid Build Coastguard Worker }
1546*6a54128fSAndroid Build Coastguard Worker
utf8_create(struct unicode_data * data)1547*6a54128fSAndroid Build Coastguard Worker static void utf8_create(struct unicode_data *data)
1548*6a54128fSAndroid Build Coastguard Worker {
1549*6a54128fSAndroid Build Coastguard Worker char utf[18*4+1];
1550*6a54128fSAndroid Build Coastguard Worker char *u;
1551*6a54128fSAndroid Build Coastguard Worker unsigned int *um;
1552*6a54128fSAndroid Build Coastguard Worker int i;
1553*6a54128fSAndroid Build Coastguard Worker
1554*6a54128fSAndroid Build Coastguard Worker if (data->utf8nfkdi) {
1555*6a54128fSAndroid Build Coastguard Worker assert(data->utf8nfkdi[0] == HANGUL);
1556*6a54128fSAndroid Build Coastguard Worker return;
1557*6a54128fSAndroid Build Coastguard Worker }
1558*6a54128fSAndroid Build Coastguard Worker
1559*6a54128fSAndroid Build Coastguard Worker u = utf;
1560*6a54128fSAndroid Build Coastguard Worker um = data->utf32nfkdi;
1561*6a54128fSAndroid Build Coastguard Worker if (um) {
1562*6a54128fSAndroid Build Coastguard Worker for (i = 0; um[i]; i++)
1563*6a54128fSAndroid Build Coastguard Worker u += utf8encode(u, um[i]);
1564*6a54128fSAndroid Build Coastguard Worker *u = '\0';
1565*6a54128fSAndroid Build Coastguard Worker data->utf8nfkdi = strdup(utf);
1566*6a54128fSAndroid Build Coastguard Worker }
1567*6a54128fSAndroid Build Coastguard Worker u = utf;
1568*6a54128fSAndroid Build Coastguard Worker um = data->utf32nfkdicf;
1569*6a54128fSAndroid Build Coastguard Worker if (um) {
1570*6a54128fSAndroid Build Coastguard Worker for (i = 0; um[i]; i++)
1571*6a54128fSAndroid Build Coastguard Worker u += utf8encode(u, um[i]);
1572*6a54128fSAndroid Build Coastguard Worker *u = '\0';
1573*6a54128fSAndroid Build Coastguard Worker if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, utf))
1574*6a54128fSAndroid Build Coastguard Worker data->utf8nfkdicf = strdup(utf);
1575*6a54128fSAndroid Build Coastguard Worker }
1576*6a54128fSAndroid Build Coastguard Worker }
1577*6a54128fSAndroid Build Coastguard Worker
utf8_init(void)1578*6a54128fSAndroid Build Coastguard Worker static void utf8_init(void)
1579*6a54128fSAndroid Build Coastguard Worker {
1580*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
1581*6a54128fSAndroid Build Coastguard Worker int i;
1582*6a54128fSAndroid Build Coastguard Worker
1583*6a54128fSAndroid Build Coastguard Worker for (unichar = 0; unichar != 0x110000; unichar++)
1584*6a54128fSAndroid Build Coastguard Worker utf8_create(&unicode_data[unichar]);
1585*6a54128fSAndroid Build Coastguard Worker
1586*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != corrections_count; i++)
1587*6a54128fSAndroid Build Coastguard Worker utf8_create(&corrections[i]);
1588*6a54128fSAndroid Build Coastguard Worker }
1589*6a54128fSAndroid Build Coastguard Worker
trees_init(void)1590*6a54128fSAndroid Build Coastguard Worker static void trees_init(void)
1591*6a54128fSAndroid Build Coastguard Worker {
1592*6a54128fSAndroid Build Coastguard Worker struct unicode_data *data;
1593*6a54128fSAndroid Build Coastguard Worker unsigned int maxage;
1594*6a54128fSAndroid Build Coastguard Worker unsigned int nextage;
1595*6a54128fSAndroid Build Coastguard Worker int count;
1596*6a54128fSAndroid Build Coastguard Worker int i;
1597*6a54128fSAndroid Build Coastguard Worker int j;
1598*6a54128fSAndroid Build Coastguard Worker
1599*6a54128fSAndroid Build Coastguard Worker /* Count the number of different ages. */
1600*6a54128fSAndroid Build Coastguard Worker count = 0;
1601*6a54128fSAndroid Build Coastguard Worker nextage = (unsigned int)-1;
1602*6a54128fSAndroid Build Coastguard Worker do {
1603*6a54128fSAndroid Build Coastguard Worker maxage = nextage;
1604*6a54128fSAndroid Build Coastguard Worker nextage = 0;
1605*6a54128fSAndroid Build Coastguard Worker for (i = 0; i <= corrections_count; i++) {
1606*6a54128fSAndroid Build Coastguard Worker data = &corrections[i];
1607*6a54128fSAndroid Build Coastguard Worker if (nextage < data->correction &&
1608*6a54128fSAndroid Build Coastguard Worker data->correction < maxage)
1609*6a54128fSAndroid Build Coastguard Worker nextage = data->correction;
1610*6a54128fSAndroid Build Coastguard Worker }
1611*6a54128fSAndroid Build Coastguard Worker count++;
1612*6a54128fSAndroid Build Coastguard Worker } while (nextage);
1613*6a54128fSAndroid Build Coastguard Worker
1614*6a54128fSAndroid Build Coastguard Worker /* Two trees per age: nfkdi and nfkdicf */
1615*6a54128fSAndroid Build Coastguard Worker trees_count = count * 2;
1616*6a54128fSAndroid Build Coastguard Worker trees = calloc(trees_count, sizeof(struct tree));
1617*6a54128fSAndroid Build Coastguard Worker
1618*6a54128fSAndroid Build Coastguard Worker /* Assign ages to the trees. */
1619*6a54128fSAndroid Build Coastguard Worker count = trees_count;
1620*6a54128fSAndroid Build Coastguard Worker nextage = (unsigned int)-1;
1621*6a54128fSAndroid Build Coastguard Worker do {
1622*6a54128fSAndroid Build Coastguard Worker maxage = nextage;
1623*6a54128fSAndroid Build Coastguard Worker trees[--count].maxage = maxage;
1624*6a54128fSAndroid Build Coastguard Worker trees[--count].maxage = maxage;
1625*6a54128fSAndroid Build Coastguard Worker nextage = 0;
1626*6a54128fSAndroid Build Coastguard Worker for (i = 0; i <= corrections_count; i++) {
1627*6a54128fSAndroid Build Coastguard Worker data = &corrections[i];
1628*6a54128fSAndroid Build Coastguard Worker if (nextage < data->correction &&
1629*6a54128fSAndroid Build Coastguard Worker data->correction < maxage)
1630*6a54128fSAndroid Build Coastguard Worker nextage = data->correction;
1631*6a54128fSAndroid Build Coastguard Worker }
1632*6a54128fSAndroid Build Coastguard Worker } while (nextage);
1633*6a54128fSAndroid Build Coastguard Worker
1634*6a54128fSAndroid Build Coastguard Worker /* The ages assigned above are off by one. */
1635*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++) {
1636*6a54128fSAndroid Build Coastguard Worker j = 0;
1637*6a54128fSAndroid Build Coastguard Worker while (ages[j] < trees[i].maxage)
1638*6a54128fSAndroid Build Coastguard Worker j++;
1639*6a54128fSAndroid Build Coastguard Worker trees[i].maxage = ages[j-1];
1640*6a54128fSAndroid Build Coastguard Worker }
1641*6a54128fSAndroid Build Coastguard Worker
1642*6a54128fSAndroid Build Coastguard Worker /* Set up the forwarding between trees. */
1643*6a54128fSAndroid Build Coastguard Worker trees[trees_count-2].next = &trees[trees_count-1];
1644*6a54128fSAndroid Build Coastguard Worker trees[trees_count-1].leaf_mark = nfkdi_mark;
1645*6a54128fSAndroid Build Coastguard Worker trees[trees_count-2].leaf_mark = nfkdicf_mark;
1646*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count-2; i += 2) {
1647*6a54128fSAndroid Build Coastguard Worker trees[i].next = &trees[trees_count-2];
1648*6a54128fSAndroid Build Coastguard Worker trees[i].leaf_mark = correction_mark;
1649*6a54128fSAndroid Build Coastguard Worker trees[i+1].next = &trees[trees_count-1];
1650*6a54128fSAndroid Build Coastguard Worker trees[i+1].leaf_mark = correction_mark;
1651*6a54128fSAndroid Build Coastguard Worker }
1652*6a54128fSAndroid Build Coastguard Worker
1653*6a54128fSAndroid Build Coastguard Worker /* Assign the callouts. */
1654*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i += 2) {
1655*6a54128fSAndroid Build Coastguard Worker trees[i].type = "nfkdicf";
1656*6a54128fSAndroid Build Coastguard Worker trees[i].leaf_equal = nfkdicf_equal;
1657*6a54128fSAndroid Build Coastguard Worker trees[i].leaf_print = nfkdicf_print;
1658*6a54128fSAndroid Build Coastguard Worker trees[i].leaf_size = nfkdicf_size;
1659*6a54128fSAndroid Build Coastguard Worker trees[i].leaf_index = nfkdicf_index;
1660*6a54128fSAndroid Build Coastguard Worker trees[i].leaf_emit = nfkdicf_emit;
1661*6a54128fSAndroid Build Coastguard Worker
1662*6a54128fSAndroid Build Coastguard Worker trees[i+1].type = "nfkdi";
1663*6a54128fSAndroid Build Coastguard Worker trees[i+1].leaf_equal = nfkdi_equal;
1664*6a54128fSAndroid Build Coastguard Worker trees[i+1].leaf_print = nfkdi_print;
1665*6a54128fSAndroid Build Coastguard Worker trees[i+1].leaf_size = nfkdi_size;
1666*6a54128fSAndroid Build Coastguard Worker trees[i+1].leaf_index = nfkdi_index;
1667*6a54128fSAndroid Build Coastguard Worker trees[i+1].leaf_emit = nfkdi_emit;
1668*6a54128fSAndroid Build Coastguard Worker }
1669*6a54128fSAndroid Build Coastguard Worker
1670*6a54128fSAndroid Build Coastguard Worker /* Finish init. */
1671*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++)
1672*6a54128fSAndroid Build Coastguard Worker trees[i].childnode = NODE;
1673*6a54128fSAndroid Build Coastguard Worker }
1674*6a54128fSAndroid Build Coastguard Worker
trees_populate(void)1675*6a54128fSAndroid Build Coastguard Worker static void trees_populate(void)
1676*6a54128fSAndroid Build Coastguard Worker {
1677*6a54128fSAndroid Build Coastguard Worker struct unicode_data *data;
1678*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
1679*6a54128fSAndroid Build Coastguard Worker char keyval[4];
1680*6a54128fSAndroid Build Coastguard Worker int keylen;
1681*6a54128fSAndroid Build Coastguard Worker int i;
1682*6a54128fSAndroid Build Coastguard Worker
1683*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++) {
1684*6a54128fSAndroid Build Coastguard Worker if (verbose > 0) {
1685*6a54128fSAndroid Build Coastguard Worker printf("Populating %s_%x\n",
1686*6a54128fSAndroid Build Coastguard Worker trees[i].type, trees[i].maxage);
1687*6a54128fSAndroid Build Coastguard Worker }
1688*6a54128fSAndroid Build Coastguard Worker for (unichar = 0; unichar != 0x110000; unichar++) {
1689*6a54128fSAndroid Build Coastguard Worker if (unicode_data[unichar].gen < 0)
1690*6a54128fSAndroid Build Coastguard Worker continue;
1691*6a54128fSAndroid Build Coastguard Worker keylen = utf8encode(keyval, unichar);
1692*6a54128fSAndroid Build Coastguard Worker data = corrections_lookup(&unicode_data[unichar]);
1693*6a54128fSAndroid Build Coastguard Worker if (data->correction <= trees[i].maxage)
1694*6a54128fSAndroid Build Coastguard Worker data = &unicode_data[unichar];
1695*6a54128fSAndroid Build Coastguard Worker insert(&trees[i], keyval, keylen, data);
1696*6a54128fSAndroid Build Coastguard Worker }
1697*6a54128fSAndroid Build Coastguard Worker }
1698*6a54128fSAndroid Build Coastguard Worker }
1699*6a54128fSAndroid Build Coastguard Worker
trees_reduce(void)1700*6a54128fSAndroid Build Coastguard Worker static void trees_reduce(void)
1701*6a54128fSAndroid Build Coastguard Worker {
1702*6a54128fSAndroid Build Coastguard Worker int i;
1703*6a54128fSAndroid Build Coastguard Worker int size;
1704*6a54128fSAndroid Build Coastguard Worker int changed;
1705*6a54128fSAndroid Build Coastguard Worker
1706*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++)
1707*6a54128fSAndroid Build Coastguard Worker prune(&trees[i]);
1708*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++)
1709*6a54128fSAndroid Build Coastguard Worker mark_nodes(&trees[i]);
1710*6a54128fSAndroid Build Coastguard Worker do {
1711*6a54128fSAndroid Build Coastguard Worker size = 0;
1712*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++)
1713*6a54128fSAndroid Build Coastguard Worker size = index_nodes(&trees[i], size);
1714*6a54128fSAndroid Build Coastguard Worker changed = 0;
1715*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++)
1716*6a54128fSAndroid Build Coastguard Worker changed += size_nodes(&trees[i]);
1717*6a54128fSAndroid Build Coastguard Worker } while (changed);
1718*6a54128fSAndroid Build Coastguard Worker
1719*6a54128fSAndroid Build Coastguard Worker utf8data = calloc(size, 1);
1720*6a54128fSAndroid Build Coastguard Worker utf8data_size = size;
1721*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++)
1722*6a54128fSAndroid Build Coastguard Worker emit(&trees[i], utf8data);
1723*6a54128fSAndroid Build Coastguard Worker
1724*6a54128fSAndroid Build Coastguard Worker if (verbose > 0) {
1725*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++) {
1726*6a54128fSAndroid Build Coastguard Worker printf("%s_%x idx %d\n",
1727*6a54128fSAndroid Build Coastguard Worker trees[i].type, trees[i].maxage, trees[i].index);
1728*6a54128fSAndroid Build Coastguard Worker }
1729*6a54128fSAndroid Build Coastguard Worker }
1730*6a54128fSAndroid Build Coastguard Worker
1731*6a54128fSAndroid Build Coastguard Worker nfkdi = utf8data + trees[trees_count-1].index;
1732*6a54128fSAndroid Build Coastguard Worker nfkdicf = utf8data + trees[trees_count-2].index;
1733*6a54128fSAndroid Build Coastguard Worker
1734*6a54128fSAndroid Build Coastguard Worker nfkdi_tree = &trees[trees_count-1];
1735*6a54128fSAndroid Build Coastguard Worker nfkdicf_tree = &trees[trees_count-2];
1736*6a54128fSAndroid Build Coastguard Worker }
1737*6a54128fSAndroid Build Coastguard Worker
verify(struct tree * tree)1738*6a54128fSAndroid Build Coastguard Worker static void verify(struct tree *tree)
1739*6a54128fSAndroid Build Coastguard Worker {
1740*6a54128fSAndroid Build Coastguard Worker struct unicode_data *data;
1741*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
1742*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
1743*6a54128fSAndroid Build Coastguard Worker char key[4];
1744*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
1745*6a54128fSAndroid Build Coastguard Worker int report;
1746*6a54128fSAndroid Build Coastguard Worker int nocf;
1747*6a54128fSAndroid Build Coastguard Worker
1748*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
1749*6a54128fSAndroid Build Coastguard Worker printf("Verifying %s_%x\n", tree->type, tree->maxage);
1750*6a54128fSAndroid Build Coastguard Worker nocf = strcmp(tree->type, "nfkdicf");
1751*6a54128fSAndroid Build Coastguard Worker
1752*6a54128fSAndroid Build Coastguard Worker for (unichar = 0; unichar != 0x110000; unichar++) {
1753*6a54128fSAndroid Build Coastguard Worker report = 0;
1754*6a54128fSAndroid Build Coastguard Worker data = corrections_lookup(&unicode_data[unichar]);
1755*6a54128fSAndroid Build Coastguard Worker if (data->correction <= tree->maxage)
1756*6a54128fSAndroid Build Coastguard Worker data = &unicode_data[unichar];
1757*6a54128fSAndroid Build Coastguard Worker utf8encode(key,unichar);
1758*6a54128fSAndroid Build Coastguard Worker leaf = utf8lookup(tree, hangul, key);
1759*6a54128fSAndroid Build Coastguard Worker
1760*6a54128fSAndroid Build Coastguard Worker if (!leaf) {
1761*6a54128fSAndroid Build Coastguard Worker if (data->gen != -1)
1762*6a54128fSAndroid Build Coastguard Worker report++;
1763*6a54128fSAndroid Build Coastguard Worker if (unichar < 0xd800 || unichar > 0xdfff)
1764*6a54128fSAndroid Build Coastguard Worker report++;
1765*6a54128fSAndroid Build Coastguard Worker } else {
1766*6a54128fSAndroid Build Coastguard Worker if (unichar >= 0xd800 && unichar <= 0xdfff)
1767*6a54128fSAndroid Build Coastguard Worker report++;
1768*6a54128fSAndroid Build Coastguard Worker if (data->gen == -1)
1769*6a54128fSAndroid Build Coastguard Worker report++;
1770*6a54128fSAndroid Build Coastguard Worker if (data->gen != LEAF_GEN(leaf))
1771*6a54128fSAndroid Build Coastguard Worker report++;
1772*6a54128fSAndroid Build Coastguard Worker if (LEAF_CCC(leaf) == DECOMPOSE) {
1773*6a54128fSAndroid Build Coastguard Worker if (HANGUL_SYLLABLE(data->code)) {
1774*6a54128fSAndroid Build Coastguard Worker if (data->utf8nfkdi[0] != HANGUL)
1775*6a54128fSAndroid Build Coastguard Worker report++;
1776*6a54128fSAndroid Build Coastguard Worker } else if (nocf) {
1777*6a54128fSAndroid Build Coastguard Worker if (!data->utf8nfkdi) {
1778*6a54128fSAndroid Build Coastguard Worker report++;
1779*6a54128fSAndroid Build Coastguard Worker } else if (strcmp(data->utf8nfkdi,
1780*6a54128fSAndroid Build Coastguard Worker LEAF_STR(leaf))) {
1781*6a54128fSAndroid Build Coastguard Worker report++;
1782*6a54128fSAndroid Build Coastguard Worker }
1783*6a54128fSAndroid Build Coastguard Worker } else {
1784*6a54128fSAndroid Build Coastguard Worker if (!data->utf8nfkdicf &&
1785*6a54128fSAndroid Build Coastguard Worker !data->utf8nfkdi) {
1786*6a54128fSAndroid Build Coastguard Worker report++;
1787*6a54128fSAndroid Build Coastguard Worker } else if (data->utf8nfkdicf) {
1788*6a54128fSAndroid Build Coastguard Worker if (strcmp(data->utf8nfkdicf,
1789*6a54128fSAndroid Build Coastguard Worker LEAF_STR(leaf)))
1790*6a54128fSAndroid Build Coastguard Worker report++;
1791*6a54128fSAndroid Build Coastguard Worker } else if (strcmp(data->utf8nfkdi,
1792*6a54128fSAndroid Build Coastguard Worker LEAF_STR(leaf))) {
1793*6a54128fSAndroid Build Coastguard Worker report++;
1794*6a54128fSAndroid Build Coastguard Worker }
1795*6a54128fSAndroid Build Coastguard Worker }
1796*6a54128fSAndroid Build Coastguard Worker } else if (data->ccc != LEAF_CCC(leaf)) {
1797*6a54128fSAndroid Build Coastguard Worker report++;
1798*6a54128fSAndroid Build Coastguard Worker }
1799*6a54128fSAndroid Build Coastguard Worker }
1800*6a54128fSAndroid Build Coastguard Worker if (report) {
1801*6a54128fSAndroid Build Coastguard Worker printf("%X code %X gen %d ccc %d"
1802*6a54128fSAndroid Build Coastguard Worker " nfkdi -> \"%s\"",
1803*6a54128fSAndroid Build Coastguard Worker unichar, data->code, data->gen,
1804*6a54128fSAndroid Build Coastguard Worker data->ccc,
1805*6a54128fSAndroid Build Coastguard Worker data->utf8nfkdi);
1806*6a54128fSAndroid Build Coastguard Worker if (leaf) {
1807*6a54128fSAndroid Build Coastguard Worker printf(" gen %d ccc %d"
1808*6a54128fSAndroid Build Coastguard Worker " nfkdi -> \"%s\"",
1809*6a54128fSAndroid Build Coastguard Worker LEAF_GEN(leaf),
1810*6a54128fSAndroid Build Coastguard Worker LEAF_CCC(leaf),
1811*6a54128fSAndroid Build Coastguard Worker LEAF_CCC(leaf) == DECOMPOSE ?
1812*6a54128fSAndroid Build Coastguard Worker LEAF_STR(leaf) : "");
1813*6a54128fSAndroid Build Coastguard Worker }
1814*6a54128fSAndroid Build Coastguard Worker printf("\n");
1815*6a54128fSAndroid Build Coastguard Worker }
1816*6a54128fSAndroid Build Coastguard Worker }
1817*6a54128fSAndroid Build Coastguard Worker }
1818*6a54128fSAndroid Build Coastguard Worker
trees_verify(void)1819*6a54128fSAndroid Build Coastguard Worker static void trees_verify(void)
1820*6a54128fSAndroid Build Coastguard Worker {
1821*6a54128fSAndroid Build Coastguard Worker int i;
1822*6a54128fSAndroid Build Coastguard Worker
1823*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != trees_count; i++)
1824*6a54128fSAndroid Build Coastguard Worker verify(&trees[i]);
1825*6a54128fSAndroid Build Coastguard Worker }
1826*6a54128fSAndroid Build Coastguard Worker
1827*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
1828*6a54128fSAndroid Build Coastguard Worker
help(void)1829*6a54128fSAndroid Build Coastguard Worker static void help(void)
1830*6a54128fSAndroid Build Coastguard Worker {
1831*6a54128fSAndroid Build Coastguard Worker printf("Usage: %s [options]\n", argv0);
1832*6a54128fSAndroid Build Coastguard Worker printf("\n");
1833*6a54128fSAndroid Build Coastguard Worker printf("This program creates an a data trie used for parsing and\n");
1834*6a54128fSAndroid Build Coastguard Worker printf("normalization of UTF-8 strings. The trie is derived from\n");
1835*6a54128fSAndroid Build Coastguard Worker printf("a set of input files from the Unicode character database\n");
1836*6a54128fSAndroid Build Coastguard Worker printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1837*6a54128fSAndroid Build Coastguard Worker printf("\n");
1838*6a54128fSAndroid Build Coastguard Worker printf("The generated tree supports two normalization forms:\n");
1839*6a54128fSAndroid Build Coastguard Worker printf("\n");
1840*6a54128fSAndroid Build Coastguard Worker printf("\tnfkdi:\n");
1841*6a54128fSAndroid Build Coastguard Worker printf("\t- Apply unicode normalization form NFKD.\n");
1842*6a54128fSAndroid Build Coastguard Worker printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1843*6a54128fSAndroid Build Coastguard Worker printf("\n");
1844*6a54128fSAndroid Build Coastguard Worker printf("\tnfkdicf:\n");
1845*6a54128fSAndroid Build Coastguard Worker printf("\t- Apply unicode normalization form NFKD.\n");
1846*6a54128fSAndroid Build Coastguard Worker printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1847*6a54128fSAndroid Build Coastguard Worker printf("\t- Apply a full casefold (C + F).\n");
1848*6a54128fSAndroid Build Coastguard Worker printf("\n");
1849*6a54128fSAndroid Build Coastguard Worker printf("These forms were chosen as being most useful when dealing\n");
1850*6a54128fSAndroid Build Coastguard Worker printf("with file names: NFKD catches most cases where characters\n");
1851*6a54128fSAndroid Build Coastguard Worker printf("should be considered equivalent. The ignorables are mostly\n");
1852*6a54128fSAndroid Build Coastguard Worker printf("invisible, making names hard to type.\n");
1853*6a54128fSAndroid Build Coastguard Worker printf("\n");
1854*6a54128fSAndroid Build Coastguard Worker printf("The options to specify the files to be used are listed\n");
1855*6a54128fSAndroid Build Coastguard Worker printf("below with their default values, which are the names used\n");
1856*6a54128fSAndroid Build Coastguard Worker printf("by version 11.0.0 of the Unicode Character Database.\n");
1857*6a54128fSAndroid Build Coastguard Worker printf("\n");
1858*6a54128fSAndroid Build Coastguard Worker printf("The input files:\n");
1859*6a54128fSAndroid Build Coastguard Worker printf("\t-a %s\n", AGE_NAME);
1860*6a54128fSAndroid Build Coastguard Worker printf("\t-c %s\n", CCC_NAME);
1861*6a54128fSAndroid Build Coastguard Worker printf("\t-p %s\n", PROP_NAME);
1862*6a54128fSAndroid Build Coastguard Worker printf("\t-d %s\n", DATA_NAME);
1863*6a54128fSAndroid Build Coastguard Worker printf("\t-f %s\n", FOLD_NAME);
1864*6a54128fSAndroid Build Coastguard Worker printf("\t-n %s\n", NORM_NAME);
1865*6a54128fSAndroid Build Coastguard Worker printf("\n");
1866*6a54128fSAndroid Build Coastguard Worker printf("Additionally, the generated tables are tested using:\n");
1867*6a54128fSAndroid Build Coastguard Worker printf("\t-t %s\n", TEST_NAME);
1868*6a54128fSAndroid Build Coastguard Worker printf("\n");
1869*6a54128fSAndroid Build Coastguard Worker printf("Finally, the output file:\n");
1870*6a54128fSAndroid Build Coastguard Worker printf("\t-o %s\n", UTF8_NAME);
1871*6a54128fSAndroid Build Coastguard Worker printf("\n");
1872*6a54128fSAndroid Build Coastguard Worker }
1873*6a54128fSAndroid Build Coastguard Worker
/*
 * Print the usage text via help() and terminate the program with
 * exit status 1.  Does not return.
 */
static void usage(void)
{
	help();

	exit(1);
}
1879*6a54128fSAndroid Build Coastguard Worker
/*
 * Report that a file could not be opened and terminate with exit
 * status 1.
 *
 * name:  path of the file that failed to open.
 * error: errno-style error number; it is printed both numerically
 *        and as the text returned by strerror().
 */
static void open_fail(const char *name, int error)
{
	const char *reason = strerror(error);

	printf("Error %d opening %s: %s\n", error, name, reason);
	exit(1);
}
1885*6a54128fSAndroid Build Coastguard Worker
/*
 * Report a parse failure that concerns a whole input file and
 * terminate with exit status 1.
 *
 * filename: name of the file being parsed.
 */
static void file_fail(const char *filename)
{
	printf("Error parsing %s\n",
	       filename);

	exit(1);
}
1891*6a54128fSAndroid Build Coastguard Worker
/*
 * Report a parse failure on a specific input line and terminate with
 * exit status 1.
 *
 * filename: name of the file being parsed.
 * line:     text of the offending line; printed as-is after the
 *           file name.
 */
static void line_fail(const char *filename, const char *line)
{
	printf("Error parsing %s:%s\n",
	       filename, line);

	exit(1);
}
1897*6a54128fSAndroid Build Coastguard Worker
1898*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
1899*6a54128fSAndroid Build Coastguard Worker
/*
 * Print every element of a zero-terminated array of unsigned ints in
 * upper-case hex, each preceded by a single space.  The terminating
 * zero is not printed and no newline is emitted.
 */
static void print_utf32(unsigned int *utf32str)
{
	unsigned int *cursor;

	for (cursor = utf32str; *cursor != 0; cursor++)
		printf(" %X", *cursor);
}
1907*6a54128fSAndroid Build Coastguard Worker
print_utf32nfkdi(unsigned int unichar)1908*6a54128fSAndroid Build Coastguard Worker static void print_utf32nfkdi(unsigned int unichar)
1909*6a54128fSAndroid Build Coastguard Worker {
1910*6a54128fSAndroid Build Coastguard Worker printf(" %X ->", unichar);
1911*6a54128fSAndroid Build Coastguard Worker print_utf32(unicode_data[unichar].utf32nfkdi);
1912*6a54128fSAndroid Build Coastguard Worker printf("\n");
1913*6a54128fSAndroid Build Coastguard Worker }
1914*6a54128fSAndroid Build Coastguard Worker
print_utf32nfkdicf(unsigned int unichar)1915*6a54128fSAndroid Build Coastguard Worker static void print_utf32nfkdicf(unsigned int unichar)
1916*6a54128fSAndroid Build Coastguard Worker {
1917*6a54128fSAndroid Build Coastguard Worker printf(" %X ->", unichar);
1918*6a54128fSAndroid Build Coastguard Worker print_utf32(unicode_data[unichar].utf32nfkdicf);
1919*6a54128fSAndroid Build Coastguard Worker printf("\n");
1920*6a54128fSAndroid Build Coastguard Worker }
1921*6a54128fSAndroid Build Coastguard Worker
1922*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
1923*6a54128fSAndroid Build Coastguard Worker
age_init(void)1924*6a54128fSAndroid Build Coastguard Worker static void age_init(void)
1925*6a54128fSAndroid Build Coastguard Worker {
1926*6a54128fSAndroid Build Coastguard Worker FILE *file;
1927*6a54128fSAndroid Build Coastguard Worker unsigned int first;
1928*6a54128fSAndroid Build Coastguard Worker unsigned int last;
1929*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
1930*6a54128fSAndroid Build Coastguard Worker unsigned int major;
1931*6a54128fSAndroid Build Coastguard Worker unsigned int minor;
1932*6a54128fSAndroid Build Coastguard Worker unsigned int revision;
1933*6a54128fSAndroid Build Coastguard Worker int gen;
1934*6a54128fSAndroid Build Coastguard Worker int count;
1935*6a54128fSAndroid Build Coastguard Worker int ret;
1936*6a54128fSAndroid Build Coastguard Worker
1937*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
1938*6a54128fSAndroid Build Coastguard Worker printf("Parsing %s\n", age_name);
1939*6a54128fSAndroid Build Coastguard Worker
1940*6a54128fSAndroid Build Coastguard Worker file = fopen(age_name, "r");
1941*6a54128fSAndroid Build Coastguard Worker if (!file)
1942*6a54128fSAndroid Build Coastguard Worker open_fail(age_name, errno);
1943*6a54128fSAndroid Build Coastguard Worker count = 0;
1944*6a54128fSAndroid Build Coastguard Worker
1945*6a54128fSAndroid Build Coastguard Worker gen = 0;
1946*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
1947*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "# Age=V%d_%d_%d",
1948*6a54128fSAndroid Build Coastguard Worker &major, &minor, &revision);
1949*6a54128fSAndroid Build Coastguard Worker if (ret == 3) {
1950*6a54128fSAndroid Build Coastguard Worker ages_count++;
1951*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
1952*6a54128fSAndroid Build Coastguard Worker printf(" Age V%d_%d_%d\n",
1953*6a54128fSAndroid Build Coastguard Worker major, minor, revision);
1954*6a54128fSAndroid Build Coastguard Worker if (!age_valid(major, minor, revision))
1955*6a54128fSAndroid Build Coastguard Worker line_fail(age_name, line);
1956*6a54128fSAndroid Build Coastguard Worker continue;
1957*6a54128fSAndroid Build Coastguard Worker }
1958*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1959*6a54128fSAndroid Build Coastguard Worker if (ret == 2) {
1960*6a54128fSAndroid Build Coastguard Worker ages_count++;
1961*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
1962*6a54128fSAndroid Build Coastguard Worker printf(" Age V%d_%d\n", major, minor);
1963*6a54128fSAndroid Build Coastguard Worker if (!age_valid(major, minor, 0))
1964*6a54128fSAndroid Build Coastguard Worker line_fail(age_name, line);
1965*6a54128fSAndroid Build Coastguard Worker continue;
1966*6a54128fSAndroid Build Coastguard Worker }
1967*6a54128fSAndroid Build Coastguard Worker }
1968*6a54128fSAndroid Build Coastguard Worker
1969*6a54128fSAndroid Build Coastguard Worker /* We must have found something above. */
1970*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
1971*6a54128fSAndroid Build Coastguard Worker printf("%d age entries\n", ages_count);
1972*6a54128fSAndroid Build Coastguard Worker if (ages_count == 0 || ages_count > MAXGEN)
1973*6a54128fSAndroid Build Coastguard Worker file_fail(age_name);
1974*6a54128fSAndroid Build Coastguard Worker
1975*6a54128fSAndroid Build Coastguard Worker /* There is a 0 entry. */
1976*6a54128fSAndroid Build Coastguard Worker ages_count++;
1977*6a54128fSAndroid Build Coastguard Worker ages = calloc(ages_count + 1, sizeof(*ages));
1978*6a54128fSAndroid Build Coastguard Worker /* And a guard entry. */
1979*6a54128fSAndroid Build Coastguard Worker ages[ages_count] = (unsigned int)-1;
1980*6a54128fSAndroid Build Coastguard Worker
1981*6a54128fSAndroid Build Coastguard Worker rewind(file);
1982*6a54128fSAndroid Build Coastguard Worker count = 0;
1983*6a54128fSAndroid Build Coastguard Worker gen = 0;
1984*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
1985*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "# Age=V%d_%d_%d",
1986*6a54128fSAndroid Build Coastguard Worker &major, &minor, &revision);
1987*6a54128fSAndroid Build Coastguard Worker if (ret == 3) {
1988*6a54128fSAndroid Build Coastguard Worker ages[++gen] =
1989*6a54128fSAndroid Build Coastguard Worker UNICODE_AGE(major, minor, revision);
1990*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
1991*6a54128fSAndroid Build Coastguard Worker printf(" Age V%d_%d_%d = gen %d\n",
1992*6a54128fSAndroid Build Coastguard Worker major, minor, revision, gen);
1993*6a54128fSAndroid Build Coastguard Worker if (!age_valid(major, minor, revision))
1994*6a54128fSAndroid Build Coastguard Worker line_fail(age_name, line);
1995*6a54128fSAndroid Build Coastguard Worker continue;
1996*6a54128fSAndroid Build Coastguard Worker }
1997*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1998*6a54128fSAndroid Build Coastguard Worker if (ret == 2) {
1999*6a54128fSAndroid Build Coastguard Worker ages[++gen] = UNICODE_AGE(major, minor, 0);
2000*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2001*6a54128fSAndroid Build Coastguard Worker printf(" Age V%d_%d = %d\n",
2002*6a54128fSAndroid Build Coastguard Worker major, minor, gen);
2003*6a54128fSAndroid Build Coastguard Worker if (!age_valid(major, minor, 0))
2004*6a54128fSAndroid Build Coastguard Worker line_fail(age_name, line);
2005*6a54128fSAndroid Build Coastguard Worker continue;
2006*6a54128fSAndroid Build Coastguard Worker }
2007*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X..%X ; %d.%d #",
2008*6a54128fSAndroid Build Coastguard Worker &first, &last, &major, &minor);
2009*6a54128fSAndroid Build Coastguard Worker if (ret == 4) {
2010*6a54128fSAndroid Build Coastguard Worker for (unichar = first; unichar <= last; unichar++)
2011*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].gen = gen;
2012*6a54128fSAndroid Build Coastguard Worker count += 1 + last - first;
2013*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2014*6a54128fSAndroid Build Coastguard Worker printf(" %X..%X gen %d\n", first, last, gen);
2015*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(first) || !utf32valid(last))
2016*6a54128fSAndroid Build Coastguard Worker line_fail(age_name, line);
2017*6a54128fSAndroid Build Coastguard Worker continue;
2018*6a54128fSAndroid Build Coastguard Worker }
2019*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2020*6a54128fSAndroid Build Coastguard Worker if (ret == 3) {
2021*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].gen = gen;
2022*6a54128fSAndroid Build Coastguard Worker count++;
2023*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2024*6a54128fSAndroid Build Coastguard Worker printf(" %X gen %d\n", unichar, gen);
2025*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(unichar))
2026*6a54128fSAndroid Build Coastguard Worker line_fail(age_name, line);
2027*6a54128fSAndroid Build Coastguard Worker continue;
2028*6a54128fSAndroid Build Coastguard Worker }
2029*6a54128fSAndroid Build Coastguard Worker }
2030*6a54128fSAndroid Build Coastguard Worker unicode_maxage = ages[gen];
2031*6a54128fSAndroid Build Coastguard Worker fclose(file);
2032*6a54128fSAndroid Build Coastguard Worker
2033*6a54128fSAndroid Build Coastguard Worker /* Nix surrogate block */
2034*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2035*6a54128fSAndroid Build Coastguard Worker printf(" Removing surrogate block D800..DFFF\n");
2036*6a54128fSAndroid Build Coastguard Worker for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2037*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].gen = -1;
2038*6a54128fSAndroid Build Coastguard Worker
2039*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2040*6a54128fSAndroid Build Coastguard Worker printf("Found %d entries\n", count);
2041*6a54128fSAndroid Build Coastguard Worker if (count == 0)
2042*6a54128fSAndroid Build Coastguard Worker file_fail(age_name);
2043*6a54128fSAndroid Build Coastguard Worker }
2044*6a54128fSAndroid Build Coastguard Worker
ccc_init(void)2045*6a54128fSAndroid Build Coastguard Worker static void ccc_init(void)
2046*6a54128fSAndroid Build Coastguard Worker {
2047*6a54128fSAndroid Build Coastguard Worker FILE *file;
2048*6a54128fSAndroid Build Coastguard Worker unsigned int first;
2049*6a54128fSAndroid Build Coastguard Worker unsigned int last;
2050*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2051*6a54128fSAndroid Build Coastguard Worker unsigned int value;
2052*6a54128fSAndroid Build Coastguard Worker int count;
2053*6a54128fSAndroid Build Coastguard Worker int ret;
2054*6a54128fSAndroid Build Coastguard Worker
2055*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2056*6a54128fSAndroid Build Coastguard Worker printf("Parsing %s\n", ccc_name);
2057*6a54128fSAndroid Build Coastguard Worker
2058*6a54128fSAndroid Build Coastguard Worker file = fopen(ccc_name, "r");
2059*6a54128fSAndroid Build Coastguard Worker if (!file)
2060*6a54128fSAndroid Build Coastguard Worker open_fail(ccc_name, errno);
2061*6a54128fSAndroid Build Coastguard Worker
2062*6a54128fSAndroid Build Coastguard Worker count = 0;
2063*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
2064*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2065*6a54128fSAndroid Build Coastguard Worker if (ret == 3) {
2066*6a54128fSAndroid Build Coastguard Worker for (unichar = first; unichar <= last; unichar++) {
2067*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].ccc = value;
2068*6a54128fSAndroid Build Coastguard Worker count++;
2069*6a54128fSAndroid Build Coastguard Worker }
2070*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2071*6a54128fSAndroid Build Coastguard Worker printf(" %X..%X ccc %d\n", first, last, value);
2072*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(first) || !utf32valid(last))
2073*6a54128fSAndroid Build Coastguard Worker line_fail(ccc_name, line);
2074*6a54128fSAndroid Build Coastguard Worker continue;
2075*6a54128fSAndroid Build Coastguard Worker }
2076*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X ; %d #", &unichar, &value);
2077*6a54128fSAndroid Build Coastguard Worker if (ret == 2) {
2078*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].ccc = value;
2079*6a54128fSAndroid Build Coastguard Worker count++;
2080*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2081*6a54128fSAndroid Build Coastguard Worker printf(" %X ccc %d\n", unichar, value);
2082*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(unichar))
2083*6a54128fSAndroid Build Coastguard Worker line_fail(ccc_name, line);
2084*6a54128fSAndroid Build Coastguard Worker continue;
2085*6a54128fSAndroid Build Coastguard Worker }
2086*6a54128fSAndroid Build Coastguard Worker }
2087*6a54128fSAndroid Build Coastguard Worker fclose(file);
2088*6a54128fSAndroid Build Coastguard Worker
2089*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2090*6a54128fSAndroid Build Coastguard Worker printf("Found %d entries\n", count);
2091*6a54128fSAndroid Build Coastguard Worker if (count == 0)
2092*6a54128fSAndroid Build Coastguard Worker file_fail(ccc_name);
2093*6a54128fSAndroid Build Coastguard Worker }
2094*6a54128fSAndroid Build Coastguard Worker
nfkdi_init(void)2095*6a54128fSAndroid Build Coastguard Worker static void nfkdi_init(void)
2096*6a54128fSAndroid Build Coastguard Worker {
2097*6a54128fSAndroid Build Coastguard Worker FILE *file;
2098*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2099*6a54128fSAndroid Build Coastguard Worker unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2100*6a54128fSAndroid Build Coastguard Worker char *s;
2101*6a54128fSAndroid Build Coastguard Worker unsigned int *um;
2102*6a54128fSAndroid Build Coastguard Worker int count;
2103*6a54128fSAndroid Build Coastguard Worker int i;
2104*6a54128fSAndroid Build Coastguard Worker int ret;
2105*6a54128fSAndroid Build Coastguard Worker
2106*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2107*6a54128fSAndroid Build Coastguard Worker printf("Parsing %s\n", data_name);
2108*6a54128fSAndroid Build Coastguard Worker file = fopen(data_name, "r");
2109*6a54128fSAndroid Build Coastguard Worker if (!file)
2110*6a54128fSAndroid Build Coastguard Worker open_fail(data_name, errno);
2111*6a54128fSAndroid Build Coastguard Worker
2112*6a54128fSAndroid Build Coastguard Worker count = 0;
2113*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
2114*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2115*6a54128fSAndroid Build Coastguard Worker &unichar, buf0);
2116*6a54128fSAndroid Build Coastguard Worker if (ret != 2)
2117*6a54128fSAndroid Build Coastguard Worker continue;
2118*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(unichar))
2119*6a54128fSAndroid Build Coastguard Worker line_fail(data_name, line);
2120*6a54128fSAndroid Build Coastguard Worker
2121*6a54128fSAndroid Build Coastguard Worker s = buf0;
2122*6a54128fSAndroid Build Coastguard Worker /* skip over <tag> */
2123*6a54128fSAndroid Build Coastguard Worker if (*s == '<')
2124*6a54128fSAndroid Build Coastguard Worker while (*s++ != ' ')
2125*6a54128fSAndroid Build Coastguard Worker ;
2126*6a54128fSAndroid Build Coastguard Worker /* decode the decomposition into UTF-32 */
2127*6a54128fSAndroid Build Coastguard Worker i = 0;
2128*6a54128fSAndroid Build Coastguard Worker while (*s) {
2129*6a54128fSAndroid Build Coastguard Worker mapping[i] = strtoul(s, &s, 16);
2130*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(mapping[i]))
2131*6a54128fSAndroid Build Coastguard Worker line_fail(data_name, line);
2132*6a54128fSAndroid Build Coastguard Worker i++;
2133*6a54128fSAndroid Build Coastguard Worker }
2134*6a54128fSAndroid Build Coastguard Worker mapping[i++] = 0;
2135*6a54128fSAndroid Build Coastguard Worker
2136*6a54128fSAndroid Build Coastguard Worker um = malloc(i * sizeof(unsigned int));
2137*6a54128fSAndroid Build Coastguard Worker memcpy(um, mapping, i * sizeof(unsigned int));
2138*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdi = um;
2139*6a54128fSAndroid Build Coastguard Worker
2140*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2141*6a54128fSAndroid Build Coastguard Worker print_utf32nfkdi(unichar);
2142*6a54128fSAndroid Build Coastguard Worker count++;
2143*6a54128fSAndroid Build Coastguard Worker }
2144*6a54128fSAndroid Build Coastguard Worker fclose(file);
2145*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2146*6a54128fSAndroid Build Coastguard Worker printf("Found %d entries\n", count);
2147*6a54128fSAndroid Build Coastguard Worker if (count == 0)
2148*6a54128fSAndroid Build Coastguard Worker file_fail(data_name);
2149*6a54128fSAndroid Build Coastguard Worker }
2150*6a54128fSAndroid Build Coastguard Worker
nfkdicf_init(void)2151*6a54128fSAndroid Build Coastguard Worker static void nfkdicf_init(void)
2152*6a54128fSAndroid Build Coastguard Worker {
2153*6a54128fSAndroid Build Coastguard Worker FILE *file;
2154*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2155*6a54128fSAndroid Build Coastguard Worker unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2156*6a54128fSAndroid Build Coastguard Worker char status;
2157*6a54128fSAndroid Build Coastguard Worker char *s;
2158*6a54128fSAndroid Build Coastguard Worker unsigned int *um;
2159*6a54128fSAndroid Build Coastguard Worker int i;
2160*6a54128fSAndroid Build Coastguard Worker int count;
2161*6a54128fSAndroid Build Coastguard Worker int ret;
2162*6a54128fSAndroid Build Coastguard Worker
2163*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2164*6a54128fSAndroid Build Coastguard Worker printf("Parsing %s\n", fold_name);
2165*6a54128fSAndroid Build Coastguard Worker file = fopen(fold_name, "r");
2166*6a54128fSAndroid Build Coastguard Worker if (!file)
2167*6a54128fSAndroid Build Coastguard Worker open_fail(fold_name, errno);
2168*6a54128fSAndroid Build Coastguard Worker
2169*6a54128fSAndroid Build Coastguard Worker count = 0;
2170*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
2171*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2172*6a54128fSAndroid Build Coastguard Worker if (ret != 3)
2173*6a54128fSAndroid Build Coastguard Worker continue;
2174*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(unichar))
2175*6a54128fSAndroid Build Coastguard Worker line_fail(fold_name, line);
2176*6a54128fSAndroid Build Coastguard Worker /* Use the C+F casefold. */
2177*6a54128fSAndroid Build Coastguard Worker if (status != 'C' && status != 'F')
2178*6a54128fSAndroid Build Coastguard Worker continue;
2179*6a54128fSAndroid Build Coastguard Worker s = buf0;
2180*6a54128fSAndroid Build Coastguard Worker if (*s == '<')
2181*6a54128fSAndroid Build Coastguard Worker while (*s++ != ' ')
2182*6a54128fSAndroid Build Coastguard Worker ;
2183*6a54128fSAndroid Build Coastguard Worker i = 0;
2184*6a54128fSAndroid Build Coastguard Worker while (*s) {
2185*6a54128fSAndroid Build Coastguard Worker mapping[i] = strtoul(s, &s, 16);
2186*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(mapping[i]))
2187*6a54128fSAndroid Build Coastguard Worker line_fail(fold_name, line);
2188*6a54128fSAndroid Build Coastguard Worker i++;
2189*6a54128fSAndroid Build Coastguard Worker }
2190*6a54128fSAndroid Build Coastguard Worker mapping[i++] = 0;
2191*6a54128fSAndroid Build Coastguard Worker
2192*6a54128fSAndroid Build Coastguard Worker um = malloc(i * sizeof(unsigned int));
2193*6a54128fSAndroid Build Coastguard Worker memcpy(um, mapping, i * sizeof(unsigned int));
2194*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdicf = um;
2195*6a54128fSAndroid Build Coastguard Worker
2196*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2197*6a54128fSAndroid Build Coastguard Worker print_utf32nfkdicf(unichar);
2198*6a54128fSAndroid Build Coastguard Worker count++;
2199*6a54128fSAndroid Build Coastguard Worker }
2200*6a54128fSAndroid Build Coastguard Worker fclose(file);
2201*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2202*6a54128fSAndroid Build Coastguard Worker printf("Found %d entries\n", count);
2203*6a54128fSAndroid Build Coastguard Worker if (count == 0)
2204*6a54128fSAndroid Build Coastguard Worker file_fail(fold_name);
2205*6a54128fSAndroid Build Coastguard Worker }
2206*6a54128fSAndroid Build Coastguard Worker
ignore_init(void)2207*6a54128fSAndroid Build Coastguard Worker static void ignore_init(void)
2208*6a54128fSAndroid Build Coastguard Worker {
2209*6a54128fSAndroid Build Coastguard Worker FILE *file;
2210*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2211*6a54128fSAndroid Build Coastguard Worker unsigned int first;
2212*6a54128fSAndroid Build Coastguard Worker unsigned int last;
2213*6a54128fSAndroid Build Coastguard Worker unsigned int *um;
2214*6a54128fSAndroid Build Coastguard Worker int count;
2215*6a54128fSAndroid Build Coastguard Worker int ret;
2216*6a54128fSAndroid Build Coastguard Worker
2217*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2218*6a54128fSAndroid Build Coastguard Worker printf("Parsing %s\n", prop_name);
2219*6a54128fSAndroid Build Coastguard Worker file = fopen(prop_name, "r");
2220*6a54128fSAndroid Build Coastguard Worker if (!file)
2221*6a54128fSAndroid Build Coastguard Worker open_fail(prop_name, errno);
2222*6a54128fSAndroid Build Coastguard Worker assert(file);
2223*6a54128fSAndroid Build Coastguard Worker count = 0;
2224*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
2225*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2226*6a54128fSAndroid Build Coastguard Worker if (ret == 3) {
2227*6a54128fSAndroid Build Coastguard Worker if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2228*6a54128fSAndroid Build Coastguard Worker continue;
2229*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(first) || !utf32valid(last))
2230*6a54128fSAndroid Build Coastguard Worker line_fail(prop_name, line);
2231*6a54128fSAndroid Build Coastguard Worker for (unichar = first; unichar <= last; unichar++) {
2232*6a54128fSAndroid Build Coastguard Worker free(unicode_data[unichar].utf32nfkdi);
2233*6a54128fSAndroid Build Coastguard Worker um = malloc(sizeof(unsigned int));
2234*6a54128fSAndroid Build Coastguard Worker *um = 0;
2235*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdi = um;
2236*6a54128fSAndroid Build Coastguard Worker free(unicode_data[unichar].utf32nfkdicf);
2237*6a54128fSAndroid Build Coastguard Worker um = malloc(sizeof(unsigned int));
2238*6a54128fSAndroid Build Coastguard Worker *um = 0;
2239*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdicf = um;
2240*6a54128fSAndroid Build Coastguard Worker count++;
2241*6a54128fSAndroid Build Coastguard Worker }
2242*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2243*6a54128fSAndroid Build Coastguard Worker printf(" %X..%X Default_Ignorable_Code_Point\n",
2244*6a54128fSAndroid Build Coastguard Worker first, last);
2245*6a54128fSAndroid Build Coastguard Worker continue;
2246*6a54128fSAndroid Build Coastguard Worker }
2247*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2248*6a54128fSAndroid Build Coastguard Worker if (ret == 2) {
2249*6a54128fSAndroid Build Coastguard Worker if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2250*6a54128fSAndroid Build Coastguard Worker continue;
2251*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(unichar))
2252*6a54128fSAndroid Build Coastguard Worker line_fail(prop_name, line);
2253*6a54128fSAndroid Build Coastguard Worker free(unicode_data[unichar].utf32nfkdi);
2254*6a54128fSAndroid Build Coastguard Worker um = malloc(sizeof(unsigned int));
2255*6a54128fSAndroid Build Coastguard Worker *um = 0;
2256*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdi = um;
2257*6a54128fSAndroid Build Coastguard Worker free(unicode_data[unichar].utf32nfkdicf);
2258*6a54128fSAndroid Build Coastguard Worker um = malloc(sizeof(unsigned int));
2259*6a54128fSAndroid Build Coastguard Worker *um = 0;
2260*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdicf = um;
2261*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2262*6a54128fSAndroid Build Coastguard Worker printf(" %X Default_Ignorable_Code_Point\n",
2263*6a54128fSAndroid Build Coastguard Worker unichar);
2264*6a54128fSAndroid Build Coastguard Worker count++;
2265*6a54128fSAndroid Build Coastguard Worker continue;
2266*6a54128fSAndroid Build Coastguard Worker }
2267*6a54128fSAndroid Build Coastguard Worker }
2268*6a54128fSAndroid Build Coastguard Worker fclose(file);
2269*6a54128fSAndroid Build Coastguard Worker
2270*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2271*6a54128fSAndroid Build Coastguard Worker printf("Found %d entries\n", count);
2272*6a54128fSAndroid Build Coastguard Worker if (count == 0)
2273*6a54128fSAndroid Build Coastguard Worker file_fail(prop_name);
2274*6a54128fSAndroid Build Coastguard Worker }
2275*6a54128fSAndroid Build Coastguard Worker
corrections_init(void)2276*6a54128fSAndroid Build Coastguard Worker static void corrections_init(void)
2277*6a54128fSAndroid Build Coastguard Worker {
2278*6a54128fSAndroid Build Coastguard Worker FILE *file;
2279*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2280*6a54128fSAndroid Build Coastguard Worker unsigned int major;
2281*6a54128fSAndroid Build Coastguard Worker unsigned int minor;
2282*6a54128fSAndroid Build Coastguard Worker unsigned int revision;
2283*6a54128fSAndroid Build Coastguard Worker unsigned int age;
2284*6a54128fSAndroid Build Coastguard Worker unsigned int *um;
2285*6a54128fSAndroid Build Coastguard Worker unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2286*6a54128fSAndroid Build Coastguard Worker char *s;
2287*6a54128fSAndroid Build Coastguard Worker int i;
2288*6a54128fSAndroid Build Coastguard Worker int count;
2289*6a54128fSAndroid Build Coastguard Worker int ret;
2290*6a54128fSAndroid Build Coastguard Worker
2291*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2292*6a54128fSAndroid Build Coastguard Worker printf("Parsing %s\n", norm_name);
2293*6a54128fSAndroid Build Coastguard Worker file = fopen(norm_name, "r");
2294*6a54128fSAndroid Build Coastguard Worker if (!file)
2295*6a54128fSAndroid Build Coastguard Worker open_fail(norm_name, errno);
2296*6a54128fSAndroid Build Coastguard Worker
2297*6a54128fSAndroid Build Coastguard Worker count = 0;
2298*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
2299*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2300*6a54128fSAndroid Build Coastguard Worker &unichar, buf0, buf1,
2301*6a54128fSAndroid Build Coastguard Worker &major, &minor, &revision);
2302*6a54128fSAndroid Build Coastguard Worker if (ret != 6)
2303*6a54128fSAndroid Build Coastguard Worker continue;
2304*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2305*6a54128fSAndroid Build Coastguard Worker line_fail(norm_name, line);
2306*6a54128fSAndroid Build Coastguard Worker count++;
2307*6a54128fSAndroid Build Coastguard Worker }
2308*6a54128fSAndroid Build Coastguard Worker corrections = calloc(count, sizeof(struct unicode_data));
2309*6a54128fSAndroid Build Coastguard Worker corrections_count = count;
2310*6a54128fSAndroid Build Coastguard Worker rewind(file);
2311*6a54128fSAndroid Build Coastguard Worker
2312*6a54128fSAndroid Build Coastguard Worker count = 0;
2313*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
2314*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2315*6a54128fSAndroid Build Coastguard Worker &unichar, buf0, buf1,
2316*6a54128fSAndroid Build Coastguard Worker &major, &minor, &revision);
2317*6a54128fSAndroid Build Coastguard Worker if (ret != 6)
2318*6a54128fSAndroid Build Coastguard Worker continue;
2319*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2320*6a54128fSAndroid Build Coastguard Worker line_fail(norm_name, line);
2321*6a54128fSAndroid Build Coastguard Worker corrections[count] = unicode_data[unichar];
2322*6a54128fSAndroid Build Coastguard Worker assert(corrections[count].code == unichar);
2323*6a54128fSAndroid Build Coastguard Worker age = UNICODE_AGE(major, minor, revision);
2324*6a54128fSAndroid Build Coastguard Worker corrections[count].correction = age;
2325*6a54128fSAndroid Build Coastguard Worker
2326*6a54128fSAndroid Build Coastguard Worker i = 0;
2327*6a54128fSAndroid Build Coastguard Worker s = buf0;
2328*6a54128fSAndroid Build Coastguard Worker while (*s) {
2329*6a54128fSAndroid Build Coastguard Worker mapping[i] = strtoul(s, &s, 16);
2330*6a54128fSAndroid Build Coastguard Worker if (!utf32valid(mapping[i]))
2331*6a54128fSAndroid Build Coastguard Worker line_fail(norm_name, line);
2332*6a54128fSAndroid Build Coastguard Worker i++;
2333*6a54128fSAndroid Build Coastguard Worker }
2334*6a54128fSAndroid Build Coastguard Worker mapping[i++] = 0;
2335*6a54128fSAndroid Build Coastguard Worker
2336*6a54128fSAndroid Build Coastguard Worker um = malloc(i * sizeof(unsigned int));
2337*6a54128fSAndroid Build Coastguard Worker memcpy(um, mapping, i * sizeof(unsigned int));
2338*6a54128fSAndroid Build Coastguard Worker corrections[count].utf32nfkdi = um;
2339*6a54128fSAndroid Build Coastguard Worker
2340*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2341*6a54128fSAndroid Build Coastguard Worker printf(" %X -> %s -> %s V%d_%d_%d\n",
2342*6a54128fSAndroid Build Coastguard Worker unichar, buf0, buf1, major, minor, revision);
2343*6a54128fSAndroid Build Coastguard Worker count++;
2344*6a54128fSAndroid Build Coastguard Worker }
2345*6a54128fSAndroid Build Coastguard Worker fclose(file);
2346*6a54128fSAndroid Build Coastguard Worker
2347*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2348*6a54128fSAndroid Build Coastguard Worker printf("Found %d entries\n", count);
2349*6a54128fSAndroid Build Coastguard Worker if (count == 0)
2350*6a54128fSAndroid Build Coastguard Worker file_fail(norm_name);
2351*6a54128fSAndroid Build Coastguard Worker }
2352*6a54128fSAndroid Build Coastguard Worker
2353*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
2354*6a54128fSAndroid Build Coastguard Worker
/*
 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 *
 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 *
 * SBase = 0xAC00
 * LBase = 0x1100
 * VBase = 0x1161
 * TBase = 0x11A7
 * LCount = 19
 * VCount = 21
 * TCount = 28
 * NCount = 588 (VCount * TCount)
 * SCount = 11172 (LCount * NCount)
 *
 * Decomposition:
 *   SIndex = s - SBase
 *
 * LV (Canonical/Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *
 * LVT (Canonical)
 *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
 *   LVPart = SBase + LVIndex
 *   TPart = TBase + TIndex
 *
 * LVT (Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *   if (TIndex == 0) {
 *          d = <LPart, VPart>
 *   } else {
 *          TPart = TBase + TIndex
 *          d = <LPart, VPart, TPart>
 *   }
 *
 */
2400*6a54128fSAndroid Build Coastguard Worker
hangul_decompose(void)2401*6a54128fSAndroid Build Coastguard Worker static void hangul_decompose(void)
2402*6a54128fSAndroid Build Coastguard Worker {
2403*6a54128fSAndroid Build Coastguard Worker unsigned int sb = 0xAC00;
2404*6a54128fSAndroid Build Coastguard Worker unsigned int lb = 0x1100;
2405*6a54128fSAndroid Build Coastguard Worker unsigned int vb = 0x1161;
2406*6a54128fSAndroid Build Coastguard Worker unsigned int tb = 0x11a7;
2407*6a54128fSAndroid Build Coastguard Worker /* unsigned int lc = 19; */
2408*6a54128fSAndroid Build Coastguard Worker unsigned int vc = 21;
2409*6a54128fSAndroid Build Coastguard Worker unsigned int tc = 28;
2410*6a54128fSAndroid Build Coastguard Worker unsigned int nc = (vc * tc);
2411*6a54128fSAndroid Build Coastguard Worker /* unsigned int sc = (lc * nc); */
2412*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2413*6a54128fSAndroid Build Coastguard Worker unsigned int mapping[4];
2414*6a54128fSAndroid Build Coastguard Worker unsigned int *um;
2415*6a54128fSAndroid Build Coastguard Worker int count;
2416*6a54128fSAndroid Build Coastguard Worker int i;
2417*6a54128fSAndroid Build Coastguard Worker
2418*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2419*6a54128fSAndroid Build Coastguard Worker printf("Decomposing hangul\n");
2420*6a54128fSAndroid Build Coastguard Worker /* Hangul */
2421*6a54128fSAndroid Build Coastguard Worker count = 0;
2422*6a54128fSAndroid Build Coastguard Worker for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2423*6a54128fSAndroid Build Coastguard Worker unsigned int si = unichar - sb;
2424*6a54128fSAndroid Build Coastguard Worker unsigned int li = si / nc;
2425*6a54128fSAndroid Build Coastguard Worker unsigned int vi = (si % nc) / tc;
2426*6a54128fSAndroid Build Coastguard Worker unsigned int ti = si % tc;
2427*6a54128fSAndroid Build Coastguard Worker
2428*6a54128fSAndroid Build Coastguard Worker i = 0;
2429*6a54128fSAndroid Build Coastguard Worker mapping[i++] = lb + li;
2430*6a54128fSAndroid Build Coastguard Worker mapping[i++] = vb + vi;
2431*6a54128fSAndroid Build Coastguard Worker if (ti)
2432*6a54128fSAndroid Build Coastguard Worker mapping[i++] = tb + ti;
2433*6a54128fSAndroid Build Coastguard Worker mapping[i++] = 0;
2434*6a54128fSAndroid Build Coastguard Worker
2435*6a54128fSAndroid Build Coastguard Worker assert(!unicode_data[unichar].utf32nfkdi);
2436*6a54128fSAndroid Build Coastguard Worker um = malloc(i * sizeof(unsigned int));
2437*6a54128fSAndroid Build Coastguard Worker memcpy(um, mapping, i * sizeof(unsigned int));
2438*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdi = um;
2439*6a54128fSAndroid Build Coastguard Worker
2440*6a54128fSAndroid Build Coastguard Worker assert(!unicode_data[unichar].utf32nfkdicf);
2441*6a54128fSAndroid Build Coastguard Worker um = malloc(i * sizeof(unsigned int));
2442*6a54128fSAndroid Build Coastguard Worker memcpy(um, mapping, i * sizeof(unsigned int));
2443*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdicf = um;
2444*6a54128fSAndroid Build Coastguard Worker
2445*6a54128fSAndroid Build Coastguard Worker /*
2446*6a54128fSAndroid Build Coastguard Worker * Add a cookie as a reminder that the hangul syllable
2447*6a54128fSAndroid Build Coastguard Worker * decompositions must not be stored in the generated
2448*6a54128fSAndroid Build Coastguard Worker * trie.
2449*6a54128fSAndroid Build Coastguard Worker */
2450*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf8nfkdi = malloc(2);
2451*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf8nfkdi[0] = HANGUL;
2452*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf8nfkdi[1] = '\0';
2453*6a54128fSAndroid Build Coastguard Worker
2454*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2455*6a54128fSAndroid Build Coastguard Worker print_utf32nfkdi(unichar);
2456*6a54128fSAndroid Build Coastguard Worker
2457*6a54128fSAndroid Build Coastguard Worker count++;
2458*6a54128fSAndroid Build Coastguard Worker }
2459*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2460*6a54128fSAndroid Build Coastguard Worker printf("Created %d entries\n", count);
2461*6a54128fSAndroid Build Coastguard Worker }
2462*6a54128fSAndroid Build Coastguard Worker
nfkdi_decompose(void)2463*6a54128fSAndroid Build Coastguard Worker static void nfkdi_decompose(void)
2464*6a54128fSAndroid Build Coastguard Worker {
2465*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2466*6a54128fSAndroid Build Coastguard Worker unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2467*6a54128fSAndroid Build Coastguard Worker unsigned int *um;
2468*6a54128fSAndroid Build Coastguard Worker unsigned int *dc;
2469*6a54128fSAndroid Build Coastguard Worker int count;
2470*6a54128fSAndroid Build Coastguard Worker int i;
2471*6a54128fSAndroid Build Coastguard Worker int j;
2472*6a54128fSAndroid Build Coastguard Worker int ret;
2473*6a54128fSAndroid Build Coastguard Worker
2474*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2475*6a54128fSAndroid Build Coastguard Worker printf("Decomposing nfkdi\n");
2476*6a54128fSAndroid Build Coastguard Worker
2477*6a54128fSAndroid Build Coastguard Worker count = 0;
2478*6a54128fSAndroid Build Coastguard Worker for (unichar = 0; unichar != 0x110000; unichar++) {
2479*6a54128fSAndroid Build Coastguard Worker if (!unicode_data[unichar].utf32nfkdi)
2480*6a54128fSAndroid Build Coastguard Worker continue;
2481*6a54128fSAndroid Build Coastguard Worker for (;;) {
2482*6a54128fSAndroid Build Coastguard Worker ret = 1;
2483*6a54128fSAndroid Build Coastguard Worker i = 0;
2484*6a54128fSAndroid Build Coastguard Worker um = unicode_data[unichar].utf32nfkdi;
2485*6a54128fSAndroid Build Coastguard Worker while (*um) {
2486*6a54128fSAndroid Build Coastguard Worker dc = unicode_data[*um].utf32nfkdi;
2487*6a54128fSAndroid Build Coastguard Worker if (dc) {
2488*6a54128fSAndroid Build Coastguard Worker for (j = 0; dc[j]; j++)
2489*6a54128fSAndroid Build Coastguard Worker mapping[i++] = dc[j];
2490*6a54128fSAndroid Build Coastguard Worker ret = 0;
2491*6a54128fSAndroid Build Coastguard Worker } else {
2492*6a54128fSAndroid Build Coastguard Worker mapping[i++] = *um;
2493*6a54128fSAndroid Build Coastguard Worker }
2494*6a54128fSAndroid Build Coastguard Worker um++;
2495*6a54128fSAndroid Build Coastguard Worker }
2496*6a54128fSAndroid Build Coastguard Worker mapping[i++] = 0;
2497*6a54128fSAndroid Build Coastguard Worker if (ret)
2498*6a54128fSAndroid Build Coastguard Worker break;
2499*6a54128fSAndroid Build Coastguard Worker free(unicode_data[unichar].utf32nfkdi);
2500*6a54128fSAndroid Build Coastguard Worker um = malloc(i * sizeof(unsigned int));
2501*6a54128fSAndroid Build Coastguard Worker memcpy(um, mapping, i * sizeof(unsigned int));
2502*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdi = um;
2503*6a54128fSAndroid Build Coastguard Worker }
2504*6a54128fSAndroid Build Coastguard Worker /* Add this decomposition to nfkdicf if there is no entry. */
2505*6a54128fSAndroid Build Coastguard Worker if (!unicode_data[unichar].utf32nfkdicf) {
2506*6a54128fSAndroid Build Coastguard Worker um = malloc(i * sizeof(unsigned int));
2507*6a54128fSAndroid Build Coastguard Worker memcpy(um, mapping, i * sizeof(unsigned int));
2508*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdicf = um;
2509*6a54128fSAndroid Build Coastguard Worker }
2510*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2511*6a54128fSAndroid Build Coastguard Worker print_utf32nfkdi(unichar);
2512*6a54128fSAndroid Build Coastguard Worker count++;
2513*6a54128fSAndroid Build Coastguard Worker }
2514*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2515*6a54128fSAndroid Build Coastguard Worker printf("Processed %d entries\n", count);
2516*6a54128fSAndroid Build Coastguard Worker }
2517*6a54128fSAndroid Build Coastguard Worker
nfkdicf_decompose(void)2518*6a54128fSAndroid Build Coastguard Worker static void nfkdicf_decompose(void)
2519*6a54128fSAndroid Build Coastguard Worker {
2520*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2521*6a54128fSAndroid Build Coastguard Worker unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2522*6a54128fSAndroid Build Coastguard Worker unsigned int *um;
2523*6a54128fSAndroid Build Coastguard Worker unsigned int *dc;
2524*6a54128fSAndroid Build Coastguard Worker int count;
2525*6a54128fSAndroid Build Coastguard Worker int i;
2526*6a54128fSAndroid Build Coastguard Worker int j;
2527*6a54128fSAndroid Build Coastguard Worker int ret;
2528*6a54128fSAndroid Build Coastguard Worker
2529*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2530*6a54128fSAndroid Build Coastguard Worker printf("Decomposing nfkdicf\n");
2531*6a54128fSAndroid Build Coastguard Worker count = 0;
2532*6a54128fSAndroid Build Coastguard Worker for (unichar = 0; unichar != 0x110000; unichar++) {
2533*6a54128fSAndroid Build Coastguard Worker if (!unicode_data[unichar].utf32nfkdicf)
2534*6a54128fSAndroid Build Coastguard Worker continue;
2535*6a54128fSAndroid Build Coastguard Worker for (;;) {
2536*6a54128fSAndroid Build Coastguard Worker ret = 1;
2537*6a54128fSAndroid Build Coastguard Worker i = 0;
2538*6a54128fSAndroid Build Coastguard Worker um = unicode_data[unichar].utf32nfkdicf;
2539*6a54128fSAndroid Build Coastguard Worker while (*um) {
2540*6a54128fSAndroid Build Coastguard Worker dc = unicode_data[*um].utf32nfkdicf;
2541*6a54128fSAndroid Build Coastguard Worker if (dc) {
2542*6a54128fSAndroid Build Coastguard Worker for (j = 0; dc[j]; j++)
2543*6a54128fSAndroid Build Coastguard Worker mapping[i++] = dc[j];
2544*6a54128fSAndroid Build Coastguard Worker ret = 0;
2545*6a54128fSAndroid Build Coastguard Worker } else {
2546*6a54128fSAndroid Build Coastguard Worker mapping[i++] = *um;
2547*6a54128fSAndroid Build Coastguard Worker }
2548*6a54128fSAndroid Build Coastguard Worker um++;
2549*6a54128fSAndroid Build Coastguard Worker }
2550*6a54128fSAndroid Build Coastguard Worker mapping[i++] = 0;
2551*6a54128fSAndroid Build Coastguard Worker if (ret)
2552*6a54128fSAndroid Build Coastguard Worker break;
2553*6a54128fSAndroid Build Coastguard Worker free(unicode_data[unichar].utf32nfkdicf);
2554*6a54128fSAndroid Build Coastguard Worker um = malloc(i * sizeof(unsigned int));
2555*6a54128fSAndroid Build Coastguard Worker memcpy(um, mapping, i * sizeof(unsigned int));
2556*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].utf32nfkdicf = um;
2557*6a54128fSAndroid Build Coastguard Worker }
2558*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
2559*6a54128fSAndroid Build Coastguard Worker print_utf32nfkdicf(unichar);
2560*6a54128fSAndroid Build Coastguard Worker count++;
2561*6a54128fSAndroid Build Coastguard Worker }
2562*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
2563*6a54128fSAndroid Build Coastguard Worker printf("Processed %d entries\n", count);
2564*6a54128fSAndroid Build Coastguard Worker }
2565*6a54128fSAndroid Build Coastguard Worker
2566*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
2567*6a54128fSAndroid Build Coastguard Worker
/*
 * Prototypes for the UTF-8 query functions.  struct utf8cursor is only
 * forward-declared at this point.
 */
struct utf8cursor;

int utf8agemax(struct tree *tree, const char *s);
int utf8agemin(struct tree *tree, const char *s);
int utf8nagemax(struct tree *tree, const char *s, size_t len);
int utf8nagemin(struct tree *tree, const char *s, size_t len);
ssize_t utf8len(struct tree *tree, const char *s);
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
		size_t len);
int utf8byte(struct utf8cursor *u8c);
2578*6a54128fSAndroid Build Coastguard Worker
/*
 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 *
 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 *
 * SBase = 0xAC00
 * LBase = 0x1100
 * VBase = 0x1161
 * TBase = 0x11A7
 * LCount = 19
 * VCount = 21
 * TCount = 28
 * NCount = 588 (VCount * TCount)
 * SCount = 11172 (LCount * NCount)
 *
 * Decomposition:
 *   SIndex = s - SBase
 *
 * LV (Canonical/Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *
 * LVT (Canonical)
 *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
 *   LVPart = SBase + LVIndex
 *   TPart = TBase + TIndex
 *
 * LVT (Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *   if (TIndex == 0) {
 *          d = <LPart, VPart>
 *   } else {
 *          TPart = TBase + TIndex
 *          d = <LPart, VPart, TPart>
 *   }
 */
2623*6a54128fSAndroid Build Coastguard Worker
/* Hangul syllable decomposition constants. */
#define SB	(0xAC00)	/* SBase: first precomposed syllable */
#define LB	(0x1100)	/* LBase: base added to LIndex */
#define VB	(0x1161)	/* VBase: base added to VIndex */
#define TB	(0x11A7)	/* TBase: base added to TIndex */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount = 588 */
#define SC	(LC * NC)	/* SCount = 11172 */
2634*6a54128fSAndroid Build Coastguard Worker
2635*6a54128fSAndroid Build Coastguard Worker /* Algorithmic decomposition of hangul syllable. */
utf8hangul(const char * str,unsigned char * hangul)2636*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2637*6a54128fSAndroid Build Coastguard Worker {
2638*6a54128fSAndroid Build Coastguard Worker unsigned int si;
2639*6a54128fSAndroid Build Coastguard Worker unsigned int li;
2640*6a54128fSAndroid Build Coastguard Worker unsigned int vi;
2641*6a54128fSAndroid Build Coastguard Worker unsigned int ti;
2642*6a54128fSAndroid Build Coastguard Worker unsigned char *h;
2643*6a54128fSAndroid Build Coastguard Worker
2644*6a54128fSAndroid Build Coastguard Worker /* Calculate the SI, LI, VI, and TI values. */
2645*6a54128fSAndroid Build Coastguard Worker si = utf8decode(str) - SB;
2646*6a54128fSAndroid Build Coastguard Worker li = si / NC;
2647*6a54128fSAndroid Build Coastguard Worker vi = (si % NC) / TC;
2648*6a54128fSAndroid Build Coastguard Worker ti = si % TC;
2649*6a54128fSAndroid Build Coastguard Worker
2650*6a54128fSAndroid Build Coastguard Worker /* Fill in base of leaf. */
2651*6a54128fSAndroid Build Coastguard Worker h = hangul;
2652*6a54128fSAndroid Build Coastguard Worker LEAF_GEN(h) = 2;
2653*6a54128fSAndroid Build Coastguard Worker LEAF_CCC(h) = DECOMPOSE;
2654*6a54128fSAndroid Build Coastguard Worker h += 2;
2655*6a54128fSAndroid Build Coastguard Worker
2656*6a54128fSAndroid Build Coastguard Worker /* Add LPart, a 3-byte UTF-8 sequence. */
2657*6a54128fSAndroid Build Coastguard Worker h += utf8encode((char *)h, li + LB);
2658*6a54128fSAndroid Build Coastguard Worker
2659*6a54128fSAndroid Build Coastguard Worker /* Add VPart, a 3-byte UTF-8 sequence. */
2660*6a54128fSAndroid Build Coastguard Worker h += utf8encode((char *)h, vi + VB);
2661*6a54128fSAndroid Build Coastguard Worker
2662*6a54128fSAndroid Build Coastguard Worker /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2663*6a54128fSAndroid Build Coastguard Worker if (ti)
2664*6a54128fSAndroid Build Coastguard Worker h += utf8encode((char *)h, ti + TB);
2665*6a54128fSAndroid Build Coastguard Worker
2666*6a54128fSAndroid Build Coastguard Worker /* Terminate string. */
2667*6a54128fSAndroid Build Coastguard Worker h[0] = '\0';
2668*6a54128fSAndroid Build Coastguard Worker
2669*6a54128fSAndroid Build Coastguard Worker return hangul;
2670*6a54128fSAndroid Build Coastguard Worker }
2671*6a54128fSAndroid Build Coastguard Worker
2672*6a54128fSAndroid Build Coastguard Worker /*
2673*6a54128fSAndroid Build Coastguard Worker * Use trie to scan s, touching at most len bytes.
2674*6a54128fSAndroid Build Coastguard Worker * Returns the leaf if one exists, NULL otherwise.
2675*6a54128fSAndroid Build Coastguard Worker *
2676*6a54128fSAndroid Build Coastguard Worker * A non-NULL return guarantees that the UTF-8 sequence starting at s
2677*6a54128fSAndroid Build Coastguard Worker * is well-formed and corresponds to a known unicode code point. The
2678*6a54128fSAndroid Build Coastguard Worker * shorthand for this will be "is valid UTF-8 unicode".
2679*6a54128fSAndroid Build Coastguard Worker */
utf8nlookup(struct tree * tree,unsigned char * hangul,const char * s,size_t len)2680*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2681*6a54128fSAndroid Build Coastguard Worker const char *s, size_t len)
2682*6a54128fSAndroid Build Coastguard Worker {
2683*6a54128fSAndroid Build Coastguard Worker utf8trie_t *trie = utf8data + tree->index;
2684*6a54128fSAndroid Build Coastguard Worker int offlen;
2685*6a54128fSAndroid Build Coastguard Worker int offset;
2686*6a54128fSAndroid Build Coastguard Worker int mask;
2687*6a54128fSAndroid Build Coastguard Worker int node;
2688*6a54128fSAndroid Build Coastguard Worker
2689*6a54128fSAndroid Build Coastguard Worker if (!tree)
2690*6a54128fSAndroid Build Coastguard Worker return NULL;
2691*6a54128fSAndroid Build Coastguard Worker if (len == 0)
2692*6a54128fSAndroid Build Coastguard Worker return NULL;
2693*6a54128fSAndroid Build Coastguard Worker node = 1;
2694*6a54128fSAndroid Build Coastguard Worker while (node) {
2695*6a54128fSAndroid Build Coastguard Worker offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2696*6a54128fSAndroid Build Coastguard Worker if (*trie & NEXTBYTE) {
2697*6a54128fSAndroid Build Coastguard Worker if (--len == 0)
2698*6a54128fSAndroid Build Coastguard Worker return NULL;
2699*6a54128fSAndroid Build Coastguard Worker s++;
2700*6a54128fSAndroid Build Coastguard Worker }
2701*6a54128fSAndroid Build Coastguard Worker mask = 1 << (*trie & BITNUM);
2702*6a54128fSAndroid Build Coastguard Worker if (*s & mask) {
2703*6a54128fSAndroid Build Coastguard Worker /* Right leg */
2704*6a54128fSAndroid Build Coastguard Worker if (offlen) {
2705*6a54128fSAndroid Build Coastguard Worker /* Right node at offset of trie */
2706*6a54128fSAndroid Build Coastguard Worker node = (*trie & RIGHTNODE);
2707*6a54128fSAndroid Build Coastguard Worker offset = trie[offlen];
2708*6a54128fSAndroid Build Coastguard Worker while (--offlen) {
2709*6a54128fSAndroid Build Coastguard Worker offset <<= 8;
2710*6a54128fSAndroid Build Coastguard Worker offset |= trie[offlen];
2711*6a54128fSAndroid Build Coastguard Worker }
2712*6a54128fSAndroid Build Coastguard Worker trie += offset;
2713*6a54128fSAndroid Build Coastguard Worker } else if (*trie & RIGHTPATH) {
2714*6a54128fSAndroid Build Coastguard Worker /* Right node after this node */
2715*6a54128fSAndroid Build Coastguard Worker node = (*trie & TRIENODE);
2716*6a54128fSAndroid Build Coastguard Worker trie++;
2717*6a54128fSAndroid Build Coastguard Worker } else {
2718*6a54128fSAndroid Build Coastguard Worker /* No right node. */
2719*6a54128fSAndroid Build Coastguard Worker return NULL;
2720*6a54128fSAndroid Build Coastguard Worker }
2721*6a54128fSAndroid Build Coastguard Worker } else {
2722*6a54128fSAndroid Build Coastguard Worker /* Left leg */
2723*6a54128fSAndroid Build Coastguard Worker if (offlen) {
2724*6a54128fSAndroid Build Coastguard Worker /* Left node after this node. */
2725*6a54128fSAndroid Build Coastguard Worker node = (*trie & LEFTNODE);
2726*6a54128fSAndroid Build Coastguard Worker trie += offlen + 1;
2727*6a54128fSAndroid Build Coastguard Worker } else if (*trie & RIGHTPATH) {
2728*6a54128fSAndroid Build Coastguard Worker /* No left node. */
2729*6a54128fSAndroid Build Coastguard Worker return NULL;
2730*6a54128fSAndroid Build Coastguard Worker } else {
2731*6a54128fSAndroid Build Coastguard Worker /* Left node after this node */
2732*6a54128fSAndroid Build Coastguard Worker node = (*trie & TRIENODE);
2733*6a54128fSAndroid Build Coastguard Worker trie++;
2734*6a54128fSAndroid Build Coastguard Worker }
2735*6a54128fSAndroid Build Coastguard Worker }
2736*6a54128fSAndroid Build Coastguard Worker }
2737*6a54128fSAndroid Build Coastguard Worker /*
2738*6a54128fSAndroid Build Coastguard Worker * Hangul decomposition is done algorithmically. These are the
2739*6a54128fSAndroid Build Coastguard Worker * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2740*6a54128fSAndroid Build Coastguard Worker * always 3 bytes long, so s has been advanced twice, and the
2741*6a54128fSAndroid Build Coastguard Worker * start of the sequence is at s-2.
2742*6a54128fSAndroid Build Coastguard Worker */
2743*6a54128fSAndroid Build Coastguard Worker if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2744*6a54128fSAndroid Build Coastguard Worker trie = utf8hangul(s - 2, hangul);
2745*6a54128fSAndroid Build Coastguard Worker return trie;
2746*6a54128fSAndroid Build Coastguard Worker }
2747*6a54128fSAndroid Build Coastguard Worker
2748*6a54128fSAndroid Build Coastguard Worker /*
2749*6a54128fSAndroid Build Coastguard Worker * Use trie to scan s.
2750*6a54128fSAndroid Build Coastguard Worker * Returns the leaf if one exists, NULL otherwise.
2751*6a54128fSAndroid Build Coastguard Worker *
2752*6a54128fSAndroid Build Coastguard Worker * Forwards to trie_nlookup().
2753*6a54128fSAndroid Build Coastguard Worker */
utf8lookup(struct tree * tree,unsigned char * hangul,const char * s)2754*6a54128fSAndroid Build Coastguard Worker static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2755*6a54128fSAndroid Build Coastguard Worker const char *s)
2756*6a54128fSAndroid Build Coastguard Worker {
2757*6a54128fSAndroid Build Coastguard Worker return utf8nlookup(tree, hangul, s, (size_t)-1);
2758*6a54128fSAndroid Build Coastguard Worker }
2759*6a54128fSAndroid Build Coastguard Worker
/*
 * Return the number of bytes used by the current UTF-8 sequence.
 * Assumes the input points to the first byte of a valid UTF-8
 * sequence.
 *
 * The length is taken from the first byte alone: below 0xC0 is one
 * byte, below 0xE0 two, below 0xF0 three, anything else four.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)*s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
2770*6a54128fSAndroid Build Coastguard Worker
2771*6a54128fSAndroid Build Coastguard Worker /*
2772*6a54128fSAndroid Build Coastguard Worker * Maximum age of any character in s.
2773*6a54128fSAndroid Build Coastguard Worker * Return -1 if s is not valid UTF-8 unicode.
2774*6a54128fSAndroid Build Coastguard Worker * Return 0 if only non-assigned code points are used.
2775*6a54128fSAndroid Build Coastguard Worker */
utf8agemax(struct tree * tree,const char * s)2776*6a54128fSAndroid Build Coastguard Worker int utf8agemax(struct tree *tree, const char *s)
2777*6a54128fSAndroid Build Coastguard Worker {
2778*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
2779*6a54128fSAndroid Build Coastguard Worker int age = 0;
2780*6a54128fSAndroid Build Coastguard Worker int leaf_age;
2781*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
2782*6a54128fSAndroid Build Coastguard Worker
2783*6a54128fSAndroid Build Coastguard Worker if (!tree)
2784*6a54128fSAndroid Build Coastguard Worker return -1;
2785*6a54128fSAndroid Build Coastguard Worker
2786*6a54128fSAndroid Build Coastguard Worker while (*s) {
2787*6a54128fSAndroid Build Coastguard Worker leaf = utf8lookup(tree, hangul, s);
2788*6a54128fSAndroid Build Coastguard Worker if (!leaf)
2789*6a54128fSAndroid Build Coastguard Worker return -1;
2790*6a54128fSAndroid Build Coastguard Worker leaf_age = ages[LEAF_GEN(leaf)];
2791*6a54128fSAndroid Build Coastguard Worker if (leaf_age <= tree->maxage && leaf_age > age)
2792*6a54128fSAndroid Build Coastguard Worker age = leaf_age;
2793*6a54128fSAndroid Build Coastguard Worker s += utf8clen(s);
2794*6a54128fSAndroid Build Coastguard Worker }
2795*6a54128fSAndroid Build Coastguard Worker return age;
2796*6a54128fSAndroid Build Coastguard Worker }
2797*6a54128fSAndroid Build Coastguard Worker
2798*6a54128fSAndroid Build Coastguard Worker /*
2799*6a54128fSAndroid Build Coastguard Worker * Minimum age of any character in s.
2800*6a54128fSAndroid Build Coastguard Worker * Return -1 if s is not valid UTF-8 unicode.
2801*6a54128fSAndroid Build Coastguard Worker * Return 0 if non-assigned code points are used.
2802*6a54128fSAndroid Build Coastguard Worker */
utf8agemin(struct tree * tree,const char * s)2803*6a54128fSAndroid Build Coastguard Worker int utf8agemin(struct tree *tree, const char *s)
2804*6a54128fSAndroid Build Coastguard Worker {
2805*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
2806*6a54128fSAndroid Build Coastguard Worker int age;
2807*6a54128fSAndroid Build Coastguard Worker int leaf_age;
2808*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
2809*6a54128fSAndroid Build Coastguard Worker
2810*6a54128fSAndroid Build Coastguard Worker if (!tree)
2811*6a54128fSAndroid Build Coastguard Worker return -1;
2812*6a54128fSAndroid Build Coastguard Worker age = tree->maxage;
2813*6a54128fSAndroid Build Coastguard Worker while (*s) {
2814*6a54128fSAndroid Build Coastguard Worker leaf = utf8lookup(tree, hangul, s);
2815*6a54128fSAndroid Build Coastguard Worker if (!leaf)
2816*6a54128fSAndroid Build Coastguard Worker return -1;
2817*6a54128fSAndroid Build Coastguard Worker leaf_age = ages[LEAF_GEN(leaf)];
2818*6a54128fSAndroid Build Coastguard Worker if (leaf_age <= tree->maxage && leaf_age < age)
2819*6a54128fSAndroid Build Coastguard Worker age = leaf_age;
2820*6a54128fSAndroid Build Coastguard Worker s += utf8clen(s);
2821*6a54128fSAndroid Build Coastguard Worker }
2822*6a54128fSAndroid Build Coastguard Worker return age;
2823*6a54128fSAndroid Build Coastguard Worker }
2824*6a54128fSAndroid Build Coastguard Worker
2825*6a54128fSAndroid Build Coastguard Worker /*
2826*6a54128fSAndroid Build Coastguard Worker * Maximum age of any character in s, touch at most len bytes.
2827*6a54128fSAndroid Build Coastguard Worker * Return -1 if s is not valid UTF-8 unicode.
2828*6a54128fSAndroid Build Coastguard Worker */
utf8nagemax(struct tree * tree,const char * s,size_t len)2829*6a54128fSAndroid Build Coastguard Worker int utf8nagemax(struct tree *tree, const char *s, size_t len)
2830*6a54128fSAndroid Build Coastguard Worker {
2831*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
2832*6a54128fSAndroid Build Coastguard Worker int age = 0;
2833*6a54128fSAndroid Build Coastguard Worker int leaf_age;
2834*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
2835*6a54128fSAndroid Build Coastguard Worker
2836*6a54128fSAndroid Build Coastguard Worker if (!tree)
2837*6a54128fSAndroid Build Coastguard Worker return -1;
2838*6a54128fSAndroid Build Coastguard Worker
2839*6a54128fSAndroid Build Coastguard Worker while (len && *s) {
2840*6a54128fSAndroid Build Coastguard Worker leaf = utf8nlookup(tree, hangul, s, len);
2841*6a54128fSAndroid Build Coastguard Worker if (!leaf)
2842*6a54128fSAndroid Build Coastguard Worker return -1;
2843*6a54128fSAndroid Build Coastguard Worker leaf_age = ages[LEAF_GEN(leaf)];
2844*6a54128fSAndroid Build Coastguard Worker if (leaf_age <= tree->maxage && leaf_age > age)
2845*6a54128fSAndroid Build Coastguard Worker age = leaf_age;
2846*6a54128fSAndroid Build Coastguard Worker len -= utf8clen(s);
2847*6a54128fSAndroid Build Coastguard Worker s += utf8clen(s);
2848*6a54128fSAndroid Build Coastguard Worker }
2849*6a54128fSAndroid Build Coastguard Worker return age;
2850*6a54128fSAndroid Build Coastguard Worker }
2851*6a54128fSAndroid Build Coastguard Worker
2852*6a54128fSAndroid Build Coastguard Worker /*
2853*6a54128fSAndroid Build Coastguard Worker * Maximum age of any character in s, touch at most len bytes.
2854*6a54128fSAndroid Build Coastguard Worker * Return -1 if s is not valid UTF-8 unicode.
2855*6a54128fSAndroid Build Coastguard Worker */
utf8nagemin(struct tree * tree,const char * s,size_t len)2856*6a54128fSAndroid Build Coastguard Worker int utf8nagemin(struct tree *tree, const char *s, size_t len)
2857*6a54128fSAndroid Build Coastguard Worker {
2858*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
2859*6a54128fSAndroid Build Coastguard Worker int leaf_age;
2860*6a54128fSAndroid Build Coastguard Worker int age;
2861*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
2862*6a54128fSAndroid Build Coastguard Worker
2863*6a54128fSAndroid Build Coastguard Worker if (!tree)
2864*6a54128fSAndroid Build Coastguard Worker return -1;
2865*6a54128fSAndroid Build Coastguard Worker age = tree->maxage;
2866*6a54128fSAndroid Build Coastguard Worker while (len && *s) {
2867*6a54128fSAndroid Build Coastguard Worker leaf = utf8nlookup(tree, hangul, s, len);
2868*6a54128fSAndroid Build Coastguard Worker if (!leaf)
2869*6a54128fSAndroid Build Coastguard Worker return -1;
2870*6a54128fSAndroid Build Coastguard Worker leaf_age = ages[LEAF_GEN(leaf)];
2871*6a54128fSAndroid Build Coastguard Worker if (leaf_age <= tree->maxage && leaf_age < age)
2872*6a54128fSAndroid Build Coastguard Worker age = leaf_age;
2873*6a54128fSAndroid Build Coastguard Worker len -= utf8clen(s);
2874*6a54128fSAndroid Build Coastguard Worker s += utf8clen(s);
2875*6a54128fSAndroid Build Coastguard Worker }
2876*6a54128fSAndroid Build Coastguard Worker return age;
2877*6a54128fSAndroid Build Coastguard Worker }
2878*6a54128fSAndroid Build Coastguard Worker
2879*6a54128fSAndroid Build Coastguard Worker /*
2880*6a54128fSAndroid Build Coastguard Worker * Length of the normalization of s.
2881*6a54128fSAndroid Build Coastguard Worker * Return -1 if s is not valid UTF-8 unicode.
2882*6a54128fSAndroid Build Coastguard Worker *
2883*6a54128fSAndroid Build Coastguard Worker * A string of Default_Ignorable_Code_Point has length 0.
2884*6a54128fSAndroid Build Coastguard Worker */
utf8len(struct tree * tree,const char * s)2885*6a54128fSAndroid Build Coastguard Worker ssize_t utf8len(struct tree *tree, const char *s)
2886*6a54128fSAndroid Build Coastguard Worker {
2887*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
2888*6a54128fSAndroid Build Coastguard Worker size_t ret = 0;
2889*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
2890*6a54128fSAndroid Build Coastguard Worker
2891*6a54128fSAndroid Build Coastguard Worker if (!tree)
2892*6a54128fSAndroid Build Coastguard Worker return -1;
2893*6a54128fSAndroid Build Coastguard Worker while (*s) {
2894*6a54128fSAndroid Build Coastguard Worker leaf = utf8lookup(tree, hangul, s);
2895*6a54128fSAndroid Build Coastguard Worker if (!leaf)
2896*6a54128fSAndroid Build Coastguard Worker return -1;
2897*6a54128fSAndroid Build Coastguard Worker if (ages[LEAF_GEN(leaf)] > tree->maxage)
2898*6a54128fSAndroid Build Coastguard Worker ret += utf8clen(s);
2899*6a54128fSAndroid Build Coastguard Worker else if (LEAF_CCC(leaf) == DECOMPOSE)
2900*6a54128fSAndroid Build Coastguard Worker ret += strlen(LEAF_STR(leaf));
2901*6a54128fSAndroid Build Coastguard Worker else
2902*6a54128fSAndroid Build Coastguard Worker ret += utf8clen(s);
2903*6a54128fSAndroid Build Coastguard Worker s += utf8clen(s);
2904*6a54128fSAndroid Build Coastguard Worker }
2905*6a54128fSAndroid Build Coastguard Worker return ret;
2906*6a54128fSAndroid Build Coastguard Worker }
2907*6a54128fSAndroid Build Coastguard Worker
2908*6a54128fSAndroid Build Coastguard Worker /*
2909*6a54128fSAndroid Build Coastguard Worker * Length of the normalization of s, touch at most len bytes.
2910*6a54128fSAndroid Build Coastguard Worker * Return -1 if s is not valid UTF-8 unicode.
2911*6a54128fSAndroid Build Coastguard Worker */
utf8nlen(struct tree * tree,const char * s,size_t len)2912*6a54128fSAndroid Build Coastguard Worker ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2913*6a54128fSAndroid Build Coastguard Worker {
2914*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
2915*6a54128fSAndroid Build Coastguard Worker size_t ret = 0;
2916*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
2917*6a54128fSAndroid Build Coastguard Worker
2918*6a54128fSAndroid Build Coastguard Worker if (!tree)
2919*6a54128fSAndroid Build Coastguard Worker return -1;
2920*6a54128fSAndroid Build Coastguard Worker while (len && *s) {
2921*6a54128fSAndroid Build Coastguard Worker leaf = utf8nlookup(tree, hangul, s, len);
2922*6a54128fSAndroid Build Coastguard Worker if (!leaf)
2923*6a54128fSAndroid Build Coastguard Worker return -1;
2924*6a54128fSAndroid Build Coastguard Worker if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925*6a54128fSAndroid Build Coastguard Worker ret += utf8clen(s);
2926*6a54128fSAndroid Build Coastguard Worker else if (LEAF_CCC(leaf) == DECOMPOSE)
2927*6a54128fSAndroid Build Coastguard Worker ret += strlen(LEAF_STR(leaf));
2928*6a54128fSAndroid Build Coastguard Worker else
2929*6a54128fSAndroid Build Coastguard Worker ret += utf8clen(s);
2930*6a54128fSAndroid Build Coastguard Worker len -= utf8clen(s);
2931*6a54128fSAndroid Build Coastguard Worker s += utf8clen(s);
2932*6a54128fSAndroid Build Coastguard Worker }
2933*6a54128fSAndroid Build Coastguard Worker return ret;
2934*6a54128fSAndroid Build Coastguard Worker }
2935*6a54128fSAndroid Build Coastguard Worker
2936*6a54128fSAndroid Build Coastguard Worker /*
2937*6a54128fSAndroid Build Coastguard Worker * Cursor structure used by the normalizer.
2938*6a54128fSAndroid Build Coastguard Worker */
2939*6a54128fSAndroid Build Coastguard Worker struct utf8cursor {
2940*6a54128fSAndroid Build Coastguard Worker struct tree *tree;
2941*6a54128fSAndroid Build Coastguard Worker const char *s;
2942*6a54128fSAndroid Build Coastguard Worker const char *p;
2943*6a54128fSAndroid Build Coastguard Worker const char *ss;
2944*6a54128fSAndroid Build Coastguard Worker const char *sp;
2945*6a54128fSAndroid Build Coastguard Worker unsigned int len;
2946*6a54128fSAndroid Build Coastguard Worker unsigned int slen;
2947*6a54128fSAndroid Build Coastguard Worker short int ccc;
2948*6a54128fSAndroid Build Coastguard Worker short int nccc;
2949*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
2950*6a54128fSAndroid Build Coastguard Worker unsigned char hangul[UTF8HANGULLEAF];
2951*6a54128fSAndroid Build Coastguard Worker };
2952*6a54128fSAndroid Build Coastguard Worker
2953*6a54128fSAndroid Build Coastguard Worker /*
2954*6a54128fSAndroid Build Coastguard Worker * Set up an utf8cursor for use by utf8byte().
2955*6a54128fSAndroid Build Coastguard Worker *
2956*6a54128fSAndroid Build Coastguard Worker * s : string.
2957*6a54128fSAndroid Build Coastguard Worker * len : length of s.
2958*6a54128fSAndroid Build Coastguard Worker * u8c : pointer to cursor.
2959*6a54128fSAndroid Build Coastguard Worker * trie : utf8trie_t to use for normalization.
2960*6a54128fSAndroid Build Coastguard Worker *
2961*6a54128fSAndroid Build Coastguard Worker * Returns -1 on error, 0 on success.
2962*6a54128fSAndroid Build Coastguard Worker */
utf8ncursor(struct utf8cursor * u8c,struct tree * tree,const char * s,size_t len)2963*6a54128fSAndroid Build Coastguard Worker int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2964*6a54128fSAndroid Build Coastguard Worker size_t len)
2965*6a54128fSAndroid Build Coastguard Worker {
2966*6a54128fSAndroid Build Coastguard Worker if (!tree)
2967*6a54128fSAndroid Build Coastguard Worker return -1;
2968*6a54128fSAndroid Build Coastguard Worker if (!s)
2969*6a54128fSAndroid Build Coastguard Worker return -1;
2970*6a54128fSAndroid Build Coastguard Worker u8c->tree = tree;
2971*6a54128fSAndroid Build Coastguard Worker u8c->s = s;
2972*6a54128fSAndroid Build Coastguard Worker u8c->p = NULL;
2973*6a54128fSAndroid Build Coastguard Worker u8c->ss = NULL;
2974*6a54128fSAndroid Build Coastguard Worker u8c->sp = NULL;
2975*6a54128fSAndroid Build Coastguard Worker u8c->len = len;
2976*6a54128fSAndroid Build Coastguard Worker u8c->slen = 0;
2977*6a54128fSAndroid Build Coastguard Worker u8c->ccc = STOPPER;
2978*6a54128fSAndroid Build Coastguard Worker u8c->nccc = STOPPER;
2979*6a54128fSAndroid Build Coastguard Worker u8c->unichar = 0;
2980*6a54128fSAndroid Build Coastguard Worker /* Check we didn't clobber the maximum length. */
2981*6a54128fSAndroid Build Coastguard Worker if (u8c->len != len)
2982*6a54128fSAndroid Build Coastguard Worker return -1;
2983*6a54128fSAndroid Build Coastguard Worker /* The first byte of s may not be an utf8 continuation. */
2984*6a54128fSAndroid Build Coastguard Worker if (len > 0 && (*s & 0xC0) == 0x80)
2985*6a54128fSAndroid Build Coastguard Worker return -1;
2986*6a54128fSAndroid Build Coastguard Worker return 0;
2987*6a54128fSAndroid Build Coastguard Worker }
2988*6a54128fSAndroid Build Coastguard Worker
/*
 * Set up an utf8cursor for use by utf8byte().
 *
 *   u8c  : pointer to cursor.
 *   tree : tree to use for normalization.
 *   s    : NUL-terminated string.
 *
 * Equivalent to utf8ncursor() with the largest length an unsigned int
 * can express, so only the NUL terminator ends the string.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
	const size_t unbounded = (unsigned int)-1;

	return utf8ncursor(u8c, tree, s, unbounded);
}
3002*6a54128fSAndroid Build Coastguard Worker
3003*6a54128fSAndroid Build Coastguard Worker /*
3004*6a54128fSAndroid Build Coastguard Worker * Get one byte from the normalized form of the string described by u8c.
3005*6a54128fSAndroid Build Coastguard Worker *
3006*6a54128fSAndroid Build Coastguard Worker * Returns the byte cast to an unsigned char on success, and -1 on failure.
3007*6a54128fSAndroid Build Coastguard Worker *
3008*6a54128fSAndroid Build Coastguard Worker * The cursor keeps track of the location in the string in u8c->s.
3009*6a54128fSAndroid Build Coastguard Worker * When a character is decomposed, the current location is stored in
3010*6a54128fSAndroid Build Coastguard Worker * u8c->p, and u8c->s is set to the start of the decomposition. Note
3011*6a54128fSAndroid Build Coastguard Worker * that bytes from a decomposition do not count against u8c->len.
3012*6a54128fSAndroid Build Coastguard Worker *
3013*6a54128fSAndroid Build Coastguard Worker * Characters are emitted if they match the current CCC in u8c->ccc.
3014*6a54128fSAndroid Build Coastguard Worker * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3015*6a54128fSAndroid Build Coastguard Worker * and the function returns 0 in that case.
3016*6a54128fSAndroid Build Coastguard Worker *
3017*6a54128fSAndroid Build Coastguard Worker * Sorting by CCC is done by repeatedly scanning the string. The
3018*6a54128fSAndroid Build Coastguard Worker * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3019*6a54128fSAndroid Build Coastguard Worker * the start of the scan. The first pass finds the lowest CCC to be
3020*6a54128fSAndroid Build Coastguard Worker * emitted and stores it in u8c->nccc, the second pass emits the
3021*6a54128fSAndroid Build Coastguard Worker * characters with this CCC and finds the next lowest CCC. This limits
3022*6a54128fSAndroid Build Coastguard Worker * the number of passes to 1 + the number of different CCCs in the
3023*6a54128fSAndroid Build Coastguard Worker * sequence being scanned.
3024*6a54128fSAndroid Build Coastguard Worker *
3025*6a54128fSAndroid Build Coastguard Worker * Therefore:
3026*6a54128fSAndroid Build Coastguard Worker * u8c->p != NULL -> a decomposition is being scanned.
3027*6a54128fSAndroid Build Coastguard Worker * u8c->ss != NULL -> this is a repeating scan.
3028*6a54128fSAndroid Build Coastguard Worker * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3029*6a54128fSAndroid Build Coastguard Worker */
utf8byte(struct utf8cursor * u8c)3030*6a54128fSAndroid Build Coastguard Worker int utf8byte(struct utf8cursor *u8c)
3031*6a54128fSAndroid Build Coastguard Worker {
3032*6a54128fSAndroid Build Coastguard Worker utf8leaf_t *leaf;
3033*6a54128fSAndroid Build Coastguard Worker int ccc;
3034*6a54128fSAndroid Build Coastguard Worker
3035*6a54128fSAndroid Build Coastguard Worker for (;;) {
3036*6a54128fSAndroid Build Coastguard Worker /* Check for the end of a decomposed character. */
3037*6a54128fSAndroid Build Coastguard Worker if (u8c->p && *u8c->s == '\0') {
3038*6a54128fSAndroid Build Coastguard Worker u8c->s = u8c->p;
3039*6a54128fSAndroid Build Coastguard Worker u8c->p = NULL;
3040*6a54128fSAndroid Build Coastguard Worker }
3041*6a54128fSAndroid Build Coastguard Worker
3042*6a54128fSAndroid Build Coastguard Worker /* Check for end-of-string. */
3043*6a54128fSAndroid Build Coastguard Worker if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3044*6a54128fSAndroid Build Coastguard Worker /* There is no next byte. */
3045*6a54128fSAndroid Build Coastguard Worker if (u8c->ccc == STOPPER)
3046*6a54128fSAndroid Build Coastguard Worker return 0;
3047*6a54128fSAndroid Build Coastguard Worker /* End-of-string during a scan counts as a stopper. */
3048*6a54128fSAndroid Build Coastguard Worker ccc = STOPPER;
3049*6a54128fSAndroid Build Coastguard Worker goto ccc_mismatch;
3050*6a54128fSAndroid Build Coastguard Worker } else if ((*u8c->s & 0xC0) == 0x80) {
3051*6a54128fSAndroid Build Coastguard Worker /* This is a continuation of the current character. */
3052*6a54128fSAndroid Build Coastguard Worker if (!u8c->p)
3053*6a54128fSAndroid Build Coastguard Worker u8c->len--;
3054*6a54128fSAndroid Build Coastguard Worker return (unsigned char)*u8c->s++;
3055*6a54128fSAndroid Build Coastguard Worker }
3056*6a54128fSAndroid Build Coastguard Worker
3057*6a54128fSAndroid Build Coastguard Worker /* Look up the data for the current character. */
3058*6a54128fSAndroid Build Coastguard Worker if (u8c->p) {
3059*6a54128fSAndroid Build Coastguard Worker leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3060*6a54128fSAndroid Build Coastguard Worker } else {
3061*6a54128fSAndroid Build Coastguard Worker leaf = utf8nlookup(u8c->tree, u8c->hangul,
3062*6a54128fSAndroid Build Coastguard Worker u8c->s, u8c->len);
3063*6a54128fSAndroid Build Coastguard Worker }
3064*6a54128fSAndroid Build Coastguard Worker
3065*6a54128fSAndroid Build Coastguard Worker /* No leaf found implies that the input is a binary blob. */
3066*6a54128fSAndroid Build Coastguard Worker if (!leaf)
3067*6a54128fSAndroid Build Coastguard Worker return -1;
3068*6a54128fSAndroid Build Coastguard Worker
3069*6a54128fSAndroid Build Coastguard Worker /* Characters that are too new have CCC 0. */
3070*6a54128fSAndroid Build Coastguard Worker if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3071*6a54128fSAndroid Build Coastguard Worker ccc = STOPPER;
3072*6a54128fSAndroid Build Coastguard Worker } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3073*6a54128fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
3074*6a54128fSAndroid Build Coastguard Worker u8c->p = u8c->s + utf8clen(u8c->s);
3075*6a54128fSAndroid Build Coastguard Worker u8c->s = LEAF_STR(leaf);
3076*6a54128fSAndroid Build Coastguard Worker /* Empty decomposition implies CCC 0. */
3077*6a54128fSAndroid Build Coastguard Worker if (*u8c->s == '\0') {
3078*6a54128fSAndroid Build Coastguard Worker if (u8c->ccc == STOPPER)
3079*6a54128fSAndroid Build Coastguard Worker continue;
3080*6a54128fSAndroid Build Coastguard Worker ccc = STOPPER;
3081*6a54128fSAndroid Build Coastguard Worker goto ccc_mismatch;
3082*6a54128fSAndroid Build Coastguard Worker }
3083*6a54128fSAndroid Build Coastguard Worker leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3084*6a54128fSAndroid Build Coastguard Worker ccc = LEAF_CCC(leaf);
3085*6a54128fSAndroid Build Coastguard Worker }
3086*6a54128fSAndroid Build Coastguard Worker u8c->unichar = utf8decode(u8c->s);
3087*6a54128fSAndroid Build Coastguard Worker
3088*6a54128fSAndroid Build Coastguard Worker /*
3089*6a54128fSAndroid Build Coastguard Worker * If this is not a stopper, then see if it updates
3090*6a54128fSAndroid Build Coastguard Worker * the next canonical class to be emitted.
3091*6a54128fSAndroid Build Coastguard Worker */
3092*6a54128fSAndroid Build Coastguard Worker if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3093*6a54128fSAndroid Build Coastguard Worker u8c->nccc = ccc;
3094*6a54128fSAndroid Build Coastguard Worker
3095*6a54128fSAndroid Build Coastguard Worker /*
3096*6a54128fSAndroid Build Coastguard Worker * Return the current byte if this is the current
3097*6a54128fSAndroid Build Coastguard Worker * combining class.
3098*6a54128fSAndroid Build Coastguard Worker */
3099*6a54128fSAndroid Build Coastguard Worker if (ccc == u8c->ccc) {
3100*6a54128fSAndroid Build Coastguard Worker if (!u8c->p)
3101*6a54128fSAndroid Build Coastguard Worker u8c->len--;
3102*6a54128fSAndroid Build Coastguard Worker return (unsigned char)*u8c->s++;
3103*6a54128fSAndroid Build Coastguard Worker }
3104*6a54128fSAndroid Build Coastguard Worker
3105*6a54128fSAndroid Build Coastguard Worker /* Current combining class mismatch. */
3106*6a54128fSAndroid Build Coastguard Worker ccc_mismatch:
3107*6a54128fSAndroid Build Coastguard Worker if (u8c->nccc == STOPPER) {
3108*6a54128fSAndroid Build Coastguard Worker /*
3109*6a54128fSAndroid Build Coastguard Worker * Scan forward for the first canonical class
3110*6a54128fSAndroid Build Coastguard Worker * to be emitted. Save the position from
3111*6a54128fSAndroid Build Coastguard Worker * which to restart.
3112*6a54128fSAndroid Build Coastguard Worker */
3113*6a54128fSAndroid Build Coastguard Worker assert(u8c->ccc == STOPPER);
3114*6a54128fSAndroid Build Coastguard Worker u8c->ccc = MINCCC - 1;
3115*6a54128fSAndroid Build Coastguard Worker u8c->nccc = ccc;
3116*6a54128fSAndroid Build Coastguard Worker u8c->sp = u8c->p;
3117*6a54128fSAndroid Build Coastguard Worker u8c->ss = u8c->s;
3118*6a54128fSAndroid Build Coastguard Worker u8c->slen = u8c->len;
3119*6a54128fSAndroid Build Coastguard Worker if (!u8c->p)
3120*6a54128fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
3121*6a54128fSAndroid Build Coastguard Worker u8c->s += utf8clen(u8c->s);
3122*6a54128fSAndroid Build Coastguard Worker } else if (ccc != STOPPER) {
3123*6a54128fSAndroid Build Coastguard Worker /* Not a stopper, and not the ccc we're emitting. */
3124*6a54128fSAndroid Build Coastguard Worker if (!u8c->p)
3125*6a54128fSAndroid Build Coastguard Worker u8c->len -= utf8clen(u8c->s);
3126*6a54128fSAndroid Build Coastguard Worker u8c->s += utf8clen(u8c->s);
3127*6a54128fSAndroid Build Coastguard Worker } else if (u8c->nccc != MAXCCC + 1) {
3128*6a54128fSAndroid Build Coastguard Worker /* At a stopper, restart for next ccc. */
3129*6a54128fSAndroid Build Coastguard Worker u8c->ccc = u8c->nccc;
3130*6a54128fSAndroid Build Coastguard Worker u8c->nccc = MAXCCC + 1;
3131*6a54128fSAndroid Build Coastguard Worker u8c->s = u8c->ss;
3132*6a54128fSAndroid Build Coastguard Worker u8c->p = u8c->sp;
3133*6a54128fSAndroid Build Coastguard Worker u8c->len = u8c->slen;
3134*6a54128fSAndroid Build Coastguard Worker } else {
3135*6a54128fSAndroid Build Coastguard Worker /* All done, proceed from here. */
3136*6a54128fSAndroid Build Coastguard Worker u8c->ccc = STOPPER;
3137*6a54128fSAndroid Build Coastguard Worker u8c->nccc = STOPPER;
3138*6a54128fSAndroid Build Coastguard Worker u8c->sp = NULL;
3139*6a54128fSAndroid Build Coastguard Worker u8c->ss = NULL;
3140*6a54128fSAndroid Build Coastguard Worker u8c->slen = 0;
3141*6a54128fSAndroid Build Coastguard Worker }
3142*6a54128fSAndroid Build Coastguard Worker }
3143*6a54128fSAndroid Build Coastguard Worker }
3144*6a54128fSAndroid Build Coastguard Worker
3145*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
3146*6a54128fSAndroid Build Coastguard Worker
normalize_line(struct tree * tree)3147*6a54128fSAndroid Build Coastguard Worker static int normalize_line(struct tree *tree)
3148*6a54128fSAndroid Build Coastguard Worker {
3149*6a54128fSAndroid Build Coastguard Worker char *s;
3150*6a54128fSAndroid Build Coastguard Worker char *t;
3151*6a54128fSAndroid Build Coastguard Worker int c;
3152*6a54128fSAndroid Build Coastguard Worker struct utf8cursor u8c;
3153*6a54128fSAndroid Build Coastguard Worker
3154*6a54128fSAndroid Build Coastguard Worker /* First test: null-terminated string. */
3155*6a54128fSAndroid Build Coastguard Worker s = buf2;
3156*6a54128fSAndroid Build Coastguard Worker t = buf3;
3157*6a54128fSAndroid Build Coastguard Worker if (utf8cursor(&u8c, tree, s))
3158*6a54128fSAndroid Build Coastguard Worker return -1;
3159*6a54128fSAndroid Build Coastguard Worker while ((c = utf8byte(&u8c)) > 0)
3160*6a54128fSAndroid Build Coastguard Worker if (c != (unsigned char)*t++)
3161*6a54128fSAndroid Build Coastguard Worker return -1;
3162*6a54128fSAndroid Build Coastguard Worker if (c < 0)
3163*6a54128fSAndroid Build Coastguard Worker return -1;
3164*6a54128fSAndroid Build Coastguard Worker if (*t != 0)
3165*6a54128fSAndroid Build Coastguard Worker return -1;
3166*6a54128fSAndroid Build Coastguard Worker
3167*6a54128fSAndroid Build Coastguard Worker /* Second test: length-limited string. */
3168*6a54128fSAndroid Build Coastguard Worker s = buf2;
3169*6a54128fSAndroid Build Coastguard Worker /* Replace NUL with a value that will cause an error if seen. */
3170*6a54128fSAndroid Build Coastguard Worker s[strlen(s) + 1] = -1;
3171*6a54128fSAndroid Build Coastguard Worker t = buf3;
3172*6a54128fSAndroid Build Coastguard Worker if (utf8cursor(&u8c, tree, s))
3173*6a54128fSAndroid Build Coastguard Worker return -1;
3174*6a54128fSAndroid Build Coastguard Worker while ((c = utf8byte(&u8c)) > 0)
3175*6a54128fSAndroid Build Coastguard Worker if (c != (unsigned char)*t++)
3176*6a54128fSAndroid Build Coastguard Worker return -1;
3177*6a54128fSAndroid Build Coastguard Worker if (c < 0)
3178*6a54128fSAndroid Build Coastguard Worker return -1;
3179*6a54128fSAndroid Build Coastguard Worker if (*t != 0)
3180*6a54128fSAndroid Build Coastguard Worker return -1;
3181*6a54128fSAndroid Build Coastguard Worker
3182*6a54128fSAndroid Build Coastguard Worker return 0;
3183*6a54128fSAndroid Build Coastguard Worker }
3184*6a54128fSAndroid Build Coastguard Worker
normalization_test(void)3185*6a54128fSAndroid Build Coastguard Worker static void normalization_test(void)
3186*6a54128fSAndroid Build Coastguard Worker {
3187*6a54128fSAndroid Build Coastguard Worker FILE *file;
3188*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
3189*6a54128fSAndroid Build Coastguard Worker struct unicode_data *data;
3190*6a54128fSAndroid Build Coastguard Worker char *s;
3191*6a54128fSAndroid Build Coastguard Worker char *t;
3192*6a54128fSAndroid Build Coastguard Worker int ret;
3193*6a54128fSAndroid Build Coastguard Worker int ignorables;
3194*6a54128fSAndroid Build Coastguard Worker int tests = 0;
3195*6a54128fSAndroid Build Coastguard Worker int failures = 0;
3196*6a54128fSAndroid Build Coastguard Worker
3197*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
3198*6a54128fSAndroid Build Coastguard Worker printf("Parsing %s\n", test_name);
3199*6a54128fSAndroid Build Coastguard Worker /* Step one, read data from file. */
3200*6a54128fSAndroid Build Coastguard Worker file = fopen(test_name, "r");
3201*6a54128fSAndroid Build Coastguard Worker if (!file)
3202*6a54128fSAndroid Build Coastguard Worker open_fail(test_name, errno);
3203*6a54128fSAndroid Build Coastguard Worker
3204*6a54128fSAndroid Build Coastguard Worker while (fgets(line, LINESIZE, file)) {
3205*6a54128fSAndroid Build Coastguard Worker ret = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];",
3206*6a54128fSAndroid Build Coastguard Worker buf0, buf1);
3207*6a54128fSAndroid Build Coastguard Worker if (ret != 2 || *line == '#')
3208*6a54128fSAndroid Build Coastguard Worker continue;
3209*6a54128fSAndroid Build Coastguard Worker s = buf0;
3210*6a54128fSAndroid Build Coastguard Worker t = buf2;
3211*6a54128fSAndroid Build Coastguard Worker while (*s) {
3212*6a54128fSAndroid Build Coastguard Worker unichar = strtoul(s, &s, 16);
3213*6a54128fSAndroid Build Coastguard Worker t += utf8encode(t, unichar);
3214*6a54128fSAndroid Build Coastguard Worker }
3215*6a54128fSAndroid Build Coastguard Worker *t = '\0';
3216*6a54128fSAndroid Build Coastguard Worker
3217*6a54128fSAndroid Build Coastguard Worker ignorables = 0;
3218*6a54128fSAndroid Build Coastguard Worker s = buf1;
3219*6a54128fSAndroid Build Coastguard Worker t = buf3;
3220*6a54128fSAndroid Build Coastguard Worker while (*s) {
3221*6a54128fSAndroid Build Coastguard Worker unichar = strtoul(s, &s, 16);
3222*6a54128fSAndroid Build Coastguard Worker data = &unicode_data[unichar];
3223*6a54128fSAndroid Build Coastguard Worker if (data->utf8nfkdi && !*data->utf8nfkdi)
3224*6a54128fSAndroid Build Coastguard Worker ignorables = 1;
3225*6a54128fSAndroid Build Coastguard Worker else
3226*6a54128fSAndroid Build Coastguard Worker t += utf8encode(t, unichar);
3227*6a54128fSAndroid Build Coastguard Worker }
3228*6a54128fSAndroid Build Coastguard Worker *t = '\0';
3229*6a54128fSAndroid Build Coastguard Worker
3230*6a54128fSAndroid Build Coastguard Worker tests++;
3231*6a54128fSAndroid Build Coastguard Worker if (normalize_line(nfkdi_tree) < 0) {
3232*6a54128fSAndroid Build Coastguard Worker printf("Line %s -> %s", buf0, buf1);
3233*6a54128fSAndroid Build Coastguard Worker if (ignorables)
3234*6a54128fSAndroid Build Coastguard Worker printf(" (ignorables removed)");
3235*6a54128fSAndroid Build Coastguard Worker printf(" failure\n");
3236*6a54128fSAndroid Build Coastguard Worker failures++;
3237*6a54128fSAndroid Build Coastguard Worker }
3238*6a54128fSAndroid Build Coastguard Worker }
3239*6a54128fSAndroid Build Coastguard Worker fclose(file);
3240*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
3241*6a54128fSAndroid Build Coastguard Worker printf("Ran %d tests with %d failures\n", tests, failures);
3242*6a54128fSAndroid Build Coastguard Worker if (failures)
3243*6a54128fSAndroid Build Coastguard Worker file_fail(test_name);
3244*6a54128fSAndroid Build Coastguard Worker }
3245*6a54128fSAndroid Build Coastguard Worker
3246*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
3247*6a54128fSAndroid Build Coastguard Worker
write_file(void)3248*6a54128fSAndroid Build Coastguard Worker static void write_file(void)
3249*6a54128fSAndroid Build Coastguard Worker {
3250*6a54128fSAndroid Build Coastguard Worker FILE *file;
3251*6a54128fSAndroid Build Coastguard Worker int i;
3252*6a54128fSAndroid Build Coastguard Worker int j;
3253*6a54128fSAndroid Build Coastguard Worker int t;
3254*6a54128fSAndroid Build Coastguard Worker int gen;
3255*6a54128fSAndroid Build Coastguard Worker
3256*6a54128fSAndroid Build Coastguard Worker if (verbose > 0)
3257*6a54128fSAndroid Build Coastguard Worker printf("Writing %s\n", utf8_name);
3258*6a54128fSAndroid Build Coastguard Worker file = fopen(utf8_name, "w");
3259*6a54128fSAndroid Build Coastguard Worker if (!file)
3260*6a54128fSAndroid Build Coastguard Worker open_fail(utf8_name, errno);
3261*6a54128fSAndroid Build Coastguard Worker
3262*6a54128fSAndroid Build Coastguard Worker fprintf(file, "/* This file is generated code, do not edit. */\n");
3263*6a54128fSAndroid Build Coastguard Worker fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3264*6a54128fSAndroid Build Coastguard Worker fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3265*6a54128fSAndroid Build Coastguard Worker fprintf(file, "#endif\n");
3266*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\n");
3267*6a54128fSAndroid Build Coastguard Worker fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3268*6a54128fSAndroid Build Coastguard Worker unicode_maxage);
3269*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\n");
3270*6a54128fSAndroid Build Coastguard Worker fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3271*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != ages_count; i++)
3272*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\t%#x%s\n", ages[i],
3273*6a54128fSAndroid Build Coastguard Worker ages[i] == unicode_maxage ? "" : ",");
3274*6a54128fSAndroid Build Coastguard Worker fprintf(file, "};\n");
3275*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\n");
3276*6a54128fSAndroid Build Coastguard Worker fprintf(file, "static const struct utf8data utf8nfkdicfdata[] = {\n");
3277*6a54128fSAndroid Build Coastguard Worker t = 0;
3278*6a54128fSAndroid Build Coastguard Worker for (gen = 0; gen < ages_count; gen++) {
3279*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\t{ %#x, %d }%s\n",
3280*6a54128fSAndroid Build Coastguard Worker ages[gen], trees[t].index,
3281*6a54128fSAndroid Build Coastguard Worker ages[gen] == unicode_maxage ? "" : ",");
3282*6a54128fSAndroid Build Coastguard Worker if (trees[t].maxage == ages[gen])
3283*6a54128fSAndroid Build Coastguard Worker t += 2;
3284*6a54128fSAndroid Build Coastguard Worker }
3285*6a54128fSAndroid Build Coastguard Worker fprintf(file, "};\n");
3286*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\n");
3287*6a54128fSAndroid Build Coastguard Worker fprintf(file, "static const struct utf8data utf8nfkdidata[] = {\n");
3288*6a54128fSAndroid Build Coastguard Worker t = 1;
3289*6a54128fSAndroid Build Coastguard Worker for (gen = 0; gen < ages_count; gen++) {
3290*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\t{ %#x, %d }%s\n",
3291*6a54128fSAndroid Build Coastguard Worker ages[gen], trees[t].index,
3292*6a54128fSAndroid Build Coastguard Worker ages[gen] == unicode_maxage ? "" : ",");
3293*6a54128fSAndroid Build Coastguard Worker if (trees[t].maxage == ages[gen])
3294*6a54128fSAndroid Build Coastguard Worker t += 2;
3295*6a54128fSAndroid Build Coastguard Worker }
3296*6a54128fSAndroid Build Coastguard Worker fprintf(file, "};\n");
3297*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\n");
3298*6a54128fSAndroid Build Coastguard Worker fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3299*6a54128fSAndroid Build Coastguard Worker utf8data_size);
3300*6a54128fSAndroid Build Coastguard Worker t = 0;
3301*6a54128fSAndroid Build Coastguard Worker for (i = 0; i != utf8data_size; i += 16) {
3302*6a54128fSAndroid Build Coastguard Worker if (i == trees[t].index) {
3303*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\t/* %s_%x */\n",
3304*6a54128fSAndroid Build Coastguard Worker trees[t].type, trees[t].maxage);
3305*6a54128fSAndroid Build Coastguard Worker if (t < trees_count-1)
3306*6a54128fSAndroid Build Coastguard Worker t++;
3307*6a54128fSAndroid Build Coastguard Worker }
3308*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\t");
3309*6a54128fSAndroid Build Coastguard Worker for (j = i; j != i + 16; j++)
3310*6a54128fSAndroid Build Coastguard Worker fprintf(file, "0x%.2x%s", utf8data[j],
3311*6a54128fSAndroid Build Coastguard Worker (j < utf8data_size -1 ? "," : ""));
3312*6a54128fSAndroid Build Coastguard Worker fprintf(file, "\n");
3313*6a54128fSAndroid Build Coastguard Worker }
3314*6a54128fSAndroid Build Coastguard Worker fprintf(file, "};\n");
3315*6a54128fSAndroid Build Coastguard Worker fclose(file);
3316*6a54128fSAndroid Build Coastguard Worker }
3317*6a54128fSAndroid Build Coastguard Worker
3318*6a54128fSAndroid Build Coastguard Worker /* ------------------------------------------------------------------ */
3319*6a54128fSAndroid Build Coastguard Worker
main(int argc,char * argv[])3320*6a54128fSAndroid Build Coastguard Worker int main(int argc, char *argv[])
3321*6a54128fSAndroid Build Coastguard Worker {
3322*6a54128fSAndroid Build Coastguard Worker unsigned int unichar;
3323*6a54128fSAndroid Build Coastguard Worker int opt;
3324*6a54128fSAndroid Build Coastguard Worker
3325*6a54128fSAndroid Build Coastguard Worker argv0 = argv[0];
3326*6a54128fSAndroid Build Coastguard Worker
3327*6a54128fSAndroid Build Coastguard Worker while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3328*6a54128fSAndroid Build Coastguard Worker switch (opt) {
3329*6a54128fSAndroid Build Coastguard Worker case 'a':
3330*6a54128fSAndroid Build Coastguard Worker age_name = optarg;
3331*6a54128fSAndroid Build Coastguard Worker break;
3332*6a54128fSAndroid Build Coastguard Worker case 'c':
3333*6a54128fSAndroid Build Coastguard Worker ccc_name = optarg;
3334*6a54128fSAndroid Build Coastguard Worker break;
3335*6a54128fSAndroid Build Coastguard Worker case 'd':
3336*6a54128fSAndroid Build Coastguard Worker data_name = optarg;
3337*6a54128fSAndroid Build Coastguard Worker break;
3338*6a54128fSAndroid Build Coastguard Worker case 'f':
3339*6a54128fSAndroid Build Coastguard Worker fold_name = optarg;
3340*6a54128fSAndroid Build Coastguard Worker break;
3341*6a54128fSAndroid Build Coastguard Worker case 'n':
3342*6a54128fSAndroid Build Coastguard Worker norm_name = optarg;
3343*6a54128fSAndroid Build Coastguard Worker break;
3344*6a54128fSAndroid Build Coastguard Worker case 'o':
3345*6a54128fSAndroid Build Coastguard Worker utf8_name = optarg;
3346*6a54128fSAndroid Build Coastguard Worker break;
3347*6a54128fSAndroid Build Coastguard Worker case 'p':
3348*6a54128fSAndroid Build Coastguard Worker prop_name = optarg;
3349*6a54128fSAndroid Build Coastguard Worker break;
3350*6a54128fSAndroid Build Coastguard Worker case 't':
3351*6a54128fSAndroid Build Coastguard Worker test_name = optarg;
3352*6a54128fSAndroid Build Coastguard Worker break;
3353*6a54128fSAndroid Build Coastguard Worker case 'v':
3354*6a54128fSAndroid Build Coastguard Worker verbose++;
3355*6a54128fSAndroid Build Coastguard Worker break;
3356*6a54128fSAndroid Build Coastguard Worker case 'h':
3357*6a54128fSAndroid Build Coastguard Worker help();
3358*6a54128fSAndroid Build Coastguard Worker exit(0);
3359*6a54128fSAndroid Build Coastguard Worker default:
3360*6a54128fSAndroid Build Coastguard Worker usage();
3361*6a54128fSAndroid Build Coastguard Worker }
3362*6a54128fSAndroid Build Coastguard Worker }
3363*6a54128fSAndroid Build Coastguard Worker
3364*6a54128fSAndroid Build Coastguard Worker if (verbose > 1)
3365*6a54128fSAndroid Build Coastguard Worker help();
3366*6a54128fSAndroid Build Coastguard Worker for (unichar = 0; unichar != 0x110000; unichar++)
3367*6a54128fSAndroid Build Coastguard Worker unicode_data[unichar].code = unichar;
3368*6a54128fSAndroid Build Coastguard Worker age_init();
3369*6a54128fSAndroid Build Coastguard Worker ccc_init();
3370*6a54128fSAndroid Build Coastguard Worker nfkdi_init();
3371*6a54128fSAndroid Build Coastguard Worker nfkdicf_init();
3372*6a54128fSAndroid Build Coastguard Worker ignore_init();
3373*6a54128fSAndroid Build Coastguard Worker corrections_init();
3374*6a54128fSAndroid Build Coastguard Worker hangul_decompose();
3375*6a54128fSAndroid Build Coastguard Worker nfkdi_decompose();
3376*6a54128fSAndroid Build Coastguard Worker nfkdicf_decompose();
3377*6a54128fSAndroid Build Coastguard Worker utf8_init();
3378*6a54128fSAndroid Build Coastguard Worker trees_init();
3379*6a54128fSAndroid Build Coastguard Worker trees_populate();
3380*6a54128fSAndroid Build Coastguard Worker trees_reduce();
3381*6a54128fSAndroid Build Coastguard Worker trees_verify();
3382*6a54128fSAndroid Build Coastguard Worker /* Prevent "unused function" warning. */
3383*6a54128fSAndroid Build Coastguard Worker (void)lookup(nfkdi_tree, " ");
3384*6a54128fSAndroid Build Coastguard Worker if (verbose > 2)
3385*6a54128fSAndroid Build Coastguard Worker tree_walk(nfkdi_tree);
3386*6a54128fSAndroid Build Coastguard Worker if (verbose > 2)
3387*6a54128fSAndroid Build Coastguard Worker tree_walk(nfkdicf_tree);
3388*6a54128fSAndroid Build Coastguard Worker normalization_test();
3389*6a54128fSAndroid Build Coastguard Worker write_file();
3390*6a54128fSAndroid Build Coastguard Worker
3391*6a54128fSAndroid Build Coastguard Worker return 0;
3392*6a54128fSAndroid Build Coastguard Worker }
3393