xref: /aosp_15_r20/frameworks/minikin/tools/mk_hyb_file.py (revision 834a2baab5fdfc28e9a428ee87c7ea8f6a06a53d)
1*834a2baaSAndroid Build Coastguard Worker#!/usr/bin/env python3
2*834a2baaSAndroid Build Coastguard Worker
3*834a2baaSAndroid Build Coastguard Worker# Copyright (C) 2015 The Android Open Source Project
4*834a2baaSAndroid Build Coastguard Worker#
5*834a2baaSAndroid Build Coastguard Worker# Licensed under the Apache License, Version 2.0 (the 'License');
6*834a2baaSAndroid Build Coastguard Worker# you may not use this file except in compliance with the License.
7*834a2baaSAndroid Build Coastguard Worker# You may obtain a copy of the License at
8*834a2baaSAndroid Build Coastguard Worker#
9*834a2baaSAndroid Build Coastguard Worker#      http://www.apache.org/licenses/LICENSE-2.0
10*834a2baaSAndroid Build Coastguard Worker#
11*834a2baaSAndroid Build Coastguard Worker# Unless required by applicable law or agreed to in writing, software
12*834a2baaSAndroid Build Coastguard Worker# distributed under the License is distributed on an 'AS IS' BASIS,
13*834a2baaSAndroid Build Coastguard Worker# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*834a2baaSAndroid Build Coastguard Worker# See the License for the specific language governing permissions and
15*834a2baaSAndroid Build Coastguard Worker# limitations under the License.
16*834a2baaSAndroid Build Coastguard Worker
17*834a2baaSAndroid Build Coastguard Worker"""
18*834a2baaSAndroid Build Coastguard WorkerConvert hyphen files in standard TeX format (a trio of pat, chr, and hyp)
19*834a2baaSAndroid Build Coastguard Workerinto binary format. See doc/hyb_file_format.md for more information.
20*834a2baaSAndroid Build Coastguard Worker
21*834a2baaSAndroid Build Coastguard WorkerUsage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb
22*834a2baaSAndroid Build Coastguard Worker
23*834a2baaSAndroid Build Coastguard WorkerOptional -v parameter turns on verbose debugging.
24*834a2baaSAndroid Build Coastguard Worker
25*834a2baaSAndroid Build Coastguard Worker"""
26*834a2baaSAndroid Build Coastguard Worker
27*834a2baaSAndroid Build Coastguard Workerfrom __future__ import print_function
28*834a2baaSAndroid Build Coastguard Worker
29*834a2baaSAndroid Build Coastguard Workerimport io
30*834a2baaSAndroid Build Coastguard Workerimport sys
31*834a2baaSAndroid Build Coastguard Workerimport struct
32*834a2baaSAndroid Build Coastguard Workerimport math
33*834a2baaSAndroid Build Coastguard Workerimport getopt
34*834a2baaSAndroid Build Coastguard Worker
35*834a2baaSAndroid Build Coastguard Worker
# Set to True to print debugging output while the trie is being built.
VERBOSE = False

# U+00DF is LATIN SMALL LETTER SHARP S
# U+1E9E is LATIN CAPITAL LETTER SHARP S
# load_chr() rewrites a ".chr" line equal to the first constant into the second.
SHARP_S_TO_DOUBLE = u'\u00dfSS'
SHARP_S_TO_CAPITAL = u'\u00df\u1e9e'

if sys.version_info >= (3,):
    def unichr(x):
        """Return the one-character string for code point ``x`` (Python 3 stand-in)."""
        return chr(x)
46*834a2baaSAndroid Build Coastguard Worker
47*834a2baaSAndroid Build Coastguard Worker
def num_bits(n):
    """Return the number of bits required to represent numbers up to ``n`` inclusive.

    Returns 0 when ``n`` is zero or negative.  The width is taken from
    ``int.bit_length()``, which is exact for arbitrarily large integers; a
    floating-point logarithm rounds up near powers of two and over-counts.
    """
    return n.bit_length() if n > 0 else 0
51*834a2baaSAndroid Build Coastguard Worker
52*834a2baaSAndroid Build Coastguard Worker
class Node:
    """A single node of the pattern trie built by Hyph."""

    def __init__(self):
        # Outgoing edges: maps a character to the child Node reached by it.
        self.succ = {}
        # Result list stored for the word that ends at this node; None when
        # no word ends here.
        # fsm_pat and fail are only initialised in the code visible here.
        # NOTE(review): presumably reserved for a failure-link automaton; confirm.
        self.res = self.fsm_pat = self.fail = None
60*834a2baaSAndroid Build Coastguard Worker
61*834a2baaSAndroid Build Coastguard Worker
class Freelist:
    """List of free slots, implemented as a doubly linked list over indices.

    ``pred[i]`` and ``succ[i]`` hold the neighbouring free slots of slot ``i``
    (``None`` at either end of the chain).  A slot claimed through ``use()``
    is marked with ``pred[i] == -1``.  Slots are created on demand, so the
    list never runs out.
    """

    def __init__(self):
        self.first = None   # lowest-positioned free slot, or None if none is free
        self.last = None    # free slot at the tail of the chain, or None
        self.pred = []
        self.succ = []

    def grow(self):
        """Create one new slot and link it at the tail of the free chain."""
        this = len(self.pred)
        self.pred.append(self.last)
        self.succ.append(None)
        if self.last is None:
            self.first = this
        else:
            self.succ[self.last] = this
        self.last = this

    def next(self, cursor):
        """Return ``(slot, next_cursor)`` for iterating over free slots.

        Pass 0 to start from the first free slot, then feed back the returned
        cursor.  When the cursor is ``None`` (chain exhausted) a fresh slot is
        created and returned.
        """
        if cursor == 0:
            cursor = self.first
        if cursor is None:
            self.grow()
            result = self.last
        else:
            result = cursor
        return result, self.succ[result]

    def is_free(self, ix):
        """Return True if slot ``ix`` is unused, creating slots up to ``ix`` if needed."""
        while ix >= len(self.pred):
            self.grow()
        return self.pred[ix] != -1

    def use(self, ix):
        """Unlink slot ``ix`` from the free chain and mark it as used.

        Raises AssertionError if the slot is already in use.  The check runs
        before any link is touched: with ``pred[ix] == -1`` the unlinking
        below would otherwise write through index -1 and corrupt the chain.
        """
        if self.pred[ix] == -1:
            raise AssertionError('double free!')
        if self.pred[ix] is None:
            self.first = self.succ[ix]
        else:
            self.succ[self.pred[ix]] = self.succ[ix]
        if self.succ[ix] is None:
            self.last = self.pred[ix]
        else:
            self.pred[self.succ[ix]] = self.pred[ix]
        self.pred[ix] = -1
108*834a2baaSAndroid Build Coastguard Worker
109*834a2baaSAndroid Build Coastguard Worker
def combine(a, b):
    """Merge two result lists, aligned at their ends, taking the element-wise maximum.

    If either argument is None the other one is returned unchanged.  The
    leading part of the longer list that has no counterpart in the shorter
    one is copied as-is.
    """
    if a is None:
        return b
    if b is None:
        return a
    shorter, longer = (b, a) if len(b) < len(a) else (a, b)
    offset = len(longer) - len(shorter)
    merged = longer[:offset]
    for lhs, rhs in zip(shorter, longer[offset:]):
        merged.append(max(lhs, rhs))
    return merged
118*834a2baaSAndroid Build Coastguard Worker
119*834a2baaSAndroid Build Coastguard Worker
def trim(pattern):
    """Return ``pattern`` with its leading zeros removed.

    Returns None when ``pattern`` is empty or contains only zeros.
    """
    for start, value in enumerate(pattern):
        if value != 0:
            return pattern[start:]
    return None
124*834a2baaSAndroid Build Coastguard Worker
125*834a2baaSAndroid Build Coastguard Worker
def pat_to_binary(pattern):
    """Encode a sequence of small integers as a byte string, one unsigned byte each."""
    pack_byte = struct.Struct('B').pack
    return b''.join(map(pack_byte, pattern))
128*834a2baaSAndroid Build Coastguard Worker
129*834a2baaSAndroid Build Coastguard Worker
class Hyph:
    """Trie of hyphenation patterns and exceptions, plus the passes run before output.

    Words are inserted with ``add_pat`` / ``add_exception``.  ``bfs`` orders
    the nodes, ``dedup`` merges identical subtries and ``pack`` assigns every
    node a base index (``node.ix``) into the packed trie array.
    """

    def __init__(self):
        self.root = Node()
        self.root.str = '<root>'
        # Every node ever created, in creation order.
        self.node_list = [self.root]

    def add_pat(self, pat):
        """Add a pattern (word fragment with numeric codes, such as ".ad4der").

        Digits become entries of the result list and every other character
        becomes part of the word.  A zero is recorded between two adjacent
        letters only once a digit has been seen, so the result list carries
        no leading zeros.
        """
        last_was_letter = False
        have_seen_number = False
        result = []
        letters = []
        for c in pat:
            if c.isdigit():
                result.append(int(c))
                last_was_letter = False
                have_seen_number = True
            else:
                letters.append(c)
                if last_was_letter and have_seen_number:
                    result.append(0)
                last_was_letter = True
        if last_was_letter:
            result.append(0)

        self.add_word_res(''.join(letters), result)

    def add_exception(self, hyph_word):
        """Add an exception (word with hyphens, such as "ta-ble").

        The word is stored wrapped in '.' markers.  Between two letters the
        result holds 11 where a hyphen was given and 10 where none was given;
        two zeros are appended at the end.
        """
        res = []
        word = ['.']
        need_10 = False
        for c in hyph_word:
            if c == '-':
                res.append(11)
                need_10 = False
            else:
                if need_10:
                    res.append(10)
                word.append(c)
                need_10 = True
        word.append('.')
        res.append(0)
        res.append(0)
        if VERBOSE:
            print(word, res)
        self.add_word_res(''.join(word), res)

    def add_word_res(self, word, result):
        """Insert ``word`` into the trie and store ``result`` on its final node.

        Missing nodes are created along the way; each new node remembers the
        prefix that leads to it in ``node.str``.  A result already stored for
        the same word is overwritten.
        """
        if VERBOSE:
            print(word, result)

        node = self.root
        for depth, c in enumerate(word, 1):
            child = node.succ.get(c)
            if child is None:
                child = Node()
                child.str = word[:depth]
                self.node_list.append(child)
                node.succ[c] = child
            node = child
        node.res = result

    def pack(self, node_list, ch_map, use_node=False):
        """Assign every node in ``node_list`` a base index and return the array size.

        For a node with outgoing edges, a base ``ix`` is searched such that
        node slot ``ix`` is free and edge slot ``ix + code`` is free for every
        outgoing character code (codes are shifted by one when ``use_node``
        is set, and then edge slot ``ix`` itself must be free too).  The
        chosen slots are marked used, the base is stored in ``node.ix`` and
        the node is recorded in ``self.node_map``.

        Returns the largest base index plus ``max(ch_map.values()) + 1``.
        """
        size = 0
        self.node_map = {}
        nodes = Freelist()
        edges = Freelist()
        edge_start = 1 if use_node else 0
        for node in node_list:
            succ = sorted(ch_map[c] + edge_start for c in node.succ)
            if succ:
                # Walk the free edge slots; each candidate is tried as the
                # slot of the node's smallest edge code.
                cursor = 0
                while True:
                    edge_ix, cursor = edges.next(cursor)
                    ix = edge_ix - succ[0]
                    if (ix >= 0 and nodes.is_free(ix) and
                            all(edges.is_free(ix + s) for s in succ) and
                            ((not use_node) or edges.is_free(ix))):
                        break
            elif use_node:
                ix, _ = edges.next(0)
                # nodes is not really needed when use_node is set; the call
                # only makes sure slot ix exists before nodes.use(ix) below.
                nodes.is_free(ix)
            else:
                ix, _ = nodes.next(0)
            node.ix = ix
            self.node_map[ix] = node
            nodes.use(ix)
            size = max(size, ix)
            if use_node:
                edges.use(ix)
            for s in succ:
                edges.use(ix + s)
        size += max(ch_map.values()) + 1
        return size

    def bfs(self, ch_map):
        """Return the list of nodes in breadth-first order, starting at the root.

        Children are visited in ascending order of their character code in
        ``ch_map``.  Each node gets its position stored in ``node.bfs_ix`` and
        the list is kept in ``self.bfs_order``.  Two edges of one node whose
        characters map to the same code trip an assertion.
        """
        result = [self.root]
        ix = 0
        while ix < len(result):
            node = result[ix]
            node.bfs_ix = ix
            mapped = {}
            for c, child in node.succ.items():
                assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c))
                mapped[ch_map[c]] = child
            for code in sorted(mapped):
                result.append(mapped[code])
            ix += 1
        self.bfs_order = result
        return result

    def dedup(self):
        """Suffix compression: merge nodes whose subtries are identical.

        Must be called after ``bfs``.  Nodes are processed in reverse
        breadth-first order so that children are classified before their
        parents.  Returns ``(dedup_ix, uniques)``: ``dedup_ix[i]`` is the bfs
        index of the node that represents bfs node ``i``, and ``uniques`` is
        the list of representative nodes in bfs order.
        """
        uniques = []
        dupmap = {}
        dedup_ix = [0] * len(self.bfs_order)
        for ix in reversed(range(len(self.bfs_order))):
            node = self.bfs_order[ix]
            # The key is a tuple rather than a concatenated string: result
            # values may have two digits (10/11 from exceptions), so joining
            # them without a separator would make [1, 1, 0, 0] and [11, 0, 0]
            # indistinguishable and merge unrelated nodes.  A missing result
            # and an empty one are deliberately treated alike.
            res_key = () if node.res is None else tuple(node.res)
            succ_key = tuple((c, dedup_ix[node.succ[c].bfs_ix])
                             for c in sorted(node.succ))
            key = (res_key, succ_key)
            if key in dupmap:
                dedup_ix[ix] = dupmap[key]
            else:
                uniques.append(node)
                dedup_ix[ix] = ix
                dupmap[key] = ix
        uniques.reverse()
        if VERBOSE:
            print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total')
        return dedup_ix, uniques
272*834a2baaSAndroid Build Coastguard Worker
273*834a2baaSAndroid Build Coastguard Worker
def load(fn):
    """Load the ".pat" file ``fn``, which contains patterns such as a1b2c3.

    Each line is stripped of surrounding whitespace and added as one pattern.
    Returns the populated Hyph object.
    """
    hyph = Hyph()
    with io.open(fn, encoding='UTF-8') as pat_file:
        for raw_line in pat_file:
            hyph.add_pat(raw_line.strip())
    return hyph
282*834a2baaSAndroid Build Coastguard Worker
283*834a2baaSAndroid Build Coastguard Worker
def load_chr(fn):
    """Load the ".chr" file ``fn`` (alphabet and case pairs, eg "aA", "bB").

    Returns a dict mapping each character to its code: '.' maps to 0 and all
    characters on the n-th line (counting from 1) map to n.  Lines longer than
    two characters are reduced to their first character, except the sharp s
    line, which is replaced by the small/capital sharp s pair.
    """
    ch_map = {'.': 0}
    with io.open(fn, encoding='UTF-8') as chr_file:
        for code, raw_line in enumerate(chr_file, 1):
            entry = raw_line.strip()
            if len(entry) <= 2:
                assert len(entry) == 2, 'expected 2 chars in chr'
            elif entry == SHARP_S_TO_DOUBLE:
                # replace with lowercasing from capital letter sharp s
                entry = SHARP_S_TO_CAPITAL
            else:
                # lowercase maps to multi-character uppercase sequence,
                # ignore uppercase for now
                entry = entry[:1]
            for c in entry:
                ch_map[c] = code
    return ch_map
302*834a2baaSAndroid Build Coastguard Worker
303*834a2baaSAndroid Build Coastguard Worker
def load_hyp(hyph, fn):
    """Load exceptions with explicit hyphens from ``fn`` into ``hyph``.

    Every line, stripped of surrounding whitespace, is passed to
    ``hyph.add_exception``.
    """
    with io.open(fn, encoding='UTF-8') as hyp_file:
        for raw_line in hyp_file:
            word = raw_line.strip()
            hyph.add_exception(word)
309*834a2baaSAndroid Build Coastguard Worker
310*834a2baaSAndroid Build Coastguard Worker
def generate_header(alphabet, trie, pattern):
    """Return the 24-byte file header for the three already-serialized sections.

    The header is six little-endian 32-bit words: the magic number, a zero
    word, then the byte offsets of the alphabet, trie and pattern sections,
    and finally the total file size.  The sections follow the header
    back to back.
    """
    offsets = [6 * 4]  # the alphabet starts right after the six header words
    for section in (alphabet, trie, pattern):
        offsets.append(offsets[-1] + len(section))
    return struct.pack('<6I', 0x62ad7968, 0, *offsets)
318*834a2baaSAndroid Build Coastguard Worker
319*834a2baaSAndroid Build Coastguard Worker
def generate_alphabet(ch_map):
    """Serialize the character-to-code map as the alphabet section.

    The '.' entry is left out.  When the code points span fewer than 1024
    values and every code is below 256, format 0 is written: a header of
    (0, min code point, max code point + 1) followed by one byte per code
    point in that range (0 for characters not in the map).  Otherwise format
    1 is written: a header of (1, entry count) followed by one 32-bit word
    per character, sorted, holding ``code_point << 11 | code``.

    The result is zero-padded to a multiple of four bytes.  ``ch_map`` itself
    is not modified.
    """
    codes = ch_map.copy()
    del codes['.']
    lowest = ord(min(codes))
    highest = ord(max(codes))
    largest_code = max(codes.values())
    if highest - lowest < 1024 and largest_code < 256:
        # generate format 0
        table = [0] * (highest - lowest + 1)
        for c, val in codes.items():
            table[ord(c) - lowest] = val
        chunks = [struct.pack('<3I', 0, lowest, highest + 1),
                  struct.pack('<%dB' % len(table), *table)]
    else:
        # generate format 1
        assert largest_code < 2048, 'max number of unique characters exceeded'
        chunks = [struct.pack('<2I', 1, len(codes))]
        chunks.extend(struct.pack('<I', (ord(c) << 11) | val)
                      for c, val in sorted(codes.items()))
    binary = b''.join(chunks)
    binary += b'\x00' * (-len(binary) % 4)
    return binary
344*834a2baaSAndroid Build Coastguard Worker
345*834a2baaSAndroid Build Coastguard Worker
def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap):
    """Serialize the packed trie section.

    Assumes the hyph structure has been packed, ie node.ix values have been
    set.  Each of the ``n_trie`` entries is one 32-bit word made of three
    fields: the character code in the low bits, the link (base index of the
    target node) above it, and the pattern index on top.  The section starts
    with a header of six words: 0, char mask, link shift, link mask, pattern
    shift and the entry count.

    When ``dedup_ix`` is not None, links are redirected to the representative
    node of the target's dedup class.
    """
    link_shift = num_bits(max(ch_map.values()))
    pattern_shift = link_shift + num_bits(n_trie - 1)
    char_mask = (1 << link_shift) - 1
    link_mask = (1 << pattern_shift) - (1 << link_shift)
    chunks = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)]

    chars = [0] * n_trie
    links = [0] * n_trie
    patterns = [0] * n_trie
    for node in dedup_nodes:
        base = node.ix
        if node.res is not None:
            patterns[base] = patmap[pat_to_binary(node.res)]
        for c, child in node.succ.items():
            code = ch_map[c]
            slot = base + code
            chars[slot] = code
            target = child if dedup_ix is None else hyph.bfs_order[dedup_ix[child.bfs_ix]]
            links[slot] = target.ix

    for pat, link, code in zip(patterns, links, chars):
        word = (pat << pattern_shift) | (link << link_shift) | code
        chunks.append(struct.pack('<I', word))
    return b''.join(chunks)
376*834a2baaSAndroid Build Coastguard Worker
377*834a2baaSAndroid Build Coastguard Worker
def generate_pattern(pats):
    """Serialize the pattern section and return ``(patmap, binary)``.

    ``pats`` is an iterable of result lists; None entries are skipped.
    ``patmap`` maps the byte encoding of each distinct list to its index in
    the pattern table (the empty encoding maps to index 0).  Trailing zeros
    are stripped before storing the bytes, and identical stripped byte
    strings are stored only once.  A table entry packs the stripped length
    (shifted left by 26), the number of stripped zeros (shifted left by 20)
    and the offset of the bytes in the raw data.

    The binary starts with four header words: 0, the table length, the
    offset of the raw data within the section, and the raw data size.
    """
    entries = [0]
    patmap = {b'': 0}

    raw_chunks = []
    raw_size = 0
    raw_offsets = {}

    for pat in pats:
        if pat is None:
            continue
        encoded = pat_to_binary(pat)
        if encoded in patmap:
            continue
        # count trailing zeros
        shift = 0
        for value in reversed(pat):
            if value != 0:
                break
            shift += 1
        rawpat = encoded[:len(pat) - shift]
        if rawpat not in raw_offsets:
            raw_offsets[rawpat] = raw_size
            raw_chunks.append(rawpat)
            raw_size += len(rawpat)
        patmap[encoded] = len(entries)
        entries.append((len(rawpat) << 26) | (shift << 20) | raw_offsets[rawpat])

    table_len = len(entries)
    chunks = [struct.pack('<4I', 0, table_len, 16 + 4 * table_len, raw_size)]
    chunks.extend(struct.pack('<I', entry) for entry in entries)
    chunks.extend(raw_chunks)
    return patmap, b''.join(chunks)
408*834a2baaSAndroid Build Coastguard Worker
409*834a2baaSAndroid Build Coastguard Worker
def generate_hyb_file(hyph, ch_map, hyb_fn):
    """Write the binary file ``hyb_fn`` for the trie ``hyph`` and alphabet ``ch_map``.

    The trie is ordered, deduplicated and packed, then the header, alphabet,
    trie and pattern sections are written in that order.
    """
    hyph.bfs(ch_map)  # sets hyph.bfs_order and node.bfs_ix, needed by dedup
    dedup_ix, dedup_nodes = hyph.dedup()
    n_trie = hyph.pack(dedup_nodes, ch_map)
    alphabet = generate_alphabet(ch_map)
    patmap, pattern = generate_pattern([n.res for n in hyph.node_list])
    trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap)
    header = generate_header(alphabet, trie, pattern)

    with open(hyb_fn, 'wb') as out:
        for section in (header, alphabet, trie, pattern):
            out.write(section)
424*834a2baaSAndroid Build Coastguard Worker
425*834a2baaSAndroid Build Coastguard Worker
def verify_file_sorted(lines, fn):
    """Verify that the file contains the same lines as ``lines``, in arbitrary order.

    Both sides are compared as sets of stripped lines, so order and
    duplicates are ignored.  If the file contains the "sharp s to SS" line,
    it is swapped for the "sharp s to capital sharp s" line before comparing.

    Returns True on a match.  Otherwise prints every line found on only one
    side and returns False.
    """
    # Read inside a with block so the file handle is closed right away
    # instead of being left for the garbage collector.
    with io.open(fn, encoding='UTF-8') as f:
        file_set = set(l.strip() for l in f)
    line_set = set(lines)
    if SHARP_S_TO_DOUBLE in file_set:
        # ignore difference of double capital letter s and capital letter sharp s
        file_set.symmetric_difference_update([SHARP_S_TO_DOUBLE, SHARP_S_TO_CAPITAL])
    if line_set == file_set:
        return True
    for line in line_set - file_set:
        print(repr(line) + ' in reconstruction, not in file')
    for line in file_set - line_set:
        print(repr(line) + ' in file, not in reconstruction')
    return False
441*834a2baaSAndroid Build Coastguard Worker
442*834a2baaSAndroid Build Coastguard Worker
def map_to_chr(alphabet_map):
    """Invert an alphabet map and rebuild the lines of the chr file.

    Args:
        alphabet_map: dict mapping each character to its alphabet code.

    Returns:
        A tuple (ch_map, result). ch_map maps every code to its lowercase
        character, plus code 0 mapped to '.'. result holds one string per
        distinct code: the lowercase character followed by the uppercase
        character, if there is one.

    Raises:
        AssertionError: if a code does not have exactly one lowercase
            character or has more than one uppercase character.
    """
    # Group the characters by code in a single pass. Rescanning the whole map
    # for every value was quadratic and produced a duplicate result line for
    # each additional character sharing a code.
    chars_by_code = {}
    for ch, val in alphabet_map.items():
        chars_by_code.setdefault(val, []).append(ch)

    result = []
    ch_map = {}
    for val, chs in chars_by_code.items():
        # non-cased characters (like Ethiopic) are in both, matching chr file
        lowercase = [ch for ch in chs if not ch.isupper()]
        uppercase = [ch for ch in chs if not ch.islower()]
        assert len(lowercase) == 1, 'expected 1 lowercase character'
        assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character'
        ch_map[val] = lowercase[0]
        result.append(''.join(lowercase + uppercase))
    ch_map[0] = '.'
    return (ch_map, result)
458*834a2baaSAndroid Build Coastguard Worker
459*834a2baaSAndroid Build Coastguard Worker
def get_pattern(pattern_data, ix):
    """Return the code bytes of pattern entry *ix* from the pattern section.

    Args:
        pattern_data: bytes of the pattern section. The little-endian uint32
            at byte 8 is the offset of the raw pattern bytes, and the table of
            uint32 entries starts at byte 16.
        ix: index into the entry table.

    Returns:
        The stored bytes of the pattern followed by as many zero bytes as the
        entry's shift field says.
    """
    (raw_start,) = struct.unpack_from('<I', pattern_data, 8)
    (packed,) = struct.unpack_from('<I', pattern_data, 16 + 4 * ix)
    # Entry layout, from the top bit down: length, 5-bit shift, 20-bit offset.
    length = packed >> 26
    trailing_zeros = (packed >> 20) & 0x1f
    begin = raw_start + (packed & 0xfffff)
    stored = pattern_data[begin:begin + length]
    return stored + b'\0' * trailing_zeros
467*834a2baaSAndroid Build Coastguard Worker
468*834a2baaSAndroid Build Coastguard Worker
def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions):
    """Walk the packed trie depth-first from node *ix*, rebuilding pattern text.

    Args:
        ix: index of the current node in the trie entry table.
        s: string of characters on the path from the root to this node.
        trie_data: bytes of the trie section. Four uint32 fields (char mask,
            link shift, link mask, pattern shift) are read from byte 4 and the
            uint32 entry table starts at byte 24.
        ch_map: dict mapping alphabet codes to characters.
        pattern_data: bytes of the pattern section, passed to get_pattern().
        patterns: list that receives the reconstructed pattern strings.
        exceptions: list that receives the reconstructed exception strings,
            with the leading and trailing '.' stripped.

    Raises:
        AssertionError: on a code outside 0..11, or if an exception string is
            not enclosed in '.' characters.
    """
    char_mask, link_shift, link_mask, pattern_shift = struct.unpack_from(
        '<4I', trie_data, 4)

    def entry_at(index):
        # Entries are little-endian uint32 values following a 24-byte header.
        return struct.unpack_from('<I', trie_data, 24 + 4 * index)[0]

    pattern_ix = entry_at(ix) >> pattern_shift
    if pattern_ix:
        # One code per gap around the characters of s. The stored codes are
        # right-aligned, with missing leading positions read as 0.
        n_slots = len(s) + 1
        stored = list(bytearray(get_pattern(pattern_data, pattern_ix)))
        codes = ([0] * n_slots + stored)[-n_slots:]

        pieces = []
        is_exception = False
        for pos, code in enumerate(codes):
            if code == 11:
                # hyphenation point of an exception
                pieces.append('-')
                is_exception = True
            elif code == 10:
                # exception marker that produces no output
                is_exception = True
            elif 1 <= code <= 9:
                pieces.append(str(code))
            else:
                assert code == 0, 'unexpected code'
            # The slice is empty past the final character of s.
            pieces.append(s[pos:pos + 1])
        text = ''.join(pieces)

        if is_exception:
            assert text[0] == '.', "expected leading '.'"
            assert text[-1] == '.', "expected trailing '.'"
            exceptions.append(text[1:-1])  # strip leading and trailing '.'
        else:
            patterns.append(text)

    for ch in ch_map:
        edge = entry_at(ix + ch)
        target = (edge & link_mask) >> link_shift
        if target == 0 or (edge & char_mask) != ch:
            continue
        traverse_trie(target, s + ch_map[ch], trie_data, ch_map, pattern_data,
                      patterns, exceptions)
508*834a2baaSAndroid Build Coastguard Worker
509*834a2baaSAndroid Build Coastguard Worker
def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn):
    """Verify the generated binary file against its textual sources.

    The alphabet, patterns and exceptions are reconstructed from the binary
    hyb file and then checked to be identical to the chr, pat and hyp files
    (mod the order of lines within the file, which is irrelevant). This
    function makes assumptions that are stronger than absolutely necessary
    (in particular, that the patterns are in lowercase as defined by python
    islower).

    Args:
        hyb_fn: path of the binary hyb file to verify.
        pat_fn: path of the pattern text file.
        chr_fn: path of the alphabet (chr) text file.
        hyp_fn: path of the exception (hyp) text file.

    Raises:
        AssertionError: if the alphabet version is unknown or any of the three
            tables does not match its text file. The checks raise explicitly
            instead of using `assert`, so they are not stripped (and the
            verification silently skipped) when Python runs with -O.
    """
    with open(hyb_fn, 'rb') as f:
        hyb_data = f.read()
    header = hyb_data[0: 6 * 4]
    # magic and version are read but not checked here.
    (_magic, _version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header)
    alphabet_data = hyb_data[alphabet_off:trie_off]
    trie_data = hyb_data[trie_off:pattern_off]
    pattern_data = hyb_data[pattern_off:file_size]

    # reconstruct alphabet table
    alphabet_version = struct.unpack('<I', alphabet_data[:4])[0]
    alphabet_map = {}
    if alphabet_version == 0:
        # Version 0: one code byte per code point in [min_ch, max_ch).
        (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12])
        for ch in range(min_ch, max_ch):
            offset = 12 + ch - min_ch
            b = struct.unpack('B', alphabet_data[offset : offset + 1])[0]
            if b != 0:
                alphabet_map[unichr(ch)] = b
    elif alphabet_version == 1:
        # Version 1: uint32 entries, code point in the bits above the low 11,
        # alphabet code in the low 11 bits.
        n_entries = struct.unpack('<I', alphabet_data[4:8])[0]
        for i in range(n_entries):
            entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0]
            alphabet_map[unichr(entry >> 11)] = entry & 0x7ff
    else:
        raise AssertionError('unexpected alphabet version %d' % alphabet_version)

    ch_map, reconstructed_chr = map_to_chr(alphabet_map)

    # Lines for which the reconstruction differs from the chr file, as pairs of
    # (reconstructed line, line expected in the chr file).
    chr_overrides = [
        # EXCEPTION for Armenian (hy), we don't really deal with the uppercase
        # form of U+0587
        (u'\u0587', u'\u0587\u0535\u0552'),
        # EXCEPTION for Greek (el), we don't really deal with the uppercase
        # form of U+03C2, U+03C3, U+0390, U+03B0
        (u'\u03C2', u'\u03C2\u03A3'),
        (u'\u03C3', u'\u03C3\u03A3'),
        (u'\u0390', u'\u0390\u0390'),
        (u'\u03B0', u'\u03B0\u03B0'),
        # U+1C86 and U+1C82 are likewise expected doubled in the chr file.
        (u'\u1c86', u'\u1c86\u1c86'),
        (u'\u1c82', u'\u1c82\u1c82'),
    ]
    for reconstructed_line, expected_line in chr_overrides:
        if reconstructed_line in reconstructed_chr:
            reconstructed_chr.remove(reconstructed_line)
            reconstructed_chr.append(expected_line)

    if not verify_file_sorted(reconstructed_chr, chr_fn):
        raise AssertionError('alphabet table not verified')

    # reconstruct trie
    patterns = []
    exceptions = []
    traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions)

    # EXCEPTION for Bulgarian (bg), which contains an ineffectual line of <0, U+044C, 0>
    if u'\u044c' in patterns:
        patterns.remove(u'\u044c')
        patterns.append(u'0\u044c0')

    if not verify_file_sorted(patterns, pat_fn):
        raise AssertionError('pattern table not verified')
    if not verify_file_sorted(exceptions, hyp_fn):
        raise AssertionError('exception table not verified')
588*834a2baaSAndroid Build Coastguard Worker
589*834a2baaSAndroid Build Coastguard Worker
def main():
    """Command-line entry point: build a hyb file and verify it.

    Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb

    The chr and hyp file names are derived from the pattern file name by
    replacing its '.pat.txt' suffix with '.chr.txt' and '.hyp.txt'. The -v
    option sets the module-level VERBOSE flag.

    Exits with status 1 on an unknown option, on a wrong number of arguments,
    or when the pattern file name does not end in '.pat.txt'. The last two
    cases used to crash with an unpacking error or silently produce no output.
    """
    global VERBOSE
    usage = 'Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb\n'
    pat_suffix = '.pat.txt'

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'v')
    except getopt.GetoptError as err:
        sys.stderr.write(str(err) + '\n')
        sys.exit(1)
    for o, _ in opts:
        if o == '-v':
            VERBOSE = True

    if len(args) != 2:
        sys.stderr.write(usage)
        sys.exit(1)
    pat_fn, out_fn = args
    if not pat_fn.endswith(pat_suffix):
        # Without the suffix the companion file names cannot be derived, so
        # fail loudly instead of exiting successfully without any output.
        sys.stderr.write("pattern file name must end in '%s': %s\n" % (pat_suffix, pat_fn))
        sys.stderr.write(usage)
        sys.exit(1)

    base = pat_fn[:-len(pat_suffix)]
    chr_fn = base + '.chr.txt'
    hyp_fn = base + '.hyp.txt'

    hyph = load(pat_fn)
    ch_map = load_chr(chr_fn)
    load_hyp(hyph, hyp_fn)
    generate_hyb_file(hyph, ch_map, out_fn)
    verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn)


if __name__ == '__main__':
    main()
612