#!/usr/bin/env python3

# Copyright (C) 2015 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp)
into binary format. See doc/hyb_file_format.md for more information.

Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb

Optional -v parameter turns on verbose debugging.

"""

from __future__ import print_function

import io
import sys
import struct
import math
import getopt


VERBOSE = False

# U+00DF is LATIN SMALL LETTER SHARP S
# U+1E9E is LATIN CAPITAL LETTER SHARP S
SHARP_S_TO_DOUBLE = u'\u00dfSS'
SHARP_S_TO_CAPITAL = u'\u00df\u1e9e'

if sys.version_info[0] >= 3:
    def unichr(x):
        """Python 3 stand-in for the Python 2 builtin of the same name."""
        return chr(x)


def num_bits(n):
    """Return the number of bits required to represent numbers up to n inclusive.

    Returns 0 when n is not positive. int.bit_length() is used instead of a
    floating point logarithm so that the answer is exact for every magnitude.
    """
    return int(n).bit_length() if n > 0 else 0


class Node:
    """A single trie node.

    Besides the attributes created here, other code in this file attaches
    `str` (the prefix spelled by the path to the node), `ix` (slot assigned by
    Hyph.pack) and `bfs_ix` (position assigned by Hyph.bfs) to instances.
    """

    def __init__(self):
        # outgoing edges: character -> child Node
        self.succ = {}
        # list of numeric codes attached to this node, or None when there is none
        self.res = None
        # NOTE(review): fsm_pat and fail are only initialised in the visible part
        # of this file; confirm whether anything still uses them.
        self.fsm_pat = None
        self.fail = None


# List of free slots, implemented as doubly linked list
class Freelist:
    """Tracks which integer slots are still free.

    pred[i] / succ[i] hold the neighbouring free slots of slot i (None at either
    end of the list). A slot that has been taken is marked with pred[i] == -1.
    """

    def __init__(self):
        self.first = None
        self.last = None
        self.pred = []
        self.succ = []

    def grow(self):
        """Append one new free slot at the end of the list."""
        this = len(self.pred)
        self.pred.append(self.last)
        self.succ.append(None)
        if self.last is None:
            self.first = this
        else:
            self.succ[self.last] = this
        self.last = this

    def next(self, cursor):
        """Return (slot, following_cursor) for the given cursor.

        A cursor of 0 restarts from the first free slot; a cursor of None means
        the list is exhausted, in which case a new slot is appended and returned.
        """
        if cursor == 0:
            cursor = self.first
        if cursor is None:
            self.grow()
            result = self.last
        else:
            result = cursor
        return result, self.succ[result]

    def is_free(self, ix):
        """Return True if slot ix is free, growing the list so that ix exists."""
        while ix >= len(self.pred):
            self.grow()
        return self.pred[ix] != -1

    def use(self, ix):
        """Remove slot ix from the free list and mark it as taken.

        Raises AssertionError if the slot has already been taken. The check is
        done before anything is modified: unlinking a taken slot would use its
        -1 marker as a list index and silently corrupt the list.
        """
        if self.pred[ix] == -1:
            raise AssertionError('double free!')
        if self.pred[ix] is None:
            self.first = self.succ[ix]
        else:
            self.succ[self.pred[ix]] = self.succ[ix]
        if self.succ[ix] is None:
            self.last = self.pred[ix]
        else:
            self.pred[self.succ[ix]] = self.pred[ix]
        self.pred[ix] = -1


def combine(a, b):
    """Merge two code lists, aligned at their ends, taking the element-wise max.

    If either argument is None the other one is returned unchanged. Otherwise a
    new list as long as the longer input is returned.
    """
    if a is None:
        return b
    if b is None:
        return a
    if len(b) < len(a):
        a, b = b, a
    # b is now the longer list; its leading part has no counterpart in a
    head_len = len(b) - len(a)
    res = b[:head_len]
    res.extend(max(x, y) for x, y in zip(a, b[head_len:]))
    return res


def trim(pattern):
    """Return pattern without its leading zeros.

    Returns None when the pattern is empty or consists only of zeros.
    """
    for ix, code in enumerate(pattern):
        if code != 0:
            return pattern[ix:]
    return None


def pat_to_binary(pattern):
    """Pack a sequence of small integers into bytes, one unsigned byte each."""
    return b''.join(struct.pack('B', x) for x in pattern)


class Hyph:
    """Trie of hyphenation patterns and exceptions."""

    def __init__(self):
        self.root = Node()
        self.root.str = '<root>'
        # every node ever created, in creation order
        self.node_list = [self.root]

    # Add a pattern (word fragment with numeric codes, such as ".ad4der")
    def add_pat(self, pat):
        """Split pat into its letters and its numeric codes and store them.

        Each digit becomes one code. Once the first digit has been seen, a 0 is
        inserted between two adjacent letters, and after a final letter.
        """
        lastWasLetter = False
        haveSeenNumber = False
        result = []
        letters = []
        for c in pat:
            if c.isdigit():
                result.append(int(c))
                lastWasLetter = False
                haveSeenNumber = True
            else:
                letters.append(c)
                if lastWasLetter and haveSeenNumber:
                    result.append(0)
                lastWasLetter = True
        if lastWasLetter:
            result.append(0)

        self.add_word_res(''.join(letters), result)

    # Add an exception (word with hyphens, such as "ta-ble")
    def add_exception(self, hyph_word):
        """Store an explicitly hyphenated word.

        The word is stored wrapped in '.' on both sides. A hyphen is encoded as
        code 11, a gap between two letters without a hyphen as code 10, and two
        trailing 0 codes are appended.
        """
        res = []
        word = ['.']
        need_10 = False
        for c in hyph_word:
            if c == '-':
                res.append(11)
                need_10 = False
            else:
                if need_10:
                    res.append(10)
                word.append(c)
                need_10 = True
        word.append('.')
        res.append(0)
        res.append(0)
        if VERBOSE:
            print(word, res)
        self.add_word_res(''.join(word), res)

    def add_word_res(self, word, result):
        """Walk/create the trie path spelling word and attach result to its end."""
        if VERBOSE:
            print(word, result)

        t = self.root
        for i, c in enumerate(word):
            if c not in t.succ:
                new_node = Node()
                new_node.str = word[:i + 1]
                self.node_list.append(new_node)
                t.succ[c] = new_node
            t = t.succ[c]
        t.res = result

    def pack(self, node_list, ch_map, use_node=False):
        """Assign a slot index (node.ix) to every node in node_list.

        A node with outgoing edges is placed at an index ix such that ix itself
        is an unused node slot and every ix + (character code) is an unused edge
        slot. Also fills self.node_map (ix -> node).

        Returns the required table size: the largest assigned index plus the
        largest character code plus one.
        """
        size = 0
        self.node_map = {}
        nodes = Freelist()
        edges = Freelist()
        edge_start = 1 if use_node else 0
        for node in node_list:
            succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()])
            if len(succ):
                cursor = 0
                while True:
                    # try to line the smallest edge offset up with a free edge slot
                    edge_ix, cursor = edges.next(cursor)
                    ix = edge_ix - succ[0]
                    if (ix >= 0 and nodes.is_free(ix) and
                            all(edges.is_free(ix + off) for off in succ) and
                            ((not use_node) or edges.is_free(ix))):
                        break
            elif use_node:
                ix, _ = edges.next(0)
                nodes.is_free(ix)  # actually don't need nodes at all when use_node,
                # but keep it happy
            else:
                ix, _ = nodes.next(0)
            node.ix = ix
            self.node_map[ix] = node
            nodes.use(ix)
            size = max(size, ix)
            if use_node:
                edges.use(ix)
            for off in succ:
                edges.use(ix + off)
        size += max(ch_map.values()) + 1
        return size

    # return list of nodes in bfs order
    def bfs(self, ch_map):
        """Return all nodes in breadth-first order, children sorted by char code.

        Sets node.bfs_ix on every node and stores the list in self.bfs_order.
        Asserts that no node has two edges whose characters map to the same code.
        """
        result = [self.root]
        ix = 0
        while ix < len(result):
            node = result[ix]
            node.bfs_ix = ix
            mapped = {}
            for c, child in node.succ.items():
                assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c))
                mapped[ch_map[c]] = child
            for code in sorted(mapped.keys()):
                result.append(mapped[code])
            ix += 1
        self.bfs_order = result
        return result

    # suffix compression - convert the trie into an acyclic digraph, merging nodes when
    # the subtries are identical
    def dedup(self):
        """Find nodes whose subtries are identical. Requires bfs() to have run.

        Returns (dedup_ix, uniques): dedup_ix[i] is the bfs index of the node that
        represents bfs node i, and uniques is the list of representative nodes in
        bfs order.
        """
        uniques = []
        dupmap = {}
        dedup_ix = [0] * len(self.bfs_order)
        # children always come after their parent in bfs order, so walking
        # backwards guarantees every child has been resolved already
        for ix in reversed(range(len(self.bfs_order))):
            # construct string representation of node
            node = self.bfs_order[ix]
            if node.res is None:
                s = ''
            else:
                s = ''.join(str(c) for c in node.res)
            for c in sorted(node.succ.keys()):
                succ = node.succ[c]
                s += ' ' + c + str(dedup_ix[succ.bfs_ix])
            if s in dupmap:
                dedup_ix[ix] = dupmap[s]
            else:
                uniques.append(node)
                dedup_ix[ix] = ix
            dupmap[s] = dedup_ix[ix]
        uniques.reverse()
        if VERBOSE:
            print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total')
        return dedup_ix, uniques


# load the ".pat" file, which contains patterns such as a1b2c3
def load(fn):
    """Read one pattern per line from the UTF-8 file fn into a new Hyph."""
    hyph = Hyph()
    with io.open(fn, encoding='UTF-8') as f:
        for line in f:
            hyph.add_pat(line.strip())
    return hyph


# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc.
def load_chr(fn):
    """Return a map from character to code; line i of fn (0-based) gets code i + 1.

    '.' is always mapped to code 0.
    """
    ch_map = {'.': 0}
    with io.open(fn, encoding='UTF-8') as f:
        for i, line in enumerate(f):
            line = line.strip()
            if len(line) > 2:
                if line == SHARP_S_TO_DOUBLE:
                    # replace with lowercasing from capital letter sharp s
                    line = SHARP_S_TO_CAPITAL
                else:
                    # lowercase maps to multi-character uppercase sequence, ignore uppercase for now
                    line = line[:1]
            else:
                assert len(line) == 2, 'expected 2 chars in chr'
            for c in line:
                ch_map[c] = i + 1
    return ch_map


# load exceptions with explicit hyphens
def load_hyp(hyph, fn):
    """Add every line of the UTF-8 file fn to hyph as an exception."""
    with io.open(fn, encoding='UTF-8') as f:
        for line in f:
            hyph.add_exception(line.strip())


def generate_header(alphabet, trie, pattern):
    """Return the 24-byte file header for the three already-serialized sections.

    The header is six little-endian 32-bit values: magic number, 0, and the
    offsets of the alphabet, trie and pattern sections followed by the file size.
    """
    alphabet_off = 6 * 4
    trie_off = alphabet_off + len(alphabet)
    pattern_off = trie_off + len(trie)
    file_size = pattern_off + len(pattern)
    data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size]
    return struct.pack('<6I', *data)


def generate_alphabet(ch_map):
    """Serialize the character -> code map (ignoring '.'), padded to 4 bytes.

    Format 0 (a byte table indexed by code point - min_ch) is used when the code
    point range is below 1024 and all codes fit in a byte; otherwise format 1 (a
    sorted list of 32-bit entries, code point << 11 | code) is used.
    """
    ch_map = ch_map.copy()
    del ch_map['.']
    min_ch = ord(min(ch_map))
    max_ch = ord(max(ch_map))
    if max_ch - min_ch < 1024 and max(ch_map.values()) < 256:
        # generate format 0
        data = [0] * (max_ch - min_ch + 1)
        for c, val in ch_map.items():
            data[ord(c) - min_ch] = val
        result = [struct.pack('<3I', 0, min_ch, max_ch + 1)]
        for b in data:
            result.append(struct.pack('<B', b))
    else:
        # generate format 1
        assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded'
        result = [struct.pack('<2I', 1, len(ch_map))]
        for c, val in sorted(ch_map.items()):
            data = (ord(c) << 11) | val
            result.append(struct.pack('<I', data))
    binary = b''.join(result)
    if len(binary) % 4 != 0:
        binary += b'\x00' * (4 - len(binary) % 4)
    return binary


# assumes hyph structure has been packed, ie node.ix values have been set
def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap):
    """Serialize the packed trie: a 24-byte header followed by n_trie entries.

    Each 32-bit entry holds, from the low bits up, the character code of the
    edge stored in that slot, the slot index the edge links to, and the pattern
    index of the node stored in that slot. When dedup_ix is None edges link to
    their child directly, otherwise to the child's representative node.
    """
    ch_array = [0] * n_trie
    link_array = [0] * n_trie
    pat_array = [0] * n_trie
    link_shift = num_bits(max(ch_map.values()))
    char_mask = (1 << link_shift) - 1
    pattern_shift = link_shift + num_bits(n_trie - 1)
    link_mask = (1 << pattern_shift) - (1 << link_shift)
    result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)]

    for node in dedup_nodes:
        ix = node.ix
        if node.res is not None:
            pat_array[ix] = patmap[pat_to_binary(node.res)]
        for c, child in node.succ.items():
            c_num = ch_map[c]
            link_ix = ix + c_num
            ch_array[link_ix] = c_num
            if dedup_ix is None:
                dedup_next = child
            else:
                dedup_next = hyph.bfs_order[dedup_ix[child.bfs_ix]]
            link_array[link_ix] = dedup_next.ix

    for i in range(n_trie):
        packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i]
        result.append(struct.pack('<I', packed))
    return b''.join(result)


def generate_pattern(pats):
    """Serialize the distinct code lists in pats (None entries are skipped).

    Returns (patmap, binary). patmap maps the packed bytes of a code list to its
    index in the pattern table; index 0 is reserved for the empty pattern. Each
    table entry is (length << 26) | (shift << 20) | offset, where shift is the
    number of trailing zeros that were stripped and offset locates the remaining
    bytes in the raw data that follows the table.
    """
    pat_array = [0]
    patmap = {b'': 0}

    raw_pat_array = []
    raw_pat_size = 0
    raw_patmap = {}

    for pat in pats:
        if pat is None:
            continue
        pat_str = pat_to_binary(pat)
        if pat_str not in patmap:
            # count trailing zeros; they are stored as a shift rather than as data
            shift = 0
            while shift < len(pat) and pat[len(pat) - shift - 1] == 0:
                shift += 1
            rawpat = pat_str[:len(pat) - shift]
            if rawpat not in raw_patmap:
                raw_patmap[rawpat] = raw_pat_size
                raw_pat_array.append(rawpat)
                raw_pat_size += len(rawpat)
            data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat]
            patmap[pat_str] = len(pat_array)
            pat_array.append(data)
    data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size]
    result = [struct.pack('<4I', *data)]
    for x in pat_array:
        result.append(struct.pack('<I', x))
    result.extend(raw_pat_array)
    return patmap, b''.join(result)


def generate_hyb_file(hyph, ch_map, hyb_fn):
    """Serialize hyph with the given character map and write it to hyb_fn."""
    # called for its side effects: sets node.bfs_ix and hyph.bfs_order
    hyph.bfs(ch_map)
    dedup_ix, dedup_nodes = hyph.dedup()
    n_trie = hyph.pack(dedup_nodes, ch_map)
    alphabet = generate_alphabet(ch_map)
    patmap, pattern = generate_pattern([n.res for n in hyph.node_list])
    trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap)
    header = generate_header(alphabet, trie, pattern)

    with open(hyb_fn, 'wb') as f:
        f.write(header)
        f.write(alphabet)
        f.write(trie)
        f.write(pattern)


# Verify that the file contains the same lines as the lines argument, in arbitrary order
def verify_file_sorted(lines, fn):
    """Return True if the set of stripped lines in fn equals the set of lines.

    On a mismatch every differing line is printed and False is returned.
    """
    with io.open(fn, encoding='UTF-8') as f:
        file_lines = [l.strip() for l in f]
    line_set = set(lines)
    file_set = set(file_lines)
    if SHARP_S_TO_DOUBLE in file_set:
        # ignore difference of double capital letter s and capital letter sharp s
        file_set.symmetric_difference_update([SHARP_S_TO_DOUBLE, SHARP_S_TO_CAPITAL])
    if line_set == file_set:
        return True
    for line in line_set - file_set:
        print(repr(line) + ' in reconstruction, not in file')
    for line in file_set - line_set:
        print(repr(line) + ' in file, not in reconstruction')
    return False


def map_to_chr(alphabet_map):
    """Invert a character -> code map.

    Returns (ch_map, result): ch_map maps each code to its lowercase character
    (plus 0 -> '.'), and result holds one "lowercase + uppercase" string per
    entry of alphabet_map, so a code shared by two characters appears twice.
    """
    # group the characters by code once instead of rescanning the map per value
    by_val = {}
    for ch, val in alphabet_map.items():
        by_val.setdefault(val, []).append(ch)

    result = []
    ch_map = {}
    for val in alphabet_map.values():
        chs = by_val[val]
        # non-cased characters (like Ethopic) are in both, matching chr file
        lowercase = [ch for ch in chs if not ch.isupper()]
        uppercase = [ch for ch in chs if not ch.islower()]
        assert len(lowercase) == 1, 'expected 1 lowercase character'
        assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character'
        ch_map[val] = lowercase[0]
        result.append(''.join(lowercase + uppercase))
    ch_map[0] = '.'
    return (ch_map, result)


def get_pattern(pattern_data, ix):
    """Decode entry ix of a serialized pattern section back into its code bytes."""
    pattern_offset = struct.unpack('<I', pattern_data[8:12])[0]
    entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0]
    pat_len = entry >> 26
    pat_shift = (entry >> 20) & 0x1f
    offset = pattern_offset + (entry & 0xfffff)
    # the stripped trailing zeros are restored from the shift count
    return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift


def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions):
    """Recursively rebuild the textual patterns and exceptions from binary data.

    ix is the slot of the current node and s the string spelled so far; ch_map
    maps character codes back to characters. Reconstructed strings are appended
    to the patterns and exceptions lists.
    """
    (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20])
    node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0]
    pattern = node_entry >> pattern_shift
    if pattern:
        result = []
        is_exception = False
        pat = get_pattern(pattern_data, pattern)
        for i in range(len(s) + 1):
            # the codes are aligned with the end of s, one code per gap
            pat_off = i - 1 + len(pat) - len(s)
            if pat_off < 0:
                code = 0
            else:
                code = struct.unpack('B', pat[pat_off: pat_off + 1])[0]
            if 1 <= code <= 9:
                result.append('%d' % code)
            elif code == 10:
                is_exception = True
            elif code == 11:
                result.append('-')
                is_exception = True
            else:
                assert code == 0, 'unexpected code'
            if i < len(s):
                result.append(s[i])
        pat_str = ''.join(result)
        if is_exception:
            assert pat_str[0] == '.', "expected leading '.'"
            assert pat_str[-1] == '.', "expected trailing '.'"
            exceptions.append(pat_str[1:-1])  # strip leading and trailing '.'
        else:
            patterns.append(pat_str)
    for ch in ch_map:
        edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0]
        link = (edge_entry & link_mask) >> link_shift
        # the slot only belongs to this node if it stores this character code
        if link != 0 and ch == (edge_entry & char_mask):
            sch = s + ch_map[ch]
            traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions)


# Verify the generated binary file by reconstructing the textual representations
# from the binary hyb file, then checking that they're identical (mod the order of
# lines within the file, which is irrelevant). This function makes assumptions that
# are stronger than absolutely necessary (in particular, that the patterns are in
# lowercase as defined by python islower).
# Lines of the reconstructed chr table that are rewritten before comparison with
# the chr file, as (reconstructed line, line to compare instead) pairs.
_CHR_LINE_OVERRIDES = (
    # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587
    (u'\u0587', u'\u0587\u0535\u0552'),
    # EXCEPTION for Greek (el), we don't really deal with the uppercase form of
    # U+03C2, U+03C3, U+0390, U+03B0
    (u'\u03C2', u'\u03C2\u03A3'),
    (u'\u03C3', u'\u03C3\u03A3'),
    (u'\u0390', u'\u0390\u0390'),
    (u'\u03B0', u'\u03B0\u03B0'),
    # NOTE(review): no rationale is given for U+1C86 and U+1C82; presumably the
    # same kind of uppercase exception -- confirm against the chr files.
    (u'\u1c86', u'\u1c86\u1c86'),
    (u'\u1c82', u'\u1c82\u1c82'),
)

USAGE = 'Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb'


def _read_alphabet_map(alphabet_data):
    """Decode the alphabet section into a dict of character -> code.

    Version 0 stores (min_ch, max_ch) followed by one byte per code point in
    range(min_ch, max_ch); zero bytes are skipped.  Version 1 stores an entry
    count followed by uint32 entries whose bits from 11 up are the code point
    and whose low 11 bits are the code.

    Raises:
        AssertionError: if the version is neither 0 nor 1.
    """
    alphabet_version = struct.unpack_from('<I', alphabet_data, 0)[0]
    alphabet_map = {}
    if alphabet_version == 0:
        min_ch, max_ch = struct.unpack_from('<2I', alphabet_data, 4)
        for ch in range(min_ch, max_ch):
            code = struct.unpack_from('B', alphabet_data, 12 + ch - min_ch)[0]
            if code != 0:
                alphabet_map[unichr(ch)] = code
    elif alphabet_version == 1:
        n_entries = struct.unpack_from('<I', alphabet_data, 4)[0]
        for i in range(n_entries):
            entry = struct.unpack_from('<I', alphabet_data, 8 + 4 * i)[0]
            alphabet_map[unichr(entry >> 11)] = entry & 0x7ff
    else:
        raise AssertionError('unsupported alphabet version %d' % alphabet_version)
    return alphabet_map


def _require_verified(ok, message):
    """Raise AssertionError(message) unless `ok` is true."""
    # An explicit raise (not an `assert` statement) keeps the verification
    # running under `python -O`, which strips assert statements altogether.
    if not ok:
        raise AssertionError(message)


def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn):
    """Rebuild the chr, pat and hyp tables from a hyb file and check them.

    The hyb file starts with six little-endian uint32 values: magic, version,
    alphabet offset, trie offset, pattern offset and file size.  The three
    sections are sliced out using those offsets, turned back into text lines
    and compared (ignoring order) with the corresponding text files.

    Args:
        hyb_fn: path of the binary hyb file to check.
        pat_fn: path of the pattern text file.
        chr_fn: path of the alphabet text file.
        hyp_fn: path of the exception text file.

    Raises:
        AssertionError: if a reconstructed table differs from its text file,
            or the data fails one of the consistency checks made on the way.
    """
    with open(hyb_fn, 'rb') as f:
        hyb_data = f.read()
    (_magic, _version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack_from(
        '<6I', hyb_data, 0)
    alphabet_data = hyb_data[alphabet_off:trie_off]
    trie_data = hyb_data[trie_off:pattern_off]
    pattern_data = hyb_data[pattern_off:file_size]

    # reconstruct alphabet table
    alphabet_map = _read_alphabet_map(alphabet_data)
    ch_map, reconstructed_chr = map_to_chr(alphabet_map)
    for old_line, new_line in _CHR_LINE_OVERRIDES:
        if old_line in reconstructed_chr:
            reconstructed_chr.remove(old_line)
            reconstructed_chr.append(new_line)
    _require_verified(verify_file_sorted(reconstructed_chr, chr_fn),
                      'alphabet table not verified')

    # reconstruct trie
    patterns = []
    exceptions = []
    traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions)

    # EXCEPTION for Bulgarian (bg), which contains an ineffectual line of <0, U+044C, 0>
    if u'\u044c' in patterns:
        patterns.remove(u'\u044c')
        patterns.append(u'0\u044c0')

    _require_verified(verify_file_sorted(patterns, pat_fn), 'pattern table not verified')
    _require_verified(verify_file_sorted(exceptions, hyp_fn), 'exception table not verified')


def _exit_with_error(message):
    """Print `message` to stderr and exit with status 1."""
    print(message, file=sys.stderr)
    sys.exit(1)


def main():
    """Command-line entry point: build the hyb file, then verify it.

    Expects `[-v] <name>.pat.txt <output>` on the command line; the chr and
    hyp file names are derived by replacing the '.pat.txt' suffix.  Exits with
    status 1 on an unknown option, a wrong number of arguments, or a pattern
    file name that does not end in '.pat.txt'.
    """
    global VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'v')
    except getopt.GetoptError as err:
        _exit_with_error(str(err))
    for o, _ in opts:
        if o == '-v':
            VERBOSE = True
    if len(args) != 2:
        _exit_with_error(USAGE)
    pat_fn, out_fn = args
    suffix = '.pat.txt'
    # Without this suffix the chr/hyp names cannot be derived; stop here rather
    # than failing later on names that were never assigned.
    if not pat_fn.endswith(suffix):
        _exit_with_error("pattern file name must end with '.pat.txt'\n" + USAGE)
    base = pat_fn[:-len(suffix)]
    hyph = load(pat_fn)
    chr_fn = base + '.chr.txt'
    ch_map = load_chr(chr_fn)
    hyp_fn = base + '.hyp.txt'
    load_hyp(hyph, hyp_fn)
    generate_hyb_file(hyph, ch_map, out_fn)
    verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn)


if __name__ == '__main__':
    main()