#!/usr/bin/env python3

"""usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]

Input file:
* https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
"""

import logging
import re
import sys


# Names of the three table variants that are generated, in output order.
DEFAULT = 'DEFAULT'
COMPACT = 'COMPACT'
SLOPPY = 'SLOPPY'

# packTab compression setting used for each variant.
compression_level = {
    DEFAULT: 5,
    COMPACT: 9,
    SLOPPY: 9,
}

# Preprocessor line that opens each variant's section of the generated
# header; together with the trailing '#endif' they form a single
# #ifndef / #elif / #else chain.
step_guard = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#elif !defined(HB_NO_UCD_UNASSIGNED)',
    SLOPPY: '#else',
}

# General_Category values; the position in this tuple is the integer
# each value is encoded as.
GC_NAMES = (
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
    'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
)

# Matches a line carrying both an HB_SCRIPT_* identifier and its
# four-character HB_TAG (...).
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")


def parse_scripts(lines):
    """Collect script identifiers and tags from header lines.

    lines: iterable of text lines (e.g. an open hb-common.h).

    Returns (sc_order, sc_array).  sc_array lists the HB_SCRIPT_* names
    in the order they were found.  sc_order maps both ways between a
    four-character tag and its index in sc_array.  Lines that do not
    match sc_re are ignored.
    """
    sc_order = {}
    sc_array = []
    for line in lines:
        m = sc_re.search(line)
        if not m:
            continue
        tag = ''.join(m.group(2, 3, 4, 5))
        index = len(sc_array)
        sc_order[tag] = index
        sc_order[index] = tag
        sc_array.append(m.group(1))
    return sc_order, sc_array


def smear_unassigned(values, unassigned, block_size=128):
    """Replace `unassigned` entries with a neighbouring value, in place.

    A forward pass copies the previous entry into each unassigned slot,
    then a backward pass copies the next entry into any slot that is
    still unassigned.  Copies never cross a multiple of block_size, so
    a block consisting only of `unassigned` entries is left untouched.

    values: mutable sequence, modified in place.
    unassigned: the placeholder value to overwrite.
    block_size: width of the blocks that copies must stay within.
    """
    for i in range(len(values)):
        if (i % block_size) and values[i] == unassigned:
            values[i] = values[i - 1]
    for i in range(len(values) - 2, -1, -1):
        if ((i + 1) % block_size) and values[i] == unassigned:
            values[i] = values[i + 1]


def main(argv):
    """Generate the UCD table header on stdout.

    argv: full argument vector; argv[1] is the UCDXML file and the
    optional argv[2] is the path to hb-common.h (default 'hb-common.h'
    in the current directory).  Exits with the usage text when the
    argument count is wrong.  Progress is logged via `logging`.
    """
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

    if len(argv) not in (2, 3):
        sys.exit(__doc__)

    # Imported only after the argument check so the usage text can still
    # be shown when packTab is not installed.
    # https://github.com/harfbuzz/packtab
    import packTab
    import packTab.ucdxml

    logging.info('Loading UCDXML...')
    ucdxml = packTab.ucdxml.load_ucdxml(argv[1])
    ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

    hb_common_h = 'hb-common.h' if len(argv) < 3 else argv[2]

    logging.info('Preparing data tables...')

    # This is how the data is encoded:
    #
    # General_Category (gc), Canonical_Combining_Class (ccc),
    # and Script (sc) are encoded as integers.
    #
    # Mirroring character (bmg) is encoded as difference from
    # the original character.
    #
    # Composition & Decomposition (dm) are encoded elaborately,
    # as discussed below.

    gc = [u['gc'] for u in ucd]
    ccc = [int(u['ccc']) for u in ucd]
    bmg = [int(mirror, 16) - cp if mirror else 0
           for cp, mirror in enumerate(u['bmg'] for u in ucd)]
    sc = [u['sc'] for u in ucd]

    # Prepare Compose / Decompose data
    #
    # This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.

    # Canonical decompositions, keyed by code point.  The 11172 code
    # points starting at U+AC00 (the Hangul syllables) are left out;
    # presumably they are handled algorithmically -- confirm in hb-ucd.
    dm = {i: tuple(int(v, 16) for v in u['dm'].split())
          for i, u in enumerate(ucd)
          if u['dm'] != '#' and u['dt'] == 'can'
          and not (0xAC00 <= i < 0xAC00 + 11172)}
    # Code points flagged Comp_Ex == 'Y'.
    ce = {i for i, u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}

    # Every decomposition must have one or two code points.  Checking
    # the length directly also rejects an empty tuple.
    assert all(len(v) in (1, 2) for v in dm.values())

    # Single-character decompositions, split by plane (0 or 2) so each
    # can be stored as a 16-bit value.
    dm1 = sorted({v for v in dm.values() if len(v) == 1})
    assert all((v[0] >> 16) in (0, 2) for v in dm1)
    dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
    dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
    # Index 0 is reserved for "no decomposition" (see dm_order below).
    dm1_order = {v: i for i, v in enumerate(dm1, 1)}

    # Two-character decompositions as ((a, b, composite), (a, b)).  The
    # composite slot is 0 when the code point is in `ce` or has a
    # non-zero ccc.
    dm2 = sorted((v + (i if i not in ce and not ccc[i] else 0,), v)
                 for i, v in dm.items() if len(v) == 2)

    def fits_u32(v):
        """Return True if the triple passes the bit-mask test for the u32 table."""
        # NOTE(review): the third mask leaves bits 16-19 unchecked, while
        # the macro name used below suggests a 14-bit field -- confirm
        # against the definition of HB_CODEPOINT_ENCODE3_11_7_14.
        return ((v[0] & 0xFFFFF800) == 0x0000 and
                (v[1] & 0xFFFFFF80) == 0x0300 and
                (v[2] & 0xFFF0C000) == 0x0000)

    dm2_u32_array = [v for v in dm2 if fits_u32(v[0])]
    dm2_u64_array = [v for v in dm2 if not fits_u32(v[0])]
    # The u32 entries must form a prefix of the sorted list, otherwise
    # the indices assigned in dm2_order would not line up with the two
    # arrays laid end to end.
    assert dm2_u32_array + dm2_u64_array == dm2
    dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0]
                     for v in dm2_u32_array]
    dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0]
                     for v in dm2_u64_array]

    # Two-character entries are numbered after the reserved 0 and all
    # single-character entries.
    dm2_base = 1 + len(dm1_p0_array) + len(dm1_p2_array)
    dm2_order = {pair: i for i, (_, pair) in enumerate(dm2, dm2_base)}

    dm_order = {None: 0}
    dm_order.update(dm1_order)
    dm_order.update(dm2_order)

    # Prepare General_Category / Script mapping arrays

    # Two-way mapping: index -> name and name -> index.
    gc_order = {}
    for i, v in enumerate(GC_NAMES):
        gc_order[i] = v
        gc_order[v] = i

    # Read the header inside a context manager so the handle is closed
    # promptly; the encoding is pinned so parsing does not depend on the
    # locale (assumes hb-common.h is UTF-8).
    with open(hb_common_h, encoding='utf-8') as header:
        sc_order, sc_array = parse_scripts(header)

    # Write out main data

    logging.info('Generating output...')
    print("/* == Start of generated table == */")
    print("/*")
    print(" * The following table is generated by running:")
    print(" *")
    print(" * ./gen-ucd-table.py ucd.nounihan.grouped.xml")
    print(" *")
    print(" * on file with this description:", ucdxml.description)
    print(" */")
    print()
    print("#ifndef HB_UCD_TABLE_HH")
    print("#define HB_UCD_TABLE_HH")
    print()
    print('#include "hb.hh"')
    print()

    # Write mapping data

    code = packTab.Code('_hb_ucd')
    code.addArray('hb_script_t', 'sc_map', sc_array)
    code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
    code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
    code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
    code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
    code.print_c(linkage='static inline')

    # (name, data, default value, value -> integer mapping)
    datasets = [
        ('gc', gc, 'Cn', gc_order),
        ('ccc', ccc, 0, None),
        ('bmg', bmg, 0, None),
        ('sc', sc, 'Zzzz', sc_order),
        ('dm', dm, None, dm_order),
    ]

    # Write main data

    for step in (DEFAULT, COMPACT, SLOPPY):
        compression = compression_level[step]
        logging.info(' Compression=%d:', compression)
        print()
        print(step_guard[step])
        print()

        if step == SLOPPY:
            # Overwrite unassigned entries with neighbouring values.
            # This mutates the very lists referenced from `datasets`,
            # which is why SLOPPY has to be the last step.
            smear_unassigned(gc, 'Cn')
            smear_unassigned(sc, 'Zzzz')

        code = packTab.Code('_hb_ucd')

        for name, data, default, mapping in datasets:
            sol = packTab.pack_table(data, default, mapping=mapping,
                                     compression=compression)
            logging.info(' Dataset=%-8s FullCost=%d', name, sol.fullCost)
            sol.genCode(code, name)

        code.print_c(linkage='static inline')

        print()

    print('#endif')
    print()

    print()
    print("#endif /* HB_UCD_TABLE_HH */")
    print()
    print("/* == End of generated table == */")
    logging.info('Done.')


if __name__ == '__main__':
    main(sys.argv)