#!/usr/bin/env python3

"""usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]

Input file:
* https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
"""

import logging
import re
import sys

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# Validate the command line before importing packTab, so that the usage text
# is printed even when packTab is not installed.
if len(sys.argv) not in (2, 3):
    sys.exit(__doc__)

# https://github.com/harfbuzz/packtab
import packTab
import packTab.ucdxml

logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# Optional second argument: path to hb-common.h (defaults to the current dir).
hb_common_h = 'hb-common.h' if len(sys.argv) < 3 else sys.argv[2]

logging.info('Preparing data tables...')


# This is how the data is encoded:
#
# General_Category (gc), Canonical_Combining_Class (ccc),
# and Script (sc) are encoded as integers.
#
# Mirroring character (bmg) is encoded as difference from
# the original character.
#
# Composition & Decomposition (dm) are encoded elaborately,
# as discussed below.

gc = [u['gc'] for u in ucd]
ccc = [int(u['ccc']) for u in ucd]
# Signed distance from each code point to its mirror; 0 when 'bmg' is empty.
bmg = [int(u['bmg'], 16) - cp if u['bmg'] else 0 for cp, u in enumerate(ucd)]
sc = [u['sc'] for u in ucd]


# Prepare Compose / Decompose data
#
# This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.

# Canonical decompositions only, keyed by code point.  The range
# [0xAC00, 0xAC00+11172) is excluded from the table.
dm = {i: tuple(int(v, 16) for v in u['dm'].split()) for i, u in enumerate(ucd)
      if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00 + 11172)}
# Code points flagged Comp_Ex == 'Y'.
ce = {i for i, u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}

# Every decomposition must have exactly one or two code points.  Checking the
# length directly also rejects an empty decomposition, which a truthiness test
# on the tuple itself would let through.
assert all(len(v) in (1, 2) for v in dm.values())

# Singleton decompositions: the targets must live in plane 0 or plane 2, and
# are stored as 16-bit values in one array per plane.
dm1 = sorted({v for v in dm.values() if len(v) == 1})
assert all((v[0] >> 16) in (0, 2) for v in dm1)
dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
dm1_order = {v: i + 1 for i, v in enumerate(dm1)}

# Pair decompositions, as ((a, b, composed), (a, b)).  The third element of the
# first tuple is the composed code point, or 0 when the code point is in `ce`
# or has a non-zero ccc.
dm2 = sorted((v + (i if i not in ce and not ccc[i] else 0,), v)
             for i, v in dm.items() if len(v) == 2)


def filt(v):
    """Return True if the triple *v* passes the bit-mask test for the u32 array."""
    return ((v[0] & 0xFFFFF800) == 0x0000 and
            (v[1] & 0xFFFFFF80) == 0x0300 and
            (v[2] & 0xFFF0C000) == 0x0000)


dm2_u32_array = [v for v in dm2 if filt(v[0])]
dm2_u64_array = [v for v in dm2 if not filt(v[0])]
# The two arrays are indexed as one sequence, so all u32 entries must sort
# before all u64 entries.
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u32_array]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u64_array]

# Index 0 means "no decomposition"; singletons follow, then pairs.
dm2_first_index = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {v[1]: i + dm2_first_index for i, v in enumerate(dm2)}

dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)


# Prepare General_Category / Script mapping arrays

# Bidirectional mapping: index -> name and name -> index.
gc_order = dict()
for i, v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
                       'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
                       'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',)):
    gc_order[i] = v
    gc_order[v] = i

# Script tags are numbered in the order their HB_SCRIPT_* lines appear in
# hb-common.h; sc_order maps both tag -> index and index -> tag.
sc_order = dict()
sc_array = []
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# Read with an explicit encoding so the result does not depend on the locale,
# and close the file deterministically.
with open(hb_common_h, encoding='utf-8') as header:
    for line in header:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        tag = ''.join(m.group(i) for i in range(2, 6))
        i = len(sc_array)
        sc_order[tag] = i
        sc_order[i] = tag
        sc_array.append(name)


# Write out main data

DEFAULT = 'DEFAULT'
COMPACT = 'COMPACT'
SLOPPY = 'SLOPPY'

compression_level = {
    DEFAULT: 5,
    COMPACT: 9,
    SLOPPY: 9,
}

# Preprocessor line that opens each variant of the generated tables.
step_guard = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#elif !defined(HB_NO_UCD_UNASSIGNED)',
    SLOPPY: '#else',
}

logging.info('Generating output...')
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * ./gen-ucd-table.py ucd.nounihan.grouped.xml")
print(" *")
print(" * on file with this description:", ucdxml.description)
print(" */")
print()
print("#ifndef HB_UCD_TABLE_HH")
print("#define HB_UCD_TABLE_HH")
print()
print('#include "hb.hh"')
print()


# Write mapping data

code = packTab.Code('_hb_ucd')
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
dm1_p2_array, _ = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
dm2_u32_array, _ = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
dm2_u64_array, _ = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
code.print_c(linkage='static inline')

datasets = [
    ('gc', gc, 'Cn', gc_order),
    ('ccc', ccc, 0, None),
    ('bmg', bmg, 0, None),
    ('sc', sc, 'Zzzz', sc_order),
    ('dm', dm, None, dm_order),
]


def _fill_unassigned(values, unassigned, block=128):
    """Overwrite *unassigned* entries of *values* in place from their neighbours.

    A forward pass copies the previous entry into every unassigned slot that
    is not the first of its *block*-aligned group; a backward pass then copies
    the next entry into every still-unassigned slot that is not the last of
    its group.  Values are never copied across a group boundary, so a group
    consisting solely of unassigned entries is left untouched.
    """
    for i in range(len(values)):
        if (i % block) and values[i] == unassigned:
            values[i] = values[i - 1]
    for i in range(len(values) - 2, -1, -1):
        if ((i + 1) % block) and values[i] == unassigned:
            values[i] = values[i + 1]


# Write main data

for step in (DEFAULT, COMPACT, SLOPPY):
    compression = compression_level[step]
    logging.info(' Compression=%d:', compression)
    print()
    print(step_guard[step])
    print()

    if step == SLOPPY:
        # The sloppy variant gives up exact answers for unassigned code
        # points.  This mutates gc and sc in place, which is safe only
        # because SLOPPY is the last step.
        _fill_unassigned(gc, 'Cn')
        _fill_unassigned(sc, 'Zzzz')

    code = packTab.Code('_hb_ucd')

    for name, data, default, mapping in datasets:
        sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
        logging.info(' Dataset=%-8s FullCost=%d', name, sol.fullCost)
        sol.genCode(code, name)

    code.print_c(linkage='static inline')

    print()


print('#endif')
print()

print()
print("#endif /* HB_UCD_TABLE_HH */")
print()
print("/* == End of generated table == */")
logging.info('Done.')