xref: /aosp_15_r20/external/harfbuzz_ng/src/gen-ucd-table.py (revision 2d1272b857b1f7575e6e246373e1cb218663db8a)
1*2d1272b8SAndroid Build Coastguard Worker#!/usr/bin/env python3
2*2d1272b8SAndroid Build Coastguard Worker
3*2d1272b8SAndroid Build Coastguard Worker"""usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]
4*2d1272b8SAndroid Build Coastguard Worker
5*2d1272b8SAndroid Build Coastguard WorkerInput file:
6*2d1272b8SAndroid Build Coastguard Worker* https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
7*2d1272b8SAndroid Build Coastguard Worker"""
8*2d1272b8SAndroid Build Coastguard Worker
9*2d1272b8SAndroid Build Coastguard Workerimport sys, re
10*2d1272b8SAndroid Build Coastguard Workerimport logging
11*2d1272b8SAndroid Build Coastguard Workerlogging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
12*2d1272b8SAndroid Build Coastguard Worker
# The script takes one mandatory argument (the UCDXML file) and one optional
# argument (path to hb-common.h).  Anything else prints the usage text held
# in the module docstring and exits.
if not 2 <= len(sys.argv) <= 3:
    sys.exit(__doc__)
15*2d1272b8SAndroid Build Coastguard Worker
16*2d1272b8SAndroid Build Coastguard Worker# https://github.com/harfbuzz/packtab
17*2d1272b8SAndroid Build Coastguard Workerimport packTab
18*2d1272b8SAndroid Build Coastguard Workerimport packTab.ucdxml
19*2d1272b8SAndroid Build Coastguard Worker
# Parse the UCDXML file named by the first command-line argument and pull the
# repertoire out of it.  The rest of the script indexes `ucd` by position and
# treats that position as the code point.
logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# Path of the header that declares the HB_SCRIPT_* values: the optional second
# argument, falling back to 'hb-common.h' relative to the working directory.
if len(sys.argv) >= 3:
    hb_common_h = sys.argv[2]
else:
    hb_common_h = 'hb-common.h'

logging.info('Preparing data tables...')
27*2d1272b8SAndroid Build Coastguard Worker
28*2d1272b8SAndroid Build Coastguard Worker
29*2d1272b8SAndroid Build Coastguard Worker# This is how the data is encoded:
30*2d1272b8SAndroid Build Coastguard Worker#
31*2d1272b8SAndroid Build Coastguard Worker# General_Category (gc), Canonical_Combining_Class (ccc),
32*2d1272b8SAndroid Build Coastguard Worker# and Script (sc) are encoded as integers.
33*2d1272b8SAndroid Build Coastguard Worker#
34*2d1272b8SAndroid Build Coastguard Worker# Mirroring character (bmg) is encoded as difference from
35*2d1272b8SAndroid Build Coastguard Worker# the original character.
36*2d1272b8SAndroid Build Coastguard Worker#
37*2d1272b8SAndroid Build Coastguard Worker# Composition & Decomposition (dm) are encoded elaborately,
38*2d1272b8SAndroid Build Coastguard Worker# as discussed below.
39*2d1272b8SAndroid Build Coastguard Worker
# One entry per element of `ucd`, in the same order.
gc = [entry['gc'] for entry in ucd]
ccc = [int(entry['ccc']) for entry in ucd]
# Mirror: signed distance from the code point to the value of its hex 'bmg'
# field, or 0 when that field is empty.
bmg = [
    int(mirror, 16) - int(cp) if mirror else 0
    for cp, mirror in enumerate(entry['bmg'] for entry in ucd)
]
sc = [entry['sc'] for entry in ucd]
44*2d1272b8SAndroid Build Coastguard Worker
45*2d1272b8SAndroid Build Coastguard Worker
46*2d1272b8SAndroid Build Coastguard Worker# Prepare Compose / Decompose data
47*2d1272b8SAndroid Build Coastguard Worker#
48*2d1272b8SAndroid Build Coastguard Worker# This code is very dense.  See hb_ucd_compose() / hb_ucd_decompose() for the logic.
49*2d1272b8SAndroid Build Coastguard Worker
# dm: code point -> tuple of code points parsed from the hex 'dm' field.
# Left out are entries whose 'dm' is '#', whose 'dt' is not 'can', and the
# 11172 code points starting at U+AC00 (the Hangul syllables range).
dm = {cp: tuple(int(h, 16) for h in entry['dm'].split())
      for cp, entry in enumerate(ucd)
      if entry['dm'] != '#' and entry['dt'] == 'can'
      and not (0xAC00 <= cp < 0xAC00 + 11172)}
# ce: code points whose 'Comp_Ex' attribute is 'Y'.
ce = {cp for cp, entry in enumerate(ucd) if entry['Comp_Ex'] == 'Y'}

# Every decomposition must be one or two code points long.  This is checked
# on the length directly: testing the truthiness of the tuple, as an
# `any(v for v in ... if ...)` form does, lets an empty decomposition through.
assert all(len(seq) in (1, 2) for seq in dm.values())

# Singleton decompositions, de-duplicated and sorted.  Their targets must sit
# in plane 0 or plane 2; the low 16 bits are emitted into one array per plane.
dm1 = sorted({seq for seq in dm.values() if len(seq) == 1})
assert all((seq[0] >> 16) in (0, 2) for seq in dm1)
dm1_p0_array = ['0x%04Xu' % (seq[0] & 0xFFFF) for seq in dm1 if (seq[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (seq[0] & 0xFFFF) for seq in dm1 if (seq[0] >> 16) == 2]
# Index 0 is reserved for "no decomposition", so singletons start at 1.
dm1_order = {seq: index for index, seq in enumerate(dm1, 1)}

# Pair decompositions.  Each element is ((first, second, composed), (first, second)),
# where `composed` is the source code point, or 0 when that code point is in
# `ce` or has a non-zero combining class.  The list is sorted by the triple.
dm2 = sorted((pair + (cp if cp not in ce and not ccc[cp] else 0,), pair)
             for cp, pair in dm.items() if len(pair) == 2)


def filt(v):
    """Return True when triple *v* qualifies for the compact 32-bit array.

    The first value must be below 0x800, the second must lie in
    0x0300..0x037F, and the third must have no bits set under mask 0xFFF0C000.
    """
    # NOTE(review): the third mask leaves bits 16-19 unchecked although the
    # macro name HB_CODEPOINT_ENCODE3_11_7_14 suggests a 14-bit field --
    # confirm against the macro definition.
    return ((v[0] & 0xFFFFF800) == 0x0000 and
            (v[1] & 0xFFFFFF80) == 0x0300 and
            (v[2] & 0xFFF0C000) == 0x0000)


dm2_u32_entries = [item for item in dm2 if filt(item[0])]
dm2_u64_entries = [item for item in dm2 if not filt(item[0])]
# The indices handed out below follow `dm2`, so the compact entries have to
# form a prefix of the sorted list.
assert dm2_u32_entries + dm2_u64_entries == dm2
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % item[0]
                 for item in dm2_u32_entries]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % item[0]
                 for item in dm2_u64_entries]

# Pair indices continue after the reserved 0 and all singleton indices.
dm2_first_index = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {item[1]: index for index, item in enumerate(dm2, dm2_first_index)}

# Combined lookup: None -> 0, then singletons, then pairs.
dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)
79*2d1272b8SAndroid Build Coastguard Worker
80*2d1272b8SAndroid Build Coastguard Worker
81*2d1272b8SAndroid Build Coastguard Worker# Prepare General_Category / Script mapping arrays
82*2d1272b8SAndroid Build Coastguard Worker
# Two-way lookup between General_Category abbreviations and their integer
# codes: gc_order[code] -> abbreviation and gc_order[abbreviation] -> code.
# The code of a category is its position in this tuple.
gc_names = (
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
    'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
)
gc_order = {}
for code_value, category in enumerate(gc_names):
    gc_order[code_value] = category
    gc_order[category] = code_value
89*2d1272b8SAndroid Build Coastguard Worker
# Build the script mapping from the hb-common.h header.
#
# Every line that contains an HB_SCRIPT_* identifier followed by
# HB_TAG ('a','b','c','d') counts as one script.  Scripts are numbered in
# order of appearance:
#   sc_order  maps tag -> index and index -> tag (both directions, one dict);
#   sc_array  holds the HB_SCRIPT_* identifier for each index.
sc_order = dict()
sc_array = []
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# Open through a context manager so the file is closed as soon as parsing is
# done, and decode explicitly so the result does not depend on the locale's
# default encoding.
with open(hb_common_h, encoding='utf-8') as header:
    for line in header:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        # Groups 2..5 are the four tag characters.
        tag = ''.join(m.group(i) for i in range(2, 6))
        i = len(sc_array)
        sc_order[tag] = i
        sc_order[i] = tag
        sc_array.append(name)
102*2d1272b8SAndroid Build Coastguard Worker
103*2d1272b8SAndroid Build Coastguard Worker
104*2d1272b8SAndroid Build Coastguard Worker# Write out main data
105*2d1272b8SAndroid Build Coastguard Worker
# Names of the three table variants that get generated, and the compression
# level passed to packTab for each of them.
DEFAULT, COMPACT, SLOPPY = 'DEFAULT', 'COMPACT', 'SLOPPY'

compression_level = {DEFAULT: 5, COMPACT: 9, SLOPPY: 9}
115*2d1272b8SAndroid Build Coastguard Worker
logging.info('Generating output...')

# Banner comment recording how the table was produced.
print(
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " *   ./gen-ucd-table.py ucd.nounihan.grouped.xml",
    " *",
    sep='\n',
)
print(" * on file with this description:", ucdxml.description)

# End of the banner, then the include guard and the include, each followed by
# a blank line.
print(
    " */",
    "",
    "#ifndef HB_UCD_TABLE_HH",
    "#define HB_UCD_TABLE_HH",
    "",
    '#include "hb.hh"',
    "",
    sep='\n',
)
131*2d1272b8SAndroid Build Coastguard Worker
132*2d1272b8SAndroid Build Coastguard Worker
133*2d1272b8SAndroid Build Coastguard Worker# Write mapping data
134*2d1272b8SAndroid Build Coastguard Worker
# Register the five lookup arrays with one packTab code object and print them.
# Each addArray() call returns a pair; its first element replaces the original
# Python list under the same name and the second element is discarded.
code = packTab.Code('_hb_ucd')
(
    (sc_array, _),
    (dm1_p0_array, _),
    (dm1_p2_array, _),
    (dm2_u32_array, _),
    (dm2_u64_array, _),
) = [
    code.addArray(c_type, c_name, values)
    for c_type, c_name, values in (
        ('hb_script_t', 'sc_map', sc_array),
        ('uint16_t', 'dm1_p0_map', dm1_p0_array),
        ('uint16_t', 'dm1_p2_map', dm1_p2_array),
        ('uint32_t', 'dm2_u32_map', dm2_u32_array),
        ('uint64_t', 'dm2_u64_map', dm2_u64_array),
    )
]
code.print_c(linkage='static inline')

# (name, data, default value, value->integer mapping) for every packed table.
datasets = [
    ('gc', gc, 'Cn', gc_order),
    ('ccc', ccc, 0, None),
    ('bmg', bmg, 0, None),
    ('sc', sc, 'Zzzz', sc_order),
    ('dm', dm, None, dm_order),
]
150*2d1272b8SAndroid Build Coastguard Worker
151*2d1272b8SAndroid Build Coastguard Worker
152*2d1272b8SAndroid Build Coastguard Worker# Write main data
153*2d1272b8SAndroid Build Coastguard Worker
def _fill_default_gaps(values, default):
    """Overwrite *default* entries of *values* in place with a neighbour's value.

    A forward pass copies from the previous entry and a backward pass copies
    from the next one.  Copies never cross a multiple-of-128 index, so a run
    of 128 entries that holds only *default* is left untouched.
    """
    size = len(values)
    for pos in range(size):
        if pos % 128 and values[pos] == default:
            values[pos] = values[pos - 1]
    for pos in reversed(range(size - 1)):
        if (pos + 1) % 128 and values[pos] == default:
            values[pos] = values[pos + 1]


# Each variant is emitted under its own preprocessor branch; the closing
# '#endif' is printed after this loop.
step_guards = (
    (DEFAULT, '#ifndef HB_OPTIMIZE_SIZE'),
    (COMPACT, '#elif !defined(HB_NO_UCD_UNASSIGNED)'),
    (SLOPPY, '#else'),
)

for step, guard in step_guards:
    compression = compression_level[step]
    logging.info('  Compression=%d:' % compression)
    print()
    print(guard)
    print()

    if step == SLOPPY:
        # `datasets` refers to these same list objects, so the in-place edit
        # changes what gets packed below.  SLOPPY is the last step, so the
        # earlier variants have already been written from the unmodified data.
        _fill_default_gaps(gc, 'Cn')
        _fill_default_gaps(sc, 'Zzzz')

    code = packTab.Code('_hb_ucd')

    for name, data, default, mapping in datasets:
        sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
        logging.info('      Dataset=%-8s FullCost=%d' % (name, sol.fullCost))
        sol.genCode(code, name)

    code.print_c(linkage='static inline')

    print()
193*2d1272b8SAndroid Build Coastguard Worker
194*2d1272b8SAndroid Build Coastguard Worker
# Close the preprocessor conditional opened by the table variants, then the
# include guard, and finish with the end-of-table marker.
print(
    '#endif',
    '',
    '',
    "#endif /* HB_UCD_TABLE_HH */",
    '',
    "/* == End of generated table == */",
    sep='\n',
)
logging.info('Done.')
203