#!/usr/bin/env python3
# xref: /aosp_15_r20/external/harfbuzz_ng/src/gen-ucd-table.py (revision 2d1272b857b1f7575e6e246373e1cb218663db8a)

"""usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]

Input file:
* https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
"""

import logging
import re
import sys

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# The argument check runs before the packTab import below, so a bad
# invocation prints the usage text (the module docstring) without needing
# packTab to be importable.
if len(sys.argv) not in (2, 3):
    sys.exit(__doc__)

# https://github.com/harfbuzz/packtab
import packTab
import packTab.ucdxml

logging.info('Loading UCDXML...')
# First argument: path handed to packTab's UCDXML loader.
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
# `ucd` is iterated below with enumerate(), using the index as the code point.
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# Optional second argument: location of hb-common.h (defaults to the
# current directory).
hb_common_h = 'hb-common.h' if len(sys.argv) < 3 else sys.argv[2]
25
logging.info('Preparing data tables...')


# This is how the data is encoded:
#
# General_Category (gc), Canonical_Combining_Class (ccc),
# and Script (sc) are encoded as integers.
#
# Mirroring character (bmg) is encoded as difference from
# the original character.
#
# Composition & Decomposition (dm) are encoded elaborately,
# as discussed below.

# One entry per record of `ucd`; the list index is used as the code point.
# `gc` and `sc` must stay plain lists: the SLOPPY step later rewrites them
# in place.
gc = [u['gc'] for u in ucd]
ccc = [int(u['ccc']) for u in ucd]
# Signed offset from the code point to its mirror; 0 when 'bmg' is empty.
bmg = [int(u['bmg'], 16) - i if u['bmg'] else 0 for i, u in enumerate(ucd)]
sc = [u['sc'] for u in ucd]
44
45
# Prepare Compose / Decompose data
#
# This code is very dense.  See hb_ucd_compose() / hb_ucd_decompose() for the logic.

# Canonical decomposition mappings keyed by code point, each a tuple of
# code points.  Records without a mapping ('#'), non-canonical mappings and
# the range 0xAC00 .. 0xAC00+11172 are left out.
dm = {i: tuple(int(v, 16) for v in u['dm'].split()) for i, u in enumerate(ucd)
      if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)}
# Code points whose Comp_Ex property is 'Y'.
ce = {i for i, u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}

# The tables below only handle one- and two-element decompositions.
assert not any(v for v in dm.values() if len(v) not in (1,2))

# Singleton decompositions: unique targets, sorted, split by plane (0 or 2)
# so that each target can be stored as a 16-bit value.
dm1 = sorted({v for v in dm.values() if len(v) == 1})
assert all((v[0] >> 16) in (0,2) for v in dm1)
dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
# Index 0 is reserved for "no decomposition", so singleton indices start at 1.
dm1_order = {v: i + 1 for i, v in enumerate(dm1)}

# Pair decompositions: each entry is ((first, second, composed), (first, second)).
# `composed` is the source code point, or 0 when that code point is in `ce`
# or has a non-zero ccc.
dm2 = sorted((v+(i if i not in ce and not ccc[i] else 0,), v)
             for i,v in dm.items() if len(v) == 2)


def filt(v):
    """Return True when triple *v* passes the masks for the 32-bit encoding.

    NOTE(review): the third mask (0xFFF0C000) leaves bits 16-19 unchecked,
    so it is looser than a strict 14-bit test -- confirm against the
    definition of HB_CODEPOINT_ENCODE3_11_7_14.
    """
    return ((v[0] & 0xFFFFF800) == 0x0000 and
            (v[1] & 0xFFFFFF80) == 0x0300 and
            (v[2] & 0xFFF0C000) == 0x0000)


dm2_u32_array = [v for v in dm2 if filt(v[0])]
dm2_u64_array = [v for v in dm2 if not filt(v[0])]
# The index arithmetic below relies on every 32-bit entry sorting before
# every 64-bit entry.
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u32_array]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u64_array]

# Pair indices continue after the reserved 0 and all singleton indices.
dm2_base = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {v[1]: i + dm2_base for i, v in enumerate(dm2)}

# Combined mapping from decomposition tuple (or None) to table index.
dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)
79
80
# Prepare General_Category / Script mapping arrays

# Two-way mapping between General_Category abbreviations and their integer
# codes: gc_order['Cc'] == 0 and gc_order[0] == 'Cc'.  The code is the
# position of the abbreviation in the tuple below.
gc_order = dict()
for i, v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
                       'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
                       'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',)):
    gc_order[i] = v
    gc_order[v] = i
89
# Two-way mapping between four-letter script tags and integer codes, plus
# the list of HB_SCRIPT_* identifiers in the same order.  Codes follow the
# order in which matching lines appear in hb-common.h.
sc_order = dict()
sc_array = []
# Matches a line holding an HB_SCRIPT_* name followed by its HB_TAG (...).
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# `with` closes the header file once it has been scanned.
with open(hb_common_h) as header:
    for line in header:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        tag = ''.join(m.group(i) for i in range(2, 6))
        i = len(sc_array)
        sc_order[tag] = i
        sc_order[i] = tag
        sc_array.append(name)
102
103
# Write out main data

# Names of the three table variants emitted by the main loop.
DEFAULT = 'DEFAULT'
COMPACT = 'COMPACT'
SLOPPY  = 'SLOPPY'

# Value passed as `compression=` to packTab.pack_table for each variant.
compression_level = {
    DEFAULT: 5,
    COMPACT: 9,
    SLOPPY:  9,
}
115
logging.info('Generating output...')
# Banner comment, include guard and the hb.hh include of the generated
# header, all written to stdout.
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" *   ./gen-ucd-table.py ucd.nounihan.grouped.xml")
print(" *")
print(" * on file with this description:", ucdxml.description)
print(" */")
print()
print("#ifndef HB_UCD_TABLE_HH")
print("#define HB_UCD_TABLE_HH")
print()
print('#include "hb.hh"')
print()
131
132
# Write mapping data

# Register the lookup arrays with a packTab code object and print them.
# The names are rebound to the first element of each addArray() result;
# nothing later in this script reads them.
code = packTab.Code('_hb_ucd')
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
dm1_p2_array, _ = code.addArray('uint16_t', 'dm1_p2_map', dm1_p2_array)
dm2_u32_array, _ = code.addArray('uint32_t', 'dm2_u32_map', dm2_u32_array)
dm2_u64_array, _ = code.addArray('uint64_t', 'dm2_u64_map', dm2_u64_array)
code.print_c(linkage='static inline')

# (name, data, default value, value->integer mapping or None) for every
# table packed by the main loop.  `gc` and `sc` are held by reference, so
# the in-place rewrite done for the SLOPPY step is seen through this list.
datasets = [
    ('gc', gc, 'Cn', gc_order),
    ('ccc', ccc, 0, None),
    ('bmg', bmg, 0, None),
    ('sc', sc, 'Zzzz', sc_order),
    ('dm', dm, None, dm_order),
]
150
151
# Write main data

def _fill_unassigned(values, unassigned, block_size=128):
    """Overwrite *unassigned* entries of *values* in place with a neighbour.

    A forward pass copies the previous entry and a backward pass then copies
    the next entry.  Neither pass copies across a multiple of *block_size*,
    so a block made up entirely of *unassigned* entries is left untouched.
    """
    for i in range(len(values)):
        if (i % block_size) and values[i] == unassigned:
            values[i] = values[i - 1]
    for i in range(len(values) - 2, -1, -1):
        if ((i + 1) % block_size) and values[i] == unassigned:
            values[i] = values[i + 1]


# Preprocessor line that opens each variant's section of the output.
_STEP_GUARDS = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#elif !defined(HB_NO_UCD_UNASSIGNED)',
    SLOPPY: '#else',
}

for step in (DEFAULT, COMPACT, SLOPPY):
    compression = compression_level[step]
    logging.info('  Compression=%d:', compression)
    print()
    print(_STEP_GUARDS[step])
    print()

    if step == SLOPPY:
        # Rewrite in place: `datasets` holds references to these same lists.
        # SLOPPY is the last step, so no later variant sees the changed data.
        _fill_unassigned(gc, 'Cn')
        _fill_unassigned(sc, 'Zzzz')

    code = packTab.Code('_hb_ucd')

    for name, data, default, mapping in datasets:
        sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
        logging.info('      Dataset=%-8s FullCost=%d', name, sol.fullCost)
        sol.genCode(code, name)

    code.print_c(linkage='static inline')

    print()


# Close the #ifndef / #elif / #else chain, then the include guard.
print('#endif')
print()

print()
print("#endif /* HB_UCD_TABLE_HH */")
print()
print("/* == End of generated table == */")
logging.info('Done.')
203