xref: /aosp_15_r20/external/harfbuzz_ng/src/gen-use-table.py (revision 2d1272b857b1f7575e6e246373e1cb218663db8a)
1#!/usr/bin/env python3
2# flake8: noqa: F821
3
4"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
5
6Input files:
7* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
8* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
9* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
10* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
11* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
12* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
13* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
14* ms-use/IndicSyllabicCategory-Additional.txt
15* ms-use/IndicPositionalCategory-Additional.txt
16"""
17
18import logging
19logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
20
21
22import sys
23
# All nine input files named in the module docstring are required; anything
# else prints the usage text and stops.
if len(sys.argv[1:]) != 9:
	sys.exit(__doc__)

# Code points whose Scripts.txt value is listed here are removed from the
# combined property data before classification.
DISABLED_SCRIPTS = {'Arabic', 'Lao', 'Samaritan', 'Syriac', 'Thai'}
34
# Open the inputs in command-line order; the numeric file indices used
# throughout this script (0..8) follow the order given in the usage text.
files = [open(path, encoding='utf-8') for path in sys.argv[1:]]

# Keep the first two lines of every input so they can be echoed into the
# generated output.  File 4 (UnicodeData.txt) is skipped, which shifts the
# header slots of the files after it down by one.
headers = []
for index, handle in enumerate(files):
	if index == 4:
		continue
	headers.append([handle.readline(), handle.readline()])

# For the two "-Additional" files (7 and 8), keep reading header lines up to
# the first blank line.
for index in (7, 8):
	extra_header = headers[index - 1]
	for line in files[index]:
		line = line.rstrip()
		if not line:
			break
		extra_header.append(line)
headers.append(["UnicodeData.txt does not have a header."])
45
# unicode_data[k] maps code point -> property value for input file k, and
# values[k] counts how many code points carry each value.  The two
# "-Additional" files (7 and 8) are folded into slots 0 and 1, so their
# entries override or extend what the corresponding UCD file provided.
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
	# ArabicShaping.txt (2) and UnicodeData.txt (4) hold the value of
	# interest in their third field; every other file uses the second.
	value_field = 2 if i in (2, 4) else 1
	# Slot that receives this file's data (7 -> 0, 8 -> 1).
	i0 = i if i < 7 else i - 7

	for line in f:

		# Drop trailing comments.
		line = line.partition ('#')[0]

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			# Blank or comment-only line.
			continue

		# First field is either one hex code point or "start..end".
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		t = fields[value_field]

		if i == 2:
			# Prefix joining types so they cannot clash with other names.
			t = 'jt_' + t
		elif i == 3 and t != 'Default_Ignorable_Code_Point':
			# Only one derived core property is of interest.
			continue
		elif i == 7 and t == 'Consonant_Final_Modifier':
			# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
			t = 'Syllable_Modifier'
		elif i == 8 and t == 'NA':
			t = 'Not_Applicable'

		for u in range (start, end + 1):
			unicode_data[i0][u] = t
		values[i0][t] = values[i0].get (t, 0) + end - start + 1

# Every input has now been read to end-of-file; release the handles instead
# of leaving them open for the remainder of the run.
for f in files:
	f.close ()
82
# Fallback value for each of the seven property columns, in input-file order.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Count each default value once in the per-file tallies.
for column, fallback in enumerate (defaults):
	values[column][fallback] = values[column].get (fallback, 0) + 1

# Merge data into one dict: code point -> list with one value per column.
combined = {}
for column, table in enumerate (unicode_data):
	# Only the first four inputs may introduce a code point; the later ones
	# merely fill in columns of code points that are already present.
	may_create = column < 4
	for cp, prop in table.items ():
		row = combined.get (cp)
		if row is None:
			if not may_create:
				continue
			row = combined[cp] = list (defaults)
		row[column] = prop

# Column 6 holds the Scripts.txt value; drop everything in a disabled script.
combined = {cp: row for cp, row in combined.items() if row[6] not in DISABLED_SCRIPTS}
97
98
# Every property value name that the classification code below may refer to.
# Each name is later wrapped in a PropertyValue and published as a module
# global of the same name, which is what lets the predicates write
# expressions such as `UISC == Virama`.  Joining_Type values carry the
# `jt_` prefix that the input parser adds to them.
property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Reordering_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
	'Symbol_Modifier',
	'Hieroglyph',
	'Hieroglyph_Joiner',
	'Hieroglyph_Mark_Begin',
	'Hieroglyph_Mark_End',
	'Hieroglyph_Mirror',
	'Hieroglyph_Modifier',
	'Hieroglyph_Segment_Begin',
	'Hieroglyph_Segment_End',
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Bottom_And_Left',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
	# Joining_Type
	'jt_C',
	'jt_D',
	'jt_L',
	'jt_R',
	'jt_T',
	'jt_U',
	'jt_X',
]
177
class PropertyValue(object):
	"""A named property value that compares equal to its own name.

	Instances compare equal to a plain string holding the same name and to
	any other object exposing an equal ``name`` attribute, and they hash
	like that string, so they can be mixed freely with the raw strings
	parsed from the input files (``==``, ``in`` on lists and dict lookups).
	"""
	def __init__(self, name_):
		self.name = name_
	def __repr__(self):
		return "PropertyValue(%r)" % (self.name,)
	def __str__(self):
		return self.name
	def __eq__(self, other):
		if isinstance(other, str):
			return self.name == other
		try:
			return self.name == other.name
		except AttributeError:
			# Unrelated type (None, int, ...): let Python fall back to its
			# default comparison instead of raising from a plain `==`.
			return NotImplemented
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		# Must match hash(name) so that lookups by string keep working.
		return hash(self.name)
189
# Wrap each known property name in a PropertyValue and publish it as a module
# global of the same name, so later code can refer to e.g. `Virama` directly.
property_values = {}

for prop_name in property_names:
	prop = PropertyValue(prop_name)
	# A PropertyValue hashes and compares like its name, so these two checks
	# catch a duplicated name and a clash with an existing global.
	assert prop not in property_values
	assert prop not in globals()
	property_values[prop_name] = prop
globals().update(property_values)
198
199
def is_BASE(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is a base (USE category 'B').

	U is the code point; UISC, UDI, UGC and AJT are its
	Indic_Syllabic_Category, Default_Ignorable_Code_Point marker,
	General_Category and (``jt_``-prefixed) Joining_Type.
	"""
	if UISC in (Number, Consonant, Consonant_Head_Letter,
			Tone_Letter,
			Vowel_Independent):
		return True
	# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
	if AJT in (jt_C, jt_D, jt_L, jt_R) and UISC != Joiner:
		return True
	# Lo characters in these otherwise non-base categories count as bases.
	return UGC == Lo and UISC in (Avagraha, Bindu, Consonant_Final, Consonant_Medial,
				Consonant_Subjoined, Vowel, Vowel_Dependent)
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Brahmi_Joining_Number (USE category 'N')."""
	return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is a generic base (USE category 'GB').

	True for every Consonant_Placeholder and for six individually listed
	code points.
	"""
	return (UISC == Consonant_Placeholder or
		U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UDI, UGC, AJT):
	"""Return a truthy value for USE category 'CGJ'.

	Matches Joiner code points and default-ignorable marks (General_Category
	Mc, Me or Mn).  The result is meant to be truth-tested only: a falsy UDI
	is returned unchanged rather than converted to False.
	"""
	# Also includes VARIATION_SELECTOR and ZWJ
	if UISC == Joiner:
		return True
	return UDI and UGC in (Mc, Me, Mn)
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is a final consonant (USE category 'F').

	True for Consonant_Succeeding_Repha, and for Consonant_Final unless the
	character's General_Category is Lo.
	"""
	if UISC == Consonant_Succeeding_Repha:
		return True
	return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Syllable_Modifier (USE category 'FM')."""
	return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is a medial consonant (USE category 'M').

	True for Consonant_Initial_Postfixed, and for Consonant_Medial unless
	the character's General_Category is Lo.
	"""
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	if UISC == Consonant_Initial_Postfixed:
		return True
	return UISC == Consonant_Medial and UGC != Lo
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Nukta, Gemination_Mark or Consonant_Killer
	(USE category 'CM')."""
	return UISC in (Nukta, Gemination_Mark, Consonant_Killer)
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is a Consonant_Subjoined whose
	General_Category is not Lo (USE category 'SUB')."""
	if UGC == Lo:
		return False
	return UISC == Consonant_Subjoined
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Consonant_With_Stacker (USE category 'CS')."""
	return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is a Virama (USE category 'H').

	Code points claimed by is_HALANT_OR_VOWEL_MODIFIER are excluded.
	"""
	if is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
		return False
	return UISC == Virama
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is U+0DCA (USE category 'HVM').

	Only U is consulted; the property arguments are accepted so that all
	classifiers share one signature.
	"""
	# Split off of HALANT
	return U == 0x0DCA
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Number_Joiner (USE category 'HN')."""
	return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Hieroglyph (USE category 'G')."""
	return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Hieroglyph_Joiner (USE category 'J')."""
	return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Hieroglyph_Mirror (USE category 'HR')."""
	return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Hieroglyph_Modifier (USE category 'HM')."""
	return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Hieroglyph_Mark_Begin or
	Hieroglyph_Segment_Begin (USE category 'SB')."""
	return UISC in (Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin)
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Hieroglyph_Mark_End or Hieroglyph_Segment_End
	(USE category 'SE')."""
	return UISC in (Hieroglyph_Mark_End, Hieroglyph_Segment_End)
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is an Invisible_Stacker (USE category
	'IS').

	Code points claimed by is_SAKOT are excluded.
	"""
	# Split off of HALANT
	if is_SAKOT(U, UISC, UDI, UGC, AJT):
		return False
	return UISC == Invisible_Stacker
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Non_Joiner (USE category 'ZWNJ')."""
	return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point falls in the catch-all USE category 'O'.

	Candidates are Po characters and the Consonant_Dead, Joiner,
	Modifying_Letter and Other syllabic categories; a candidate is rejected
	if any of the more specific classifiers listed below accepts it.
	"""
	# Also includes BASE_IND and SYM
	if not (UGC == Po or UISC in (Consonant_Dead, Joiner, Modifying_Letter, Other)):
		return False
	more_specific = (is_BASE, is_BASE_OTHER, is_CGJ, is_SYM_MOD, is_Word_Joiner)
	return not any(claims(U, UISC, UDI, UGC, AJT) for claims in more_specific)
def is_REORDERING_KILLER(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Reordering_Killer (USE category 'RK')."""
	return UISC == Reordering_Killer
def is_REPHA(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Consonant_Preceding_Repha or Consonant_Prefixed
	(USE category 'R')."""
	return UISC in (Consonant_Preceding_Repha, Consonant_Prefixed)
def is_SAKOT(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is U+1A60 (USE category 'Sk').

	Only U is consulted; the property arguments are accepted so that all
	classifiers share one signature.
	"""
	# Split off of HALANT
	return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
	"""Return whether UISC is Symbol_Modifier (USE category 'SM')."""
	return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is a vowel (USE category 'V').

	True for Pure_Killer, and for Vowel / Vowel_Dependent unless the
	character's General_Category is Lo.
	"""
	if UISC == Pure_Killer:
		return True
	return UGC != Lo and UISC in (Vowel, Vowel_Dependent)
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point is a vowel modifier (USE category 'VM').

	True for Tone_Mark, Cantillation_Mark, Register_Shifter and Visarga, and
	for Bindu unless the character's General_Category is Lo.
	"""
	if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
		return True
	return UGC != Lo and UISC == Bindu
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
	"""Return whether the code point belongs to USE category 'WJ'.

	True for unassigned code points (General_Category Cn), and for
	default-ignorable code points with syllabic category Other that are not
	claimed by is_CGJ, apart from eight explicitly excluded code points.
	"""
	# Also includes Rsv
	excluded = (0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3)
	if (UDI and U not in excluded
			and UISC == Other
			and not is_CGJ(U, UISC, UDI, UGC, AJT)):
		return True
	return UGC == Cn
288
# USE category tag -> predicate deciding membership.  map_to_use() runs every
# predicate on each code point and asserts that exactly one of them accepts
# it, so the predicates must partition the input.  The tags are also emitted
# as C macro names, with the predicate name (minus its "is_" prefix) as the
# accompanying comment.
use_mapping = {
	'B':	is_BASE,
	'N':	is_BASE_NUM,
	'GB':	is_BASE_OTHER,
	'CGJ':	is_CGJ,
	'F':	is_CONS_FINAL,
	'FM':	is_CONS_FINAL_MOD,
	'M':	is_CONS_MED,
	'CM':	is_CONS_MOD,
	'SUB':	is_CONS_SUB,
	'CS':	is_CONS_WITH_STACKER,
	'H':	is_HALANT,
	'HVM':	is_HALANT_OR_VOWEL_MODIFIER,
	'HN':	is_HALANT_NUM,
	'IS':	is_INVISIBLE_STACKER,
	'G':	is_HIEROGLYPH,
	'HM':	is_HIEROGLYPH_MOD,
	'HR':	is_HIEROGLYPH_MIRROR,
	'J':	is_HIEROGLYPH_JOINER,
	'SB':	is_HIEROGLYPH_SEGMENT_BEGIN,
	'SE':	is_HIEROGLYPH_SEGMENT_END,
	'ZWNJ':	is_ZWNJ,
	'O':	is_OTHER,
	'RK':	is_REORDERING_KILLER,
	'R':	is_REPHA,
	'Sk':	is_SAKOT,
	'SM':	is_SYM_MOD,
	'V':	is_VOWEL,
	'VM':	is_VOWEL_MOD,
	'WJ':	is_Word_Joiner,
}
320
# For USE categories that are split by position: suffix -> the
# Indic_Positional_Category values selecting that suffix.  map_to_use()
# appends the single matching suffix to the category tag (e.g. 'V' + 'Abv').
# A value of None marks a category that may carry a positional category but
# is not split by it.  The suffix order inside each entry is the order in
# which the corresponding macros are emitted.
use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Bottom_And_Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'HM': None,
	'HR': None,
	'HVM': None,
	'IS': None,
	'B': None,
	'FM': {
		'Abv': [Top],
		'Blw': [Bottom],
		# Syllable modifiers without a positional category get 'Pst'.
		'Pst': [Not_Applicable],
	},
	'R': None,
	'RK': None,
	'SUB': None,
}
368
def map_to_use(data):
	"""Classify every code point of *data* into its USE category.

	*data* maps a code point to a 7-item sequence
	(UISC, UIPC, AJT, UDI, UGC, block, script).  The result maps each code
	point to ``(tag, block)``, where *tag* is the key of the one use_mapping
	predicate that accepts the code point, extended by a positional suffix
	when use_positions defines suffixes for that tag.

	An AssertionError is raised when a code point matches zero or several
	categories or suffixes, or carries a positional category for a tag that
	use_positions does not list.
	"""
	classifiers = use_mapping.items()
	result = {}
	for U, props in data.items():
		UISC, UIPC, AJT, UDI, UGC, UBlock, _ = props

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x1CE2 <= U <= 0x1CE8:
			UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if U in (0x0F18, 0x0F19, 0x0F3E, 0x0F3F):
			UISC = Vowel_Dependent

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED:
			UISC = Tone_Mark

		# Exactly one predicate must claim the code point.
		matches = [tag for tag, accepts in classifiers if accepts(U, UISC, UDI, UGC, AJT)]
		assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, matches)
		USE = matches[0]

		# Resolve Indic_Positional_Category

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		#  and https://github.com/harfbuzz/harfbuzz/issues/1631
		if U in (0x11302, 0x11303, 0x114C1):
			UIPC = Top

		# TODO: https://github.com/microsoft/font-tools/issues/17#issuecomment-2346952091
		if U == 0x113CF:
			UIPC = Bottom

		# A real positional category is only acceptable on a tag known to
		# use_positions (two code points are exempted explicitly).
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			U in {0x0F7F, 0x11A3A} or
			USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

		suffixes = use_positions.get(USE)
		if suffixes:
			hits = [suffix for suffix, positions in suffixes.items()
				if positions and UIPC in positions]
			assert len(hits) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, hits)
			USE += hits[0]

		result[U] = (USE, UBlock)
	return result
412
# Classify every remaining code point.
use_data = map_to_use(combined)

# Opening banner: how the table was generated ...
for banner_line in (
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]),
	" *",
	" * on files with these headers:",
	" *",
):
	print (banner_line)

# ... the header lines captured from each input file ...
for header_lines in headers:
	for header_line in header_lines:
		print (" * %s" % (header_line.strip()))

# ... then the include guard and the includes of the generated header.
for prologue_line in (
	" */",
	"",
	"#ifndef HB_OT_SHAPER_USE_TABLE_HH",
	"#define HB_OT_SHAPER_USE_TABLE_HH",
	"",
	'#include "hb.hh"',
	"",
	'#include "hb-ot-shaper-use-machine.hh"',
	"",
):
	print (prologue_line)
435
# Running state shared by successive print_block() calls: number of table
# cells printed, how many of them came from use_data, and the last block
# title that was printed.
total = 0
used = 0
last_block = None
def print_block (block, start, end, use_data):
	"""Print the table cells for code points start..end (inclusive).

	A title comment is printed first when *block* is non-empty and differs
	from the previously printed block.  Rows hold 16 cells and begin with a
	comment giving the row's first code point.  A cell shows the category
	from *use_data*; missing code points print 'O' when they are present in
	the UnicodeData.txt data (unicode_data[4]) and 'WJ' otherwise.

	*start* and *end* + 1 must be multiples of 8 (checked with assert).
	Updates the module-level total, used and last_block.
	"""
	global total, used, last_block

	if block and block != last_block:
		# Two blank lines, then the title on a line of its own.
		print ("\n\n  /* %s */" % block)
		misalignment = start % 16
		if misalignment:
			# Pad so the first cell lands in its column of a partial row.
			print (' ' * (20 + misalignment * 6), end='')

	assert start % 8 == 0
	assert (end+1) % 8 == 0

	covered = 0
	for u in range (start, end+1):
		if u % 16 == 0:
			print ("\n  /* %04X */" % u, end='')
		if u in use_data:
			covered += 1
		entry = use_data.get (u)
		if entry is not None:
			cell = entry[0]
		elif u in unicode_data[4]:
			cell = 'O'
		else:
			cell = 'WJ'
		print ("%6s," % cell, end='')

	total += end - start + 1
	used += covered
	if block:
		last_block = block
469
# NOTE(review): the six names below are not read again anywhere in the
# visible part of this script; they are kept as they were.
uu = sorted (use_data)

last = -100000
num = 0
offset = 0
starts, ends = [], []

# Define a short macro for every category tag used in the table, with the
# unused-macro warning silenced around the definitions.
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for tag in sorted (use_mapping):
	# Tags that are split by position are emitted with suffixes below.
	if use_positions.get (tag):
		continue
	print ("#define %s\tUSE(%s)\t/* %s */" % (tag, tag, use_mapping[tag].__name__[3:]))
for base_tag in sorted (use_positions):
	suffixes = use_positions[base_tag]
	if not suffixes:
		continue
	for suffix in suffixes:
		positioned = base_tag + suffix
		print ("#define %s\tUSE(%s)" % (positioned, positioned))
print ('#pragma GCC diagnostic pop')
print ("")
489
490
import packTab

# Code point -> category tag; the block name stored next to it is dropped.
data = {cp: entry[0] for cp, entry in use_data.items()}

DEFAULT = 5
COMPACT = 9
# Emit the lookup code twice, once per compression level: the DEFAULT variant
# under `#ifndef HB_OPTIMIZE_SIZE`, the COMPACT one in the `#else` branch.
for compression, guard in ((DEFAULT, '#ifndef HB_OPTIMIZE_SIZE'), (COMPACT, '#else')):

	logging.info('  Compression=%d:' % compression)
	print()
	print(guard)
	print()

	code = packTab.Code('hb_use')
	sol = packTab.pack_table(data, compression=compression, default='O')
	logging.info('      FullCost=%d' % (sol.fullCost))
	sol.genCode(code, 'get_category')
	code.print_c(linkage='static inline')
	print ()

print('#endif')
516
# Undefine the short category macros again, in the order they were defined,
# then close the include guard.
print ()
for tag in sorted (use_mapping):
	if use_positions.get (tag):
		continue
	print ("#undef %s" % tag)
for base_tag in sorted (use_positions):
	suffixes = use_positions[base_tag]
	if not suffixes:
		continue
	for suffix in suffixes:
		print ("#undef %s" % (base_tag + suffix))
for footer_line in (
	"",
	"",
	"#endif /* HB_OT_SHAPER_USE_TABLE_HH */",
	"/* == End of generated table == */",
):
	print (footer_line)
530