xref: /aosp_15_r20/external/harfbuzz_ng/src/gen-arabic-table.py (revision 2d1272b857b1f7575e6e246373e1cb218663db8a)
1#!/usr/bin/env python3
2
3"""usage: ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt
4
5Input files:
6* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
7* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
8* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
9"""
10
11import os.path, sys
12
if len (sys.argv) != 4:
	sys.exit (__doc__)

# Inputs in command-line order (see the usage text):
# files[0] = ArabicShaping.txt, files[1] = UnicodeData.txt, files[2] = Blocks.txt.
files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

# Keep the first two lines of ArabicShaping.txt and of Blocks.txt; they are
# echoed into the banner of the generated table.
headers = [[files[0].readline (), files[0].readline ()], [files[2].readline (), files[2].readline ()]]
headers.append (["UnicodeData.txt does not have a header."])

# Skip the remainder of the ArabicShaping.txt preamble, up to and including
# the first line containing a run of '#' characters.  readline() returns ''
# at end of file, so bail out instead of spinning forever when the separator
# is missing.
while True:
	line = files[0].readline ()
	if not line:
		sys.exit ("%s: no '##################' separator line found" % sys.argv[1])
	if '##################' in line:
		break
blocks = {}
def read_blocks(f):
	"""Fill the module-level `blocks` map from Blocks.txt-formatted lines.

	f is any iterable of text lines.  Everything from a '#' to the end of
	a line is ignored, as are lines that contain no ';' separator.  Each
	remaining line reads "START..END; Name" (or "CODEPOINT; Name"), with
	hexadecimal code points; every code point of the inclusive range is
	mapped to Name in `blocks`.
	"""
	global blocks
	for raw in f:
		# Drop the trailing comment, if any.
		payload = raw.partition ('#')[0]

		fields = [part.strip () for part in payload.split (';')]
		if len (fields) == 1:
			continue

		bounds = fields[0].split ('..')
		first = int (bounds[0], 16)
		last = int (bounds[1], 16) if len (bounds) != 1 else first

		blocks.update (dict.fromkeys (range (first, last + 1), fields[1]))
47
def print_joining_table(f):
	"""Print the C joining table and its lookup function on stdout.

	f is an iterable of ArabicShaping.txt-formatted lines.  Lines whose
	first character is '#' and lines without a ';' separator are ignored.
	For every other line, field 0 is the hexadecimal code point, field 2
	the joining type and field 3 the joining group.

	The module-level `blocks` map must already hold a block name for
	every code point listed in f (a missing one raises KeyError); the
	names are only used for the section comments inside the table.

	Output, in order: short-name #defines, the joining_table array with
	one joining_offset_0x.... #define per contiguous range, the
	joining_type() lookup function, and the matching #undefs.
	"""

	values = {}
	for line in f:

		if line[0] == '#':
			continue

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			continue

		u = int (fields[0], 16)

		# Two joining groups get their own table value; everything else is
		# represented by its joining type.
		if fields[3] in ["ALAPH", "DALATH RISH"]:
			value = "JOINING_GROUP_" + fields[3].replace(' ', '_')
		else:
			value = "JOINING_TYPE_" + fields[2]
		values[u] = value

	# Abbreviate every value to the initials of the words after the
	# JOINING_<KIND>_ prefix; X is always present as it fills the gaps.
	short_value = {}
	for value in sorted (set (values.values ()) | {'JOINING_TYPE_X'}):
		short = ''.join(x[0] for x in value.split('_')[2:])
		assert short not in short_value.values()
		short_value[value] = short

	print ()
	for value,short in short_value.items():
		print ("#define %s	%s" % (short, value))

	uu = sorted(values.keys())
	num = len(values)
	all_blocks = set([blocks[u] for u in uu])

	# Merge code points into [start, end] ranges; a gap of up to 16*5
	# unlisted code points is absorbed into the range (and filled with X).
	last = -100000
	ranges = []
	for u in uu:
		if u - last <= 1+16*5:
			ranges[-1][-1] = u
		else:
			ranges.append([u,u])
		last = u

	print ()
	print ("static const uint8_t joining_table[] =")
	print ("{")
	last_block = None
	offset = 0
	for start,end in ranges:

		print ()
		print ("#define joining_offset_0x%04xu %d" % (start, offset))

		for u in range(start, end+1):

			block = blocks.get(u, last_block)
			value = values.get(u, "JOINING_TYPE_X")

			# Emit a section comment at the start of each range and
			# whenever the block changes.
			if block != last_block or u == start:
				if u != start:
					print ()
				if block in all_blocks:
					print ("\n  /* %s */" % block)
				else:
					print ("\n  /* FILLER */")
				last_block = block
				# Pad a row that starts mid-way so columns stay aligned
				# on multiples of 32.
				if u % 32 != 0:
					print ()
					print ("  /* %04X */" % (u//32*32), "  " * (u % 32), end="")

			if u % 32 == 0:
				print ()
				print ("  /* %04X */ " % u, end="")
			print ("%s," % short_value[value], end="")
		print ()

		offset += end - start + 1
	print ()
	occupancy = num * 100. / offset
	print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
	print ()

	page_bits = 12
	print ()
	print ("static unsigned int")
	print ("joining_type (hb_codepoint_t u)")
	print ("{")
	print ("  switch (u >> %d)" % page_bits)
	print ("  {")
	# A range needs a test in the case of EVERY page it overlaps, not just
	# the pages of its two end points; otherwise code points on the middle
	# pages of a long range would fall through to the default and yield X.
	pages = set()
	for start,end in ranges:
		pages.update (range (start >> page_bits, (end >> page_bits) + 1))
	for p in sorted(pages):
		print ("    case 0x%0Xu:" % p)
		for (start,end) in ranges:
			if not (start >> page_bits) <= p <= (end >> page_bits): continue
			offset_name = "joining_offset_0x%04xu" % start
			print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return joining_table[u - 0x%04Xu + %s];" % (start, end, start, offset_name))
		print ("      break;")
		print ("")
	print ("    default:")
	print ("      break;")
	print ("  }")
	print ("  return X;")
	print ("}")
	print ()
	for value,short in short_value.items():
		print ("#undef %s" % (short))
	print ()
155
# Code points of the multi-component presentation forms that
# print_shaping_table keeps; any other ligature it encounters is skipped.
# Each code point is listed exactly once.
LIGATURES = (
	0xFC08, 0xFC0E, 0xFC12, 0xFC32, 0xFC3F, 0xFC40, 0xFC41, 0xFC42, 0xFC44,
	0xFC4E, 0xFC5E, 0xFC60, 0xFC61, 0xFC62, 0xFC6A, 0xFC6D, 0xFC6F, 0xFC70,
	0xFC73, 0xFC75, 0xFC86, 0xFC8F, 0xFC91, 0xFC94, 0xFC9C, 0xFC9D, 0xFC9E,
	0xFC9F, 0xFCA1, 0xFCA2, 0xFCA3, 0xFCA4, 0xFCA8, 0xFCAA, 0xFCAC, 0xFCB0,
	0xFCC9, 0xFCCA, 0xFCCB, 0xFCCC, 0xFCCD, 0xFCCE, 0xFCCF, 0xFCD0, 0xFCD1,
	0xFCD2, 0xFCD3, 0xFCD5, 0xFCDA, 0xFCDB, 0xFCDC, 0xFCDD, 0xFD30, 0xFD88,
	0xFEF5, 0xFEF6, 0xFEF7, 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC,
	# Private Use Area ligatures; print_shaping_table supplies their
	# decompositions itself since UnicodeData.txt has no entries for them.
	0xF201, 0xF211, 0xF2EE,
)
166
def print_shaping_table(f):
	"""Print the Arabic shaping and ligature tables as C source on stdout.

	f is an open UnicodeData.txt-formatted file (it must provide
	readlines()).  Three Private Use Area ligature records are appended
	to its lines before parsing.  Only records whose decomposition
	(field 5) is tagged <initial>, <medial>, <final> or <isolated> are
	used:

	* a decomposition of exactly one code point records a presentation
	  form of that base character;
	* any other decomposition is a ligature, kept only when its code
	  point is listed in LIGATURES.

	Output, in order: shaping_table with the SHAPING_TABLE_FIRST/LAST
	defines, then ligature_table, ligature_mark_table and
	ligature_3_table.
	"""

	records = f.readlines()
	records += [
		"F201;PUA ARABIC LIGATURE LELLAH ISOLATED FORM;Lo;0;AL;<isolated> 0644 0644 0647;;;;N;;;;;",
		"F211;PUA ARABIC LIGATURE LAM WITH MEEM WITH JEEM INITIAL FORM;Lo;0;AL;<initial> 0644 0645 062C;;;;N;;;;;",
		"F2EE;PUA ARABIC LIGATURE SHADDA WITH FATHATAN ISOLATED FORM;Lo;0;AL;<isolated> 0020 064B 0651;;;;N;;;;;",
	]

	forms_of = {}	# base code point -> {shape: presentation form}
	liga_of = {}	# component tuple -> {shape (None for mark ligatures): ligature}
	names = {}	# code point -> name used in the generated comments

	for record in records:

		fields = [x.strip () for x in record.split (';')]
		decomposition = fields[5]
		if not decomposition.startswith ('<'):
			continue

		parts = decomposition.split (' ')
		shape = parts[0][1:-1]
		items = tuple (int (x, 16) for x in parts[1:])
		c = int (fields[0], 16)

		if shape not in ('initial', 'medial', 'isolated', 'final'):
			continue

		if len (items) == 1:
			# Presentation form of a single base character.  The name
			# kept for the base is the common prefix of all its forms'
			# names.
			base = items[0]
			if base in names:
				names[base] = os.path.commonprefix ([names[base], fields[1]]).strip ()
			else:
				names[base] = fields[1]
			forms_of.setdefault (base, {})[shape] = c
			continue

		# Mark ligatures start with space and are in visual order, so we
		# remove the space and reverse the items.
		if items[0] == 0x0020:
			items = items[:0:-1]
			shape = None
		# We only care about a subset of ligatures
		if c not in LIGATURES:
			continue

		names[c] = fields[1]
		liga_of.setdefault (items, {})[shape] = c

	print ()
	print ("static const uint16_t shaping_table[][4] =")
	print ("{")

	min_u = min (forms_of)
	max_u = max (forms_of)
	for u in range (min_u, max_u + 1):
		forms = forms_of.get (u, {})
		row = ', '.join ("0x%04Xu" % forms.get (shape, 0)
				 for shape in ('initial', 'medial', 'final', 'isolated'))
		print ("  {%s}, /* U+%04X %s */" % (row, u, names.get (u, "")))

	print ("};")
	print ()
	print ("#define SHAPING_TABLE_FIRST	0x%04Xu" % min_u)
	print ("#define SHAPING_TABLE_LAST	0x%04Xu" % max_u)
	print ()

	# Contextual form taken by the first / last component of a ligature of
	# the given shape; components in between are always medial.
	first_form = {'isolated': 'initial', 'initial': 'initial', 'final': 'medial'}
	last_form = {'isolated': 'final', 'final': 'final', 'initial': 'medial'}

	ligas_2 = {}
	ligas_3 = {}
	ligas_mark_2 = {}
	for key, by_shape in liga_of.items ():
		for shape, c in by_shape.items ():
			count = len (key)
			if count not in (2, 3):
				raise Exception ("Unexpected number of ligature components", key)
			if count == 2 and shape is None:
				# Mark ligatures are keyed on the raw code points.
				ligas_mark_2.setdefault (key[0], []).append ((key[1], c))
				continue
			if shape not in first_form:
				raise Exception ("Unexpected shape", shape)
			wanted = [first_form[shape]] + ['medial'] * (count - 2) + [last_form[shape]]
			liga = tuple ([forms_of[cp][form] for cp, form in zip (key, wanted)])
			target = ligas_3 if count == 3 else ligas_2
			target.setdefault (liga[0], []).append (liga[1:] + (c,))

	def print_ligature_table (table, set_name, entry_name, n_components, table_name, row_format):
		# Emit one struct definition plus its initializer; rows are grouped
		# by first component, in ascending order of that component.
		max_i = max (len (rows) for rows in table.values ())
		print ()
		print ("static const struct %s {" % set_name)
		print (" uint16_t first;")
		print (" struct %s {" % entry_name)
		print ("   uint16_t components[%d];" % n_components)
		print ("   uint16_t ligature;")
		print (" } ligatures[%d];" % max_i)
		print ("} %s[] =" % table_name)
		print ("{")
		for first in sorted (table):
			print ("  { 0x%04Xu, {" % (first))
			for liga in table[first]:
				print (row_format % (liga + (names[liga[-1]],)))
			print ("  }},")
		print ("};")
		print ()

	pair_row = "    { {0x%04Xu}, 0x%04Xu }, /* %s */"
	triplet_row = "    { {0x%04Xu, 0x%04Xu}, 0x%04Xu}, /* %s */"
	print_ligature_table (ligas_2, "ligature_set_t", "ligature_pairs_t", 1, "ligature_table", pair_row)
	print_ligature_table (ligas_mark_2, "ligature_mark_set_t", "ligature_pairs_t", 1, "ligature_mark_table", pair_row)
	print_ligature_table (ligas_3, "ligature_3_set_t", "ligature_triplets_t", 2, "ligature_3_table", triplet_row)
331
332
333
# Banner: records how the table was produced and from which input versions.
print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" *   ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt")
print (" *")
print (" * on files with these headers:")
print (" *")
for header in headers:
	for header_line in header:
		print (" * %s" % (header_line.strip()))
print (" */")
print ()
print ("#ifndef HB_OT_SHAPER_ARABIC_TABLE_HH")
print ("#define HB_OT_SHAPER_ARABIC_TABLE_HH")
print ()

# Blocks.txt must be read first: print_joining_table looks block names up
# in the map that read_blocks fills.
try:
	read_blocks (files[2])
	print_joining_table (files[0])
	print_shaping_table (files[1])
finally:
	# The inputs are not needed past this point; release them explicitly
	# rather than relying on interpreter shutdown.
	for input_file in files:
		input_file.close ()

print ()
print ("#endif /* HB_OT_SHAPER_ARABIC_TABLE_HH */")
print ()
print ("/* == End of generated table == */")
359