#!/usr/bin/env python3

"""usage: ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
"""

import os.path
import sys

# Maps a code point to the name of the block that contains it.  Filled in by
# read_blocks() and consulted by print_joining_table().
blocks = {}

# The ArabicShaping.txt preamble ends with a line containing this run of '#'.
_PREAMBLE_END_MARKER = '##################'

# Code points closer together than this are stored in one contiguous table
# range; the holes between them are padded with JOINING_TYPE_X.
_MAX_RANGE_GAP = 1 + 16 * 5

# Contextual forms recognised in the decomposition field of UnicodeData.txt.
_CONTEXTUAL_SHAPES = ('initial', 'medial', 'isolated', 'final')

# For a ligature of a given shape: the contextual form taken by its first and
# by its last component.  Components in between always take the medial form.
_LIGATURE_COMPONENT_FORMS = {
    'isolated': ('initial', 'final'),
    'final': ('medial', 'final'),
    'initial': ('initial', 'medial'),
}


def read_blocks(f):
    """Populate the module-level ``blocks`` map from Blocks.txt lines.

    ``f`` is an iterable of text lines.  Anything after a ``#`` is ignored,
    and so are lines without a ``;`` separator.  Each remaining line is
    ``START..END; Name`` (or ``CODE; Name``) with hexadecimal code points;
    every code point in the inclusive range is mapped to ``Name``.
    """
    for line in f:
        line = line.partition('#')[0]

        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            continue

        bounds = fields[0].split('..')
        start = int(bounds[0], 16)
        end = start if len(bounds) == 1 else int(bounds[1], 16)

        blocks.update(dict.fromkeys(range(start, end + 1), fields[1]))


def _coalesce_ranges(code_points, max_gap=_MAX_RANGE_GAP):
    """Group sorted code points into [first, last] ranges with small gaps."""
    ranges = []
    for u in code_points:
        if ranges and u - ranges[-1][1] <= max_gap:
            ranges[-1][1] = u
        else:
            ranges.append([u, u])
    return ranges


def print_joining_table(f):
    """Print the C joining table and its ``joining_type`` lookup function.

    ``f`` is an iterable of ArabicShaping.txt data lines
    (``code; name; joining type; joining group``).  Lines starting with ``#``
    and lines without a ``;`` are skipped.  The module-level ``blocks`` map
    must already cover every listed code point (see read_blocks()); a missing
    one raises KeyError.
    """
    values = {}
    for line in f:
        if line.startswith('#'):
            continue

        fields = [x.strip() for x in line.split(';')]
        if len(fields) == 1:
            continue

        u = int(fields[0], 16)

        # These two joining groups get their own value; everything else is
        # classified by joining type only.
        if fields[3] in ("ALAPH", "DALATH RISH"):
            value = "JOINING_GROUP_" + fields[3].replace(' ', '_')
        else:
            value = "JOINING_TYPE_" + fields[2]
        values[u] = value

    # Abbreviate each value to the initials of the words after the
    # JOINING_TYPE_ / JOINING_GROUP_ prefix; the table cells use these.
    short_value = {}
    for value in sorted(set(values.values()) | {'JOINING_TYPE_X'}):
        short = ''.join(x[0] for x in value.split('_')[2:])
        assert short not in short_value.values(), \
            "abbreviation %r is ambiguous" % short
        short_value[value] = short

    print()
    for value, short in short_value.items():
        print("#define %s %s" % (short, value))

    uu = sorted(values)
    num = len(values)
    all_blocks = {blocks[u] for u in uu}
    ranges = _coalesce_ranges(uu)

    print()
    print("static const uint8_t joining_table[] =")
    print("{")
    last_block = None
    offset = 0
    for start, end in ranges:

        print()
        print("#define joining_offset_0x%04xu %d" % (start, offset))

        for u in range(start, end + 1):

            block = blocks.get(u, last_block)
            value = values.get(u, "JOINING_TYPE_X")

            # Start a new commented section at the beginning of a range and
            # whenever the block changes inside one.
            if block != last_block or u == start:
                if u != start:
                    print()
                if block in all_blocks:
                    print("\n /* %s */" % block)
                else:
                    print("\n /* FILLER */")
                last_block = block
                if u % 32 != 0:
                    # The section starts mid-row: pad up to this column.
                    print()
                    print(" /* %04X */" % (u // 32 * 32), " " * (u % 32), end="")

            if u % 32 == 0:
                print()
                print(" /* %04X */ " % u, end="")
            print("%s," % short_value[value], end="")
        print()

        offset += end - start + 1
    print()
    occupancy = num * 100. / offset
    print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    print()

    page_bits = 12
    print()
    print("static unsigned int")
    print("joining_type (hb_codepoint_t u)")
    print("{")
    print(" switch (u >> %d)" % page_bits)
    print(" {")
    # Emit a case for every page a range touches, including pages that lie
    # strictly between its first and last page; otherwise code points on
    # those pages would fall through to the default branch.
    pages = set()
    for start, end in ranges:
        pages.update(range(start >> page_bits, (end >> page_bits) + 1))
    for p in sorted(pages):
        print(" case 0x%0Xu:" % p)
        for start, end in ranges:
            if not (start >> page_bits) <= p <= (end >> page_bits):
                continue
            offset_name = "joining_offset_0x%04xu" % start
            print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return joining_table[u - 0x%04Xu + %s];" % (start, end, start, offset_name))
        print(" break;")
        print("")
    print(" default:")
    print(" break;")
    print(" }")
    print(" return X;")
    print("}")
    print()
    for short in short_value.values():
        print("#undef %s" % (short))
    print()


LIGATURES = (
    0xF2EE, 0xFC08, 0xFC0E, 0xFC12, 0xFC32, 0xFC3F, 0xFC40, 0xFC41, 0xFC42,
    0xFC44, 0xFC4E, 0xFC5E, 0xFC60, 0xFC61, 0xFC62, 0xFC6A, 0xFC6D, 0xFC6F,
    0xFC70, 0xFC73, 0xFC75, 0xFC86, 0xFC8F, 0xFC91, 0xFC94, 0xFC9C, 0xFC9D,
    0xFC9E, 0xFC9F, 0xFCA1, 0xFCA2, 0xFCA3, 0xFCA4, 0xFCA8, 0xFCAA, 0xFCAC,
    0xFCB0, 0xFCC9, 0xFCCA, 0xFCCB, 0xFCCC, 0xFCCD, 0xFCCE, 0xFCCF, 0xFCD0,
    0xFCD1, 0xFCD2, 0xFCD3, 0xFCD5, 0xFCDA, 0xFCDB, 0xFCDC, 0xFCDD, 0xFD30,
    0xFD88, 0xFEF5, 0xFEF6, 0xFEF7, 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC,
    0xF201, 0xF211, 0xF2EE,
)

# Same members as LIGATURES, for constant-time membership tests.
_LIGATURE_SET = frozenset(LIGATURES)


def _print_ligature_table(set_name, entry_name, table_name, n_components,
                          row_format, ligas, names):
    """Print one C ligature table.

    ``ligas`` maps a first glyph to a list of rows
    ``(component, ..., ligature)``; ``names`` maps a ligature to the name used
    in the row comment.  ``row_format`` formats one row followed by its name.
    Raises ValueError if ``ligas`` is empty.
    """
    max_i = max(len(rows) for rows in ligas.values())
    print()
    print("static const struct %s {" % set_name)
    print(" uint16_t first;")
    print(" struct %s {" % entry_name)
    print(" uint16_t components[%d];" % n_components)
    print(" uint16_t ligature;")
    print(" } ligatures[%d];" % max_i)
    print("} %s[] =" % table_name)
    print("{")
    for first in sorted(ligas):

        print(" { 0x%04Xu, {" % (first))
        for row in ligas[first]:
            print(row_format % (row + (names[row[-1]],)))
        print(" }},")

    print("};")
    print()


def print_shaping_table(f):
    """Print the C shaping table and the three ligature tables.

    ``f`` is the opened UnicodeData.txt.  Three private-use ligature records
    are appended to its lines before processing.  Only records whose
    decomposition is tagged ``<initial>``, ``<medial>``, ``<isolated>`` or
    ``<final>`` are used; multi-component decompositions are kept only when
    the code point is listed in ``LIGATURES``.

    Raises ``Exception`` for a ligature with an unexpected shape or component
    count, KeyError when a ligature needs a contextual form that was not
    seen, and ValueError when no shapes or no ligatures of some kind exist.
    """
    shapes = {}
    ligatures = {}
    names = {}
    lines = f.readlines()
    lines += [
        "F201;PUA ARABIC LIGATURE LELLAH ISOLATED FORM;Lo;0;AL;<isolated> 0644 0644 0647;;;;N;;;;;",
        "F211;PUA ARABIC LIGATURE LAM WITH MEEM WITH JEEM INITIAL FORM;Lo;0;AL;<initial> 0644 0645 062C;;;;N;;;;;",
        "F2EE;PUA ARABIC LIGATURE SHADDA WITH FATHATAN ISOLATED FORM;Lo;0;AL;<isolated> 0020 064B 0651;;;;N;;;;;",
    ]
    for line in lines:

        fields = [x.strip() for x in line.split(';')]
        # Skip blank/short lines and records without a tagged decomposition.
        if len(fields) < 6 or not fields[5].startswith('<'):
            continue

        tag, *components = fields[5].split(' ')
        shape = tag[1:-1]
        if shape not in _CONTEXTUAL_SHAPES:
            continue

        items = tuple(int(x, 16) for x in components)
        c = int(fields[0], 16)

        if len(items) != 1:
            # Mark ligatures start with space and are in visual order, so we
            # remove the space and reverse the items.
            if items[0] == 0x0020:
                items = items[:0:-1]
                shape = None
            # We only care about a subset of ligatures
            if c not in _LIGATURE_SET:
                continue

            # Save ligature
            names[c] = fields[1]
            ligatures.setdefault(items, {})[shape] = c
        else:
            # Save shape; the base character is named by the common prefix
            # of the names of all of its contextual forms.
            base = items[0]
            if base not in names:
                names[base] = fields[1]
            else:
                names[base] = os.path.commonprefix([names[base], fields[1]]).strip()
            shapes.setdefault(base, {})[shape] = c

    print()
    print("static const uint16_t shaping_table[][4] =")
    print("{")

    min_u, max_u = min(shapes), max(shapes)
    for u in range(min_u, max_u + 1):
        forms = shapes.get(u, {})
        s = [forms.get(shape, 0)
             for shape in ('initial', 'medial', 'final', 'isolated')]
        value = ', '.join("0x%04Xu" % c for c in s)
        print(" {%s}, /* U+%04X %s */" % (value, u, names.get(u, "")))

    print("};")
    print()
    print("#define SHAPING_TABLE_FIRST 0x%04Xu" % min_u)
    print("#define SHAPING_TABLE_LAST 0x%04Xu" % max_u)
    print()

    ligas_2 = {}
    ligas_3 = {}
    ligas_mark_2 = {}
    for key, by_shape in ligatures.items():
        for shape, c in by_shape.items():
            if len(key) not in (2, 3):
                raise Exception("Unexpected number of ligature components", key)
            if shape is None and len(key) == 2:
                # Mark ligatures are keyed by the marks themselves.
                ligas_mark_2.setdefault(key[0], []).append((key[1], c))
                continue
            if shape not in _LIGATURE_COMPONENT_FORMS:
                raise Exception("Unexpected shape", shape)

            # Key the ligature by the contextual forms of its components.
            first_form, last_form = _LIGATURE_COMPONENT_FORMS[shape]
            component_forms = [first_form] + ['medial'] * (len(key) - 2) + [last_form]
            liga = tuple(shapes[u][form] for u, form in zip(key, component_forms))

            table = ligas_3 if len(key) == 3 else ligas_2
            table.setdefault(liga[0], []).append(liga[1:] + (c,))

    _print_ligature_table(
        "ligature_set_t", "ligature_pairs_t", "ligature_table", 1,
        " { {0x%04Xu}, 0x%04Xu }, /* %s */", ligas_2, names)
    _print_ligature_table(
        "ligature_mark_set_t", "ligature_pairs_t", "ligature_mark_table", 1,
        " { {0x%04Xu}, 0x%04Xu }, /* %s */", ligas_mark_2, names)
    _print_ligature_table(
        "ligature_3_set_t", "ligature_triplets_t", "ligature_3_table", 2,
        " { {0x%04Xu, 0x%04Xu}, 0x%04Xu}, /* %s */", ligas_3, names)


def _skip_preamble(f):
    """Advance ``f`` past the line that ends the ArabicShaping.txt preamble.

    Exits with an error message when the separator line is missing instead
    of polling readline() forever at end of file.
    """
    for line in iter(f.readline, ''):
        if _PREAMBLE_END_MARKER in line:
            return
    sys.exit("ArabicShaping.txt: preamble separator line not found")


def main(argv=None):
    """Generate the table on standard output.

    ``argv`` holds the paths of ArabicShaping.txt, UnicodeData.txt and
    Blocks.txt, in that order; it defaults to ``sys.argv[1:]``.  Exits with
    the usage text when the number of arguments is wrong.
    """
    if argv is None:
        argv = sys.argv[1:]
    if len(argv) != 3:
        sys.exit(__doc__)

    arabic_shaping_path, unicode_data_path, blocks_path = argv
    with open(arabic_shaping_path, encoding='utf-8') as arabic_shaping, \
            open(unicode_data_path, encoding='utf-8') as unicode_data, \
            open(blocks_path, encoding='utf-8') as blocks_file:

        # The first two lines of each of these files are echoed into the
        # generated header comment.
        headers = [
            [arabic_shaping.readline(), arabic_shaping.readline()],
            [blocks_file.readline(), blocks_file.readline()],
            ["UnicodeData.txt does not have a header."],
        ]
        _skip_preamble(arabic_shaping)

        print("/* == Start of generated table == */")
        print("/*")
        print(" * The following table is generated by running:")
        print(" *")
        print(" * ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt")
        print(" *")
        print(" * on files with these headers:")
        print(" *")
        for header in headers:
            for header_line in header:
                print(" * %s" % (header_line.strip()))
        print(" */")
        print()
        print("#ifndef HB_OT_SHAPER_ARABIC_TABLE_HH")
        print("#define HB_OT_SHAPER_ARABIC_TABLE_HH")
        print()

        read_blocks(blocks_file)
        print_joining_table(arabic_shaping)
        print_shaping_table(unicode_data)

        print()
        print("#endif /* HB_OT_SHAPER_ARABIC_TABLE_HH */")
        print()
        print("/* == End of generated table == */")


if __name__ == "__main__":
    main()