#!/usr/bin/env python3
# flake8: noqa: F821

"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
* ms-use/IndicSyllabicCategory-Additional.txt
* ms-use/IndicPositionalCategory-Additional.txt
"""

import contextlib
import logging
import sys

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# The script takes exactly nine input files (see the module docstring, which
# doubles as the usage message).
if len(sys.argv) != 10:
    sys.exit(__doc__)

# Code points whose script (last column of the merged data) is listed here are
# dropped from the merged data before classification.
DISABLED_SCRIPTS = {
    'Arabic',
    'Lao',
    'Samaritan',
    'Syriac',
    'Thai',
}

# Index of each input file on the command line:
#   0 IndicSyllabicCategory      1 IndicPositionalCategory
#   2 ArabicShaping              3 DerivedCoreProperties
#   4 UnicodeData                5 Blocks
#   6 Scripts                    7 IndicSyllabicCategory-Additional
#   8 IndicPositionalCategory-Additional
# The ExitStack guarantees that every input file is closed once parsing is
# done, including when parsing fails part-way.
with contextlib.ExitStack() as stack:
    files = [stack.enter_context(open(x, encoding='utf-8'))
             for x in sys.argv[1:]]

    # The first two lines of every file except UnicodeData.txt (index 4) are
    # kept as its header. Because index 4 is skipped, the header of file j
    # (for j > 4) lives at headers[j - 1].
    headers = [[f.readline() for _ in range(2)]
               for j, f in enumerate(files) if j != 4]
    # For the two "-Additional" files the header continues up to the first
    # blank line.
    for j in range(7, 9):
        for line in files[j]:
            line = line.rstrip()
            if not line:
                break
            headers[j - 1].append(line)
    headers.append(["UnicodeData.txt does not have a header."])

    # unicode_data[i][codepoint] -> property value read from file i.
    # values[i][value] -> number of code points carrying that value.
    unicode_data = [{} for _ in files]
    values = [{} for _ in files]
    for i, f in enumerate(files):
        for line in f:

            # Strip trailing comments.
            j = line.find('#')
            if j >= 0:
                line = line[:j]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                continue

            # First field is either a single code point or a "start..end" range.
            uu = fields[0].split('..')
            start = int(uu[0], 16)
            if len(uu) == 1:
                end = start
            else:
                end = int(uu[1], 16)

            # ArabicShaping (2) and UnicodeData (4) carry the wanted value in
            # the third field; every other file carries it in the second.
            t = fields[1 if i not in [2, 4] else 2]

            if i == 2:
                t = 'jt_' + t
            elif i == 3 and t != 'Default_Ignorable_Code_Point':
                continue
            elif i == 7 and t == 'Consonant_Final_Modifier':
                # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
                t = 'Syllable_Modifier'
            elif i == 8 and t == 'NA':
                t = 'Not_Applicable'

            # The "-Additional" files (7, 8) are folded into the data of the
            # files they extend (0, 1).
            i0 = i if i < 7 else i - 7
            for u in range(start, end + 1):
                unicode_data[i0][u] = t
            values[i0][t] = values[i0].get(t, 0) + end - start + 1

# Default value of each of the seven merged columns, in file-index order.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
for i, v in enumerate(defaults):
    values[i][v] = values[i].get(v, 0) + 1
combined = {}
for i, d in enumerate(unicode_data):
    for u, v in d.items():
        if u not in combined:
            # Only the first four files may introduce a code point; the
            # remaining ones only annotate code points already present.
            if i >= 4:
                continue
            combined[u] = list(defaults)
        combined[u][i] = v
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Reordering_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Consonant_Initial_Postfixed',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    'Symbol_Modifier',
    'Hieroglyph',
    'Hieroglyph_Joiner',
    'Hieroglyph_Mark_Begin',
    'Hieroglyph_Mark_End',
    'Hieroglyph_Mirror',
    'Hieroglyph_Modifier',
    'Hieroglyph_Segment_Begin',
    'Hieroglyph_Segment_End',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Bottom_And_Left',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Left',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
    # Joining_Type
    'jt_C',
    'jt_D',
    'jt_L',
    'jt_R',
    'jt_T',
    'jt_U',
    'jt_X',
]


class PropertyValue:
    """A named property value that compares equal to its own name.

    Instances compare (and hash) like the plain string ``name``, so the
    string values stored in the merged data can be tested directly against
    the module-level constants created below.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __repr__(self):
        return "%s(%r)" % (type(self).__name__, self.name)

    def __eq__(self, other):
        if isinstance(other, str):
            return self.name == other
        if isinstance(other, PropertyValue):
            return self.name == other.name
        # Let Python fall back to its default handling for unrelated types
        # instead of failing with AttributeError on ``other.name``.
        return NotImplemented

    # Python 3 derives __ne__ from __eq__, so no explicit __ne__ is needed.

    def __hash__(self):
        # Must hash like the plain string so dict/set lookups agree with __eq__.
        return hash(self.name)


property_values = {}

# Expose every property name as a module-level constant (e.g. ``Virama``);
# this is why flake8's undefined-name check (F821) is disabled for this file.
for name in property_names:
    value = PropertyValue(name)
    assert value not in property_values
    assert value not in globals()
    property_values[name] = value
globals().update(property_values)


# Category predicates.
#
# Every predicate takes the same five arguments, as passed by map_to_use():
#   U    -- the code point (int)
#   UISC -- value from the Indic_Syllabic_Category data (column 0)
#   UDI  -- 'Default_Ignorable_Code_Point' or '' (column 3); used as a flag
#   UGC  -- value from UnicodeData.txt's third field (column 4)
#   AJT  -- 'jt_'-prefixed value from ArabicShaping.txt (column 2)
# map_to_use() requires that exactly one predicate in use_mapping is true
# for every code point.

def is_BASE(U, UISC, UDI, UGC, AJT):
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
                     Tone_Letter,
                     Vowel_Independent,
                     ] or
            # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
            AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
            (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                                    Consonant_Subjoined, Vowel, Vowel_Dependent]))


def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    return UISC == Brahmi_Joining_Number


def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    if UISC == Consonant_Placeholder:
        return True
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]


def is_CGJ(U, UISC, UDI, UGC, AJT):
    # Also includes VARIATION_SELECTOR and ZWJ
    return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]


def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    return ((UISC == Consonant_Final and UGC != Lo) or
            UISC == Consonant_Succeeding_Repha)


def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    return UISC == Syllable_Modifier


def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return (UISC == Consonant_Medial and UGC != Lo or
            UISC == Consonant_Initial_Postfixed)


def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    return UISC in [Nukta, Gemination_Mark, Consonant_Killer]


def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    return UISC == Consonant_Subjoined and UGC != Lo


def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
    return UISC == Consonant_With_Stacker


def is_HALANT(U, UISC, UDI, UGC, AJT):
    return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)


def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return U == 0x0DCA


def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    return UISC == Number_Joiner


def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph


def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Joiner


def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Mirror


def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Modifier


def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    return UISC in [Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin]


def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    return UISC in [Hieroglyph_Mark_End, Hieroglyph_Segment_End]


def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return (UISC == Invisible_Stacker
            and not is_SAKOT(U, UISC, UDI, UGC, AJT)
            )


def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    return UISC == Non_Joiner


def is_OTHER(U, UISC, UDI, UGC, AJT):
    # Also includes BASE_IND and SYM
    return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
            and not is_BASE(U, UISC, UDI, UGC, AJT)
            and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
            and not is_CGJ(U, UISC, UDI, UGC, AJT)
            and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
            and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
            )


def is_REORDERING_KILLER(U, UISC, UDI, UGC, AJT):
    return UISC == Reordering_Killer


def is_REPHA(U, UISC, UDI, UGC, AJT):
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]


def is_SAKOT(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return U == 0x1A60


def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    return UISC == Symbol_Modifier


def is_VOWEL(U, UISC, UDI, UGC, AJT):
    return (UISC == Pure_Killer or
            UGC != Lo and UISC in [Vowel, Vowel_Dependent])


def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
            UGC != Lo and UISC == Bindu)


def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
    # Also includes Rsv
    return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
            and UISC == Other
            and not is_CGJ(U, UISC, UDI, UGC, AJT)
            ) or UGC == Cn


# USE category tag -> predicate deciding membership in that category.
use_mapping = {
    'B': is_BASE,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'IS': is_INVISIBLE_STACKER,
    'G': is_HIEROGLYPH,
    'HM': is_HIEROGLYPH_MOD,
    'HR': is_HIEROGLYPH_MIRROR,
    'J': is_HIEROGLYPH_JOINER,
    'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
    'SE': is_HIEROGLYPH_SEGMENT_END,
    'ZWNJ': is_ZWNJ,
    'O': is_OTHER,
    'RK': is_REORDERING_KILLER,
    'R': is_REPHA,
    'Sk': is_SAKOT,
    'SM': is_SYM_MOD,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
    'WJ': is_Word_Joiner,
}

# USE category tag -> {position suffix: positional categories selecting it}.
# A category mapped to None may carry a positional category but gets no
# suffix. A category missing from this dict must have a positional category
# of Not_Applicable or Visual_Order_Left (see the assertion in map_to_use).
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'HM': None,
    'HR': None,
    'HVM': None,
    'IS': None,
    'B': None,
    'FM': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Not_Applicable],
    },
    'R': None,
    'RK': None,
    'SUB': None,
}


def map_to_use(data):
    """Classify every code point in *data* into a USE category.

    *data* maps a code point to its seven merged columns
    (UISC, UIPC, AJT, UDI, UGC, UBlock, script).  Returns a dict mapping each
    code point to ``(USE, UBlock)``, where ``USE`` is the category tag from
    ``use_mapping``, extended with a position suffix when ``use_positions``
    defines one for that category.

    Raises AssertionError when a code point matches zero or several
    categories, or when its positional category cannot be resolved.
    """
    out = {}
    items = use_mapping.items()
    for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x1CE2 <= U <= 0x1CE8:
            UISC = Cantillation_Mark

        # Tibetan:
        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F:
            UISC = Vowel_Dependent

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED:
            UISC = Tone_Mark

        # Exactly one category predicate must accept the code point.
        matches = [k for k, v in items if v(U, UISC, UDI, UGC, AJT)]
        assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
        # and https://github.com/harfbuzz/harfbuzz/issues/1631
        if U in [0x11302, 0x11303, 0x114C1]:
            UIPC = Top

        # TODO: https://github.com/microsoft/font-tools/issues/17#issuecomment-2346952091
        if U == 0x113CF:
            UIPC = Bottom

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                U in {0x0F7F, 0x11A3A} or
                USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            # Exactly one position suffix must match the positional category.
            suffixes = [k for k, v in pos_mapping.items() if v and UIPC in v]
            assert len(suffixes) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, suffixes)
            USE = USE + suffixes[0]

        out[U] = (USE, UBlock)
    return out


use_data = map_to_use(combined)

print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(" * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format(sys.argv[0]))
print(" *")
print(" * on files with these headers:")
print(" *")
for h in headers:
    for header_line in h:
        print(" * %s" % (header_line.strip()))
print(" */")
print()
print("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
print("#define HB_OT_SHAPER_USE_TABLE_HH")
print()
print('#include "hb.hh"')
print()
print('#include "hb-ot-shaper-use-machine.hh"')
print()

# Running totals maintained by print_block().
total = 0
used = 0
last_block = None


def print_block(block, start, end, use_data):
    """Print the categories of code points start..end as rows of 16 entries.

    NOTE(review): nothing in this script calls this function; the table is
    emitted through packTab below.  Kept in case it is still wanted.

    Updates the module-level ``total``, ``used`` and ``last_block``.
    Raises AssertionError unless ``start`` and ``end + 1`` are multiples of 8.
    """
    global total, used, last_block
    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)
    if start % 16:
        print(' ' * (20 + (start % 16 * 6)), end='')
    num = 0
    assert start % 8 == 0
    assert (end + 1) % 8 == 0
    for u in range(start, end + 1):
        if u % 16 == 0:
            print()
            print(" /* %04X */" % u, end='')
        if u in use_data:
            num += 1
        d = use_data.get(u)
        if d is not None:
            d = d[0]
        elif u in unicode_data[4]:
            # Listed in UnicodeData.txt but not classified.
            d = 'O'
        else:
            d = 'WJ'
        print("%6s," % d, end='')

    total += end - start + 1
    used += num
    if block:
        last_block = block


# Emit one macro per category tag; categories with position suffixes get one
# macro per (category, suffix) combination instead of a bare one.
print('#pragma GCC diagnostic push')
print('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k, v in sorted(use_mapping.items()):
    if k in use_positions and use_positions[k]:
        continue
    # v.__name__[3:] drops the "is_" prefix of the predicate name.
    print("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
for k, v in sorted(use_positions.items()):
    if not v:
        continue
    for suf in v.keys():
        tag = k + suf
        print("#define %s USE(%s)" % (tag, tag))
print('#pragma GCC diagnostic pop')
print("")


# Imported here rather than at the top so that the usage message and the data
# consistency checks above still work when packTab is not installed.
import packTab
data = {u: v[0] for u, v in use_data.items()}

# The table is generated twice, at two packTab compression levels, and the
# two versions are selected in the output by HB_OPTIMIZE_SIZE.
DEFAULT = 5
COMPACT = 9
for compression in (DEFAULT, COMPACT):

    logging.info(' Compression=%d:', compression)
    print()
    if compression == DEFAULT:
        print('#ifndef HB_OPTIMIZE_SIZE')
    elif compression == COMPACT:
        print('#else')
    else:
        assert False
    print()

    code = packTab.Code('hb_use')
    sol = packTab.pack_table(data, compression=compression, default='O')
    logging.info(' FullCost=%d', sol.fullCost)
    sol.genCode(code, 'get_category')
    code.print_c(linkage='static inline')
    print()

print('#endif')

# Undefine every macro defined above.
print()
for k in sorted(use_mapping.keys()):
    if k in use_positions and use_positions[k]:
        continue
    print("#undef %s" % k)
for k, v in sorted(use_positions.items()):
    if not v:
        continue
    for suf in v.keys():
        tag = k + suf
        print("#undef %s" % tag)
print()
print()
print("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
print("/* == End of generated table == */")