#!/usr/bin/env python

import collections
import copy
import glob
from os import path
import re
import sys
from xml.etree import ElementTree

from fontTools import ttLib

# TODO(nona): Remove hard-coded font versions and Unicode versions.
# Figure out a way of passing this information on the command line.
EMOJI_FONT_TO_UNICODE_MAP = {
    '2.034': 15.0,
    '2.042': 15.1,
    '2.047': 16.0,
}

EMOJI_VS = 0xFE0F

LANG_TO_SCRIPT = {
    'af': 'Latn',
    'as': 'Beng',
    'am': 'Ethi',
    'be': 'Cyrl',
    'bg': 'Cyrl',
    'bn': 'Beng',
    'cs': 'Latn',
    'cu': 'Cyrl',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'el': 'Grek',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gl': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'it': 'Latn',
    'ja': 'Jpan',
    'ka': 'Geor',
    'kn': 'Knda',
    'ko': 'Kore',
    'la': 'Latn',
    'lt': 'Latn',
    'lv': 'Latn',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nl': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'pl': 'Latn',
    'ru': 'Cyrl',
    'sk': 'Latn',
    'sl': 'Latn',
    'sq': 'Latn',
    'sv': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
    'uk': 'Cyrl',
}


def lang_to_script(lang_code):
    lang = lang_code.lower()
    while lang not in LANG_TO_SCRIPT:
        hyphen_idx = lang.rfind('-')
        assert hyphen_idx != -1, (
            'We do not know what script the "%s" language is written in.'
            % lang_code)
        assumed_script = lang[hyphen_idx+1:]
        if len(assumed_script) == 4 and assumed_script.isalpha():
            # This is actually the script subtag.
            return assumed_script.title()
        lang = lang[:hyphen_idx]
    return LANG_TO_SCRIPT[lang]


def printable(inp):
    if type(inp) is set:  # set of character sequences
        return '{' + ', '.join([printable(seq) for seq in inp]) + '}'
    if type(inp) is tuple:  # character sequence
        return '<' + (', '.join([printable(ch) for ch in inp])) + '>'
    else:  # single character
        return 'U+%04X' % inp


def open_font(font):
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is not None:
        return ttLib.TTFont(font_path, fontNumber=index)
    else:
        return ttLib.TTFont(font_path)


def get_best_cmap(font):
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    return all_unicode_cmap.cmap if all_unicode_cmap else bmp_cmap.cmap


def get_variation_sequences_cmap(font):
    ttfont = open_font(font)
    vs_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (14, 0, 5):
            assert vs_cmap is None, 'More than one VS cmap in %s' % (font, )
            vs_cmap = cmap
    return vs_cmap


def get_emoji_map(font):
    # Add normal characters
    emoji_map = copy.copy(get_best_cmap(font))
    reverse_cmap = {glyph: code for code, glyph in emoji_map.items()
                    if not contains_pua(code)}

    # Add variation sequences
    vs_cmap = get_variation_sequences_cmap(font)
    if vs_cmap:
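        # fontTools exposes the format 14 (variation selector) cmap as uvsDict:
        # it maps each variation selector to a list of (base, glyph) pairs, and
        # glyph is None when the base character's default cmap glyph is used.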
        for vs in vs_cmap.uvsDict:
            for base, glyph in vs_cmap.uvsDict[vs]:
                if glyph is None:
                    emoji_map[(base, vs)] = emoji_map[base]
                else:
                    emoji_map[(base, vs)] = glyph

    # Add GSUB rules
    ttfont = open_font(font)
    for lookup in ttfont['GSUB'].table.LookupList.Lookup:
        if lookup.LookupType != 4:
            # Other lookups are used in the emoji font for fallback.
            # We ignore them for now.
            continue
        for subtable in lookup.SubTable:
            ligatures = subtable.ligatures
            for first_glyph in ligatures:
                for ligature in ligatures[first_glyph]:
                    sequence = [first_glyph] + ligature.Component
                    sequence = [reverse_cmap[glyph] for glyph in sequence]
                    sequence = tuple(sequence)
                    # Make sure no starting subsequence of 'sequence' has been
                    # seen before.
                    for sub_len in range(2, len(sequence)+1):
                        subsequence = sequence[:sub_len]
                        assert subsequence not in emoji_map
                    emoji_map[sequence] = ligature.LigGlyph

    return emoji_map


def assert_font_supports_any_of_chars(font, chars):
    best_cmap = get_best_cmap(font)
    for char in chars:
        if char in best_cmap:
            return
    sys.exit('None of the characters in %s were found in %s' % (chars, font))


def assert_font_supports_all_of_chars(font, chars):
    best_cmap = get_best_cmap(font)
    for char in chars:
        assert char in best_cmap, (
            'U+%04X was not found in %s' % (char, font))


def assert_font_supports_none_of_chars(font, chars, fallbackName):
    best_cmap = get_best_cmap(font)
    for char in chars:
        if fallbackName:
            assert char not in best_cmap, (
                'U+%04X was found in %s in fallback %s' % (char, font, fallbackName))
        else:
            assert char not in best_cmap, 'U+%04X was found in %s' % (char, font)


def assert_font_supports_all_sequences(font, sequences):
    vs_dict = get_variation_sequences_cmap(font).uvsDict
    for base, vs in sorted(sequences):
        assert vs in vs_dict and (base, None) in vs_dict[vs], (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))


def check_hyphens(hyphens_dir):
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_file in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        hyb_file = path.basename(hyb_file)
        assert hyb_file.startswith('hyph-'), (
            'Unknown hyphenation file %s' % hyb_file)
        lang_code = hyb_file[hyb_file.index('-')+1:hyb_file.index('.')]
        scripts.add(lang_to_script(lang_code))

    HYPHENS = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, HYPHENS)


class FontRecord(object):
    def __init__(self, name, scripts, variant, weight, style, fallback_for, font):
        self.name = name
        self.scripts = scripts
        self.variant = variant
        self.weight = weight
        self.style = style
        self.fallback_for = fallback_for
        self.font = font


def parse_fonts_xml(fonts_xml_path):
    global _script_to_font_map, _fallback_chains, _all_fonts
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chains = {}
    _all_fonts = []
    tree = ElementTree.parse(fonts_xml_path)
    families = tree.findall('family')
    # Minikin supports up to 254 font families, but users can install their own
    # font in the first slot. Thus, 253 is the maximum allowed number of font
    # families in the default collection.
    assert len(families) < 254, (
        'The system font collection can contain at most 253 font families.')
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        ignoreAttr = family.get('ignore')

        if name:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC font %s.' % name)
            assert name not in _fallback_chains, 'Duplicated name entry %s' % name
            _fallback_chains[name] = []
        else:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)

    trim_re = re.compile(r"^[ \n\r\t]*(.+)[ \n\r\t]*$")
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        ignoreAttr = family.get('ignore')
        ignore = ignoreAttr == 'true' or ignoreAttr == '1'

        if ignore:
            continue

        if langs:
            langs = langs.split()
            scripts = {lang_to_script(lang) for lang in langs}
        else:
            scripts = set()

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            font_file = child.text.rstrip()

            m = trim_re.match(font_file)
            font_file = m.group(1)

            # If this is a variable font that supports the `wght` axis, the weight
            # attribute can be dropped; it is adjusted automatically at runtime.
            if 'weight' in child.attrib:
                weight = int(child.get('weight'))
                assert weight % 100 == 0, (
                    'Font weight "%d" is not a multiple of 100.' % weight)
            else:
                weight = None

            # If this is a variable font that supports the `ital` or `slnt` axes, the
            # style attribute can be dropped; it is adjusted automatically at runtime.
            if 'style' in child.attrib:
                style = child.get('style')
                assert style in {'normal', 'italic'}, (
                    'Unknown style "%s"' % style)
            else:
                style = None

            fallback_for = child.get('fallbackFor')

            assert not name or not fallback_for, (
                'name and fallbackFor cannot be present at the same time')
            assert not fallback_for or fallback_for in _fallback_chains, (
                'Unknown fallback name: %s' % fallback_for)

            index = child.get('index')
            if index:
                index = int(index)

            if not path.exists(path.join(_fonts_dir, m.group(1))):
                # A missing font is a valid case; just ignore missing font files.
                continue
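
            # Record everything we know about this <font> element; the last
            # argument is the (file name, collection index) pair expected by
            # open_font().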
            record = FontRecord(
                name,
                frozenset(scripts),
                variant,
                weight,
                style,
                fallback_for,
                (font_file, index))

            _all_fonts.append(record)

            if not fallback_for:
                if not name or name == 'sans-serif':
                    for _, fallback in _fallback_chains.items():
                        fallback.append(record)
                else:
                    _fallback_chains[name].append(record)
            else:
                _fallback_chains[fallback_for].append(record)

            if name:  # non-empty names are used for default LGC fonts
                map_scripts = {'Latn', 'Grek', 'Cyrl'}
            else:
                map_scripts = scripts
            for script in map_scripts:
                _script_to_font_map[script].add((font_file, index))


def check_emoji_coverage(all_emoji, equivalent_emoji):
    emoji_fonts = get_emoji_fonts()
    check_emoji_font_coverage(emoji_fonts, all_emoji, equivalent_emoji)


def get_emoji_fonts():
    return [record.font for record in _all_fonts if 'Zsye' in record.scripts]


def seq_any(sequence, pred):
    if type(sequence) is tuple:
        return any([pred(x) for x in sequence])
    else:
        return pred(sequence)


def seq_all(sequence, pred):
    if type(sequence) is tuple:
        return all([pred(x) for x in sequence])
    else:
        return pred(sequence)


def is_regional_indicator(x):
    # regional indicator symbol letters A..Z
    return 0x1F1E6 <= x <= 0x1F1FF


def is_flag_sequence(seq):
    if type(seq) == int:
        return False
    return (len(seq) == 2 and is_regional_indicator(seq[0])
            and is_regional_indicator(seq[1]))


def is_tag(x):
    # tag block
    return 0xE0000 <= x <= 0xE007F


def is_pua(x):
    return 0xE000 <= x <= 0xF8FF or 0xF0000 <= x <= 0xFFFFD or 0x100000 <= x <= 0x10FFFD


def contains_pua(sequence):
    return seq_any(sequence, is_pua)


def contains_regional_indicator(sequence):
    return seq_any(sequence, is_regional_indicator)


def only_tags(sequence):
    return seq_all(sequence, is_tag)


def get_psname(ttf):
    return str(next(x for x in ttf['name'].names
                    if x.platformID == 3 and x.platEncID == 1 and x.nameID == 6))


def hex_strs(sequence):
    if type(sequence) is tuple:
        return tuple(f"{s:X}" for s in sequence)
    return hex(sequence)


def check_emoji_not_compat(all_emoji, equivalent_emoji):
    compat_psnames = set()
    for emoji_font in get_emoji_fonts():
        ttf = open_font(emoji_font)
        psname = get_psname(ttf)

        if "meta" in ttf:
            assert 'Emji' not in ttf["meta"].data, (
                '%s must not be an EmojiCompat font' % psname)


def is_flag_emoji(font):
    return 0x1F1E6 in get_best_cmap(font)


def emoji_font_version_to_unicode_version(font_version):
    version_str = '%.3f' % font_version
    assert version_str in EMOJI_FONT_TO_UNICODE_MAP, (
        'Unknown emoji font version: %s' % version_str)
    return EMOJI_FONT_TO_UNICODE_MAP[version_str]


def check_emoji_font_coverage(emoji_fonts, all_emoji, equivalent_emoji):
    coverages = []
    emoji_font_version = 0
    emoji_flag_font_version = 0
    for emoji_font in emoji_fonts:
        coverages.append(get_emoji_map(emoji_font))

        # Find the largest version among the installed emoji fonts.
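        # head.fontRevision is a fixed-point value such as 2.047; it is mapped
        # to a Unicode emoji version via EMOJI_FONT_TO_UNICODE_MAP below.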
        version = open_font(emoji_font)['head'].fontRevision
        if is_flag_emoji(emoji_font):
            emoji_flag_font_version = max(emoji_flag_font_version, version)
        else:
            emoji_font_version = max(emoji_font_version, version)

    emoji_flag_unicode_version = emoji_font_version_to_unicode_version(emoji_flag_font_version)
    emoji_unicode_version = emoji_font_version_to_unicode_version(emoji_font_version)

    errors = []

    for sequence in all_emoji:
        if all([sequence not in coverage for coverage in coverages]):
            sequence_version = float(_age_by_chars[sequence])
            if is_flag_sequence(sequence):
                if sequence_version <= emoji_flag_unicode_version:
                    errors.append('%s is not supported in the emoji font.' % printable(sequence))
            else:
                if sequence_version <= emoji_unicode_version:
                    errors.append('%s is not supported in the emoji font.' % printable(sequence))

    for coverage in coverages:
        for sequence in coverage:
            if sequence in {0x0000, 0x000D, 0x0020}:
                # The font needs to support a few extra characters, which is OK.
                continue

            if contains_pua(sequence):
                # The font needs to have some PUA for the EmojiCompat library.
                continue

            if sequence not in all_emoji:
                errors.append('Unexpected support for %s in the emoji font.' % printable(sequence))

    for first, second in equivalent_emoji.items():
        for coverage in coverages:
            if first not in coverage or second not in coverage:
                continue  # the sequence will be reported as missing
            if coverage[first] != coverage[second]:
                errors.append('%s and %s should map to the same glyph.' % (
                    printable(first),
                    printable(second)))

    for coverage in coverages:
        for glyph in set(coverage.values()):
            maps_to_glyph = [
                seq for seq in coverage
                if coverage[seq] == glyph and not contains_pua(seq)]
            if len(maps_to_glyph) > 1:
                # More than one sequence maps to the same glyph. We need to
                # make sure they were expected to be equivalent.
                equivalent_seqs = set()
                for seq in maps_to_glyph:
                    equivalent_seq = seq
                    while equivalent_seq in equivalent_emoji:
                        equivalent_seq = equivalent_emoji[equivalent_seq]
                    equivalent_seqs.add(equivalent_seq)
                if len(equivalent_seqs) != 1:
                    errors.append('The sequences %s should not result in the same glyph %s' % (
                        printable(equivalent_seqs),
                        glyph))

    assert not errors, '%d emoji font coverage errors:\n%s' % (
        len(errors), '\n'.join(errors))


def check_emoji_defaults(default_emoji):
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    for name, fallback_chain in _fallback_chains.items():
        emoji_font_seen = False
        for record in fallback_chain:
            if 'Zsye' in record.scripts:
                emoji_font_seen = True
                # No need to check the emoji font.
                continue
            # For later fonts, we only check them if they have a script
            # defined, since the defined script may get them to a higher
            # score even if they appear after the emoji font. However,
            # we should skip checking the text symbols font, since
            # symbol fonts should be able to override the emoji display
            # style when 'Zsym' is explicitly specified by the user.
            if emoji_font_seen and (not record.scripts or 'Zsym' in record.scripts):
                continue

            # Check default emoji-style characters
            assert_font_supports_none_of_chars(record.font, default_emoji, name)

            # Mark default text-style characters appearing in fonts above the
            # emoji font as seen.
            if not emoji_font_seen:
                missing_text_chars -= set(get_best_cmap(record.font))

    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    missing_text_chars -= _chars_by_age['7.0']
    assert missing_text_chars == set(), (
        'Text-style versions of some emoji characters are missing: ' +
        repr(missing_text_chars))


def parse_unicode_seq(chars):
    if ' ' in chars:  # character sequence
        sequence = [int(ch, 16) for ch in chars.split(' ')]
        additions = [tuple(sequence)]
    elif '..' in chars:  # character range
        char_start, char_end = chars.split('..')
        char_start = int(char_start, 16)
        char_end = int(char_end, 16)
        additions = range(char_start, char_end+1)
    else:  # single character
        additions = [int(chars, 16)]
    return additions


# Setting reverse to True returns a dictionary that maps the values to sets of
# characters, which is useful for some binary properties. Otherwise, we get a
# dictionary that maps characters to the property values, assuming there's only
# one property in the file.
def parse_unicode_datafile(file_path, reverse=False):
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    with open(file_path) as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            additions = parse_unicode_seq(chars)

            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict
                    output_dict[addition] = prop
    return output_dict


def parse_sequence_age(file_path):
    VERSION_RE = re.compile(r'E([\d\.]+)')
    output_dict = {}
    with open(file_path) as datafile:
        for line in datafile:
            comment = ''
            if '#' in line:
                hash_pos = line.index('#')
                comment = line[hash_pos + 1:].strip()
                line = line[:hash_pos]
            line = line.strip()
            if not line:
                continue

            chars = line[:line.index(';')].strip()

            m = VERSION_RE.match(comment)
            assert m, 'Version not found: unknown format: %s' % line
            version = m.group(1)

            additions = parse_unicode_seq(chars)

            for addition in additions:
                assert addition not in output_dict
                output_dict[addition] = version
    return output_dict


def parse_emoji_variants(file_path):
    emoji_set = set()
    text_set = set()
    with open(file_path) as datafile:
        for line in datafile:
            if '#' in line:
                line = line[:line.index('#')]
            line = line.strip()
            if not line:
                continue
            sequence, description, _ = line.split(';')
            sequence = sequence.strip().split(' ')
            base = int(sequence[0], 16)
            vs = int(sequence[1], 16)
            description = description.strip()
            if description == 'text style':
                text_set.add((base, vs))
            elif description == 'emoji style':
                emoji_set.add((base, vs))
    return text_set, emoji_set


def parse_ucd(ucd_path):
    global _emoji_properties, _chars_by_age, _age_by_chars
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences
    _emoji_properties = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-data.txt'), reverse=True)
    emoji_properties_additions = parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-data.txt'), reverse=True)
    for prop in emoji_properties_additions.keys():
        _emoji_properties[prop].update(emoji_properties_additions[prop])

    _chars_by_age = parse_unicode_datafile(
        path.join(ucd_path, 'DerivedAge.txt'), reverse=True)
    _age_by_chars = parse_unicode_datafile(
        path.join(ucd_path, 'DerivedAge.txt'))
    _age_by_chars.update(parse_sequence_age(
        path.join(ucd_path, 'emoji-sequences.txt')))
    sequences = parse_emoji_variants(
        path.join(ucd_path, 'emoji-variation-sequences.txt'))
    _text_variation_sequences, _emoji_variation_sequences = sequences
    _emoji_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-sequences.txt'))
    _emoji_sequences.update(parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-sequences.txt')))
    _emoji_zwj_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-zwj-sequences.txt'))
    _emoji_zwj_sequences.update(parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-zwj-sequences.txt')))

    exclusions = parse_unicode_datafile(
        path.join(ucd_path, 'additions', 'emoji-exclusions.txt'))
    _emoji_sequences = remove_emoji_exclude(_emoji_sequences, exclusions)
    _emoji_zwj_sequences = remove_emoji_exclude(_emoji_zwj_sequences, exclusions)
    _emoji_variation_sequences = remove_emoji_variation_exclude(
        _emoji_variation_sequences, exclusions)
    # Unicode 12.0 adds Basic_Emoji to emoji-sequences.txt. We ignore those here
    # since we already check emoji presentation with emoji-variation-sequences.txt.
    # Please refer to http://unicode.org/reports/tr51/#def_basic_emoji_set .
    _emoji_sequences = {k: v for k, v in _emoji_sequences.items()
                        if v != 'Basic_Emoji'}


def remove_emoji_variation_exclude(source, items):
    return source.difference(items.keys())


def remove_emoji_exclude(source, items):
    return {k: v for k, v in source.items() if k not in items}


def flag_sequence(territory_code):
    return tuple(0x1F1E6 + ord(ch) - ord('A') for ch in territory_code)


EQUIVALENT_FLAGS = {
    flag_sequence('BV'): flag_sequence('NO'),
    flag_sequence('CP'): flag_sequence('FR'),
    flag_sequence('HM'): flag_sequence('AU'),
    flag_sequence('SJ'): flag_sequence('NO'),
    flag_sequence('UM'): flag_sequence('US'),
}

COMBINING_KEYCAP = 0x20E3

LEGACY_ANDROID_EMOJI = {
    0xFE4E5: flag_sequence('JP'),
    0xFE4E6: flag_sequence('US'),
    0xFE4E7: flag_sequence('FR'),
    0xFE4E8: flag_sequence('DE'),
    0xFE4E9: flag_sequence('IT'),
    0xFE4EA: flag_sequence('GB'),
    0xFE4EB: flag_sequence('ES'),
    0xFE4EC: flag_sequence('RU'),
    0xFE4ED: flag_sequence('CN'),
    0xFE4EE: flag_sequence('KR'),
    0xFE82C: (ord('#'), COMBINING_KEYCAP),
    0xFE82E: (ord('1'), COMBINING_KEYCAP),
    0xFE82F: (ord('2'), COMBINING_KEYCAP),
    0xFE830: (ord('3'), COMBINING_KEYCAP),
    0xFE831: (ord('4'), COMBINING_KEYCAP),
    0xFE832: (ord('5'), COMBINING_KEYCAP),
    0xFE833: (ord('6'), COMBINING_KEYCAP),
    0xFE834: (ord('7'), COMBINING_KEYCAP),
    0xFE835: (ord('8'), COMBINING_KEYCAP),
    0xFE836: (ord('9'), COMBINING_KEYCAP),
    0xFE837: (ord('0'), COMBINING_KEYCAP),
}

# This is used to define emoji that should have the same glyph. For example,
# we previously had a gender-based Kiss (0x1F48F) that shared its glyph with
# Kiss: Woman, Man (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468).
# In that case a valid entry would be:
# (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
ZWJ_IDENTICALS = {
}

SAME_FLAG_MAPPINGS = [
    # Diego Garcia and British Indian Ocean Territory
    ((0x1F1EE, 0x1F1F4), (0x1F1E9, 0x1F1EC)),
    # St. Martin and France
    ((0x1F1F2, 0x1F1EB), (0x1F1EB, 0x1F1F7)),
    # Spain and Ceuta & Melilla
    ((0x1F1EA, 0x1F1F8), (0x1F1EA, 0x1F1E6)),
]

ZWJ = 0x200D

EMPTY_FLAG_SEQUENCE = (0x1F3F4, 0xE007F)


def is_fitzpatrick_modifier(cp):
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    rev = list(reversed(seq))
    # If there are Fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify.
    for i in range(1, len(rev)):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
    return tuple(rev)


def compute_expected_emoji():
    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set()
    all_sequences.update(_emoji_variation_sequences)

    # Add ZWJ sequences that are not in the current emoji-zwj-sequences.txt.
    adjusted_emoji_zwj_sequences = dict(_emoji_zwj_sequences)
    adjusted_emoji_zwj_sequences.update(_emoji_zwj_sequences)

    # Add the empty flag tag sequence that is supported as a fallback.
    _emoji_sequences[EMPTY_FLAG_SEQUENCE] = 'Emoji_Tag_Sequence'

    for sequence in _emoji_sequences.keys():
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)

    for sequence in adjusted_emoji_zwj_sequences.keys():
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)

    for first, second in SAME_FLAG_MAPPINGS:
        equivalent_emoji[first] = second

    # Add all tag characters used in flags.
    sequence_pieces.update(range(0xE0030, 0xE0039 + 1))
    sequence_pieces.update(range(0xE0061, 0xE007A + 1))

    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        set(LEGACY_ANDROID_EMOJI.keys()))
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        all_sequences |
        set(LEGACY_ANDROID_EMOJI.keys()))

    equivalent_emoji.update(EQUIVALENT_FLAGS)
    equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
    equivalent_emoji.update(ZWJ_IDENTICALS)

    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji


def check_compact_only_fallback():
    for name, fallback_chain in _fallback_chains.items():
        for record in fallback_chain:
            if record.variant == 'compact':
                same_script_elegants = [x for x in fallback_chain
                                        if x.scripts == record.scripts
                                        and x.variant == 'elegant']
                assert same_script_elegants, (
                    '%s (scripts %s, fallback of "%s") must also have an '
                    'elegant variant.' % (
                        record.font, record.scripts, record.fallback_for))


def check_vertical_metrics():
    for record in _all_fonts:
        if record.name in ['sans-serif', 'sans-serif-condensed']:
            font = open_font(record.font)
            assert font['head'].yMax == 2163 and font['head'].yMin == -555, (
                'yMax and yMin of %s do not match expected values.'
                % (record.font,))

        if record.name in ['sans-serif', 'sans-serif-condensed',
                           'serif', 'monospace']:
            font = open_font(record.font)
            assert (font['hhea'].ascent == 1900 and
                    font['hhea'].descent == -500), (
                        'ascent and descent of %s do not match expected '
                        'values.' % (record.font,))


def check_cjk_punctuation():
    cjk_scripts = {'Hans', 'Hant', 'Jpan', 'Kore'}
    cjk_punctuation = range(0x3000, 0x301F + 1)
    for name, fallback_chain in _fallback_chains.items():
        for record in fallback_chain:
            if record.scripts.intersection(cjk_scripts):
                # CJK font seen. Stop checking the rest of the fonts.
                break
            assert_font_supports_none_of_chars(record.font, cjk_punctuation, name)


def main():
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    fonts_xml_path = path.join(target_out, 'etc', 'font_fallback.xml')

    parse_fonts_xml(fonts_xml_path)

    check_compact_only_fallback()

    check_vertical_metrics()

    hyphens_dir = path.join(target_out, 'usr', 'hyphen-data')
    check_hyphens(hyphens_dir)

    check_cjk_punctuation()

    check_emoji = sys.argv[2]
    if check_emoji == 'true':
        ucd_path = sys.argv[3]
        parse_ucd(ucd_path)
        all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
        check_emoji_not_compat(all_emoji, equivalent_emoji)
        check_emoji_coverage(all_emoji, equivalent_emoji)
        check_emoji_defaults(default_emoji)


if __name__ == '__main__':
    main()
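
# Example invocation (illustrative only; the real arguments are supplied by the
# build system, and the paths below are assumptions):
#
#   python this_script.py $PRODUCT_OUT true path/to/ucd
#
# argv[1] is the output directory that contains fonts/ and etc/font_fallback.xml,
# argv[2] enables the emoji checks when it is the string 'true', and argv[3] is
# the directory holding the UCD and emoji data files consumed by parse_ucd().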