#!/usr/bin/env python3
"""Generate C lookup tables for HTML named character references.

Reads ``entities.json`` from the current directory and prints three C
array definitions (htmlEntAlpha, htmlEntValues, htmlEntStrings) to
standard output.  Diagnostics go to standard error so that they never
end up inside redirected output.
"""

import json
import sys
from dataclasses import dataclass

# The basic idea is to find named character references using binary
# search. Since entity strings may not have a terminator, this doesn't
# work if one entity string is a prefix of another. In this case,
# we branch to a subtable after matching the prefix.
#
# We create separate initial tables based on the first character
# of the entity name.
#
# The following tables are generated:
#
# htmlEntAlpha:   start and end of initial tables, indexing into
#                 htmlEntValues. Three bytes per slot, the slot being
#                 the first character's code modulo 64: low byte of
#                 the start, high byte of the start, number of entries.
# htmlEntValues:  concatenation of all table values, which index into
#                 htmlEntStrings
# htmlEntStrings: variable sized records containing entity name,
#                 replacement and optionally the position of a
#                 subtable. Record layout:
#                   1 byte   length of the name suffix, ORed with 0x40
#                            if a subtable follows and 0x80 if the
#                            entity is also defined without a trailing
#                            semicolon
#                   n bytes  name with the prefix of its table removed
#                   2 bytes  (only with 0x40) offset of the subtable in
#                            htmlEntValues relative to this record's own
#                            slot, then the number of subtable entries
#                   1 byte   length of the replacement
#                   m bytes  UTF-8 encoded replacement

# Number of 3-byte slots in htmlEntAlpha.
ALPHA_SLOTS = 59

# Empty placeholder tables created before the first real table. They add
# nothing to the output (table ids are only used to index `ranges`); the
# count is kept so that table numbering matches the original script.
RESERVED_TABLES = 64

# Flag bits stored in the first byte of an htmlEntStrings record.
FLAG_SUBTABLE = 0x40
FLAG_SEMICOLON_OPTIONAL = 0x80


def load_entities(path='entities.json'):
    """Return the parsed contents of the entities JSON file at *path*.

    If the file does not exist, a hint on how to download it is written
    to standard error and the process exits with status 1.
    """
    try:
        with open(path, encoding='utf-8') as json_data:
            return json.load(json_data)
    except FileNotFoundError:
        # stdout carries the generated C source, so the hint must not be
        # written there.
        print('entities.json not found, try curl -LJO',
              'https://html.spec.whatwg.org/entities.json',
              file=sys.stderr)
        sys.exit(1)


def to_cchars(s):
    """Convert *s* to a list of C initializer elements, one per UTF-8 byte.

    Printable ASCII bytes (0x20-0x7E) other than the single quote and the
    backslash become C character literals such as ``'a'`` (as str); every
    other byte is returned as its integer value.
    """
    r = []

    for c in s.encode():
        if 0x20 <= c <= 0x7E and c != ord("'") and c != ord('\\'):
            r.append(f"'{chr(c)}'")
        else:
            r.append(c)

    return r


@dataclass
class PrefixStackEntry:
    # Prefix shared by all names stored in the table.
    prefix: str
    # Index into the list of tables.
    table_id: int


@dataclass
class AlphaFixup:
    # Initial table whose range is written to htmlEntAlpha.
    table_id: int
    # Slot in htmlEntAlpha: code of the first character modulo 64.
    char: int


@dataclass
class StringFixup:
    # Subtable whose position is patched into the record.
    table_id: int
    # Index in the strings list of the two placeholder bytes.
    string_index: int
    # Table containing the record that refers to the subtable.
    super_table_id: int
    # Position of that record within the super table.
    super_offset: int


def _check_fits(value, limit, what):
    """Return *value*, raising ValueError unless 0 <= value < limit."""
    if not 0 <= value < limit:
        raise ValueError(
            f'{what} is {value}, which does not fit below {limit}')
    return value


def build_tables(ents):
    """Build the three lookup tables from the entity mapping *ents*.

    *ents* maps entity strings such as ``'&amp;'`` to dicts with a
    ``'characters'`` entry holding the replacement text.

    Returns a tuple ``(alpha, values, strings)`` with the contents of
    htmlEntAlpha, htmlEntValues and htmlEntStrings.

    Raises ValueError if a value does not fit into the byte or short it
    is stored in, since the emitted C tables would be corrupt.
    """
    # Only entity strings with a trailing semicolon get a record; the
    # variant without semicolon is represented by a flag on that record.
    # Sorting by the bare name places every name directly before the
    # names it is a prefix of.
    keys = sorted(
        (key for key in ents if key.endswith(';')),
        key=lambda k: k[1:-1],
    )

    strings = []
    tables = [[] for _ in range(RESERVED_TABLES)]
    prefix_stack = []
    alpha_fixups = []
    string_fixups = []

    for i, key in enumerate(keys):
        name = key[1:-1]
        next_name = keys[i + 1][1:-1] if i + 1 < len(keys) else None

        # Leave all subtables whose prefix no longer matches.
        while prefix_stack and not name.startswith(prefix_stack[-1].prefix):
            prefix_stack.pop()

        # First character is initial prefix
        if not prefix_stack:
            table_id = len(tables)
            tables.append([])

            slot = _check_fits(ord(name[0]) % 64, ALPHA_SLOTS,
                               f'alpha slot of {key}')
            prefix_stack.append(PrefixStackEntry(name[0], table_id))
            alpha_fixups.append(AlphaFixup(table_id, slot))

        current = prefix_stack[-1]
        # htmlEntValues is an array of unsigned short.
        string_index = _check_fits(len(strings), 0x10000,
                                   f'string offset of {key}')
        table = tables[current.table_id]
        table_index = len(table)
        table.append(string_index)

        name_chars = to_cchars(name[len(current.prefix):])
        repl_chars = to_cchars(ents[key]['characters'])
        # The name length shares its byte with the two flag bits.
        header = _check_fits(len(name_chars), FLAG_SUBTABLE,
                             f'name length of {key}')
        repl_len = _check_fits(len(repl_chars), 0x100,
                               f'replacement length of {key}')
        if key[:-1] in ents:
            header |= FLAG_SEMICOLON_OPTIONAL

        if next_name and next_name.startswith(name):
            # Create subtable

            strings += [
                header | FLAG_SUBTABLE, *name_chars,
                0, 0, # subtable position, to be fixed up
                repl_len, *repl_chars,
            ]

            table_id = len(tables)
            tables.append([])

            fixup_index = string_index + 1 + len(name_chars)
            string_fixups.append(StringFixup(
                table_id, fixup_index, current.table_id, table_index,
            ))

            prefix_stack.append(PrefixStackEntry(name, table_id))
        else:
            strings += [
                header, *name_chars,
                repl_len, *repl_chars,
            ]

    # Concat tables and record ranges
    ranges = [0]
    values = []
    for table in tables:
        values += table
        ranges.append(len(values))

    # Create alpha table
    alpha = [0] * (ALPHA_SLOTS * 3)
    for fixup in alpha_fixups:
        start = _check_fits(ranges[fixup.table_id], 0x10000,
                            'initial table start')
        count = _check_fits(ranges[fixup.table_id + 1] - start, 0x100,
                            'initial table size')
        pos = fixup.char * 3
        alpha[pos:pos + 3] = [start & 0xFF, start >> 8, count]

    # Fix up subtable positions
    for fixup in string_fixups:
        start = ranges[fixup.table_id]
        count = _check_fits(ranges[fixup.table_id + 1] - start, 0x100,
                            'subtable size')
        super_index = ranges[fixup.super_table_id] + fixup.super_offset
        offset = _check_fits(start - super_index, 0x100,
                             'subtable offset')
        pos = fixup.string_index
        strings[pos:pos + 2] = [offset, count]

    return alpha, values, strings


def gen_table(ctype, cname, values, fmt, elems_per_line):
    """Return the C definition of a static const array.

    ctype:          C element type
    cname:          name of the array
    values:         sequence of elements
    fmt:            %-format applied to each element
    elems_per_line: number of elements emitted before a line break
    """
    count = len(values)
    parts = []

    for i, value in enumerate(values):
        if i != 0:
            parts.append(',')
        parts.append('\n ' if i % elems_per_line == 0 else ' ')
        parts.append(fmt % value)

    r = ''.join(parts)
    return f'static const {ctype} {cname}[{count}] = {{{r}\n}};\n'


def main():
    """Load entities.json and print the generated tables to stdout."""
    ents = load_entities()
    alpha, values, strings = build_tables(ents)

    # Print tables
    print(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
    print(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
    print(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))


if __name__ == '__main__':
    main()