#!/usr/bin/env python3
"""Generate C lookup tables for HTML named character references.

Reads ``entities.json`` from the current directory and prints three C
array definitions (``htmlEntAlpha``, ``htmlEntValues`` and
``htmlEntStrings``) to standard output.
"""

import json
import sys
from dataclasses import dataclass

# The basic idea is to find named character references using binary
# search. Since entity strings may not have a terminator, this doesn't
# work if one entity string is a prefix of another. In this case,
# we branch to a subtable after matching the prefix.
#
# We create separate initial tables based on the first character
# of the entity name.
#
# The following tables are generated:
#
# htmlEntAlpha:   start and end of initial tables, indexing into
#                 htmlEntValues
# htmlEntValues:  concatenation of all table values, which index into
#                 htmlEntStrings
# htmlEntStrings: variable sized records containing entity name,
#                 replacement and optionally the position of a
#                 subtable

# Bit set in a record's first byte when the same entity is also defined
# without the trailing semicolon.
SEMICOLON_FLAG = 0x80
# Bit set in a record's first byte when the record carries a two-byte
# subtable position right after the name characters.
SUBTABLE_FLAG = 0x40
# Initial tables are selected by ord(first_char) % 64; the alpha table
# reserves 59 slots of three bytes each.
ALPHA_SLOTS = 59
ALPHA_ENTRY_SIZE = 3


def to_cchars(s):
    """Convert a string to a list of C initializer elements.

    The string is encoded as UTF-8. Printable ASCII bytes other than the
    single quote and the backslash become C character literals such as
    ``'a'``; every other byte is kept as its integer value.
    """
    r = []

    for c in s.encode():
        if 0x20 <= c <= 0x7E and c != ord("'") and c != ord('\\'):
            r.append(f"'{chr(c)}'")
        else:
            r.append(c)

    return r


@dataclass
class PrefixStackEntry:
    # Name prefix that has already been matched when searching table_id.
    prefix: str
    # Index of the table holding the entities that start with prefix.
    table_id: int


@dataclass
class AlphaFixup:
    # Index of the initial table for one first character.
    table_id: int
    # Slot in the alpha table: ord(first_char) % 64.
    char: int


@dataclass
class StringFixup:
    # Index of the subtable whose position must be patched in.
    table_id: int
    # Index into the strings list of the two placeholder elements.
    string_index: int
    # Index of the table that contains the entry owning the subtable.
    super_table_id: int
    # Position of the owning entry inside that table.
    super_offset: int


def build_tables(ents):
    """Build the three lookup tables from the parsed entities.json data.

    ents maps entity strings (``'&amp;'``, ``'&amp'``, ...) to dicts
    that have a ``'characters'`` replacement string.

    Returns a tuple ``(alpha, values, strings)`` holding the contents of
    htmlEntAlpha, htmlEntValues and htmlEntStrings.
    """
    # Remove entity strings without trailing semicolon, then sort by
    # the bare name (without '&' and ';').
    keys = sorted(
        (key for key in ents if key.endswith(';')),
        key=lambda k: k[1:-1],
    )

    strings = []
    tables = []
    prefix_stack = []
    alpha_fixups = []
    string_fixups = []

    for i, key in enumerate(keys):
        name = key[1:-1]

        next_name = None
        if i + 1 < len(keys):
            next_name = keys[i + 1][1:-1]

        # Leave every subtable whose prefix no longer matches.
        while prefix_stack and not name.startswith(prefix_stack[-1].prefix):
            prefix_stack.pop()

        # First character is initial prefix
        if not prefix_stack:
            table_id = len(tables)
            tables.append([])

            prefix_stack.append(PrefixStackEntry(name[0], table_id))
            alpha_fixups.append(AlphaFixup(table_id, ord(name[0]) % 64))

        current = prefix_stack[-1]
        string_index = len(strings)
        table = tables[current.table_id]
        table_index = len(table)
        table.append(string_index)

        # Only the part of the name after the matched prefix is stored.
        name_chars = to_cchars(name[len(current.prefix):])
        repl_chars = to_cchars(ents[key]['characters'])
        semicolon_flag = SEMICOLON_FLAG if key[:-1] in ents else 0

        if next_name and next_name.startswith(name):
            # Create subtable

            strings += [
                len(name_chars) | semicolon_flag | SUBTABLE_FLAG, *name_chars,
                0, 0,  # subtable position, to be fixed up
                len(repl_chars), *repl_chars,
            ]

            table_id = len(tables)
            tables.append([])

            fixup_index = string_index + 1 + len(name_chars)
            string_fixups.append(StringFixup(
                table_id, fixup_index, current.table_id, table_index,
            ))

            prefix_stack.append(PrefixStackEntry(name, table_id))
        else:
            strings += [
                len(name_chars) | semicolon_flag, *name_chars,
                len(repl_chars), *repl_chars,
            ]

    # Concat tables and record ranges: table t occupies
    # values[ranges[t]:ranges[t + 1]].
    ranges = [0]
    values = []
    for table in tables:
        values += table
        ranges.append(len(values))

    # Create alpha table: per slot the low byte of the start index, the
    # remaining high bits of the start index, and the entry count.
    alpha = [0] * (ALPHA_SLOTS * ALPHA_ENTRY_SIZE)
    for fixup in alpha_fixups:
        start = ranges[fixup.table_id]
        end = ranges[fixup.table_id + 1]
        pos = fixup.char * ALPHA_ENTRY_SIZE
        alpha[pos:pos + ALPHA_ENTRY_SIZE] = [
            start & 0xFF, start >> 8, end - start,
        ]

    # Fix up subtable positions: the subtable start is stored relative
    # to the position of the owning entry in values, followed by the
    # subtable's entry count.
    for fixup in string_fixups:
        start = ranges[fixup.table_id]
        end = ranges[fixup.table_id + 1]
        super_index = ranges[fixup.super_table_id] + fixup.super_offset
        pos = fixup.string_index
        strings[pos:pos + 2] = [start - super_index, end - start]

    return alpha, values, strings


def gen_table(ctype, cname, values, fmt, elems_per_line):
    """Return the C source of a ``static const`` array definition.

    ctype and cname are the element type and the array name, values the
    elements, fmt a %-style format applied to each element and
    elems_per_line the number of elements emitted per output line.
    """
    count = len(values)
    parts = []

    for i, value in enumerate(values):
        if i != 0:
            parts.append(',')
        # Start a new line every elems_per_line elements.
        parts.append('\n ' if i % elems_per_line == 0 else ' ')
        parts.append(fmt % value)

    body = ''.join(parts)
    return f'static const {ctype} {cname}[{count}] = {{{body}\n}};\n'


def main():
    """Load entities.json and print the generated tables to stdout."""
    try:
        with open('entities.json', encoding='utf-8') as json_data:
            ents = json.load(json_data)
    except FileNotFoundError:
        # stdout carries the generated C code, so report on stderr.
        print('entities.json not found, try curl -LJO',
              'https://html.spec.whatwg.org/entities.json',
              file=sys.stderr)
        sys.exit(1)

    alpha, values, strings = build_tables(ents)

    # Print tables
    print(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
    print(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
    print(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))


if __name__ == '__main__':
    main()