xref: /aosp_15_r20/external/libxml2/tools/genHtmlEnt.py (revision 7c5688314b92172186c154356a6374bf7684c3ca)
1*7c568831SAndroid Build Coastguard Worker#!/usr/bin/env python3
2*7c568831SAndroid Build Coastguard Worker
3*7c568831SAndroid Build Coastguard Workerimport json
4*7c568831SAndroid Build Coastguard Workerimport sys
5*7c568831SAndroid Build Coastguard Workerfrom dataclasses import dataclass
6*7c568831SAndroid Build Coastguard Worker
7*7c568831SAndroid Build Coastguard Worker# The basic idea is to find named character references using binary
8*7c568831SAndroid Build Coastguard Worker# search. Since entity strings may not have a terminator, this doesn't
9*7c568831SAndroid Build Coastguard Worker# work if one entity string is a prefix of another. In this case,
10*7c568831SAndroid Build Coastguard Worker# we branch to a subtable after matching the prefix.
11*7c568831SAndroid Build Coastguard Worker#
12*7c568831SAndroid Build Coastguard Worker# We create separate initial tables based on the first character
13*7c568831SAndroid Build Coastguard Worker# of the entity name.
14*7c568831SAndroid Build Coastguard Worker#
15*7c568831SAndroid Build Coastguard Worker# The following tables are generated:
16*7c568831SAndroid Build Coastguard Worker#
17*7c568831SAndroid Build Coastguard Worker# htmlEntAlpha:   start and end of initial tables, indexing into
18*7c568831SAndroid Build Coastguard Worker#                 htmlEntValues
19*7c568831SAndroid Build Coastguard Worker# htmlEntValues:  concatenation of all table values, which index into
20*7c568831SAndroid Build Coastguard Worker#                 htmlEntStrings
21*7c568831SAndroid Build Coastguard Worker# htmlEntStrings: variable sized records containing entity name,
22*7c568831SAndroid Build Coastguard Worker#                 replacement and optionally the position of a
23*7c568831SAndroid Build Coastguard Worker#                 subtable
24*7c568831SAndroid Build Coastguard Worker
# Load the entity table. JSON text is UTF-8, so the encoding is given
# explicitly rather than left to the platform's locale default.
try:
    with open('entities.json', encoding='utf-8') as json_data:
        ents = json.load(json_data)
except FileNotFoundError:
    # stdout carries the generated C tables, so the diagnostic goes to
    # stderr to keep it out of a redirected output file.
    print('entities.json not found, try curl -LJO',
          'https://html.spec.whatwg.org/entities.json',
          file=sys.stderr)
    sys.exit(1)
32*7c568831SAndroid Build Coastguard Worker
def to_cchars(s: str) -> list:
    """Convert *s* into a list of C array initializer elements.

    The string is encoded as UTF-8 and each byte becomes one element:

    * printable ASCII bytes (0x20..0x7E) other than the single quote and
      the backslash become a quoted C character literal such as ``'a'``
      (a ``str``);
    * every other byte is returned as its integer value (an ``int``).

    The two excluded characters would need escaping inside a C character
    literal, so they are emitted numerically instead.
    """
    needs_escape = (ord("'"), ord('\\'))

    return [
        f"'{chr(b)}'" if 0x20 <= b <= 0x7E and b not in needs_escape else b
        for b in s.encode('utf-8')
    ]
44*7c568831SAndroid Build Coastguard Worker
@dataclass
class PrefixStackEntry:
    """One level of the prefix stack kept while walking the sorted names."""
    # Leading part shared by every name stored in this table; it is
    # stripped from the name before the remainder is written out.
    prefix: str
    # Index into `tables` of the table that collects those names.
    table_id: int
49*7c568831SAndroid Build Coastguard Worker
@dataclass
class AlphaFixup:
    """Deferred htmlEntAlpha entry, filled in once table ranges are known."""
    # Index into `tables` of the initial table for one first character.
    table_id: int
    # Slot in the alpha table; the script passes ord(first char) % 64.
    char: int
54*7c568831SAndroid Build Coastguard Worker
@dataclass
class StringFixup:
    """Deferred subtable position inside `strings`, patched after concat."""
    # Index into `tables` of the subtable being referenced.
    table_id: int
    # Index in `strings` of the first of the two placeholder elements.
    string_index: int
    # Index into `tables` of the table holding the referencing entry.
    super_table_id: int
    # Position of the referencing entry within that table.
    super_offset: int
61*7c568831SAndroid Build Coastguard Worker
# Keep only the entity strings that end in a semicolon and order them
# by the text between the first character and that semicolon.
keys = sorted(
    (key for key in ents if key.endswith(';')),
    key=lambda k: k[1:-1],
)

# Accumulators filled while walking the sorted keys.
strings = []
prefix_stack = []
alpha_fixups = []
string_fixups = []
# The table list starts out with 64 empty tables.
tables = [[] for _ in range(64)]
75*7c568831SAndroid Build Coastguard Worker
# Walk the sorted keys once, appending one variable-sized record per
# entity to `strings` and the record's start index to the table of the
# innermost prefix that the name starts with.
#
# Record layout, as appended below:
#   [ length-and-flags, name chars..., (2 subtable elements)?,
#     replacement length, replacement chars... ]
# where the length-and-flags element is len(name chars) OR-ed with
#   0x80 if the same key without its trailing ';' is also in `ents`,
#   0x40 if the two subtable elements are present.
for i, key in enumerate(keys):
    # Drop the first character and the trailing ';'.
    name = key[1:-1]

    # Look ahead one key to detect that this name is a prefix of the next.
    next_name = None
    if i + 1 < len(keys):
        next_name = keys[i+1][1:-1]

    # Leave every prefix level that this name no longer belongs to.
    while prefix_stack and not name.startswith(prefix_stack[-1].prefix):
        prefix_stack.pop()

    # First character is initial prefix
    if not prefix_stack:
        table_id = len(tables)
        tables.append([])

        prefix_stack.append(PrefixStackEntry(name[0], table_id))
        # The alpha slot is derived from the first character, modulo 64.
        alpha_fixups.append(AlphaFixup(table_id, ord(name[0]) % 64))

    # Register this record's start position in the current table.
    string_index = len(strings)
    table = tables[prefix_stack[-1].table_id]
    table_index = len(table)
    table.append(string_index)

    # Only the part of the name after the current prefix is stored.
    name_offset = len(prefix_stack[-1].prefix)
    name_chars = to_cchars(name[name_offset:])
    repl_chars = to_cchars(ents[key]['characters'])
    # 0x80 marks entities that also exist without the trailing ';'.
    semicolon_flag = 0
    if key[:-1] in ents:
        semicolon_flag = 0x80

    if next_name and next_name.startswith(name):
        # Create subtable

        strings += [
            len(name_chars) | semicolon_flag | 0x40, *name_chars,
            0, 0, # subtable position, to be fixed up
            len(repl_chars), *repl_chars,
        ]

        table_id = len(tables)
        tables.append([])

        # The placeholders sit right after the flags element and the name.
        fixup_index = string_index + 1 + len(name_chars)
        string_fixups.append(StringFixup(
            table_id, fixup_index, prefix_stack[-1].table_id, table_index,
        ))

        # Following names that start with this one go into the subtable.
        prefix_stack.append(PrefixStackEntry(name, table_id))
    else:
        strings += [
            len(name_chars) | semicolon_flag, *name_chars,
            len(repl_chars), *repl_chars,
        ]
129*7c568831SAndroid Build Coastguard Worker
# Flatten all tables into `values`; ranges[t] .. ranges[t+1] is the
# slice of `values` that belongs to table t.
values = []
ranges = [0]
for entries in tables:
    values.extend(entries)
    ranges.append(len(values))

# Build the alpha table: three elements per slot holding the low byte
# and high part of the table's start in `values`, then its length.
alpha = [0] * (59 * 3)
for fixup in alpha_fixups:
    first = ranges[fixup.table_id]
    count = ranges[fixup.table_id + 1] - first
    slot = fixup.char * 3
    alpha[slot:slot + 3] = [first & 0xFF, first >> 8, count]

# Replace each pair of placeholder elements in `strings` with the
# subtable's start, relative to the position in `values` of the entry
# that refers to it, followed by the subtable's length.
for fixup in string_fixups:
    first = ranges[fixup.table_id]
    count = ranges[fixup.table_id + 1] - first
    parent_pos = ranges[fixup.super_table_id] + fixup.super_offset
    pos = fixup.string_index
    strings[pos:pos + 2] = [first - parent_pos, count]
152*7c568831SAndroid Build Coastguard Worker
153*7c568831SAndroid Build Coastguard Worker# Print tables
154*7c568831SAndroid Build Coastguard Worker
def gen_table(ctype, cname, values, fmt, elems_per_line):
    """Return the C source of a ``static const`` array definition.

    Parameters:
        ctype: C element type, e.g. ``'unsigned char'``.
        cname: name of the generated array.
        values: iterable of elements; each is rendered with ``fmt % value``.
        fmt: printf-style format applied to every element.
        elems_per_line: number of elements per output line.

    Returns:
        A string of the form
        ``static const <ctype> <cname>[<count>] = {...};`` followed by a
        newline. Elements are separated by commas and each output line is
        indented by four spaces.
    """
    # Collect the pieces and join once instead of growing a string in
    # the loop; this also lets `values` be any iterable.
    parts = []
    for i, value in enumerate(values):
        lead = '\n    ' if i % elems_per_line == 0 else ' '
        parts.append((',' if i else '') + lead + fmt % value)

    body = ''.join(parts)
    return f'static const {ctype} {cname}[{len(parts)}] = {{{body}\n}};\n'
166*7c568831SAndroid Build Coastguard Worker
# Write the three generated arrays to stdout, each followed by the
# extra newline that print() appends.
for table_args in (
    ('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15),
    ('unsigned short', 'htmlEntValues', values, '%5d', 10),
    ('unsigned char', 'htmlEntStrings', strings, '%3s', 15),
):
    print(gen_table(*table_args))
170