xref: /aosp_15_r20/external/libxml2/tools/genHtmlEnt.py (revision 7c5688314b92172186c154356a6374bf7684c3ca)
1#!/usr/bin/env python3
2
3import json
4import sys
5from dataclasses import dataclass
6
7# The basic idea is to find named character references using binary
8# search. Since entity strings may not have a terminator, this doesn't
9# work if one entity string is a prefix of another. In this case,
10# we branch to a subtable after matching the prefix.
11#
12# We create separate initial tables based on the first character
13# of the entity name.
14#
15# The following tables are generated:
16#
17# htmlEntAlpha:   start and end of initial tables, indexing into
18#                 htmlEntValues
19# htmlEntValues:  concatenation of all table values, which index into
20#                 htmlEntStrings
21# htmlEntStrings: variable sized records containing entity name,
22#                 replacement and optionally the position of a
23#                 subtable
24
# Load the WHATWG named character reference table. `ents` maps each
# entity string (e.g. '&amp;') to a dict that has a 'characters' entry.
try:
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open('entities.json', encoding='utf-8') as json_data:
        ents = json.load(json_data)
except FileNotFoundError:
    # The generated tables are written to stdout, so the hint must go to
    # stderr or it would end up inside a redirected output file.
    print('entities.json not found, try curl -LJO',
          'https://html.spec.whatwg.org/entities.json',
          file=sys.stderr)
    sys.exit(1)
32
def to_cchars(s):
    """Return the UTF-8 bytes of `s` as a list of C initializer items.

    Printable ASCII bytes (0x20..0x7E), except the single quote and the
    backslash, become C character literals such as "'a'". Every other
    byte is kept as its integer value.
    """
    needs_number = (ord("'"), ord('\\'))

    def as_item(byte):
        if 0x20 <= byte <= 0x7E and byte not in needs_number:
            return f"'{chr(byte)}'"
        return byte

    return [as_item(byte) for byte in s.encode()]
44
@dataclass
class PrefixStackEntry:
    """One level of the prefix stack used while walking the sorted names."""

    # Leading part of the entity name already matched at this level:
    # either the first character or the full name that opened a subtable.
    prefix: str
    # Index into `tables` of the table collecting names with this prefix.
    table_id: int
49
@dataclass
class AlphaFixup:
    """Pending entry of the alpha table, resolved once table ranges exist."""

    # Index into `tables` of the initial table for one first character.
    table_id: int
    # Slot in the alpha table; the script passes ord(first char) % 64.
    char: int
54
@dataclass
class StringFixup:
    """Pending subtable position inside `strings`, patched after layout."""

    # Index into `tables` of the subtable being referenced.
    table_id: int
    # Position in `strings` of the two placeholder items to overwrite.
    string_index: int
    # Index into `tables` of the table holding the referencing entry.
    super_table_id: int
    # Position of the referencing entry within that table.
    super_offset: int
61
# Keep only entity strings that end in a semicolon, ordered by their
# bare name (the text between the leading '&' and the trailing ';').
keys = [key for key in ents if key.endswith(';')]
keys.sort(key=lambda k: k[1:-1])

# Working state filled in by the main loop.
strings = []
prefix_stack = []
alpha_fixups = []
string_fixups = []

# Start with 64 empty tables, so ids handed out later begin at 64.
# Empty tables add no values when the tables are concatenated.
tables = [[] for _ in range(64)]
75
# Walk the sorted names once, appending one record per entity to
# `strings` and registering the record in the table of its current prefix.
for pos, key in enumerate(keys):
    name = key[1:-1]
    following = keys[pos + 1][1:-1] if pos + 1 < len(keys) else None

    # Leave every subtable whose prefix the current name does not extend.
    while prefix_stack:
        if name.startswith(prefix_stack[-1].prefix):
            break
        prefix_stack.pop()

    # An empty stack means a new first character: open its initial table
    # and remember that the alpha table needs an entry for it.
    if not prefix_stack:
        first_char = name[0]
        root_id = len(tables)
        tables.append([])
        prefix_stack.append(PrefixStackEntry(first_char, root_id))
        alpha_fixups.append(AlphaFixup(root_id, ord(first_char) % 64))

    parent = prefix_stack[-1]
    parent_table = tables[parent.table_id]
    record_start = len(strings)
    slot = len(parent_table)
    parent_table.append(record_start)

    # Only the part of the name after the already-matched prefix is stored.
    suffix = to_cchars(name[len(parent.prefix):])
    replacement = to_cchars(ents[key]['characters'])

    # Header item: suffix length, plus 0x80 when the entity string also
    # exists without the trailing semicolon.
    header = len(suffix)
    if key[:-1] in ents:
        header |= 0x80

    if not (following and following.startswith(name)):
        # Plain record: header, name suffix, replacement length, replacement.
        strings.extend([header, *suffix, len(replacement), *replacement])
        continue

    # The next name extends this one: flag the record with 0x40 and
    # reserve two items for the subtable position, patched later.
    strings.extend([
        header | 0x40, *suffix,
        0, 0,
        len(replacement), *replacement,
    ])

    sub_id = len(tables)
    tables.append([])
    string_fixups.append(StringFixup(
        table_id=sub_id,
        string_index=record_start + 1 + len(suffix),
        super_table_id=parent.table_id,
        super_offset=slot,
    ))
    prefix_stack.append(PrefixStackEntry(name, sub_id))
129
# Flatten all tables into `values`; table t occupies
# values[ranges[t]:ranges[t+1]].
values = []
ranges = [0]
for table in tables:
    values.extend(table)
    ranges.append(len(values))

# Alpha table: three items per slot, namely the low byte and the high
# part of the initial table's start in `values`, then its entry count.
alpha = [0] * (59 * 3)
for fixup in alpha_fixups:
    first = ranges[fixup.table_id]
    count = ranges[fixup.table_id + 1] - first
    base = fixup.char * 3
    alpha[base:base + 3] = [first & 0xFF, first >> 8, count]

# Replace the two reserved items of each subtable record with the
# subtable's start, relative to the referencing entry's position in
# `values`, and its entry count.
for fixup in string_fixups:
    first = ranges[fixup.table_id]
    count = ranges[fixup.table_id + 1] - first
    referrer = ranges[fixup.super_table_id] + fixup.super_offset
    at = fixup.string_index
    strings[at:at + 2] = [first - referrer, count]
152
153# Print tables
154
def gen_table(ctype, cname, values, fmt, elems_per_line):
    """Return the C source of a static const array holding `values`.

    ctype and cname give the element type and the array name. Each value
    is rendered with the %-format string `fmt`. A new line indented by
    four spaces starts before every `elems_per_line`-th element; other
    elements are preceded by a single space. Elements are separated by
    commas.
    """
    pieces = []
    for idx, value in enumerate(values):
        lead = '\n    ' if idx % elems_per_line == 0 else ' '
        pieces.append(lead + fmt % value)

    count = len(values)
    body = ','.join(pieces)
    return f'static const {ctype} {cname}[{count}] = {{{body}\n}};\n'
166
# Emit the three generated arrays on stdout, each followed by a blank line.
for ctype, cname, data, fmt, per_line in (
    ('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15),
    ('unsigned short', 'htmlEntValues', values, '%5d', 10),
    ('unsigned char', 'htmlEntStrings', strings, '%3s', 15),
):
    print(gen_table(ctype, cname, data, fmt, per_line))
169print(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))
170