xref: /aosp_15_r20/external/libxml2/tools/genHtml5LibTests.py (revision 7c5688314b92172186c154356a6374bf7684c3ca)
1#!/usr/bin/env python3
2
3import glob
4import json
5import re
6
# Numeric code written into each generated test header for the tokenizer's
# initial state.
# NOTE(review): presumably these numbers must match what the consumer of the
# generated files expects -- confirm before renumbering.
state_map = {
    'Data state':          0,
    'RCDATA state':        1,
    'RAWTEXT state':       2,
    'PLAINTEXT state':     3,
    'Script data state':   4,
    'CDATA section state': 5,
}

# Tests starting in this state are accepted as known but never emitted.
SKIPPED_STATE = state_map['CDATA section state']

# A literal backslash-u escape followed by exactly four hex digits.
UNICODE_ESCAPE = re.compile(r'\\u([A-Fa-f0-9]{4})')
# An escape in the surrogate range (D800-DFFF), matched case-insensitively.
SURROGATE_ESCAPE = re.compile(r'\\uD[89A-F]', re.I)
# Captures the base name of a ".test" file from a slash-separated path.
TEST_NAME = re.compile(r'/([^/]*)\.test$')


def unescape(text):
    r"""Return *text* with each literal \uXXXX escape replaced by its character."""
    return UNICODE_ESCAPE.sub(lambda m: chr(int(m[1], 16)), text)


def serialize_tokens(tokens):
    r"""Return the expected-result text for one test's list of tokens.

    Every token contributes a line holding its type (``token[0]``) followed
    by its payload:

    * ``DOCTYPE``: three lines for ``token[1]`` .. ``token[3]`` (presumably
      name, public id and system id), with ``<none>`` standing in for
      ``None``.
    * anything else: one line holding ``token[1]``; for ``StartTag`` the
      attributes in ``token[2]`` are appended as `` name=value`` pairs.

    \uXXXX escapes are decoded afterwards and every NUL character is
    replaced by U+FFFD.
    """
    lines = []
    for token in tokens:
        kind = token[0]
        lines.append(kind)

        if kind == 'DOCTYPE':
            for i in range(1, 4):
                lines.append('<none>' if token[i] is None else token[i])
        else:
            line = token[1]
            if kind == 'StartTag':
                line += ''.join(f' {name}={value}'
                                for name, value in token[2].items())
            lines.append(line)

    text = ''.join(line + '\n' for line in lines)
    return unescape(text).replace('\x00', '\uFFFD')


def write_tests(root, filename, test_out, result_out):
    """Write all test cases found in *root* to the two output streams.

    Args:
        root: Parsed JSON document; each value is a list of test dicts with
            an ``input`` string, an ``output`` token list and optionally
            ``initialStates`` and ``lastStartTag``.
        filename: Name of the source file, used only in error messages.
        test_out: Text stream receiving, per case, a header line
            ``<counter> <lastStartTag or -> <state code> <input byte length>``
            followed by the input and a newline.
        result_out: Text stream receiving, per case, the counter on its own
            line followed by the serialized expected tokens.

    Returns:
        The number of test cases written.

    Raises:
        ValueError: If a test names an initial state missing from
            ``state_map``.
    """
    counter = 0

    # The counter runs across all groups so case numbers stay unique
    # within one output file.
    for tests in root.values():
        for test in tests:
            text = test['input']

            # Skip surrogate tests
            if SURROGATE_ESCAPE.search(text):
                continue

            text = unescape(text)
            expected = serialize_tokens(test['output'])
            start_tag = test.get('lastStartTag', '-')

            # One case is emitted per requested initial state.
            for state in test.get('initialStates', ['Data state']):
                state_no = state_map.get(state)
                if state_no is None:
                    raise ValueError(f'{filename}: unknown state: {state}')
                if state_no == SKIPPED_STATE:
                    continue

                # The length is the UTF-8 byte count, so the output stream
                # must be UTF-8 and must not translate newlines.
                test_out.write(f'{counter} {start_tag} {state_no} '
                               f'{len(text.encode())}\n')
                test_out.write(text)
                test_out.write('\n')

                result_out.write(f'{counter}\n')
                result_out.write(expected)

                counter += 1

    return counter


for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
    match = TEST_NAME.search(filename)
    if match is None:
        continue
    testname = match[1]
    if testname == 'xmlViolation':
        continue

    with open(filename, encoding='utf-8') as json_data:
        root = json.load(json_data)

    # The context managers close both files exactly once, after every group
    # has been written and also when an exception is raised.
    # newline='' disables newline translation so the declared byte lengths
    # match the bytes on disk.
    with open(f'test/html-tokenizer/{testname}.test', 'w',
              encoding='utf-8', newline='') as test_out:
        with open(f'result/html-tokenizer/{testname}.test', 'w',
                  encoding='utf-8', newline='') as result_out:
            write_tests(root, filename, test_out, result_out)
87