#!/usr/bin/env python3
"""Convert the html5lib-tests tokenizer suites into paired test/result files
under test/html-tokenizer/ and result/html-tokenizer/."""

import glob
import json
import re

# Numeric IDs for the initial tokenizer states named in the html5lib-tests
# "initialStates" field.
state_map = {
    'Data state': 0,
    'RCDATA state': 1,
    'RAWTEXT state': 2,
    'PLAINTEXT state': 3,
    'Script data state': 4,
    'CDATA section state': 5,
}

for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
    match = re.search(r'/([^/]*)\.test$', filename)
    if match is None:
        continue
    testname = match[1]
    # The xmlViolation suite is not converted.
    if testname == 'xmlViolation':
        continue

    with open(filename) as json_data:
        root = json.load(json_data)

    test_out = open(f'test/html-tokenizer/{testname}.test', 'w')
    result_out = open(f'result/html-tokenizer/{testname}.test', 'w')

    counter = 0

    for tests in root.values():
        for test in tests:
            input = test['input']

            # Skip surrogate tests: lone surrogates cannot be encoded when
            # writing the generated files.
            if re.search(r'\\uD[89A-F]', input, re.I):
                continue

            # Decode literal \uXXXX escape sequences (as used by
            # double-escaped tests) into the characters they denote.
            input = re.sub(r'\\u([A-Fa-f0-9]{4})',
                           lambda m: chr(int(m[1], 16)),
                           input)

            # Serialize the expected tokens, one field per line.
            output = ''
            for token in test['output']:
                output += token[0] + '\n'

                if token[0] == 'DOCTYPE':
                    # Name, public ID and system ID; '<none>' marks a
                    # missing value.
                    for i in range(1, 4):
                        if token[i] is None:
                            output += '<none>\n'
                        else:
                            output += token[i] + '\n'
                else:
                    output += token[1]
                    if token[0] == 'StartTag':
                        # Attributes are appended as " name=value".
                        for name, value in token[2].items():
                            output += f' {name}={value}'
                    output += '\n'

            # Decode escapes in the expected output as well, and map NUL
            # to U+FFFD (the replacement character).
            output = re.sub(r'\\u([A-Fa-f0-9]{4})',
                            lambda m: chr(int(m[1], 16)),
                            output)
            output = re.sub(r'\x00', '\uFFFD', output)

            # Emit one record per initial tokenizer state (default: Data
            # state). The CDATA section state (5) is skipped.
            for state in test.get('initialStates', ['Data state']):
                state_no = state_map.get(state)
                if state_no is None:
                    raise Exception(f'{filename}: unknown state: {state}')
                if state_no == 5:
                    continue

                # '-' marks the absence of a last start tag.
                start_tag = test.get('lastStartTag', '-')

                # Test record: counter, last start tag, state ID and input
                # length in bytes, followed by the input itself.
                test_out.write(f'{counter} {start_tag} {state_no} '
                               f'{len(input.encode())}\n')
                test_out.write(input)
                test_out.write('\n')

                # Matching expected output, keyed by the same counter.
                result_out.write(f'{counter}\n')
                result_out.write(output)

                counter += 1

    test_out.close()
    result_out.close()
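
# Illustrative sketch of the generated record layout (hypothetical input,
# not taken from an actual html5lib test): for the input "<!-- x -->" run in
# the Data state with no last start tag, and assuming it is the first test
# in its file (counter 0), the test file would receive
#
#     0 - 0 10
#     <!-- x -->
#
# and the result file would receive the expected token, one field per line
# (the comment data " x " keeps its surrounding spaces):
#
#     0
#     Comment
#      x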