#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------

# This script generates the pcre2_ucptables.c file, which contains tables for
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
# order to reduce the number of relocations when loading the PCRE2 library, the
# names are held as a single large string, with offsets in the table. This is
# tedious to maintain by hand. Therefore, a script is used to generate the
# table.

# This script was created in December 2021 based on the previous GenerateUtt
# script, whose output had to be manually edited into pcre2_tables.c. Here is
# the history of the original script:

# -----------------------------------------------------------------------------
# Modified by PH 17-March-2009 to generate the more verbose form that works
#   for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
# Modified by ChPe 30-September-2012 to add this note; no other changes were
#   necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Script updated to Python 3 by running it through the 2to3 converter.
# Added script names for Unicode 7.0.0, 20-June-2014.
# Added script names for Unicode 8.0.0, 19-June-2015.
# Added script names for Unicode 10.0.0, 02-July-2017.
# Added script names for Unicode 11.0.0, 03-July-2018.
# Added 'Unknown' script, 01-October-2018.
# Added script names for Unicode 12.1.0, 27-July-2019.
# Added script names for Unicode 13.0.0, 10-March-2020.
# Added Script names for Unicode 14.0.0, PCRE2-10.39
# Added support for bidi class and bidi control, 06-December-2021
#   This also involved lower casing strings and removing underscores, in
#   accordance with Unicode's "loose matching" rules, which Perl observes.
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
# -----------------------------------------------------------------------------
#
# Note subsequent changes here:
#
# 27-December-2021: Added support for 4-letter script abbreviations.
# 10-January-2022: Further updates for Boolean property support
# -----------------------------------------------------------------------------


# Import common data lists and functions

from GenerateCommon import (
    abbreviations,
    bool_properties,
    bidi_classes,
    category_names,
    general_category_names,
    script_names,
    open_output,
)

# Open the output file (no return on failure). This call also writes standard
# header boilerplate.

f = open_output("pcre2_ucptables.c")

# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
# etc., along with comments. We need to add "bidi" in front of each value, in
# order to create names that don't clash with other types of property. The
# class names sit at the even indexes; the odd indexes hold the comments.

bidi_class_names = ["bidi" + bidi_class for bidi_class in bidi_classes[::2]]

# Remove the comments from other lists that contain them (same even/odd
# layout: keep every second entry, starting with the first).

category_names = category_names[::2]

# Create standardized versions of the names by lowercasing and removing
# underscores.
def stdname(x):
    """Return x in standardized form: lowercased, with underscores removed."""
    lowered = x.lower()
    return lowered.replace('_', '')


def stdnames(x):
    """Return a new list holding the standardized form of each name in x."""
    return [stdname(original) for original in x]


std_category_names = stdnames(category_names)
std_general_category_names = stdnames(general_category_names)
std_bidi_class_names = stdnames(bidi_class_names)
std_bool_properties = stdnames(bool_properties)

# Create the table, starting with the Unicode script, category and bidi class
# names. We keep both the standardized name and the original, because the
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
# still use the full original names.

utt_table = []

# Scripts listed before 'Unknown' are given type PT_SCX; 'Unknown' and any
# scripts after it are given type PT_SC.

scx_end = script_names.index('Unknown')

for idx, name in enumerate(script_names):
    pt_type = 'PT_SC' if idx >= scx_end else 'PT_SCX'
    utt_table.append((stdname(name), name, pt_type))
    # Every abbreviation maps to the same full script name and type.
    utt_table.extend(
        (stdname(abbrev), name, pt_type) for abbrev in abbreviations[name])

# Add the remaining property lists

utt_table += [(std, orig, 'PT_PC')
              for std, orig in zip(std_category_names, category_names)]
utt_table += [(std, orig, 'PT_GC')
              for std, orig in zip(std_general_category_names,
                                   general_category_names)]
utt_table += [(std, orig, 'PT_BIDICL')
              for std, orig in zip(std_bidi_class_names, bidi_class_names)]

# Boolean properties: abbreviations are optional for these, so only add them
# when the property has an entry in the abbreviations table.

for name in bool_properties:
    utt_table.append((stdname(name), name, 'PT_BOOL'))
    if name in abbreviations:
        utt_table.extend(
            (stdname(abbrev), name, 'PT_BOOL')
            for abbrev in abbreviations[name])

# Now add specials and synonyms. Note both the standardized and capitalized
# forms are needed.
utt_table.extend([
    ('any', 'Any', 'PT_ANY'),
    ('l&', 'L&', 'PT_LAMP'),
    ('lc', 'LC', 'PT_LAMP'),
    ('xan', 'Xan', 'PT_ALNUM'),
    ('xps', 'Xps', 'PT_PXSPACE'),
    ('xsp', 'Xsp', 'PT_SPACE'),
    ('xuc', 'Xuc', 'PT_UCNC'),
    ('xwd', 'Xwd', 'PT_WORD'),
])

# Remove duplicates from the table and then sort it. Going through a set drops
# repeated (standardized name, original name, type) tuples; sorted() then
# orders the tuples, so the standardized name is the primary sort key.

utt_table = sorted(set(utt_table))

# Output file-specific heading

f.write("""\
#ifdef SUPPORT_UNICODE

/* The PRIV(utt)[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and using offsets instead.
All letters are lower cased, and underscores are removed, in accordance with
the "loose matching" rules that Unicode advises and Perl uses. */
\n""")

# We have to use STR_ macros to define the strings so that it all works in
# UTF-8 mode on EBCDIC platforms.
# Each name gets a STRING_<name>0 macro whose expansion is the sequence of
# STR_<char> macros for its characters followed by a terminating "\0". The
# '&' in "l&" cannot appear in a C identifier, so it is spelled out as
# _AMPERSAND in the macro name and STR_AMPERSAND in the expansion.

for utt in utt_table:
    char_macros = ''.join(
        ' STR_AMPERSAND' if c == '&' else ' STR_%s' % c for c in utt[0])
    f.write('#define STRING_%s0%s "\\0"\n'
            % (utt[0].replace('&', '_AMPERSAND'), char_macros))

# The final table entry is identified by position rather than by comparing
# tuples, so the terminators below do not depend on the entries being unique.

last_index = len(utt_table) - 1

# Output the long string of concatenated names

f.write('\nconst char PRIV(utt_names)[] =\n')
for i, utt in enumerate(utt_table):
    terminator = ';' if i == last_index else ''
    f.write(' STRING_%s0%s\n'
            % (utt[0].replace('&', '_AMPERSAND'), terminator))

# Output the property type table. Each entry holds the offset of the name
# within PRIV(utt_names), the property type, and a value. The special types
# listed here get a value of 0; all others use ucp_<original name>.

no_value_types = ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                  'PT_SPACE', 'PT_UCNC', 'PT_WORD')

f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
offset = 0
for i, utt in enumerate(utt_table):
    value = '0' if utt[2] in no_value_types else 'ucp_' + utt[1]
    separator = '' if i == last_index else ','
    f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, separator))
    # Advance past this name and its terminating zero byte.
    offset += len(utt[0]) + 1
f.write('};\n\n')

# Ending text

f.write("""\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

#endif /* SUPPORT_UNICODE */

/* End of pcre2_ucptables.c */
""")

# The close method must actually be called: a bare "f.close" only references
# the method and leaves the output file open.

f.close()

# End