#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------

# This script generates the pcre2_ucptables.c file, which contains tables for
# recognizing Unicode property names. It is #included by pcre2_tables.c. In
# order to reduce the number of relocations when loading the PCRE2 library, the
# names are held as a single large string, with offsets in the table. This is
# tedious to maintain by hand. Therefore, a script is used to generate the
# table.

# This script was created in December 2021 based on the previous GenerateUtt
# script, whose output had to be manually edited into pcre2_tables.c. Here is
# the history of the original script:

# -----------------------------------------------------------------------------
# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
# Modified by ChPe 30-September-2012 to add this note; no other changes were
# necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Script updated to Python 3 by running it through the 2to3 converter.
# Added script names for Unicode 7.0.0, 20-June-2014.
# Added script names for Unicode 8.0.0, 19-June-2015.
# Added script names for Unicode 10.0.0, 02-July-2017.
# Added script names for Unicode 11.0.0, 03-July-2018.
# Added 'Unknown' script, 01-October-2018.
# Added script names for Unicode 12.1.0, 27-July-2019.
# Added script names for Unicode 13.0.0, 10-March-2020.
# Added Script names for Unicode 14.0.0, PCRE2-10.39
# Added support for bidi class and bidi control, 06-December-2021
#   This also involved lower casing strings and removing underscores, in
#   accordance with Unicode's "loose matching" rules, which Perl observes.
# Changed default script type from PT_SC to PT_SCX, 18-December-2021
# -----------------------------------------------------------------------------
#
# Note subsequent changes here:
#
# 27-December-2021: Added support for 4-letter script abbreviations.
# 10-January-2022: Further updates for Boolean property support
# -----------------------------------------------------------------------------


# Import common data lists and functions

from GenerateCommon import (
    abbreviations,
    bool_properties,
    bidi_classes,
    category_names,
    general_category_names,
    script_names,
    open_output,
)

# Open the output file (no return on failure). This call also writes standard
# header boilerplate.

f = open_output("pcre2_ucptables.c")

# The list in bidi_classes contains just the Unicode classes such as AN, LRE,
# etc., along with comments: class names sit at the even indexes and each is
# followed by its comment. Prefix every class name with "bidi" so that the
# resulting names don't clash with other types of property.

bidi_class_names = ["bidi" + bidi_class for bidi_class in bidi_classes[::2]]

# Remove the comments from other lists that contain them (same layout: every
# second element is a comment).

category_names = category_names[::2]

# Create standardized versions of the names by lowercasing and removing
# underscores.
def stdname(x):
    """Return the standardized form of x: lower case, underscores removed."""
    lowered = x.lower()
    return lowered.replace('_', '')


def stdnames(x):
    """Return a new list holding the standardized form of each name in x."""
    return [stdname(name) for name in x]


std_category_names = stdnames(category_names)
std_general_category_names = stdnames(general_category_names)
std_bidi_class_names = stdnames(bidi_class_names)
std_bool_properties = stdnames(bool_properties)

# Create the table, starting with the Unicode script, category and bidi class
# names. We keep both the standardized name and the original, because the
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
# still use the full original names.
# Each entry is a (standardized name, original name, property type) tuple.

utt_table = []

# Scripts listed before 'Unknown' get the PT_SCX type; 'Unknown' and anything
# after it get PT_SC.

scx_end = script_names.index('Unknown')

for position, script in enumerate(script_names):
    if position < scx_end:
        script_type = 'PT_SCX'
    else:
        script_type = 'PT_SC'
    utt_table.append((stdname(script), script, script_type))
    # Abbreviations map back to the full script name.
    utt_table.extend(
        (stdname(short), script, script_type)
        for short in abbreviations[script])

# Add the remaining property lists

utt_table.extend(
    (std, orig, 'PT_PC')
    for std, orig in zip(std_category_names, category_names))
utt_table.extend(
    (std, orig, 'PT_GC')
    for std, orig in zip(std_general_category_names, general_category_names))
utt_table.extend(
    (std, orig, 'PT_BIDICL')
    for std, orig in zip(std_bidi_class_names, bidi_class_names))

# Boolean properties, plus their abbreviations where any are listed.

for prop in bool_properties:
    utt_table.append((stdname(prop), prop, 'PT_BOOL'))
    if prop not in abbreviations:
        continue
    utt_table.extend(
        (stdname(short), prop, 'PT_BOOL')
        for short in abbreviations[prop])

# Now add specials and synonyms. Note both the standardized and capitalized
# forms are needed.
utt_table.extend([
    ('any', 'Any', 'PT_ANY'),
    ('l&', 'L&', 'PT_LAMP'),
    ('lc', 'LC', 'PT_LAMP'),
    ('xan', 'Xan', 'PT_ALNUM'),
    ('xps', 'Xps', 'PT_PXSPACE'),
    ('xsp', 'Xsp', 'PT_SPACE'),
    ('xuc', 'Xuc', 'PT_UCNC'),
    ('xwd', 'Xwd', 'PT_WORD'),
])

# Remove duplicates from the table and then sort it. After this every entry is
# unique, so the final entry can be recognized by its position.

utt_table = sorted(set(utt_table))
final_index = len(utt_table) - 1

# Output file-specific heading

f.write("""\
#ifdef SUPPORT_UNICODE

/* The PRIV(utt)[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and using offsets instead.
All letters are lower cased, and underscores are removed, in accordance with
the "loose matching" rules that Unicode advises and Perl uses. */
\n""")

# We have to use STR_ macros to define the strings so that it all works in
# UTF-8 mode on EBCDIC platforms. One macro is emitted per table entry: the
# name spelled out character by character, followed by a zero terminator.

for std_name, _orig_name, _prop_type in utt_table:
    f.write('#define STRING_%s0' % std_name.replace('&', '_AMPERSAND'))
    for ch in std_name:
        # '&' is not valid in a macro name, so it is spelled out.
        f.write(' STR_AMPERSAND' if ch == '&' else ' STR_%s' % ch)
    f.write(' "\\0"\n')

# Output the long string of concatenated names; only the final line carries
# the terminating semicolon.

f.write('\nconst char PRIV(utt_names)[] =\n')
for index, entry in enumerate(utt_table):
    terminator = ';' if index == final_index else ''
    macro_suffix = entry[0].replace('&', '_AMPERSAND')
    f.write(' STRING_%s0%s\n' % (macro_suffix, terminator))

# Output the property type table. Each row holds the offset of the name within
# the concatenated string, the property type, and the property value; the
# special types listed below take the value 0 rather than a ucp_ name.

f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')
valueless_types = ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                   'PT_SPACE', 'PT_UCNC', 'PT_WORD')
offset = 0
for index, (std_name, orig_name, prop_type) in enumerate(utt_table):
    if prop_type in valueless_types:
        value = '0'
    else:
        value = 'ucp_' + orig_name
    separator = '' if index == final_index else ','
    f.write(' { %3d, %s, %s }%s\n' % (offset, prop_type, value, separator))
    # Advance past this name and its zero terminator.
    offset += len(std_name) + 1
f.write('};\n\n')

# Ending text

f.write("""\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

#endif /* SUPPORT_UNICODE */

/* End of pcre2_ucptables.c */
""")

# Close the output file explicitly. The call parentheses are required: a bare
# "f.close" merely references the method without invoking it, leaving the
# final flush and close to happen only implicitly at interpreter exit.

f.close()

# End