xref: /aosp_15_r20/external/pcre/maint/GenerateUcpTables.py (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi#! /usr/bin/python
2*22dc650dSSadaf Ebrahimi
3*22dc650dSSadaf Ebrahimi#                   PCRE2 UNICODE PROPERTY SUPPORT
4*22dc650dSSadaf Ebrahimi#                   ------------------------------
5*22dc650dSSadaf Ebrahimi
6*22dc650dSSadaf Ebrahimi# This script generates the pcre2_ucptables.c file, which contains tables for
7*22dc650dSSadaf Ebrahimi# recognizing Unicode property names. It is #included by pcre2_tables.c. In
8*22dc650dSSadaf Ebrahimi# order to reduce the number of relocations when loading the PCRE2 library, the
9*22dc650dSSadaf Ebrahimi# names are held as a single large string, with offsets in the table. This is
10*22dc650dSSadaf Ebrahimi# tedious to maintain by hand. Therefore, a script is used to generate the
11*22dc650dSSadaf Ebrahimi# table.
12*22dc650dSSadaf Ebrahimi
13*22dc650dSSadaf Ebrahimi# This script was created in December 2021 based on the previous GenerateUtt
14*22dc650dSSadaf Ebrahimi# script, whose output had to be manually edited into pcre2_tables.c. Here is
15*22dc650dSSadaf Ebrahimi# the history of the original script:
16*22dc650dSSadaf Ebrahimi
17*22dc650dSSadaf Ebrahimi# -----------------------------------------------------------------------------
18*22dc650dSSadaf Ebrahimi# Modified by PH 17-March-2009 to generate the more verbose form that works
19*22dc650dSSadaf Ebrahimi# for UTF-support in EBCDIC as well as ASCII environments.
20*22dc650dSSadaf Ebrahimi# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
21*22dc650dSSadaf Ebrahimi# Modified by PH 04-May-2010 to add new "X.." special categories.
22*22dc650dSSadaf Ebrahimi# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
23*22dc650dSSadaf Ebrahimi# Modified by ChPe 30-September-2012 to add this note; no other changes were
24*22dc650dSSadaf Ebrahimi# necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
26*22dc650dSSadaf Ebrahimi# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
27*22dc650dSSadaf Ebrahimi# Script updated to Python 3 by running it through the 2to3 converter.
28*22dc650dSSadaf Ebrahimi# Added script names for Unicode 7.0.0, 20-June-2014.
29*22dc650dSSadaf Ebrahimi# Added script names for Unicode 8.0.0, 19-June-2015.
30*22dc650dSSadaf Ebrahimi# Added script names for Unicode 10.0.0, 02-July-2017.
31*22dc650dSSadaf Ebrahimi# Added script names for Unicode 11.0.0, 03-July-2018.
32*22dc650dSSadaf Ebrahimi# Added 'Unknown' script, 01-October-2018.
33*22dc650dSSadaf Ebrahimi# Added script names for Unicode 12.1.0, 27-July-2019.
34*22dc650dSSadaf Ebrahimi# Added script names for Unicode 13.0.0, 10-March-2020.
35*22dc650dSSadaf Ebrahimi# Added Script names for Unicode 14.0.0, PCRE2-10.39
36*22dc650dSSadaf Ebrahimi# Added support for bidi class and bidi control, 06-December-2021
37*22dc650dSSadaf Ebrahimi#   This also involved lower casing strings and removing underscores, in
38*22dc650dSSadaf Ebrahimi#   accordance with Unicode's "loose matching" rules, which Perl observes.
39*22dc650dSSadaf Ebrahimi# Changed default script type from PT_SC to PT_SCX, 18-December-2021
40*22dc650dSSadaf Ebrahimi# -----------------------------------------------------------------------------
41*22dc650dSSadaf Ebrahimi#
42*22dc650dSSadaf Ebrahimi# Note subsequent changes here:
43*22dc650dSSadaf Ebrahimi#
44*22dc650dSSadaf Ebrahimi# 27-December-2021: Added support for 4-letter script abbreviations.
45*22dc650dSSadaf Ebrahimi# 10-January-2022:  Further updates for Boolean property support
46*22dc650dSSadaf Ebrahimi# -----------------------------------------------------------------------------
47*22dc650dSSadaf Ebrahimi
48*22dc650dSSadaf Ebrahimi
49*22dc650dSSadaf Ebrahimi# Import common data lists and functions
50*22dc650dSSadaf Ebrahimi
51*22dc650dSSadaf Ebrahimifrom GenerateCommon import \
52*22dc650dSSadaf Ebrahimi  abbreviations, \
53*22dc650dSSadaf Ebrahimi  bool_properties, \
54*22dc650dSSadaf Ebrahimi  bidi_classes, \
55*22dc650dSSadaf Ebrahimi  category_names, \
56*22dc650dSSadaf Ebrahimi  general_category_names, \
57*22dc650dSSadaf Ebrahimi  script_names, \
58*22dc650dSSadaf Ebrahimi  open_output
59*22dc650dSSadaf Ebrahimi
60*22dc650dSSadaf Ebrahimi# Open the output file (no return on failure). This call also writes standard
61*22dc650dSSadaf Ebrahimi# header boilerplate.
62*22dc650dSSadaf Ebrahimi
f = open_output("pcre2_ucptables.c")

# bidi_classes interleaves the Unicode class codes (AN, LRE, etc.) with their
# comments, the codes being at the even indexes. Each code gets "bidi" put in
# front of it so that the resulting names cannot clash with other types of
# property.

bidi_class_names = ["bidi" + code for code in bidi_classes[::2]]

# category_names is interleaved with comments in the same way; keep only the
# names themselves.

category_names = category_names[::2]
76*22dc650dSSadaf Ebrahimi
77*22dc650dSSadaf Ebrahimi# Create standardized versions of the names by lowercasing and removing
78*22dc650dSSadaf Ebrahimi# underscores.
79*22dc650dSSadaf Ebrahimi
def stdname(x):
  """Return the standardized form of x: lower cased, underscores removed."""
  return ''.join(x.lower().split('_'))
82*22dc650dSSadaf Ebrahimi
def stdnames(x):
  """Return a new list containing the stdname() form of each name in x.

  The result has one entry per element of x, in the same order; x itself is
  not modified.
  """
  return [stdname(name) for name in x]
88*22dc650dSSadaf Ebrahimi
# Standardized (lower cased, underscores removed) copies of the name lists.

std_category_names = stdnames(category_names)
std_general_category_names = stdnames(general_category_names)
std_bidi_class_names = stdnames(bidi_class_names)
std_bool_properties = stdnames(bool_properties)

# Build the table as (standardized name, original name, property type) tuples.
# The original name is kept alongside the standardized one because it is what
# the ucp_xx names are made from. NOTE: entries for script abbreviations carry
# the full original script name for the same reason.

utt_table = []

# Scripts listed before 'Unknown' are given type PT_SCX; 'Unknown' and any
# script after it are given PT_SC.

scx_end = script_names.index('Unknown')

for position, script in enumerate(script_names):
  script_type = 'PT_SC' if position >= scx_end else 'PT_SCX'
  for alias in [script, *abbreviations[script]]:
    utt_table.append((stdname(alias), script, script_type))

# The remaining list-based properties: pair each standardized name with its
# original name and the fixed type for that list.

for std_list, orig_list, list_type in (
    (std_category_names, category_names, 'PT_PC'),
    (std_general_category_names, general_category_names, 'PT_GC'),
    (std_bidi_class_names, bidi_class_names, 'PT_BIDICL')):
  utt_table.extend((std, orig, list_type) for std, orig in zip(std_list, orig_list))

# Boolean properties, plus their abbreviations when any are listed. The
# membership test comes first so that a property without abbreviations is
# never looked up in the abbreviations mapping.

for prop in bool_properties:
  aliases = abbreviations[prop] if prop in abbreviations else ()
  for alias in [prop, *aliases]:
    utt_table.append((stdname(alias), prop, 'PT_BOOL'))

# Specials and synonyms. Both the standardized and the capitalized forms are
# needed.

utt_table.extend([
  ('any', 'Any', 'PT_ANY'),
  ('l&',  'L&',  'PT_LAMP'),
  ('lc',  'LC',  'PT_LAMP'),
  ('xan', 'Xan', 'PT_ALNUM'),
  ('xps', 'Xps', 'PT_PXSPACE'),
  ('xsp', 'Xsp', 'PT_SPACE'),
  ('xuc', 'Xuc', 'PT_UCNC'),
  ('xwd', 'Xwd', 'PT_WORD'),
])

# Drop duplicate entries and put the table into sorted order.

utt_table = sorted(set(utt_table))
137*22dc650dSSadaf Ebrahimi
138*22dc650dSSadaf Ebrahimi# Output file-specific heading
139*22dc650dSSadaf Ebrahimi
f.write("""\
#ifdef SUPPORT_UNICODE

/* The PRIV(utt)[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and using offsets instead.
All letters are lower cased, and underscores are removed, in accordance with
the "loose matching" rules that Unicode advises and Perl uses. */
\n""")

# Each name is spelled out one character at a time with STR_ macros so that it
# all works in UTF-8 mode on EBCDIC platforms. An ampersand cannot appear in a
# macro name, so it is written as _AMPERSAND in the STRING_ identifier and as
# STR_AMPERSAND in the definition.

for entry in utt_table:
  std = entry[0]
  macro = '#define STRING_%s0' % std.replace('&', '_AMPERSAND')
  chars = ''.join(
    ' STR_AMPERSAND' if ch == '&' else ' STR_%s' % ch for ch in std)
  f.write(macro + chars + ' "\\0"\n')
164*22dc650dSSadaf Ebrahimi
165*22dc650dSSadaf Ebrahimi# Output the long string of concatenated names
166*22dc650dSSadaf Ebrahimi
f.write('\nconst char PRIV(utt_names)[] =\n')

# One STRING_ macro per table entry; the C statement is terminated with a
# semicolon on the line for the final entry.

final_entry = utt_table[-1] if utt_table else None
terminator = ''
for entry in utt_table:
  if entry == final_entry:
    terminator = ';'
  macro_suffix = entry[0].replace('&', '_AMPERSAND')
  f.write('  STRING_%s0%s\n' % (macro_suffix, terminator))
173*22dc650dSSadaf Ebrahimi
174*22dc650dSSadaf Ebrahimi# Output the property type table
175*22dc650dSSadaf Ebrahimi
f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')

# Property types whose table entry carries the value 0 instead of a ucp_ name.
valueless_types = ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                   'PT_SPACE', 'PT_UCNC', 'PT_WORD')

final_entry = utt_table[-1] if utt_table else None
name_offset = 0
separator = ','
for entry in utt_table:
  value = '0' if entry[2] in valueless_types else 'ucp_' + entry[1]
  if entry == final_entry:
    separator = ''   # no trailing comma after the final entry
  f.write('  { %3d, %s, %s }%s\n' % (name_offset, entry[2], value, separator))
  # Advance past this name and the "\0" that ends it in the names string.
  name_offset += len(entry[0]) + 1
f.write('};\n\n')
190*22dc650dSSadaf Ebrahimi
191*22dc650dSSadaf Ebrahimi# Ending text
192*22dc650dSSadaf Ebrahimi
f.write("""\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

#endif /* SUPPORT_UNICODE */

/* End of pcre2_ucptables.c */
""")

# Close the output file explicitly. The call parentheses are required: a bare
# "f.close" only looks the method up and does nothing.

f.close()
202*22dc650dSSadaf Ebrahimi
203*22dc650dSSadaf Ebrahimi# End
204