xref: /aosp_15_r20/external/pcre/maint/GenerateUcpTables.py (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1#! /usr/bin/python
2
3#                   PCRE2 UNICODE PROPERTY SUPPORT
4#                   ------------------------------
5
6# This script generates the pcre2_ucptables.c file, which contains tables for
7# recognizing Unicode property names. It is #included by pcre2_tables.c. In
8# order to reduce the number of relocations when loading the PCRE2 library, the
9# names are held as a single large string, with offsets in the table. This is
10# tedious to maintain by hand. Therefore, a script is used to generate the
11# table.
12
13# This script was created in December 2021 based on the previous GenerateUtt
14# script, whose output had to be manually edited into pcre2_tables.c. Here is
15# the history of the original script:
16
17# -----------------------------------------------------------------------------
18# Modified by PH 17-March-2009 to generate the more verbose form that works
19# for UTF-support in EBCDIC as well as ASCII environments.
20# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
21# Modified by PH 04-May-2010 to add new "X.." special categories.
22# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
23# Modified by ChPe 30-September-2012 to add this note; no other changes were
24# necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
26# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
27# Script updated to Python 3 by running it through the 2to3 converter.
28# Added script names for Unicode 7.0.0, 20-June-2014.
29# Added script names for Unicode 8.0.0, 19-June-2015.
30# Added script names for Unicode 10.0.0, 02-July-2017.
31# Added script names for Unicode 11.0.0, 03-July-2018.
32# Added 'Unknown' script, 01-October-2018.
33# Added script names for Unicode 12.1.0, 27-July-2019.
34# Added script names for Unicode 13.0.0, 10-March-2020.
35# Added Script names for Unicode 14.0.0, PCRE2-10.39
36# Added support for bidi class and bidi control, 06-December-2021
37#   This also involved lower casing strings and removing underscores, in
38#   accordance with Unicode's "loose matching" rules, which Perl observes.
39# Changed default script type from PT_SC to PT_SCX, 18-December-2021
40# -----------------------------------------------------------------------------
41#
42# Note subsequent changes here:
43#
44# 27-December-2021: Added support for 4-letter script abbreviations.
45# 10-January-2022:  Further updates for Boolean property support
46# -----------------------------------------------------------------------------
47
48
49# Import common data lists and functions
50
51from GenerateCommon import \
52  abbreviations, \
53  bool_properties, \
54  bidi_classes, \
55  category_names, \
56  general_category_names, \
57  script_names, \
58  open_output
59
60# Open the output file (no return on failure). This call also writes standard
61# header boilerplate.
62
f = open_output("pcre2_ucptables.c")

# bidi_classes interleaves the Unicode class codes (AN, LRE, ...) with comment
# strings, so only the even-indexed items are codes. Each code is prefixed
# with "bidi" so that the resulting names cannot clash with the names of other
# kinds of property.

bidi_class_names = ["bidi" + code for code in bidi_classes[::2]]

# category_names is interleaved with comments in the same way; keep just the
# even-indexed items.

category_names = category_names[0::2]
76
77# Create standardized versions of the names by lowercasing and removing
78# underscores.
79
def stdname(x):
  """Return the standardized form of the name x.

  The name is lower cased and every underscore is dropped; all other
  characters are left as they are.
  """
  lowered = x.lower()
  # Splitting on '_' and re-joining with nothing removes the underscores.
  return ''.join(lowered.split('_'))
82
def stdnames(x):
  """Return a new list holding the standardized form of each name in x.

  The result has the same length and order as x; each item is whatever
  stdname() returns for the corresponding input item.
  """
  return [stdname(item) for item in x]
88
# Standardized counterparts of the name lists, in the same order as the
# originals so that the two can be paired up item by item.

std_category_names = [stdname(n) for n in category_names]
std_general_category_names = [stdname(n) for n in general_category_names]
std_bidi_class_names = [stdname(n) for n in bidi_class_names]
std_bool_properties = [stdname(n) for n in bool_properties]
93
94# Create the table, starting with the Unicode script, category and bidi class
95# names. We keep both the standardized name and the original, because the
96# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
97# still use the full original names.
98
# Scripts listed before 'Unknown' get type PT_SCX; 'Unknown' and anything
# after it get PT_SC.

scx_end = script_names.index('Unknown')

utt_table = []

for idx, name in enumerate(script_names):
  if idx < scx_end:
    pt_type = 'PT_SCX'
  else:
    pt_type = 'PT_SC'

  # One entry for the script's own name followed by one per abbreviation; all
  # of them carry the full original script name in the second field.
  spellings = [name] + list(abbreviations[name])
  utt_table.extend((stdname(s), name, pt_type) for s in spellings)
108
109# Add the remaining property lists
110
# Each entry is (standardized name, original name, property type).

utt_table.extend(
  (std, orig, 'PT_PC')
  for std, orig in zip(std_category_names, category_names))

utt_table.extend(
  (std, orig, 'PT_GC')
  for std, orig in zip(std_general_category_names, general_category_names))

utt_table.extend(
  (std, orig, 'PT_BIDICL')
  for std, orig in zip(std_bidi_class_names, bidi_class_names))

# Boolean properties: the property's own name, plus any abbreviations that
# are recorded for it. Every entry keeps the full property name in the second
# field.

for name in bool_properties:
  short_forms = abbreviations[name] if name in abbreviations else ()
  utt_table.append((stdname(name), name, 'PT_BOOL'))
  utt_table.extend((stdname(s), name, 'PT_BOOL') for s in short_forms)
120
121# Now add specials and synonyms. Note both the standardized and capitalized
122# forms are needed.
123
utt_table.extend([
  ('any', 'Any', 'PT_ANY'),
  ('l&',  'L&',  'PT_LAMP'),
  ('lc',  'LC',  'PT_LAMP'),
  ('xan', 'Xan', 'PT_ALNUM'),
  ('xps', 'Xps', 'PT_PXSPACE'),
  ('xsp', 'Xsp', 'PT_SPACE'),
  ('xuc', 'Xuc', 'PT_UCNC'),
  ('xwd', 'Xwd', 'PT_WORD'),
])

# Drop exact duplicate entries, then put the table in sorted order. Tuples
# compare field by field, so the standardized name is the primary sort key.

utt_table = sorted(set(utt_table))
137
138# Output file-specific heading
139
f.write("""\
#ifdef SUPPORT_UNICODE

/* The PRIV(utt)[] table below translates Unicode property names into type and
code values. It is searched by binary chop, so must be in collating sequence of
name. Originally, the table contained pointers to the name strings in the first
field of each entry. However, that leads to a large number of relocations when
a shared library is dynamically loaded. A significant reduction is made by
putting all the names into a single, large string and using offsets instead.
All letters are lower cased, and underscores are removed, in accordance with
the "loose matching" rules that Unicode advises and Perl uses. */
\n""")

# The strings are spelled out character by character with STR_ macros so that
# it all works in UTF-8 mode on EBCDIC platforms. An ampersand cannot appear
# in a macro name, so it is written as AMPERSAND instead.

for utt in utt_table:
  std = utt[0]
  macro = 'STRING_' + std.replace('&', '_AMPERSAND') + '0'
  chars = ['STR_AMPERSAND' if c == '&' else 'STR_' + c for c in std]
  # Emit the whole #define as one line: macro name, one STR_ token per
  # character, and the terminating "\0" string.
  f.write('#define ' + ' '.join([macro] + chars) + ' "\\0"\n')
164
165# Output the long string of concatenated names
166
f.write('\nconst char PRIV(utt_names)[] =\n')

# Every line names one STRING_ macro; the C statement is terminated by
# appending a semicolon once the entry equal to the final table entry is
# reached.

last = ''
for utt in utt_table:
  if utt == utt_table[-1]:
    last = ';'
  macro = 'STRING_' + utt[0].replace('&', '_AMPERSAND') + '0'
  f.write('  ' + macro + last + '\n')
173
174# Output the property type table
175
f.write('\nconst ucp_type_table PRIV(utt)[] = {\n')

# Property types whose table entry carries a literal 0 rather than a ucp_xx
# name.

valueless_types = ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE',
                   'PT_SPACE', 'PT_UCNC', 'PT_WORD')

# offset tracks where each name starts within the concatenated names string:
# every name occupies its own length plus one byte for the "\0" that ends it.

offset = 0
last = ','
for utt in utt_table:
  value = '0' if utt[2] in valueless_types else 'ucp_' + utt[1]
  # No trailing comma once the entry equal to the final table entry is
  # reached.
  if utt == utt_table[-1]:
    last = ''
  f.write('  { %3d, %s, %s }%s\n' % (offset, utt[2], value, last))
  offset += 1 + len(utt[0])
f.write('};\n\n')
190
191# Ending text
192
f.write("""\
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

#endif /* SUPPORT_UNICODE */

/* End of pcre2_ucptables.c */
""")

# Close the output file. The call parentheses are required: a bare "f.close"
# merely names the method without invoking it, so the file would never be
# explicitly closed and any buffered output would be left for interpreter
# shutdown to deal with.

f.close()
202
203# End
204