xref: /aosp_15_r20/external/pcre/maint/GenerateCommon.py (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1#! /usr/bin/python
2
3#                   PCRE2 UNICODE PROPERTY SUPPORT
4#                   ------------------------------
5
# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various .c and .h files from Unicode data
# files. It was created as part of a reorganization of these scripts in
# December 2021.
10
11
12import re
13
14
15# ---------------------------------------------------------------------------
16#                             DATA LISTS
17# ---------------------------------------------------------------------------
18
# BIDI classes in the DerivedBidiClass.txt file, short and long identifiers.
# The list is flat: each short identifier is immediately followed by its long
# identifier, so even indexes hold short names and odd indexes hold long names.

bidi_classes = [
  'AL',  'Arabic_Letter',
  'AN',  'Arabic_Number',
  'B',   'Paragraph_Separator',
  'BN',  'Boundary_Neutral',
  'CS',  'Common_Separator',
  'EN',  'European_Number',
  'ES',  'European_Separator',
  'ET',  'European_Terminator',
  'FSI', 'First_Strong_Isolate',
  'L',   'Left_To_Right',
  'LRE', 'Left_To_Right_Embedding',
  'LRI', 'Left_To_Right_Isolate',
  'LRO', 'Left_To_Right_Override',
  'NSM', 'Nonspacing_Mark',
  'ON',  'Other_Neutral',
  'PDF', 'Pop_Directional_Format',
  'PDI', 'Pop_Directional_Isolate',
  'R',   'Right_To_Left',
  'RLE', 'Right_To_Left_Embedding',
  'RLI', 'Right_To_Left_Isolate',
  'RLO', 'Right_To_Left_Override',
  'S',   'Segment_Separator',
  'WS',  'White_Space'
  ]
46
# Particular category property names, with comments. NOTE: If ever this list
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
# must be edited to keep in step.
#
# The list is flat: each two-letter category code is immediately followed by
# its descriptive comment. The first letters of the codes are used further
# down in this file to derive general_category_names.

category_names = [
  'Cc', 'Control',
  'Cf', 'Format',
  'Cn', 'Unassigned',
  'Co', 'Private use',
  'Cs', 'Surrogate',
  'Ll', 'Lower case letter',
  'Lm', 'Modifier letter',
  'Lo', 'Other letter',
  'Lt', 'Title case letter',
  'Lu', 'Upper case letter',
  'Mc', 'Spacing mark',
  'Me', 'Enclosing mark',
  'Mn', 'Non-spacing mark',
  'Nd', 'Decimal number',
  'Nl', 'Letter number',
  'No', 'Other number',
  'Pc', 'Connector punctuation',
  'Pd', 'Dash punctuation',
  'Pe', 'Close punctuation',
  'Pf', 'Final punctuation',
  'Pi', 'Initial punctuation',
  'Po', 'Other punctuation',
  'Ps', 'Open punctuation',
  'Sc', 'Currency symbol',
  'Sk', 'Modifier symbol',
  'Sm', 'Mathematical symbol',
  'So', 'Other symbol',
  'Zl', 'Line separator',
  'Zp', 'Paragraph separator',
  'Zs', 'Space separator'
  ]
83
# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.
#
# The list is flat: each property name is immediately followed by a string
# that holds its index number and, for some entries, a short description.

break_properties = [
  'CR',                    ' 0',
  'LF',                    ' 1',
  'Control',               ' 2',
  'Extend',                ' 3',
  'Prepend',               ' 4',
  'SpacingMark',           ' 5',
  'L',                     ' 6 Hangul syllable type L',
  'V',                     ' 7 Hangul syllable type V',
  'T',                     ' 8 Hangul syllable type T',
  'LV',                    ' 9 Hangul syllable type LV',
  'LVT',                   '10 Hangul syllable type LVT',
  'Regional_Indicator',    '11',
  'Other',                 '12',
  'ZWJ',                   '13',
  'Extended_Pictographic', '14'
  ]
105
# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra property names to add. All three are consumed by getbpropslist().

# File names only; getbpropslist() opens them under Unicode.tables/.
bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
# Tried with re.match(), so each pattern is tested at the start of the name.
bool_propsignore = [r'^Other_', r'^Hyphen$']
# Appended to the names read from the files before the list is sorted.
bool_propsextras = ['ASCII', 'Bidi_Mirrored']
113
114
115# ---------------------------------------------------------------------------
116#                   GET BOOLEAN PROPERTY NAMES
117# ---------------------------------------------------------------------------
118
119# Get a list of Boolean property names from a number of files.
120
def getbpropslist():
  """Return the sorted list of Boolean property names.

  Each file named in bool_propsfiles is read from the Unicode.tables
  directory. '#' comments are stripped, each remaining line is split on
  semicolons, and the second field is taken as a property name. Names that
  match any pattern in bool_propsignore are skipped, names already collected
  are not added again, and the names in bool_propsextras are appended before
  the final sort.

  If a file cannot be opened, a message is printed and the script exits with
  status 1.
  """
  # Imported here because this function is called while the module is still
  # being loaded, before the module-level "import sys" further down has run;
  # without it the error path below would raise NameError instead of exiting.
  import sys

  bplist = []
  bplast = ""

  for filename in bool_propsfiles:
    path = 'Unicode.tables/' + filename
    try:
      file = open(path, 'r')
    except IOError:
      print(f"** Couldn't open {path}\n")
      sys.exit(1)

    # The context manager closes the file even if parsing raises.
    with file:
      for line in file:
        line = re.sub(r'#.*', '', line)
        data = [field.strip() for field in line.split(';')]

        # Skip lines without a second field, and runs of lines that repeat
        # the property name seen on the previous data line. bplast is
        # deliberately carried over from one file to the next.
        if len(data) <= 1 or data[1] == bplast:
          continue
        bplast = data[1]

        if any(re.match(pat, bplast) for pat in bool_propsignore):
          continue
        if bplast not in bplist:
          bplist.append(bplast)

  bplist.extend(bool_propsextras)
  bplist.sort()
  return bplist
150
# Build the Boolean property list once, while this module is being loaded.
bool_properties = getbpropslist()
# The number of properties divided by 32, rounded up. Presumably the number of
# 32-bit words needed for a bit map with one bit per Boolean property -
# confirm against the scripts that use this value.
bool_props_list_item_size = (len(bool_properties) + 31) // 32
153
154
155
156# ---------------------------------------------------------------------------
157#                  COLLECTING PROPERTY NAMES AND ALIASES
158# ---------------------------------------------------------------------------
159
# Script names; collect_property_names() appends to the initial 'Unknown'
# entry each script name in the order it first appears in Scripts.txt.
script_names = ['Unknown']
# Maps a script name or Boolean property name to a tuple of its alternative
# names (an empty tuple when it has none); filled in by
# collect_property_names().
abbreviations = {}
162
def collect_property_names():
  """Fill script_names and abbreviations from the Unicode data files.

  Scripts.txt supplies the script names, PropertyValueAliases.txt supplies
  the abbreviations of the scripts (its "sc" lines), and PropertyAliases.txt
  supplies the abbreviations of the properties listed in bool_properties.
  All three files are read from the Unicode.tables directory.
  """
  global script_names
  global abbreviations

  # A code point or code point range, then the script name, then a comment.
  script_line = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

  previous = ""
  with open("Unicode.tables/Scripts.txt") as scripts:
    for text in scripts:
      found = script_line.match(text)
      # A run of consecutive lines for one script yields a single entry.
      if found is not None and found.group(1) != previous:
        previous = found.group(1)
        script_names.append(previous)

  # A line may end with a comment, so the fields are picked out with a regex
  # rather than by splitting around the semicolons.
  value_line = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

  with open("Unicode.tables/PropertyValueAliases.txt") as aliases:
    for text in aliases:
      found = value_line.match(text)
      if found is None:
        continue

      prop, short, long_name, extra = found.groups()
      if prop != "sc":
        continue

      if short == long_name:
        abbreviations[long_name] = ()
      elif extra is None:
        abbreviations[long_name] = (short,)
      else:
        abbreviations[long_name] = (short, extra)

  # Boolean property abbreviations are collected into the same dictionary.
  bool_line = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')

  with open("Unicode.tables/PropertyAliases.txt") as aliases:
    for text in aliases:
      found = bool_line.match(text)
      if found is None:
        continue

      short, long_name, extra = found.groups()
      if short == long_name or long_name not in bool_properties:
        continue

      abbreviations[long_name] = (short,) if extra is None else (short, extra)
213
# Run while the module is being loaded, so that script_names and abbreviations
# are filled in before the code below uses them.
collect_property_names()
215
216
217
218# ---------------------------------------------------------------------------
219#                      REORDERING SCRIPT NAMES
220# ---------------------------------------------------------------------------
221
# One entry per entry of script_names, in the same order: the script's first
# abbreviation, or its name when it has none. Built by reorder_scripts().
script_abbrevs = []
223
def reorder_scripts():
  """Move the scripts named in ScriptExtensions.txt to the front.

  script_abbrevs first receives one entry per script: the script's first
  abbreviation, or the script name itself when it has no abbreviation.
  script_names and script_abbrevs are then both rebound to reordered lists in
  which every script whose abbreviation occurs in ScriptExtensions.txt comes
  before every script whose abbreviation does not. The relative order inside
  each of the two groups is unchanged.
  """
  global script_names
  global script_abbrevs
  global abbreviations

  for script in script_names:
    short_forms = abbreviations[script]
    script_abbrevs.append(short_forms[0] if short_forms else script)

  in_extensions = set()
  with open("Unicode.tables/ScriptExtensions.txt") as extensions:
    # A code point or range, then a space-separated list of abbreviations.
    ext_line = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')

    for text in extensions:
      found = ext_line.match(text)
      if found is not None:
        in_extensions.update(found.group(1).split(" "))

  # Partition in one pass; each group is a (names, abbreviations) pair.
  front = ([], [])
  back = ([], [])
  for idx, abbrev in enumerate(script_abbrevs):
    names, abbrevs = front if abbrev in in_extensions else back
    names.append(script_names[idx])
    abbrevs.append(abbrev)

  script_names = front[0] + back[0]
  script_abbrevs = front[1] + back[1]
261
# Reorder while the module is being loaded, then derive a size from the new
# position of 'Unknown' in script_names: that position divided by 32, rounded
# up. Presumably 'Unknown' is the first script that does not occur in
# ScriptExtensions.txt, making this the number of 32-bit words needed for a
# bit map of the scripts that do - confirm against the scripts that use it.
reorder_scripts()
script_list_item_size = (script_names.index('Unknown') + 31) // 32
264
265
266# ---------------------------------------------------------------------------
267#                         DERIVED LISTS
268# ---------------------------------------------------------------------------
269
270# Create general character property names from the first letters of the
271# particular categories.
272
# Entries at even indexes of category_names are the two-letter codes; the set
# removes the repeated first letters before the names are sorted.
gcn_set = {code[0] for code in category_names[::2]}
general_category_names = sorted(gcn_set)
276
277
278# ---------------------------------------------------------------------------
279#                           FUNCTIONS
280# ---------------------------------------------------------------------------
281
282import sys
283
284# Open an output file, using the command's argument or a default. Write common
285# preliminary header information.
286
def open_output(default):
  """Open the output file and write the standard header comment to it.

  The file name is the single command line argument when one is given,
  otherwise `default`. The header names the running script (sys.argv[0] with
  everything up to its last '/' removed) as the generator to be modified.

  Returns the file object, still open for writing, with the header already
  written. Prints a message and exits with status 1 when more than one
  argument is given or when the file cannot be opened.
  """
  argc = len(sys.argv)
  if argc > 2:
    print('** Too many arguments: just give a file name')
    sys.exit(1)

  output_name = sys.argv[1] if argc == 2 else default
  try:
    out = open(output_name, "w")
  except IOError:
    print("** Couldn't open %s" % output_name)
    sys.exit(1)

  # Keep only what follows the last '/' of the script's path; a path with no
  # '/' is used unchanged.
  generator = sys.argv[0].rpartition('/')[2]

  out.write("""\
/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
""")

  out.write("Instead, modify the maint/%s script and run it to generate\n"
  "a new version of this code.\n\n" % generator)

  out.write("""\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n""")
  return out
355
# End of GenerateCommon.py
357