#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------

# This file is a Python module containing common lists and functions for the
# GenerateXXX scripts that create various .c and .h files from Unicode data
# files. It was created as part of a re-organization of these scripts in
# December 2021.


import re


# ---------------------------------------------------------------------------
# DATA LISTS
# ---------------------------------------------------------------------------

# BIDI classes in the DerivedBidiClass.txt file, short and long identifiers.
# The list is flat: each short identifier is immediately followed by its long
# identifier.

bidi_classes = [
    'AL',  'Arabic_Letter',
    'AN',  'Arabic_Number',
    'B',   'Paragraph_Separator',
    'BN',  'Boundary_Neutral',
    'CS',  'Common_Separator',
    'EN',  'European_Number',
    'ES',  'European_Separator',
    'ET',  'European_Terminator',
    'FSI', 'First_Strong_Isolate',
    'L',   'Left_To_Right',
    'LRE', 'Left_To_Right_Embedding',
    'LRI', 'Left_To_Right_Isolate',
    'LRO', 'Left_To_Right_Override',
    'NSM', 'Nonspacing_Mark',
    'ON',  'Other_Neutral',
    'PDF', 'Pop_Directional_Format',
    'PDI', 'Pop_Directional_Isolate',
    'R',   'Right_To_Left',
    'RLE', 'Right_To_Left_Embedding',
    'RLI', 'Right_To_Left_Isolate',
    'RLO', 'Right_To_Left_Override',
    'S',   'Segment_Separator',
    'WS',  'White_Space',
]

# Particular category property names, with comments. NOTE: If ever this list
# is changed, the table called "catposstab" in the pcre2_auto_possess.c file
# must be edited to keep in step.

# Flat list: each two-letter category code is immediately followed by its
# descriptive comment text.

category_names = [
    'Cc', 'Control',
    'Cf', 'Format',
    'Cn', 'Unassigned',
    'Co', 'Private use',
    'Cs', 'Surrogate',
    'Ll', 'Lower case letter',
    'Lm', 'Modifier letter',
    'Lo', 'Other letter',
    'Lt', 'Title case letter',
    'Lu', 'Upper case letter',
    'Mc', 'Spacing mark',
    'Me', 'Enclosing mark',
    'Mn', 'Non-spacing mark',
    'Nd', 'Decimal number',
    'Nl', 'Letter number',
    'No', 'Other number',
    'Pc', 'Connector punctuation',
    'Pd', 'Dash punctuation',
    'Pe', 'Close punctuation',
    'Pf', 'Final punctuation',
    'Pi', 'Initial punctuation',
    'Po', 'Other punctuation',
    'Ps', 'Open punctuation',
    'Sc', 'Currency symbol',
    'Sk', 'Modifier symbol',
    'Sm', 'Mathematical symbol',
    'So', 'Other symbol',
    'Zl', 'Line separator',
    'Zp', 'Paragraph separator',
    'Zs', 'Space separator',
]

# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.

break_properties = [
    'CR',                    ' 0',
    'LF',                    ' 1',
    'Control',               ' 2',
    'Extend',                ' 3',
    'Prepend',               ' 4',
    'SpacingMark',           ' 5',
    'L',                     ' 6 Hangul syllable type L',
    'V',                     ' 7 Hangul syllable type V',
    'T',                     ' 8 Hangul syllable type T',
    'LV',                    ' 9 Hangul syllable type LV',
    'LVT',                   '10 Hangul syllable type LVT',
    'Regional_Indicator',    '11',
    'Other',                 '12',
    'ZWJ',                   '13',
    'Extended_Pictographic', '14',
]

# List of files from which the names of Boolean properties are obtained, along
# with a list of regex patterns for properties to be ignored, and a list of
# extra pattern names to add.

bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']


# ---------------------------------------------------------------------------
# GET BOOLEAN PROPERTY NAMES
# ---------------------------------------------------------------------------

# getbpropslist() is called while this module is being imported and uses
# sys.exit() on failure, so sys has to be imported before that call.

import sys

# Get a list of Boolean property names from a number of files.

def getbpropslist():
    """Return the sorted list of Boolean property names.

    Each file named in bool_propsfiles is read from the Unicode.tables
    directory. Text from '#' onwards is discarded, the line is split on ';',
    and the second field is taken as a property name. Names matching any
    pattern in bool_propsignore are skipped, duplicates are dropped, and the
    names in bool_propsextras are appended before the final sort.

    If a file cannot be opened, a message is printed and the process exits
    with status 1.
    """
    ignore_res = [re.compile(pat) for pat in bool_propsignore]
    bplist = []
    bplast = ""

    for filename in bool_propsfiles:
        path = 'Unicode.tables/' + filename
        try:
            file = open(path, 'r', encoding='utf-8')
        except IOError:
            print(f"** Couldn't open {path}\n")
            sys.exit(1)

        # The context manager closes the file even if reading fails part way.
        with file:
            for line in file:
                line = re.sub(r'#.*', '', line)
                data = [field.strip() for field in line.split(';')]
                # Consecutive lines usually repeat the same property name, so
                # comparing with the previous name skips most of the work.
                if len(data) <= 1 or data[1] == bplast:
                    continue
                bplast = data[1]
                if any(rx.match(bplast) for rx in ignore_res):
                    continue
                if bplast not in bplist:
                    bplist.append(bplast)

    bplist.extend(bool_propsextras)
    bplist.sort()
    return bplist

bool_properties = getbpropslist()

# Number of 32-entry units needed to cover all the Boolean properties.
bool_props_list_item_size = (len(bool_properties) + 31) // 32



# ---------------------------------------------------------------------------
# COLLECTING PROPERTY NAMES AND ALIASES
# ---------------------------------------------------------------------------

# script_names is extended with the names found in Scripts.txt; abbreviations
# maps a long script or Boolean property name to a tuple of its abbreviations.

script_names = ['Unknown']
abbreviations = {}

def collect_property_names():
    """Fill script_names and abbreviations from the Unicode data files.

    Script names come from Scripts.txt, in order of first appearance in
    consecutive runs. Script abbreviations come from the "sc" entries in
    PropertyValueAliases.txt, and abbreviations for the names already in
    bool_properties come from PropertyAliases.txt. Each abbreviations value
    is a tuple of zero, one or two strings.
    """
    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    last_script_name = ""
    with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None or match_obj.group(1) == last_script_name:
                continue

            last_script_name = match_obj.group(1)
            script_names.append(last_script_name)

    # Sometimes there is a comment in the line, so splitting around
    # semicolons is not enough.
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = value_alias_re.match(line)

            if match_obj is None:
                continue

            prop, short_name, long_name, extra = match_obj.groups()
            if prop != "sc":
                continue

            if short_name == long_name:
                abbreviations[long_name] = ()
            elif extra is None:
                abbreviations[long_name] = (short_name,)
            else:
                abbreviations[long_name] = (short_name, extra)

    # We can also collect Boolean property abbreviations into the same dictionary

    bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
    with open("Unicode.tables/PropertyAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = bin_alias_re.match(line)
            if match_obj is None:
                continue

            short_name, long_name, extra = match_obj.groups()
            if long_name != short_name and long_name in bool_properties:
                if extra is None:
                    abbreviations[long_name] = (short_name,)
                else:
                    abbreviations[long_name] = (short_name, extra)

collect_property_names()



# ---------------------------------------------------------------------------
# REORDERING SCRIPT NAMES
# ---------------------------------------------------------------------------

script_abbrevs = []

def reorder_scripts():
    """Move scripts that occur in ScriptExtensions.txt to the front.

    script_abbrevs is built with one entry per script name: the first
    abbreviation, or the name itself when it has no abbreviation. Both
    script_names and script_abbrevs are then rebound to new lists in which
    the scripts whose abbreviation appears in ScriptExtensions.txt come
    first. Relative order within each group is preserved.

    Raises KeyError if a script name has no entry in abbreviations.
    """
    global script_names
    global script_abbrevs

    abbrevs_by_position = []
    for name in script_names:
        abbrevs = abbreviations[name]
        abbrevs_by_position.append(abbrevs[0] if abbrevs else name)

    extended_script_abbrevs = set()
    with open("Unicode.tables/ScriptExtensions.txt", encoding="utf-8") as f:
        names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')

        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None:
                continue

            extended_script_abbrevs.update(match_obj.group(1).split(" "))

    # Stable partition: extended scripts first, then all the others.
    pairs = list(zip(script_names, abbrevs_by_position))
    extended = [pair for pair in pairs if pair[1] in extended_script_abbrevs]
    others = [pair for pair in pairs if pair[1] not in extended_script_abbrevs]
    ordered = extended + others

    script_names = [name for name, _ in ordered]
    script_abbrevs = [abbrev for _, abbrev in ordered]

reorder_scripts()

# Index of 'Unknown' after reordering, rounded up to whole 32-entry units.
script_list_item_size = (script_names.index('Unknown') + 31) // 32


# ---------------------------------------------------------------------------
# DERIVED LISTS
# ---------------------------------------------------------------------------

# Create general character property names from the first letters of the
# particular categories.

gcn_set = {code[0] for code in category_names[::2]}
general_category_names = sorted(gcn_set)


# ---------------------------------------------------------------------------
# FUNCTIONS
# ---------------------------------------------------------------------------

# Open an output file, using the command's argument or a default. Write common
# preliminary header information.

def open_output(default):
    """Open the output file and write the common header text to it.

    The file name is the command's single optional argument; `default` is
    used when no argument is given. If more than one argument is given, or
    the file cannot be opened for writing, a message is printed and the
    process exits with status 1.

    Returns the open file object, with the header already written.
    """
    argc = len(sys.argv)
    if argc > 2:
        print('** Too many arguments: just give a file name')
        sys.exit(1)

    output_name = sys.argv[1] if argc == 2 else default
    try:
        out = open(output_name, "w")
    except IOError:
        print("** Couldn't open %s" % output_name)
        sys.exit(1)

    # Only the part of the invoking script's path after the last '/' is
    # quoted in the generated header.
    script_name = sys.argv[0].rsplit('/', 1)[-1]

    # NOTE(review): the interior spacing of the banner text below is
    # reproduced as found in this file; confirm the column alignment against
    # the upstream PCRE2 header before regenerating any outputs.
    out.write("""\
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

 Written by Philip Hazel
 Original API code Copyright (c) 1997-2012 University of Cambridge
 New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
""")

    out.write("Instead, modify the maint/%s script and run it to generate\n"
              "a new version of this code.\n\n" % script_name)

    out.write("""\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
 this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.

 * Neither the name of the University of Cambridge nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n""")
    return out

# End of UcpCommon.py