# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Parser for Unicode data files (as distributed by unicode.org)."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re
import urllib.request

# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "https://www.unicode.org/Public/15.1.0/ucd"

# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF


class Error(Exception):
    """Unicode error base class."""


class InputError(Error):
    """Unicode input error class. Raised on invalid input."""


def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    Args:
      s: string to convert; must be 4 to 6 characters of hexadecimal.

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    try:
        v = int(s, 16)
    except ValueError:
        # Fold "not hex at all" into the same range check used below.
        v = -1
    if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v


def _URange(s):
    """Converts string to Unicode range.

      '0001..0003' => [1, 2, 3].
      '0001' => [1].

    Args:
      s: string to convert

    Returns:
      Unicode range: a one-element list for a single code point, or a
      range object covering lo..hi inclusive for the '..' form.

    Raises:
      InputError: the string is not a valid Unicode range (this includes
        a '..' range whose low end is not strictly below its high end).
    """
    a = s.split("..")
    if len(a) == 1:
        return [_UInt(a[0])]
    if len(a) == 2:
        lo = _UInt(a[0])
        hi = _UInt(a[1])
        if lo < hi:
            return range(lo, hi + 1)
    raise InputError("invalid Unicode range %s" % (s,))


def _ParseContinue(s):
    """Parses a Unicode continuation field.

    These are of the form '<Name, First>' or '<Name, Last>'.
    Instead of giving an explicit range in a single table entry,
    some Unicode tables use two entries, one for the first
    code value in the range and one for the last.
    The first entry's description is '<Name, First>' instead of 'Name'
    and the second is '<Name, Last>'.

      '<Name, First>' => ('Name', 'First')
      '<Name, Last>' => ('Name', 'Last')
      'Anything else' => ('Anything else', None)

    Args:
      s: continuation field string

    Returns:
      pair: name and ('First', 'Last', or None)
    """
    match = re.match(r"<(.*), (First|Last)>", s)
    if match is not None:
        return match.groups()
    return (s, None)


def _ReadTableLines(fil, filename, nfields, doline):
    """Parses the lines of an already-open Unicode table.

    Args:
      fil: iterable of lines, either bytes or str.
      filename: value used to label error messages.
      nfields: the number of expected fields per line.
      doline: the function to call for each table entry.

    Raises:
      InputError: a line is malformed, or a 'First' line has no
        matching 'Last' line.
    """
    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    for line in fil:
        lineno += 1
        try:
            # Binary files and URL responses yield bytes; text-mode
            # file-like objects yield str and need no decoding.
            if isinstance(line, bytes):
                line = line.decode("latin1")

            # Chop # comments and white space; ignore empty lines.
            sharp = line.find("#")
            if sharp >= 0:
                line = line[:sharp]
            line = line.strip()
            if not line:
                continue

            # Split fields on ";", chop more white space.
            # Must have the expected number of fields.
            fields = [s.strip() for s in line.split(";")]
            if len(fields) != nfields:
                raise InputError("wrong number of fields %d %d - %s" %
                                 (len(fields), nfields, line))

            # The Unicode text files have two different ways
            # to list a Unicode range.  Either the first field is
            # itself a range (0000..FFFF), or the range is split
            # across two lines, with the second field noting
            # the continuation.
            codes = _URange(fields[0])
            (name, cont) = _ParseContinue(fields[1])

            if expect_last is not None:
                # If the last line gave the First code in a range,
                # this one had better give the Last one.
                if (len(codes) != 1 or codes[0] <= first or
                        cont != "Last" or name != expect_last):
                    raise InputError("expected Last line for %s" %
                                     (expect_last,))
                codes = range(first, codes[0] + 1)
                first = None
                expect_last = None
                # Rewrite the fields so doline sees one ordinary range entry.
                fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                fields[1] = name
            elif cont == "First":
                # Otherwise, if this is the First code in a range,
                # remember it and go to the next line.
                if len(codes) != 1:
                    raise InputError("bad First line: range given")
                expect_last = name
                first = codes[0]
                continue

            doline(codes, fields)

        except Exception as e:
            # Report where the failure happened, then propagate it.
            print("%s:%d: %s" % (filename, lineno, e))
            raise

    if expect_last is not None:
        raise InputError("expected Last line for %s; got EOF" %
                         (expect_last,))


def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    If parsing a line or calling doline raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and re-raises it; processing stops at that line.

    Arguments:
      filename: the Unicode data file to read (a local path or an
        "https://" URL), or a file-like object / iterable of lines.
        Lines may be bytes (decoded as latin1) or str.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), or the table
        contents are malformed.
    """
    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    if not isinstance(filename, str):
        # Caller-supplied object: the caller owns it, so do not close it.
        _ReadTableLines(filename, filename, nfields, doline)
        return

    if filename.startswith("https://"):
        fil = urllib.request.urlopen(filename)
    else:
        fil = open(filename, "rb")
    try:
        _ReadTableLines(fil, filename, nfields, doline)
    finally:
        # We opened it, so release it even if parsing failed.
        fil.close()


def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns Unicode code groups equivalent under case folding.

    Each group is a sorted list of code points,
    and the list of groups is sorted by first code point
    in the group.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each folded
      (lowercase) code point to its group, and groups is the sorted
      list of those same group lists.
    """

    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        # Only the "C" and "S" fold types are used; other lines are skipped.
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

    groups = list(togroup.values())
    for g in groups:
        g.sort()
    groups.sort()
    return togroup, groups


def Scripts(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping script names to code lists.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """

    scripts = {}

    def DoLine(codes, fields):
        """Process single Scripts.txt line, updating scripts."""
        (_, name) = fields
        scripts.setdefault(name, []).extend(codes)

    ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
    return scripts


def Categories(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping category names to code lists.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists; codes are listed under
      both the full category name (e.g. 'Lu') and its first letter ('L').
    """

    categories = {}

    def DoLine(codes, fields):
        """Process single UnicodeData.txt line, updating categories."""
        category = fields[2]
        categories.setdefault(category, []).extend(codes)
        # Add codes from Lu into L, etc.
        if len(category) > 1:
            short = category[0]
            categories.setdefault(short, []).extend(codes)

    ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
    return categories