xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/unicode.py (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1# Copyright 2008 The RE2 Authors.  All Rights Reserved.
2# Use of this source code is governed by a BSD-style
3# license that can be found in the LICENSE file.
4
5"""Parser for Unicode data files (as distributed by unicode.org)."""
6
7from __future__ import absolute_import
8from __future__ import division
9from __future__ import print_function
10
11import os
12import re
13import urllib.request
14
# Directory or URL where Unicode tables reside.  Used as the default
# unicode_dir argument of CaseGroups, Scripts and Categories.
_UNICODE_DIR = "https://www.unicode.org/Public/15.1.0/ucd"

# Largest valid Unicode code value; _UInt rejects anything above it.
_RUNE_MAX = 0x10FFFF
20
21
class Error(Exception):
  """Base class for the exceptions defined by this module."""
24
25
class InputError(Error):
  """Raised when Unicode input data or an argument is invalid."""
28
29
def _UInt(s):
  """Converts string to Unicode code point ('263A' => 0x263a).

  The string must consist of 4 to 6 hexadecimal digits and nothing
  else, and its value must not exceed _RUNE_MAX.

  Args:
    s: string to convert

  Returns:
    Unicode code point

  Raises:
    InputError: the string is not a valid Unicode value.
  """

  # int(s, 16) on its own is too permissive: it also accepts a sign,
  # a "0x" prefix, "_" digit separators and surrounding white space
  # ("0x1F", "+0FF", "1_00", " 1F ").  Insist on plain hex digits first.
  if re.fullmatch(r"[0-9A-Fa-f]{4,6}", s) is None:
    raise InputError("invalid Unicode value %s" % (s,))
  v = int(s, 16)
  if v > _RUNE_MAX:
    raise InputError("invalid Unicode value %s" % (s,))
  return v
50
51
def _URange(s):
  """Converts a code point or 'lo..hi' span into a sequence of codes.

    '0001..0003' => the codes 1, 2, 3 (as a range).
    '0001' => [1].

  Args:
    s: string to convert

  Returns:
    A one-element list for a single code point, or a range covering
    lo through hi inclusive for a 'lo..hi' span.

  Raises:
    InputError: the string is not a valid Unicode range (this includes
      a span whose low end is not strictly below its high end).
  """
  bounds = s.split("..")
  count = len(bounds)

  # A lone value is a range of one.
  if count == 1:
    return [_UInt(bounds[0])]

  # Anything other than exactly two endpoints is malformed.
  if count != 2:
    raise InputError("invalid Unicode range %s" % (s,))

  start = _UInt(bounds[0])
  stop = _UInt(bounds[1])
  if start >= stop:
    raise InputError("invalid Unicode range %s" % (s,))
  return range(start, stop + 1)
76
77
78def _ParseContinue(s):
79  """Parses a Unicode continuation field.
80
81  These are of the form '<Name, First>' or '<Name, Last>'.
82  Instead of giving an explicit range in a single table entry,
83  some Unicode tables use two entries, one for the first
84  code value in the range and one for the last.
85  The first entry's description is '<Name, First>' instead of 'Name'
86  and the second is '<Name, Last>'.
87
88    '<Name, First>' => ('Name', 'First')
89    '<Name, Last>' => ('Name', 'Last')
90    'Anything else' => ('Anything else', None)
91
92  Args:
93    s: continuation field string
94
95  Returns:
96    pair: name and ('First', 'Last', or None)
97  """
98
99  match = re.match("<(.*), (First|Last)>", s)
100  if match is not None:
101    return match.groups()
102  return (s, None)
103
104
def ReadUnicodeTable(filename, nfields, doline):
  """Generic Unicode table text file reader.

  The reader takes care of stripping out comments and also
  parsing the two different ways that the Unicode tables specify
  code ranges (using the .. notation and splitting the range across
  multiple lines).

  Each non-comment line in the table is expected to have the given
  number of fields.  The first field is known to be the Unicode value
  and the second field its description.

  The reader calls doline(codes, fields) for each entry in the table.
  For a range split across a First/Last pair of lines, doline is called
  once, with fields[0] rewritten to 'XXXX..YYYY' and fields[1] set to
  the bare name.  If parsing a line or doline raises an exception, the
  reader prints that exception, prefixed with the file name and line
  number, and re-raises it at once; the rest of the file is not read.

  Args:
    filename: the Unicode data file to read (a local path, or a URL
      starting with "https://"), or a file-like object yielding lines
      as bytes or str.  A stream opened by this function is closed
      before it returns; a caller-supplied object is left open.
    nfields: the number of expected fields per line in that file.
    doline: the function to call for each table entry.

  Raises:
    InputError: nfields is invalid (must be >= 2), a line is malformed,
      or the file ends in the middle of a First/Last range.
  """

  if nfields < 2:
    raise InputError("invalid number of fields %d" % (nfields,))

  # Only a stream opened here is ours to close.
  owns_file = isinstance(filename, str)
  if not owns_file:
    fil = filename
  elif filename.startswith("https://"):
    fil = urllib.request.urlopen(filename)
  else:
    fil = open(filename, "rb")

  try:
    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    for line in fil:
      lineno += 1
      try:
        # Binary streams yield bytes; a text-mode file-like object
        # already yields str, which has nothing to decode.
        if isinstance(line, bytes):
          line = line.decode('latin1')

        # Chop # comments and white space; ignore empty lines.
        sharp = line.find("#")
        if sharp >= 0:
          line = line[:sharp]
        line = line.strip()
        if not line:
          continue

        # Split fields on ";", chop more white space.
        # Must have the expected number of fields.
        fields = [s.strip() for s in line.split(";")]
        if len(fields) != nfields:
          raise InputError("wrong number of fields %d %d - %s" %
                           (len(fields), nfields, line))

        # The Unicode text files have two different ways
        # to list a Unicode range.  Either the first field is
        # itself a range (0000..FFFF), or the range is split
        # across two lines, with the second field noting
        # the continuation.
        codes = _URange(fields[0])
        (name, cont) = _ParseContinue(fields[1])

        if expect_last is not None:
          # If the last line gave the First code in a range,
          # this one had better give the Last one.
          if (len(codes) != 1 or codes[0] <= first or
              cont != "Last" or name != expect_last):
            raise InputError("expected Last line for %s" %
                             (expect_last,))
          codes = range(first, codes[0] + 1)
          first = None
          expect_last = None
          fields[0] = "%04X..%04X" % (codes[0], codes[-1])
          fields[1] = name
        elif cont == "First":
          # Otherwise, if this is the First code in a range,
          # remember it and go to the next line.
          if len(codes) != 1:
            raise InputError("bad First line: range given")
          expect_last = name
          first = codes[0]
          continue

        doline(codes, fields)

      except Exception as e:
        print("%s:%d: %s" % (filename, lineno, e))
        raise

    if expect_last is not None:
      raise InputError("expected Last line for %s; got EOF" %
                       (expect_last,))
  finally:
    # Release the file or HTTP response even when a line fails.
    if owns_file:
      fil.close()
204
205
def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Builds the groups of code points equivalent under case folding.

  Reads CaseFolding.txt from unicode_dir and uses only the entries
  whose status field is "C" or "S".  Each group is a sorted list of
  code points, and the list of groups is sorted by first code point
  in the group.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    pair (togroup, groups): togroup is a dict mapping each fold target
    code point to its group, and groups is the sorted list of those
    same group lists.
  """

  # Maps a fold target code point to its fold-equivalent group.
  togroup = {}

  def DoLine(codes, fields):
    """Handles one CaseFolding.txt entry, adding its codes to togroup."""
    _, status, target, _ = fields
    if status != "C" and status != "S":
      return
    target = _UInt(target)
    if target not in togroup:
      # A new group always starts with the fold target itself.
      togroup[target] = [target]
    togroup[target].extend(codes)

  ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

  # Sort each group in place so the lists held by togroup are sorted too.
  for members in togroup.values():
    members.sort()
  return togroup, sorted(togroup.values())
238
239
def Scripts(unicode_dir=_UNICODE_DIR):
  """Collects the code points of every script listed in Scripts.txt.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping script names to code lists
  """

  scripts = {}

  def DoLine(codes, fields):
    """Handles one Scripts.txt entry, appending its codes to the script."""
    _, script = fields
    if script not in scripts:
      scripts[script] = []
    scripts[script].extend(codes)

  ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
  return scripts
259
260
def Categories(unicode_dir=_UNICODE_DIR):
  """Collects the code points of every category in UnicodeData.txt.

  Codes are recorded under the category name found in the third field
  and, when that name is longer than one character, also under its
  first character (so codes from Lu are added to L as well).

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping category names to code lists
  """

  categories = {}

  def DoLine(codes, fields):
    """Handles one UnicodeData.txt entry, updating categories."""
    full = fields[2]
    # Add codes from Lu into L, etc.
    if len(full) > 1:
      keys = [full, full[0]]
    else:
      keys = [full]
    for key in keys:
      categories.setdefault(key, []).extend(codes)

  ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
  return categories
284
285