xref: /aosp_15_r20/external/regex-re2/re2/make_unicode_casefold.py (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi#!/usr/bin/python
2*ccdc9c3eSSadaf Ebrahimi# coding=utf-8
3*ccdc9c3eSSadaf Ebrahimi#
4*ccdc9c3eSSadaf Ebrahimi# Copyright 2008 The RE2 Authors.  All Rights Reserved.
5*ccdc9c3eSSadaf Ebrahimi# Use of this source code is governed by a BSD-style
6*ccdc9c3eSSadaf Ebrahimi# license that can be found in the LICENSE file.
7*ccdc9c3eSSadaf Ebrahimi
8*ccdc9c3eSSadaf Ebrahimi# See unicode_casefold.h for description of case folding tables.
9*ccdc9c3eSSadaf Ebrahimi
10*ccdc9c3eSSadaf Ebrahimi"""Generate C++ table for Unicode case folding."""
11*ccdc9c3eSSadaf Ebrahimi
12*ccdc9c3eSSadaf Ebrahimiimport sys
13*ccdc9c3eSSadaf Ebrahimiimport unicode
14*ccdc9c3eSSadaf Ebrahimi
15*ccdc9c3eSSadaf Ebrahimi_header = """
16*ccdc9c3eSSadaf Ebrahimi// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
17*ccdc9c3eSSadaf Ebrahimi// make_unicode_casefold.py >unicode_casefold.cc
18*ccdc9c3eSSadaf Ebrahimi
19*ccdc9c3eSSadaf Ebrahimi#include "re2/unicode_casefold.h"
20*ccdc9c3eSSadaf Ebrahimi
21*ccdc9c3eSSadaf Ebrahiminamespace re2 {
22*ccdc9c3eSSadaf Ebrahimi
23*ccdc9c3eSSadaf Ebrahimi"""
24*ccdc9c3eSSadaf Ebrahimi
25*ccdc9c3eSSadaf Ebrahimi_trailer = """
26*ccdc9c3eSSadaf Ebrahimi
27*ccdc9c3eSSadaf Ebrahimi} // namespace re2
28*ccdc9c3eSSadaf Ebrahimi
29*ccdc9c3eSSadaf Ebrahimi"""
30*ccdc9c3eSSadaf Ebrahimi
31*ccdc9c3eSSadaf Ebrahimidef _Delta(a, b):
32*ccdc9c3eSSadaf Ebrahimi  """Compute the delta for b - a.  Even/odd and odd/even
33*ccdc9c3eSSadaf Ebrahimi     are handled specially, as described above."""
34*ccdc9c3eSSadaf Ebrahimi  if a+1 == b:
35*ccdc9c3eSSadaf Ebrahimi    if a%2 == 0:
36*ccdc9c3eSSadaf Ebrahimi      return 'EvenOdd'
37*ccdc9c3eSSadaf Ebrahimi    else:
38*ccdc9c3eSSadaf Ebrahimi      return 'OddEven'
39*ccdc9c3eSSadaf Ebrahimi  if a == b+1:
40*ccdc9c3eSSadaf Ebrahimi    if a%2 == 0:
41*ccdc9c3eSSadaf Ebrahimi      return 'OddEven'
42*ccdc9c3eSSadaf Ebrahimi    else:
43*ccdc9c3eSSadaf Ebrahimi      return 'EvenOdd'
44*ccdc9c3eSSadaf Ebrahimi  return b - a
45*ccdc9c3eSSadaf Ebrahimi
46*ccdc9c3eSSadaf Ebrahimidef _AddDelta(a, delta):
47*ccdc9c3eSSadaf Ebrahimi  """Return a + delta, handling EvenOdd and OddEven specially."""
48*ccdc9c3eSSadaf Ebrahimi  if type(delta) == int:
49*ccdc9c3eSSadaf Ebrahimi    return a+delta
50*ccdc9c3eSSadaf Ebrahimi  if delta == 'EvenOdd':
51*ccdc9c3eSSadaf Ebrahimi    if a%2 == 0:
52*ccdc9c3eSSadaf Ebrahimi      return a+1
53*ccdc9c3eSSadaf Ebrahimi    else:
54*ccdc9c3eSSadaf Ebrahimi      return a-1
55*ccdc9c3eSSadaf Ebrahimi  if delta == 'OddEven':
56*ccdc9c3eSSadaf Ebrahimi    if a%2 == 1:
57*ccdc9c3eSSadaf Ebrahimi      return a+1
58*ccdc9c3eSSadaf Ebrahimi    else:
59*ccdc9c3eSSadaf Ebrahimi      return a-1
60*ccdc9c3eSSadaf Ebrahimi  print >>sys.stderr, "Bad Delta: ", delta
61*ccdc9c3eSSadaf Ebrahimi  raise "Bad Delta"
62*ccdc9c3eSSadaf Ebrahimi
63*ccdc9c3eSSadaf Ebrahimidef _MakeRanges(pairs):
64*ccdc9c3eSSadaf Ebrahimi  """Turn a list like [(65,97), (66, 98), ..., (90,122)]
65*ccdc9c3eSSadaf Ebrahimi     into [(65, 90, +32)]."""
66*ccdc9c3eSSadaf Ebrahimi  ranges = []
67*ccdc9c3eSSadaf Ebrahimi  last = -100
68*ccdc9c3eSSadaf Ebrahimi
69*ccdc9c3eSSadaf Ebrahimi  def evenodd(last, a, b, r):
70*ccdc9c3eSSadaf Ebrahimi    if a != last+1 or b != _AddDelta(a, r[2]):
71*ccdc9c3eSSadaf Ebrahimi      return False
72*ccdc9c3eSSadaf Ebrahimi    r[1] = a
73*ccdc9c3eSSadaf Ebrahimi    return True
74*ccdc9c3eSSadaf Ebrahimi
75*ccdc9c3eSSadaf Ebrahimi  def evenoddpair(last, a, b, r):
76*ccdc9c3eSSadaf Ebrahimi    if a != last+2:
77*ccdc9c3eSSadaf Ebrahimi      return False
78*ccdc9c3eSSadaf Ebrahimi    delta = r[2]
79*ccdc9c3eSSadaf Ebrahimi    d = delta
80*ccdc9c3eSSadaf Ebrahimi    if type(delta) is not str:
81*ccdc9c3eSSadaf Ebrahimi      return False
82*ccdc9c3eSSadaf Ebrahimi    if delta.endswith('Skip'):
83*ccdc9c3eSSadaf Ebrahimi      d = delta[:-4]
84*ccdc9c3eSSadaf Ebrahimi    else:
85*ccdc9c3eSSadaf Ebrahimi      delta = d + 'Skip'
86*ccdc9c3eSSadaf Ebrahimi    if b != _AddDelta(a, d):
87*ccdc9c3eSSadaf Ebrahimi      return False
88*ccdc9c3eSSadaf Ebrahimi    r[1] = a
89*ccdc9c3eSSadaf Ebrahimi    r[2] = delta
90*ccdc9c3eSSadaf Ebrahimi    return True
91*ccdc9c3eSSadaf Ebrahimi
92*ccdc9c3eSSadaf Ebrahimi  for a, b in pairs:
93*ccdc9c3eSSadaf Ebrahimi    if ranges and evenodd(last, a, b, ranges[-1]):
94*ccdc9c3eSSadaf Ebrahimi      pass
95*ccdc9c3eSSadaf Ebrahimi    elif ranges and evenoddpair(last, a, b, ranges[-1]):
96*ccdc9c3eSSadaf Ebrahimi      pass
97*ccdc9c3eSSadaf Ebrahimi    else:
98*ccdc9c3eSSadaf Ebrahimi      ranges.append([a, a, _Delta(a, b)])
99*ccdc9c3eSSadaf Ebrahimi    last = a
100*ccdc9c3eSSadaf Ebrahimi  return ranges
101*ccdc9c3eSSadaf Ebrahimi
102*ccdc9c3eSSadaf Ebrahimi# The maximum size of a case-folding group.
103*ccdc9c3eSSadaf Ebrahimi# Case folding is implemented in parse.cc by a recursive process
104*ccdc9c3eSSadaf Ebrahimi# with a recursion depth equal to the size of the largest
105*ccdc9c3eSSadaf Ebrahimi# case-folding group, so it is important that this bound be small.
106*ccdc9c3eSSadaf Ebrahimi# The current tables have no group bigger than 4.
107*ccdc9c3eSSadaf Ebrahimi# If there are ever groups bigger than 10 or so, it will be
108*ccdc9c3eSSadaf Ebrahimi# time to rework the code in parse.cc.
109*ccdc9c3eSSadaf EbrahimiMaxCasefoldGroup = 4
110*ccdc9c3eSSadaf Ebrahimi
111*ccdc9c3eSSadaf Ebrahimidef main():
112*ccdc9c3eSSadaf Ebrahimi  lowergroups, casegroups = unicode.CaseGroups()
113*ccdc9c3eSSadaf Ebrahimi  foldpairs = []
114*ccdc9c3eSSadaf Ebrahimi  seen = {}
115*ccdc9c3eSSadaf Ebrahimi  for c in casegroups:
116*ccdc9c3eSSadaf Ebrahimi    if len(c) > MaxCasefoldGroup:
117*ccdc9c3eSSadaf Ebrahimi      raise unicode.Error("casefold group too long: %s" % (c,))
118*ccdc9c3eSSadaf Ebrahimi    for i in range(len(c)):
119*ccdc9c3eSSadaf Ebrahimi      if c[i-1] in seen:
120*ccdc9c3eSSadaf Ebrahimi        raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
121*ccdc9c3eSSadaf Ebrahimi      seen[c[i-1]] = True
122*ccdc9c3eSSadaf Ebrahimi      foldpairs.append([c[i-1], c[i]])
123*ccdc9c3eSSadaf Ebrahimi
124*ccdc9c3eSSadaf Ebrahimi  lowerpairs = []
125*ccdc9c3eSSadaf Ebrahimi  for lower, group in lowergroups.iteritems():
126*ccdc9c3eSSadaf Ebrahimi    for g in group:
127*ccdc9c3eSSadaf Ebrahimi      if g != lower:
128*ccdc9c3eSSadaf Ebrahimi        lowerpairs.append([g, lower])
129*ccdc9c3eSSadaf Ebrahimi
130*ccdc9c3eSSadaf Ebrahimi  def printpairs(name, foldpairs):
131*ccdc9c3eSSadaf Ebrahimi    foldpairs.sort()
132*ccdc9c3eSSadaf Ebrahimi    foldranges = _MakeRanges(foldpairs)
133*ccdc9c3eSSadaf Ebrahimi    print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
134*ccdc9c3eSSadaf Ebrahimi    print "const CaseFold unicode_%s[] = {" % (name,)
135*ccdc9c3eSSadaf Ebrahimi    for lo, hi, delta in foldranges:
136*ccdc9c3eSSadaf Ebrahimi      print "\t{ %d, %d, %s }," % (lo, hi, delta)
137*ccdc9c3eSSadaf Ebrahimi    print "};"
138*ccdc9c3eSSadaf Ebrahimi    print "const int num_unicode_%s = %d;" % (name, len(foldranges),)
139*ccdc9c3eSSadaf Ebrahimi    print ""
140*ccdc9c3eSSadaf Ebrahimi
141*ccdc9c3eSSadaf Ebrahimi  print _header
142*ccdc9c3eSSadaf Ebrahimi  printpairs("casefold", foldpairs)
143*ccdc9c3eSSadaf Ebrahimi  printpairs("tolower", lowerpairs)
144*ccdc9c3eSSadaf Ebrahimi  print _trailer
145*ccdc9c3eSSadaf Ebrahimi
146*ccdc9c3eSSadaf Ebrahimiif __name__ == '__main__':
147*ccdc9c3eSSadaf Ebrahimi  main()
148