xref: /aosp_15_r20/external/regex-re2/re2/make_unicode_groups.py (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
#!/usr/bin/python
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Generate C++ tables for Unicode Script and Category groups."""
7*ccdc9c3eSSadaf Ebrahimi
8*ccdc9c3eSSadaf Ebrahimiimport sys
9*ccdc9c3eSSadaf Ebrahimiimport unicode
10*ccdc9c3eSSadaf Ebrahimi
# Text printed verbatim at the top of the generated C++ file: the
# "generated, do not edit" banner, the include of the group declarations,
# and the opening of namespace re2.
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

"""

# Text printed verbatim at the bottom of the generated C++ file; it closes
# the namespace opened by _header.
_trailer = """

}  // namespace re2

"""
26*ccdc9c3eSSadaf Ebrahimi
# Running totals of the 16-bit and 32-bit ranges produced so far.
# PrintGroup adds each group's range counts to them; main reports the
# totals in a summary comment of the generated file.
n16 = 0
n32 = 0
29*ccdc9c3eSSadaf Ebrahimi
def MakeRanges(codes):
  """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

  Args:
    codes: iterable of integers.  A value extends the current range only
      when it is exactly one more than the value seen immediately before
      it, so the input is expected to be in ascending order.

  Returns:
    A list of [lo, hi] lists with inclusive bounds; empty for empty input.
  """
  ranges = []
  # None marks "no previous value yet".  A numeric sentinel would be
  # mistaken for a real predecessor by some first value and index into the
  # still-empty ranges list.
  last = None
  for c in codes:
    if last is not None and c == last + 1:
      ranges[-1][1] = c
    else:
      ranges.append([c, c])
    last = c
  return ranges
41*ccdc9c3eSSadaf Ebrahimi
def PrintRanges(type, name, ranges):
  """Print the ranges as a C++ array of type named name.

  Args:
    type: C++ element type name used in the array declaration.
    name: identifier of the emitted array.
    ranges: iterable of (lo, hi) integer pairs; one initializer is printed
      per pair.
  """
  # Every print() call gets exactly one pre-formatted string, so the output
  # is the same whether print is the Python 2 statement or the Python 3
  # function.
  print("static const %s %s[] = {" % (type, name))
  for lo, hi in ranges:
    print("\t{ %d, %d }," % (lo, hi))
  print("};")
48*ccdc9c3eSSadaf Ebrahimi
# def PrintCodes(type, name, codes):
#   """Print the codes as an array of type named name."""
#   print "static %s %s[] = {" % (type, name,)
#   for c in codes:
#     print "\t%d," % (c,)
#   print "};"
55*ccdc9c3eSSadaf Ebrahimi
def PrintGroup(name, codes):
  """Print the data structures for the group of codes.

  Prints a URange16 array and/or a URange32 array for the group (through
  PrintRanges) and adds their lengths to the module-level n16 / n32 totals.
  See unicode_groups.h for a description of the data structure.

  Args:
    name: group name; used as the quoted string in the literal and as the
      prefix of the emitted array identifiers.
    codes: integer code points of the group.  The collection is traversed
      twice, so it must be re-iterable (not a one-shot generator).

  Returns:
    A UGroup literal for the group, as a string.  A missing 16-bit or
    32-bit array is represented by the pair "0, 0".
  """
  global n16
  global n32

  # Split codes into 16-bit ranges and 32-bit ranges.
  range16 = MakeRanges([c for c in codes if c < 65536])
  range32 = MakeRanges([c for c in codes if c >= 65536])

  # NOTE: an earlier, disabled variant additionally pulled singleton ranges
  # out of range16 into a separate uint16_t <name>_code16 array; it is not
  # part of the emitted literal.

  n16 += len(range16)
  n32 += len(range32)

  # The literal starts with the quoted name and a fixed +1 field, followed
  # by an (array, length) pair for each range width.
  fields = ["\"%s\"" % (name,), "+1"]
  for ctype, suffix, ranges in (("URange16", "_range16", range16),
                                ("URange32", "_range32", range32)):
    if ranges:
      PrintRanges(ctype, name + suffix, ranges)
      fields.append("%s%s, %d" % (name, suffix, len(ranges)))
    else:
      fields.append("0, 0")
  return "{ " + ", ".join(fields) + " }"
93*ccdc9c3eSSadaf Ebrahimi
def main():
  """Write the complete generated C++ source to standard output.

  Prints, in order: the header, the range arrays of every Unicode category
  and then every script (through PrintGroup), a summary comment with the
  16-bit / 32-bit range totals, the sorted unicode_groups table,
  num_unicode_groups, and the trailer.
  """
  categories = unicode.Categories()
  scripts = unicode.Scripts()

  # Every print() call gets exactly one string, so the output is the same
  # whether print is the Python 2 statement or the Python 3 function.
  print(_header)
  ugroups = []
  # Visit the groups in sorted name order so that the generated file is
  # identical from run to run, independent of dict iteration order.
  for name in sorted(categories):
    ugroups.append(PrintGroup(name, categories[name]))
  for name in sorted(scripts):
    ugroups.append(PrintGroup(name, scripts[name]))
  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("const UGroup unicode_groups[] = {")
  # The table rows are sorted as strings, i.e. by the quoted group name
  # that starts each literal.
  ugroups.sort()
  for ug in ugroups:
    print("\t%s," % (ug,))
  print("};")
  print("const int num_unicode_groups = %d;" % (len(ugroups),))
  print(_trailer)
109*ccdc9c3eSSadaf Ebrahimi
# Generate the tables only when run as a script, not when imported.
if __name__ == '__main__':
  main()