1*ccdc9c3eSSadaf Ebrahimi#!/usr/bin/python 2*ccdc9c3eSSadaf Ebrahimi# Copyright 2008 The RE2 Authors. All Rights Reserved. 3*ccdc9c3eSSadaf Ebrahimi# Use of this source code is governed by a BSD-style 4*ccdc9c3eSSadaf Ebrahimi# license that can be found in the LICENSE file. 5*ccdc9c3eSSadaf Ebrahimi 6*ccdc9c3eSSadaf Ebrahimi"""Generate C++ tables for Unicode Script and Category groups.""" 7*ccdc9c3eSSadaf Ebrahimi 8*ccdc9c3eSSadaf Ebrahimiimport sys 9*ccdc9c3eSSadaf Ebrahimiimport unicode 10*ccdc9c3eSSadaf Ebrahimi 11*ccdc9c3eSSadaf Ebrahimi_header = """ 12*ccdc9c3eSSadaf Ebrahimi// GENERATED BY make_unicode_groups.py; DO NOT EDIT. 13*ccdc9c3eSSadaf Ebrahimi// make_unicode_groups.py >unicode_groups.cc 14*ccdc9c3eSSadaf Ebrahimi 15*ccdc9c3eSSadaf Ebrahimi#include "re2/unicode_groups.h" 16*ccdc9c3eSSadaf Ebrahimi 17*ccdc9c3eSSadaf Ebrahiminamespace re2 { 18*ccdc9c3eSSadaf Ebrahimi 19*ccdc9c3eSSadaf Ebrahimi""" 20*ccdc9c3eSSadaf Ebrahimi 21*ccdc9c3eSSadaf Ebrahimi_trailer = """ 22*ccdc9c3eSSadaf Ebrahimi 23*ccdc9c3eSSadaf Ebrahimi} // namespace re2 24*ccdc9c3eSSadaf Ebrahimi 25*ccdc9c3eSSadaf Ebrahimi""" 26*ccdc9c3eSSadaf Ebrahimi 27*ccdc9c3eSSadaf Ebrahimin16 = 0 28*ccdc9c3eSSadaf Ebrahimin32 = 0 29*ccdc9c3eSSadaf Ebrahimi 30*ccdc9c3eSSadaf Ebrahimidef MakeRanges(codes): 31*ccdc9c3eSSadaf Ebrahimi """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]""" 32*ccdc9c3eSSadaf Ebrahimi ranges = [] 33*ccdc9c3eSSadaf Ebrahimi last = -100 34*ccdc9c3eSSadaf Ebrahimi for c in codes: 35*ccdc9c3eSSadaf Ebrahimi if c == last+1: 36*ccdc9c3eSSadaf Ebrahimi ranges[-1][1] = c 37*ccdc9c3eSSadaf Ebrahimi else: 38*ccdc9c3eSSadaf Ebrahimi ranges.append([c, c]) 39*ccdc9c3eSSadaf Ebrahimi last = c 40*ccdc9c3eSSadaf Ebrahimi return ranges 41*ccdc9c3eSSadaf Ebrahimi 42*ccdc9c3eSSadaf Ebrahimidef PrintRanges(type, name, ranges): 43*ccdc9c3eSSadaf Ebrahimi """Print the ranges as an array of type named name.""" 44*ccdc9c3eSSadaf Ebrahimi print "static const %s %s[] = {" % (type, name,) 45*ccdc9c3eSSadaf Ebrahimi for lo, hi in ranges: 46*ccdc9c3eSSadaf Ebrahimi print "\t{ %d, %d }," % (lo, hi) 47*ccdc9c3eSSadaf Ebrahimi print "};" 48*ccdc9c3eSSadaf Ebrahimi 49*ccdc9c3eSSadaf Ebrahimi# def PrintCodes(type, name, codes): 50*ccdc9c3eSSadaf Ebrahimi# """Print the codes as an array of type named name.""" 51*ccdc9c3eSSadaf Ebrahimi# print "static %s %s[] = {" % (type, name,) 52*ccdc9c3eSSadaf Ebrahimi# for c in codes: 53*ccdc9c3eSSadaf Ebrahimi# print "\t%d," % (c,) 54*ccdc9c3eSSadaf Ebrahimi# print "};" 55*ccdc9c3eSSadaf Ebrahimi 56*ccdc9c3eSSadaf Ebrahimidef PrintGroup(name, codes): 57*ccdc9c3eSSadaf Ebrahimi """Print the data structures for the group of codes. 58*ccdc9c3eSSadaf Ebrahimi Return a UGroup literal for the group.""" 59*ccdc9c3eSSadaf Ebrahimi 60*ccdc9c3eSSadaf Ebrahimi # See unicode_groups.h for a description of the data structure. 61*ccdc9c3eSSadaf Ebrahimi 62*ccdc9c3eSSadaf Ebrahimi # Split codes into 16-bit ranges and 32-bit ranges. 63*ccdc9c3eSSadaf Ebrahimi range16 = MakeRanges([c for c in codes if c < 65536]) 64*ccdc9c3eSSadaf Ebrahimi range32 = MakeRanges([c for c in codes if c >= 65536]) 65*ccdc9c3eSSadaf Ebrahimi 66*ccdc9c3eSSadaf Ebrahimi # Pull singleton ranges out of range16. 67*ccdc9c3eSSadaf Ebrahimi # code16 = [lo for lo, hi in range16 if lo == hi] 68*ccdc9c3eSSadaf Ebrahimi # range16 = [[lo, hi] for lo, hi in range16 if lo != hi] 69*ccdc9c3eSSadaf Ebrahimi 70*ccdc9c3eSSadaf Ebrahimi global n16 71*ccdc9c3eSSadaf Ebrahimi global n32 72*ccdc9c3eSSadaf Ebrahimi n16 += len(range16) 73*ccdc9c3eSSadaf Ebrahimi n32 += len(range32) 74*ccdc9c3eSSadaf Ebrahimi 75*ccdc9c3eSSadaf Ebrahimi ugroup = "{ \"%s\", +1" % (name,) 76*ccdc9c3eSSadaf Ebrahimi # if len(code16) > 0: 77*ccdc9c3eSSadaf Ebrahimi # PrintCodes("uint16_t", name+"_code16", code16) 78*ccdc9c3eSSadaf Ebrahimi # ugroup += ", %s_code16, %d" % (name, len(code16)) 79*ccdc9c3eSSadaf Ebrahimi # else: 80*ccdc9c3eSSadaf Ebrahimi # ugroup += ", 0, 0" 81*ccdc9c3eSSadaf Ebrahimi if len(range16) > 0: 82*ccdc9c3eSSadaf Ebrahimi PrintRanges("URange16", name+"_range16", range16) 83*ccdc9c3eSSadaf Ebrahimi ugroup += ", %s_range16, %d" % (name, len(range16)) 84*ccdc9c3eSSadaf Ebrahimi else: 85*ccdc9c3eSSadaf Ebrahimi ugroup += ", 0, 0" 86*ccdc9c3eSSadaf Ebrahimi if len(range32) > 0: 87*ccdc9c3eSSadaf Ebrahimi PrintRanges("URange32", name+"_range32", range32) 88*ccdc9c3eSSadaf Ebrahimi ugroup += ", %s_range32, %d" % (name, len(range32)) 89*ccdc9c3eSSadaf Ebrahimi else: 90*ccdc9c3eSSadaf Ebrahimi ugroup += ", 0, 0" 91*ccdc9c3eSSadaf Ebrahimi ugroup += " }" 92*ccdc9c3eSSadaf Ebrahimi return ugroup 93*ccdc9c3eSSadaf Ebrahimi 94*ccdc9c3eSSadaf Ebrahimidef main(): 95*ccdc9c3eSSadaf Ebrahimi print _header 96*ccdc9c3eSSadaf Ebrahimi ugroups = [] 97*ccdc9c3eSSadaf Ebrahimi for name, codes in unicode.Categories().iteritems(): 98*ccdc9c3eSSadaf Ebrahimi ugroups.append(PrintGroup(name, codes)) 99*ccdc9c3eSSadaf Ebrahimi for name, codes in unicode.Scripts().iteritems(): 100*ccdc9c3eSSadaf Ebrahimi ugroups.append(PrintGroup(name, codes)) 101*ccdc9c3eSSadaf Ebrahimi print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32) 102*ccdc9c3eSSadaf Ebrahimi print "const UGroup unicode_groups[] = {"; 103*ccdc9c3eSSadaf Ebrahimi ugroups.sort() 104*ccdc9c3eSSadaf Ebrahimi for ug in ugroups: 105*ccdc9c3eSSadaf Ebrahimi print "\t%s," % (ug,) 106*ccdc9c3eSSadaf Ebrahimi print "};" 107*ccdc9c3eSSadaf Ebrahimi print "const int num_unicode_groups = %d;" % (len(ugroups),) 108*ccdc9c3eSSadaf Ebrahimi print _trailer 109*ccdc9c3eSSadaf Ebrahimi 110*ccdc9c3eSSadaf Ebrahimiif __name__ == '__main__': 111*ccdc9c3eSSadaf Ebrahimi main() 112