1*ccdc9c3eSSadaf Ebrahimi#!/usr/bin/python 2*ccdc9c3eSSadaf Ebrahimi# coding=utf-8 3*ccdc9c3eSSadaf Ebrahimi# 4*ccdc9c3eSSadaf Ebrahimi# Copyright 2008 The RE2 Authors. All Rights Reserved. 5*ccdc9c3eSSadaf Ebrahimi# Use of this source code is governed by a BSD-style 6*ccdc9c3eSSadaf Ebrahimi# license that can be found in the LICENSE file. 7*ccdc9c3eSSadaf Ebrahimi 8*ccdc9c3eSSadaf Ebrahimi# See unicode_casefold.h for description of case folding tables. 9*ccdc9c3eSSadaf Ebrahimi 10*ccdc9c3eSSadaf Ebrahimi"""Generate C++ table for Unicode case folding.""" 11*ccdc9c3eSSadaf Ebrahimi 12*ccdc9c3eSSadaf Ebrahimiimport sys 13*ccdc9c3eSSadaf Ebrahimiimport unicode 14*ccdc9c3eSSadaf Ebrahimi 15*ccdc9c3eSSadaf Ebrahimi_header = """ 16*ccdc9c3eSSadaf Ebrahimi// GENERATED BY make_unicode_casefold.py; DO NOT EDIT. 17*ccdc9c3eSSadaf Ebrahimi// make_unicode_casefold.py >unicode_casefold.cc 18*ccdc9c3eSSadaf Ebrahimi 19*ccdc9c3eSSadaf Ebrahimi#include "re2/unicode_casefold.h" 20*ccdc9c3eSSadaf Ebrahimi 21*ccdc9c3eSSadaf Ebrahiminamespace re2 { 22*ccdc9c3eSSadaf Ebrahimi 23*ccdc9c3eSSadaf Ebrahimi""" 24*ccdc9c3eSSadaf Ebrahimi 25*ccdc9c3eSSadaf Ebrahimi_trailer = """ 26*ccdc9c3eSSadaf Ebrahimi 27*ccdc9c3eSSadaf Ebrahimi} // namespace re2 28*ccdc9c3eSSadaf Ebrahimi 29*ccdc9c3eSSadaf Ebrahimi""" 30*ccdc9c3eSSadaf Ebrahimi 31*ccdc9c3eSSadaf Ebrahimidef _Delta(a, b): 32*ccdc9c3eSSadaf Ebrahimi """Compute the delta for b - a. Even/odd and odd/even 33*ccdc9c3eSSadaf Ebrahimi are handled specially, as described above.""" 34*ccdc9c3eSSadaf Ebrahimi if a+1 == b: 35*ccdc9c3eSSadaf Ebrahimi if a%2 == 0: 36*ccdc9c3eSSadaf Ebrahimi return 'EvenOdd' 37*ccdc9c3eSSadaf Ebrahimi else: 38*ccdc9c3eSSadaf Ebrahimi return 'OddEven' 39*ccdc9c3eSSadaf Ebrahimi if a == b+1: 40*ccdc9c3eSSadaf Ebrahimi if a%2 == 0: 41*ccdc9c3eSSadaf Ebrahimi return 'OddEven' 42*ccdc9c3eSSadaf Ebrahimi else: 43*ccdc9c3eSSadaf Ebrahimi return 'EvenOdd' 44*ccdc9c3eSSadaf Ebrahimi return b - a 45*ccdc9c3eSSadaf Ebrahimi 46*ccdc9c3eSSadaf Ebrahimidef _AddDelta(a, delta): 47*ccdc9c3eSSadaf Ebrahimi """Return a + delta, handling EvenOdd and OddEven specially.""" 48*ccdc9c3eSSadaf Ebrahimi if type(delta) == int: 49*ccdc9c3eSSadaf Ebrahimi return a+delta 50*ccdc9c3eSSadaf Ebrahimi if delta == 'EvenOdd': 51*ccdc9c3eSSadaf Ebrahimi if a%2 == 0: 52*ccdc9c3eSSadaf Ebrahimi return a+1 53*ccdc9c3eSSadaf Ebrahimi else: 54*ccdc9c3eSSadaf Ebrahimi return a-1 55*ccdc9c3eSSadaf Ebrahimi if delta == 'OddEven': 56*ccdc9c3eSSadaf Ebrahimi if a%2 == 1: 57*ccdc9c3eSSadaf Ebrahimi return a+1 58*ccdc9c3eSSadaf Ebrahimi else: 59*ccdc9c3eSSadaf Ebrahimi return a-1 60*ccdc9c3eSSadaf Ebrahimi print >>sys.stderr, "Bad Delta: ", delta 61*ccdc9c3eSSadaf Ebrahimi raise "Bad Delta" 62*ccdc9c3eSSadaf Ebrahimi 63*ccdc9c3eSSadaf Ebrahimidef _MakeRanges(pairs): 64*ccdc9c3eSSadaf Ebrahimi """Turn a list like [(65,97), (66, 98), ..., (90,122)] 65*ccdc9c3eSSadaf Ebrahimi into [(65, 90, +32)].""" 66*ccdc9c3eSSadaf Ebrahimi ranges = [] 67*ccdc9c3eSSadaf Ebrahimi last = -100 68*ccdc9c3eSSadaf Ebrahimi 69*ccdc9c3eSSadaf Ebrahimi def evenodd(last, a, b, r): 70*ccdc9c3eSSadaf Ebrahimi if a != last+1 or b != _AddDelta(a, r[2]): 71*ccdc9c3eSSadaf Ebrahimi return False 72*ccdc9c3eSSadaf Ebrahimi r[1] = a 73*ccdc9c3eSSadaf Ebrahimi return True 74*ccdc9c3eSSadaf Ebrahimi 75*ccdc9c3eSSadaf Ebrahimi def evenoddpair(last, a, b, r): 76*ccdc9c3eSSadaf Ebrahimi if a != last+2: 77*ccdc9c3eSSadaf Ebrahimi return False 78*ccdc9c3eSSadaf Ebrahimi delta = r[2] 79*ccdc9c3eSSadaf Ebrahimi d = delta 80*ccdc9c3eSSadaf Ebrahimi if type(delta) is not str: 81*ccdc9c3eSSadaf Ebrahimi return False 82*ccdc9c3eSSadaf Ebrahimi if delta.endswith('Skip'): 83*ccdc9c3eSSadaf Ebrahimi d = delta[:-4] 84*ccdc9c3eSSadaf Ebrahimi else: 85*ccdc9c3eSSadaf Ebrahimi delta = d + 'Skip' 86*ccdc9c3eSSadaf Ebrahimi if b != _AddDelta(a, d): 87*ccdc9c3eSSadaf Ebrahimi return False 88*ccdc9c3eSSadaf Ebrahimi r[1] = a 89*ccdc9c3eSSadaf Ebrahimi r[2] = delta 90*ccdc9c3eSSadaf Ebrahimi return True 91*ccdc9c3eSSadaf Ebrahimi 92*ccdc9c3eSSadaf Ebrahimi for a, b in pairs: 93*ccdc9c3eSSadaf Ebrahimi if ranges and evenodd(last, a, b, ranges[-1]): 94*ccdc9c3eSSadaf Ebrahimi pass 95*ccdc9c3eSSadaf Ebrahimi elif ranges and evenoddpair(last, a, b, ranges[-1]): 96*ccdc9c3eSSadaf Ebrahimi pass 97*ccdc9c3eSSadaf Ebrahimi else: 98*ccdc9c3eSSadaf Ebrahimi ranges.append([a, a, _Delta(a, b)]) 99*ccdc9c3eSSadaf Ebrahimi last = a 100*ccdc9c3eSSadaf Ebrahimi return ranges 101*ccdc9c3eSSadaf Ebrahimi 102*ccdc9c3eSSadaf Ebrahimi# The maximum size of a case-folding group. 103*ccdc9c3eSSadaf Ebrahimi# Case folding is implemented in parse.cc by a recursive process 104*ccdc9c3eSSadaf Ebrahimi# with a recursion depth equal to the size of the largest 105*ccdc9c3eSSadaf Ebrahimi# case-folding group, so it is important that this bound be small. 106*ccdc9c3eSSadaf Ebrahimi# The current tables have no group bigger than 4. 107*ccdc9c3eSSadaf Ebrahimi# If there are ever groups bigger than 10 or so, it will be 108*ccdc9c3eSSadaf Ebrahimi# time to rework the code in parse.cc. 109*ccdc9c3eSSadaf EbrahimiMaxCasefoldGroup = 4 110*ccdc9c3eSSadaf Ebrahimi 111*ccdc9c3eSSadaf Ebrahimidef main(): 112*ccdc9c3eSSadaf Ebrahimi lowergroups, casegroups = unicode.CaseGroups() 113*ccdc9c3eSSadaf Ebrahimi foldpairs = [] 114*ccdc9c3eSSadaf Ebrahimi seen = {} 115*ccdc9c3eSSadaf Ebrahimi for c in casegroups: 116*ccdc9c3eSSadaf Ebrahimi if len(c) > MaxCasefoldGroup: 117*ccdc9c3eSSadaf Ebrahimi raise unicode.Error("casefold group too long: %s" % (c,)) 118*ccdc9c3eSSadaf Ebrahimi for i in range(len(c)): 119*ccdc9c3eSSadaf Ebrahimi if c[i-1] in seen: 120*ccdc9c3eSSadaf Ebrahimi raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i])) 121*ccdc9c3eSSadaf Ebrahimi seen[c[i-1]] = True 122*ccdc9c3eSSadaf Ebrahimi foldpairs.append([c[i-1], c[i]]) 123*ccdc9c3eSSadaf Ebrahimi 124*ccdc9c3eSSadaf Ebrahimi lowerpairs = [] 125*ccdc9c3eSSadaf Ebrahimi for lower, group in lowergroups.iteritems(): 126*ccdc9c3eSSadaf Ebrahimi for g in group: 127*ccdc9c3eSSadaf Ebrahimi if g != lower: 128*ccdc9c3eSSadaf Ebrahimi lowerpairs.append([g, lower]) 129*ccdc9c3eSSadaf Ebrahimi 130*ccdc9c3eSSadaf Ebrahimi def printpairs(name, foldpairs): 131*ccdc9c3eSSadaf Ebrahimi foldpairs.sort() 132*ccdc9c3eSSadaf Ebrahimi foldranges = _MakeRanges(foldpairs) 133*ccdc9c3eSSadaf Ebrahimi print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)) 134*ccdc9c3eSSadaf Ebrahimi print "const CaseFold unicode_%s[] = {" % (name,) 135*ccdc9c3eSSadaf Ebrahimi for lo, hi, delta in foldranges: 136*ccdc9c3eSSadaf Ebrahimi print "\t{ %d, %d, %s }," % (lo, hi, delta) 137*ccdc9c3eSSadaf Ebrahimi print "};" 138*ccdc9c3eSSadaf Ebrahimi print "const int num_unicode_%s = %d;" % (name, len(foldranges),) 139*ccdc9c3eSSadaf Ebrahimi print "" 140*ccdc9c3eSSadaf Ebrahimi 141*ccdc9c3eSSadaf Ebrahimi print _header 142*ccdc9c3eSSadaf Ebrahimi printpairs("casefold", foldpairs) 143*ccdc9c3eSSadaf Ebrahimi printpairs("tolower", lowerpairs) 144*ccdc9c3eSSadaf Ebrahimi print _trailer 145*ccdc9c3eSSadaf Ebrahimi 146*ccdc9c3eSSadaf Ebrahimiif __name__ == '__main__': 147*ccdc9c3eSSadaf Ebrahimi main() 148