xref: /aosp_15_r20/external/cronet/third_party/re2/src/re2/unicode_casefold.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_UNICODE_CASEFOLD_H_
6 #define RE2_UNICODE_CASEFOLD_H_
7 
8 // Unicode case folding tables.
9 
10 // The Unicode case folding tables encode the mapping from one Unicode point
11 // to the next largest Unicode point with equivalent folding.  The largest
12 // point wraps back to the first.  For example, the tables map:
13 //
14 //     'A' -> 'a'
15 //     'a' -> 'A'
16 //
17 //     'K' -> 'k'
18 //     'k' -> 'K'  (Kelvin symbol)
19 //     'K' -> 'K'
20 //
21 // Like everything Unicode, these tables are big.  If we represent the table
22 // as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
23 // Most table entries look like the ones around them:
24 // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
25 // Instead of listing all the pairs explicitly, we make a list of ranges
26 // and deltas, so that the table entries for 'A' through 'Z' can be represented
27 // as a single entry { 'A', 'Z', +32 }.
28 //
29 // In addition to blocks that map to each other (A-Z mapping to a-z)
30 // there are blocks of pairs that individually map to each other
31 // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
32 // For those, the special delta value EvenOdd marks even/odd pairs
33 // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
34 //
35 // In this form, the table has 274 entries, about 3kB.  If we were to split
36 // the table into one for 16-bit codes and an overflow table for larger ones,
37 // we could get it down to about 1.5kB, but that's not worth the complexity.
38 //
39 // The grouped form also allows for efficient fold range calculations
40 // rather than looping one character at a time.
41 
42 #include <stdint.h>
43 
44 #include "util/utf.h"
45 
46 namespace re2 {
47 
48 enum {
49   EvenOdd = 1,
50   OddEven = -1,
51   EvenOddSkip = 1<<30,
52   OddEvenSkip,
53 };
54 
55 struct CaseFold {
56   Rune lo;
57   Rune hi;
58   int32_t delta;
59 };
60 
61 extern const CaseFold unicode_casefold[];
62 extern const int num_unicode_casefold;
63 
64 extern const CaseFold unicode_tolower[];
65 extern const int num_unicode_tolower;
66 
67 // Returns the CaseFold* in the tables that contains rune.
68 // If rune is not in the tables, returns the first CaseFold* after rune.
69 // If rune is larger than any value in the tables, returns NULL.
70 extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
71 
72 // Returns the result of applying the fold f to the rune r.
73 extern Rune ApplyFold(const CaseFold *f, Rune r);
74 
75 }  // namespace re2
76 
77 #endif  // RE2_UNICODE_CASEFOLD_H_
78