1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_UNICODE_CASEFOLD_H_ 6 #define RE2_UNICODE_CASEFOLD_H_ 7 8 // Unicode case folding tables. 9 10 // The Unicode case folding tables encode the mapping from one Unicode point 11 // to the next largest Unicode point with equivalent folding. The largest 12 // point wraps back to the first. For example, the tables map: 13 // 14 // 'A' -> 'a' 15 // 'a' -> 'A' 16 // 17 // 'K' -> 'k' 18 // 'k' -> 'K' (Kelvin symbol) 19 // 'K' -> 'K' 20 // 21 // Like everything Unicode, these tables are big. If we represent the table 22 // as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB. 23 // Most table entries look like the ones around them: 24 // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. 25 // Instead of listing all the pairs explicitly, we make a list of ranges 26 // and deltas, so that the table entries for 'A' through 'Z' can be represented 27 // as a single entry { 'A', 'Z', +32 }. 28 // 29 // In addition to blocks that map to each other (A-Z mapping to a-z) 30 // there are blocks of pairs that individually map to each other 31 // (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...). 32 // For those, the special delta value EvenOdd marks even/odd pairs 33 // (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs. 34 // 35 // In this form, the table has 274 entries, about 3kB. If we were to split 36 // the table into one for 16-bit codes and an overflow table for larger ones, 37 // we could get it down to about 1.5kB, but that's not worth the complexity. 38 // 39 // The grouped form also allows for efficient fold range calculations 40 // rather than looping one character at a time. 41 42 #include <stdint.h> 43 44 #include "util/utf.h" 45 46 namespace re2 { 47 48 enum { 49 EvenOdd = 1, 50 OddEven = -1, 51 EvenOddSkip = 1<<30, 52 OddEvenSkip, 53 }; 54 55 struct CaseFold { 56 Rune lo; 57 Rune hi; 58 int32_t delta; 59 }; 60 61 extern const CaseFold unicode_casefold[]; 62 extern const int num_unicode_casefold; 63 64 extern const CaseFold unicode_tolower[]; 65 extern const int num_unicode_tolower; 66 67 // Returns the CaseFold* in the tables that contains rune. 68 // If rune is not in the tables, returns the first CaseFold* after rune. 69 // If rune is larger than any value in the tables, returns NULL. 70 extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune); 71 72 // Returns the result of applying the fold f to the rune r. 73 extern Rune ApplyFold(const CaseFold *f, Rune r); 74 75 } // namespace re2 76 77 #endif // RE2_UNICODE_CASEFOLD_H_ 78