1 // Copyright 2008 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_UNICODE_GROUPS_H_ 6 #define RE2_UNICODE_GROUPS_H_ 7 8 // Unicode character groups. 9 10 // The codes get split into ranges of 16-bit codes 11 // and ranges of 32-bit codes. It would be simpler 12 // to use only 32-bit ranges, but these tables are large 13 // enough to warrant extra care. 14 // 15 // Using just 32-bit ranges gives 27 kB of data. 16 // Adding 16-bit ranges gives 18 kB of data. 17 // Adding an extra table of 16-bit singletons would reduce 18 // to 16.5 kB of data but make the data harder to use; 19 // we don't bother. 20 21 #include <stdint.h> 22 23 #include "util/utf.h" 24 25 namespace re2 { 26 27 struct URange16 28 { 29 uint16_t lo; 30 uint16_t hi; 31 }; 32 33 struct URange32 34 { 35 Rune lo; 36 Rune hi; 37 }; 38 39 struct UGroup 40 { 41 const char *name; 42 int sign; // +1 for [abc], -1 for [^abc] 43 const URange16 *r16; 44 int nr16; 45 const URange32 *r32; 46 int nr32; 47 }; 48 49 // Named by property or script name (e.g., "Nd", "N", "Han"). 50 // Negated groups are not included. 51 extern const UGroup unicode_groups[]; 52 extern const int num_unicode_groups; 53 54 // Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]"). 55 // Negated groups are included. 56 extern const UGroup posix_groups[]; 57 extern const int num_posix_groups; 58 59 // Named by Perl name (e.g., "\\d", "\\D"). 60 // Negated groups are included. 61 extern const UGroup perl_groups[]; 62 extern const int num_perl_groups; 63 64 } // namespace re2 65 66 #endif // RE2_UNICODE_GROUPS_H_ 67