// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#ifndef UTIL_ENCODINGS_ENCODINGS_PB_H_
#define UTIL_ENCODINGS_ENCODINGS_PB_H_

// Integer identifiers for character encodings.
//
// Every enumerator carries an explicit value (0 through 74, with no gaps),
// so inserting or reordering entries does not silently renumber the others.
// NUM_ENCODINGS (75) is a count sentinel, not an encoding.
// NOTE(review): existing values are presumably relied on by callers and
// stored data -- confirm before changing any of them.
enum Encoding {
  ISO_8859_1 = 0,           // Teragram ASCII
  ISO_8859_2 = 1,           // Teragram Latin2
  ISO_8859_3 = 2,           // in BasisTech but not in Teragram
  ISO_8859_4 = 3,           // Teragram Latin4
  ISO_8859_5 = 4,           // Teragram ISO-8859-5
  ISO_8859_6 = 5,           // Teragram Arabic
  ISO_8859_7 = 6,           // Teragram Greek
  ISO_8859_8 = 7,           // Teragram Hebrew
  ISO_8859_9 = 8,           // in BasisTech but not in Teragram
  ISO_8859_10 = 9,          // in BasisTech but not in Teragram
  JAPANESE_EUC_JP = 10,     // Teragram EUC_JP
  JAPANESE_SHIFT_JIS = 11,  // Teragram SJS
  JAPANESE_JIS = 12,        // Teragram JIS
  CHINESE_BIG5 = 13,        // Teragram BIG5
  CHINESE_GB = 14,          // Teragram GB
  CHINESE_EUC_CN = 15,      // Misnamed. Should be EUC_TW. Was Basis Tech
                            // CNS11643EUC, before that Teragram EUC-CN(!)
                            // See //i18n/basistech/basistech_encodings.h
  KOREAN_EUC_KR = 16,       // Teragram KSC
  UNICODE = 17,             // Teragram Unicode
  CHINESE_EUC_DEC = 18,     // Misnamed. Should be EUC_TW. Was Basis Tech
                            // CNS11643EUC, before that Teragram EUC.
  CHINESE_CNS = 19,         // Misnamed. Should be EUC_TW. Was Basis Tech
                            // CNS11643EUC, before that Teragram CNS.
  CHINESE_BIG5_CP950 = 20,  // Teragram BIG5_CP950
  JAPANESE_CP932 = 21,      // Teragram CP932
  UTF8 = 22,
  UNKNOWN_ENCODING = 23,
  ASCII_7BIT = 24,          // ISO_8859_1 with all characters <= 127.
                            // Should be present only in the crawler
                            // and in the repository,
                            // *never* as a result of Document::encoding().
  RUSSIAN_KOI8_R = 25,      // Teragram KOI8R
  RUSSIAN_CP1251 = 26,      // Teragram CP1251

  //----------------------------------------------------------
  // These are _not_ output from teragram. Instead, they are as
  // detected in the headers of usenet articles.
  MSFT_CP1252 = 27,      // 27: CP1252 aka MSFT euro ascii
  RUSSIAN_KOI8_RU = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
                         // Misnamed, this is _not_ KOI8-RU but KOI8-U.
                         // KOI8-U is used much more often than KOI8-RU.
  MSFT_CP1250 = 29,      // CP1250 aka MSFT eastern european
  ISO_8859_15 = 30,      // aka ISO_8859_0 aka ISO_8859_1 euroized
  //----------------------------------------------------------

  //----------------------------------------------------------
  // These are in BasisTech but not in Teragram. They are
  // needed for new interface languages. Now detected by
  // research langid
  MSFT_CP1254 = 31,  // used for Turkish
  MSFT_CP1257 = 32,  // used in Baltic countries
  //----------------------------------------------------------

  //----------------------------------------------------------
  //----------------------------------------------------------
  // New encodings detected by Teragram
  ISO_8859_11 = 33,  // aka TIS-620, used for Thai
  MSFT_CP874 = 34,   // used for Thai
  MSFT_CP1256 = 35,  // used for Arabic

  //----------------------------------------------------------
  // Detected as ISO_8859_8 by Teragram, but can be found in META tags
  MSFT_CP1255 = 36,    // Logical Hebrew Microsoft
  ISO_8859_8_I = 37,   // Iso Hebrew Logical
  HEBREW_VISUAL = 38,  // Iso Hebrew Visual
  //----------------------------------------------------------

  //----------------------------------------------------------
  // Detected by research langid
  CZECH_CP852 = 39,
  CZECH_CSN_369103 = 40,  // aka ISO_IR_139 aka KOI8_CS
  MSFT_CP1253 = 41,       // used for Greek
  RUSSIAN_CP866 = 42,
  //----------------------------------------------------------

  //----------------------------------------------------------
  // Handled by iconv in glibc
  ISO_8859_13 = 43,
  ISO_2022_KR = 44,
  GBK = 45,
  GB18030 = 46,
  BIG5_HKSCS = 47,
  ISO_2022_CN = 48,

  //-----------------------------------------------------------
  // Detected by xin liu's detector
  // Handled by transcoder
  // (Indic encodings)

  TSCII = 49,
  TAMIL_MONO = 50,
  TAMIL_BI = 51,
  JAGRAN = 52,


  MACINTOSH_ROMAN = 53,
  UTF7 = 54,
  BHASKAR = 55,     // Indic encoding - Devanagari
  HTCHANAKYA = 56,  // 56 Indic encoding - Devanagari

  //-----------------------------------------------------------
  // These allow a single place (inputconverter and outputconverter)
  // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
  // bulk conversions, with interchange-valid checking on input and
  // fallback if needed on output.
  UTF16BE = 57,  // big-endian UTF-16
  UTF16LE = 58,  // little-endian UTF-16
  UTF32BE = 59,  // big-endian UTF-32
  UTF32LE = 60,  // little-endian UTF-32
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // An encoding that means "This is not text, but it may have some
  // simple ASCII text embedded". Intended input conversion (not yet
  // implemented) is to keep strings of >=4 seven-bit ASCII characters
  // (follow each kept string with an ASCII space), delete the rest of
  // the bytes. This will pick up and allow indexing of e.g. captions
  // in JPEGs. No output conversion needed.
  BINARYENC = 61,
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
  // ~{ ... ~} for 2-byte pairs, and the browsers support this.
  HZ_GB_2312 = 62,
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // Some external vendors make the common input error of
  // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
  UTF8UTF8 = 63,
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // Handled by transcoder for Tamil language specific font
  // encodings without the support for detection at present.
  TAM_ELANGO = 64,      // Elango - Tamil
  TAM_LTTMBARANI = 65,  // Barani - Tamil
  TAM_SHREE = 66,       // Shree - Tamil
  TAM_TBOOMIS = 67,     // TBoomis - Tamil
  TAM_TMNEWS = 68,      // TMNews - Tamil
  TAM_WEBTAMIL = 69,    // Webtamil - Tamil
  //-----------------------------------------------------------

  //-----------------------------------------------------------
  // Shift_JIS variants used by Japanese cell phone carriers.
  KDDI_SHIFT_JIS = 70,
  DOCOMO_SHIFT_JIS = 71,
  SOFTBANK_SHIFT_JIS = 72,
  // ISO-2022-JP variants used by KDDI and SoftBank.
  KDDI_ISO_2022_JP = 73,
  SOFTBANK_ISO_2022_JP = 74,
  //-----------------------------------------------------------

  NUM_ENCODINGS = 75,  // Always keep this at the end. It is not a
                       // valid Encoding enum, it is only used to
                       // indicate the total number of Encodings.
};

#endif  // UTIL_ENCODINGS_ENCODINGS_PB_H_