1 // Copyright 2016 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 //////////////////////////////////////////////////////////////////////////////// 16 17 #ifndef UTIL_ENCODINGS_ENCODINGS_H_ 18 #define UTIL_ENCODINGS_ENCODINGS_H_ 19 20 // This interface defines the Encoding enum and various functions that 21 // depend only on Encoding values. 22 23 // A hash-function for Encoding, hash<Encoding>, is defined in 24 // i18n/encodings/public/encodings-hash.h 25 26 // On some Windows projects, UNICODE may be defined, which would prevent the 27 // Encoding enum below from compiling. Note that this is a quick fix that does 28 // not break any existing projects. The UNICODE enum may someday be changed 29 // to something more specific and non-colliding, but this involves careful 30 // testing of changes in many other projects. 31 #undef UNICODE 32 33 // NOTE: The Encoding enum must always start at 0. This assumption has 34 // been made and used. 35 36 #ifndef SWIG 37 38 #include "util/encodings/encodings.pb.h" 39 40 #else 41 42 // TODO: Include a SWIG workaround header file. 43 44 #endif 45 46 const int kNumEncodings = NUM_ENCODINGS; 47 48 // some of the popular encoding aliases 49 // TODO: Make these static const Encoding values instead of macros. 50 #define LATIN1 ISO_8859_1 51 #define LATIN2 ISO_8859_2 52 #define LATIN3 ISO_8859_3 53 #define LATIN4 ISO_8859_4 54 #define CYRILLIC ISO_8859_5 55 #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language 56 #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language 57 #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language 58 #define LATIN5 ISO_8859_9 59 #define LATIN6 ISO_8859_10 60 #define KOREAN_HANGUL KOREAN_EUC_KR 61 62 // The default Encoding (LATIN1). 63 Encoding default_encoding(); 64 65 66 67 // ************************************************************* 68 // Encoding predicates 69 // IsValidEncoding() 70 // IsEncEncCompatible 71 // IsSupersetOfAscii7Bit 72 // Is8BitEncoding 73 // IsCJKEncoding 74 // IsHebrewEncoding 75 // IsRightToLeftEncoding 76 // IsLogicalRightToLeftEncoding 77 // IsVisualRightToLeftEncoding 78 // IsIso2022Encoding 79 // IsIso2022JpOrVariant 80 // IsShiftJisOrVariant 81 // IsJapaneseCellPhoneCarrierSpecificEncoding 82 // ************************************************************* 83 84 // IsValidEncoding 85 // =================================== 86 // 87 // Function to check if the input language enum is within range. 88 // 89 90 bool IsValidEncoding(Encoding enc); 91 92 // 93 // IsEncEncCompatible 94 // ------------------ 95 // 96 // This function is to determine whether or not converting from the 97 // first encoding to the second requires any changes to the underlying 98 // text (e.g. ASCII_7BIT is a subset of UTF8). 99 // 100 // TODO: the current implementation is likely incomplete. It would be 101 // good to consider the full matrix of all pairs of encodings and to fish out 102 // all compatible pairs. 103 // 104 bool IsEncEncCompatible(const Encoding from, const Encoding to); 105 106 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given 107 // encoding represent the same characters as they do in ISO_8859_1. 108 109 // WARNING: This function does not currently return true for all encodings that 110 // are supersets of Ascii 7-bit. 111 bool IsSupersetOfAscii7Bit(Encoding e); 112 113 // To be an 8-bit encoding means that there are fewer than 256 symbols. 114 // Each byte determines a new character; there are no multi-byte sequences. 115 116 // WARNING: This function does not currently return true for all encodings that 117 // are 8-bit encodings. 118 bool Is8BitEncoding(Encoding e); 119 120 // IsCJKEncoding 121 // ------------- 122 // 123 // This function returns true if the encoding is either Chinese 124 // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not 125 // considered a CJK encoding. 126 bool IsCJKEncoding(Encoding e); 127 128 // IsHebrewEncoding 129 // ------------- 130 // 131 // This function returns true if the encoding is a Hebrew specific 132 // encoding (not UTF8, etc). 133 bool IsHebrewEncoding(Encoding e); 134 135 // IsRightToLeftEncoding 136 // --------------------- 137 // 138 // Returns true if the encoding is a right-to-left encoding. 139 // 140 // Note that the name of this function is somewhat misleading. There is nothing 141 // "right to left" about these encodings. They merely contain code points for 142 // characters in RTL languages such as Hebrew and Arabic. But this is also 143 // true for UTF-8. 144 // 145 // TODO: Get rid of this function. The only special-case we 146 // should need to worry about are visual encodings. Anything we 147 // need to do for all 'RTL' encodings we need to do for UTF-8 as well. 148 bool IsRightToLeftEncoding(Encoding enc); 149 150 // IsLogicalRightToLeftEncoding 151 // ---------------------------- 152 // 153 // Returns true if the encoding is a logical right-to-left encoding. 154 // Logical right-to-left encodings are those that the browser renders 155 // right-to-left and applies the BiDi algorithm to. Therefore the characters 156 // appear in reading order in the file, and indexing, snippet generation etc. 157 // should all just work with no special processing. 158 // 159 // TODO: Get rid of this function. The only special-case we 160 // should need to worry about are visual encodings. 161 bool IsLogicalRightToLeftEncoding(Encoding enc); 162 163 // IsVisualRightToLeftEncoding 164 // --------------------------- 165 // 166 // Returns true if the encoding is a visual right-to-left encoding. 167 // Visual right-to-left encodings are those that the browser renders 168 // left-to-right and does not apply the BiDi algorithm to. Therefore each 169 // line appears in reverse order in the file, lines are manually wrapped 170 // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of 171 // the prehistoric days when browsers couldn't render right-to-left, but 172 // unfortunately some visual pages persist to this day. These documents require 173 // special processing so that we don't index or snippet them with each line 174 // reversed. 175 bool IsVisualRightToLeftEncoding(Encoding enc); 176 177 // IsIso2022Encoding 178 // ----------------- 179 // 180 // Returns true if the encoding is a kind of ISO 2022 such as 181 // ISO-2022-JP. 182 bool IsIso2022Encoding(Encoding enc); 183 184 // IsIso2022JpOrVariant 185 // -------------------- 186 // 187 // Returns true if the encoding is ISO-2022-JP or a variant such as 188 // KDDI's ISO-2022-JP. 189 bool IsIso2022JpOrVariant(Encoding enc); 190 191 // IsShiftJisOrVariant 192 // -------------------- 193 // 194 // Returns true if the encoding is Shift_JIS or a variant such as 195 // KDDI's Shift_JIS. 196 bool IsShiftJisOrVariant(Encoding enc); 197 198 // IsJapanesCellPhoneCarrierSpecificEncoding 199 // ----------------------------------------- 200 // 201 // Returns true if it's Japanese cell phone carrier specific encoding 202 // such as KDDI_SHIFT_JIS. 203 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc); 204 205 206 207 // ************************************************************* 208 // ENCODING NAMES 209 // 210 // This interface defines a standard name for each valid encoding, and 211 // a standard name for invalid encodings. (Some names use all upper 212 // case, but others use mixed case.) 213 // 214 // EncodingName() [Encoding to name] 215 // MimeEncodingName() [Encoding to name] 216 // EncodingFromName() [name to Encoding] 217 // EncodingNameAliasToEncoding() [name to Encoding] 218 // default_encoding_name() 219 // invalid_encoding_name() 220 // ************************************************************* 221 222 // EncodingName 223 // ------------ 224 // 225 // Given the encoding, returns its standard name. 226 // Return invalid_encoding_name() if the encoding is invalid. 227 // 228 const char* EncodingName(Encoding enc); 229 230 // 231 // MimeEncodingName 232 // ---------------- 233 // 234 // Return the "preferred MIME name" of an encoding. 235 // 236 // This name is suitable for using in HTTP headers, HTML tags, 237 // and as the "charset" parameter of a MIME Content-Type. 238 const char* MimeEncodingName(Encoding enc); 239 240 241 // The maximum length of an encoding name 242 const int kMaxEncodingNameSize = 50; 243 244 // The standard name of the default encoding. 245 const char* default_encoding_name(); 246 247 // The name used for an invalid encoding. 248 const char* invalid_encoding_name(); 249 250 // EncodingFromName 251 // ---------------- 252 // 253 // If enc_name matches the standard name of an Encoding, using a 254 // case-insensitive comparison, set *encoding to that Encoding and 255 // return true. Otherwise set *encoding to UNKNOWN_ENCODING and 256 // return false. 257 // 258 // REQUIRES: encoding must not be NULL. 259 // 260 bool EncodingFromName(const char* enc_name, Encoding *encoding); 261 262 // 263 // EncodingNameAliasToEncoding 264 // --------------------------- 265 // 266 // If enc_name matches the standard name or an alias of an Encoding, 267 // using a case-insensitive comparison, return that 268 // Encoding. Otherwise, return UNKNOWN_ENCODING. 269 // 270 // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for 271 // GREEK), alternate names (e.g., "cyrillic" for ISO_8859_5) and 272 // common variations with hyphens and underscores (e.g., "koi8-u" and 273 // "koi8u" for RUSSIAN_KOI8_R). 274 275 Encoding EncodingNameAliasToEncoding(const char *enc_name); 276 277 // ************************************************************* 278 // Miscellany 279 // ************************************************************* 280 281 // PreferredWebOutputEncoding 282 // -------------------------- 283 // 284 // Some multi-byte encodings use byte values that coincide with the 285 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE 286 // can misinterpret these, as indicated in an external XSS report from 287 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We 288 // also use UTF8 instead of encodings that we don't support in our 289 // output, and we generally try to be conservative in what we send out. 290 // Where the client asks for single- or double-byte encodings that are 291 // not as common, we substitute a more common single- or double-byte 292 // encoding, if there is one, thereby preserving the client's intent 293 // to use less space than UTF-8. This also means that characters 294 // outside the destination set will be converted to HTML NCRs (&#NNN;) 295 // if requested. 296 Encoding PreferredWebOutputEncoding(Encoding enc); 297 298 299 #endif // UTIL_ENCODINGS_ENCODINGS_H_ 300