xref: /aosp_15_r20/external/cronet/third_party/ced/src/util/encodings/encodings.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2016 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 ////////////////////////////////////////////////////////////////////////////////
16 
17 #ifndef UTIL_ENCODINGS_ENCODINGS_H_
18 #define UTIL_ENCODINGS_ENCODINGS_H_
19 
20 // This interface defines the Encoding enum and various functions that
21 // depend only on Encoding values.
22 
23 // A hash-function for Encoding, hash<Encoding>, is defined in
24 // i18n/encodings/public/encodings-hash.h
25 
26 // On some Windows projects, UNICODE may be defined as a macro, which would
27 // prevent the Encoding enum below from compiling. The #undef is a quick fix
28 // that does not break any existing projects. The UNICODE enum value may someday
29 // be renamed to something more specific and non-colliding, but this involves
30 // careful testing of changes in many other projects.
31 #undef UNICODE
32 
33 // NOTE: The Encoding enum must always start at 0. This assumption has
34 // been made and used.
35 
36 #ifndef SWIG
37 
38 #include "util/encodings/encodings.pb.h"
39 
40 #else
41 
42 // TODO: Include a SWIG workaround header file.
43 
44 #endif
45 
46 const int kNumEncodings = NUM_ENCODINGS;
47 
48 // some of the popular encoding aliases
49 // TODO: Make these static const Encoding values instead of macros.
50 #define LATIN1           ISO_8859_1
51 #define LATIN2           ISO_8859_2
52 #define LATIN3           ISO_8859_3
53 #define LATIN4           ISO_8859_4
54 #define CYRILLIC         ISO_8859_5
55 #define ARABIC_ENCODING  ISO_8859_6     // avoiding the same name as language
56 #define GREEK_ENCODING   ISO_8859_7     // avoiding the same name as language
57 #define HEBREW_ENCODING  ISO_8859_8     // avoiding the same name as language
58 #define LATIN5           ISO_8859_9
59 #define LATIN6           ISO_8859_10
60 #define KOREAN_HANGUL    KOREAN_EUC_KR
61 
62 // The default Encoding (LATIN1).
63 Encoding default_encoding();
64 
65 
66 
67 // *************************************************************
68 // Encoding predicates
69 //   IsValidEncoding()
70 //   IsEncEncCompatible
71 //   IsSupersetOfAscii7Bit
72 //   Is8BitEncoding
73 //   IsCJKEncoding
74 //   IsHebrewEncoding
75 //   IsRightToLeftEncoding
76 //   IsLogicalRightToLeftEncoding
77 //   IsVisualRightToLeftEncoding
78 //   IsIso2022Encoding
79 //   IsIso2022JpOrVariant
80 //   IsShiftJisOrVariant
81 //   IsJapaneseCellPhoneCarrierSpecificEncoding
82 // *************************************************************
83 
84 // IsValidEncoding
85 // ---------------
86 //
87 // Checks whether the input encoding enum value is within range.
88 //
89 
90 bool IsValidEncoding(Encoding enc);
91 
92 //
93 // IsEncEncCompatible
94 // ------------------
95 //
96 // Determines whether or not converting from the first encoding to the
97 // second requires any changes to the underlying text (e.g. ASCII_7BIT is
98 // a subset of UTF8, so ASCII_7BIT text is already valid UTF8).
99 //
100 // TODO: the current implementation is likely incomplete.  It would be
101 // good to consider the full matrix of all pairs of encodings and to fish out
102 // all compatible pairs.
103 //
104 bool IsEncEncCompatible(Encoding from, Encoding to);
105 
106 // To be a superset of 7-bit ASCII means that bytes 0...127 in the given
107 // encoding represent the same characters as they do in ISO_8859_1.
108 
109 // WARNING: This function does not currently return true for all encodings that
110 // are supersets of 7-bit ASCII.
111 bool IsSupersetOfAscii7Bit(Encoding e);
112 
113 // To be an 8-bit encoding means that there are at most 256 symbols.
114 // Each byte determines a new character; there are no multi-byte sequences.
115 
116 // WARNING: This function does not currently return true for all encodings that
117 // are 8-bit encodings.
118 bool Is8BitEncoding(Encoding e);
119 
120 // IsCJKEncoding
121 // -------------
122 //
123 // This function returns true if the encoding is either Chinese
124 // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
125 // considered a CJK encoding.
126 bool IsCJKEncoding(Encoding e);
127 
128 // IsHebrewEncoding
129 // ----------------
130 //
131 // This function returns true if the encoding is a Hebrew-specific
132 // encoding (not UTF8, etc).
133 bool IsHebrewEncoding(Encoding e);
134 
135 // IsRightToLeftEncoding
136 // ---------------------
137 //
138 // Returns true if the encoding is a right-to-left encoding.
139 //
140 // Note that the name of this function is somewhat misleading. There is nothing
141 // "right to left" about these encodings. They merely contain code points for
142 // characters in RTL languages such as Hebrew and Arabic. But this is also
143 // true for UTF-8.
144 //
145 // TODO: Get rid of this function. The only special case we
146 // should need to worry about is visual encodings. Anything we
147 // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
148 bool IsRightToLeftEncoding(Encoding enc);
149 
150 // IsLogicalRightToLeftEncoding
151 // ----------------------------
152 //
153 // Returns true if the encoding is a logical right-to-left encoding.
154 // Logical right-to-left encodings are those that the browser renders
155 // right-to-left and applies the BiDi algorithm to. Therefore the characters
156 // appear in reading order in the file, and indexing, snippet generation etc.
157 // should all just work with no special processing.
158 //
159 // TODO: Get rid of this function. The only special case we
160 // should need to worry about is visual encodings.
161 bool IsLogicalRightToLeftEncoding(Encoding enc);
162 
163 // IsVisualRightToLeftEncoding
164 // ---------------------------
165 //
166 // Returns true if the encoding is a visual right-to-left encoding.
167 // Visual right-to-left encodings are those that the browser renders
168 // left-to-right and does not apply the BiDi algorithm to. Therefore each
169 // line appears in reverse order in the file, lines are manually wrapped
170 // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
171 // the prehistoric days when browsers couldn't render right-to-left, but
172 // unfortunately some visual pages persist to this day. These documents require
173 // special processing so that we don't index or snippet them with each line
174 // reversed.
175 bool IsVisualRightToLeftEncoding(Encoding enc);
176 
177 // IsIso2022Encoding
178 // -----------------
179 //
180 // Returns true if the encoding is a kind of ISO 2022 such as
181 // ISO-2022-JP.
182 bool IsIso2022Encoding(Encoding enc);
183 
184 // IsIso2022JpOrVariant
185 // --------------------
186 //
187 // Returns true if the encoding is ISO-2022-JP or a variant such as
188 // KDDI's ISO-2022-JP.
189 bool IsIso2022JpOrVariant(Encoding enc);
190 
191 // IsShiftJisOrVariant
192 // --------------------
193 //
194 // Returns true if the encoding is Shift_JIS or a variant such as
195 // KDDI's Shift_JIS.
196 bool IsShiftJisOrVariant(Encoding enc);
197 
198 // IsJapaneseCellPhoneCarrierSpecificEncoding
199 // ------------------------------------------
200 //
201 // Returns true if it's a Japanese cell phone carrier-specific encoding
202 // such as KDDI_SHIFT_JIS.
203 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
204 
205 
206 
207 // *************************************************************
208 // ENCODING NAMES
209 //
210 // This interface defines a standard name for each valid encoding, and
211 // a standard name for invalid encodings. (Some names use all upper
212 // case, but others use mixed case.)
213 //
214 //   EncodingName() [Encoding to name]
215 //   MimeEncodingName() [Encoding to name]
216 //   EncodingFromName() [name to Encoding]
217 //   EncodingNameAliasToEncoding() [name to Encoding]
218 //   default_encoding_name()
219 //   invalid_encoding_name()
220 // *************************************************************
221 
222 // EncodingName
223 // ------------
224 //
225 // Given the encoding, returns its standard name.
226 // Returns invalid_encoding_name() if the encoding is invalid.
227 //
228 const char* EncodingName(Encoding enc);
229 
230 //
231 // MimeEncodingName
232 // ----------------
233 //
234 // Returns the "preferred MIME name" of an encoding.
235 //
236 // This name is suitable for use in HTTP headers, HTML tags,
237 // and as the "charset" parameter of a MIME Content-Type.
238 const char* MimeEncodingName(Encoding enc);
239 
240 
241 // The maximum length of an encoding name
242 const int kMaxEncodingNameSize = 50;
243 
244 // The standard name of the default encoding.
245 const char* default_encoding_name();
246 
247 // The name used for an invalid encoding.
248 const char* invalid_encoding_name();
249 
250 // EncodingFromName
251 // ----------------
252 //
253 // If enc_name matches the standard name of an Encoding, using a
254 // case-insensitive comparison, set *encoding to that Encoding and
255 // return true.  Otherwise set *encoding to UNKNOWN_ENCODING and
256 // return false.
257 //
258 // REQUIRES: encoding must not be NULL.
259 //
260 bool EncodingFromName(const char* enc_name, Encoding *encoding);
261 
262 //
263 // EncodingNameAliasToEncoding
264 // ---------------------------
265 //
266 // If enc_name matches the standard name or an alias of an Encoding,
267 // using a case-insensitive comparison, return that
268 // Encoding. Otherwise, return UNKNOWN_ENCODING.
269 //
270 // Aliases include most mime-encoding names (e.g., "ISO-8859-7" for
271 // GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
272 // common variations with hyphens and underscores (e.g., "koi8-u" and
273 // "koi8u" for RUSSIAN_KOI8_R).
274 
275 Encoding EncodingNameAliasToEncoding(const char *enc_name);
276 
277 // *************************************************************
278 // Miscellany
279 // *************************************************************
280 
281 // PreferredWebOutputEncoding
282 // --------------------------
283 //
284 // Some multi-byte encodings use byte values that coincide with the
285 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
286 // can misinterpret these, as indicated in an external XSS report from
287 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
288 // also use UTF8 instead of encodings that we don't support in our
289 // output, and we generally try to be conservative in what we send out.
290 // Where the client asks for single- or double-byte encodings that are
291 // not as common, we substitute a more common single- or double-byte
292 // encoding, if there is one, thereby preserving the client's intent
293 // to use less space than UTF-8. This also means that characters
294 // outside the destination set will be converted to HTML NCRs (&#NNN;)
295 // if requested.
296 Encoding PreferredWebOutputEncoding(Encoding enc);
297 
298 
299 #endif  // UTIL_ENCODINGS_ENCODINGS_H_
300