xref: /aosp_15_r20/external/cronet/third_party/ced/src/compact_enc_det/compact_enc_det.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2016 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 ////////////////////////////////////////////////////////////////////////////////
16 
17 #ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_
18 #define COMPACT_ENC_DET_COMPACT_ENC_DET_H_
19 
20 #include "util/encodings/encodings.h"  // for Encoding
21 #include "util/languages/languages.h"  // for Language
22 
23 #include <string.h>
24 
25 namespace CompactEncDet {
26   // We may want different statistics, depending on whether the text being
27   // identified is from the web, from email, etc.  This is currently ignored,
28   // except WEB_CORPUS enables ignoring chars inside tags.
29   enum TextCorpusType {
30     WEB_CORPUS,
31     XML_CORPUS,
32     QUERY_CORPUS,       // Use this for vanilla plaintext
33     EMAIL_CORPUS,
34     NUM_CORPA,          // always last; equals the number of corpus types
35   };
36 
37   // Scan raw bytes and detect most likely encoding
38   // Design goals:
39   //   Skip over big initial stretches of seven-bit ASCII bytes very quickly
40   //   Thread safe
41   //   Works equally well on
42   //    50-byte queries,
43   //    5000-byte email and
44   //    50000-byte web pages
45   // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)
46   //
47   // Inputs: text and text_length
48   //  web page's url (preferred) or just
49   //    top-level domain name (e.g. "com") or NULL as a hint
50   //  web page's HTTP header charset= string (e.g. "Latin1") or NULL as a hint
51   //  web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint
52   //  an Encoding (passed as int) or UNKNOWN_ENCODING as a hint
53   //  a Language or UNKNOWN_LANGUAGE as a hint
54   //  corpus type from the list above. Currently ignored (except WEB_CORPUS,
55   //    see TextCorpusType); may select different probability tables later
56   //  ignore_7bit_mail_encodings if true says to NOT return the pure seven-bit
57   //    encodings ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.
58   //    This may save a little scoring time on pure printable ASCII input text
59   // Outputs: bytes_consumed says how much of text_length was actually examined
60   //  is_reliable set true if the returned encoding is at least 2**10 times more
61   //  probable than the second-best encoding
62   // Return value: the most likely encoding for the input text
63   //
64   // Setting ignore_7bit_mail_encodings effectively turns off detection of
65   // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true
66   // when corpus_type is QUERY_CORPUS.
67   Encoding DetectEncoding(
68       const char* text, int text_length, const char* url_hint,
69       const char* http_charset_hint, const char* meta_charset_hint,
70       const int encoding_hint,
71       const Language language_hint,  // User interface lang
72       const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
73       int* bytes_consumed, bool* is_reliable);
74 
75   // Support functions for unit test program
76   int BackmapEncodingToRankedEncoding(Encoding enc);
77   Encoding TopEncodingOfLangHint(const char* name);
78   Encoding TopEncodingOfTLDHint(const char* name);
79   Encoding TopEncodingOfCharsetHint(const char* name);
80   const char* Version(void);
81 }      // End namespace CompactEncDet
82 
83 #endif  // COMPACT_ENC_DET_COMPACT_ENC_DET_H_
84