1 // Copyright 2016 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 //////////////////////////////////////////////////////////////////////////////// 16 17 #ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_ 18 #define COMPACT_ENC_DET_COMPACT_ENC_DET_H_ 19 20 #include "util/encodings/encodings.h" // for Encoding 21 #include "util/languages/languages.h" // for Language 22 23 #include <string.h> 24 25 namespace CompactEncDet { 26 // We may want different statistics, depending on whether the text being 27 // identfied is from the web, from email, etc. This is currently ignored, 28 // except WEB_CORPUS enables ignoring chars inside tags. 29 enum TextCorpusType { 30 WEB_CORPUS, 31 XML_CORPUS, 32 QUERY_CORPUS, // Use this for vanilla plaintext 33 EMAIL_CORPUS, 34 NUM_CORPA, // always last 35 }; 36 37 // Scan raw bytes and detect most likely encoding 38 // Design goals: 39 // Skip over big initial stretches of seven-bit ASCII bytes very quickly 40 // Thread safe 41 // Works equally well on 42 // 50-byte queries, 43 // 5000-byte email and 44 // 50000-byte web pages 45 // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1) 46 // 47 // Inputs: text and text_length 48 // web page's url (preferred) or just 49 // top-level domain name (e.g. "com") or NULL as a hint 50 // web page's HTTPheader charset= string (e.g. "Latin1") or NULL as a hint 51 // web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint 52 // an Encoding or UNKNOWN_ENCODING as a hint 53 // a Language or UNKNOWN_LANGUAGE as a hint 54 // corpus type from the list above. Currently ignored; may select 55 // different probability tables in the future 56 // ignore_7bit if true says to NOT return the pure seven-bit encodings 57 // ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7. 58 // This may save a little scoring time on pure printable ASCII input text 59 // Outputs: bytes_consumed says how much of text_length was actually examined 60 // is_reliable set true if the returned encoding is at least 2**10 time more 61 // probable then the second-best encoding 62 // Return value: the most likely encoding for the input text 63 // 64 // Setting ignore_7bit_mail_encodings effectively turns off detection of 65 // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true 66 // when corpus_type is QUERY_CORPUS. 67 Encoding DetectEncoding( 68 const char* text, int text_length, const char* url_hint, 69 const char* http_charset_hint, const char* meta_charset_hint, 70 const int encoding_hint, 71 const Language language_hint, // User interface lang 72 const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, 73 int* bytes_consumed, bool* is_reliable); 74 75 // Support functions for unit test program 76 int BackmapEncodingToRankedEncoding(Encoding enc); 77 Encoding TopEncodingOfLangHint(const char* name); 78 Encoding TopEncodingOfTLDHint(const char* name); 79 Encoding TopEncodingOfCharsetHint(const char* name); 80 const char* Version(void); 81 } // End namespace CompactEncDet 82 83 #endif // COMPACT_ENC_DET_COMPACT_ENC_DET_H_ 84