1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
19
20 #include "lang_id/script/script-detector.h"
21
22 namespace libtextclassifier3 {
23 namespace mobile {
24 namespace lang_id {
25
// The Unicode scripts this detector distinguishes.  Only a handful of scripts
// are detected: those that give a strong hint about the language of the text
// (e.g., Hiragana -> Japanese).  Keeping the list short keeps the detection
// code compact and fast.
enum Script {
  // Reserved for internal errors in the script detection code.
  kScriptError = 0,

  // Catch-all values for characters whose script is not detected.  There is
  // one value per UTF8 encoding length (1, 2, 3 or 4 bytes): the length is
  // already known at detection time, so it is reported as well.
  // kScriptOtherUtf8OneByte means ~Latin and kScriptOtherUtf8FourBytes means
  // ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  // Scripts that are detected explicitly.
  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Must stay last: do not add any script after kNumRelevantScripts.  Its
  // value equals the number of enumerators that precede it, which makes it
  // easy to iterate over all scripts.
  kNumRelevantScripts,
};
57
// Returns true iff value lies in the closed interval [low, hi]; both bounds
// are inclusive.  When low > hi the interval is empty and the result is false.
//
// Declared constexpr so the check can also be used in constant expressions
// (e.g., inside static_assert); runtime callers are unaffected.
template <typename IntType>
constexpr bool InRange(IntType value, IntType low, IntType hi) {
  return (value >= low) && (value <= hi);
}
62
63 // Returns Script for the UTF8 character that starts at address p.
64 // Precondition: p points to a valid UTF8 character of num_bytes bytes.
GetScript(const unsigned char * p,int num_bytes)65 inline Script GetScript(const unsigned char *p, int num_bytes) {
66 switch (num_bytes) {
67 case 1:
68 return kScriptOtherUtf8OneByte;
69
70 case 2: {
71 // 2-byte UTF8 characters have 11 bits of information. unsigned int has
72 // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
73 // it's enough. It's also usually the fastest int type on the current
74 // CPU, so it's better to use than int32.
75 static const unsigned int kGreekStart = 0x370;
76
77 // Commented out (unused in the code): kGreekEnd = 0x3FF;
78 static const unsigned int kCyrillicStart = 0x400;
79 static const unsigned int kCyrillicEnd = 0x4FF;
80 static const unsigned int kHebrewStart = 0x590;
81
82 // Commented out (unused in the code): kHebrewEnd = 0x5FF;
83 static const unsigned int kArabicStart = 0x600;
84 static const unsigned int kArabicEnd = 0x6FF;
85 const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
86 if (codepoint > kCyrillicEnd) {
87 if (codepoint >= kArabicStart) {
88 if (codepoint <= kArabicEnd) {
89 return kScriptArabic;
90 }
91 } else {
92 // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
93 // codepoint <= kHebrewEnd.
94 if (codepoint >= kHebrewStart) {
95 return kScriptHebrew;
96 }
97 }
98 } else {
99 if (codepoint >= kCyrillicStart) {
100 return kScriptCyrillic;
101 } else {
102 // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
103 // codepoint <= kGreekEnd.
104 if (codepoint >= kGreekStart) {
105 return kScriptGreek;
106 }
107 }
108 }
109 return kScriptOtherUtf8TwoBytes;
110 }
111
112 case 3: {
113 // 3-byte UTF8 characters have 16 bits of information. unsigned int has
114 // at least 16 bits.
115 static const unsigned int kHangulJamoStart = 0x1100;
116 static const unsigned int kHangulJamoEnd = 0x11FF;
117 static const unsigned int kHiraganaStart = 0x3041;
118 static const unsigned int kHiraganaEnd = 0x309F;
119
120 // Commented out (unused in the code): kKatakanaStart = 0x30A0;
121 static const unsigned int kKatakanaEnd = 0x30FF;
122 const unsigned int codepoint =
123 ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
124 if (codepoint > kHiraganaEnd) {
125 // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
126 // codepoint >= kKatakanaStart.
127 if (codepoint <= kKatakanaEnd) {
128 return kScriptKatakana;
129 }
130 } else {
131 if (codepoint >= kHiraganaStart) {
132 return kScriptHiragana;
133 } else {
134 if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
135 return kScriptHangulJamo;
136 }
137 }
138 }
139 return kScriptOtherUtf8ThreeBytes;
140 }
141
142 case 4:
143 return kScriptOtherUtf8FourBytes;
144
145 default:
146 return kScriptError;
147 }
148 }
149
150 // Returns Script for the UTF8 character that starts at address p. Similar to
151 // the previous version of GetScript, except for "char" vs "unsigned char".
152 // Most code works with "char *" pointers, ignoring the fact that char is
153 // unsigned (by default) on most platforms, but signed on iOS. This code takes
154 // care of making sure we always treat chars as unsigned.
GetScript(const char * p,int num_bytes)155 inline Script GetScript(const char *p, int num_bytes) {
156 return GetScript(reinterpret_cast<const unsigned char *>(p),
157 num_bytes);
158 }
159
160 class TinyScriptDetector : public ScriptDetector {
161 public:
162 ~TinyScriptDetector() override = default;
163
GetScript(const char * s,int num_bytes)164 int GetScript(const char *s, int num_bytes) const override {
165 // Add the namespace in indicate that we want to call the method outside
166 // this class, instead of performing an infinite recursive call.
167 return libtextclassifier3::mobile::lang_id::GetScript(s, num_bytes);
168 }
169
GetMaxScript()170 int GetMaxScript() const override {
171 return kNumRelevantScripts - 1;
172 }
173
174 SAFTM_DEFINE_REGISTRATION_METHOD("tiny-script-detector", TinyScriptDetector);
175 };
176
177 } // namespace lang_id
178 } // namespace mobile
}  // namespace libtextclassifier3
180
181 #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
182