xref: /aosp_15_r20/external/libtextclassifier/native/lang_id/script/tiny-script-detector.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
19 
20 #include "lang_id/script/script-detector.h"
21 
22 namespace libtextclassifier3 {
23 namespace mobile {
24 namespace lang_id {
25 
// Set of Unicode scripts distinguished by the detection code in this file.
// Only a handful of scripts are told apart, namely those that give a strong
// hint about the language of the text (e.g., Hiragana -> Japanese); this keeps
// the detection code small and fast.
enum Script {
  // Reserved for reporting internal errors in the script detection code.
  kScriptError = 0,

  // Catch-all values for characters from scripts that are not detected
  // individually.  There is one value per UTF8 encoding length (1, 2, 3, or 4
  // bytes), since that length is already known to the detection code.  As a
  // rough guide, kScriptOtherUtf8OneByte stands for ~Latin and
  // kScriptOtherUtf8FourBytes stands for ~Han.
  kScriptOtherUtf8OneByte,
  kScriptOtherUtf8TwoBytes,
  kScriptOtherUtf8ThreeBytes,
  kScriptOtherUtf8FourBytes,

  // Scripts that are detected individually.
  kScriptGreek,
  kScriptCyrillic,
  kScriptHebrew,
  kScriptArabic,
  kScriptHangulJamo,  // Used primarily for Korean.
  kScriptHiragana,    // Used primarily for Japanese.
  kScriptKatakana,    // Used primarily for Japanese.

  // Add new scripts here.

  // Must remain the last enumerator: its value equals the number of
  // enumerators declared above it, which makes it easy to iterate over all
  // scripts.  Do not add any script after kNumRelevantScripts.
  kNumRelevantScripts,
};
57 
// Returns true if and only if value lies in the closed interval [low, hi],
// i.e., both bounds are inclusive.
template <typename IntType>
inline bool InRange(IntType value, IntType low, IntType hi) {
  // Reject values below the lower bound first; the upper bound is only
  // examined for values that passed the lower-bound check.
  if (!(value >= low)) {
    return false;
  }
  return value <= hi;
}
62 
63 // Returns Script for the UTF8 character that starts at address p.
64 // Precondition: p points to a valid UTF8 character of num_bytes bytes.
GetScript(const unsigned char * p,int num_bytes)65 inline Script GetScript(const unsigned char *p, int num_bytes) {
66   switch (num_bytes) {
67     case 1:
68       return kScriptOtherUtf8OneByte;
69 
70     case 2: {
71       // 2-byte UTF8 characters have 11 bits of information.  unsigned int has
72       // at least 16 bits (http://en.cppreference.com/w/cpp/language/types) so
73       // it's enough.  It's also usually the fastest int type on the current
74       // CPU, so it's better to use than int32.
75       static const unsigned int kGreekStart = 0x370;
76 
77       // Commented out (unused in the code): kGreekEnd = 0x3FF;
78       static const unsigned int kCyrillicStart = 0x400;
79       static const unsigned int kCyrillicEnd = 0x4FF;
80       static const unsigned int kHebrewStart = 0x590;
81 
82       // Commented out (unused in the code): kHebrewEnd = 0x5FF;
83       static const unsigned int kArabicStart = 0x600;
84       static const unsigned int kArabicEnd = 0x6FF;
85       const unsigned int codepoint = ((p[0] & 0x1F) << 6) | (p[1] & 0x3F);
86       if (codepoint > kCyrillicEnd) {
87         if (codepoint >= kArabicStart) {
88           if (codepoint <= kArabicEnd) {
89             return kScriptArabic;
90           }
91         } else {
92           // At this point, codepoint < kArabicStart = kHebrewEnd + 1, so
93           // codepoint <= kHebrewEnd.
94           if (codepoint >= kHebrewStart) {
95             return kScriptHebrew;
96           }
97         }
98       } else {
99         if (codepoint >= kCyrillicStart) {
100           return kScriptCyrillic;
101         } else {
102           // At this point, codepoint < kCyrillicStart = kGreekEnd + 1, so
103           // codepoint <= kGreekEnd.
104           if (codepoint >= kGreekStart) {
105             return kScriptGreek;
106           }
107         }
108       }
109       return kScriptOtherUtf8TwoBytes;
110     }
111 
112     case 3: {
113       // 3-byte UTF8 characters have 16 bits of information.  unsigned int has
114       // at least 16 bits.
115       static const unsigned int kHangulJamoStart = 0x1100;
116       static const unsigned int kHangulJamoEnd = 0x11FF;
117       static const unsigned int kHiraganaStart = 0x3041;
118       static const unsigned int kHiraganaEnd = 0x309F;
119 
120       // Commented out (unused in the code): kKatakanaStart = 0x30A0;
121       static const unsigned int kKatakanaEnd = 0x30FF;
122       const unsigned int codepoint =
123           ((p[0] & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
124       if (codepoint > kHiraganaEnd) {
125         // On this branch, codepoint > kHiraganaEnd = kKatakanaStart - 1, so
126         // codepoint >= kKatakanaStart.
127         if (codepoint <= kKatakanaEnd) {
128           return kScriptKatakana;
129         }
130       } else {
131         if (codepoint >= kHiraganaStart) {
132           return kScriptHiragana;
133         } else {
134           if (InRange(codepoint, kHangulJamoStart, kHangulJamoEnd)) {
135             return kScriptHangulJamo;
136           }
137         }
138       }
139       return kScriptOtherUtf8ThreeBytes;
140     }
141 
142     case 4:
143       return kScriptOtherUtf8FourBytes;
144 
145     default:
146       return kScriptError;
147   }
148 }
149 
150 // Returns Script for the UTF8 character that starts at address p.  Similar to
151 // the previous version of GetScript, except for "char" vs "unsigned char".
152 // Most code works with "char *" pointers, ignoring the fact that char is
153 // unsigned (by default) on most platforms, but signed on iOS.  This code takes
154 // care of making sure we always treat chars as unsigned.
GetScript(const char * p,int num_bytes)155 inline Script GetScript(const char *p, int num_bytes) {
156   return GetScript(reinterpret_cast<const unsigned char *>(p),
157                    num_bytes);
158 }
159 
160 class TinyScriptDetector : public ScriptDetector {
161  public:
162   ~TinyScriptDetector() override = default;
163 
GetScript(const char * s,int num_bytes)164   int GetScript(const char *s, int num_bytes) const override {
165     // Add the namespace in indicate that we want to call the method outside
166     // this class, instead of performing an infinite recursive call.
167     return libtextclassifier3::mobile::lang_id::GetScript(s, num_bytes);
168   }
169 
GetMaxScript()170   int GetMaxScript() const override {
171     return kNumRelevantScripts - 1;
172   }
173 
174   SAFTM_DEFINE_REGISTRATION_METHOD("tiny-script-detector", TinyScriptDetector);
175 };
176 
177 }  // namespace lang_id
178 }  // namespace mobile
179 }  // namespace libtextclassifier3
180 
181 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_TINY_SCRIPT_DETECTOR_H_
182