xref: /aosp_15_r20/external/libtextclassifier/native/lang_id/script/approx-script.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "lang_id/script/approx-script.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_base/integral-types.h"
20*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_base/logging.h"
21*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/utf8.h"
22*993b0882SAndroid Build Coastguard Worker #include "lang_id/script/approx-script-data.h"
23*993b0882SAndroid Build Coastguard Worker 
24*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
25*993b0882SAndroid Build Coastguard Worker namespace mobile {
26*993b0882SAndroid Build Coastguard Worker 
27*993b0882SAndroid Build Coastguard Worker // int value of USCRIPT_UNKNOWN from enum UScriptCode (from
28*993b0882SAndroid Build Coastguard Worker // unicode/uscript.h).  Note: we do have a test that
29*993b0882SAndroid Build Coastguard Worker // USCRIPT_UNKNOWN evaluates to 103.
30*993b0882SAndroid Build Coastguard Worker const int kUnknownUscript = 103;
31*993b0882SAndroid Build Coastguard Worker 
32*993b0882SAndroid Build Coastguard Worker namespace {
33*993b0882SAndroid Build Coastguard Worker using approx_script_internal::kNumRanges;
34*993b0882SAndroid Build Coastguard Worker using approx_script_internal::kRangeFirst;
35*993b0882SAndroid Build Coastguard Worker using approx_script_internal::kRangeScript;
36*993b0882SAndroid Build Coastguard Worker using approx_script_internal::kRangeSizeMinusOne;
37*993b0882SAndroid Build Coastguard Worker 
Utf8ToCodepoint(const unsigned char * s,int num_bytes)38*993b0882SAndroid Build Coastguard Worker uint32 Utf8ToCodepoint(const unsigned char *s, int num_bytes) {
39*993b0882SAndroid Build Coastguard Worker   switch (num_bytes) {
40*993b0882SAndroid Build Coastguard Worker     case 1:
41*993b0882SAndroid Build Coastguard Worker       return s[0];
42*993b0882SAndroid Build Coastguard Worker     case 2:
43*993b0882SAndroid Build Coastguard Worker       return ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
44*993b0882SAndroid Build Coastguard Worker     case 3:
45*993b0882SAndroid Build Coastguard Worker       return (((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F));
46*993b0882SAndroid Build Coastguard Worker     case 4:
47*993b0882SAndroid Build Coastguard Worker       return (((s[0] & 0x07) << 18) | ((s[1] & 0x3F) << 12) |
48*993b0882SAndroid Build Coastguard Worker               ((s[2] & 0x3F) << 6) | (s[3] & 0x3F));
49*993b0882SAndroid Build Coastguard Worker     default:
50*993b0882SAndroid Build Coastguard Worker       SAFTM_DLOG(FATAL) << "Illegal num_bytes: " << num_bytes;
51*993b0882SAndroid Build Coastguard Worker       return 0;
52*993b0882SAndroid Build Coastguard Worker   }
53*993b0882SAndroid Build Coastguard Worker }
54*993b0882SAndroid Build Coastguard Worker 
BinarySearch(uint32 codepoint,int start,int end)55*993b0882SAndroid Build Coastguard Worker inline int BinarySearch(uint32 codepoint, int start, int end) {
56*993b0882SAndroid Build Coastguard Worker   while (end > start + 1) {
57*993b0882SAndroid Build Coastguard Worker     // Due to the while loop condition, middle > start and middle < end.  Hence,
58*993b0882SAndroid Build Coastguard Worker     // on both branches of the if below, we strictly reduce the end - start
59*993b0882SAndroid Build Coastguard Worker     // value, so we eventually get that difference below 1 and complete the
60*993b0882SAndroid Build Coastguard Worker     // while loop.
61*993b0882SAndroid Build Coastguard Worker     int middle = (start + end) / 2;
62*993b0882SAndroid Build Coastguard Worker     if (codepoint < kRangeFirst[middle]) {
63*993b0882SAndroid Build Coastguard Worker       end = middle;
64*993b0882SAndroid Build Coastguard Worker     } else {
65*993b0882SAndroid Build Coastguard Worker       start = middle;
66*993b0882SAndroid Build Coastguard Worker     }
67*993b0882SAndroid Build Coastguard Worker   }
68*993b0882SAndroid Build Coastguard Worker 
69*993b0882SAndroid Build Coastguard Worker   if (end == start + 1) {
70*993b0882SAndroid Build Coastguard Worker     const uint32 range_start = kRangeFirst[start];
71*993b0882SAndroid Build Coastguard Worker     if ((codepoint >= range_start) &&
72*993b0882SAndroid Build Coastguard Worker         (codepoint <= range_start + kRangeSizeMinusOne[start])) {
73*993b0882SAndroid Build Coastguard Worker       return kRangeScript[start];
74*993b0882SAndroid Build Coastguard Worker     }
75*993b0882SAndroid Build Coastguard Worker   }
76*993b0882SAndroid Build Coastguard Worker 
77*993b0882SAndroid Build Coastguard Worker   return kUnknownUscript;
78*993b0882SAndroid Build Coastguard Worker }
79*993b0882SAndroid Build Coastguard Worker }  // namespace
80*993b0882SAndroid Build Coastguard Worker 
GetApproxScript(const unsigned char * s,int num_bytes)81*993b0882SAndroid Build Coastguard Worker int GetApproxScript(const unsigned char *s, int num_bytes) {
82*993b0882SAndroid Build Coastguard Worker   SAFTM_DCHECK_NE(s, nullptr);
83*993b0882SAndroid Build Coastguard Worker   SAFTM_DCHECK_EQ(num_bytes,
84*993b0882SAndroid Build Coastguard Worker                   utils::OneCharLen(reinterpret_cast<const char *>(s)));
85*993b0882SAndroid Build Coastguard Worker   uint32 codepoint = Utf8ToCodepoint(s, num_bytes);
86*993b0882SAndroid Build Coastguard Worker   return BinarySearch(codepoint, 0, kNumRanges);
87*993b0882SAndroid Build Coastguard Worker }
88*993b0882SAndroid Build Coastguard Worker 
GetMaxApproxScriptResult()89*993b0882SAndroid Build Coastguard Worker int GetMaxApproxScriptResult() { return approx_script_internal::kMaxScript; }
90*993b0882SAndroid Build Coastguard Worker 
// NOTE(review): SAFTM_STATIC_REGISTRATION and ApproxScriptDetector are both
// defined outside this file.  Presumably this makes ApproxScriptDetector
// available through a static registry -- confirm against the macro definition.
91*993b0882SAndroid Build Coastguard Worker SAFTM_STATIC_REGISTRATION(ApproxScriptDetector);
92*993b0882SAndroid Build Coastguard Worker 
93*993b0882SAndroid Build Coastguard Worker }  // namespace mobile
94*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
95