xref: /aosp_15_r20/external/libtextclassifier/native/lang_id/script/approx-script-data.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_DATA_H_
18*993b0882SAndroid Build Coastguard Worker #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_DATA_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_base/integral-types.h"
21*993b0882SAndroid Build Coastguard Worker 
22*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
23*993b0882SAndroid Build Coastguard Worker namespace mobile {
24*993b0882SAndroid Build Coastguard Worker namespace approx_script_internal {
25*993b0882SAndroid Build Coastguard Worker 
26*993b0882SAndroid Build Coastguard Worker // Number of contiguous ranges of same-script codepoints (see below).
27*993b0882SAndroid Build Coastguard Worker extern const int kNumRanges;
28*993b0882SAndroid Build Coastguard Worker 
29*993b0882SAndroid Build Coastguard Worker // Non-overlapping ranges of Unicode codepoints.  All codepoints in a range have
30*993b0882SAndroid Build Coastguard Worker // the same script (see kRangeScript below).  Multiple ranges may have the same
31*993b0882SAndroid Build Coastguard Worker // script.  Note: we represent the kNumRanges ranges as an array with their
32*993b0882SAndroid Build Coastguard Worker // first codepoints, and a separate array with their sizes (see
33*993b0882SAndroid Build Coastguard Worker // kRangeSizeMinusOne below).  This leads to better memory locality during the
34*993b0882SAndroid Build Coastguard Worker // binary search (which uses only the first codepoints, up until the very end).
35*993b0882SAndroid Build Coastguard Worker //
36*993b0882SAndroid Build Coastguard Worker // kRangeFirst[i] = first codepoint from range #i, \forall 0 <= i < kNumRanges.
37*993b0882SAndroid Build Coastguard Worker extern const uint32 kRangeFirst[];
38*993b0882SAndroid Build Coastguard Worker 
39*993b0882SAndroid Build Coastguard Worker // kRangeSizeMinusOne[i] >= 0 is the number of consecutive codepoints in range
40*993b0882SAndroid Build Coastguard Worker // #i *minus* 1, \forall 0 <= i < kNumRanges.  I.e., 0 means that the range
41*993b0882SAndroid Build Coastguard Worker // contains 1 codepoint.  Since we don't have empty ranges, this "minus one"
42*993b0882SAndroid Build Coastguard Worker // convention allows us to use all 2^16 values here.
43*993b0882SAndroid Build Coastguard Worker extern const uint16 kRangeSizeMinusOne[];
44*993b0882SAndroid Build Coastguard Worker 
45*993b0882SAndroid Build Coastguard Worker // Scripts for the ranges described by kRangeFirst and kRangeSizeMinusOne.  For
46*993b0882SAndroid Build Coastguard Worker // each i such that 0 <= i < kNumRanges, the range #i has the script
47*993b0882SAndroid Build Coastguard Worker // kRangeScript[i].  Each uint8 element can be cast to a UScriptCode enum value
48*993b0882SAndroid Build Coastguard Worker // (see unicode/uscript.h).
49*993b0882SAndroid Build Coastguard Worker //
50*993b0882SAndroid Build Coastguard Worker // NOTE: we don't use UScriptCode directly here, as that requires a full int
51*993b0882SAndroid Build Coastguard Worker // (due to USCRIPT_INVALID_CODE = -1).  uint8 is enough for us (and shorter!)
52*993b0882SAndroid Build Coastguard Worker extern const uint8 kRangeScript[];
53*993b0882SAndroid Build Coastguard Worker 
54*993b0882SAndroid Build Coastguard Worker // Max value from kRangeScript[].  Scripts are guaranteed to be in the interval
55*993b0882SAndroid Build Coastguard Worker // [0, kMaxScript] (inclusive on both sides).  Can be used, e.g., to set the
56*993b0882SAndroid Build Coastguard Worker // number of rows in an embedding table for a script-based feature.
57*993b0882SAndroid Build Coastguard Worker extern const uint8 kMaxScript;
58*993b0882SAndroid Build Coastguard Worker 
59*993b0882SAndroid Build Coastguard Worker }  // namespace approx_script_internal
60*993b0882SAndroid Build Coastguard Worker }  // namespace mobile
61*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
62*993b0882SAndroid Build Coastguard Worker 
63*993b0882SAndroid Build Coastguard Worker #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_SCRIPT_APPROX_SCRIPT_DATA_H_
64