xref: /aosp_15_r20/external/libtextclassifier/native/lang_id/custom-tokenizer.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "lang_id/custom-tokenizer.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <ctype.h>
20*993b0882SAndroid Build Coastguard Worker 
21*993b0882SAndroid Build Coastguard Worker #include <string>
22*993b0882SAndroid Build Coastguard Worker 
23*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_base/attributes.h"
24*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/lite_base/logging.h"
25*993b0882SAndroid Build Coastguard Worker #include "lang_id/common/utf8.h"
26*993b0882SAndroid Build Coastguard Worker #include "utf.h"
27*993b0882SAndroid Build Coastguard Worker 
28*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
29*993b0882SAndroid Build Coastguard Worker namespace mobile {
30*993b0882SAndroid Build Coastguard Worker namespace lang_id {
31*993b0882SAndroid Build Coastguard Worker 
32*993b0882SAndroid Build Coastguard Worker namespace {
// Returns true if the UTF8 character that starts at |curr| and has |num_bytes|
// bytes should be treated as a token separator.
//
// Only single-byte characters can be separators: any character with
// num_bytes != 1 is always considered part of a token.  A single-byte
// character is a separator iff isalpha() rejects it (digits, whitespace,
// punctuation, etc.).
inline bool IsTokenSeparator(int num_bytes, const char *curr) {
  if (num_bytes != 1) {
    return false;
  }

  // The <ctype.h> functions require an argument representable as unsigned char
  // (or EOF).  Plain char may be signed, so a byte with the high bit set would
  // be passed as a negative value, which is undefined behavior.  Convert
  // explicitly before classifying.
  return !isalpha(static_cast<unsigned char>(*curr));
}
39*993b0882SAndroid Build Coastguard Worker 
40*993b0882SAndroid Build Coastguard Worker // Appends to *word the UTF8 encoding for the lowercase version of the UTF8
41*993b0882SAndroid Build Coastguard Worker // character that starts at |curr| and has |num_bytes| bytes.
42*993b0882SAndroid Build Coastguard Worker //
43*993b0882SAndroid Build Coastguard Worker // NOTE: if the current UTF8 character does not have a lowercase version, then
44*993b0882SAndroid Build Coastguard Worker // we append the original UTF8 character.
AppendLowerCase(const char * curr,int num_bytes,std::string * word)45*993b0882SAndroid Build Coastguard Worker inline SAFTM_ATTRIBUTE_ALWAYS_INLINE void AppendLowerCase(const char *curr,
46*993b0882SAndroid Build Coastguard Worker                                                           int num_bytes,
47*993b0882SAndroid Build Coastguard Worker                                                           std::string *word) {
48*993b0882SAndroid Build Coastguard Worker   if (num_bytes == 1) {
49*993b0882SAndroid Build Coastguard Worker     // Optimize the ASCII case.
50*993b0882SAndroid Build Coastguard Worker     word->push_back(tolower(*curr));
51*993b0882SAndroid Build Coastguard Worker     return;
52*993b0882SAndroid Build Coastguard Worker   }
53*993b0882SAndroid Build Coastguard Worker 
54*993b0882SAndroid Build Coastguard Worker   // Harder, general case.
55*993b0882SAndroid Build Coastguard Worker   //
56*993b0882SAndroid Build Coastguard Worker   // NOTE: for lowercasing, we use the utils from utf.h:
57*993b0882SAndroid Build Coastguard Worker   // charntorune + tolowerrune + runetochar.  Unfortunately, that library does
58*993b0882SAndroid Build Coastguard Worker   // not contain any fast util for determining the number of bytes for the UTF8
59*993b0882SAndroid Build Coastguard Worker   // character that starts at a given address *without* converting to a full
60*993b0882SAndroid Build Coastguard Worker   // codepoint (like our utils::OneCharLen, which is used intensively by the
61*993b0882SAndroid Build Coastguard Worker   // rest of our code, including by the performance-critical char ngram
62*993b0882SAndroid Build Coastguard Worker   // feature).  Hence, the rest of our code continues to use utils::OneCharLen,
63*993b0882SAndroid Build Coastguard Worker   // and here, when we append the bytes to *word, we make sure that's consistent
64*993b0882SAndroid Build Coastguard Worker   // with utils::OneCharLen.
65*993b0882SAndroid Build Coastguard Worker 
66*993b0882SAndroid Build Coastguard Worker   // charntorune() below reads the UTF8 character that starts at curr (using at
67*993b0882SAndroid Build Coastguard Worker   // most num_bytes bytes) and stores the corresponding codepoint into rune.
68*993b0882SAndroid Build Coastguard Worker   Rune rune;
69*993b0882SAndroid Build Coastguard Worker   charntorune(&rune, curr, num_bytes);
70*993b0882SAndroid Build Coastguard Worker   if (rune != Runeerror) {
71*993b0882SAndroid Build Coastguard Worker     Rune lower = tolowerrune(rune);
72*993b0882SAndroid Build Coastguard Worker     char lower_buf[UTFmax];
73*993b0882SAndroid Build Coastguard Worker     runetochar(lower_buf, &lower);
74*993b0882SAndroid Build Coastguard Worker 
75*993b0882SAndroid Build Coastguard Worker     // When appending the UTF8 bytes to word, we do not use the number of bytes
76*993b0882SAndroid Build Coastguard Worker     // returned by runetochar(); instead, we use utils::OneCharLen(), the same
77*993b0882SAndroid Build Coastguard Worker     // method used by the char ngram feature.  We expect them to be equal, but
78*993b0882SAndroid Build Coastguard Worker     // just in case.
79*993b0882SAndroid Build Coastguard Worker     int lower_num_bytes = utils::OneCharLen(lower_buf);
80*993b0882SAndroid Build Coastguard Worker 
81*993b0882SAndroid Build Coastguard Worker     // Using lower_num_bytes below is safe, because, by definition of UTFmax,
82*993b0882SAndroid Build Coastguard Worker     SAFTM_DCHECK_GE(UTFmax, 4);
83*993b0882SAndroid Build Coastguard Worker 
84*993b0882SAndroid Build Coastguard Worker     // And, by implementation of utils::OneCharLen():
85*993b0882SAndroid Build Coastguard Worker     SAFTM_DCHECK_GT(lower_num_bytes, 0);
86*993b0882SAndroid Build Coastguard Worker     SAFTM_DCHECK_LE(lower_num_bytes, 4);
87*993b0882SAndroid Build Coastguard Worker     word->append(lower_buf, lower_num_bytes);
88*993b0882SAndroid Build Coastguard Worker   } else {
89*993b0882SAndroid Build Coastguard Worker     // There are sequences of bytes that charntorune() can't convert into a
90*993b0882SAndroid Build Coastguard Worker     // valid Rune (a special case is [0xEF, 0xBF, 0xBD], the UTF8 encoding for
91*993b0882SAndroid Build Coastguard Worker     // the U+FFFD special Unicode character, which is also the value of
92*993b0882SAndroid Build Coastguard Worker     // Runeerror).  We keep those bytes unchanged.
93*993b0882SAndroid Build Coastguard Worker     word->append(curr, num_bytes);
94*993b0882SAndroid Build Coastguard Worker   }
95*993b0882SAndroid Build Coastguard Worker }
96*993b0882SAndroid Build Coastguard Worker }  // namespace
97*993b0882SAndroid Build Coastguard Worker 
Setup(TaskContext * context)98*993b0882SAndroid Build Coastguard Worker void TokenizerForLangId::Setup(TaskContext *context) {
99*993b0882SAndroid Build Coastguard Worker   lowercase_input_ = context->Get("lang_id_lowercase_input", false);
100*993b0882SAndroid Build Coastguard Worker }
101*993b0882SAndroid Build Coastguard Worker 
Tokenize(StringPiece text,LightSentence * sentence) const102*993b0882SAndroid Build Coastguard Worker void TokenizerForLangId::Tokenize(StringPiece text,
103*993b0882SAndroid Build Coastguard Worker                                   LightSentence *sentence) const {
104*993b0882SAndroid Build Coastguard Worker   const char *const start = text.data();
105*993b0882SAndroid Build Coastguard Worker   const char *curr = start;
106*993b0882SAndroid Build Coastguard Worker   const char *end = utils::GetSafeEndOfUtf8String(start, text.size());
107*993b0882SAndroid Build Coastguard Worker 
108*993b0882SAndroid Build Coastguard Worker   // Corner case: the safe part of the text is empty ("").
109*993b0882SAndroid Build Coastguard Worker   if (curr >= end) {
110*993b0882SAndroid Build Coastguard Worker     return;
111*993b0882SAndroid Build Coastguard Worker   }
112*993b0882SAndroid Build Coastguard Worker 
113*993b0882SAndroid Build Coastguard Worker   // Number of bytes for UTF8 character starting at *curr.  Note: the loop below
114*993b0882SAndroid Build Coastguard Worker   // is guaranteed to terminate because in each iteration, we move curr by at
115*993b0882SAndroid Build Coastguard Worker   // least num_bytes, and num_bytes is guaranteed to be > 0.
116*993b0882SAndroid Build Coastguard Worker   int num_bytes = utils::OneCharLen(curr);
117*993b0882SAndroid Build Coastguard Worker   while (curr < end) {
118*993b0882SAndroid Build Coastguard Worker     // Jump over consecutive token separators.
119*993b0882SAndroid Build Coastguard Worker     while (IsTokenSeparator(num_bytes, curr)) {
120*993b0882SAndroid Build Coastguard Worker       curr += num_bytes;
121*993b0882SAndroid Build Coastguard Worker       if (curr >= end) {
122*993b0882SAndroid Build Coastguard Worker         return;
123*993b0882SAndroid Build Coastguard Worker       }
124*993b0882SAndroid Build Coastguard Worker       num_bytes = utils::OneCharLen(curr);
125*993b0882SAndroid Build Coastguard Worker     }
126*993b0882SAndroid Build Coastguard Worker 
127*993b0882SAndroid Build Coastguard Worker     // If control reaches this point, we are at beginning of a non-empty token.
128*993b0882SAndroid Build Coastguard Worker     sentence->emplace_back();
129*993b0882SAndroid Build Coastguard Worker     std::string *word = &(sentence->back());
130*993b0882SAndroid Build Coastguard Worker 
131*993b0882SAndroid Build Coastguard Worker     // Add special token-start character.
132*993b0882SAndroid Build Coastguard Worker     word->push_back('^');
133*993b0882SAndroid Build Coastguard Worker 
134*993b0882SAndroid Build Coastguard Worker     // Add UTF8 characters to word, until we hit the end of the safe text or a
135*993b0882SAndroid Build Coastguard Worker     // token separator.
136*993b0882SAndroid Build Coastguard Worker     while (true) {
137*993b0882SAndroid Build Coastguard Worker       if (lowercase_input_) {
138*993b0882SAndroid Build Coastguard Worker         AppendLowerCase(curr, num_bytes, word);
139*993b0882SAndroid Build Coastguard Worker       } else {
140*993b0882SAndroid Build Coastguard Worker         word->append(curr, num_bytes);
141*993b0882SAndroid Build Coastguard Worker       }
142*993b0882SAndroid Build Coastguard Worker       curr += num_bytes;
143*993b0882SAndroid Build Coastguard Worker       if (curr >= end) {
144*993b0882SAndroid Build Coastguard Worker         break;
145*993b0882SAndroid Build Coastguard Worker       }
146*993b0882SAndroid Build Coastguard Worker       num_bytes = utils::OneCharLen(curr);
147*993b0882SAndroid Build Coastguard Worker       if (IsTokenSeparator(num_bytes, curr)) {
148*993b0882SAndroid Build Coastguard Worker         curr += num_bytes;
149*993b0882SAndroid Build Coastguard Worker         if (curr >= end) {
150*993b0882SAndroid Build Coastguard Worker           break;
151*993b0882SAndroid Build Coastguard Worker         }
152*993b0882SAndroid Build Coastguard Worker         num_bytes = utils::OneCharLen(curr);
153*993b0882SAndroid Build Coastguard Worker         break;
154*993b0882SAndroid Build Coastguard Worker       }
155*993b0882SAndroid Build Coastguard Worker     }
156*993b0882SAndroid Build Coastguard Worker     word->push_back('$');
157*993b0882SAndroid Build Coastguard Worker   }
158*993b0882SAndroid Build Coastguard Worker }
159*993b0882SAndroid Build Coastguard Worker 
160*993b0882SAndroid Build Coastguard Worker }  // namespace lang_id
161*993b0882SAndroid Build Coastguard Worker }  // namespace mobile
162*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
163