xref: /aosp_15_r20/external/icing/icing/transform/icu/icu-normalizer.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/transform/icu/icu-normalizer.h"
16 
17 #include <cctype>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/absl_ports/canonical_errors.h"
25 #include "icing/absl_ports/str_cat.h"
26 #include "icing/transform/normalizer.h"
27 #include "icing/util/character-iterator.h"
28 #include "icing/util/i18n-utils.h"
29 #include "icing/util/logging.h"
30 #include "icing/util/status-macros.h"
31 #include "unicode/umachine.h"
32 #include "unicode/unorm2.h"
33 #include "unicode/ustring.h"
34 #include "unicode/utrans.h"
35 
36 namespace icing {
37 namespace lib {
38 
39 namespace {
40 
41 // The following is the compound id used to tell UTransliterator how to
42 // transform terms. The basic normalization forms NFD (canonical normalization
43 // form decomposition) and NFKC (compatible normalization form composition)
44 // are applied as well as some other rules we need. More information at
45 // http://www.unicode.org/reports/tr15/
46 //
47 // Please note that the following rules don't support small hiragana to katakana
48 // transformation.
49 constexpr UChar kTransformRulesUtf16[] =
50     u"Lower; "                      // Lowercase
51     "Latin-ASCII; "                 // Map Latin characters to ASCII characters
52     "Hiragana-Katakana; "           // Map hiragana to katakana
53     "[:Latin:] NFD; "               // Decompose Latin letters
54     "[:Greek:] NFD; "               // Decompose Greek letters
55     "[:Nonspacing Mark:] Remove; "  // Remove accent / diacritic marks
56     "NFKC";                         // Decompose and compose everything
57 
58 // Length of the transform rules excluding the terminating NULL.
59 constexpr int kTransformRulesLength =
60     sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
61 
62 // Transforms a Unicode character with diacritics to its counterpart in ASCII
63 // range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
64 // the transformation is successful.
65 //
66 // NOTE: According to our convention this function should have returned
67 // StatusOr<char>. However, this function is performance-sensitive because is
68 // could be called on every Latin character in normalization, so we make it
69 // return a bool here to save a bit more time and memory.
DiacriticCharToAscii(const UNormalizer2 * normalizer2,UChar32 uchar32_in,char * char_out)70 bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
71                           char* char_out) {
72   if (i18n_utils::IsAscii(uchar32_in)) {
73     // The Unicode character is within ASCII range
74     if (char_out != nullptr) {
75       *char_out = uchar32_in;
76     }
77     return true;
78   }
79 
80   // Maximum number of pieces a Unicode character can be decomposed into.
81   // TODO(tjbarron) figure out if this number is proper.
82   constexpr int kDecompositionBufferCapacity = 5;
83 
84   // A buffer used to store Unicode decomposition mappings of only one
85   // character.
86   UChar decomposition_buffer[kDecompositionBufferCapacity];
87 
88   // Decomposes the Unicode character, trying to get an ASCII char and some
89   // diacritic chars.
90   UErrorCode status = U_ZERO_ERROR;
91   if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
92                               kDecompositionBufferCapacity, &status) > 0 &&
93       !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) {
94     if (char_out != nullptr) {
95       *char_out = decomposition_buffer[0];
96     }
97     return true;
98   }
99   return false;
100 }
101 
102 }  // namespace
103 
104 // Creates a IcuNormalizer with a valid TermTransformer instance.
105 //
106 // Note: UTokenizer2 is also an option to normalize Unicode strings, but since
107 // we need some custom transform rules other than NFC/NFKC we have to use
108 // TermTransformer as a custom transform rule executor.
109 libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>>
Create(int max_term_byte_size)110 IcuNormalizer::Create(int max_term_byte_size) {
111   ICING_ASSIGN_OR_RETURN(
112       std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
113       IcuNormalizer::TermTransformer::Create());
114 
115   return std::unique_ptr<IcuNormalizer>(
116       new IcuNormalizer(std::move(term_transformer), max_term_byte_size));
117 }
118 
IcuNormalizer(std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,int max_term_byte_size)119 IcuNormalizer::IcuNormalizer(
120     std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
121     int max_term_byte_size)
122     : term_transformer_(std::move(term_transformer)),
123       max_term_byte_size_(max_term_byte_size) {}
124 
NormalizeTerm(const std::string_view term) const125 Normalizer::NormalizedTerm IcuNormalizer::NormalizeTerm(
126     const std::string_view term) const {
127   NormalizedTerm normalized_text = {""};
128 
129   if (term.empty()) {
130     return normalized_text;
131   }
132 
133   UErrorCode status = U_ZERO_ERROR;
134   // ICU manages the singleton instance
135   const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
136   if (U_FAILURE(status)) {
137     ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
138   }
139 
140   // Normalize the prefix that can be transformed into ASCII.
141   // This is a faster method to normalize Latin terms.
142   NormalizeLatinResult result = NormalizeLatin(normalizer2, term);
143   normalized_text.text = std::move(result.text);
144   if (result.end_pos < term.length()) {
145     // Some portion of term couldn't be normalized via NormalizeLatin. Use
146     // term_transformer to handle this portion.
147     std::string_view rest_term = term.substr(result.end_pos);
148     TermTransformer::TransformResult transform_result =
149         term_transformer_->Transform(rest_term);
150     absl_ports::StrAppend(&normalized_text.text,
151                           transform_result.transformed_term);
152   }
153 
154   if (normalized_text.text.length() > max_term_byte_size_) {
155     i18n_utils::SafeTruncateUtf8(&normalized_text.text, max_term_byte_size_);
156   }
157 
158   return normalized_text;
159 }
160 
NormalizeLatin(const UNormalizer2 * normalizer2,const std::string_view term) const161 IcuNormalizer::NormalizeLatinResult IcuNormalizer::NormalizeLatin(
162     const UNormalizer2* normalizer2, const std::string_view term) const {
163   NormalizeLatinResult result = {};
164   if (normalizer2 == nullptr) {
165     return result;
166   }
167   CharacterIterator char_itr(term);
168   result.text.reserve(term.length());
169   char ascii_char;
170   while (char_itr.utf8_index() < term.length()) {
171     UChar32 c = char_itr.GetCurrentChar();
172     if (i18n_utils::IsAscii(c)) {
173       result.text.push_back(std::tolower(c));
174     } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
175       result.text.push_back(std::tolower(ascii_char));
176     } else {
177       // We don't know how to transform / decompose this Unicode character, it
178       // probably means that some other Unicode characters are mixed with Latin
179       // characters. We return the partial result here and let the caller handle
180       // the rest.
181       result.end_pos = char_itr.utf8_index();
182       return result;
183     }
184     char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
185   }
186   result.end_pos = term.length();
187   return result;
188 }
189 
190 libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer::TermTransformer>>
Create()191 IcuNormalizer::TermTransformer::Create() {
192   UErrorCode status = U_ZERO_ERROR;
193   UTransliterator* term_transformer = utrans_openU(
194       kTransformRulesUtf16, kTransformRulesLength, UTRANS_FORWARD,
195       /*rules=*/nullptr, /*rulesLength=*/0, /*parseError=*/nullptr, &status);
196 
197   if (U_FAILURE(status)) {
198     return absl_ports::InternalError("Failed to create UTransliterator.");
199   }
200 
201   return std::unique_ptr<IcuNormalizer::TermTransformer>(
202       new IcuNormalizer::TermTransformer(term_transformer));
203 }
204 
TermTransformer(UTransliterator * u_transliterator)205 IcuNormalizer::TermTransformer::TermTransformer(
206     UTransliterator* u_transliterator)
207     : u_transliterator_(u_transliterator) {}
208 
~TermTransformer()209 IcuNormalizer::TermTransformer::~TermTransformer() {
210   if (u_transliterator_ != nullptr) {
211     utrans_close(u_transliterator_);
212   }
213 }
214 
215 IcuNormalizer::TermTransformer::TransformResult
Transform(const std::string_view term) const216 IcuNormalizer::TermTransformer::Transform(const std::string_view term) const {
217   auto utf16_term_or = i18n_utils::Utf8ToUtf16(term);
218   if (!utf16_term_or.ok()) {
219     ICING_VLOG(0) << "Failed to convert UTF8 term '" << term << "' to UTF16";
220     return {std::string(term)};
221   }
222   std::u16string utf16_term = std::move(utf16_term_or).ValueOrDie();
223   UErrorCode status = U_ZERO_ERROR;
224   int utf16_term_desired_length = utf16_term.length();
225   int limit = utf16_term.length();
226   utrans_transUChars(u_transliterator_, &utf16_term[0],
227                      &utf16_term_desired_length, utf16_term.length(),
228                      /*start=*/0, &limit, &status);
229 
230   // For most cases, one Unicode character is normalized to exact one Unicode
231   // character according to our transformation rules. However, there could be
232   // some rare cases where the normalized text is longer than the original
233   // one. E.g. "¼" (1 character) -> "1/4" (3 characters). That causes a buffer
234   // overflow error and we need to increase our buffer size and try again.
235   if (status == U_BUFFER_OVERFLOW_ERROR) {
236     // 'utf16_term_desired_length' has already been set to the desired value
237     // by utrans_transUChars(), here we increase the buffer size to that
238     // value.
239     //
240     // NOTE: we need to call resize() but not reserve() because values can't
241     // be set at positions after length().
242     int original_content_length = utf16_term.length();
243     utf16_term.resize(utf16_term_desired_length);
244     utf16_term_desired_length = original_content_length;
245     limit = original_content_length;
246     status = U_ZERO_ERROR;
247     utrans_transUChars(u_transliterator_, &utf16_term[0],
248                        &utf16_term_desired_length, utf16_term.length(),
249                        /*start=*/0, &limit, &status);
250   }
251 
252   if (U_FAILURE(status)) {
253     // Failed to transform, return its original form.
254     ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
255     return {std::string(term)};
256   }
257 
258   auto utf8_term_or = i18n_utils::Utf16ToUtf8(utf16_term);
259   if (!utf8_term_or.ok()) {
260     ICING_VLOG(0) << "Failed to convert UTF16 term '" << term << "' to UTF8";
261     return {std::string(term)};
262   }
263   std::string utf8_term = std::move(utf8_term_or).ValueOrDie();
264   return {std::move(utf8_term)};
265 }
266 
FindNormalizedLatinMatchEndPosition(const UNormalizer2 * normalizer2,std::string_view term,CharacterIterator & char_itr,std::string_view normalized_term,CharacterIterator & normalized_char_itr) const267 bool IcuNormalizer::FindNormalizedLatinMatchEndPosition(
268     const UNormalizer2* normalizer2, std::string_view term,
269     CharacterIterator& char_itr, std::string_view normalized_term,
270     CharacterIterator& normalized_char_itr) const {
271   if (normalizer2 == nullptr) {
272     return false;
273   }
274   char ascii_char;
275   while (char_itr.utf8_index() < term.length() &&
276          normalized_char_itr.utf8_index() < normalized_term.length()) {
277     UChar32 c = char_itr.GetCurrentChar();
278     if (i18n_utils::IsAscii(c)) {
279       c = std::tolower(c);
280     } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
281       c = std::tolower(ascii_char);
282     } else {
283       return false;
284     }
285     UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
286     if (c != normalized_c) {
287       return true;
288     }
289     char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
290     normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
291   }
292   return true;
293 }
294 
295 CharacterIterator
FindNormalizedNonLatinMatchEndPosition(std::string_view term,CharacterIterator char_itr,std::string_view normalized_term) const296 IcuNormalizer::TermTransformer::FindNormalizedNonLatinMatchEndPosition(
297     std::string_view term, CharacterIterator char_itr,
298     std::string_view normalized_term) const {
299   CharacterIterator normalized_char_itr(normalized_term);
300   UErrorCode status = U_ZERO_ERROR;
301 
302   constexpr int kUtf16CharBufferLength = 6;
303   UChar c16[kUtf16CharBufferLength];
304   int32_t c16_length;
305   int32_t limit;
306 
307   constexpr int kCharBufferLength = 3 * 4;
308   char normalized_buffer[kCharBufferLength];
309   int32_t c8_length;
310   while (char_itr.utf8_index() < term.length() &&
311          normalized_char_itr.utf8_index() < normalized_term.length()) {
312     UChar32 c = char_itr.GetCurrentChar();
313     int c_lenth = i18n_utils::GetUtf8Length(c);
314     u_strFromUTF8(c16, kUtf16CharBufferLength, &c16_length,
315                   term.data() + char_itr.utf8_index(),
316                   /*srcLength=*/c_lenth, &status);
317     if (U_FAILURE(status)) {
318       break;
319     }
320 
321     limit = c16_length;
322     utrans_transUChars(u_transliterator_, c16, &c16_length,
323                        kUtf16CharBufferLength,
324                        /*start=*/0, &limit, &status);
325     if (U_FAILURE(status)) {
326       break;
327     }
328 
329     u_strToUTF8(normalized_buffer, kCharBufferLength, &c8_length, c16,
330                 c16_length, &status);
331     if (U_FAILURE(status)) {
332       break;
333     }
334 
335     for (int i = 0; i < c8_length; ++i) {
336       if (normalized_buffer[i] !=
337           normalized_term[normalized_char_itr.utf8_index() + i]) {
338         return char_itr;
339       }
340     }
341     normalized_char_itr.AdvanceToUtf8(normalized_char_itr.utf8_index() +
342                                       c8_length);
343     char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
344   }
345   if (U_FAILURE(status)) {
346     // Failed to transform, return its original form.
347     ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
348   }
349   return char_itr;
350 }
351 
FindNormalizedMatchEndPosition(std::string_view term,std::string_view normalized_term) const352 CharacterIterator IcuNormalizer::FindNormalizedMatchEndPosition(
353     std::string_view term, std::string_view normalized_term) const {
354   UErrorCode status = U_ZERO_ERROR;
355   // ICU manages the singleton instance
356   const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
357   if (U_FAILURE(status)) {
358     ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
359   }
360 
361   CharacterIterator char_itr(term);
362   CharacterIterator normalized_char_itr(normalized_term);
363   if (FindNormalizedLatinMatchEndPosition(
364           normalizer2, term, char_itr, normalized_term, normalized_char_itr)) {
365     return char_itr;
366   }
367   // Some portion of term couldn't be normalized via
368   // FindNormalizedLatinMatchEndPosition. Use term_transformer to handle this
369   // portion.
370   std::string_view rest_normalized_term =
371       normalized_term.substr(normalized_char_itr.utf8_index());
372   return term_transformer_->FindNormalizedNonLatinMatchEndPosition(
373       term, char_itr, rest_normalized_term);
374 }
375 
376 }  // namespace lib
377 }  // namespace icing
378