1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/transform/icu/icu-normalizer.h"
16
17 #include <cctype>
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/absl_ports/canonical_errors.h"
25 #include "icing/absl_ports/str_cat.h"
26 #include "icing/transform/normalizer.h"
27 #include "icing/util/character-iterator.h"
28 #include "icing/util/i18n-utils.h"
29 #include "icing/util/logging.h"
30 #include "icing/util/status-macros.h"
31 #include "unicode/umachine.h"
32 #include "unicode/unorm2.h"
33 #include "unicode/ustring.h"
34 #include "unicode/utrans.h"
35
36 namespace icing {
37 namespace lib {
38
39 namespace {
40
41 // The following is the compound id used to tell UTransliterator how to
42 // transform terms. The basic normalization forms NFD (canonical normalization
43 // form decomposition) and NFKC (compatible normalization form composition)
44 // are applied as well as some other rules we need. More information at
45 // http://www.unicode.org/reports/tr15/
46 //
47 // Please note that the following rules don't support small hiragana to katakana
48 // transformation.
49 constexpr UChar kTransformRulesUtf16[] =
50 u"Lower; " // Lowercase
51 "Latin-ASCII; " // Map Latin characters to ASCII characters
52 "Hiragana-Katakana; " // Map hiragana to katakana
53 "[:Latin:] NFD; " // Decompose Latin letters
54 "[:Greek:] NFD; " // Decompose Greek letters
55 "[:Nonspacing Mark:] Remove; " // Remove accent / diacritic marks
56 "NFKC"; // Decompose and compose everything
57
58 // Length of the transform rules excluding the terminating NULL.
59 constexpr int kTransformRulesLength =
60 sizeof(kTransformRulesUtf16) / sizeof(kTransformRulesUtf16[0]) - 1;
61
62 // Transforms a Unicode character with diacritics to its counterpart in ASCII
63 // range. E.g. "ü" -> "u". Result will be set to char_out. Returns true if
64 // the transformation is successful.
65 //
66 // NOTE: According to our convention this function should have returned
67 // StatusOr<char>. However, this function is performance-sensitive because is
68 // could be called on every Latin character in normalization, so we make it
69 // return a bool here to save a bit more time and memory.
DiacriticCharToAscii(const UNormalizer2 * normalizer2,UChar32 uchar32_in,char * char_out)70 bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
71 char* char_out) {
72 if (i18n_utils::IsAscii(uchar32_in)) {
73 // The Unicode character is within ASCII range
74 if (char_out != nullptr) {
75 *char_out = uchar32_in;
76 }
77 return true;
78 }
79
80 // Maximum number of pieces a Unicode character can be decomposed into.
81 // TODO(tjbarron) figure out if this number is proper.
82 constexpr int kDecompositionBufferCapacity = 5;
83
84 // A buffer used to store Unicode decomposition mappings of only one
85 // character.
86 UChar decomposition_buffer[kDecompositionBufferCapacity];
87
88 // Decomposes the Unicode character, trying to get an ASCII char and some
89 // diacritic chars.
90 UErrorCode status = U_ZERO_ERROR;
91 if (unorm2_getDecomposition(normalizer2, uchar32_in, &decomposition_buffer[0],
92 kDecompositionBufferCapacity, &status) > 0 &&
93 !U_FAILURE(status) && i18n_utils::IsAscii(decomposition_buffer[0])) {
94 if (char_out != nullptr) {
95 *char_out = decomposition_buffer[0];
96 }
97 return true;
98 }
99 return false;
100 }
101
102 } // namespace
103
104 // Creates a IcuNormalizer with a valid TermTransformer instance.
105 //
106 // Note: UTokenizer2 is also an option to normalize Unicode strings, but since
107 // we need some custom transform rules other than NFC/NFKC we have to use
108 // TermTransformer as a custom transform rule executor.
109 libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>>
Create(int max_term_byte_size)110 IcuNormalizer::Create(int max_term_byte_size) {
111 ICING_ASSIGN_OR_RETURN(
112 std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
113 IcuNormalizer::TermTransformer::Create());
114
115 return std::unique_ptr<IcuNormalizer>(
116 new IcuNormalizer(std::move(term_transformer), max_term_byte_size));
117 }
118
IcuNormalizer(std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,int max_term_byte_size)119 IcuNormalizer::IcuNormalizer(
120 std::unique_ptr<IcuNormalizer::TermTransformer> term_transformer,
121 int max_term_byte_size)
122 : term_transformer_(std::move(term_transformer)),
123 max_term_byte_size_(max_term_byte_size) {}
124
NormalizeTerm(const std::string_view term) const125 Normalizer::NormalizedTerm IcuNormalizer::NormalizeTerm(
126 const std::string_view term) const {
127 NormalizedTerm normalized_text = {""};
128
129 if (term.empty()) {
130 return normalized_text;
131 }
132
133 UErrorCode status = U_ZERO_ERROR;
134 // ICU manages the singleton instance
135 const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
136 if (U_FAILURE(status)) {
137 ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
138 }
139
140 // Normalize the prefix that can be transformed into ASCII.
141 // This is a faster method to normalize Latin terms.
142 NormalizeLatinResult result = NormalizeLatin(normalizer2, term);
143 normalized_text.text = std::move(result.text);
144 if (result.end_pos < term.length()) {
145 // Some portion of term couldn't be normalized via NormalizeLatin. Use
146 // term_transformer to handle this portion.
147 std::string_view rest_term = term.substr(result.end_pos);
148 TermTransformer::TransformResult transform_result =
149 term_transformer_->Transform(rest_term);
150 absl_ports::StrAppend(&normalized_text.text,
151 transform_result.transformed_term);
152 }
153
154 if (normalized_text.text.length() > max_term_byte_size_) {
155 i18n_utils::SafeTruncateUtf8(&normalized_text.text, max_term_byte_size_);
156 }
157
158 return normalized_text;
159 }
160
NormalizeLatin(const UNormalizer2 * normalizer2,const std::string_view term) const161 IcuNormalizer::NormalizeLatinResult IcuNormalizer::NormalizeLatin(
162 const UNormalizer2* normalizer2, const std::string_view term) const {
163 NormalizeLatinResult result = {};
164 if (normalizer2 == nullptr) {
165 return result;
166 }
167 CharacterIterator char_itr(term);
168 result.text.reserve(term.length());
169 char ascii_char;
170 while (char_itr.utf8_index() < term.length()) {
171 UChar32 c = char_itr.GetCurrentChar();
172 if (i18n_utils::IsAscii(c)) {
173 result.text.push_back(std::tolower(c));
174 } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
175 result.text.push_back(std::tolower(ascii_char));
176 } else {
177 // We don't know how to transform / decompose this Unicode character, it
178 // probably means that some other Unicode characters are mixed with Latin
179 // characters. We return the partial result here and let the caller handle
180 // the rest.
181 result.end_pos = char_itr.utf8_index();
182 return result;
183 }
184 char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
185 }
186 result.end_pos = term.length();
187 return result;
188 }
189
190 libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer::TermTransformer>>
Create()191 IcuNormalizer::TermTransformer::Create() {
192 UErrorCode status = U_ZERO_ERROR;
193 UTransliterator* term_transformer = utrans_openU(
194 kTransformRulesUtf16, kTransformRulesLength, UTRANS_FORWARD,
195 /*rules=*/nullptr, /*rulesLength=*/0, /*parseError=*/nullptr, &status);
196
197 if (U_FAILURE(status)) {
198 return absl_ports::InternalError("Failed to create UTransliterator.");
199 }
200
201 return std::unique_ptr<IcuNormalizer::TermTransformer>(
202 new IcuNormalizer::TermTransformer(term_transformer));
203 }
204
TermTransformer(UTransliterator * u_transliterator)205 IcuNormalizer::TermTransformer::TermTransformer(
206 UTransliterator* u_transliterator)
207 : u_transliterator_(u_transliterator) {}
208
~TermTransformer()209 IcuNormalizer::TermTransformer::~TermTransformer() {
210 if (u_transliterator_ != nullptr) {
211 utrans_close(u_transliterator_);
212 }
213 }
214
215 IcuNormalizer::TermTransformer::TransformResult
Transform(const std::string_view term) const216 IcuNormalizer::TermTransformer::Transform(const std::string_view term) const {
217 auto utf16_term_or = i18n_utils::Utf8ToUtf16(term);
218 if (!utf16_term_or.ok()) {
219 ICING_VLOG(0) << "Failed to convert UTF8 term '" << term << "' to UTF16";
220 return {std::string(term)};
221 }
222 std::u16string utf16_term = std::move(utf16_term_or).ValueOrDie();
223 UErrorCode status = U_ZERO_ERROR;
224 int utf16_term_desired_length = utf16_term.length();
225 int limit = utf16_term.length();
226 utrans_transUChars(u_transliterator_, &utf16_term[0],
227 &utf16_term_desired_length, utf16_term.length(),
228 /*start=*/0, &limit, &status);
229
230 // For most cases, one Unicode character is normalized to exact one Unicode
231 // character according to our transformation rules. However, there could be
232 // some rare cases where the normalized text is longer than the original
233 // one. E.g. "¼" (1 character) -> "1/4" (3 characters). That causes a buffer
234 // overflow error and we need to increase our buffer size and try again.
235 if (status == U_BUFFER_OVERFLOW_ERROR) {
236 // 'utf16_term_desired_length' has already been set to the desired value
237 // by utrans_transUChars(), here we increase the buffer size to that
238 // value.
239 //
240 // NOTE: we need to call resize() but not reserve() because values can't
241 // be set at positions after length().
242 int original_content_length = utf16_term.length();
243 utf16_term.resize(utf16_term_desired_length);
244 utf16_term_desired_length = original_content_length;
245 limit = original_content_length;
246 status = U_ZERO_ERROR;
247 utrans_transUChars(u_transliterator_, &utf16_term[0],
248 &utf16_term_desired_length, utf16_term.length(),
249 /*start=*/0, &limit, &status);
250 }
251
252 if (U_FAILURE(status)) {
253 // Failed to transform, return its original form.
254 ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
255 return {std::string(term)};
256 }
257
258 auto utf8_term_or = i18n_utils::Utf16ToUtf8(utf16_term);
259 if (!utf8_term_or.ok()) {
260 ICING_VLOG(0) << "Failed to convert UTF16 term '" << term << "' to UTF8";
261 return {std::string(term)};
262 }
263 std::string utf8_term = std::move(utf8_term_or).ValueOrDie();
264 return {std::move(utf8_term)};
265 }
266
FindNormalizedLatinMatchEndPosition(const UNormalizer2 * normalizer2,std::string_view term,CharacterIterator & char_itr,std::string_view normalized_term,CharacterIterator & normalized_char_itr) const267 bool IcuNormalizer::FindNormalizedLatinMatchEndPosition(
268 const UNormalizer2* normalizer2, std::string_view term,
269 CharacterIterator& char_itr, std::string_view normalized_term,
270 CharacterIterator& normalized_char_itr) const {
271 if (normalizer2 == nullptr) {
272 return false;
273 }
274 char ascii_char;
275 while (char_itr.utf8_index() < term.length() &&
276 normalized_char_itr.utf8_index() < normalized_term.length()) {
277 UChar32 c = char_itr.GetCurrentChar();
278 if (i18n_utils::IsAscii(c)) {
279 c = std::tolower(c);
280 } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
281 c = std::tolower(ascii_char);
282 } else {
283 return false;
284 }
285 UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
286 if (c != normalized_c) {
287 return true;
288 }
289 char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
290 normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
291 }
292 return true;
293 }
294
295 CharacterIterator
FindNormalizedNonLatinMatchEndPosition(std::string_view term,CharacterIterator char_itr,std::string_view normalized_term) const296 IcuNormalizer::TermTransformer::FindNormalizedNonLatinMatchEndPosition(
297 std::string_view term, CharacterIterator char_itr,
298 std::string_view normalized_term) const {
299 CharacterIterator normalized_char_itr(normalized_term);
300 UErrorCode status = U_ZERO_ERROR;
301
302 constexpr int kUtf16CharBufferLength = 6;
303 UChar c16[kUtf16CharBufferLength];
304 int32_t c16_length;
305 int32_t limit;
306
307 constexpr int kCharBufferLength = 3 * 4;
308 char normalized_buffer[kCharBufferLength];
309 int32_t c8_length;
310 while (char_itr.utf8_index() < term.length() &&
311 normalized_char_itr.utf8_index() < normalized_term.length()) {
312 UChar32 c = char_itr.GetCurrentChar();
313 int c_lenth = i18n_utils::GetUtf8Length(c);
314 u_strFromUTF8(c16, kUtf16CharBufferLength, &c16_length,
315 term.data() + char_itr.utf8_index(),
316 /*srcLength=*/c_lenth, &status);
317 if (U_FAILURE(status)) {
318 break;
319 }
320
321 limit = c16_length;
322 utrans_transUChars(u_transliterator_, c16, &c16_length,
323 kUtf16CharBufferLength,
324 /*start=*/0, &limit, &status);
325 if (U_FAILURE(status)) {
326 break;
327 }
328
329 u_strToUTF8(normalized_buffer, kCharBufferLength, &c8_length, c16,
330 c16_length, &status);
331 if (U_FAILURE(status)) {
332 break;
333 }
334
335 for (int i = 0; i < c8_length; ++i) {
336 if (normalized_buffer[i] !=
337 normalized_term[normalized_char_itr.utf8_index() + i]) {
338 return char_itr;
339 }
340 }
341 normalized_char_itr.AdvanceToUtf8(normalized_char_itr.utf8_index() +
342 c8_length);
343 char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
344 }
345 if (U_FAILURE(status)) {
346 // Failed to transform, return its original form.
347 ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
348 }
349 return char_itr;
350 }
351
FindNormalizedMatchEndPosition(std::string_view term,std::string_view normalized_term) const352 CharacterIterator IcuNormalizer::FindNormalizedMatchEndPosition(
353 std::string_view term, std::string_view normalized_term) const {
354 UErrorCode status = U_ZERO_ERROR;
355 // ICU manages the singleton instance
356 const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
357 if (U_FAILURE(status)) {
358 ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
359 }
360
361 CharacterIterator char_itr(term);
362 CharacterIterator normalized_char_itr(normalized_term);
363 if (FindNormalizedLatinMatchEndPosition(
364 normalizer2, term, char_itr, normalized_term, normalized_char_itr)) {
365 return char_itr;
366 }
367 // Some portion of term couldn't be normalized via
368 // FindNormalizedLatinMatchEndPosition. Use term_transformer to handle this
369 // portion.
370 std::string_view rest_normalized_term =
371 normalized_term.substr(normalized_char_itr.utf8_index());
372 return term_transformer_->FindNormalizedNonLatinMatchEndPosition(
373 term, char_itr, rest_normalized_term);
374 }
375
376 } // namespace lib
377 } // namespace icing
378