xref: /aosp_15_r20/external/icing/icing/transform/icu/icu-normalizer.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
16 #define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
17 
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/transform/normalizer.h"
24 #include "icing/util/character-iterator.h"
25 #include "unicode/unorm2.h"
26 #include "unicode/utrans.h"
27 
28 namespace icing {
29 namespace lib {
30 
31 // Used to normalize UTF8 strings for text matching. It enforces a set of rules:
32 //  1. Transforms text to be lowercase UTF8.
33 //  2. Transforms full-width Latin characters to ASCII characters if possible.
34 //  3. Transforms hiragana to katakana.
35 //  4. Removes accent / diacritic marks on Latin characters
36 //  5. Removes accent / diacritic marks on Greek characters
37 //  6. Normalized text must be less than or equal to max_term_byte_size,
38 //     otherwise it will be truncated.
39 //
40 // There're some other rules from ICU not listed here, please see .cc file for
41 // details.
42 class IcuNormalizer : public Normalizer {
43  public:
44   // Creates a normalizer with the subcomponents it needs. max_term_byte_size
45   // enforces the max size of text after normalization, text will be truncated
46   // if exceeds the max size.
47   //
48   // Returns:
49   //   A normalizer on success
50   //   INVALID_ARGUMENT if max_term_byte_size <= 0
51   //   INTERNAL_ERROR if failed to create any subcomponent
52   static libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>> Create(
53       int max_term_byte_size);
54 
55   // Normalizes the input term based on rules. See .cc file for rule details.
56   //
57   // NOTE: Term should not mix Latin and non-Latin characters. Doing so may
58   // result in the non-Latin characters not properly being normalized
59   Normalizer::NormalizedTerm NormalizeTerm(
60       std::string_view term) const override;
61 
62   // Returns a CharacterIterator pointing to one past the end of the segment of
63   // term that (once normalized) matches with normalized_term.
64   //
65   // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
66   // CharacterIterator(u8:4, u16:4, u32:4).
67   //
68   // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
69   // CharacterIterator(u8:0, u16:0, u32:0).
70   CharacterIterator FindNormalizedMatchEndPosition(
71       std::string_view term, std::string_view normalized_term) const override;
72 
73  private:
74   // A handler class that helps manage the lifecycle of UTransliterator. It's
75   // used in IcuNormalizer to transform terms into the formats we need.
76   class TermTransformer {
77    public:
78     // Creates TermTransformer with a valid UTransliterator instance
79     //
80     // Returns:
81     //   A term transformer on success
82     //   INTERNAL_ERROR if failed to create any subcomponent
83     static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>>
84     Create();
85 
86     // Closes the UTransliterator instance
87     ~TermTransformer();
88 
89     // Transforms the text based on our rules described at top of this file
90     struct TransformResult {
91       std::string transformed_term;
92     };
93     TransformResult Transform(std::string_view term) const;
94 
95     // Returns a CharacterIterator pointing to one past the end of the segment
96     // of a non-latin term that (once normalized) matches with normalized_term.
97     CharacterIterator FindNormalizedNonLatinMatchEndPosition(
98         std::string_view term, CharacterIterator char_itr,
99         std::string_view normalized_term) const;
100 
101    private:
102     explicit TermTransformer(UTransliterator* u_transliterator);
103 
104     // An ICU class to execute custom term transformation / normalization rules.
105     // utrans_close() must by called after using.
106     UTransliterator* u_transliterator_;
107   };
108 
109   struct NormalizeLatinResult {
110     // A string representing the maximum prefix of term (can be empty or term
111     // itself) that can be normalized into ASCII.
112     std::string text;
113     // The first position of the char within term that normalization failed to
114     // transform into an ASCII char, or term.length() if all chars can be
115     // transformed.
116     size_t end_pos;
117   };
118 
119   explicit IcuNormalizer(std::unique_ptr<TermTransformer> term_transformer,
120                          int max_term_byte_size);
121 
122   // Helper method to normalize Latin terms only. Rules applied:
123   // 1. Uppercase to lowercase
124   // 2. Remove diacritic (accent) marks
125   NormalizeLatinResult NormalizeLatin(const UNormalizer2* normalizer2,
126                                       std::string_view term) const;
127 
128   // Set char_itr and normalized_char_itr to point to one past the end of the
129   // segments of term and normalized_term that can match if normalized into
130   // ASCII. In this case, true will be returned.
131   //
132   // The method stops at the position when char_itr cannot be normalized into
133   // ASCII and returns false, so that term_transformer can handle the remaining
134   // portion.
135   bool FindNormalizedLatinMatchEndPosition(
136       const UNormalizer2* normalizer2, std::string_view term,
137       CharacterIterator& char_itr, std::string_view normalized_term,
138       CharacterIterator& normalized_char_itr) const;
139 
140   // Used to transform terms into their normalized forms.
141   std::unique_ptr<TermTransformer> term_transformer_;
142 
143   // The maximum term length allowed after normalization.
144   int max_term_byte_size_;
145 };
146 
147 }  // namespace lib
148 }  // namespace icing
149 
150 #endif  // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
151