1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 16 #define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 17 18 #include <memory> 19 #include <string> 20 #include <string_view> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/transform/normalizer.h" 24 #include "icing/util/character-iterator.h" 25 #include "unicode/unorm2.h" 26 #include "unicode/utrans.h" 27 28 namespace icing { 29 namespace lib { 30 31 // Used to normalize UTF8 strings for text matching. It enforces a set of rules: 32 // 1. Transforms text to be lowercase UTF8. 33 // 2. Transforms full-width Latin characters to ASCII characters if possible. 34 // 3. Transforms hiragana to katakana. 35 // 4. Removes accent / diacritic marks on Latin characters 36 // 5. Removes accent / diacritic marks on Greek characters 37 // 6. Normalized text must be less than or equal to max_term_byte_size, 38 // otherwise it will be truncated. 39 // 40 // There're some other rules from ICU not listed here, please see .cc file for 41 // details. 42 class IcuNormalizer : public Normalizer { 43 public: 44 // Creates a normalizer with the subcomponents it needs. max_term_byte_size 45 // enforces the max size of text after normalization, text will be truncated 46 // if exceeds the max size. 47 // 48 // Returns: 49 // A normalizer on success 50 // INVALID_ARGUMENT if max_term_byte_size <= 0 51 // INTERNAL_ERROR if failed to create any subcomponent 52 static libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>> Create( 53 int max_term_byte_size); 54 55 // Normalizes the input term based on rules. See .cc file for rule details. 56 // 57 // NOTE: Term should not mix Latin and non-Latin characters. Doing so may 58 // result in the non-Latin characters not properly being normalized 59 Normalizer::NormalizedTerm NormalizeTerm( 60 std::string_view term) const override; 61 62 // Returns a CharacterIterator pointing to one past the end of the segment of 63 // term that (once normalized) matches with normalized_term. 64 // 65 // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return 66 // CharacterIterator(u8:4, u16:4, u32:4). 67 // 68 // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return 69 // CharacterIterator(u8:0, u16:0, u32:0). 70 CharacterIterator FindNormalizedMatchEndPosition( 71 std::string_view term, std::string_view normalized_term) const override; 72 73 private: 74 // A handler class that helps manage the lifecycle of UTransliterator. It's 75 // used in IcuNormalizer to transform terms into the formats we need. 76 class TermTransformer { 77 public: 78 // Creates TermTransformer with a valid UTransliterator instance 79 // 80 // Returns: 81 // A term transformer on success 82 // INTERNAL_ERROR if failed to create any subcomponent 83 static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>> 84 Create(); 85 86 // Closes the UTransliterator instance 87 ~TermTransformer(); 88 89 // Transforms the text based on our rules described at top of this file 90 struct TransformResult { 91 std::string transformed_term; 92 }; 93 TransformResult Transform(std::string_view term) const; 94 95 // Returns a CharacterIterator pointing to one past the end of the segment 96 // of a non-latin term that (once normalized) matches with normalized_term. 97 CharacterIterator FindNormalizedNonLatinMatchEndPosition( 98 std::string_view term, CharacterIterator char_itr, 99 std::string_view normalized_term) const; 100 101 private: 102 explicit TermTransformer(UTransliterator* u_transliterator); 103 104 // An ICU class to execute custom term transformation / normalization rules. 105 // utrans_close() must by called after using. 106 UTransliterator* u_transliterator_; 107 }; 108 109 struct NormalizeLatinResult { 110 // A string representing the maximum prefix of term (can be empty or term 111 // itself) that can be normalized into ASCII. 112 std::string text; 113 // The first position of the char within term that normalization failed to 114 // transform into an ASCII char, or term.length() if all chars can be 115 // transformed. 116 size_t end_pos; 117 }; 118 119 explicit IcuNormalizer(std::unique_ptr<TermTransformer> term_transformer, 120 int max_term_byte_size); 121 122 // Helper method to normalize Latin terms only. Rules applied: 123 // 1. Uppercase to lowercase 124 // 2. Remove diacritic (accent) marks 125 NormalizeLatinResult NormalizeLatin(const UNormalizer2* normalizer2, 126 std::string_view term) const; 127 128 // Set char_itr and normalized_char_itr to point to one past the end of the 129 // segments of term and normalized_term that can match if normalized into 130 // ASCII. In this case, true will be returned. 131 // 132 // The method stops at the position when char_itr cannot be normalized into 133 // ASCII and returns false, so that term_transformer can handle the remaining 134 // portion. 135 bool FindNormalizedLatinMatchEndPosition( 136 const UNormalizer2* normalizer2, std::string_view term, 137 CharacterIterator& char_itr, std::string_view normalized_term, 138 CharacterIterator& normalized_char_itr) const; 139 140 // Used to transform terms into their normalized forms. 141 std::unique_ptr<TermTransformer> term_transformer_; 142 143 // The maximum term length allowed after normalization. 144 int max_term_byte_size_; 145 }; 146 147 } // namespace lib 148 } // namespace icing 149 150 #endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 151