xref: /aosp_15_r20/external/icing/icing/tokenization/plain-tokenizer.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/tokenization/plain-tokenizer.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <vector>
20 
21 #include "icing/text_classifier/lib3/utils/base/statusor.h"
22 #include "icing/tokenization/language-segmenter.h"
23 #include "icing/util/character-iterator.h"
24 #include "icing/util/i18n-utils.h"
25 #include "icing/util/status-macros.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 namespace {
31 // Helper function to validate a term.
32 // A term is valid if:
33 //   1. it's not empty
34 //   2. it's not a whitespace
35 //   3. it's not a punctuation mark
36 //
37 // TODO(b/141007791): figure out how we'd like to support special characters
38 // like "+", "&", "@", "#" in indexing and query tokenizers.
IsValidTerm(std::string_view term)39 bool IsValidTerm(std::string_view term) {
40   if (term.empty()) {
41     return false;
42   }
43   // Gets the first unicode character. We can know what the whole term is by
44   // checking only the first character.
45   return !i18n_utils::IsWhitespaceAt(term, /*position=*/0) &&
46          !i18n_utils::IsPunctuationAt(term, /*position=*/0);
47 }
48 }  // namespace
49 
50 // Plain tokenizer applies its rules to the results from language segmenter. It
51 // simply filters out invalid terms from language segmenter and returns
52 // everything else as tokens. Please refer to IsValidTerm() above for what terms
53 // are valid.
54 class PlainTokenIterator : public Tokenizer::Iterator {
55  public:
PlainTokenIterator(std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)56   explicit PlainTokenIterator(
57       std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)
58       : base_iterator_(std::move(base_iterator)) {}
59 
Advance()60   bool Advance() override {
61     bool found_next_valid_term = false;
62     while (!found_next_valid_term && base_iterator_->Advance()) {
63       current_term_ = base_iterator_->GetTerm();
64       found_next_valid_term = IsValidTerm(current_term_);
65     }
66     return found_next_valid_term;
67   }
68 
GetTokens() const69   std::vector<Token> GetTokens() const override {
70     std::vector<Token> result;
71     if (!current_term_.empty()) {
72       result.push_back(Token(Token::Type::REGULAR, current_term_));
73     }
74     return result;
75   }
76 
CalculateTokenStart()77   libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
78       override {
79     return base_iterator_->CalculateTermStart();
80   }
81 
CalculateTokenEndExclusive()82   libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
83       override {
84     return base_iterator_->CalculateTermEndExclusive();
85   }
86 
ResetToTokenStartingAfter(int32_t utf32_offset)87   bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
88     if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) {
89       return false;
90     }
91     current_term_ = base_iterator_->GetTerm();
92     if (!IsValidTerm(current_term_)) {
93       // If the current value isn't valid, advance to the next valid value.
94       return Advance();
95     }
96     return true;
97   }
98 
ResetToTokenEndingBefore(int32_t utf32_offset)99   bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
100     ICING_ASSIGN_OR_RETURN(
101         utf32_offset,
102         base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
103     current_term_ = base_iterator_->GetTerm();
104     while (!IsValidTerm(current_term_)) {
105       // Haven't found a valid term yet. Retrieve the term prior to this one
106       // from the segmenter.
107       ICING_ASSIGN_OR_RETURN(
108           utf32_offset,
109           base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
110       current_term_ = base_iterator_->GetTerm();
111     }
112     return true;
113   }
114 
ResetToStart()115   bool ResetToStart() override {
116     if (!base_iterator_->ResetToStartUtf32().ok()) {
117       return false;
118     }
119     current_term_ = base_iterator_->GetTerm();
120     if (!IsValidTerm(current_term_)) {
121       // If the current value isn't valid, advance to the next valid value.
122       return Advance();
123     }
124     return true;
125   }
126 
127  private:
128   std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_;
129   std::string_view current_term_;
130 };
131 
132 libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
Tokenize(std::string_view text) const133 PlainTokenizer::Tokenize(std::string_view text) const {
134   ICING_ASSIGN_OR_RETURN(
135       std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
136       language_segmenter_.Segment(text));
137   return std::make_unique<PlainTokenIterator>(std::move(base_iterator));
138 }
139 
TokenizeAll(std::string_view text) const140 libtextclassifier3::StatusOr<std::vector<Token>> PlainTokenizer::TokenizeAll(
141     std::string_view text) const {
142   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
143                          Tokenize(text));
144   std::vector<Token> tokens;
145   while (iterator->Advance()) {
146     std::vector<Token> batch_tokens = iterator->GetTokens();
147     tokens.insert(tokens.end(), batch_tokens.begin(), batch_tokens.end());
148   }
149   return tokens;
150 }
151 
152 }  // namespace lib
153 }  // namespace icing
154