1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/tokenization/plain-tokenizer.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <vector>
20
21 #include "icing/text_classifier/lib3/utils/base/statusor.h"
22 #include "icing/tokenization/language-segmenter.h"
23 #include "icing/util/character-iterator.h"
24 #include "icing/util/i18n-utils.h"
25 #include "icing/util/status-macros.h"
26
27 namespace icing {
28 namespace lib {
29
30 namespace {
31 // Helper function to validate a term.
32 // A term is valid if:
33 // 1. it's not empty
34 // 2. it's not a whitespace
35 // 3. it's not a punctuation mark
36 //
37 // TODO(b/141007791): figure out how we'd like to support special characters
38 // like "+", "&", "@", "#" in indexing and query tokenizers.
IsValidTerm(std::string_view term)39 bool IsValidTerm(std::string_view term) {
40 if (term.empty()) {
41 return false;
42 }
43 // Gets the first unicode character. We can know what the whole term is by
44 // checking only the first character.
45 return !i18n_utils::IsWhitespaceAt(term, /*position=*/0) &&
46 !i18n_utils::IsPunctuationAt(term, /*position=*/0);
47 }
48 } // namespace
49
50 // Plain tokenizer applies its rules to the results from language segmenter. It
51 // simply filters out invalid terms from language segmenter and returns
52 // everything else as tokens. Please refer to IsValidTerm() above for what terms
53 // are valid.
54 class PlainTokenIterator : public Tokenizer::Iterator {
55 public:
PlainTokenIterator(std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)56 explicit PlainTokenIterator(
57 std::unique_ptr<LanguageSegmenter::Iterator> base_iterator)
58 : base_iterator_(std::move(base_iterator)) {}
59
Advance()60 bool Advance() override {
61 bool found_next_valid_term = false;
62 while (!found_next_valid_term && base_iterator_->Advance()) {
63 current_term_ = base_iterator_->GetTerm();
64 found_next_valid_term = IsValidTerm(current_term_);
65 }
66 return found_next_valid_term;
67 }
68
GetTokens() const69 std::vector<Token> GetTokens() const override {
70 std::vector<Token> result;
71 if (!current_term_.empty()) {
72 result.push_back(Token(Token::Type::REGULAR, current_term_));
73 }
74 return result;
75 }
76
CalculateTokenStart()77 libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
78 override {
79 return base_iterator_->CalculateTermStart();
80 }
81
CalculateTokenEndExclusive()82 libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
83 override {
84 return base_iterator_->CalculateTermEndExclusive();
85 }
86
ResetToTokenStartingAfter(int32_t utf32_offset)87 bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
88 if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) {
89 return false;
90 }
91 current_term_ = base_iterator_->GetTerm();
92 if (!IsValidTerm(current_term_)) {
93 // If the current value isn't valid, advance to the next valid value.
94 return Advance();
95 }
96 return true;
97 }
98
ResetToTokenEndingBefore(int32_t utf32_offset)99 bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
100 ICING_ASSIGN_OR_RETURN(
101 utf32_offset,
102 base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
103 current_term_ = base_iterator_->GetTerm();
104 while (!IsValidTerm(current_term_)) {
105 // Haven't found a valid term yet. Retrieve the term prior to this one
106 // from the segmenter.
107 ICING_ASSIGN_OR_RETURN(
108 utf32_offset,
109 base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
110 current_term_ = base_iterator_->GetTerm();
111 }
112 return true;
113 }
114
ResetToStart()115 bool ResetToStart() override {
116 if (!base_iterator_->ResetToStartUtf32().ok()) {
117 return false;
118 }
119 current_term_ = base_iterator_->GetTerm();
120 if (!IsValidTerm(current_term_)) {
121 // If the current value isn't valid, advance to the next valid value.
122 return Advance();
123 }
124 return true;
125 }
126
127 private:
128 std::unique_ptr<LanguageSegmenter::Iterator> base_iterator_;
129 std::string_view current_term_;
130 };
131
132 libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
Tokenize(std::string_view text) const133 PlainTokenizer::Tokenize(std::string_view text) const {
134 ICING_ASSIGN_OR_RETURN(
135 std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
136 language_segmenter_.Segment(text));
137 return std::make_unique<PlainTokenIterator>(std::move(base_iterator));
138 }
139
TokenizeAll(std::string_view text) const140 libtextclassifier3::StatusOr<std::vector<Token>> PlainTokenizer::TokenizeAll(
141 std::string_view text) const {
142 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
143 Tokenize(text));
144 std::vector<Token> tokens;
145 while (iterator->Advance()) {
146 std::vector<Token> batch_tokens = iterator->GetTokens();
147 tokens.insert(tokens.end(), batch_tokens.begin(), batch_tokens.end());
148 }
149 return tokens;
150 }
151
152 } // namespace lib
153 } // namespace icing
154