// xref: /aosp_15_r20/external/icing/icing/tokenization/tokenizer.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TOKENIZATION_TOKENIZER_H_
16 #define ICING_TOKENIZATION_TOKENIZER_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string_view>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 #include "icing/absl_ports/canonical_errors.h"
25 #include "icing/tokenization/token.h"
26 #include "icing/util/character-iterator.h"
27 
28 namespace icing {
29 namespace lib {
30 
31 // A virtual class that all other tokenizers should inherit. It provides
32 // interfaces that allow callers to tokenize text. The return value could be an
33 // iterator or a list of tokens. Example usage:
34 //
35 // std::unique_ptr<Tokenizer> tokenizer = GetTokenizer();
36 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter,
37 //                  tokenizer->Tokenize(text));
38 // ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
39 // tokenizer->TokenizeAll(text));
40 class Tokenizer {
41  public:
42   virtual ~Tokenizer() = default;
43 
44   // An iterator helping to get tokens.
45   // Example usage:
46   //
47   // while (iterator.Advance()) {
48   //   const Token& token = iterator.GetToken();
49   //   // Do something
50   // }
51   class Iterator {
52    public:
53     virtual ~Iterator() = default;
54 
55     // Advances to the next token. Returns false if it has reached the end.
56     virtual bool Advance() = 0;
57 
58     // Returns the current token, maybe with compound tokens as well. It can be
59     // called only when Advance() returns true, otherwise an empty Token vector
60     // may be returned.
61     virtual std::vector<Token> GetTokens() const = 0;
62 
63     virtual libtextclassifier3::StatusOr<CharacterIterator>
CalculateTokenStart()64     CalculateTokenStart() {
65       return absl_ports::UnimplementedError(
66           "CalculateTokenStart is not implemented!");
67     }
68 
69     virtual libtextclassifier3::StatusOr<CharacterIterator>
CalculateTokenEndExclusive()70     CalculateTokenEndExclusive() {
71       return absl_ports::UnimplementedError(
72           "CalculateTokenEndExclusive is not implemented!");
73     }
74 
75     // Sets the tokenizer to point at the first token that *starts* *after*
76     // offset. Returns false if there are no valid tokens starting after
77     // offset.
78     // Ex.
79     // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
80     // iterator.ResetToTokenStartingAfter(4);
81     // // The first full token starting after position 4 (the 'b' in "bar") is
82     // // "baz".
83     // PrintToken(iterator.GetToken());  // prints "baz"
ResetToTokenStartingAfter(int32_t utf32_offset)84     virtual bool ResetToTokenStartingAfter(int32_t utf32_offset) {
85       return false;
86     }
87 
88     // Sets the tokenizer to point at the first token that *ends* *before*
89     // offset. Returns false if there are no valid tokens ending
90     // before offset.
91     // Ex.
92     // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
93     // iterator.ResetToTokenEndingBefore(4);
94     // // The first full token ending before position 4 (the 'b' in "bar") is
95     // // "foo".
96     // PrintToken(iterator.GetToken());  // prints "foo"
ResetToTokenEndingBefore(int32_t utf32_offset)97     virtual bool ResetToTokenEndingBefore(int32_t utf32_offset) {
98       return false;
99     }
100 
ResetToStart()101     virtual bool ResetToStart() { return false; }
102   };
103 
104   // Tokenizes the input text. The input text should outlive the returned
105   // iterator.
106   //
107   // Returns:
108   //   A token iterator on success
109   //   INVALID_ARGUMENT with error message if input text has a wrong syntax
110   //                    according to implementations of different tokenizer
111   //                    types.
112   //   INTERNAL_ERROR if any other errors occur
113   virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize(
114       std::string_view text) const = 0;
115 
116   // Tokenizes and returns all tokens in the input text. The input text should
117   // outlive the returned vector.
118   //
119   // Returns:
120   //   A list of tokens on success
121   //   INVALID_ARGUMENT with error message if input text has a wrong syntax
122   //                    according to implementations of different tokenizer
123   //                    types.
124   //   INTERNAL_ERROR if any other errors occur
125   virtual libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
126       std::string_view text) const = 0;
127 };
128 
129 }  // namespace lib
130 }  // namespace icing
131 
132 #endif  // ICING_TOKENIZATION_TOKENIZER_H_
133