1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TOKENIZATION_TOKENIZER_H_ 16 #define ICING_TOKENIZATION_TOKENIZER_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string_view> 21 #include <vector> 22 23 #include "icing/text_classifier/lib3/utils/base/statusor.h" 24 #include "icing/absl_ports/canonical_errors.h" 25 #include "icing/tokenization/token.h" 26 #include "icing/util/character-iterator.h" 27 28 namespace icing { 29 namespace lib { 30 31 // A virtual class that all other tokenizers should inherit. It provides 32 // interfaces that allow callers to tokenize text. The return value could be an 33 // iterator or a list of tokens. Example usage: 34 // 35 // std::unique_ptr<Tokenizer> tokenizer = GetTokenizer(); 36 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iter, 37 // tokenizer->Tokenize(text)); 38 // ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens, 39 // tokenizer->TokenizeAll(text)); 40 class Tokenizer { 41 public: 42 virtual ~Tokenizer() = default; 43 44 // An iterator helping to get tokens. 45 // Example usage: 46 // 47 // while (iterator.Advance()) { 48 // const Token& token = iterator.GetToken(); 49 // // Do something 50 // } 51 class Iterator { 52 public: 53 virtual ~Iterator() = default; 54 55 // Advances to the next token. Returns false if it has reached the end. 56 virtual bool Advance() = 0; 57 58 // Returns the current token, maybe with compound tokens as well. It can be 59 // called only when Advance() returns true, otherwise an empty Token vector 60 // may be returned. 61 virtual std::vector<Token> GetTokens() const = 0; 62 63 virtual libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()64 CalculateTokenStart() { 65 return absl_ports::UnimplementedError( 66 "CalculateTokenStart is not implemented!"); 67 } 68 69 virtual libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()70 CalculateTokenEndExclusive() { 71 return absl_ports::UnimplementedError( 72 "CalculateTokenEndExclusive is not implemented!"); 73 } 74 75 // Sets the tokenizer to point at the first token that *starts* *after* 76 // offset. Returns false if there are no valid tokens starting after 77 // offset. 78 // Ex. 79 // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); 80 // iterator.ResetToTokenStartingAfter(4); 81 // // The first full token starting after position 4 (the 'b' in "bar") is 82 // // "baz". 83 // PrintToken(iterator.GetToken()); // prints "baz" ResetToTokenStartingAfter(int32_t utf32_offset)84 virtual bool ResetToTokenStartingAfter(int32_t utf32_offset) { 85 return false; 86 } 87 88 // Sets the tokenizer to point at the first token that *ends* *before* 89 // offset. Returns false if there are no valid tokens ending 90 // before offset. 91 // Ex. 92 // auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie(); 93 // iterator.ResetToTokenEndingBefore(4); 94 // // The first full token ending before position 4 (the 'b' in "bar") is 95 // // "foo". 96 // PrintToken(iterator.GetToken()); // prints "foo" ResetToTokenEndingBefore(int32_t utf32_offset)97 virtual bool ResetToTokenEndingBefore(int32_t utf32_offset) { 98 return false; 99 } 100 ResetToStart()101 virtual bool ResetToStart() { return false; } 102 }; 103 104 // Tokenizes the input text. The input text should outlive the returned 105 // iterator. 106 // 107 // Returns: 108 // A token iterator on success 109 // INVALID_ARGUMENT with error message if input text has a wrong syntax 110 // according to implementations of different tokenizer 111 // types. 112 // INTERNAL_ERROR if any other errors occur 113 virtual libtextclassifier3::StatusOr<std::unique_ptr<Iterator>> Tokenize( 114 std::string_view text) const = 0; 115 116 // Tokenizes and returns all tokens in the input text. The input text should 117 // outlive the returned vector. 118 // 119 // Returns: 120 // A list of tokens on success 121 // INVALID_ARGUMENT with error message if input text has a wrong syntax 122 // according to implementations of different tokenizer 123 // types. 124 // INTERNAL_ERROR if any other errors occur 125 virtual libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll( 126 std::string_view text) const = 0; 127 }; 128 129 } // namespace lib 130 } // namespace icing 131 132 #endif // ICING_TOKENIZATION_TOKENIZER_H_ 133