// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_TOKENIZATION_TOKEN_H_
#define ICING_TOKENIZATION_TOKEN_H_

#include <string_view>

namespace icing {
namespace lib {

// A single token: a type tag paired with a non-owning view of the token's
// text. Token never copies the text it refers to, so the underlying character
// data must stay alive for as long as the Token is in use.
struct Token {
  // The kind of a token. New enumerators should be added with care: the
  // enumerators carry implicit, order-dependent values.
  enum class Type {
    // Common types
    REGULAR,  // A token without special meanings, the value of it will be
              // indexed or searched directly

    VERBATIM,  // A token that should be indexed and searched without any
               // modifications to the raw text

    // An RFC822 section with the content in RFC822_TOKEN tokenizes as follows:
    RFC822_NAME,                     // "User", "Johnsson"
    RFC822_COMMENT,                  // "A", "comment", "here"
    RFC822_LOCAL_ADDRESS,            // "user.name"
    RFC822_HOST_ADDRESS,             // "domain.name.com"
    RFC822_ADDRESS,                  // "user.name@domain.name.com"
    RFC822_ADDRESS_COMPONENT_LOCAL,  // "user", "name"
    RFC822_ADDRESS_COMPONENT_HOST,   // "domain", "name", "com"
    RFC822_TOKEN,  // "User Johnsson (A comment) <user.name@domain.name.com>"

    // Types only used in raw query
    QUERY_OR,                 // Indicates OR logic between its left and right
                              // tokens
    QUERY_EXCLUSION,          // Indicates exclusion operation on next token
    QUERY_PROPERTY,           // Indicates property restrict on next token
    QUERY_LEFT_PARENTHESES,   // Left parentheses
    QUERY_RIGHT_PARENTHESES,  // Right parentheses

    // Types used in URL tokenization
    URL_SCHEME,  // "http", "https", "ftp", "content"
    URL_USERNAME,
    URL_PASSWORD,
    URL_HOST_COMMON_PART,  // Hosts are split into two types, common and
                           // significant. Common are e.g: www, ww2, .com, etc.
    URL_HOST_SIGNIFICANT_PART,
    URL_PORT,
    URL_PATH_PART,  // Tokenized path, e.g. /abc-d/e.fg-> [abc-d], [e.fg]
    URL_QUERY,      // After ?, before #, e.g. "param1=value-1&param2=value-2"
    URL_REF,        // Anything after #. Could be anything
    URL_SUFFIX,
    URL_SUFFIX_INNERMOST,

    TRIGRAM,  // Trigram token of the text

    // Indicates errors
    INVALID,
  };

  // Creates a token of the given type that refers to text_in; the text
  // defaults to an empty string when omitted.
  //
  // The input text should outlive the Token instance.
  explicit Token(Type type_in, std::string_view text_in = "")
      : type(type_in), text(text_in) {}

  // The type of token
  Type type;

  // The content of token. Non-owning: views the text passed to the
  // constructor.
  std::string_view text;
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_TOKENIZATION_TOKEN_H_