1 // Copyright (C) 2022 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_ 16 #define ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <set> 21 #include <stack> 22 #include <string> 23 #include <unordered_map> 24 #include <unordered_set> 25 #include <utility> 26 #include <vector> 27 28 #include "icing/text_classifier/lib3/utils/base/status.h" 29 #include "icing/text_classifier/lib3/utils/base/statusor.h" 30 #include "icing/feature-flags.h" 31 #include "icing/index/embed/embedding-index.h" 32 #include "icing/index/embed/embedding-query-results.h" 33 #include "icing/index/index.h" 34 #include "icing/index/iterator/doc-hit-info-iterator-filter.h" 35 #include "icing/index/iterator/doc-hit-info-iterator.h" 36 #include "icing/index/numeric/numeric-index.h" 37 #include "icing/join/join-children-fetcher.h" 38 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h" 39 #include "icing/query/advanced_query_parser/function.h" 40 #include "icing/query/advanced_query_parser/pending-value.h" 41 #include "icing/query/query-features.h" 42 #include "icing/query/query-results.h" 43 #include "icing/query/query-terms.h" 44 #include "icing/schema/schema-store.h" 45 #include "icing/store/document-store.h" 46 #include "icing/tokenization/tokenizer.h" 47 #include "icing/transform/normalizer.h" 48 49 namespace icing { 50 namespace lib { 51 52 // The Visitor used to create the DocHitInfoIterator tree from the AST output by 53 // the parser. 54 class QueryVisitor : public AbstractSyntaxTreeVisitor { 55 public: QueryVisitor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const DocumentStore * document_store,const SchemaStore * schema_store,const Normalizer * normalizer,const Tokenizer * tokenizer,const JoinChildrenFetcher * join_children_fetcher,const SearchSpecProto & search_spec,DocHitInfoIteratorFilter::Options filter_options,bool needs_term_frequency_info,const FeatureFlags * feature_flags,int64_t current_time_ms)56 explicit QueryVisitor( 57 Index* index, const NumericIndex<int64_t>* numeric_index, 58 const EmbeddingIndex* embedding_index, 59 const DocumentStore* document_store, const SchemaStore* schema_store, 60 const Normalizer* normalizer, const Tokenizer* tokenizer, 61 const JoinChildrenFetcher* join_children_fetcher, 62 const SearchSpecProto& search_spec, 63 DocHitInfoIteratorFilter::Options filter_options, 64 bool needs_term_frequency_info, const FeatureFlags* feature_flags, 65 int64_t current_time_ms) 66 : QueryVisitor(index, numeric_index, embedding_index, document_store, 67 schema_store, normalizer, tokenizer, join_children_fetcher, 68 search_spec, filter_options, needs_term_frequency_info, 69 feature_flags, PendingPropertyRestricts(), 70 /*processing_not=*/false, current_time_ms) {} 71 72 void VisitString(const StringNode* node) override; 73 void VisitText(const TextNode* node) override; 74 void VisitMember(const MemberNode* node) override; 75 void VisitFunction(const FunctionNode* node) override; 76 void VisitUnaryOperator(const UnaryOperatorNode* node) override; 77 void VisitNaryOperator(const NaryOperatorNode* node) override; 78 79 // RETURNS: 80 // - the QueryResults reflecting the AST that was visited 81 // - INVALID_ARGUMENT if the AST does not conform to supported expressions 82 // - NOT_FOUND if the AST refers to a property that does not exist 83 libtextclassifier3::StatusOr<QueryResults> ConsumeResults() &&; 84 85 private: 86 // An internal class to help manage property restricts being applied at 87 // different levels. 88 class PendingPropertyRestricts { 89 public: 90 // Add another set of property restricts. Elements of new_restricts that are 91 // not present in active_property_rest 92 void AddValidRestricts(std::set<std::string> new_restricts); 93 94 // Pops the most recently added set of property restricts. PopRestricts()95 void PopRestricts() { 96 if (has_active_property_restricts()) { 97 pending_property_restricts_.pop_back(); 98 } 99 } 100 has_active_property_restricts()101 bool has_active_property_restricts() const { 102 return !pending_property_restricts_.empty(); 103 } 104 105 // The set of all property restrictions that are currently being applied. active_property_restricts()106 const std::set<std::string>& active_property_restricts() const { 107 return pending_property_restricts_.back(); 108 } 109 110 private: 111 std::vector<std::set<std::string>> pending_property_restricts_; 112 }; 113 QueryVisitor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const DocumentStore * document_store,const SchemaStore * schema_store,const Normalizer * normalizer,const Tokenizer * tokenizer,const JoinChildrenFetcher * join_children_fetcher,const SearchSpecProto & search_spec,DocHitInfoIteratorFilter::Options filter_options,bool needs_term_frequency_info,const FeatureFlags * feature_flags,PendingPropertyRestricts pending_property_restricts,bool processing_not,int64_t current_time_ms)114 explicit QueryVisitor( 115 Index* index, const NumericIndex<int64_t>* numeric_index, 116 const EmbeddingIndex* embedding_index, 117 const DocumentStore* document_store, const SchemaStore* schema_store, 118 const Normalizer* normalizer, const Tokenizer* tokenizer, 119 const JoinChildrenFetcher* join_children_fetcher, 120 const SearchSpecProto& search_spec, 121 DocHitInfoIteratorFilter::Options filter_options, 122 bool needs_term_frequency_info, const FeatureFlags* feature_flags, 123 PendingPropertyRestricts pending_property_restricts, bool processing_not, 124 int64_t current_time_ms) 125 : index_(*index), 126 numeric_index_(*numeric_index), 127 embedding_index_(*embedding_index), 128 document_store_(*document_store), 129 schema_store_(*schema_store), 130 normalizer_(*normalizer), 131 tokenizer_(*tokenizer), 132 join_children_fetcher_(join_children_fetcher), 133 search_spec_(search_spec), 134 filter_options_(std::move(filter_options)), 135 needs_term_frequency_info_(needs_term_frequency_info), 136 feature_flags_(*feature_flags), 137 pending_property_restricts_(std::move(pending_property_restricts)), 138 processing_not_(processing_not), 139 expecting_numeric_arg_(false), 140 current_time_ms_(current_time_ms) { 141 RegisterFunctions(); 142 } 143 has_pending_error()144 bool has_pending_error() const { return !pending_error_.ok(); } 145 146 // Creates a DocHitInfoIterator reflecting the provided term and whether the 147 // prefix operator has been applied to this term. Also populates, 148 // property_query_terms_map_ and query_term_iterators_ as appropriate. 149 // Returns: 150 // - On success, a DocHitInfoIterator for the provided term 151 // - INVALID_ARGUMENT if unable to create an iterator for the term. 152 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> 153 CreateTermIterator(const QueryTerm& term); 154 155 // Processes the PendingValue at the top of pending_values_, parses it into a 156 // int64_t and pops the top. 157 // Returns: 158 // - On success, the int value stored in the text at the top 159 // - INVALID_ARGUMENT if pending_values_ is empty, doesn't hold a text or 160 // can't be parsed as an int. 161 libtextclassifier3::StatusOr<int64_t> PopPendingIntValue(); 162 163 // Processes the PendingValue at the top of pending_values_ and pops the top. 164 // Returns: 165 // - On success, the string value stored in the text at the top and a bool 166 // indicating whether or not the string value has a prefix operator. 167 // - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a string. 168 libtextclassifier3::StatusOr<QueryTerm> PopPendingStringValue(); 169 170 // Processes the PendingValue at the top of pending_values_ and pops the top. 171 // Returns: 172 // - On success, the string value stored in the text at the top 173 // indicating whether or not the string value has a prefix operator. 174 // - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a text. 175 libtextclassifier3::StatusOr<QueryTerm> PopPendingTextValue(); 176 177 // Processes the PendingValue at the top of pending_values_ and pops the top. 178 // Returns: 179 // - On success, a DocHitInfoIterator representing for the term at the top 180 // - INVALID_ARGUMENT if pending_values_ is empty or if unable to create an 181 // iterator for the term. 182 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> 183 PopPendingIterator(); 184 185 // Processes all PendingValues at the top of pending_values_ until the first 186 // placeholder is encounter. 187 // Returns: 188 // - On success, a vector containing all DocHitInfoIterators representing 189 // the values at the top of pending_values_ 190 // - INVALID_ARGUMENT if pending_values_is empty or if unable to create an 191 // iterator for any of the terms at the top of pending_values_ 192 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>> 193 PopAllPendingIterators(); 194 195 // Processes the TEXT segment within text_value. Processing includes 196 // tokenizing the text, normalizing it and outputting a DocHitIterator that 197 // ANDs all the produced tokens together. 198 // Returns: 199 // - On success, a DocHitInfoIterator representing the tokenized text from 200 // text_value 201 // - INVALID_ARGUMENT if the tokenizer produces more than one token within 202 // a token group. 203 // - Any errors that could be produced by Tokenizer::Tokenize. 204 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> 205 ProduceTextTokenIterators(QueryTerm text_value); 206 207 // Processes the unary operator node as a NOT operator. A NOT can have an 208 // operator type of "NOT" or "MINUS" 209 // 210 // RETURNS: 211 // - OK on success 212 // - INVALID_ARGUMENT if any errors are encountered while processing 213 // node->child 214 libtextclassifier3::Status ProcessNotOperator(const UnaryOperatorNode* node); 215 216 // Processes the unary operator node as a negation operator. A negation 217 // operator should have an operator of type "MINUS" and it's children must 218 // resolve to a numeric value. 219 // 220 // RETURNS: 221 // - OK on success 222 // - INVALID_ARGUMENT if the node->child can't be resolved to a numeric 223 // value. 224 libtextclassifier3::Status ProcessNegationOperator( 225 const UnaryOperatorNode* node); 226 227 // Processes the NumericComparator represented by node. This must be called 228 // *after* this node's children have been visited. The PendingValues added by 229 // this node's children will be consumed by this function and the PendingValue 230 // for this node will be returned. 231 // Returns: 232 // - On success, OK 233 // - INVALID_ARGUMENT if unable to retrieve string value or int value 234 // - NOT_FOUND if there is no entry in the numeric index for the property 235 libtextclassifier3::Status ProcessNumericComparator( 236 const NaryOperatorNode* node); 237 238 // Processes the AND and OR operators represented by the node. This must be 239 // called *after* this node's children have been visited. The PendingValues 240 // added by this node's children will be consumed by this function and the 241 // PendingValue for this node will be returned. 242 // Returns: 243 // - On success, then PendingValue representing this node and it's children. 244 // - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's 245 // children. 246 libtextclassifier3::StatusOr<PendingValue> ProcessAndOperator( 247 const NaryOperatorNode* node); 248 249 // Processes the OR operator represented by the node. This must be called 250 // *after* this node's children have been visited. The PendingValues added by 251 // this node's children will be consumed by this function and the PendingValue 252 // for this node will be returned. 253 // Returns: 254 // - On success, then PendingValue representing this node and it's children. 255 // - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's 256 // children. 257 libtextclassifier3::StatusOr<PendingValue> ProcessOrOperator( 258 const NaryOperatorNode* node); 259 260 // Populates registered_functions with the currently supported set of 261 // functions. 262 void RegisterFunctions(); 263 264 // Implementation of `search` custom function in the query language. 265 // Returns: 266 // - a PendingValue holding the DocHitInfoIterator reflecting the query 267 // provided to SearchFunction 268 // - any errors returned by Lexer::ExtractTokens, Parser::ConsumeQuery or 269 // QueryVisitor::ConsumeResults. 270 libtextclassifier3::StatusOr<PendingValue> SearchFunction( 271 std::vector<PendingValue>&& args); 272 273 // Implementation of the propertyDefined(property_path) custom function. 274 // Returns: 275 // - a Pending Value holding a DocHitIterator that returns hits for all 276 // documents whose schema types have defined the property specified by 277 // property_path. 278 // - any errors returned by Lexer::ExtractTokens 279 libtextclassifier3::StatusOr<PendingValue> PropertyDefinedFunction( 280 std::vector<PendingValue>&& args); 281 282 // Implementation of the hasProperty(property_path) custom function. 283 // Returns: 284 // - a Pending Value holding a DocHitIterator that returns hits for all 285 // documents that have the property specified by property_path. 286 // - any errors returned by Lexer::ExtractTokens 287 libtextclassifier3::StatusOr<PendingValue> HasPropertyFunction( 288 std::vector<PendingValue>&& args); 289 290 // Implementation of the semanticSearch(vector, low, high, metric) custom 291 // function. This function is used for supporting vector search with a 292 // syntax like `semanticSearch(getEmbeddingParameter(0), 0.5, 1, "COSINE")`. 293 // 294 // low, high, metric are optional parameters: 295 // - low is default to negative infinity 296 // - high is default to positive infinity 297 // - metric is default to the metric specified in SearchSpec 298 // 299 // Returns: 300 // - a Pending Value of type DocHitIterator that returns all documents with 301 // an embedding vector that has a score within [low, high]. 302 // - OUT_OF_RANGE if index provided to getEmbeddingParameter is out of 303 // bounds of SearchSpec.embedding_query_vectors() 304 // - any errors returned by Lexer::ExtractTokens 305 libtextclassifier3::StatusOr<PendingValue> SemanticSearchFunction( 306 std::vector<PendingValue>&& args); 307 308 // Implementation of the getSearchStringParameter(index) custom function. 309 // Retrieves the parameterized string stored at 310 // SearchSpec.query_parameter_strings(index). 311 // 312 // Returns: 313 // - a Pending Value holding a DocHitIterator that returns hits for all 314 // documents containing the normalized tokens present in the string. 315 // - OUT_OF_RANGE if index is out of bounds of 316 // SearchSpec.query_parameter_strings() 317 // - any errors returned by ProduceTextTokenIterators 318 libtextclassifier3::StatusOr<PendingValue> GetSearchStringParameterFunction( 319 std::vector<PendingValue>&& args); 320 321 // Implementation of the matchScoreExpression(scoring_expression, low, high) 322 // custom function. 323 // 324 // high is an optional parameter, which defaults to positive infinity. 325 // 326 // Returns: 327 // - a Pending Value of type DocHitIterator that returns all documents with 328 // a score within [low, high] based on the provided scoring expression. 329 // Documents that cause an evaluation error during scoring are ignored. 330 // - INVALID_ARGUMENT if low is greater than high. 331 // - any errors returned by score_expression_util::GetScoreExpression. 332 libtextclassifier3::StatusOr<PendingValue> MatchScoreExpressionFunction( 333 std::vector<PendingValue>&& args); 334 335 // Handles a NaryOperatorNode where the operator is HAS (':') and pushes an 336 // iterator with the proper section filter applied. If the current property 337 // restriction represented by pending_property_restricts and the first child 338 // of this node is unsatisfiable (ex. `prop1:(prop2:foo)`), then a NONE 339 // iterator is returned immediately and subtree represented by the second 340 // child is not traversed. 341 // 342 // Returns: 343 // - OK on success 344 // - INVALID_ARGUMENT node does not have exactly two children or the two 345 // children cannot be resolved to a MEMBER or an iterator respectively. 346 libtextclassifier3::Status ProcessHasOperator(const NaryOperatorNode* node); 347 348 // Returns the correct match type to apply based on both the match type and 349 // whether the prefix operator is currently present. GetTermMatchType(bool is_prefix)350 TermMatchType::Code GetTermMatchType(bool is_prefix) const { 351 return (is_prefix) ? TermMatchType::PREFIX : search_spec_.term_match_type(); 352 } 353 354 std::stack<PendingValue> pending_values_; 355 libtextclassifier3::Status pending_error_; 356 357 // A map from function name to Function instance. 358 std::unordered_map<std::string, Function> registered_functions_; 359 360 SectionRestrictQueryTermsMap property_query_terms_map_; 361 362 QueryTermIteratorsMap query_term_iterators_; 363 364 EmbeddingQueryResults embedding_query_results_; 365 366 // Set of features invoked in the query. 367 std::unordered_set<Feature> features_; 368 369 Index& index_; // Does not own! 370 const NumericIndex<int64_t>& numeric_index_; // Does not own! 371 const EmbeddingIndex& embedding_index_; // Does not own! 372 const DocumentStore& document_store_; // Does not own! 373 const SchemaStore& schema_store_; // Does not own! 374 const Normalizer& normalizer_; // Does not own! 375 const Tokenizer& tokenizer_; // Does not own! 376 377 // Nullable. A non-null join_children_fetcher_ indicates that this is the 378 // parent query for a join query, in which case child scores are available. 379 const JoinChildrenFetcher* join_children_fetcher_; // Does not own. 380 381 const SearchSpecProto& search_spec_; 382 383 DocHitInfoIteratorFilter::Options filter_options_; 384 385 // Whether or not term_frequency information is needed. This affects: 386 // - how DocHitInfoIteratorTerms are constructed 387 // - whether the QueryTermIteratorsMap is populated in the QueryResults. 388 bool needs_term_frequency_info_; 389 390 const FeatureFlags& feature_flags_; // Does not own. 391 // TODO(b/377215223): Pass enabled scoring features from top level. 392 std::unordered_set<ScoringFeatureType> scoring_feature_types_enabled_; 393 394 // The stack of property restricts currently being processed by the visitor. 395 PendingPropertyRestricts pending_property_restricts_; 396 bool processing_not_; 397 398 // Whether we are in the midst of processing a subtree that is expected to 399 // resolve to a numeric argument. 400 bool expecting_numeric_arg_; 401 402 int64_t current_time_ms_; 403 }; 404 405 } // namespace lib 406 } // namespace icing 407 408 #endif // ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_ 409