xref: /aosp_15_r20/external/icing/icing/query/advanced_query_parser/query-visitor.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
16 #define ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <set>
21 #include <stack>
22 #include <string>
23 #include <unordered_map>
24 #include <unordered_set>
25 #include <utility>
26 #include <vector>
27 
28 #include "icing/text_classifier/lib3/utils/base/status.h"
29 #include "icing/text_classifier/lib3/utils/base/statusor.h"
30 #include "icing/feature-flags.h"
31 #include "icing/index/embed/embedding-index.h"
32 #include "icing/index/embed/embedding-query-results.h"
33 #include "icing/index/index.h"
34 #include "icing/index/iterator/doc-hit-info-iterator-filter.h"
35 #include "icing/index/iterator/doc-hit-info-iterator.h"
36 #include "icing/index/numeric/numeric-index.h"
37 #include "icing/join/join-children-fetcher.h"
38 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
39 #include "icing/query/advanced_query_parser/function.h"
40 #include "icing/query/advanced_query_parser/pending-value.h"
41 #include "icing/query/query-features.h"
42 #include "icing/query/query-results.h"
43 #include "icing/query/query-terms.h"
44 #include "icing/schema/schema-store.h"
45 #include "icing/store/document-store.h"
46 #include "icing/tokenization/tokenizer.h"
47 #include "icing/transform/normalizer.h"
48 
49 namespace icing {
50 namespace lib {
51 
52 // The Visitor used to create the DocHitInfoIterator tree from the AST output by
53 // the parser.
54 class QueryVisitor : public AbstractSyntaxTreeVisitor {
55  public:
QueryVisitor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const DocumentStore * document_store,const SchemaStore * schema_store,const Normalizer * normalizer,const Tokenizer * tokenizer,const JoinChildrenFetcher * join_children_fetcher,const SearchSpecProto & search_spec,DocHitInfoIteratorFilter::Options filter_options,bool needs_term_frequency_info,const FeatureFlags * feature_flags,int64_t current_time_ms)56   explicit QueryVisitor(
57       Index* index, const NumericIndex<int64_t>* numeric_index,
58       const EmbeddingIndex* embedding_index,
59       const DocumentStore* document_store, const SchemaStore* schema_store,
60       const Normalizer* normalizer, const Tokenizer* tokenizer,
61       const JoinChildrenFetcher* join_children_fetcher,
62       const SearchSpecProto& search_spec,
63       DocHitInfoIteratorFilter::Options filter_options,
64       bool needs_term_frequency_info, const FeatureFlags* feature_flags,
65       int64_t current_time_ms)
66       : QueryVisitor(index, numeric_index, embedding_index, document_store,
67                      schema_store, normalizer, tokenizer, join_children_fetcher,
68                      search_spec, filter_options, needs_term_frequency_info,
69                      feature_flags, PendingPropertyRestricts(),
70                      /*processing_not=*/false, current_time_ms) {}
71 
72   void VisitString(const StringNode* node) override;
73   void VisitText(const TextNode* node) override;
74   void VisitMember(const MemberNode* node) override;
75   void VisitFunction(const FunctionNode* node) override;
76   void VisitUnaryOperator(const UnaryOperatorNode* node) override;
77   void VisitNaryOperator(const NaryOperatorNode* node) override;
78 
79   // RETURNS:
80   //   - the QueryResults reflecting the AST that was visited
81   //   - INVALID_ARGUMENT if the AST does not conform to supported expressions
82   //   - NOT_FOUND if the AST refers to a property that does not exist
83   libtextclassifier3::StatusOr<QueryResults> ConsumeResults() &&;
84 
85  private:
86   // An internal class to help manage property restricts being applied at
87   // different levels.
88   class PendingPropertyRestricts {
89    public:
90     // Add another set of property restricts. Elements of new_restricts that are
91     // not present in active_property_rest
92     void AddValidRestricts(std::set<std::string> new_restricts);
93 
94     // Pops the most recently added set of property restricts.
PopRestricts()95     void PopRestricts() {
96       if (has_active_property_restricts()) {
97         pending_property_restricts_.pop_back();
98       }
99     }
100 
has_active_property_restricts()101     bool has_active_property_restricts() const {
102       return !pending_property_restricts_.empty();
103     }
104 
105     // The set of all property restrictions that are currently being applied.
active_property_restricts()106     const std::set<std::string>& active_property_restricts() const {
107       return pending_property_restricts_.back();
108     }
109 
110    private:
111     std::vector<std::set<std::string>> pending_property_restricts_;
112   };
113 
QueryVisitor(Index * index,const NumericIndex<int64_t> * numeric_index,const EmbeddingIndex * embedding_index,const DocumentStore * document_store,const SchemaStore * schema_store,const Normalizer * normalizer,const Tokenizer * tokenizer,const JoinChildrenFetcher * join_children_fetcher,const SearchSpecProto & search_spec,DocHitInfoIteratorFilter::Options filter_options,bool needs_term_frequency_info,const FeatureFlags * feature_flags,PendingPropertyRestricts pending_property_restricts,bool processing_not,int64_t current_time_ms)114   explicit QueryVisitor(
115       Index* index, const NumericIndex<int64_t>* numeric_index,
116       const EmbeddingIndex* embedding_index,
117       const DocumentStore* document_store, const SchemaStore* schema_store,
118       const Normalizer* normalizer, const Tokenizer* tokenizer,
119       const JoinChildrenFetcher* join_children_fetcher,
120       const SearchSpecProto& search_spec,
121       DocHitInfoIteratorFilter::Options filter_options,
122       bool needs_term_frequency_info, const FeatureFlags* feature_flags,
123       PendingPropertyRestricts pending_property_restricts, bool processing_not,
124       int64_t current_time_ms)
125       : index_(*index),
126         numeric_index_(*numeric_index),
127         embedding_index_(*embedding_index),
128         document_store_(*document_store),
129         schema_store_(*schema_store),
130         normalizer_(*normalizer),
131         tokenizer_(*tokenizer),
132         join_children_fetcher_(join_children_fetcher),
133         search_spec_(search_spec),
134         filter_options_(std::move(filter_options)),
135         needs_term_frequency_info_(needs_term_frequency_info),
136         feature_flags_(*feature_flags),
137         pending_property_restricts_(std::move(pending_property_restricts)),
138         processing_not_(processing_not),
139         expecting_numeric_arg_(false),
140         current_time_ms_(current_time_ms) {
141     RegisterFunctions();
142   }
143 
has_pending_error()144   bool has_pending_error() const { return !pending_error_.ok(); }
145 
146   // Creates a DocHitInfoIterator reflecting the provided term and whether the
147   // prefix operator has been applied to this term. Also populates,
148   // property_query_terms_map_ and query_term_iterators_ as appropriate.
149   // Returns:
150   //   - On success, a DocHitInfoIterator for the provided term
151   //   - INVALID_ARGUMENT if unable to create an iterator for the term.
152   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
153   CreateTermIterator(const QueryTerm& term);
154 
155   // Processes the PendingValue at the top of pending_values_, parses it into a
156   // int64_t and pops the top.
157   // Returns:
158   //   - On success, the int value stored in the text at the top
159   //   - INVALID_ARGUMENT if pending_values_ is empty, doesn't hold a text or
160   //     can't be parsed as an int.
161   libtextclassifier3::StatusOr<int64_t> PopPendingIntValue();
162 
163   // Processes the PendingValue at the top of pending_values_ and pops the top.
164   // Returns:
165   //   - On success, the string value stored in the text at the top and a bool
166   //     indicating whether or not the string value has a prefix operator.
167   //   - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a string.
168   libtextclassifier3::StatusOr<QueryTerm> PopPendingStringValue();
169 
170   // Processes the PendingValue at the top of pending_values_ and pops the top.
171   // Returns:
172   //   - On success, the string value stored in the text at the top
173   //     indicating whether or not the string value has a prefix operator.
174   //   - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a text.
175   libtextclassifier3::StatusOr<QueryTerm> PopPendingTextValue();
176 
177   // Processes the PendingValue at the top of pending_values_ and pops the top.
178   // Returns:
179   //   - On success, a DocHitInfoIterator representing for the term at the top
180   //   - INVALID_ARGUMENT if pending_values_ is empty or if unable to create an
181   //       iterator for the term.
182   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
183   PopPendingIterator();
184 
185   // Processes all PendingValues at the top of pending_values_ until the first
186   // placeholder is encounter.
187   // Returns:
188   //   - On success, a vector containing all DocHitInfoIterators representing
189   //     the values at the top of pending_values_
190   //   - INVALID_ARGUMENT if pending_values_is empty or if unable to create an
191   //       iterator for any of the terms at the top of pending_values_
192   libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>>
193   PopAllPendingIterators();
194 
195   // Processes the TEXT segment within text_value. Processing includes
196   // tokenizing the text, normalizing it and outputting a DocHitIterator that
197   // ANDs all the produced tokens together.
198   // Returns:
199   //   - On success, a DocHitInfoIterator representing the tokenized text from
200   //     text_value
201   //   - INVALID_ARGUMENT if the tokenizer produces more than one token within
202   //       a token group.
203   //   - Any errors that could be produced by Tokenizer::Tokenize.
204   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
205   ProduceTextTokenIterators(QueryTerm text_value);
206 
207   // Processes the unary operator node as a NOT operator. A NOT can have an
208   // operator type of "NOT" or "MINUS"
209   //
210   // RETURNS:
211   //   - OK on success
212   //   - INVALID_ARGUMENT if any errors are encountered while processing
213   //     node->child
214   libtextclassifier3::Status ProcessNotOperator(const UnaryOperatorNode* node);
215 
216   // Processes the unary operator node as a negation operator. A negation
217   // operator should have an operator of type "MINUS" and it's children must
218   // resolve to a numeric value.
219   //
220   // RETURNS:
221   //   - OK on success
222   //   - INVALID_ARGUMENT if the node->child can't be resolved to a numeric
223   //     value.
224   libtextclassifier3::Status ProcessNegationOperator(
225       const UnaryOperatorNode* node);
226 
227   // Processes the NumericComparator represented by node. This must be called
228   // *after* this node's children have been visited. The PendingValues added by
229   // this node's children will be consumed by this function and the PendingValue
230   // for this node will be returned.
231   // Returns:
232   //   - On success, OK
233   //   - INVALID_ARGUMENT if unable to retrieve string value or int value
234   //   - NOT_FOUND if there is no entry in the numeric index for the property
235   libtextclassifier3::Status ProcessNumericComparator(
236       const NaryOperatorNode* node);
237 
238   // Processes the AND and OR operators represented by the node. This must be
239   // called *after* this node's children have been visited. The PendingValues
240   // added by this node's children will be consumed by this function and the
241   // PendingValue for this node will be returned.
242   // Returns:
243   //   - On success, then PendingValue representing this node and it's children.
244   //   - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's
245   //       children.
246   libtextclassifier3::StatusOr<PendingValue> ProcessAndOperator(
247       const NaryOperatorNode* node);
248 
249   // Processes the OR operator represented by the node. This must be called
250   // *after* this node's children have been visited. The PendingValues added by
251   // this node's children will be consumed by this function and the PendingValue
252   // for this node will be returned.
253   // Returns:
254   //   - On success, then PendingValue representing this node and it's children.
255   //   - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's
256   //       children.
257   libtextclassifier3::StatusOr<PendingValue> ProcessOrOperator(
258       const NaryOperatorNode* node);
259 
260   // Populates registered_functions with the currently supported set of
261   // functions.
262   void RegisterFunctions();
263 
264   // Implementation of `search` custom function in the query language.
265   // Returns:
266   //   - a PendingValue holding the DocHitInfoIterator reflecting the query
267   //     provided to SearchFunction
268   //   - any errors returned by Lexer::ExtractTokens, Parser::ConsumeQuery or
269   //     QueryVisitor::ConsumeResults.
270   libtextclassifier3::StatusOr<PendingValue> SearchFunction(
271       std::vector<PendingValue>&& args);
272 
273   // Implementation of the propertyDefined(property_path) custom function.
274   // Returns:
275   //   - a Pending Value holding a DocHitIterator that returns hits for all
276   //     documents whose schema types have defined the property specified by
277   //     property_path.
278   //   - any errors returned by Lexer::ExtractTokens
279   libtextclassifier3::StatusOr<PendingValue> PropertyDefinedFunction(
280       std::vector<PendingValue>&& args);
281 
282   // Implementation of the hasProperty(property_path) custom function.
283   // Returns:
284   //   - a Pending Value holding a DocHitIterator that returns hits for all
285   //     documents that have the property specified by property_path.
286   //   - any errors returned by Lexer::ExtractTokens
287   libtextclassifier3::StatusOr<PendingValue> HasPropertyFunction(
288       std::vector<PendingValue>&& args);
289 
290   // Implementation of the semanticSearch(vector, low, high, metric) custom
291   // function. This function is used for supporting vector search with a
292   // syntax like `semanticSearch(getEmbeddingParameter(0), 0.5, 1, "COSINE")`.
293   //
294   // low, high, metric are optional parameters:
295   //   - low is default to negative infinity
296   //   - high is default to positive infinity
297   //   - metric is default to the metric specified in SearchSpec
298   //
299   // Returns:
300   //   - a Pending Value of type DocHitIterator that returns all documents with
301   //     an embedding vector that has a score within [low, high].
302   //   - OUT_OF_RANGE if index provided to getEmbeddingParameter is out of
303   //     bounds of SearchSpec.embedding_query_vectors()
304   //   - any errors returned by Lexer::ExtractTokens
305   libtextclassifier3::StatusOr<PendingValue> SemanticSearchFunction(
306       std::vector<PendingValue>&& args);
307 
308   // Implementation of the getSearchStringParameter(index) custom function.
309   // Retrieves the parameterized string stored at
310   // SearchSpec.query_parameter_strings(index).
311   //
312   // Returns:
313   //   - a Pending Value holding a DocHitIterator that returns hits for all
314   //     documents containing the normalized tokens present in the string.
315   //   - OUT_OF_RANGE if index is out of bounds of
316   //     SearchSpec.query_parameter_strings()
317   //   - any errors returned by ProduceTextTokenIterators
318   libtextclassifier3::StatusOr<PendingValue> GetSearchStringParameterFunction(
319       std::vector<PendingValue>&& args);
320 
321   // Implementation of the matchScoreExpression(scoring_expression, low, high)
322   // custom function.
323   //
324   // high is an optional parameter, which defaults to positive infinity.
325   //
326   // Returns:
327   //   - a Pending Value of type DocHitIterator that returns all documents with
328   //     a score within [low, high] based on the provided scoring expression.
329   //     Documents that cause an evaluation error during scoring are ignored.
330   //   - INVALID_ARGUMENT if low is greater than high.
331   //   - any errors returned by score_expression_util::GetScoreExpression.
332   libtextclassifier3::StatusOr<PendingValue> MatchScoreExpressionFunction(
333       std::vector<PendingValue>&& args);
334 
335   // Handles a NaryOperatorNode where the operator is HAS (':') and pushes an
336   // iterator with the proper section filter applied. If the current property
337   // restriction represented by pending_property_restricts and the first child
338   // of this node is unsatisfiable (ex. `prop1:(prop2:foo)`), then a NONE
339   // iterator is returned immediately and subtree represented by the second
340   // child is not traversed.
341   //
342   // Returns:
343   //  - OK on success
344   //  - INVALID_ARGUMENT node does not have exactly two children or the two
345   //    children cannot be resolved to a MEMBER or an iterator respectively.
346   libtextclassifier3::Status ProcessHasOperator(const NaryOperatorNode* node);
347 
348   // Returns the correct match type to apply based on both the match type and
349   // whether the prefix operator is currently present.
GetTermMatchType(bool is_prefix)350   TermMatchType::Code GetTermMatchType(bool is_prefix) const {
351     return (is_prefix) ? TermMatchType::PREFIX : search_spec_.term_match_type();
352   }
353 
354   std::stack<PendingValue> pending_values_;
355   libtextclassifier3::Status pending_error_;
356 
357   // A map from function name to Function instance.
358   std::unordered_map<std::string, Function> registered_functions_;
359 
360   SectionRestrictQueryTermsMap property_query_terms_map_;
361 
362   QueryTermIteratorsMap query_term_iterators_;
363 
364   EmbeddingQueryResults embedding_query_results_;
365 
366   // Set of features invoked in the query.
367   std::unordered_set<Feature> features_;
368 
369   Index& index_;                                // Does not own!
370   const NumericIndex<int64_t>& numeric_index_;  // Does not own!
371   const EmbeddingIndex& embedding_index_;       // Does not own!
372   const DocumentStore& document_store_;         // Does not own!
373   const SchemaStore& schema_store_;             // Does not own!
374   const Normalizer& normalizer_;                // Does not own!
375   const Tokenizer& tokenizer_;                  // Does not own!
376 
377   // Nullable. A non-null join_children_fetcher_ indicates that this is the
378   // parent query for a join query, in which case child scores are available.
379   const JoinChildrenFetcher* join_children_fetcher_;  // Does not own.
380 
381   const SearchSpecProto& search_spec_;
382 
383   DocHitInfoIteratorFilter::Options filter_options_;
384 
385   // Whether or not term_frequency information is needed. This affects:
386   //  - how DocHitInfoIteratorTerms are constructed
387   //  - whether the QueryTermIteratorsMap is populated in the QueryResults.
388   bool needs_term_frequency_info_;
389 
390   const FeatureFlags& feature_flags_;  // Does not own.
391   // TODO(b/377215223): Pass enabled scoring features from top level.
392   std::unordered_set<ScoringFeatureType> scoring_feature_types_enabled_;
393 
394   // The stack of property restricts currently being processed by the visitor.
395   PendingPropertyRestricts pending_property_restricts_;
396   bool processing_not_;
397 
398   // Whether we are in the midst of processing a subtree that is expected to
399   // resolve to a numeric argument.
400   bool expecting_numeric_arg_;
401 
402   int64_t current_time_ms_;
403 };
404 
405 }  // namespace lib
406 }  // namespace icing
407 
408 #endif  // ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
409