xref: /aosp_15_r20/external/icing/icing/query/advanced_query_parser/parser.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/query/advanced_query_parser/parser.h"
16 
17 #include <memory>
18 #include <string_view>
19 
20 #include "icing/absl_ports/canonical_errors.h"
21 #include "icing/legacy/core/icing-string-util.h"
22 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
23 #include "icing/util/status-macros.h"
24 
25 namespace icing {
26 namespace lib {
27 
28 namespace {
29 
CreateNaryNode(std::string_view operator_text,std::vector<std::unique_ptr<Node>> && operands)30 std::unique_ptr<Node> CreateNaryNode(
31     std::string_view operator_text,
32     std::vector<std::unique_ptr<Node>>&& operands) {
33   if (operands.empty()) {
34     return nullptr;
35   }
36   if (operands.size() == 1) {
37     return std::move(operands.at(0));
38   }
39   return std::make_unique<NaryOperatorNode>(std::string(operator_text),
40                                             std::move(operands));
41 }
42 
43 }  // namespace
44 
Consume(Lexer::TokenType token_type)45 libtextclassifier3::Status Parser::Consume(Lexer::TokenType token_type) {
46   if (!Match(token_type)) {
47     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
48         "Unable to consume token %d.", static_cast<int>(token_type)));
49   }
50   ++current_token_;
51   return libtextclassifier3::Status::OK;
52 }
53 
ConsumeText()54 libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() {
55   if (!Match(Lexer::TokenType::TEXT)) {
56     return absl_ports::InvalidArgumentError("Unable to consume token as TEXT.");
57   }
58   auto text_node = std::make_unique<TextNode>(std::move(current_token_->text),
59                                               current_token_->original_text);
60   ++current_token_;
61   return text_node;
62 }
63 
ConsumeFunctionName()64 libtextclassifier3::StatusOr<std::string> Parser::ConsumeFunctionName() {
65   if (!Match(Lexer::TokenType::FUNCTION_NAME)) {
66     return absl_ports::InvalidArgumentError(
67         "Unable to consume token as FUNCTION_NAME.");
68   }
69   std::string function_name = std::move(current_token_->text);
70   ++current_token_;
71   return function_name;
72 }
73 
74 // stringElement
75 //    : STRING STAR?
76 libtextclassifier3::StatusOr<std::unique_ptr<StringNode>>
ConsumeStringElement()77 Parser::ConsumeStringElement() {
78   if (!Match(Lexer::TokenType::STRING)) {
79     return absl_ports::InvalidArgumentError(
80         "Unable to consume token as STRING.");
81   }
82   std::string text = std::move(current_token_->text);
83   std::string_view raw_text = current_token_->original_text;
84   ++current_token_;
85 
86   bool is_prefix = false;
87   if (Match(Lexer::TokenType::STAR)) {
88     is_prefix = true;
89     ++current_token_;
90   }
91 
92   return std::make_unique<StringNode>(std::move(text), raw_text, is_prefix);
93 }
94 
ConsumeComparator()95 libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() {
96   if (!Match(Lexer::TokenType::COMPARATOR)) {
97     return absl_ports::InvalidArgumentError(
98         "Unable to consume token as COMPARATOR.");
99   }
100   std::string comparator = std::move(current_token_->text);
101   ++current_token_;
102   return comparator;
103 }
104 
105 // member
106 //    :  TEXT (DOT TEXT)* (DOT function)?
107 //    |  TEXT STAR
108 //    ;
109 libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>>
ConsumeMember()110 Parser::ConsumeMember() {
111   ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText());
112   std::vector<std::unique_ptr<TextNode>> children;
113 
114   // Member could be either `TEXT (DOT TEXT)* (DOT function)?` or `TEXT STAR`
115   // at this point. So check for 'STAR' to differentiate the two cases.
116   if (Match(Lexer::TokenType::STAR)) {
117     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::STAR));
118     std::string_view raw_text = text_node->raw_value();
119     std::string text = std::move(*text_node).value();
120     text_node = std::make_unique<TextNode>(std::move(text), raw_text,
121                                            /*is_prefix=*/true);
122     children.push_back(std::move(text_node));
123   } else {
124     children.push_back(std::move(text_node));
125     while (Match(Lexer::TokenType::DOT)) {
126       ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::DOT));
127       if (MatchFunction()) {
128         ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node,
129                                ConsumeFunction());
130         // Once a function is matched, we should exit the current rule based on
131         // the grammar.
132         return std::make_unique<MemberNode>(std::move(children),
133                                             std::move(function_node));
134       }
135       ICING_ASSIGN_OR_RETURN(text_node, ConsumeText());
136       children.push_back(std::move(text_node));
137     }
138   }
139   return std::make_unique<MemberNode>(std::move(children),
140                                       /*function=*/nullptr);
141 }
142 
143 // function
144 //    : FUNCTION_NAME LPAREN argList? RPAREN
145 //    ;
146 libtextclassifier3::StatusOr<std::unique_ptr<FunctionNode>>
ConsumeFunction()147 Parser::ConsumeFunction() {
148   ICING_ASSIGN_OR_RETURN(std::string function_name, ConsumeFunctionName());
149   ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
150 
151   std::vector<std::unique_ptr<Node>> args;
152   if (Match(Lexer::TokenType::RPAREN)) {
153     // Got empty argument.
154     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
155   } else {
156     ICING_ASSIGN_OR_RETURN(args, ConsumeArgs());
157     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
158   }
159   return std::make_unique<FunctionNode>(std::move(function_name),
160                                         std::move(args));
161 }
162 
163 // comparable
164 //     : stringElement
165 //     | member
166 //     | function
167 //     ;
168 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeComparable()169 Parser::ConsumeComparable() {
170   if (Match(Lexer::TokenType::STRING)) {
171     return ConsumeStringElement();
172   } else if (MatchMember()) {
173     return ConsumeMember();
174   }
175   // The current token sequence isn't a STRING or member. Therefore, it must be
176   // a function.
177   return ConsumeFunction();
178 }
179 
180 // composite
181 //    : LPAREN expression RPAREN
182 //    ;
ConsumeComposite()183 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeComposite() {
184   ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
185 
186   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> expression, ConsumeExpression());
187 
188   ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
189   return expression;
190 }
191 
192 // argList
193 //    : expression (COMMA expression)*
194 //    ;
195 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<Node>>>
ConsumeArgs()196 Parser::ConsumeArgs() {
197   std::vector<std::unique_ptr<Node>> args;
198   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> arg, ConsumeExpression());
199   args.push_back(std::move(arg));
200   while (Match(Lexer::TokenType::COMMA)) {
201     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::COMMA));
202     ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression());
203     args.push_back(std::move(arg));
204   }
205   return args;
206 }
207 
208 // restriction
209 //     : comparable (COMPARATOR MINUS? (comparable | composite))?
210 //     ;
211 // COMPARATOR will not be produced in Scoring Lexer.
212 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeRestriction()213 Parser::ConsumeRestriction() {
214   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> comparable, ConsumeComparable());
215 
216   if (!Match(Lexer::TokenType::COMPARATOR)) {
217     return comparable;
218   }
219   ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator());
220 
221   bool has_minus = Match(Lexer::TokenType::MINUS);
222   if (has_minus) {
223     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
224   }
225 
226   std::unique_ptr<Node> arg;
227   if (MatchComposite()) {
228     ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite());
229   } else if (MatchComparable()) {
230     ICING_ASSIGN_OR_RETURN(arg, ConsumeComparable());
231   } else {
232     return absl_ports::InvalidArgumentError(
233         "ARG: must begin with LPAREN or FIRST(comparable)");
234   }
235 
236   if (has_minus) {
237     arg = std::make_unique<UnaryOperatorNode>("MINUS", std::move(arg));
238   }
239 
240   std::vector<std::unique_ptr<Node>> args;
241   args.push_back(std::move(comparable));
242   args.push_back(std::move(arg));
243   return std::make_unique<NaryOperatorNode>(std::move(operator_text),
244                                             std::move(args));
245 }
246 
247 // simple
248 //     : restriction
249 //     | composite
250 //     ;
ConsumeSimple()251 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSimple() {
252   if (MatchComposite()) {
253     return ConsumeComposite();
254   } else if (MatchRestriction()) {
255     return ConsumeRestriction();
256   }
257   return absl_ports::InvalidArgumentError(
258       "SIMPLE: must be a restriction or composite");
259 }
260 
261 // term
262 //     : NOT? simple
263 //     | MINUS simple
264 //     ;
265 // NOT will not be produced in Scoring Lexer.
ConsumeTerm()266 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() {
267   if (!Match(Lexer::TokenType::NOT) && !Match(Lexer::TokenType::MINUS)) {
268     return ConsumeSimple();
269   }
270   std::string operator_text;
271   if (language_ == Lexer::Language::SCORING) {
272     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
273     operator_text = "MINUS";
274   } else {
275     if (Match(Lexer::TokenType::NOT)) {
276       ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::NOT));
277       operator_text = "NOT";
278     } else {
279       ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
280       operator_text = "MINUS";
281     }
282   }
283   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple());
284   return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple));
285 }
286 
287 // factor
288 //     : term (OR term)*
289 //     ;
ConsumeFactor()290 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeFactor() {
291   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> term, ConsumeTerm());
292   std::vector<std::unique_ptr<Node>> terms;
293   terms.push_back(std::move(term));
294 
295   while (Match(Lexer::TokenType::OR)) {
296     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::OR));
297     ICING_ASSIGN_OR_RETURN(term, ConsumeTerm());
298     terms.push_back(std::move(term));
299   }
300 
301   return CreateNaryNode("OR", std::move(terms));
302 }
303 
304 // sequence
305 //    : (factor)+
306 //    ;
ConsumeSequence()307 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSequence() {
308   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> factor, ConsumeFactor());
309   std::vector<std::unique_ptr<Node>> factors;
310   factors.push_back(std::move(factor));
311 
312   while (MatchFactor()) {
313     ICING_ASSIGN_OR_RETURN(factor, ConsumeFactor());
314     factors.push_back(std::move(factor));
315   }
316 
317   return CreateNaryNode("AND", std::move(factors));
318 }
319 
320 // expression
321 //    : sequence (AND sequence)*
322 //    ;
323 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeQueryExpression()324 Parser::ConsumeQueryExpression() {
325   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> sequence, ConsumeSequence());
326   std::vector<std::unique_ptr<Node>> sequences;
327   sequences.push_back(std::move(sequence));
328 
329   while (Match(Lexer::TokenType::AND)) {
330     ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::AND));
331     ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence());
332     sequences.push_back(std::move(sequence));
333   }
334 
335   return CreateNaryNode("AND", std::move(sequences));
336 }
337 
338 // multExpr
339 //     : term ((TIMES | DIV) term)*
340 //     ;
ConsumeMultExpr()341 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() {
342   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeTerm());
343   std::vector<std::unique_ptr<Node>> stack;
344   stack.push_back(std::move(node));
345 
346   while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) {
347     while (Match(Lexer::TokenType::TIMES)) {
348       ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::TIMES));
349       ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
350       stack.push_back(std::move(node));
351     }
352     node = CreateNaryNode("TIMES", std::move(stack));
353     stack.clear();
354     stack.push_back(std::move(node));
355 
356     while (Match(Lexer::TokenType::DIV)) {
357       ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::DIV));
358       ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
359       stack.push_back(std::move(node));
360     }
361     node = CreateNaryNode("DIV", std::move(stack));
362     stack.clear();
363     stack.push_back(std::move(node));
364   }
365 
366   return std::move(stack[0]);
367 }
368 
369 // expression
370 //    : multExpr ((PLUS | MINUS) multExpr)*
371 //    ;
372 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeScoringExpression()373 Parser::ConsumeScoringExpression() {
374   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeMultExpr());
375   std::vector<std::unique_ptr<Node>> stack;
376   stack.push_back(std::move(node));
377 
378   while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) {
379     while (Match(Lexer::TokenType::PLUS)) {
380       ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::PLUS));
381       ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
382       stack.push_back(std::move(node));
383     }
384     node = CreateNaryNode("PLUS", std::move(stack));
385     stack.clear();
386     stack.push_back(std::move(node));
387 
388     while (Match(Lexer::TokenType::MINUS)) {
389       ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
390       ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
391       stack.push_back(std::move(node));
392     }
393     node = CreateNaryNode("MINUS", std::move(stack));
394     stack.clear();
395     stack.push_back(std::move(node));
396   }
397 
398   return std::move(stack[0]);
399 }
400 
401 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeExpression()402 Parser::ConsumeExpression() {
403   switch (language_) {
404     case Lexer::Language::QUERY:
405       return ConsumeQueryExpression();
406     case Lexer::Language::SCORING:
407       return ConsumeScoringExpression();
408   }
409 }
410 
411 // query
412 //     : expression? EOF
413 //     ;
ConsumeQuery()414 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeQuery() {
415   language_ = Lexer::Language::QUERY;
416   std::unique_ptr<Node> node;
417   if (current_token_ != lexer_tokens_.end()) {
418     ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
419   }
420   if (current_token_ != lexer_tokens_.end()) {
421     return absl_ports::InvalidArgumentError(
422         "Error parsing Query. Must reach EOF after parsing Expression!");
423   }
424   return node;
425 }
426 
427 // scoring
428 //     : expression EOF
429 //     ;
ConsumeScoring()430 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeScoring() {
431   language_ = Lexer::Language::SCORING;
432   std::unique_ptr<Node> node;
433   if (current_token_ == lexer_tokens_.end()) {
434     return absl_ports::InvalidArgumentError("Got empty scoring expression!");
435   }
436   ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
437   if (current_token_ != lexer_tokens_.end()) {
438     return absl_ports::InvalidArgumentError(
439         "Error parsing the scoring expression. Must reach EOF after parsing "
440         "Expression!");
441   }
442   return node;
443 }
444 
445 }  // namespace lib
446 }  // namespace icing
447