1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/query/advanced_query_parser/parser.h"
16
17 #include <memory>
18 #include <string_view>
19
20 #include "icing/absl_ports/canonical_errors.h"
21 #include "icing/legacy/core/icing-string-util.h"
22 #include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
23 #include "icing/util/status-macros.h"
24
25 namespace icing {
26 namespace lib {
27
28 namespace {
29
CreateNaryNode(std::string_view operator_text,std::vector<std::unique_ptr<Node>> && operands)30 std::unique_ptr<Node> CreateNaryNode(
31 std::string_view operator_text,
32 std::vector<std::unique_ptr<Node>>&& operands) {
33 if (operands.empty()) {
34 return nullptr;
35 }
36 if (operands.size() == 1) {
37 return std::move(operands.at(0));
38 }
39 return std::make_unique<NaryOperatorNode>(std::string(operator_text),
40 std::move(operands));
41 }
42
43 } // namespace
44
Consume(Lexer::TokenType token_type)45 libtextclassifier3::Status Parser::Consume(Lexer::TokenType token_type) {
46 if (!Match(token_type)) {
47 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
48 "Unable to consume token %d.", static_cast<int>(token_type)));
49 }
50 ++current_token_;
51 return libtextclassifier3::Status::OK;
52 }
53
ConsumeText()54 libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() {
55 if (!Match(Lexer::TokenType::TEXT)) {
56 return absl_ports::InvalidArgumentError("Unable to consume token as TEXT.");
57 }
58 auto text_node = std::make_unique<TextNode>(std::move(current_token_->text),
59 current_token_->original_text);
60 ++current_token_;
61 return text_node;
62 }
63
ConsumeFunctionName()64 libtextclassifier3::StatusOr<std::string> Parser::ConsumeFunctionName() {
65 if (!Match(Lexer::TokenType::FUNCTION_NAME)) {
66 return absl_ports::InvalidArgumentError(
67 "Unable to consume token as FUNCTION_NAME.");
68 }
69 std::string function_name = std::move(current_token_->text);
70 ++current_token_;
71 return function_name;
72 }
73
74 // stringElement
75 // : STRING STAR?
76 libtextclassifier3::StatusOr<std::unique_ptr<StringNode>>
ConsumeStringElement()77 Parser::ConsumeStringElement() {
78 if (!Match(Lexer::TokenType::STRING)) {
79 return absl_ports::InvalidArgumentError(
80 "Unable to consume token as STRING.");
81 }
82 std::string text = std::move(current_token_->text);
83 std::string_view raw_text = current_token_->original_text;
84 ++current_token_;
85
86 bool is_prefix = false;
87 if (Match(Lexer::TokenType::STAR)) {
88 is_prefix = true;
89 ++current_token_;
90 }
91
92 return std::make_unique<StringNode>(std::move(text), raw_text, is_prefix);
93 }
94
ConsumeComparator()95 libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() {
96 if (!Match(Lexer::TokenType::COMPARATOR)) {
97 return absl_ports::InvalidArgumentError(
98 "Unable to consume token as COMPARATOR.");
99 }
100 std::string comparator = std::move(current_token_->text);
101 ++current_token_;
102 return comparator;
103 }
104
105 // member
106 // : TEXT (DOT TEXT)* (DOT function)?
107 // | TEXT STAR
108 // ;
109 libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>>
ConsumeMember()110 Parser::ConsumeMember() {
111 ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText());
112 std::vector<std::unique_ptr<TextNode>> children;
113
114 // Member could be either `TEXT (DOT TEXT)* (DOT function)?` or `TEXT STAR`
115 // at this point. So check for 'STAR' to differentiate the two cases.
116 if (Match(Lexer::TokenType::STAR)) {
117 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::STAR));
118 std::string_view raw_text = text_node->raw_value();
119 std::string text = std::move(*text_node).value();
120 text_node = std::make_unique<TextNode>(std::move(text), raw_text,
121 /*is_prefix=*/true);
122 children.push_back(std::move(text_node));
123 } else {
124 children.push_back(std::move(text_node));
125 while (Match(Lexer::TokenType::DOT)) {
126 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::DOT));
127 if (MatchFunction()) {
128 ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node,
129 ConsumeFunction());
130 // Once a function is matched, we should exit the current rule based on
131 // the grammar.
132 return std::make_unique<MemberNode>(std::move(children),
133 std::move(function_node));
134 }
135 ICING_ASSIGN_OR_RETURN(text_node, ConsumeText());
136 children.push_back(std::move(text_node));
137 }
138 }
139 return std::make_unique<MemberNode>(std::move(children),
140 /*function=*/nullptr);
141 }
142
143 // function
144 // : FUNCTION_NAME LPAREN argList? RPAREN
145 // ;
146 libtextclassifier3::StatusOr<std::unique_ptr<FunctionNode>>
ConsumeFunction()147 Parser::ConsumeFunction() {
148 ICING_ASSIGN_OR_RETURN(std::string function_name, ConsumeFunctionName());
149 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
150
151 std::vector<std::unique_ptr<Node>> args;
152 if (Match(Lexer::TokenType::RPAREN)) {
153 // Got empty argument.
154 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
155 } else {
156 ICING_ASSIGN_OR_RETURN(args, ConsumeArgs());
157 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
158 }
159 return std::make_unique<FunctionNode>(std::move(function_name),
160 std::move(args));
161 }
162
163 // comparable
164 // : stringElement
165 // | member
166 // | function
167 // ;
168 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeComparable()169 Parser::ConsumeComparable() {
170 if (Match(Lexer::TokenType::STRING)) {
171 return ConsumeStringElement();
172 } else if (MatchMember()) {
173 return ConsumeMember();
174 }
175 // The current token sequence isn't a STRING or member. Therefore, it must be
176 // a function.
177 return ConsumeFunction();
178 }
179
180 // composite
181 // : LPAREN expression RPAREN
182 // ;
ConsumeComposite()183 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeComposite() {
184 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
185
186 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> expression, ConsumeExpression());
187
188 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
189 return expression;
190 }
191
192 // argList
193 // : expression (COMMA expression)*
194 // ;
195 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<Node>>>
ConsumeArgs()196 Parser::ConsumeArgs() {
197 std::vector<std::unique_ptr<Node>> args;
198 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> arg, ConsumeExpression());
199 args.push_back(std::move(arg));
200 while (Match(Lexer::TokenType::COMMA)) {
201 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::COMMA));
202 ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression());
203 args.push_back(std::move(arg));
204 }
205 return args;
206 }
207
208 // restriction
209 // : comparable (COMPARATOR MINUS? (comparable | composite))?
210 // ;
211 // COMPARATOR will not be produced in Scoring Lexer.
212 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeRestriction()213 Parser::ConsumeRestriction() {
214 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> comparable, ConsumeComparable());
215
216 if (!Match(Lexer::TokenType::COMPARATOR)) {
217 return comparable;
218 }
219 ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator());
220
221 bool has_minus = Match(Lexer::TokenType::MINUS);
222 if (has_minus) {
223 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
224 }
225
226 std::unique_ptr<Node> arg;
227 if (MatchComposite()) {
228 ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite());
229 } else if (MatchComparable()) {
230 ICING_ASSIGN_OR_RETURN(arg, ConsumeComparable());
231 } else {
232 return absl_ports::InvalidArgumentError(
233 "ARG: must begin with LPAREN or FIRST(comparable)");
234 }
235
236 if (has_minus) {
237 arg = std::make_unique<UnaryOperatorNode>("MINUS", std::move(arg));
238 }
239
240 std::vector<std::unique_ptr<Node>> args;
241 args.push_back(std::move(comparable));
242 args.push_back(std::move(arg));
243 return std::make_unique<NaryOperatorNode>(std::move(operator_text),
244 std::move(args));
245 }
246
247 // simple
248 // : restriction
249 // | composite
250 // ;
ConsumeSimple()251 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSimple() {
252 if (MatchComposite()) {
253 return ConsumeComposite();
254 } else if (MatchRestriction()) {
255 return ConsumeRestriction();
256 }
257 return absl_ports::InvalidArgumentError(
258 "SIMPLE: must be a restriction or composite");
259 }
260
261 // term
262 // : NOT? simple
263 // | MINUS simple
264 // ;
265 // NOT will not be produced in Scoring Lexer.
ConsumeTerm()266 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() {
267 if (!Match(Lexer::TokenType::NOT) && !Match(Lexer::TokenType::MINUS)) {
268 return ConsumeSimple();
269 }
270 std::string operator_text;
271 if (language_ == Lexer::Language::SCORING) {
272 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
273 operator_text = "MINUS";
274 } else {
275 if (Match(Lexer::TokenType::NOT)) {
276 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::NOT));
277 operator_text = "NOT";
278 } else {
279 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
280 operator_text = "MINUS";
281 }
282 }
283 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple());
284 return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple));
285 }
286
287 // factor
288 // : term (OR term)*
289 // ;
ConsumeFactor()290 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeFactor() {
291 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> term, ConsumeTerm());
292 std::vector<std::unique_ptr<Node>> terms;
293 terms.push_back(std::move(term));
294
295 while (Match(Lexer::TokenType::OR)) {
296 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::OR));
297 ICING_ASSIGN_OR_RETURN(term, ConsumeTerm());
298 terms.push_back(std::move(term));
299 }
300
301 return CreateNaryNode("OR", std::move(terms));
302 }
303
304 // sequence
305 // : (factor)+
306 // ;
ConsumeSequence()307 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSequence() {
308 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> factor, ConsumeFactor());
309 std::vector<std::unique_ptr<Node>> factors;
310 factors.push_back(std::move(factor));
311
312 while (MatchFactor()) {
313 ICING_ASSIGN_OR_RETURN(factor, ConsumeFactor());
314 factors.push_back(std::move(factor));
315 }
316
317 return CreateNaryNode("AND", std::move(factors));
318 }
319
320 // expression
321 // : sequence (AND sequence)*
322 // ;
323 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeQueryExpression()324 Parser::ConsumeQueryExpression() {
325 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> sequence, ConsumeSequence());
326 std::vector<std::unique_ptr<Node>> sequences;
327 sequences.push_back(std::move(sequence));
328
329 while (Match(Lexer::TokenType::AND)) {
330 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::AND));
331 ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence());
332 sequences.push_back(std::move(sequence));
333 }
334
335 return CreateNaryNode("AND", std::move(sequences));
336 }
337
338 // multExpr
339 // : term ((TIMES | DIV) term)*
340 // ;
ConsumeMultExpr()341 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() {
342 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeTerm());
343 std::vector<std::unique_ptr<Node>> stack;
344 stack.push_back(std::move(node));
345
346 while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) {
347 while (Match(Lexer::TokenType::TIMES)) {
348 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::TIMES));
349 ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
350 stack.push_back(std::move(node));
351 }
352 node = CreateNaryNode("TIMES", std::move(stack));
353 stack.clear();
354 stack.push_back(std::move(node));
355
356 while (Match(Lexer::TokenType::DIV)) {
357 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::DIV));
358 ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
359 stack.push_back(std::move(node));
360 }
361 node = CreateNaryNode("DIV", std::move(stack));
362 stack.clear();
363 stack.push_back(std::move(node));
364 }
365
366 return std::move(stack[0]);
367 }
368
369 // expression
370 // : multExpr ((PLUS | MINUS) multExpr)*
371 // ;
372 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeScoringExpression()373 Parser::ConsumeScoringExpression() {
374 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeMultExpr());
375 std::vector<std::unique_ptr<Node>> stack;
376 stack.push_back(std::move(node));
377
378 while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) {
379 while (Match(Lexer::TokenType::PLUS)) {
380 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::PLUS));
381 ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
382 stack.push_back(std::move(node));
383 }
384 node = CreateNaryNode("PLUS", std::move(stack));
385 stack.clear();
386 stack.push_back(std::move(node));
387
388 while (Match(Lexer::TokenType::MINUS)) {
389 ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
390 ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
391 stack.push_back(std::move(node));
392 }
393 node = CreateNaryNode("MINUS", std::move(stack));
394 stack.clear();
395 stack.push_back(std::move(node));
396 }
397
398 return std::move(stack[0]);
399 }
400
401 libtextclassifier3::StatusOr<std::unique_ptr<Node>>
ConsumeExpression()402 Parser::ConsumeExpression() {
403 switch (language_) {
404 case Lexer::Language::QUERY:
405 return ConsumeQueryExpression();
406 case Lexer::Language::SCORING:
407 return ConsumeScoringExpression();
408 }
409 }
410
411 // query
412 // : expression? EOF
413 // ;
ConsumeQuery()414 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeQuery() {
415 language_ = Lexer::Language::QUERY;
416 std::unique_ptr<Node> node;
417 if (current_token_ != lexer_tokens_.end()) {
418 ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
419 }
420 if (current_token_ != lexer_tokens_.end()) {
421 return absl_ports::InvalidArgumentError(
422 "Error parsing Query. Must reach EOF after parsing Expression!");
423 }
424 return node;
425 }
426
427 // scoring
428 // : expression EOF
429 // ;
ConsumeScoring()430 libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeScoring() {
431 language_ = Lexer::Language::SCORING;
432 std::unique_ptr<Node> node;
433 if (current_token_ == lexer_tokens_.end()) {
434 return absl_ports::InvalidArgumentError("Got empty scoring expression!");
435 }
436 ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
437 if (current_token_ != lexer_tokens_.end()) {
438 return absl_ports::InvalidArgumentError(
439 "Error parsing the scoring expression. Must reach EOF after parsing "
440 "Expression!");
441 }
442 return node;
443 }
444
445 } // namespace lib
446 } // namespace icing
447