xref: /aosp_15_r20/external/icing/icing/query/advanced_query_parser/lexer.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/query/advanced_query_parser/lexer.h"
16 
17 #include <string>
18 #include <utility>
19 
20 #include "icing/absl_ports/canonical_errors.h"
21 #include "icing/absl_ports/str_cat.h"
22 #include "icing/util/i18n-utils.h"
23 
24 namespace icing {
25 namespace lib {
26 
ConsumeWhitespace()27 bool Lexer::ConsumeWhitespace() {
28   if (current_char_ == '\0') {
29     return false;
30   }
31   if (i18n_utils::IsWhitespaceAt(query_, current_index_)) {
32     UChar32 uchar32 = i18n_utils::GetUChar32At(query_.data(), query_.length(),
33                                                current_index_);
34     int length = i18n_utils::GetUtf8Length(uchar32);
35     Advance(length);
36     return true;
37   }
38   return false;
39 }
40 
ConsumeQuerySingleChar()41 bool Lexer::ConsumeQuerySingleChar() {
42   std::string_view original_text = query_.substr(current_index_, 1);
43   switch (current_char_) {
44     case ':':
45       tokens_.push_back({":", original_text, TokenType::COMPARATOR});
46       break;
47     case '*':
48       tokens_.push_back({"", original_text, TokenType::STAR});
49       break;
50     case '-':
51       if (in_text_) {
52         // MINUS ('-') is considered to be a part of a text segment if it is
53         // in the middle of a TEXT segment (ex. `foo-bar`).
54         return false;
55       }
56       tokens_.push_back({"", original_text, TokenType::MINUS});
57       break;
58     default:
59       return false;
60   }
61   Advance();
62   return true;
63 }
64 
ConsumeScoringSingleChar()65 bool Lexer::ConsumeScoringSingleChar() {
66   std::string_view original_text = query_.substr(current_index_, 1);
67   switch (current_char_) {
68     case '+':
69       tokens_.push_back({"", original_text, TokenType::PLUS});
70       break;
71     case '*':
72       tokens_.push_back({"", original_text, TokenType::TIMES});
73       break;
74     case '/':
75       tokens_.push_back({"", original_text, TokenType::DIV});
76       break;
77     case '-':
78       tokens_.push_back({"", original_text, TokenType::MINUS});
79       break;
80     default:
81       return false;
82   }
83   Advance();
84   return true;
85 }
86 
ConsumeGeneralSingleChar()87 bool Lexer::ConsumeGeneralSingleChar() {
88   std::string_view original_text = query_.substr(current_index_, 1);
89   switch (current_char_) {
90     case ',':
91       tokens_.push_back({"", original_text, TokenType::COMMA});
92       break;
93     case '.':
94       tokens_.push_back({"", original_text, TokenType::DOT});
95       break;
96     case '(':
97       tokens_.push_back({"", original_text, TokenType::LPAREN});
98       break;
99     case ')':
100       tokens_.push_back({"", original_text, TokenType::RPAREN});
101       break;
102     default:
103       return false;
104   }
105   Advance();
106   return true;
107 }
108 
ConsumeSingleChar()109 bool Lexer::ConsumeSingleChar() {
110   if (language_ == Language::QUERY) {
111     if (ConsumeQuerySingleChar()) {
112       return true;
113     }
114   } else if (language_ == Language::SCORING) {
115     if (ConsumeScoringSingleChar()) {
116       return true;
117     }
118   }
119   return ConsumeGeneralSingleChar();
120 }
121 
ConsumeComparator()122 bool Lexer::ConsumeComparator() {
123   if (current_char_ != '<' && current_char_ != '>' && current_char_ != '!' &&
124       current_char_ != '=') {
125     return false;
126   }
127   // Now, current_char_ must be one of '<', '>', '!', or '='.
128   // Matching for '<=', '>=', '!=', or '=='.
129   char next_char = PeekNext(1);
130   if (next_char == '=') {
131     tokens_.push_back({{current_char_, next_char},
132                        query_.substr(current_index_, 2),
133                        TokenType::COMPARATOR});
134     Advance(2);
135     return true;
136   }
137   // Now, next_char must not be '='. Let's match for '<' and '>'.
138   if (current_char_ == '<' || current_char_ == '>') {
139     tokens_.push_back({{current_char_},
140                        query_.substr(current_index_, 1),
141                        TokenType::COMPARATOR});
142     Advance();
143     return true;
144   }
145   return false;
146 }
147 
ConsumeAndOr()148 bool Lexer::ConsumeAndOr() {
149   if (current_char_ != '&' && current_char_ != '|') {
150     return false;
151   }
152   char next_char = PeekNext(1);
153   if (current_char_ != next_char) {
154     return false;
155   }
156   std::string_view original_text = query_.substr(current_index_, 2);
157   if (current_char_ == '&') {
158     tokens_.push_back({"", original_text, TokenType::AND});
159   } else {
160     tokens_.push_back({"", original_text, TokenType::OR});
161   }
162   Advance(2);
163   return true;
164 }
165 
ConsumeStringLiteral()166 bool Lexer::ConsumeStringLiteral() {
167   if (current_char_ != '"') {
168     return false;
169   }
170   Advance();
171   int32_t unnormalized_start_pos = current_index_;
172   while (current_char_ != '\0' && current_char_ != '"') {
173     // When getting a backslash, we will always match the next character, even
174     // if the next character is a quotation mark
175     if (current_char_ == '\\') {
176       Advance();
177       if (current_char_ == '\0') {
178         // In this case, we are missing a terminating quotation mark.
179         break;
180       }
181     }
182     Advance();
183   }
184   if (current_char_ == '\0') {
185     SyntaxError("missing terminating \" character");
186     return false;
187   }
188   int32_t unnormalized_length = current_index_ - unnormalized_start_pos;
189   std::string_view raw_token_text =
190       query_.substr(unnormalized_start_pos, unnormalized_length);
191   std::string token_text(raw_token_text);
192   tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING});
193   Advance();
194   return true;
195 }
196 
ConsumeText()197 bool Lexer::ConsumeText() {
198   if (current_char_ == '\0') {
199     return false;
200   }
201   tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT});
202   int token_index = tokens_.size() - 1;
203 
204   int32_t unnormalized_start_pos = current_index_;
205   int32_t unnormalized_end_pos = current_index_;
206   while (!ConsumeNonText() && current_char_ != '\0') {
207     in_text_ = true;
208     // When getting a backslash in TEXT, unescape it by accepting its following
209     // character no matter which character it is, including white spaces,
210     // operator symbols, parentheses, etc.
211     if (current_char_ == '\\') {
212       Advance();
213       if (current_char_ == '\0') {
214         SyntaxError("missing a escaping character after \\");
215         break;
216       }
217     }
218     tokens_[token_index].text.push_back(current_char_);
219     Advance();
220     unnormalized_end_pos = current_index_;
221   }
222   in_text_ = false;
223 
224   tokens_[token_index].original_text = query_.substr(
225       unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos);
226   if (unnormalized_end_pos < query_.length() &&
227       query_[unnormalized_end_pos] == '(') {
228     // A TEXT followed by a LPAREN is a FUNCTION_NAME.
229     tokens_[token_index].type = TokenType::FUNCTION_NAME;
230   }
231 
232   if (language_ == Lexer::Language::QUERY) {
233     std::string &text = tokens_[token_index].text;
234     TokenType &type = tokens_[token_index].type;
235     if (text == "AND") {
236       text.clear();
237       type = TokenType::AND;
238     } else if (text == "OR") {
239       text.clear();
240       type = TokenType::OR;
241     } else if (text == "NOT") {
242       text.clear();
243       type = TokenType::NOT;
244     }
245   }
246   return true;
247 }
248 
249 libtextclassifier3::StatusOr<std::vector<Lexer::LexerToken>>
ExtractTokens()250 Lexer::ExtractTokens() && {
251   while (current_char_ != '\0') {
252     // Clear out any non-text before matching a Text.
253     while (ConsumeNonText()) {
254     }
255     ConsumeText();
256   }
257   if (!error_.empty()) {
258     return absl_ports::InvalidArgumentError(
259         absl_ports::StrCat("Syntax Error: ", error_));
260   }
261   if (tokens_.size() > kMaxNumTokens) {
262     return absl_ports::InvalidArgumentError(
263         absl_ports::StrCat("The maximum number of tokens allowed is ",
264                            std::to_string(kMaxNumTokens), ", but got ",
265                            std::to_string(tokens_.size()), " tokens."));
266   }
267   return std::move(tokens_);
268 }
269 
270 }  // namespace lib
271 }  // namespace icing
272