1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/query/advanced_query_parser/lexer.h"
16
17 #include <string>
18 #include <utility>
19
20 #include "icing/absl_ports/canonical_errors.h"
21 #include "icing/absl_ports/str_cat.h"
22 #include "icing/util/i18n-utils.h"
23
24 namespace icing {
25 namespace lib {
26
ConsumeWhitespace()27 bool Lexer::ConsumeWhitespace() {
28 if (current_char_ == '\0') {
29 return false;
30 }
31 if (i18n_utils::IsWhitespaceAt(query_, current_index_)) {
32 UChar32 uchar32 = i18n_utils::GetUChar32At(query_.data(), query_.length(),
33 current_index_);
34 int length = i18n_utils::GetUtf8Length(uchar32);
35 Advance(length);
36 return true;
37 }
38 return false;
39 }
40
ConsumeQuerySingleChar()41 bool Lexer::ConsumeQuerySingleChar() {
42 std::string_view original_text = query_.substr(current_index_, 1);
43 switch (current_char_) {
44 case ':':
45 tokens_.push_back({":", original_text, TokenType::COMPARATOR});
46 break;
47 case '*':
48 tokens_.push_back({"", original_text, TokenType::STAR});
49 break;
50 case '-':
51 if (in_text_) {
52 // MINUS ('-') is considered to be a part of a text segment if it is
53 // in the middle of a TEXT segment (ex. `foo-bar`).
54 return false;
55 }
56 tokens_.push_back({"", original_text, TokenType::MINUS});
57 break;
58 default:
59 return false;
60 }
61 Advance();
62 return true;
63 }
64
ConsumeScoringSingleChar()65 bool Lexer::ConsumeScoringSingleChar() {
66 std::string_view original_text = query_.substr(current_index_, 1);
67 switch (current_char_) {
68 case '+':
69 tokens_.push_back({"", original_text, TokenType::PLUS});
70 break;
71 case '*':
72 tokens_.push_back({"", original_text, TokenType::TIMES});
73 break;
74 case '/':
75 tokens_.push_back({"", original_text, TokenType::DIV});
76 break;
77 case '-':
78 tokens_.push_back({"", original_text, TokenType::MINUS});
79 break;
80 default:
81 return false;
82 }
83 Advance();
84 return true;
85 }
86
ConsumeGeneralSingleChar()87 bool Lexer::ConsumeGeneralSingleChar() {
88 std::string_view original_text = query_.substr(current_index_, 1);
89 switch (current_char_) {
90 case ',':
91 tokens_.push_back({"", original_text, TokenType::COMMA});
92 break;
93 case '.':
94 tokens_.push_back({"", original_text, TokenType::DOT});
95 break;
96 case '(':
97 tokens_.push_back({"", original_text, TokenType::LPAREN});
98 break;
99 case ')':
100 tokens_.push_back({"", original_text, TokenType::RPAREN});
101 break;
102 default:
103 return false;
104 }
105 Advance();
106 return true;
107 }
108
ConsumeSingleChar()109 bool Lexer::ConsumeSingleChar() {
110 if (language_ == Language::QUERY) {
111 if (ConsumeQuerySingleChar()) {
112 return true;
113 }
114 } else if (language_ == Language::SCORING) {
115 if (ConsumeScoringSingleChar()) {
116 return true;
117 }
118 }
119 return ConsumeGeneralSingleChar();
120 }
121
ConsumeComparator()122 bool Lexer::ConsumeComparator() {
123 if (current_char_ != '<' && current_char_ != '>' && current_char_ != '!' &&
124 current_char_ != '=') {
125 return false;
126 }
127 // Now, current_char_ must be one of '<', '>', '!', or '='.
128 // Matching for '<=', '>=', '!=', or '=='.
129 char next_char = PeekNext(1);
130 if (next_char == '=') {
131 tokens_.push_back({{current_char_, next_char},
132 query_.substr(current_index_, 2),
133 TokenType::COMPARATOR});
134 Advance(2);
135 return true;
136 }
137 // Now, next_char must not be '='. Let's match for '<' and '>'.
138 if (current_char_ == '<' || current_char_ == '>') {
139 tokens_.push_back({{current_char_},
140 query_.substr(current_index_, 1),
141 TokenType::COMPARATOR});
142 Advance();
143 return true;
144 }
145 return false;
146 }
147
ConsumeAndOr()148 bool Lexer::ConsumeAndOr() {
149 if (current_char_ != '&' && current_char_ != '|') {
150 return false;
151 }
152 char next_char = PeekNext(1);
153 if (current_char_ != next_char) {
154 return false;
155 }
156 std::string_view original_text = query_.substr(current_index_, 2);
157 if (current_char_ == '&') {
158 tokens_.push_back({"", original_text, TokenType::AND});
159 } else {
160 tokens_.push_back({"", original_text, TokenType::OR});
161 }
162 Advance(2);
163 return true;
164 }
165
ConsumeStringLiteral()166 bool Lexer::ConsumeStringLiteral() {
167 if (current_char_ != '"') {
168 return false;
169 }
170 Advance();
171 int32_t unnormalized_start_pos = current_index_;
172 while (current_char_ != '\0' && current_char_ != '"') {
173 // When getting a backslash, we will always match the next character, even
174 // if the next character is a quotation mark
175 if (current_char_ == '\\') {
176 Advance();
177 if (current_char_ == '\0') {
178 // In this case, we are missing a terminating quotation mark.
179 break;
180 }
181 }
182 Advance();
183 }
184 if (current_char_ == '\0') {
185 SyntaxError("missing terminating \" character");
186 return false;
187 }
188 int32_t unnormalized_length = current_index_ - unnormalized_start_pos;
189 std::string_view raw_token_text =
190 query_.substr(unnormalized_start_pos, unnormalized_length);
191 std::string token_text(raw_token_text);
192 tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING});
193 Advance();
194 return true;
195 }
196
ConsumeText()197 bool Lexer::ConsumeText() {
198 if (current_char_ == '\0') {
199 return false;
200 }
201 tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT});
202 int token_index = tokens_.size() - 1;
203
204 int32_t unnormalized_start_pos = current_index_;
205 int32_t unnormalized_end_pos = current_index_;
206 while (!ConsumeNonText() && current_char_ != '\0') {
207 in_text_ = true;
208 // When getting a backslash in TEXT, unescape it by accepting its following
209 // character no matter which character it is, including white spaces,
210 // operator symbols, parentheses, etc.
211 if (current_char_ == '\\') {
212 Advance();
213 if (current_char_ == '\0') {
214 SyntaxError("missing a escaping character after \\");
215 break;
216 }
217 }
218 tokens_[token_index].text.push_back(current_char_);
219 Advance();
220 unnormalized_end_pos = current_index_;
221 }
222 in_text_ = false;
223
224 tokens_[token_index].original_text = query_.substr(
225 unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos);
226 if (unnormalized_end_pos < query_.length() &&
227 query_[unnormalized_end_pos] == '(') {
228 // A TEXT followed by a LPAREN is a FUNCTION_NAME.
229 tokens_[token_index].type = TokenType::FUNCTION_NAME;
230 }
231
232 if (language_ == Lexer::Language::QUERY) {
233 std::string &text = tokens_[token_index].text;
234 TokenType &type = tokens_[token_index].type;
235 if (text == "AND") {
236 text.clear();
237 type = TokenType::AND;
238 } else if (text == "OR") {
239 text.clear();
240 type = TokenType::OR;
241 } else if (text == "NOT") {
242 text.clear();
243 type = TokenType::NOT;
244 }
245 }
246 return true;
247 }
248
249 libtextclassifier3::StatusOr<std::vector<Lexer::LexerToken>>
ExtractTokens()250 Lexer::ExtractTokens() && {
251 while (current_char_ != '\0') {
252 // Clear out any non-text before matching a Text.
253 while (ConsumeNonText()) {
254 }
255 ConsumeText();
256 }
257 if (!error_.empty()) {
258 return absl_ports::InvalidArgumentError(
259 absl_ports::StrCat("Syntax Error: ", error_));
260 }
261 if (tokens_.size() > kMaxNumTokens) {
262 return absl_ports::InvalidArgumentError(
263 absl_ports::StrCat("The maximum number of tokens allowed is ",
264 std::to_string(kMaxNumTokens), ", but got ",
265 std::to_string(tokens_.size()), " tokens."));
266 }
267 return std::move(tokens_);
268 }
269
270 } // namespace lib
271 } // namespace icing
272