1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/query/advanced_query_parser/util/string-util.h"
16
17 #include "icing/absl_ports/canonical_errors.h"
18 #include "icing/absl_ports/str_cat.h"
19
20 namespace icing {
21 namespace lib {
22
23 namespace string_util {
24
UnescapeStringValue(std::string_view value)25 libtextclassifier3::StatusOr<std::string> UnescapeStringValue(
26 std::string_view value) {
27 std::string result;
28 bool in_escape = false;
29 for (char c : value) {
30 if (in_escape) {
31 in_escape = false;
32 } else if (c == '\\') {
33 in_escape = true;
34 continue;
35 } else if (c == '"') {
36 return absl_ports::InvalidArgumentError(
37 "Encountered an unescaped quotation mark!");
38 }
39 result += c;
40 }
41 return result;
42 }
43
FindEscapedToken(std::string_view escaped_string,std::string_view unescaped_token)44 libtextclassifier3::StatusOr<std::string_view> FindEscapedToken(
45 std::string_view escaped_string, std::string_view unescaped_token) {
46 if (unescaped_token.empty()) {
47 return absl_ports::InvalidArgumentError(
48 "Cannot find escaped token in empty unescaped token.");
49 }
50
51 // Find the start of unescaped_token within the escaped_string
52 const char* esc_string_end = escaped_string.data() + escaped_string.length();
53 size_t pos = escaped_string.find(unescaped_token[0]);
54 const char* esc_token_start = (pos == std::string_view::npos)
55 ? esc_string_end
56 : escaped_string.data() + pos;
57 const char* esc_token_cur = esc_token_start;
58 const char* possible_next_start = nullptr;
59 bool is_escaped = false;
60 int i = 0;
61 for (; i < unescaped_token.length() && esc_token_cur < esc_string_end;
62 ++esc_token_cur) {
63 if (esc_token_cur != esc_token_start &&
64 *esc_token_cur == unescaped_token[0] &&
65 possible_next_start == nullptr) {
66 possible_next_start = esc_token_cur;
67 }
68
69 // Every char in unescaped_token should either be an escape or match the
70 // next char in unescaped_token.
71 if (!is_escaped && *esc_token_cur == '\\') {
72 is_escaped = true;
73 } else if (*esc_token_cur == unescaped_token[i]) {
74 is_escaped = false;
75 ++i;
76 } else {
77 // No match. If we don't have a possible_next_start, then try to find one.
78 if (possible_next_start == nullptr) {
79 pos = escaped_string.find(unescaped_token[0],
80 esc_token_cur - escaped_string.data());
81 if (pos == std::string_view::npos) {
82 break;
83 }
84 esc_token_start = escaped_string.data() + pos;
85 } else {
86 esc_token_start = possible_next_start;
87 possible_next_start = nullptr;
88 }
89 // esc_token_start has been reset to a char that equals unescaped_token[0]
90 // The for loop above will advance esc_token_cur so set i to 1.
91 i = 1;
92 esc_token_cur = esc_token_start;
93 }
94 }
95 if (i != unescaped_token.length()) {
96 return absl_ports::InvalidArgumentError(
97 absl_ports::StrCat("Couldn't match chars at token=", unescaped_token,
98 ") and raw_text=", escaped_string));
99 }
100 return std::string_view(esc_token_start, esc_token_cur - esc_token_start);
101 }
102
103 } // namespace string_util
104
105 } // namespace lib
106 } // namespace icing