xref: /aosp_15_r20/external/icing/icing/query/advanced_query_parser/util/string-util.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/query/advanced_query_parser/util/string-util.h"
16 
17 #include "icing/absl_ports/canonical_errors.h"
18 #include "icing/absl_ports/str_cat.h"
19 
20 namespace icing {
21 namespace lib {
22 
23 namespace string_util {
24 
UnescapeStringValue(std::string_view value)25 libtextclassifier3::StatusOr<std::string> UnescapeStringValue(
26     std::string_view value) {
27   std::string result;
28   bool in_escape = false;
29   for (char c : value) {
30     if (in_escape) {
31       in_escape = false;
32     } else if (c == '\\') {
33       in_escape = true;
34       continue;
35     } else if (c == '"') {
36       return absl_ports::InvalidArgumentError(
37           "Encountered an unescaped quotation mark!");
38     }
39     result += c;
40   }
41   return result;
42 }
43 
FindEscapedToken(std::string_view escaped_string,std::string_view unescaped_token)44 libtextclassifier3::StatusOr<std::string_view> FindEscapedToken(
45     std::string_view escaped_string, std::string_view unescaped_token) {
46   if (unescaped_token.empty()) {
47     return absl_ports::InvalidArgumentError(
48         "Cannot find escaped token in empty unescaped token.");
49   }
50 
51   // Find the start of unescaped_token within the escaped_string
52   const char* esc_string_end = escaped_string.data() + escaped_string.length();
53   size_t pos = escaped_string.find(unescaped_token[0]);
54   const char* esc_token_start = (pos == std::string_view::npos)
55                                     ? esc_string_end
56                                     : escaped_string.data() + pos;
57   const char* esc_token_cur = esc_token_start;
58   const char* possible_next_start = nullptr;
59   bool is_escaped = false;
60   int i = 0;
61   for (; i < unescaped_token.length() && esc_token_cur < esc_string_end;
62        ++esc_token_cur) {
63     if (esc_token_cur != esc_token_start &&
64         *esc_token_cur == unescaped_token[0] &&
65         possible_next_start == nullptr) {
66       possible_next_start = esc_token_cur;
67     }
68 
69     // Every char in unescaped_token should either be an escape or match the
70     // next char in unescaped_token.
71     if (!is_escaped && *esc_token_cur == '\\') {
72       is_escaped = true;
73     } else if (*esc_token_cur == unescaped_token[i]) {
74       is_escaped = false;
75       ++i;
76     } else {
77       // No match. If we don't have a possible_next_start, then try to find one.
78       if (possible_next_start == nullptr) {
79         pos = escaped_string.find(unescaped_token[0],
80                                   esc_token_cur - escaped_string.data());
81         if (pos == std::string_view::npos) {
82           break;
83         }
84         esc_token_start = escaped_string.data() + pos;
85       } else {
86         esc_token_start = possible_next_start;
87         possible_next_start = nullptr;
88       }
89       // esc_token_start has been reset to a char that equals unescaped_token[0]
90       // The for loop above will advance esc_token_cur so set i to 1.
91       i = 1;
92       esc_token_cur = esc_token_start;
93     }
94   }
95   if (i != unescaped_token.length()) {
96     return absl_ports::InvalidArgumentError(
97         absl_ports::StrCat("Couldn't match chars at token=", unescaped_token,
98                            ") and raw_text=", escaped_string));
99   }
100   return std::string_view(esc_token_start, esc_token_cur - esc_token_start);
101 }
102 
103 }  // namespace string_util
104 
105 }  // namespace lib
106 }  // namespace icing