1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/monkey_test/in-memory-icing-search-engine.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <memory>
20 #include <random>
21 #include <string>
22 #include <string_view>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26
27 #include "icing/text_classifier/lib3/utils/base/status.h"
28 #include "icing/text_classifier/lib3/utils/base/statusor.h"
29 #include "icing/absl_ports/canonical_errors.h"
30 #include "icing/absl_ports/str_cat.h"
31 #include "icing/absl_ports/str_join.h"
32 #include "icing/index/embed/embedding-scorer.h"
33 #include "icing/monkey_test/monkey-tokenized-document.h"
34 #include "icing/proto/document.pb.h"
35 #include "icing/proto/schema.pb.h"
36 #include "icing/proto/search.pb.h"
37 #include "icing/proto/term.pb.h"
38 #include "icing/store/document-id.h"
39 #include "icing/util/status-macros.h"
40
41 namespace icing {
42 namespace lib {
43
44 namespace {
45
46 // Check if s1 is a prefix of s2.
IsPrefix(std::string_view s1,std::string_view s2)47 bool IsPrefix(std::string_view s1, std::string_view s2) {
48 if (s1.length() > s2.length()) {
49 return false;
50 }
51 return s1 == s2.substr(0, s1.length());
52 }
53
54 const std::string_view kSemanticSearchPrefix =
55 "semanticSearch(getEmbeddingParameter(0)";
56
GetEmbeddingSearchRange(std::string_view s)57 libtextclassifier3::StatusOr<std::pair<double, double>> GetEmbeddingSearchRange(
58 std::string_view s) {
59 std::vector<double> values;
60 std::string current_number;
61 int i = s.find(kSemanticSearchPrefix) + kSemanticSearchPrefix.length();
62 for (; i < s.size(); ++i) {
63 char c = s[i];
64 if (c == '.' || c == '-' || (c >= '0' && c <= '9')) {
65 current_number += c;
66 } else {
67 if (!current_number.empty()) {
68 values.push_back(std::stod(current_number));
69 current_number.clear();
70 }
71 }
72 }
73 if (values.size() != 2) {
74 return absl_ports::InvalidArgumentError(
75 absl_ports::StrCat("Not an embedding search.", s));
76 }
77 return std::make_pair(values[0], values[1]);
78 }
79
DoesVectorsMatch(const EmbeddingScorer * scorer,std::pair<double,double> embedding_search_range,const PropertyProto::VectorProto & vector1,const PropertyProto::VectorProto & vector2)80 bool DoesVectorsMatch(const EmbeddingScorer *scorer,
81 std::pair<double, double> embedding_search_range,
82 const PropertyProto::VectorProto &vector1,
83 const PropertyProto::VectorProto &vector2) {
84 if (vector1.model_signature() != vector2.model_signature() ||
85 vector1.values_size() != vector2.values_size()) {
86 return false;
87 }
88 float score = scorer->Score(vector1.values_size(), vector1.values().data(),
89 vector2.values().data());
90 return embedding_search_range.first <= score &&
91 score <= embedding_search_range.second;
92 }
93
94 } // namespace
95
96 libtextclassifier3::StatusOr<const PropertyConfigProto *>
GetPropertyConfig(const std::string & schema_type,const std::string & property_name) const97 InMemoryIcingSearchEngine::GetPropertyConfig(
98 const std::string &schema_type, const std::string &property_name) const {
99 auto schema_iter = property_config_map_.find(schema_type);
100 if (schema_iter == property_config_map_.end()) {
101 return absl_ports::NotFoundError(
102 absl_ports::StrCat("Schema type: ", schema_type, " is not found."));
103 }
104 auto property_iter = schema_iter->second.find(property_name);
105 if (property_iter == schema_iter->second.end()) {
106 return absl_ports::NotFoundError(
107 absl_ports::StrCat("Property: ", property_name, " is not found."));
108 }
109 return &property_iter->second;
110 }
111
112 libtextclassifier3::StatusOr<InMemoryIcingSearchEngine::PropertyIndexInfo>
GetPropertyIndexInfo(const std::string & schema_type,const MonkeyTokenizedSection & section) const113 InMemoryIcingSearchEngine::GetPropertyIndexInfo(
114 const std::string &schema_type,
115 const MonkeyTokenizedSection §ion) const {
116 bool in_indexable_properties_list = false;
117 bool all_indexable_from_top = true;
118
119 std::vector<std::string_view> properties_in_path =
120 absl_ports::StrSplit(section.path, ".");
121 if (properties_in_path.empty()) {
122 return absl_ports::InvalidArgumentError("Got empty path.");
123 }
124 std::string curr_schema_type = schema_type;
125 for (int i = 0; i < properties_in_path.size(); ++i) {
126 ICING_ASSIGN_OR_RETURN(
127 const PropertyConfigProto *prop,
128 GetPropertyConfig(curr_schema_type,
129 std::string(properties_in_path[i])));
130 if (prop->data_type() == PropertyConfigProto::DataType::STRING) {
131 TermMatchType::Code term_match_type =
132 prop->string_indexing_config().term_match_type();
133 bool indexable = term_match_type != TermMatchType::UNKNOWN;
134 return PropertyIndexInfo{indexable, term_match_type};
135 }
136 if (prop->data_type() == PropertyConfigProto::DataType::VECTOR) {
137 bool indexable =
138 prop->embedding_indexing_config().embedding_indexing_type() !=
139 EmbeddingIndexingConfig::EmbeddingIndexingType::UNKNOWN;
140 return PropertyIndexInfo{indexable};
141 }
142
143 if (prop->data_type() != PropertyConfigProto::DataType::DOCUMENT) {
144 return PropertyIndexInfo{/*indexable=*/false};
145 }
146
147 bool old_all_indexable_from_top = all_indexable_from_top;
148 all_indexable_from_top &=
149 prop->document_indexing_config().index_nested_properties();
150 if (!all_indexable_from_top && !in_indexable_properties_list) {
151 // Only try to update in_indexable_properties_list if this is the first
152 // level with index_nested_properties=false.
153 if (old_all_indexable_from_top) {
154 auto &indexable_properties =
155 prop->document_indexing_config().indexable_nested_properties_list();
156 std::string relative_path =
157 absl_ports::StrCatPieces(std::vector<std::string_view>(
158 properties_in_path.begin() + i + 1, properties_in_path.end()));
159 in_indexable_properties_list =
160 std::find(indexable_properties.begin(), indexable_properties.end(),
161 relative_path) != indexable_properties.end();
162 }
163 // Check in_indexable_properties_list again.
164 if (!in_indexable_properties_list) {
165 return PropertyIndexInfo{/*indexable=*/false};
166 }
167 }
168 curr_schema_type = prop->document_indexing_config().GetTypeName();
169 }
170 return PropertyIndexInfo{/*indexable=*/false};
171 }
172
173 libtextclassifier3::StatusOr<bool>
DoesDocumentMatchQuery(const MonkeyTokenizedDocument & document,const SearchSpecProto & search_spec) const174 InMemoryIcingSearchEngine::DoesDocumentMatchQuery(
175 const MonkeyTokenizedDocument &document,
176 const SearchSpecProto &search_spec) const {
177 std::string_view query = search_spec.query();
178 std::vector<std::string_view> strs = absl_ports::StrSplit(query, ":");
179 std::string_view section_restrict;
180 if (strs.size() > 1) {
181 section_restrict = strs[0];
182 query = strs[1];
183 }
184
185 // Preprocess for embedding search.
186 libtextclassifier3::StatusOr<std::pair<double, double>>
187 embedding_search_range_or = GetEmbeddingSearchRange(query);
188 std::unique_ptr<EmbeddingScorer> embedding_scorer;
189 if (embedding_search_range_or.ok()) {
190 ICING_ASSIGN_OR_RETURN(
191 embedding_scorer,
192 EmbeddingScorer::Create(search_spec.embedding_query_metric_type()));
193 }
194
195 for (const MonkeyTokenizedSection §ion : document.tokenized_sections) {
196 if (!section_restrict.empty() && section.path != section_restrict) {
197 continue;
198 }
199 ICING_ASSIGN_OR_RETURN(
200 PropertyIndexInfo property_index_info,
201 GetPropertyIndexInfo(document.document.schema(), section));
202 if (!property_index_info.indexable) {
203 // Skip non-indexable property.
204 continue;
205 }
206
207 if (embedding_search_range_or.ok()) {
208 // Process embedding search.
209 for (const PropertyProto::VectorProto &vector :
210 section.embedding_vectors) {
211 if (DoesVectorsMatch(embedding_scorer.get(),
212 embedding_search_range_or.ValueOrDie(),
213 search_spec.embedding_query_vectors(0), vector)) {
214 return true;
215 }
216 }
217 } else {
218 // Process term search.
219 for (const std::string &token : section.token_sequence) {
220 if (property_index_info.term_match_type == TermMatchType::EXACT_ONLY ||
221 search_spec.term_match_type() == TermMatchType::EXACT_ONLY) {
222 if (token == query) {
223 return true;
224 }
225 } else if (IsPrefix(query, token)) {
226 return true;
227 }
228 }
229 }
230 }
231 return false;
232 }
233
SetSchema(SchemaProto && schema)234 void InMemoryIcingSearchEngine::SetSchema(SchemaProto &&schema) {
235 schema_ = std::make_unique<SchemaProto>(std::move(schema));
236 property_config_map_.clear();
237 for (const SchemaTypeConfigProto &type_config : schema_->types()) {
238 auto &curr_property_map = property_config_map_[type_config.schema_type()];
239 for (const PropertyConfigProto &property_config :
240 type_config.properties()) {
241 curr_property_map.insert(
242 {property_config.property_name(), property_config});
243 }
244 }
245 }
246
247 InMemoryIcingSearchEngine::PickDocumentResult
RandomPickDocument(float p_alive,float p_all,float p_other) const248 InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all,
249 float p_other) const {
250 // Normalizing p_alive, p_all and p_other, so that they sum to 1.
251 if (p_alive == 0 && p_all == 0 && p_other == 0) {
252 p_alive = p_all = p_other = 1 / 3.;
253 } else {
254 float p_sum = p_alive + p_all + p_other;
255 p_alive = p_alive / p_sum;
256 p_all = p_all / p_sum;
257 p_other = p_other / p_sum;
258 }
259
260 std::uniform_real_distribution<> real_dist(0, 1);
261 float p = real_dist(*random_);
262 if (p <= p_other || documents_.empty()) {
263 // 20 is a fair number of non-existing namespaces and uris, enough for
264 // monkey testing.
265 std::uniform_int_distribution<> dist(0, 19);
266 std::string name_space = absl_ports::StrCat("non_existing_namespace",
267 std::to_string(dist(*random_)));
268 std::string uri =
269 absl_ports::StrCat("non_existing_uri", std::to_string(dist(*random_)));
270 return {name_space, uri};
271 }
272 p -= p_other;
273 DocumentId doc_id;
274 if (p <= p_all || existing_doc_ids_.empty()) {
275 std::uniform_int_distribution<DocumentId> dist(0, documents_.size() - 1);
276 doc_id = dist(*random_);
277 } else {
278 std::uniform_int_distribution<DocumentId> dist(
279 0, existing_doc_ids_.size() - 1);
280 doc_id = existing_doc_ids_[dist(*random_)];
281 }
282 InMemoryIcingSearchEngine::PickDocumentResult result = {
283 documents_[doc_id].document.namespace_(),
284 documents_[doc_id].document.uri()};
285
286 // Even the (name_space, uri) of the picked doc_id has not been deleted
287 // specifically, doc_id may be outdated because of possible overwriting. So we
288 // need to find the latest document id, and return the latest DocumentProto.
289 auto latest_doc_id = InternalGet(result.name_space, result.uri);
290 if (latest_doc_id.ok()) {
291 result.document = documents_[latest_doc_id.ValueOrDie()].document;
292 }
293 return result;
294 }
295
Put(const MonkeyTokenizedDocument & document)296 void InMemoryIcingSearchEngine::Put(const MonkeyTokenizedDocument &document) {
297 // Delete the old one if existing.
298 Delete(document.document.namespace_(), document.document.uri()).IgnoreError();
299 existing_doc_ids_.push_back(documents_.size());
300 namespace_uri_docid_map[document.document.namespace_()]
301 [document.document.uri()] = documents_.size();
302 documents_.push_back(document);
303 }
304
GetAllNamespaces() const305 std::unordered_set<std::string> InMemoryIcingSearchEngine::GetAllNamespaces()
306 const {
307 std::unordered_set<std::string> namespaces;
308 for (DocumentId doc_id : existing_doc_ids_) {
309 namespaces.insert(documents_[doc_id].document.namespace_());
310 }
311 return namespaces;
312 }
313
Delete(const std::string & name_space,const std::string & uri)314 libtextclassifier3::Status InMemoryIcingSearchEngine::Delete(
315 const std::string &name_space, const std::string &uri) {
316 libtextclassifier3::StatusOr<DocumentId> doc_id_or =
317 InternalGet(name_space, uri);
318 if (doc_id_or.ok()) {
319 DocumentId doc_id = doc_id_or.ValueOrDie();
320 const DocumentProto &document = documents_[doc_id].document;
321 namespace_uri_docid_map[document.namespace_()].erase(document.uri());
322 auto end_itr =
323 std::remove(existing_doc_ids_.begin(), existing_doc_ids_.end(), doc_id);
324 existing_doc_ids_.erase(end_itr, existing_doc_ids_.end());
325 }
326 return doc_id_or.status();
327 }
328
329 libtextclassifier3::StatusOr<uint32_t>
DeleteByNamespace(const std::string & name_space)330 InMemoryIcingSearchEngine::DeleteByNamespace(const std::string &name_space) {
331 std::vector<DocumentId> doc_ids_to_delete;
332 for (DocumentId doc_id : existing_doc_ids_) {
333 if (documents_[doc_id].document.namespace_() == name_space) {
334 doc_ids_to_delete.push_back(doc_id);
335 }
336 }
337 for (DocumentId doc_id : doc_ids_to_delete) {
338 const DocumentProto &document = documents_[doc_id].document;
339 if (!Delete(document.namespace_(), document.uri()).ok()) {
340 return absl_ports::InternalError(
341 "Should never happen. There are inconsistencies in the in-memory "
342 "Icing.");
343 }
344 }
345 return doc_ids_to_delete.size();
346 }
347
348 libtextclassifier3::StatusOr<uint32_t>
DeleteBySchemaType(const std::string & schema_type)349 InMemoryIcingSearchEngine::DeleteBySchemaType(const std::string &schema_type) {
350 std::vector<DocumentId> doc_ids_to_delete;
351 for (DocumentId doc_id : existing_doc_ids_) {
352 if (documents_[doc_id].document.schema() == schema_type) {
353 doc_ids_to_delete.push_back(doc_id);
354 }
355 }
356 for (DocumentId doc_id : doc_ids_to_delete) {
357 const DocumentProto &document = documents_[doc_id].document;
358 if (!Delete(document.namespace_(), document.uri()).ok()) {
359 return absl_ports::InternalError(
360 "Should never happen. There are inconsistencies in the in-memory "
361 "Icing.");
362 }
363 }
364 return doc_ids_to_delete.size();
365 }
366
DeleteByQuery(const SearchSpecProto & search_spec)367 libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery(
368 const SearchSpecProto &search_spec) {
369 ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> doc_ids_to_delete,
370 InternalSearch(search_spec));
371 for (DocumentId doc_id : doc_ids_to_delete) {
372 const DocumentProto &document = documents_[doc_id].document;
373 if (!Delete(document.namespace_(), document.uri()).ok()) {
374 return absl_ports::InternalError(
375 "Should never happen. There are inconsistencies in the in-memory "
376 "Icing.");
377 }
378 }
379 return doc_ids_to_delete.size();
380 }
381
382 libtextclassifier3::StatusOr<std::vector<DocumentProto>>
Search(const SearchSpecProto & search_spec) const383 InMemoryIcingSearchEngine::Search(const SearchSpecProto &search_spec) const {
384 ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> matched_doc_ids,
385 InternalSearch(search_spec));
386 std::vector<DocumentProto> result;
387 result.reserve(matched_doc_ids.size());
388 for (DocumentId doc_id : matched_doc_ids) {
389 result.push_back(documents_[doc_id].document);
390 }
391 return result;
392 }
393
InternalGet(const std::string & name_space,const std::string & uri) const394 libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet(
395 const std::string &name_space, const std::string &uri) const {
396 auto uris = namespace_uri_docid_map.find(name_space);
397 if (uris != namespace_uri_docid_map.end()) {
398 auto doc = uris->second.find(uri);
399 if (doc != uris->second.end()) {
400 return doc->second;
401 }
402 }
403 return absl_ports::NotFoundError(absl_ports::StrCat(
404 name_space, ", ", uri,
405 " is not found by InMemoryIcingSearchEngine::InternalGet."));
406 }
407
408 libtextclassifier3::StatusOr<std::vector<DocumentId>>
InternalSearch(const SearchSpecProto & search_spec) const409 InMemoryIcingSearchEngine::InternalSearch(
410 const SearchSpecProto &search_spec) const {
411 std::vector<DocumentId> matched_doc_ids;
412 for (DocumentId doc_id : existing_doc_ids_) {
413 ICING_ASSIGN_OR_RETURN(
414 bool match, DoesDocumentMatchQuery(documents_[doc_id], search_spec));
415 if (match) {
416 matched_doc_ids.push_back(doc_id);
417 }
418 }
419 return matched_doc_ids;
420 }
421
422 } // namespace lib
423 } // namespace icing
424