1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/icing-search-engine.h"
16
17 #include <algorithm>
18 #include <cstddef>
19 #include <cstdint>
20 #include <functional>
21 #include <memory>
22 #include <string>
23 #include <string_view>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <utility>
27 #include <vector>
28
29 #include "icing/text_classifier/lib3/utils/base/status.h"
30 #include "icing/text_classifier/lib3/utils/base/statusor.h"
31 #include "icing/absl_ports/annotate.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/mutex.h"
34 #include "icing/absl_ports/str_cat.h"
35 #include "icing/feature-flags.h"
36 #include "icing/file/destructible-file.h"
37 #include "icing/file/file-backed-proto.h"
38 #include "icing/file/filesystem.h"
39 #include "icing/file/version-util.h"
40 #include "icing/index/data-indexing-handler.h"
41 #include "icing/index/embed/embedding-index.h"
42 #include "icing/index/embedding-indexing-handler.h"
43 #include "icing/index/hit/doc-hit-info.h"
44 #include "icing/index/index-processor.h"
45 #include "icing/index/index.h"
46 #include "icing/index/integer-section-indexing-handler.h"
47 #include "icing/index/iterator/doc-hit-info-iterator.h"
48 #include "icing/index/numeric/integer-index.h"
49 #include "icing/index/term-indexing-handler.h"
50 #include "icing/index/term-metadata.h"
51 #include "icing/jni/jni-cache.h"
52 #include "icing/join/join-children-fetcher.h"
53 #include "icing/join/join-processor.h"
54 #include "icing/join/qualified-id-join-index-impl-v2.h"
55 #include "icing/join/qualified-id-join-index-impl-v3.h"
56 #include "icing/join/qualified-id-join-index.h"
57 #include "icing/join/qualified-id-join-indexing-handler.h"
58 #include "icing/legacy/index/icing-filesystem.h"
59 #include "icing/performance-configuration.h"
60 #include "icing/portable/endian.h"
61 #include "icing/proto/blob.pb.h"
62 #include "icing/proto/debug.pb.h"
63 #include "icing/proto/document.pb.h"
64 #include "icing/proto/initialize.pb.h"
65 #include "icing/proto/internal/optimize.pb.h"
66 #include "icing/proto/logging.pb.h"
67 #include "icing/proto/optimize.pb.h"
68 #include "icing/proto/persist.pb.h"
69 #include "icing/proto/reset.pb.h"
70 #include "icing/proto/schema.pb.h"
71 #include "icing/proto/scoring.pb.h"
72 #include "icing/proto/search.pb.h"
73 #include "icing/proto/status.pb.h"
74 #include "icing/proto/storage.pb.h"
75 #include "icing/proto/term.pb.h"
76 #include "icing/proto/usage.pb.h"
77 #include "icing/query/advanced_query_parser/lexer.h"
78 #include "icing/query/query-features.h"
79 #include "icing/query/query-processor.h"
80 #include "icing/query/query-results.h"
81 #include "icing/query/suggestion-processor.h"
82 #include "icing/result/page-result.h"
83 #include "icing/result/projection-tree.h"
84 #include "icing/result/projector.h"
85 #include "icing/result/result-adjustment-info.h"
86 #include "icing/result/result-retriever-v2.h"
87 #include "icing/result/result-state-manager.h"
88 #include "icing/schema/schema-store.h"
89 #include "icing/scoring/advanced_scoring/score-expression.h"
90 #include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
91 #include "icing/scoring/scored-document-hit.h"
92 #include "icing/scoring/scored-document-hits-ranker.h"
93 #include "icing/scoring/scoring-processor.h"
94 #include "icing/store/blob-store.h"
95 #include "icing/store/document-id.h"
96 #include "icing/store/document-store.h"
97 #include "icing/tokenization/language-segmenter-factory.h"
98 #include "icing/transform/normalizer-factory.h"
99 #include "icing/util/clock.h"
100 #include "icing/util/data-loss.h"
101 #include "icing/util/logging.h"
102 #include "icing/util/status-macros.h"
103 #include "icing/util/tokenized-document.h"
104 #include "unicode/uloc.h"
105 #include <google/protobuf/repeated_field.h>
106
107 namespace icing {
108 namespace lib {
109
110 namespace {
111
// Names of the subfolders and marker files that live directly under the
// engine's base directory.
constexpr std::string_view kDocumentSubfolderName{"document_dir"};
constexpr std::string_view kBlobSubfolderName{"blob_dir"};
constexpr std::string_view kIndexSubfolderName{"index_dir"};
constexpr std::string_view kIntegerIndexSubfolderName{"integer_index_dir"};
constexpr std::string_view kQualifiedIdJoinIndexSubfolderName{
    "qualified_id_join_index_dir"};
constexpr std::string_view kEmbeddingIndexSubfolderName{"embedding_index_dir"};
constexpr std::string_view kSchemaSubfolderName{"schema_dir"};
constexpr std::string_view kSetSchemaMarkerFilename{"set_schema_marker"};
constexpr std::string_view kInitMarkerFilename{"init_marker"};
constexpr std::string_view kOptimizeStatusFilename{"optimize_status"};

// Upper bound on consecutive failed initialization attempts from the current
// on-disk state. Once it is exceeded, all data is deleted and initialization
// restarts from a fresh state.
constexpr int kMaxUnsuccessfulInitAttempts{5};
128
// A (namespace, schema type) pair, usable as a hash-map key together with
// NamespaceTypePairHasher.
struct NamespaceTypePair {
  std::string namespace_;
  std::string type;

  // Two pairs are equal when both the namespace and the type match.
  bool operator==(const NamespaceTypePair& other) const {
    return namespace_ == other.namespace_ && type == other.type;
  }
};

// Hash functor for NamespaceTypePair.
//
// The two field hashes are mixed with an order-sensitive combine instead of a
// plain XOR. A plain XOR maps every pair whose namespace equals its type to 0
// and makes (a, b) collide with (b, a).
struct NamespaceTypePairHasher {
  std::size_t operator()(const NamespaceTypePair& pair) const {
    std::size_t seed = std::hash<std::string>()(pair.namespace_);
    const std::size_t type_hash = std::hash<std::string>()(pair.type);
    // Same mixing step as boost::hash_combine.
    seed ^= type_hash + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    return seed;
  }
};
145
ValidateResultSpec(const DocumentStore * document_store,const ResultSpecProto & result_spec)146 libtextclassifier3::Status ValidateResultSpec(
147 const DocumentStore* document_store, const ResultSpecProto& result_spec) {
148 if (result_spec.num_per_page() < 0) {
149 return absl_ports::InvalidArgumentError(
150 "ResultSpecProto.num_per_page cannot be negative.");
151 }
152 if (result_spec.num_total_bytes_per_page_threshold() <= 0) {
153 return absl_ports::InvalidArgumentError(
154 "ResultSpecProto.num_total_bytes_per_page_threshold cannot be "
155 "non-positive.");
156 }
157 if (result_spec.max_joined_children_per_parent_to_return() < 0) {
158 return absl_ports::InvalidArgumentError(
159 "ResultSpecProto.max_joined_children_per_parent_to_return cannot be "
160 "negative.");
161 }
162 if (result_spec.num_to_score() <= 0) {
163 return absl_ports::InvalidArgumentError(
164 "ResultSpecProto.num_to_score cannot be non-positive.");
165 }
166 // Validate ResultGroupings.
167 std::unordered_set<int32_t> unique_entry_ids;
168 ResultSpecProto::ResultGroupingType result_grouping_type =
169 result_spec.result_group_type();
170 for (const ResultSpecProto::ResultGrouping& result_grouping :
171 result_spec.result_groupings()) {
172 if (result_grouping.max_results() <= 0) {
173 return absl_ports::InvalidArgumentError(
174 "Cannot specify a result grouping with max results <= 0.");
175 }
176 for (const ResultSpecProto::ResultGrouping::Entry& entry :
177 result_grouping.entry_groupings()) {
178 const std::string& name_space = entry.namespace_();
179 const std::string& schema = entry.schema();
180 auto entry_id_or = document_store->GetResultGroupingEntryId(
181 result_grouping_type, name_space, schema);
182 if (!entry_id_or.ok()) {
183 continue;
184 }
185 int32_t entry_id = entry_id_or.ValueOrDie();
186 if (unique_entry_ids.find(entry_id) != unique_entry_ids.end()) {
187 return absl_ports::InvalidArgumentError(
188 "Entry Ids must be unique across result groups.");
189 }
190 unique_entry_ids.insert(entry_id);
191 }
192 }
193 return libtextclassifier3::Status::OK;
194 }
195
ValidateSearchSpec(const SearchSpecProto & search_spec,const PerformanceConfiguration & configuration)196 libtextclassifier3::Status ValidateSearchSpec(
197 const SearchSpecProto& search_spec,
198 const PerformanceConfiguration& configuration) {
199 if (search_spec.query().size() > configuration.max_query_length) {
200 return absl_ports::InvalidArgumentError(
201 absl_ports::StrCat("SearchSpecProto.query is longer than the maximum "
202 "allowed query length: ",
203 std::to_string(configuration.max_query_length)));
204 }
205 // Check that no unknown features have been enabled in the search spec.
206 std::unordered_set<Feature> query_features_set = GetQueryFeaturesSet();
207 for (const Feature feature : search_spec.enabled_features()) {
208 if (query_features_set.find(feature) == query_features_set.end()) {
209 return absl_ports::InvalidArgumentError(
210 absl_ports::StrCat("Unknown feature in "
211 "SearchSpecProto.enabled_features: ",
212 feature));
213 }
214 }
215 return libtextclassifier3::Status::OK;
216 }
217
ValidateSuggestionSpec(const SuggestionSpecProto & suggestion_spec,const PerformanceConfiguration & configuration)218 libtextclassifier3::Status ValidateSuggestionSpec(
219 const SuggestionSpecProto& suggestion_spec,
220 const PerformanceConfiguration& configuration) {
221 if (suggestion_spec.prefix().empty()) {
222 return absl_ports::InvalidArgumentError(
223 absl_ports::StrCat("SuggestionSpecProto.prefix is empty!"));
224 }
225 if (suggestion_spec.scoring_spec().scoring_match_type() ==
226 TermMatchType::UNKNOWN) {
227 return absl_ports::InvalidArgumentError(
228 absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!"));
229 }
230 if (suggestion_spec.num_to_return() <= 0) {
231 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
232 "SuggestionSpecProto.num_to_return must be positive."));
233 }
234 if (suggestion_spec.prefix().size() > configuration.max_query_length) {
235 return absl_ports::InvalidArgumentError(
236 absl_ports::StrCat("SuggestionSpecProto.prefix is longer than the "
237 "maximum allowed prefix length: ",
238 std::to_string(configuration.max_query_length)));
239 }
240 return libtextclassifier3::Status::OK;
241 }
242
ValidateScoringSpec(const ScoringSpecProto & scoring_spec)243 libtextclassifier3::Status ValidateScoringSpec(
244 const ScoringSpecProto& scoring_spec) {
245 std::unordered_set<std::string> alias_schema_types;
246 for (const SchemaTypeAliasMapProto& alias_map_proto :
247 scoring_spec.schema_type_alias_map_protos()) {
248 if (alias_map_proto.alias_schema_type().empty()) {
249 return absl_ports::InvalidArgumentError(
250 "SchemaTypeAliasMapProto contains alias_schema_type with empty "
251 "string");
252 }
253 if (alias_map_proto.schema_types().empty()) {
254 return absl_ports::InvalidArgumentError(
255 absl_ports::StrCat("SchemaTypeAliasMapProto contains empty "
256 "schema_types for alias_schema_type: ",
257 alias_map_proto.alias_schema_type()));
258 }
259 if (alias_schema_types.find(alias_map_proto.alias_schema_type()) !=
260 alias_schema_types.end()) {
261 return absl_ports::InvalidArgumentError(
262 absl_ports::StrCat("SchemaTypeAliasMapProto contains multiple "
263 "entries with the same alias_schema_type: ",
264 alias_map_proto.alias_schema_type()));
265 }
266 alias_schema_types.insert(alias_map_proto.alias_schema_type());
267 }
268 return libtextclassifier3::Status::OK;
269 }
270
// Returns a copy of `child_scoring_spec` with every schema type alias map of
// `parent_scoring_spec` appended to its own alias maps. Neither input is
// modified.
ScoringSpecProto CopyParentSchemaTypeAliasMapToChild(
    const ScoringSpecProto& parent_scoring_spec,
    const ScoringSpecProto& child_scoring_spec) {
  // `child_scoring_spec` is a const reference, so this is necessarily a copy;
  // wrapping it in std::move would not change that.
  ScoringSpecProto new_child_scoring_spec = child_scoring_spec;
  for (const SchemaTypeAliasMapProto& alias_map_proto :
       parent_scoring_spec.schema_type_alias_map_protos()) {
    *new_child_scoring_spec.add_schema_type_alias_map_protos() =
        alias_map_proto;
  }
  return new_child_scoring_spec;
}
282
283 libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>>
CreateQualifiedIdJoinIndex(const Filesystem & filesystem,std::string qualified_id_join_index_dir,const IcingSearchEngineOptions & options,const FeatureFlags & feature_flags)284 CreateQualifiedIdJoinIndex(const Filesystem& filesystem,
285 std::string qualified_id_join_index_dir,
286 const IcingSearchEngineOptions& options,
287 const FeatureFlags& feature_flags) {
288 if (options.enable_qualified_id_join_index_v3_and_delete_propagate_from()) {
289 return QualifiedIdJoinIndexImplV3::Create(
290 filesystem, std::move(qualified_id_join_index_dir), feature_flags);
291 } else {
292 // V2
293 return QualifiedIdJoinIndexImplV2::Create(
294 filesystem, std::move(qualified_id_join_index_dir),
295 options.pre_mapping_fbv());
296 }
297 }
298
299 // Document store files are in a standalone subfolder for easier file
300 // management. We can delete and recreate the subfolder and not touch/affect
301 // anything else.
MakeDocumentDirectoryPath(const std::string & base_dir)302 std::string MakeDocumentDirectoryPath(const std::string& base_dir) {
303 return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName);
304 }
305
MakeBlobDirectoryPath(const std::string & base_dir)306 std::string MakeBlobDirectoryPath(const std::string& base_dir) {
307 return absl_ports::StrCat(base_dir, "/", kBlobSubfolderName);
308 }
309
310 // Makes a temporary folder path for the document store which will be used
311 // during full optimization.
MakeDocumentTemporaryDirectoryPath(const std::string & base_dir)312 std::string MakeDocumentTemporaryDirectoryPath(const std::string& base_dir) {
313 return absl_ports::StrCat(base_dir, "/", kDocumentSubfolderName,
314 "_optimize_tmp");
315 }
316
317 // Index files are in a standalone subfolder because for easier file management.
318 // We can delete and recreate the subfolder and not touch/affect anything
319 // else.
MakeIndexDirectoryPath(const std::string & base_dir)320 std::string MakeIndexDirectoryPath(const std::string& base_dir) {
321 return absl_ports::StrCat(base_dir, "/", kIndexSubfolderName);
322 }
323
324 // Working path for integer index. Integer index is derived from
325 // PersistentStorage and it will take full ownership of this working path,
326 // including creation/deletion. See PersistentStorage for more details about
327 // working path.
MakeIntegerIndexWorkingPath(const std::string & base_dir)328 std::string MakeIntegerIndexWorkingPath(const std::string& base_dir) {
329 return absl_ports::StrCat(base_dir, "/", kIntegerIndexSubfolderName);
330 }
331
332 // Working path for qualified id join index. It is derived from
333 // PersistentStorage and it will take full ownership of this working path,
334 // including creation/deletion. See PersistentStorage for more details about
335 // working path.
MakeQualifiedIdJoinIndexWorkingPath(const std::string & base_dir)336 std::string MakeQualifiedIdJoinIndexWorkingPath(const std::string& base_dir) {
337 return absl_ports::StrCat(base_dir, "/", kQualifiedIdJoinIndexSubfolderName);
338 }
339
340 // Working path for embedding index.
MakeEmbeddingIndexWorkingPath(const std::string & base_dir)341 std::string MakeEmbeddingIndexWorkingPath(const std::string& base_dir) {
342 return absl_ports::StrCat(base_dir, "/", kEmbeddingIndexSubfolderName);
343 }
344
345 // SchemaStore files are in a standalone subfolder for easier file management.
346 // We can delete and recreate the subfolder and not touch/affect anything
347 // else.
MakeSchemaDirectoryPath(const std::string & base_dir)348 std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
349 return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
350 }
351
MakeSetSchemaMarkerFilePath(const std::string & base_dir)352 std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) {
353 return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename);
354 }
355
MakeInitMarkerFilePath(const std::string & base_dir)356 std::string MakeInitMarkerFilePath(const std::string& base_dir) {
357 return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename);
358 }
359
TransformStatus(const libtextclassifier3::Status & internal_status,StatusProto * status_proto)360 void TransformStatus(const libtextclassifier3::Status& internal_status,
361 StatusProto* status_proto) {
362 StatusProto::Code code;
363 if (!internal_status.ok()) {
364 ICING_LOG(WARNING) << "Error: " << internal_status.error_code()
365 << ", Message: " << internal_status.error_message();
366 }
367 switch (internal_status.CanonicalCode()) {
368 case libtextclassifier3::StatusCode::OK:
369 code = StatusProto::OK;
370 break;
371 case libtextclassifier3::StatusCode::DATA_LOSS:
372 code = StatusProto::WARNING_DATA_LOSS;
373 break;
374 case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
375 code = StatusProto::INVALID_ARGUMENT;
376 break;
377 case libtextclassifier3::StatusCode::NOT_FOUND:
378 code = StatusProto::NOT_FOUND;
379 break;
380 case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
381 code = StatusProto::FAILED_PRECONDITION;
382 break;
383 case libtextclassifier3::StatusCode::ABORTED:
384 code = StatusProto::ABORTED;
385 break;
386 case libtextclassifier3::StatusCode::INTERNAL:
387 // TODO(b/147699081): Cleanup our internal use of INTERNAL since it
388 // doesn't match with what it *should* indicate as described in
389 // go/icing-library-apis.
390 code = StatusProto::INTERNAL;
391 break;
392 case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
393 // TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
394 // (e.g. if the document log is full). And we use RESOURCE_EXHAUSTED
395 // internally to indicate other resources are exhausted (e.g.
396 // DocHitInfos) - although none of these are exposed through the API.
397 // Consider separating the two cases out more clearly.
398 code = StatusProto::OUT_OF_SPACE;
399 break;
400 case libtextclassifier3::StatusCode::ALREADY_EXISTS:
401 code = StatusProto::ALREADY_EXISTS;
402 break;
403 case libtextclassifier3::StatusCode::CANCELLED:
404 [[fallthrough]];
405 case libtextclassifier3::StatusCode::UNKNOWN:
406 [[fallthrough]];
407 case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
408 [[fallthrough]];
409 case libtextclassifier3::StatusCode::PERMISSION_DENIED:
410 [[fallthrough]];
411 case libtextclassifier3::StatusCode::OUT_OF_RANGE:
412 [[fallthrough]];
413 case libtextclassifier3::StatusCode::UNIMPLEMENTED:
414 [[fallthrough]];
415 case libtextclassifier3::StatusCode::UNAVAILABLE:
416 [[fallthrough]];
417 case libtextclassifier3::StatusCode::UNAUTHENTICATED:
418 // Other internal status codes aren't supported externally yet. If it
419 // should be supported, add another switch-case above.
420 ICING_LOG(ERROR) << "Internal status code "
421 << internal_status.error_code()
422 << " not supported in the external API";
423 code = StatusProto::UNKNOWN;
424 break;
425 }
426 status_proto->set_code(code);
427 status_proto->set_message(internal_status.error_message());
428 }
429
RetrieveAndAddDocumentInfo(const DocumentStore * document_store,DeleteByQueryResultProto & result_proto,std::unordered_map<NamespaceTypePair,DeleteByQueryResultProto::DocumentGroupInfo *,NamespaceTypePairHasher> & info_map,DocumentId document_id)430 libtextclassifier3::Status RetrieveAndAddDocumentInfo(
431 const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
432 std::unordered_map<NamespaceTypePair,
433 DeleteByQueryResultProto::DocumentGroupInfo*,
434 NamespaceTypePairHasher>& info_map,
435 DocumentId document_id) {
436 ICING_ASSIGN_OR_RETURN(DocumentProto document,
437 document_store->Get(document_id));
438 NamespaceTypePair key = {document.namespace_(), document.schema()};
439 auto iter = info_map.find(key);
440 if (iter == info_map.end()) {
441 auto entry = result_proto.add_deleted_documents();
442 entry->set_namespace_(std::move(document.namespace_()));
443 entry->set_schema(std::move(document.schema()));
444 entry->add_uris(std::move(document.uri()));
445 info_map[key] = entry;
446 } else {
447 iter->second->add_uris(std::move(document.uri()));
448 }
449 return libtextclassifier3::Status::OK;
450 }
451
ShouldRebuildIndex(const OptimizeStatsProto & optimize_stats,float optimize_rebuild_index_threshold)452 bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats,
453 float optimize_rebuild_index_threshold) {
454 int num_invalid_documents = optimize_stats.num_deleted_documents() +
455 optimize_stats.num_expired_documents();
456 return num_invalid_documents >= optimize_stats.num_original_documents() *
457 optimize_rebuild_index_threshold;
458 }
459
ScoringExpressionHasRelevanceScoreFunction(std::string_view scoring_expression)460 libtextclassifier3::StatusOr<bool> ScoringExpressionHasRelevanceScoreFunction(
461 std::string_view scoring_expression) {
462 // TODO(b/261474063) The Lexer will be called again when creating the
463 // AdvancedScorer instance. Consider refactoring the code to allow the Lexer
464 // to be called only once.
465 Lexer lexer(scoring_expression, Lexer::Language::SCORING);
466 ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
467 std::move(lexer).ExtractTokens());
468 for (const Lexer::LexerToken& token : lexer_tokens) {
469 if (token.type == Lexer::TokenType::FUNCTION_NAME &&
470 token.text == RelevanceScoreFunctionScoreExpression::kFunctionName) {
471 return true;
472 }
473 }
474 return false;
475 }
476
477 // Useful method to get RankingStrategy if advanced scoring is enabled. When the
478 // "RelevanceScore" function is used in the advanced scoring expression,
479 // RankingStrategy will be treated as RELEVANCE_SCORE in order to prepare the
480 // necessary information needed for calculating relevance score.
481 libtextclassifier3::StatusOr<ScoringSpecProto::RankingStrategy::Code>
GetRankingStrategyFromScoringSpec(const ScoringSpecProto & scoring_spec)482 GetRankingStrategyFromScoringSpec(const ScoringSpecProto& scoring_spec) {
483 if (scoring_spec.advanced_scoring_expression().empty() &&
484 scoring_spec.additional_advanced_scoring_expressions().empty()) {
485 return scoring_spec.rank_by();
486 }
487
488 ICING_ASSIGN_OR_RETURN(bool has_relevance_score_function,
489 ScoringExpressionHasRelevanceScoreFunction(
490 scoring_spec.advanced_scoring_expression()));
491 if (has_relevance_score_function) {
492 return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
493 }
494 for (std::string_view additional_scoring_expression :
495 scoring_spec.additional_advanced_scoring_expressions()) {
496 ICING_ASSIGN_OR_RETURN(has_relevance_score_function,
497 ScoringExpressionHasRelevanceScoreFunction(
498 additional_scoring_expression));
499 if (has_relevance_score_function) {
500 return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
501 }
502 }
503 return ScoringSpecProto::RankingStrategy::NONE;
504 }
505
506 } // namespace
507
IcingSearchEngine(const IcingSearchEngineOptions & options,std::unique_ptr<const JniCache> jni_cache)508 IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
509 std::unique_ptr<const JniCache> jni_cache)
510 : IcingSearchEngine(options, std::make_unique<Filesystem>(),
511 std::make_unique<IcingFilesystem>(),
512 std::make_unique<Clock>(), std::move(jni_cache)) {}
513
IcingSearchEngine(IcingSearchEngineOptions options,std::unique_ptr<const Filesystem> filesystem,std::unique_ptr<const IcingFilesystem> icing_filesystem,std::unique_ptr<Clock> clock,std::unique_ptr<const JniCache> jni_cache)514 IcingSearchEngine::IcingSearchEngine(
515 IcingSearchEngineOptions options,
516 std::unique_ptr<const Filesystem> filesystem,
517 std::unique_ptr<const IcingFilesystem> icing_filesystem,
518 std::unique_ptr<Clock> clock, std::unique_ptr<const JniCache> jni_cache)
519 : options_(std::move(options)),
520 feature_flags_(options_.enable_scorable_properties(),
521 options_.enable_embedding_quantization(),
522 options_.enable_repeated_field_joins()),
523 filesystem_(std::move(filesystem)),
524 icing_filesystem_(std::move(icing_filesystem)),
525 clock_(std::move(clock)),
526 jni_cache_(std::move(jni_cache)) {
527 ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
528 }
529
~IcingSearchEngine()530 IcingSearchEngine::~IcingSearchEngine() {
531 if (initialized_) {
532 if (PersistToDisk(PersistType::FULL).status().code() != StatusProto::OK) {
533 ICING_LOG(ERROR)
534 << "Error persisting to disk in IcingSearchEngine destructor";
535 }
536 }
537 }
538
Initialize()539 InitializeResultProto IcingSearchEngine::Initialize() {
540 // This method does both read and write so we need a writer lock. Using two
541 // locks (reader and writer) has the chance to be interrupted during
542 // switching.
543 absl_ports::unique_lock l(&mutex_);
544 return InternalInitialize();
545 }
546
ResetMembers()547 void IcingSearchEngine::ResetMembers() {
548 // Reset all members in the reverse order of their initialization to ensure
549 // the dependencies are not violated.
550 embedding_index_.reset();
551 qualified_id_join_index_.reset();
552 integer_index_.reset();
553 index_.reset();
554 normalizer_.reset();
555 language_segmenter_.reset();
556 blob_store_.reset();
557 result_state_manager_.reset();
558 document_store_.reset();
559 schema_store_.reset();
560 }
561
CheckInitMarkerFile(InitializeStatsProto * initialize_stats)562 libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile(
563 InitializeStatsProto* initialize_stats) {
564 // Check to see if the marker file exists and if we've already passed our max
565 // number of init attempts.
566 std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
567 bool file_exists = filesystem_->FileExists(marker_filepath.c_str());
568 int network_init_attempts = 0;
569 int host_init_attempts = 0;
570
571 // Read the number of previous failed init attempts from the file. If it
572 // fails, then just assume the value is zero (the most likely reason for
573 // failure would be non-existence because the last init was successful
574 // anyways).
575 std::unique_ptr<ScopedFd> marker_file_fd = std::make_unique<ScopedFd>(
576 filesystem_->OpenForWrite(marker_filepath.c_str()));
577 libtextclassifier3::Status status;
578 if (file_exists &&
579 filesystem_->PRead(marker_file_fd->get(), &network_init_attempts,
580 sizeof(network_init_attempts), /*offset=*/0)) {
581 host_init_attempts = GNetworkToHostL(network_init_attempts);
582 if (host_init_attempts > kMaxUnsuccessfulInitAttempts) {
583 // We're tried and failed to init too many times. We need to throw
584 // everything out and start from scratch.
585 ResetMembers();
586 marker_file_fd.reset();
587
588 // Delete the entire base directory.
589 if (!filesystem_->DeleteDirectoryRecursively(
590 options_.base_dir().c_str())) {
591 return absl_ports::InternalError("Failed to delete icing base dir!");
592 }
593
594 // Create the base directory again and reopen marker file.
595 if (!filesystem_->CreateDirectoryRecursively(
596 options_.base_dir().c_str())) {
597 return absl_ports::InternalError("Failed to create icing base dir!");
598 }
599
600 marker_file_fd = std::make_unique<ScopedFd>(
601 filesystem_->OpenForWrite(marker_filepath.c_str()));
602
603 status = absl_ports::DataLossError(
604 "Encountered failed initialization limit. Cleared all data.");
605 host_init_attempts = 0;
606 }
607 }
608
609 // Use network_init_attempts here because we might have set host_init_attempts
610 // to 0 if it exceeded the max threshold.
611 initialize_stats->set_num_previous_init_failures(
612 GNetworkToHostL(network_init_attempts));
613
614 ++host_init_attempts;
615 network_init_attempts = GHostToNetworkL(host_init_attempts);
616 // Write the updated number of attempts before we get started.
617 if (!filesystem_->PWrite(marker_file_fd->get(), /*offset=*/0,
618 &network_init_attempts,
619 sizeof(network_init_attempts)) ||
620 !filesystem_->DataSync(marker_file_fd->get())) {
621 return absl_ports::InternalError(
622 "Failed to write and sync init marker file");
623 }
624
625 return status;
626 }
627
InternalInitialize()628 InitializeResultProto IcingSearchEngine::InternalInitialize() {
629 ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
630 << options_.base_dir();
631
632 // Measure the latency of the initialization process.
633 std::unique_ptr<Timer> initialize_timer = clock_->GetNewTimer();
634
635 InitializeResultProto result_proto;
636 StatusProto* result_status = result_proto.mutable_status();
637 InitializeStatsProto* initialize_stats =
638 result_proto.mutable_initialize_stats();
639 if (initialized_) {
640 // Already initialized.
641 result_status->set_code(StatusProto::OK);
642 initialize_stats->set_latency_ms(
643 initialize_timer->GetElapsedMilliseconds());
644 initialize_stats->set_num_documents(document_store_->num_documents());
645 return result_proto;
646 }
647
648 // Now go ahead and try to initialize.
649 libtextclassifier3::Status status = InitializeMembers(initialize_stats);
650 if (status.ok() || absl_ports::IsDataLoss(status)) {
651 // We successfully initialized. We should delete the init marker file to
652 // indicate a successful init.
653 std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
654 if (!filesystem_->DeleteFile(marker_filepath.c_str())) {
655 status = absl_ports::InternalError("Failed to delete init marker file!");
656 } else {
657 initialized_ = true;
658 }
659 }
660 TransformStatus(status, result_status);
661 initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds());
662 return result_proto;
663 }
664
InitializeMembers(InitializeStatsProto * initialize_stats)665 libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
666 InitializeStatsProto* initialize_stats) {
667 ICING_RETURN_ERROR_IF_NULL(initialize_stats);
668 // Make sure the base directory exists
669 if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
670 return absl_ports::InternalError(absl_ports::StrCat(
671 "Could not create directory: ", options_.base_dir()));
672 }
673
674 // Check to see if the marker file exists and if we've already passed our max
675 // number of init attempts.
676 libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats);
677 if (!status.ok() && !absl_ports::IsDataLoss(status)) {
678 return status;
679 }
680
681 // Do version and flags compatibility check
682 // Read version file, determine the state change and rebuild derived files if
683 // needed.
684 const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
685 ICING_ASSIGN_OR_RETURN(
686 IcingSearchEngineVersionProto stored_version_proto,
687 version_util::ReadVersion(
688 *filesystem_, /*version_file_dir=*/options_.base_dir(), index_dir));
689 version_util::VersionInfo stored_version_info =
690 version_util::GetVersionInfoFromProto(stored_version_proto);
691 version_util::StateChange version_state_change =
692 version_util::GetVersionStateChange(stored_version_info);
693
694 // Construct icing's current version proto based on the current code version
695 IcingSearchEngineVersionProto current_version_proto;
696 current_version_proto.set_version(version_util::kVersion);
697 current_version_proto.set_max_version(
698 std::max(stored_version_info.max_version, version_util::kVersion));
699 version_util::AddEnabledFeatures(options_, ¤t_version_proto);
700
701 // Step 1: Perform schema migration if needed. This is a no-op if the schema
702 // is fully compatible with the current version.
703 bool perform_schema_database_migration =
704 version_util::SchemaDatabaseMigrationRequired(stored_version_proto) &&
705 options_.enable_schema_database();
706 ICING_RETURN_IF_ERROR(SchemaStore::MigrateSchema(
707 filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir()),
708 version_state_change, version_util::kVersion,
709 perform_schema_database_migration));
710
711 // Step 2: Discard derived files that need to be rebuilt
712 version_util::DerivedFilesRebuildResult required_derived_files_rebuild =
713 version_util::CalculateRequiredDerivedFilesRebuild(stored_version_proto,
714 current_version_proto);
715 ICING_RETURN_IF_ERROR(DiscardDerivedFiles(required_derived_files_rebuild));
716
717 // Step 3: update version files. We need to update both the V1 and V2
718 // version files.
719 ICING_RETURN_IF_ERROR(version_util::WriteV1Version(
720 *filesystem_, /*version_file_dir=*/options_.base_dir(),
721 version_util::GetVersionInfoFromProto(current_version_proto)));
722 ICING_RETURN_IF_ERROR(version_util::WriteV2Version(
723 *filesystem_, /*version_file_dir=*/options_.base_dir(),
724 std::make_unique<IcingSearchEngineVersionProto>(
725 std::move(current_version_proto))));
726
727 ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
728
729 // TODO(b/156383798) : Resolve how to specify the locale.
730 language_segmenter_factory::SegmenterOptions segmenter_options(
731 ULOC_US, jni_cache_.get());
732 TC3_ASSIGN_OR_RETURN(language_segmenter_, language_segmenter_factory::Create(
733 std::move(segmenter_options)));
734
735 TC3_ASSIGN_OR_RETURN(normalizer_,
736 normalizer_factory::Create(options_.max_token_length()));
737
738 std::string marker_filepath =
739 MakeSetSchemaMarkerFilePath(options_.base_dir());
740
741 libtextclassifier3::Status index_init_status;
742 if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
743 // The schema was either lost or never set before. Wipe out the doc store
744 // and index directories and initialize them from scratch.
745 const std::string doc_store_dir =
746 MakeDocumentDirectoryPath(options_.base_dir());
747 const std::string integer_index_dir =
748 MakeIntegerIndexWorkingPath(options_.base_dir());
749 const std::string qualified_id_join_index_dir =
750 MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
751 const std::string embedding_index_dir =
752 MakeEmbeddingIndexWorkingPath(options_.base_dir());
753 const std::string blob_store_dir =
754 MakeBlobDirectoryPath(options_.base_dir());
755
756 if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) ||
757 !filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
758 !IntegerIndex::Discard(*filesystem_, integer_index_dir).ok() ||
759 !QualifiedIdJoinIndex::Discard(*filesystem_,
760 qualified_id_join_index_dir)
761 .ok() ||
762 !EmbeddingIndex::Discard(*filesystem_, embedding_index_dir).ok() ||
763 !filesystem_->DeleteDirectoryRecursively(blob_store_dir.c_str())) {
764 return absl_ports::InternalError(absl_ports::StrCat(
765 "Could not delete directories: ", index_dir, ", ", integer_index_dir,
766 ", ", qualified_id_join_index_dir, ", ", embedding_index_dir, ", ",
767 blob_store_dir, " and ", doc_store_dir));
768 }
769 if (options_.enable_blob_store()) {
770 ICING_RETURN_IF_ERROR(
771 InitializeBlobStore(options_.orphan_blob_time_to_live_ms(),
772 options_.blob_store_compression_level()));
773 }
774 ICING_ASSIGN_OR_RETURN(
775 bool document_store_derived_files_regenerated,
776 InitializeDocumentStore(
777 /*force_recovery_and_revalidate_documents=*/false,
778 initialize_stats));
779 index_init_status = InitializeIndex(
780 document_store_derived_files_regenerated, initialize_stats);
781 if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
782 return index_init_status;
783 }
784 } else if (filesystem_->FileExists(marker_filepath.c_str())) {
785 // If the marker file is still around then something wonky happened when we
786 // last tried to set the schema.
787 //
788 // Since we're going to rebuild all indices in this case, the return value
789 // of InitializeDocumentStore (document_store_derived_files_regenerated) is
790 // unused.
791 if (options_.enable_blob_store()) {
792 ICING_RETURN_IF_ERROR(
793 InitializeBlobStore(options_.orphan_blob_time_to_live_ms(),
794 options_.blob_store_compression_level()));
795 }
796 ICING_RETURN_IF_ERROR(InitializeDocumentStore(
797 /*force_recovery_and_revalidate_documents=*/true, initialize_stats));
798
799 // We're going to need to build the index from scratch. So just delete its
800 // directory now.
801 // Discard index directory and instantiate a new one.
802 Index::Options index_options(index_dir, options_.index_merge_size(),
803 /*lite_index_sort_at_indexing=*/true,
804 options_.lite_index_sort_size());
805 if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
806 !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
807 return absl_ports::InternalError(
808 absl_ports::StrCat("Could not recreate directory: ", index_dir));
809 }
810 ICING_ASSIGN_OR_RETURN(index_,
811 Index::Create(index_options, filesystem_.get(),
812 icing_filesystem_.get()));
813
814 // Discard integer index directory and instantiate a new one.
815 std::string integer_index_dir =
816 MakeIntegerIndexWorkingPath(options_.base_dir());
817 ICING_RETURN_IF_ERROR(
818 IntegerIndex::Discard(*filesystem_, integer_index_dir));
819 ICING_ASSIGN_OR_RETURN(
820 integer_index_,
821 IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
822 options_.integer_index_bucket_split_threshold(),
823 options_.pre_mapping_fbv()));
824
825 // Discard qualified id join index directory and instantiate a new one.
826 std::string qualified_id_join_index_dir =
827 MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
828 ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
829 *filesystem_, qualified_id_join_index_dir));
830 ICING_ASSIGN_OR_RETURN(
831 qualified_id_join_index_,
832 CreateQualifiedIdJoinIndex(*filesystem_,
833 std::move(qualified_id_join_index_dir),
834 options_, feature_flags_));
835
836 // Discard embedding index directory and instantiate a new one.
837 std::string embedding_index_dir =
838 MakeEmbeddingIndexWorkingPath(options_.base_dir());
839 ICING_RETURN_IF_ERROR(
840 EmbeddingIndex::Discard(*filesystem_, embedding_index_dir));
841 ICING_ASSIGN_OR_RETURN(
842 embedding_index_,
843 EmbeddingIndex::Create(filesystem_.get(), embedding_index_dir,
844 clock_.get(), &feature_flags_));
845
846 std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
847 IndexRestorationResult restore_result = RestoreIndexIfNeeded();
848 index_init_status = std::move(restore_result.status);
849 // DATA_LOSS means that we have successfully initialized and re-added
850 // content to the index. Some indexed content was lost, but otherwise the
851 // index is in a valid state and can be queried.
852 if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
853 return index_init_status;
854 }
855
856 // Delete the marker file to indicate that everything is now in sync with
857 // whatever changes were made to the schema.
858 filesystem_->DeleteFile(marker_filepath.c_str());
859
860 initialize_stats->set_index_restoration_latency_ms(
861 restore_timer->GetElapsedMilliseconds());
862 initialize_stats->set_index_restoration_cause(
863 InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
864 initialize_stats->set_integer_index_restoration_cause(
865 InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
866 initialize_stats->set_qualified_id_join_index_restoration_cause(
867 InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
868 initialize_stats->set_embedding_index_restoration_cause(
869 InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
870 } else if (version_state_change != version_util::StateChange::kCompatible) {
871 if (options_.enable_blob_store()) {
872 ICING_RETURN_IF_ERROR(
873 InitializeBlobStore(options_.orphan_blob_time_to_live_ms(),
874 options_.blob_store_compression_level()));
875 }
876 ICING_ASSIGN_OR_RETURN(bool document_store_derived_files_regenerated,
877 InitializeDocumentStore(
878 /*force_recovery_and_revalidate_documents=*/true,
879 initialize_stats));
880 index_init_status = InitializeIndex(
881 document_store_derived_files_regenerated, initialize_stats);
882 if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
883 return index_init_status;
884 }
885
886 initialize_stats->set_schema_store_recovery_cause(
887 InitializeStatsProto::VERSION_CHANGED);
888 initialize_stats->set_document_store_recovery_cause(
889 InitializeStatsProto::VERSION_CHANGED);
890 initialize_stats->set_index_restoration_cause(
891 InitializeStatsProto::VERSION_CHANGED);
892 initialize_stats->set_integer_index_restoration_cause(
893 InitializeStatsProto::VERSION_CHANGED);
894 initialize_stats->set_qualified_id_join_index_restoration_cause(
895 InitializeStatsProto::VERSION_CHANGED);
896 initialize_stats->set_embedding_index_restoration_cause(
897 InitializeStatsProto::VERSION_CHANGED);
898 } else {
899 if (options_.enable_blob_store()) {
900 ICING_RETURN_IF_ERROR(
901 InitializeBlobStore(options_.orphan_blob_time_to_live_ms(),
902 options_.blob_store_compression_level()));
903 }
904 ICING_ASSIGN_OR_RETURN(
905 bool document_store_derived_files_regenerated,
906 InitializeDocumentStore(
907 /*force_recovery_and_revalidate_documents=*/false,
908 initialize_stats));
909 index_init_status = InitializeIndex(
910 document_store_derived_files_regenerated, initialize_stats);
911 if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
912 return index_init_status;
913 }
914
915 // Set recovery cause to FEATURE_FLAG_CHANGED according to the calculated
916 // required_derived_files_rebuild
917 if (required_derived_files_rebuild
918 .needs_document_store_derived_files_rebuild) {
919 initialize_stats->set_document_store_recovery_cause(
920 InitializeStatsProto::FEATURE_FLAG_CHANGED);
921 }
922 if (required_derived_files_rebuild
923 .needs_schema_store_derived_files_rebuild) {
924 initialize_stats->set_schema_store_recovery_cause(
925 InitializeStatsProto::FEATURE_FLAG_CHANGED);
926 }
927 if (required_derived_files_rebuild.needs_term_index_rebuild) {
928 initialize_stats->set_index_restoration_cause(
929 InitializeStatsProto::FEATURE_FLAG_CHANGED);
930 }
931 if (required_derived_files_rebuild.needs_integer_index_rebuild) {
932 initialize_stats->set_integer_index_restoration_cause(
933 InitializeStatsProto::FEATURE_FLAG_CHANGED);
934 }
935 if (required_derived_files_rebuild.needs_qualified_id_join_index_rebuild) {
936 initialize_stats->set_qualified_id_join_index_restoration_cause(
937 InitializeStatsProto::FEATURE_FLAG_CHANGED);
938 }
939 if (required_derived_files_rebuild.needs_embedding_index_rebuild) {
940 initialize_stats->set_embedding_index_restoration_cause(
941 InitializeStatsProto::FEATURE_FLAG_CHANGED);
942 }
943 }
944
945 if (status.ok()) {
946 status = index_init_status;
947 }
948
949 result_state_manager_ = std::make_unique<ResultStateManager>(
950 performance_configuration_.max_num_total_hits, *document_store_);
951
952 return status;
953 }
954
InitializeSchemaStore(InitializeStatsProto * initialize_stats)955 libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore(
956 InitializeStatsProto* initialize_stats) {
957 ICING_RETURN_ERROR_IF_NULL(initialize_stats);
958
959 const std::string schema_store_dir =
960 MakeSchemaDirectoryPath(options_.base_dir());
961 // Make sure the sub-directory exists
962 if (!filesystem_->CreateDirectoryRecursively(schema_store_dir.c_str())) {
963 return absl_ports::InternalError(
964 absl_ports::StrCat("Could not create directory: ", schema_store_dir));
965 }
966 ICING_ASSIGN_OR_RETURN(
967 schema_store_,
968 SchemaStore::Create(filesystem_.get(), schema_store_dir, clock_.get(),
969 &feature_flags_, options_.enable_schema_database(),
970 initialize_stats));
971
972 return libtextclassifier3::Status::OK;
973 }
974
InitializeDocumentStore(bool force_recovery_and_revalidate_documents,InitializeStatsProto * initialize_stats)975 libtextclassifier3::StatusOr<bool> IcingSearchEngine::InitializeDocumentStore(
976 bool force_recovery_and_revalidate_documents,
977 InitializeStatsProto* initialize_stats) {
978 ICING_RETURN_ERROR_IF_NULL(initialize_stats);
979
980 const std::string document_dir =
981 MakeDocumentDirectoryPath(options_.base_dir());
982 // Make sure the sub-directory exists
983 if (!filesystem_->CreateDirectoryRecursively(document_dir.c_str())) {
984 return absl_ports::InternalError(
985 absl_ports::StrCat("Could not create directory: ", document_dir));
986 }
987 ICING_ASSIGN_OR_RETURN(
988 DocumentStore::CreateResult create_result,
989 DocumentStore::Create(
990 filesystem_.get(), document_dir, clock_.get(), schema_store_.get(),
991 &feature_flags_, force_recovery_and_revalidate_documents,
992 /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
993 options_.compression_level(), initialize_stats));
994 document_store_ = std::move(create_result.document_store);
995 return create_result.derived_files_regenerated;
996 }
997
InitializeBlobStore(int32_t orphan_blob_time_to_live_ms,int32_t blob_store_compression_level)998 libtextclassifier3::Status IcingSearchEngine::InitializeBlobStore(
999 int32_t orphan_blob_time_to_live_ms, int32_t blob_store_compression_level) {
1000 std::string blob_dir = MakeBlobDirectoryPath(options_.base_dir());
1001 // Make sure the sub-directory exists
1002 if (!filesystem_->CreateDirectoryRecursively(blob_dir.c_str())) {
1003 return absl_ports::InternalError(
1004 absl_ports::StrCat("Could not create directory: ", blob_dir));
1005 }
1006
1007 ICING_ASSIGN_OR_RETURN(
1008 auto blob_store_or,
1009 BlobStore::Create(filesystem_.get(), blob_dir, clock_.get(),
1010 orphan_blob_time_to_live_ms,
1011 blob_store_compression_level));
1012 blob_store_ = std::make_unique<BlobStore>(std::move(blob_store_or));
1013 return libtextclassifier3::Status::OK;
1014 }
1015
InitializeIndex(bool document_store_derived_files_regenerated,InitializeStatsProto * initialize_stats)1016 libtextclassifier3::Status IcingSearchEngine::InitializeIndex(
1017 bool document_store_derived_files_regenerated,
1018 InitializeStatsProto* initialize_stats) {
1019 ICING_RETURN_ERROR_IF_NULL(initialize_stats);
1020
1021 const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
1022 // Make sure the sub-directory exists
1023 if (!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
1024 return absl_ports::InternalError(
1025 absl_ports::StrCat("Could not create directory: ", index_dir));
1026 }
1027 Index::Options index_options(index_dir, options_.index_merge_size(),
1028 /*lite_index_sort_at_indexing=*/true,
1029 options_.lite_index_sort_size());
1030
1031 // Term index
1032 InitializeStatsProto::RecoveryCause index_recovery_cause;
1033 auto index_or =
1034 Index::Create(index_options, filesystem_.get(), icing_filesystem_.get());
1035 if (!index_or.ok()) {
1036 if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
1037 !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
1038 return absl_ports::InternalError(
1039 absl_ports::StrCat("Could not recreate directory: ", index_dir));
1040 }
1041
1042 index_recovery_cause = InitializeStatsProto::IO_ERROR;
1043
1044 // Try recreating it from scratch and re-indexing everything.
1045 ICING_ASSIGN_OR_RETURN(index_,
1046 Index::Create(index_options, filesystem_.get(),
1047 icing_filesystem_.get()));
1048 } else {
1049 // Index was created fine.
1050 index_ = std::move(index_or).ValueOrDie();
1051 // If a recover does have to happen, then it must be because the index is
1052 // out of sync with the document store.
1053 index_recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1054 }
1055
1056 // Integer index
1057 std::string integer_index_dir =
1058 MakeIntegerIndexWorkingPath(options_.base_dir());
1059 InitializeStatsProto::RecoveryCause integer_index_recovery_cause;
1060 auto integer_index_or =
1061 IntegerIndex::Create(*filesystem_, integer_index_dir,
1062 options_.integer_index_bucket_split_threshold(),
1063 options_.pre_mapping_fbv());
1064 if (!integer_index_or.ok()) {
1065 ICING_RETURN_IF_ERROR(
1066 IntegerIndex::Discard(*filesystem_, integer_index_dir));
1067
1068 integer_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1069
1070 // Try recreating it from scratch and re-indexing everything.
1071 ICING_ASSIGN_OR_RETURN(
1072 integer_index_,
1073 IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
1074 options_.integer_index_bucket_split_threshold(),
1075 options_.pre_mapping_fbv()));
1076 } else {
1077 // Integer index was created fine.
1078 integer_index_ = std::move(integer_index_or).ValueOrDie();
1079 // If a recover does have to happen, then it must be because the index is
1080 // out of sync with the document store.
1081 integer_index_recovery_cause =
1082 InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1083 }
1084
1085 // Qualified id join index
1086 std::string qualified_id_join_index_dir =
1087 MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
1088 InitializeStatsProto::RecoveryCause qualified_id_join_index_recovery_cause;
1089 if (document_store_derived_files_regenerated &&
1090 !options_.enable_qualified_id_join_index_v3_and_delete_propagate_from()) {
1091 // V2 qualified id join index depends on document store derived files, so we
1092 // have to rebuild it from scratch if
1093 // document_store_derived_files_regenerated is true.
1094 ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
1095 *filesystem_, qualified_id_join_index_dir));
1096
1097 ICING_ASSIGN_OR_RETURN(
1098 qualified_id_join_index_,
1099 CreateQualifiedIdJoinIndex(*filesystem_,
1100 std::move(qualified_id_join_index_dir),
1101 options_, feature_flags_));
1102
1103 qualified_id_join_index_recovery_cause =
1104 InitializeStatsProto::DEPENDENCIES_CHANGED;
1105 } else {
1106 auto qualified_id_join_index_or = CreateQualifiedIdJoinIndex(
1107 *filesystem_, qualified_id_join_index_dir, options_, feature_flags_);
1108 if (!qualified_id_join_index_or.ok()) {
1109 ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
1110 *filesystem_, qualified_id_join_index_dir));
1111
1112 qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1113
1114 // Try recreating it from scratch and rebuild everything.
1115 ICING_ASSIGN_OR_RETURN(
1116 qualified_id_join_index_,
1117 CreateQualifiedIdJoinIndex(*filesystem_,
1118 std::move(qualified_id_join_index_dir),
1119 options_, feature_flags_));
1120 } else {
1121 // Qualified id join index was created fine.
1122 qualified_id_join_index_ =
1123 std::move(qualified_id_join_index_or).ValueOrDie();
1124 // If a recover does have to happen, then it must be because the index is
1125 // out of sync with the document store.
1126 qualified_id_join_index_recovery_cause =
1127 InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1128 }
1129 }
1130
1131 // Embedding index
1132 const std::string embedding_dir =
1133 MakeEmbeddingIndexWorkingPath(options_.base_dir());
1134 InitializeStatsProto::RecoveryCause embedding_index_recovery_cause;
1135 auto embedding_index_or = EmbeddingIndex::Create(
1136 filesystem_.get(), embedding_dir, clock_.get(), &feature_flags_);
1137 if (!embedding_index_or.ok()) {
1138 ICING_RETURN_IF_ERROR(EmbeddingIndex::Discard(*filesystem_, embedding_dir));
1139
1140 embedding_index_recovery_cause = InitializeStatsProto::IO_ERROR;
1141
1142 // Try recreating it from scratch and re-indexing everything.
1143 ICING_ASSIGN_OR_RETURN(
1144 embedding_index_,
1145 EmbeddingIndex::Create(filesystem_.get(), embedding_dir, clock_.get(),
1146 &feature_flags_));
1147 } else {
1148 // Embedding index was created fine.
1149 embedding_index_ = std::move(embedding_index_or).ValueOrDie();
1150 // If a recover does have to happen, then it must be because the index is
1151 // out of sync with the document store.
1152 embedding_index_recovery_cause =
1153 InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
1154 }
1155
1156 std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
1157 IndexRestorationResult restore_result = RestoreIndexIfNeeded();
1158 if (restore_result.index_needed_restoration ||
1159 restore_result.integer_index_needed_restoration ||
1160 restore_result.qualified_id_join_index_needed_restoration) {
1161 initialize_stats->set_index_restoration_latency_ms(
1162 restore_timer->GetElapsedMilliseconds());
1163
1164 if (restore_result.index_needed_restoration) {
1165 initialize_stats->set_index_restoration_cause(index_recovery_cause);
1166 }
1167 if (restore_result.integer_index_needed_restoration) {
1168 initialize_stats->set_integer_index_restoration_cause(
1169 integer_index_recovery_cause);
1170 }
1171 if (restore_result.qualified_id_join_index_needed_restoration) {
1172 initialize_stats->set_qualified_id_join_index_restoration_cause(
1173 qualified_id_join_index_recovery_cause);
1174 }
1175 if (restore_result.embedding_index_needed_restoration) {
1176 initialize_stats->set_embedding_index_restoration_cause(
1177 embedding_index_recovery_cause);
1178 }
1179 }
1180 return restore_result.status;
1181 }
1182
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents)1183 SetSchemaResultProto IcingSearchEngine::SetSchema(
1184 const SchemaProto& new_schema, bool ignore_errors_and_delete_documents) {
1185 return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
1186 }
1187
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents)1188 SetSchemaResultProto IcingSearchEngine::SetSchema(
1189 SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) {
1190 ICING_VLOG(1) << "Setting new Schema";
1191
1192 SetSchemaResultProto result_proto;
1193 StatusProto* result_status = result_proto.mutable_status();
1194
1195 absl_ports::unique_lock l(&mutex_);
1196 ScopedTimer timer(clock_->GetNewTimer(), [&result_proto](int64_t t) {
1197 result_proto.set_latency_ms(t);
1198 });
1199 if (!initialized_) {
1200 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1201 result_status->set_message("IcingSearchEngine has not been initialized!");
1202 return result_proto;
1203 }
1204
1205 auto lost_previous_schema_or = LostPreviousSchema();
1206 if (!lost_previous_schema_or.ok()) {
1207 TransformStatus(lost_previous_schema_or.status(), result_status);
1208 return result_proto;
1209 }
1210 bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
1211
1212 std::string marker_filepath =
1213 MakeSetSchemaMarkerFilePath(options_.base_dir());
1214 // Create the marker file indicating that we are going to apply a schema
1215 // change. No need to write anything to the marker file - its existence is the
1216 // only thing that matters. The marker file is used to indicate if we
1217 // encountered a crash or a power loss while updating the schema and other
1218 // files. So set it up to be deleted as long as we return from this function.
1219 DestructibleFile marker_file(marker_filepath, filesystem_.get());
1220
1221 auto set_schema_result_or = schema_store_->SetSchema(
1222 std::move(new_schema), ignore_errors_and_delete_documents,
1223 options_.allow_circular_schema_definitions());
1224 if (!set_schema_result_or.ok()) {
1225 TransformStatus(set_schema_result_or.status(), result_status);
1226 return result_proto;
1227 }
1228 SchemaStore::SetSchemaResult set_schema_result =
1229 std::move(set_schema_result_or).ValueOrDie();
1230
1231 for (const std::string& deleted_type :
1232 set_schema_result.schema_types_deleted_by_name) {
1233 result_proto.add_deleted_schema_types(deleted_type);
1234 }
1235
1236 for (const std::string& incompatible_type :
1237 set_schema_result.schema_types_incompatible_by_name) {
1238 result_proto.add_incompatible_schema_types(incompatible_type);
1239 }
1240
1241 for (const std::string& new_type :
1242 set_schema_result.schema_types_new_by_name) {
1243 result_proto.add_new_schema_types(std::move(new_type));
1244 }
1245
1246 for (const std::string& compatible_type :
1247 set_schema_result.schema_types_changed_fully_compatible_by_name) {
1248 result_proto.add_fully_compatible_changed_schema_types(
1249 std::move(compatible_type));
1250 }
1251
1252 bool index_incompatible =
1253 !set_schema_result.schema_types_index_incompatible_by_name.empty();
1254 for (const std::string& index_incompatible_type :
1255 set_schema_result.schema_types_index_incompatible_by_name) {
1256 result_proto.add_index_incompatible_changed_schema_types(
1257 std::move(index_incompatible_type));
1258 }
1259
1260 // Join index is incompatible and needs rebuild if:
1261 // - Any schema type is join incompatible.
1262 // - OR existing schema type id assignment has changed, since join index
1263 // stores schema type id (+ joinable property path) as a key to group join
1264 // data.
1265 bool join_incompatible =
1266 !set_schema_result.schema_types_join_incompatible_by_name.empty() ||
1267 !set_schema_result.old_schema_type_ids_changed.empty();
1268 for (const std::string& join_incompatible_type :
1269 set_schema_result.schema_types_join_incompatible_by_name) {
1270 result_proto.add_join_incompatible_changed_schema_types(
1271 std::move(join_incompatible_type));
1272 }
1273
1274 libtextclassifier3::Status status;
1275 if (set_schema_result.success) {
1276 if (lost_previous_schema) {
1277 // No previous schema to calculate a diff against. We have to go through
1278 // and revalidate all the Documents in the DocumentStore
1279 status = document_store_->UpdateSchemaStore(schema_store_.get());
1280 if (!status.ok()) {
1281 TransformStatus(status, result_status);
1282 return result_proto;
1283 }
1284 } else if (!set_schema_result.old_schema_type_ids_changed.empty() ||
1285 !set_schema_result.schema_types_incompatible_by_id.empty() ||
1286 !set_schema_result.schema_types_deleted_by_id.empty()) {
1287 status = document_store_->OptimizedUpdateSchemaStore(schema_store_.get(),
1288 set_schema_result);
1289 if (!status.ok()) {
1290 TransformStatus(status, result_status);
1291 return result_proto;
1292 }
1293 }
1294
1295 if (lost_previous_schema || index_incompatible) {
1296 // Clears search indices
1297 status = ClearSearchIndices();
1298 if (!status.ok()) {
1299 TransformStatus(status, result_status);
1300 return result_proto;
1301 }
1302 }
1303
1304 if (lost_previous_schema || join_incompatible) {
1305 // Clears join indices
1306 status = ClearJoinIndices();
1307 if (!status.ok()) {
1308 TransformStatus(status, result_status);
1309 return result_proto;
1310 }
1311 }
1312
1313 if (lost_previous_schema || index_incompatible || join_incompatible) {
1314 IndexRestorationResult restore_result = RestoreIndexIfNeeded();
1315 // DATA_LOSS means that we have successfully re-added content to the
1316 // index. Some indexed content was lost, but otherwise the index is in a
1317 // valid state and can be queried.
1318 if (!restore_result.status.ok() &&
1319 !absl_ports::IsDataLoss(restore_result.status)) {
1320 TransformStatus(status, result_status);
1321 return result_proto;
1322 }
1323 }
1324
1325 if (feature_flags_.enable_scorable_properties()) {
1326 if (!set_schema_result.schema_types_scorable_property_inconsistent_by_id
1327 .empty()) {
1328 status = document_store_->RegenerateScorablePropertyCache(
1329 set_schema_result
1330 .schema_types_scorable_property_inconsistent_by_id);
1331 if (!status.ok()) {
1332 TransformStatus(status, result_status);
1333 return result_proto;
1334 }
1335 }
1336 }
1337
1338 result_status->set_code(StatusProto::OK);
1339 } else {
1340 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1341 result_status->set_message("Schema is incompatible.");
1342 }
1343
1344 return result_proto;
1345 }
1346
GetSchema()1347 GetSchemaResultProto IcingSearchEngine::GetSchema() {
1348 GetSchemaResultProto result_proto;
1349 StatusProto* result_status = result_proto.mutable_status();
1350
1351 absl_ports::shared_lock l(&mutex_);
1352 if (!initialized_) {
1353 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1354 result_status->set_message("IcingSearchEngine has not been initialized!");
1355 return result_proto;
1356 }
1357
1358 auto schema_or = schema_store_->GetSchema();
1359 if (!schema_or.ok()) {
1360 TransformStatus(schema_or.status(), result_status);
1361 return result_proto;
1362 }
1363
1364 result_status->set_code(StatusProto::OK);
1365 *result_proto.mutable_schema() = *std::move(schema_or).ValueOrDie();
1366 return result_proto;
1367 }
1368
GetSchema(std::string_view database)1369 GetSchemaResultProto IcingSearchEngine::GetSchema(std::string_view database) {
1370 GetSchemaResultProto result_proto;
1371 StatusProto* result_status = result_proto.mutable_status();
1372
1373 absl_ports::shared_lock l(&mutex_);
1374 if (!initialized_) {
1375 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1376 result_status->set_message("IcingSearchEngine has not been initialized!");
1377 return result_proto;
1378 }
1379
1380 libtextclassifier3::StatusOr<SchemaProto> schema =
1381 schema_store_->GetSchema(std::string(database));
1382 if (!schema.ok()) {
1383 TransformStatus(schema.status(), result_status);
1384 return result_proto;
1385 }
1386
1387 result_status->set_code(StatusProto::OK);
1388 *result_proto.mutable_schema() = std::move(schema).ValueOrDie();
1389 return result_proto;
1390 }
1391
GetSchemaType(std::string_view schema_type)1392 GetSchemaTypeResultProto IcingSearchEngine::GetSchemaType(
1393 std::string_view schema_type) {
1394 GetSchemaTypeResultProto result_proto;
1395 StatusProto* result_status = result_proto.mutable_status();
1396
1397 absl_ports::shared_lock l(&mutex_);
1398 if (!initialized_) {
1399 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1400 result_status->set_message("IcingSearchEngine has not been initialized!");
1401 return result_proto;
1402 }
1403
1404 auto type_config_or = schema_store_->GetSchemaTypeConfig(schema_type);
1405 if (!type_config_or.ok()) {
1406 TransformStatus(type_config_or.status(), result_status);
1407 return result_proto;
1408 }
1409
1410 result_status->set_code(StatusProto::OK);
1411 *result_proto.mutable_schema_type_config() = *(type_config_or.ValueOrDie());
1412 return result_proto;
1413 }
1414
Put(const DocumentProto & document)1415 PutResultProto IcingSearchEngine::Put(const DocumentProto& document) {
1416 return Put(DocumentProto(document));
1417 }
1418
Put(DocumentProto && document)1419 PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
1420 ICING_VLOG(1) << "Writing document to document store";
1421
1422 PutResultProto result_proto;
1423 StatusProto* result_status = result_proto.mutable_status();
1424 PutDocumentStatsProto* put_document_stats =
1425 result_proto.mutable_put_document_stats();
1426 ScopedTimer put_timer(clock_->GetNewTimer(), [put_document_stats](int64_t t) {
1427 put_document_stats->set_latency_ms(t);
1428 });
1429
1430 // Lock must be acquired before validation because the DocumentStore uses
1431 // the schema file to validate, and the schema could be changed in
1432 // SetSchema() which is protected by the same mutex.
1433 absl_ports::unique_lock l(&mutex_);
1434 if (!initialized_) {
1435 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1436 result_status->set_message("IcingSearchEngine has not been initialized!");
1437 return result_proto;
1438 }
1439
1440 auto tokenized_document_or = TokenizedDocument::Create(
1441 schema_store_.get(), language_segmenter_.get(), std::move(document));
1442 if (!tokenized_document_or.ok()) {
1443 TransformStatus(tokenized_document_or.status(), result_status);
1444 return result_proto;
1445 }
1446 TokenizedDocument tokenized_document(
1447 std::move(tokenized_document_or).ValueOrDie());
1448
1449 auto put_result_or = document_store_->Put(
1450 tokenized_document.document(), tokenized_document.num_string_tokens(),
1451 put_document_stats);
1452 if (!put_result_or.ok()) {
1453 TransformStatus(put_result_or.status(), result_status);
1454 return result_proto;
1455 }
1456 DocumentId old_document_id = put_result_or.ValueOrDie().old_document_id;
1457 DocumentId document_id = put_result_or.ValueOrDie().new_document_id;
1458 result_proto.set_was_replacement(
1459 put_result_or.ValueOrDie().was_replacement());
1460
1461 auto data_indexing_handlers_or = CreateDataIndexingHandlers();
1462 if (!data_indexing_handlers_or.ok()) {
1463 TransformStatus(data_indexing_handlers_or.status(), result_status);
1464 return result_proto;
1465 }
1466 IndexProcessor index_processor(
1467 std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get());
1468
1469 auto index_status = index_processor.IndexDocument(
1470 tokenized_document, document_id, old_document_id, put_document_stats);
1471 // Getting an internal error from the index could possibly mean that the index
1472 // is broken. Try to rebuild them to recover.
1473 if (absl_ports::IsInternal(index_status)) {
1474 ICING_LOG(ERROR) << "Got an internal error from the index. Trying to "
1475 "rebuild the index!\n"
1476 << index_status.error_message();
1477 index_status = ClearAllIndices();
1478 if (index_status.ok()) {
1479 index_status = RestoreIndexIfNeeded().status;
1480 if (!index_status.ok()) {
1481 ICING_LOG(ERROR) << "Failed to reindex documents after a failure of "
1482 "indexing a document.";
1483 }
1484 } else {
1485 ICING_LOG(ERROR)
1486 << "Failed to clear indices after a failure of indexing a document.";
1487 }
1488 }
1489
1490 if (!index_status.ok()) {
1491 // If we encountered a failure or cannot resolve an internal error while
1492 // indexing this document, then mark it as deleted.
1493 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1494 libtextclassifier3::Status delete_status =
1495 document_store_->Delete(document_id, current_time_ms);
1496 if (!delete_status.ok()) {
1497 // This is pretty dire (and, hopefully, unlikely). We can't roll back the
1498 // document that we just added. Wipeout the whole index.
1499 ICING_LOG(ERROR) << "Cannot delete the document that is failed to index. "
1500 "Wiping out the whole Icing search engine.";
1501 ResetInternal();
1502 }
1503 }
1504
1505 TransformStatus(index_status, result_status);
1506 return result_proto;
1507 }
1508
Get(const std::string_view name_space,const std::string_view uri,const GetResultSpecProto & result_spec)1509 GetResultProto IcingSearchEngine::Get(const std::string_view name_space,
1510 const std::string_view uri,
1511 const GetResultSpecProto& result_spec) {
1512 GetResultProto result_proto;
1513 StatusProto* result_status = result_proto.mutable_status();
1514
1515 absl_ports::shared_lock l(&mutex_);
1516 if (!initialized_) {
1517 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1518 result_status->set_message("IcingSearchEngine has not been initialized!");
1519 return result_proto;
1520 }
1521
1522 auto document_or = document_store_->Get(name_space, uri);
1523 if (!document_or.ok()) {
1524 TransformStatus(document_or.status(), result_status);
1525 return result_proto;
1526 }
1527
1528 DocumentProto document = std::move(document_or).ValueOrDie();
1529 std::unique_ptr<ProjectionTree> type_projection_tree;
1530 std::unique_ptr<ProjectionTree> wildcard_projection_tree;
1531 for (const SchemaStore::ExpandedTypePropertyMask& type_field_mask :
1532 schema_store_->ExpandTypePropertyMasks(
1533 result_spec.type_property_masks())) {
1534 if (type_field_mask.schema_type == document.schema()) {
1535 type_projection_tree = std::make_unique<ProjectionTree>(type_field_mask);
1536 } else if (type_field_mask.schema_type ==
1537 SchemaStore::kSchemaTypeWildcard) {
1538 wildcard_projection_tree =
1539 std::make_unique<ProjectionTree>(type_field_mask);
1540 }
1541 }
1542
1543 // Apply projection
1544 if (type_projection_tree != nullptr) {
1545 projector::Project(type_projection_tree->root().children, &document);
1546 } else if (wildcard_projection_tree != nullptr) {
1547 projector::Project(wildcard_projection_tree->root().children, &document);
1548 }
1549
1550 result_status->set_code(StatusProto::OK);
1551 *result_proto.mutable_document() = std::move(document);
1552 return result_proto;
1553 }
1554
ReportUsage(const UsageReport & usage_report)1555 ReportUsageResultProto IcingSearchEngine::ReportUsage(
1556 const UsageReport& usage_report) {
1557 ReportUsageResultProto result_proto;
1558 StatusProto* result_status = result_proto.mutable_status();
1559
1560 absl_ports::unique_lock l(&mutex_);
1561 if (!initialized_) {
1562 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1563 result_status->set_message("IcingSearchEngine has not been initialized!");
1564 return result_proto;
1565 }
1566
1567 libtextclassifier3::Status status =
1568 document_store_->ReportUsage(usage_report);
1569 TransformStatus(status, result_status);
1570 return result_proto;
1571 }
1572
GetAllNamespaces()1573 GetAllNamespacesResultProto IcingSearchEngine::GetAllNamespaces() {
1574 GetAllNamespacesResultProto result_proto;
1575 StatusProto* result_status = result_proto.mutable_status();
1576
1577 absl_ports::shared_lock l(&mutex_);
1578 if (!initialized_) {
1579 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1580 result_status->set_message("IcingSearchEngine has not been initialized!");
1581 return result_proto;
1582 }
1583
1584 std::vector<std::string> namespaces = document_store_->GetAllNamespaces();
1585
1586 for (const std::string& namespace_ : namespaces) {
1587 result_proto.add_namespaces(namespace_);
1588 }
1589
1590 result_status->set_code(StatusProto::OK);
1591 return result_proto;
1592 }
1593
Delete(const std::string_view name_space,const std::string_view uri)1594 DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
1595 const std::string_view uri) {
1596 ICING_VLOG(1) << "Deleting document from doc store";
1597
1598 DeleteResultProto result_proto;
1599 StatusProto* result_status = result_proto.mutable_status();
1600
1601 absl_ports::unique_lock l(&mutex_);
1602 if (!initialized_) {
1603 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1604 result_status->set_message("IcingSearchEngine has not been initialized!");
1605 return result_proto;
1606 }
1607
1608 DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats();
1609 delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
1610
1611 libtextclassifier3::Status status;
1612 libtextclassifier3::Status propagate_delete_status;
1613 int num_documents_deleted = 0;
1614 std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1615
1616 libtextclassifier3::StatusOr<DocumentId> document_id_or =
1617 document_store_->GetDocumentId(name_space, uri);
1618 if (!document_id_or.ok()) {
1619 status = std::move(document_id_or).status();
1620 } else {
1621 DocumentId document_id = document_id_or.ValueOrDie();
1622
1623 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1624 // that can support error logging.
1625 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1626 status = document_store_->Delete(document_id, current_time_ms);
1627 if (status.ok()) {
1628 ++num_documents_deleted;
1629 }
1630
1631 // It is possible that the document has expired and the delete operation
1632 // fails with NOT_FOUND_ERROR. In this case, we should still propagate the
1633 // delete operation, regardless of the outcome of the delete operation.
1634 libtextclassifier3::StatusOr<int> propagated_child_docs_deleted_or =
1635 PropagateDelete(/*deleted_document_ids=*/{document_id},
1636 current_time_ms);
1637 if (propagated_child_docs_deleted_or.ok()) {
1638 num_documents_deleted += propagated_child_docs_deleted_or.ValueOrDie();
1639 } else {
1640 propagate_delete_status =
1641 std::move(propagated_child_docs_deleted_or).status();
1642 }
1643 }
1644 delete_stats->set_num_documents_deleted(num_documents_deleted);
1645 delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1646
1647 if (!status.ok()) {
1648 LogSeverity::Code severity = ERROR;
1649 if (absl_ports::IsNotFound(status)) {
1650 severity = DBG;
1651 }
1652 ICING_LOG(severity) << status.error_message()
1653 << ". Failed to delete Document. namespace: "
1654 << name_space << ", uri: " << uri;
1655 TransformStatus(status, result_status);
1656 return result_proto;
1657 }
1658
1659 if (!propagate_delete_status.ok()) {
1660 ICING_LOG(ERROR) << propagate_delete_status.error_message()
1661 << ". Failed to propagate delete for document. namespace: "
1662 << name_space << ", uri: " << uri;
1663 TransformStatus(propagate_delete_status, result_status);
1664 return result_proto;
1665 }
1666
1667 result_status->set_code(StatusProto::OK);
1668 return result_proto;
1669 }
1670
DeleteByNamespace(const std::string_view name_space)1671 DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
1672 const std::string_view name_space) {
1673 ICING_VLOG(1) << "Deleting namespace from doc store";
1674
1675 DeleteByNamespaceResultProto delete_result;
1676 StatusProto* result_status = delete_result.mutable_status();
1677 absl_ports::unique_lock l(&mutex_);
1678 if (!initialized_) {
1679 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1680 result_status->set_message("IcingSearchEngine has not been initialized!");
1681 return delete_result;
1682 }
1683
1684 DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
1685 delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
1686
1687 std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1688 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1689 // that can support error logging.
1690 DocumentStore::DeleteByGroupResult doc_store_result =
1691 document_store_->DeleteByNamespace(name_space);
1692 if (!doc_store_result.status.ok()) {
1693 ICING_LOG(ERROR) << doc_store_result.status.error_message()
1694 << "Failed to delete Namespace: " << name_space;
1695 TransformStatus(doc_store_result.status, result_status);
1696 return delete_result;
1697 }
1698
1699 result_status->set_code(StatusProto::OK);
1700 delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1701 delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
1702 return delete_result;
1703 }
1704
DeleteBySchemaType(const std::string_view schema_type)1705 DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
1706 const std::string_view schema_type) {
1707 ICING_VLOG(1) << "Deleting type from doc store";
1708
1709 DeleteBySchemaTypeResultProto delete_result;
1710 StatusProto* result_status = delete_result.mutable_status();
1711 absl_ports::unique_lock l(&mutex_);
1712 if (!initialized_) {
1713 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1714 result_status->set_message("IcingSearchEngine has not been initialized!");
1715 return delete_result;
1716 }
1717
1718 DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
1719 delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
1720
1721 std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
1722 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
1723 // that can support error logging.
1724 DocumentStore::DeleteByGroupResult doc_store_result =
1725 document_store_->DeleteBySchemaType(schema_type);
1726 if (!doc_store_result.status.ok()) {
1727 ICING_LOG(ERROR) << doc_store_result.status.error_message()
1728 << "Failed to delete SchemaType: " << schema_type;
1729 TransformStatus(doc_store_result.status, result_status);
1730 return delete_result;
1731 }
1732
1733 result_status->set_code(StatusProto::OK);
1734 delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
1735 delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
1736 return delete_result;
1737 }
1738
DeleteByQuery(const SearchSpecProto & search_spec,bool return_deleted_document_info)1739 DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
1740 const SearchSpecProto& search_spec, bool return_deleted_document_info) {
1741 ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
1742 << " from doc store";
1743
1744 DeleteByQueryResultProto result_proto;
1745 StatusProto* result_status = result_proto.mutable_status();
1746
1747 absl_ports::unique_lock l(&mutex_);
1748 if (!initialized_) {
1749 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1750 result_status->set_message("IcingSearchEngine has not been initialized!");
1751 return result_proto;
1752 }
1753
1754 DeleteByQueryStatsProto* delete_stats =
1755 result_proto.mutable_delete_by_query_stats();
1756 delete_stats->set_query_length(search_spec.query().length());
1757 delete_stats->set_num_namespaces_filtered(
1758 search_spec.namespace_filters_size());
1759 delete_stats->set_num_schema_types_filtered(
1760 search_spec.schema_type_filters_size());
1761
1762 ScopedTimer delete_timer(clock_->GetNewTimer(), [delete_stats](int64_t t) {
1763 delete_stats->set_latency_ms(t);
1764 });
1765 libtextclassifier3::Status status =
1766 ValidateSearchSpec(search_spec, performance_configuration_);
1767 if (!status.ok()) {
1768 TransformStatus(status, result_status);
1769 return result_proto;
1770 }
1771
1772 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
1773 // Gets unordered results from query processor
1774 auto query_processor_or = QueryProcessor::Create(
1775 index_.get(), integer_index_.get(), embedding_index_.get(),
1776 language_segmenter_.get(), normalizer_.get(), document_store_.get(),
1777 schema_store_.get(), /*join_children_fetcher=*/nullptr, clock_.get(),
1778 &feature_flags_);
1779 if (!query_processor_or.ok()) {
1780 TransformStatus(query_processor_or.status(), result_status);
1781 delete_stats->set_parse_query_latency_ms(
1782 component_timer->GetElapsedMilliseconds());
1783 return result_proto;
1784 }
1785 std::unique_ptr<QueryProcessor> query_processor =
1786 std::move(query_processor_or).ValueOrDie();
1787
1788 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
1789 auto query_results_or = query_processor->ParseSearch(
1790 search_spec, ScoringSpecProto::RankingStrategy::NONE, current_time_ms);
1791 if (!query_results_or.ok()) {
1792 TransformStatus(query_results_or.status(), result_status);
1793 delete_stats->set_parse_query_latency_ms(
1794 component_timer->GetElapsedMilliseconds());
1795 return result_proto;
1796 }
1797 QueryResults query_results = std::move(query_results_or).ValueOrDie();
1798 delete_stats->set_parse_query_latency_ms(
1799 component_timer->GetElapsedMilliseconds());
1800
1801 ICING_VLOG(2) << "Deleting the docs that matched the query.";
1802 int num_deleted = 0;
1803 // A map used to group deleted documents.
1804 // From the (namespace, type) pair to a list of uris.
1805 std::unordered_map<NamespaceTypePair,
1806 DeleteByQueryResultProto::DocumentGroupInfo*,
1807 NamespaceTypePairHasher>
1808 deleted_info_map;
1809
1810 component_timer = clock_->GetNewTimer();
1811 while (query_results.root_iterator->Advance().ok()) {
1812 ICING_VLOG(3) << "Deleting doc "
1813 << query_results.root_iterator->doc_hit_info().document_id();
1814 ++num_deleted;
1815 if (return_deleted_document_info) {
1816 status = RetrieveAndAddDocumentInfo(
1817 document_store_.get(), result_proto, deleted_info_map,
1818 query_results.root_iterator->doc_hit_info().document_id());
1819 if (!status.ok()) {
1820 TransformStatus(status, result_status);
1821 delete_stats->set_document_removal_latency_ms(
1822 component_timer->GetElapsedMilliseconds());
1823 return result_proto;
1824 }
1825 }
1826 status = document_store_->Delete(
1827 query_results.root_iterator->doc_hit_info().document_id(),
1828 current_time_ms);
1829 if (!status.ok()) {
1830 TransformStatus(status, result_status);
1831 delete_stats->set_document_removal_latency_ms(
1832 component_timer->GetElapsedMilliseconds());
1833 return result_proto;
1834 }
1835 }
1836 delete_stats->set_document_removal_latency_ms(
1837 component_timer->GetElapsedMilliseconds());
1838 int term_count = 0;
1839 for (const auto& section_and_terms : query_results.query_terms) {
1840 term_count += section_and_terms.second.size();
1841 }
1842 delete_stats->set_num_terms(term_count);
1843
1844 if (num_deleted > 0) {
1845 result_proto.mutable_status()->set_code(StatusProto::OK);
1846 } else {
1847 result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
1848 result_proto.mutable_status()->set_message(
1849 "No documents matched the query to delete by!");
1850 }
1851 delete_stats->set_num_documents_deleted(num_deleted);
1852 return result_proto;
1853 }
1854
PropagateDelete(const std::unordered_set<DocumentId> & deleted_document_ids,int64_t current_time_ms)1855 libtextclassifier3::StatusOr<int> IcingSearchEngine::PropagateDelete(
1856 const std::unordered_set<DocumentId>& deleted_document_ids,
1857 int64_t current_time_ms) {
1858 int propagated_child_docs_deleted = 0;
1859
1860 if (!options_.enable_qualified_id_join_index_v3_and_delete_propagate_from() ||
1861 qualified_id_join_index_->version() !=
1862 QualifiedIdJoinIndex::Version::kV3) {
1863 // No-op if delete propagation is disabled or the join index is not v3.
1864 return propagated_child_docs_deleted;
1865 }
1866
1867 // Create join processor to get propagated child documents to delete.
1868 JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
1869 qualified_id_join_index_.get(), current_time_ms);
1870 ICING_ASSIGN_OR_RETURN(
1871 std::unordered_set<DocumentId> child_docs_to_delete,
1872 join_processor.GetPropagatedChildDocumentsToDelete(deleted_document_ids));
1873
1874 // Delete all propagated child documents.
1875 for (DocumentId child_doc_id : child_docs_to_delete) {
1876 auto status = document_store_->Delete(child_doc_id, current_time_ms);
1877 if (!status.ok()) {
1878 if (absl_ports::IsNotFound(status)) {
1879 // The child document has already been deleted or expired, so skip the
1880 // error.
1881 continue;
1882 }
1883
1884 // Real error.
1885 return status;
1886 }
1887 ++propagated_child_docs_deleted;
1888 }
1889
1890 return propagated_child_docs_deleted;
1891 }
1892
PersistToDisk(PersistType::Code persist_type)1893 PersistToDiskResultProto IcingSearchEngine::PersistToDisk(
1894 PersistType::Code persist_type) {
1895 ICING_VLOG(1) << "Persisting data to disk";
1896
1897 PersistToDiskResultProto result_proto;
1898 StatusProto* result_status = result_proto.mutable_status();
1899
1900 absl_ports::unique_lock l(&mutex_);
1901 if (!initialized_) {
1902 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1903 result_status->set_message("IcingSearchEngine has not been initialized!");
1904 return result_proto;
1905 }
1906
1907 auto status = InternalPersistToDisk(persist_type);
1908 TransformStatus(status, result_status);
1909 return result_proto;
1910 }
1911
1912 // Optimizes Icing's storage
1913 //
1914 // Steps:
1915 // 1. Flush data to disk.
1916 // 2. Copy data needed to a tmp directory.
1917 // 3. Swap current directory and tmp directory.
Optimize()1918 OptimizeResultProto IcingSearchEngine::Optimize() {
1919 ICING_VLOG(1) << "Optimizing icing storage";
1920
1921 OptimizeResultProto result_proto;
1922 StatusProto* result_status = result_proto.mutable_status();
1923
1924 absl_ports::unique_lock l(&mutex_);
1925 if (!initialized_) {
1926 result_status->set_code(StatusProto::FAILED_PRECONDITION);
1927 result_status->set_message("IcingSearchEngine has not been initialized!");
1928 return result_proto;
1929 }
1930
1931 OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
1932 ScopedTimer optimize_timer(
1933 clock_->GetNewTimer(),
1934 [optimize_stats](int64_t t) { optimize_stats->set_latency_ms(t); });
1935
1936 // Flushes data to disk before doing optimization
1937 auto status = InternalPersistToDisk(PersistType::FULL);
1938 if (!status.ok()) {
1939 TransformStatus(status, result_status);
1940 return result_proto;
1941 }
1942
1943 int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
1944 optimize_stats->set_storage_size_before(
1945 Filesystem::SanitizeFileSize(before_size));
1946
1947 // Get all expired blob handles
1948 std::unordered_set<std::string> potentially_optimizable_blob_handles;
1949 if (blob_store_ != nullptr) {
1950 potentially_optimizable_blob_handles =
1951 blob_store_->GetPotentiallyOptimizableBlobHandles();
1952 }
1953
1954 // TODO(b/143646633): figure out if we need to optimize index and doc store
1955 // at the same time.
1956 std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer();
1957 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
1958 optimize_result_or = OptimizeDocumentStore(
1959 std::move(potentially_optimizable_blob_handles), optimize_stats);
1960 optimize_stats->set_document_store_optimize_latency_ms(
1961 optimize_doc_store_timer->GetElapsedMilliseconds());
1962
1963 if (!optimize_result_or.ok() &&
1964 !absl_ports::IsDataLoss(optimize_result_or.status())) {
1965 // The status now is either ABORTED_ERROR or INTERNAL_ERROR.
1966 // If ABORTED_ERROR, Icing should still be working.
1967 // If INTERNAL_ERROR, we're having IO errors or other errors that we can't
1968 // recover from.
1969 TransformStatus(optimize_result_or.status(), result_status);
1970 return result_proto;
1971 }
1972
1973 libtextclassifier3::Status doc_store_optimize_result_status =
1974 optimize_result_or.status();
1975 if (blob_store_ != nullptr && doc_store_optimize_result_status.ok()) {
1976 // optimize blob store
1977 libtextclassifier3::Status blob_store_optimize_status =
1978 blob_store_->Optimize(
1979 optimize_result_or.ValueOrDie().dead_blob_handles);
1980 if (!blob_store_optimize_status.ok()) {
1981 TransformStatus(status, result_status);
1982 return result_proto;
1983 }
1984 }
1985
1986 // The status is either OK or DATA_LOSS. The optimized document store is
1987 // guaranteed to work, so we update index according to the new document store.
1988 std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer();
1989 bool should_rebuild_index =
1990 !optimize_result_or.ok() ||
1991 optimize_result_or.ValueOrDie().should_rebuild_index ||
1992 ShouldRebuildIndex(*optimize_stats,
1993 options_.optimize_rebuild_index_threshold());
1994 if (!should_rebuild_index) {
1995 // At this point should_rebuild_index is false, so it means
1996 // optimize_result_or.ok() is true and therefore it is safe to call
1997 // ValueOrDie.
1998 DocumentStore::OptimizeResult optimize_result =
1999 std::move(optimize_result_or).ValueOrDie();
2000
2001 optimize_stats->set_index_restoration_mode(
2002 OptimizeStatsProto::INDEX_TRANSLATION);
2003 libtextclassifier3::Status index_optimize_status =
2004 index_->Optimize(optimize_result.document_id_old_to_new,
2005 document_store_->last_added_document_id());
2006 if (!index_optimize_status.ok()) {
2007 ICING_LOG(WARNING) << "Failed to optimize index. Error: "
2008 << index_optimize_status.error_message();
2009 should_rebuild_index = true;
2010 }
2011
2012 libtextclassifier3::Status integer_index_optimize_status =
2013 integer_index_->Optimize(optimize_result.document_id_old_to_new,
2014 document_store_->last_added_document_id());
2015 if (!integer_index_optimize_status.ok()) {
2016 ICING_LOG(WARNING) << "Failed to optimize integer index. Error: "
2017 << integer_index_optimize_status.error_message();
2018 should_rebuild_index = true;
2019 }
2020
2021 libtextclassifier3::Status qualified_id_join_index_optimize_status =
2022 qualified_id_join_index_->Optimize(
2023 optimize_result.document_id_old_to_new,
2024 optimize_result.namespace_id_old_to_new,
2025 document_store_->last_added_document_id());
2026 if (!qualified_id_join_index_optimize_status.ok()) {
2027 ICING_LOG(WARNING)
2028 << "Failed to optimize qualified id join index. Error: "
2029 << qualified_id_join_index_optimize_status.error_message();
2030 should_rebuild_index = true;
2031 }
2032
2033 libtextclassifier3::Status embedding_index_optimize_status =
2034 embedding_index_->Optimize(document_store_.get(), schema_store_.get(),
2035 optimize_result.document_id_old_to_new,
2036 document_store_->last_added_document_id());
2037 if (!embedding_index_optimize_status.ok()) {
2038 ICING_LOG(WARNING) << "Failed to optimize embedding index. Error: "
2039 << embedding_index_optimize_status.error_message();
2040 should_rebuild_index = true;
2041 }
2042 }
2043 // If we received a DATA_LOSS error from OptimizeDocumentStore, we have a
2044 // valid document store, but it might be the old one or the new one. So throw
2045 // out the index data and rebuild from scratch.
2046 // Also rebuild index if DocumentStore::OptimizeInto hints to do so.
2047 // Likewise, if Index::Optimize failed, then attempt to recover the index by
2048 // rebuilding from scratch.
2049 // If ShouldRebuildIndex() returns true, we will also rebuild the index for
2050 // better performance.
2051 if (should_rebuild_index) {
2052 optimize_stats->set_index_restoration_mode(
2053 OptimizeStatsProto::FULL_INDEX_REBUILD);
2054 ICING_LOG(WARNING) << "Clearing the entire index!";
2055
2056 libtextclassifier3::Status index_clear_status = ClearAllIndices();
2057 if (!index_clear_status.ok()) {
2058 status = absl_ports::Annotate(
2059 absl_ports::InternalError("Failed to clear index."),
2060 index_clear_status.error_message());
2061 TransformStatus(status, result_status);
2062 optimize_stats->set_index_restoration_latency_ms(
2063 optimize_index_timer->GetElapsedMilliseconds());
2064 return result_proto;
2065 }
2066
2067 IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
2068 // DATA_LOSS means that we have successfully re-added content to the index.
2069 // Some indexed content was lost, but otherwise the index is in a valid
2070 // state and can be queried.
2071 if (!index_restoration_status.status.ok() &&
2072 !absl_ports::IsDataLoss(index_restoration_status.status)) {
2073 status = absl_ports::Annotate(
2074 absl_ports::InternalError(
2075 "Failed to reindex documents after optimization."),
2076 index_restoration_status.status.error_message());
2077
2078 TransformStatus(status, result_status);
2079 optimize_stats->set_index_restoration_latency_ms(
2080 optimize_index_timer->GetElapsedMilliseconds());
2081 return result_proto;
2082 }
2083 }
2084 optimize_stats->set_index_restoration_latency_ms(
2085 optimize_index_timer->GetElapsedMilliseconds());
2086
2087 // Read the optimize status to get the time that we last ran.
2088 std::string optimize_status_filename =
2089 absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
2090 FileBackedProto<OptimizeStatusProto> optimize_status_file(
2091 *filesystem_, optimize_status_filename);
2092 auto optimize_status_or = optimize_status_file.Read();
2093 int64_t current_time = clock_->GetSystemTimeMilliseconds();
2094 if (optimize_status_or.ok()) {
2095 // If we have trouble reading the status or this is the first time that
2096 // we've ever run, don't set this field.
2097 optimize_stats->set_time_since_last_optimize_ms(
2098 current_time - optimize_status_or.ValueOrDie()
2099 ->last_successful_optimize_run_time_ms());
2100 }
2101
2102 // Update the status for this run and write it.
2103 auto optimize_status = std::make_unique<OptimizeStatusProto>();
2104 optimize_status->set_last_successful_optimize_run_time_ms(current_time);
2105 auto write_status = optimize_status_file.Write(std::move(optimize_status));
2106 if (!write_status.ok()) {
2107 ICING_LOG(ERROR) << "Failed to write optimize status:\n"
2108 << write_status.error_message();
2109 }
2110
2111 // Flushes data to disk after doing optimization
2112 status = InternalPersistToDisk(PersistType::FULL);
2113 if (!status.ok()) {
2114 TransformStatus(status, result_status);
2115 return result_proto;
2116 }
2117
2118 int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
2119 optimize_stats->set_storage_size_after(
2120 Filesystem::SanitizeFileSize(after_size));
2121
2122 TransformStatus(doc_store_optimize_result_status, result_status);
2123 return result_proto;
2124 }
2125
GetOptimizeInfo()2126 GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
2127 ICING_VLOG(1) << "Getting optimize info from IcingSearchEngine";
2128
2129 GetOptimizeInfoResultProto result_proto;
2130 StatusProto* result_status = result_proto.mutable_status();
2131
2132 absl_ports::shared_lock l(&mutex_);
2133 if (!initialized_) {
2134 result_status->set_code(StatusProto::FAILED_PRECONDITION);
2135 result_status->set_message("IcingSearchEngine has not been initialized!");
2136 return result_proto;
2137 }
2138
2139 // Read the optimize status to get the time that we last ran.
2140 std::string optimize_status_filename =
2141 absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
2142 FileBackedProto<OptimizeStatusProto> optimize_status_file(
2143 *filesystem_, optimize_status_filename);
2144 auto optimize_status_or = optimize_status_file.Read();
2145 int64_t current_time = clock_->GetSystemTimeMilliseconds();
2146
2147 if (optimize_status_or.ok()) {
2148 // If we have trouble reading the status or this is the first time that
2149 // we've ever run, don't set this field.
2150 result_proto.set_time_since_last_optimize_ms(
2151 current_time - optimize_status_or.ValueOrDie()
2152 ->last_successful_optimize_run_time_ms());
2153 }
2154
2155 // Get stats from DocumentStore
2156 auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo();
2157 if (!doc_store_optimize_info_or.ok()) {
2158 TransformStatus(doc_store_optimize_info_or.status(), result_status);
2159 return result_proto;
2160 }
2161 DocumentStore::OptimizeInfo doc_store_optimize_info =
2162 doc_store_optimize_info_or.ValueOrDie();
2163 result_proto.set_optimizable_docs(doc_store_optimize_info.optimizable_docs);
2164
2165 if (doc_store_optimize_info.optimizable_docs == 0) {
2166 // Can return early since there's nothing to calculate on the index side
2167 result_proto.set_estimated_optimizable_bytes(0);
2168 result_status->set_code(StatusProto::OK);
2169 return result_proto;
2170 }
2171
2172 // Get stats from Index.
2173 auto index_elements_size_or = index_->GetElementsSize();
2174 if (!index_elements_size_or.ok()) {
2175 TransformStatus(index_elements_size_or.status(), result_status);
2176 return result_proto;
2177 }
2178 int64_t index_elements_size = index_elements_size_or.ValueOrDie();
2179 // TODO(b/273591938): add stats for blob store
2180 // TODO(b/259744228): add stats for integer index
2181
2182 // Sum up the optimizable sizes from DocumentStore and Index
2183 result_proto.set_estimated_optimizable_bytes(
2184 index_elements_size * doc_store_optimize_info.optimizable_docs /
2185 doc_store_optimize_info.total_docs +
2186 doc_store_optimize_info.estimated_optimizable_bytes);
2187
2188 result_status->set_code(StatusProto::OK);
2189 return result_proto;
2190 }
2191
GetStorageInfo()2192 StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
2193 StorageInfoResultProto result;
2194 absl_ports::shared_lock l(&mutex_);
2195 if (!initialized_) {
2196 result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
2197 result.mutable_status()->set_message(
2198 "IcingSearchEngine has not been initialized!");
2199 return result;
2200 }
2201
2202 int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
2203 result.mutable_storage_info()->set_total_storage_size(
2204 Filesystem::SanitizeFileSize(index_size));
2205 *result.mutable_storage_info()->mutable_document_storage_info() =
2206 document_store_->GetStorageInfo();
2207 *result.mutable_storage_info()->mutable_schema_store_storage_info() =
2208 schema_store_->GetStorageInfo();
2209 *result.mutable_storage_info()->mutable_index_storage_info() =
2210 index_->GetStorageInfo();
2211 if (blob_store_ != nullptr) {
2212 auto namespace_blob_storage_infos_or = blob_store_->GetStorageInfo();
2213 if (!namespace_blob_storage_infos_or.ok()) {
2214 result.mutable_status()->set_code(StatusProto::INTERNAL);
2215 result.mutable_status()->set_message(
2216 namespace_blob_storage_infos_or.status().error_message());
2217 return result;
2218 }
2219 std::vector<NamespaceBlobStorageInfoProto> namespace_blob_storage_infos =
2220 std::move(namespace_blob_storage_infos_or).ValueOrDie();
2221
2222 for (NamespaceBlobStorageInfoProto& namespace_blob_storage_info :
2223 namespace_blob_storage_infos) {
2224 *result.mutable_storage_info()
2225 ->mutable_namespace_blob_storage_info()
2226 ->Add() = std::move(namespace_blob_storage_info);
2227 }
2228 }
2229 // TODO(b/259744228): add stats for integer index
2230 result.mutable_status()->set_code(StatusProto::OK);
2231 return result;
2232 }
2233
GetDebugInfo(DebugInfoVerbosity::Code verbosity)2234 DebugInfoResultProto IcingSearchEngine::GetDebugInfo(
2235 DebugInfoVerbosity::Code verbosity) {
2236 DebugInfoResultProto debug_info;
2237 StatusProto* result_status = debug_info.mutable_status();
2238 absl_ports::shared_lock l(&mutex_);
2239 if (!initialized_) {
2240 debug_info.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
2241 debug_info.mutable_status()->set_message(
2242 "IcingSearchEngine has not been initialized!");
2243 return debug_info;
2244 }
2245
2246 // Index
2247 *debug_info.mutable_debug_info()->mutable_index_info() =
2248 index_->GetDebugInfo(verbosity);
2249
2250 // TODO(b/259744228): add debug info for integer index
2251
2252 // Document Store
2253 libtextclassifier3::StatusOr<DocumentDebugInfoProto> document_debug_info =
2254 document_store_->GetDebugInfo(verbosity);
2255 if (!document_debug_info.ok()) {
2256 TransformStatus(document_debug_info.status(), result_status);
2257 return debug_info;
2258 }
2259 *debug_info.mutable_debug_info()->mutable_document_info() =
2260 std::move(document_debug_info).ValueOrDie();
2261
2262 // Schema Store
2263 libtextclassifier3::StatusOr<SchemaDebugInfoProto> schema_debug_info =
2264 schema_store_->GetDebugInfo();
2265 if (!schema_debug_info.ok()) {
2266 TransformStatus(schema_debug_info.status(), result_status);
2267 return debug_info;
2268 }
2269 *debug_info.mutable_debug_info()->mutable_schema_info() =
2270 std::move(schema_debug_info).ValueOrDie();
2271
2272 result_status->set_code(StatusProto::OK);
2273 return debug_info;
2274 }
2275
InternalPersistToDisk(PersistType::Code persist_type)2276 libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk(
2277 PersistType::Code persist_type) {
2278 if (blob_store_ != nullptr) {
2279 // For all valid PersistTypes, we persist the ground truth. The ground truth
2280 // in the blob_store is a proto log file, which is need to be called when
2281 // persist_type is LITE.
2282 ICING_RETURN_IF_ERROR(blob_store_->PersistToDisk());
2283 }
2284 ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(persist_type));
2285 if (persist_type == PersistType::RECOVERY_PROOF) {
2286 // Persist RECOVERY_PROOF will persist the ground truth and then update all
2287 // checksums. There is no need to call document_store_->UpdateChecksum()
2288 // because PersistToDisk(RECOVERY_PROOF) will update the checksum anyways.
2289 ICING_RETURN_IF_ERROR(schema_store_->UpdateChecksum());
2290 index_->UpdateChecksum();
2291 ICING_RETURN_IF_ERROR(integer_index_->UpdateChecksums());
2292 ICING_RETURN_IF_ERROR(qualified_id_join_index_->UpdateChecksums());
2293 ICING_RETURN_IF_ERROR(embedding_index_->UpdateChecksums());
2294 } else if (persist_type == PersistType::FULL) {
2295 ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
2296 ICING_RETURN_IF_ERROR(index_->PersistToDisk());
2297 ICING_RETURN_IF_ERROR(integer_index_->PersistToDisk());
2298 ICING_RETURN_IF_ERROR(qualified_id_join_index_->PersistToDisk());
2299 ICING_RETURN_IF_ERROR(embedding_index_->PersistToDisk());
2300 }
2301
2302 return libtextclassifier3::Status::OK;
2303 }
2304
Search(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2305 SearchResultProto IcingSearchEngine::Search(
2306 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2307 const ResultSpecProto& result_spec) {
2308 if (search_spec.use_read_only_search()) {
2309 return SearchLockedShared(search_spec, scoring_spec, result_spec);
2310 } else {
2311 return SearchLockedExclusive(search_spec, scoring_spec, result_spec);
2312 }
2313 }
2314
SearchLockedShared(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2315 SearchResultProto IcingSearchEngine::SearchLockedShared(
2316 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2317 const ResultSpecProto& result_spec) {
2318 std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2319
2320 // Only acquire an overall read-lock for this implementation. Finer-grained
2321 // locks are implemented around code paths that write changes to Icing's data
2322 // members.
2323 absl_ports::shared_lock l(&mutex_);
2324 int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
2325
2326 SearchResultProto result_proto =
2327 InternalSearch(search_spec, scoring_spec, result_spec);
2328
2329 result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
2330 lock_acquisition_latency);
2331 result_proto.mutable_query_stats()->set_latency_ms(
2332 overall_timer->GetElapsedMilliseconds());
2333 return result_proto;
2334 }
2335
SearchLockedExclusive(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2336 SearchResultProto IcingSearchEngine::SearchLockedExclusive(
2337 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2338 const ResultSpecProto& result_spec) {
2339 std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2340
2341 // Acquire the overall write-lock for this locked implementation.
2342 absl_ports::unique_lock l(&mutex_);
2343 int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
2344
2345 SearchResultProto result_proto =
2346 InternalSearch(search_spec, scoring_spec, result_spec);
2347
2348 result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
2349 lock_acquisition_latency);
2350 result_proto.mutable_query_stats()->set_latency_ms(
2351 overall_timer->GetElapsedMilliseconds());
2352 return result_proto;
2353 }
2354
InternalSearch(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec)2355 SearchResultProto IcingSearchEngine::InternalSearch(
2356 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2357 const ResultSpecProto& result_spec) {
2358 SearchResultProto result_proto;
2359 StatusProto* result_status = result_proto.mutable_status();
2360
2361 QueryStatsProto* query_stats = result_proto.mutable_query_stats();
2362 query_stats->set_is_first_page(true);
2363 query_stats->set_requested_page_size(result_spec.num_per_page());
2364
2365 // TODO(b/305098009): deprecate search-related flat fields in query_stats.
2366 query_stats->set_num_namespaces_filtered(
2367 search_spec.namespace_filters_size());
2368 query_stats->set_num_schema_types_filtered(
2369 search_spec.schema_type_filters_size());
2370 query_stats->set_query_length(search_spec.query().length());
2371 query_stats->set_ranking_strategy(scoring_spec.rank_by());
2372
2373 if (!initialized_) {
2374 result_status->set_code(StatusProto::FAILED_PRECONDITION);
2375 result_status->set_message("IcingSearchEngine has not been initialized!");
2376 return result_proto;
2377 }
2378 index_->PublishQueryStats(query_stats);
2379
2380 libtextclassifier3::Status status =
2381 ValidateResultSpec(document_store_.get(), result_spec);
2382 if (!status.ok()) {
2383 TransformStatus(status, result_status);
2384 return result_proto;
2385 }
2386 status = ValidateSearchSpec(search_spec, performance_configuration_);
2387 if (!status.ok()) {
2388 TransformStatus(status, result_status);
2389 return result_proto;
2390 }
2391 status = ValidateScoringSpec(scoring_spec);
2392 if (!status.ok()) {
2393 TransformStatus(status, result_status);
2394 return result_proto;
2395 }
2396
2397 const JoinSpecProto& join_spec = search_spec.join_spec();
2398 std::unique_ptr<JoinChildrenFetcher> join_children_fetcher;
2399 std::unique_ptr<ResultAdjustmentInfo> child_result_adjustment_info;
2400 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
2401 if (!join_spec.parent_property_expression().empty() &&
2402 !join_spec.child_property_expression().empty()) {
2403 query_stats->set_is_join_query(true);
2404 QueryStatsProto::SearchStats* child_search_stats =
2405 query_stats->mutable_child_search_stats();
2406
2407 // Build a child scoring spec by copying the parent schema type alias map.
2408 // Note that this function will transfer the ownership of data from
2409 // search_spec.join_spec().nested_spec().scoring_spec() to the
2410 // child_scoring_spec.
2411 // Hence, that following functions should NOT access
2412 // search_spec.join_spec().nested_spec().scoring_spec() after this function
2413 // call, but using child_scoring_spec instead.
2414 //
2415 // TODO(b/379288742): Avoid making the copy of the parent schema type alias
2416 // map.
2417 ScoringSpecProto child_scoring_spec = CopyParentSchemaTypeAliasMapToChild(
2418 scoring_spec, search_spec.join_spec().nested_spec().scoring_spec());
2419
2420 // Process child query
2421 // TODO(b/372541905): Validate the child search spec.
2422 QueryScoringResults nested_query_scoring_results = ProcessQueryAndScore(
2423 join_spec.nested_spec().search_spec(), child_scoring_spec,
2424 join_spec.nested_spec().result_spec(),
2425 /*join_children_fetcher=*/nullptr, current_time_ms, child_search_stats);
2426 if (!nested_query_scoring_results.status.ok()) {
2427 TransformStatus(nested_query_scoring_results.status, result_status);
2428 return result_proto;
2429 }
2430
2431 JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
2432 qualified_id_join_index_.get(),
2433 current_time_ms);
2434 // Building a JoinChildrenFetcher for looking up child documents by parent
2435 // document id.
2436 libtextclassifier3::StatusOr<std::unique_ptr<JoinChildrenFetcher>>
2437 join_children_fetcher_or = join_processor.GetChildrenFetcher(
2438 search_spec.join_spec(),
2439 std::move(nested_query_scoring_results.scored_document_hits));
2440 if (!join_children_fetcher_or.ok()) {
2441 TransformStatus(join_children_fetcher_or.status(), result_status);
2442 return result_proto;
2443 }
2444 join_children_fetcher = std::move(join_children_fetcher_or).ValueOrDie();
2445
2446 // Assign child's ResultAdjustmentInfo.
2447 child_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
2448 join_spec.nested_spec().search_spec(), child_scoring_spec,
2449 join_spec.nested_spec().result_spec(), schema_store_.get(),
2450 std::move(nested_query_scoring_results.query_terms));
2451 }
2452
2453 // Process parent query
2454 QueryStatsProto::SearchStats* parent_search_stats =
2455 query_stats->mutable_parent_search_stats();
2456 QueryScoringResults query_scoring_results = ProcessQueryAndScore(
2457 search_spec, scoring_spec, result_spec, join_children_fetcher.get(),
2458 current_time_ms, parent_search_stats);
2459 // TODO(b/305098009): deprecate search-related flat fields in query_stats.
2460 query_stats->set_num_terms(parent_search_stats->num_terms());
2461 query_stats->set_parse_query_latency_ms(
2462 parent_search_stats->parse_query_latency_ms());
2463 query_stats->set_scoring_latency_ms(
2464 parent_search_stats->scoring_latency_ms());
2465 query_stats->set_num_documents_scored(
2466 parent_search_stats->num_documents_scored());
2467 if (!query_scoring_results.status.ok()) {
2468 TransformStatus(query_scoring_results.status, result_status);
2469 return result_proto;
2470 }
2471
2472 // Returns early for empty result
2473 if (query_scoring_results.scored_document_hits.empty()) {
2474 result_status->set_code(StatusProto::OK);
2475 return result_proto;
2476 }
2477
2478 // Construct parent's result adjustment info.
2479 auto parent_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
2480 search_spec, scoring_spec, result_spec, schema_store_.get(),
2481 std::move(query_scoring_results.query_terms));
2482
2483 std::unique_ptr<ScoredDocumentHitsRanker> ranker;
2484 if (join_children_fetcher != nullptr) {
2485 std::unique_ptr<Timer> join_timer = clock_->GetNewTimer();
2486 // Join 2 scored document hits
2487 JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
2488 qualified_id_join_index_.get(),
2489 current_time_ms);
2490 libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
2491 joined_result_document_hits_or = join_processor.Join(
2492 join_spec, std::move(query_scoring_results.scored_document_hits),
2493 *join_children_fetcher);
2494 if (!joined_result_document_hits_or.ok()) {
2495 TransformStatus(joined_result_document_hits_or.status(), result_status);
2496 return result_proto;
2497 }
2498 std::vector<JoinedScoredDocumentHit> joined_result_document_hits =
2499 std::move(joined_result_document_hits_or).ValueOrDie();
2500
2501 query_stats->set_join_latency_ms(join_timer->GetElapsedMilliseconds());
2502
2503 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2504 // Ranks results
2505 ranker = std::make_unique<
2506 PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
2507 std::move(joined_result_document_hits),
2508 /*is_descending=*/scoring_spec.order_by() ==
2509 ScoringSpecProto::Order::DESC);
2510 query_stats->set_ranking_latency_ms(
2511 component_timer->GetElapsedMilliseconds());
2512 } else {
2513 // Non-join query
2514 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2515 // Ranks results
2516 ranker = std::make_unique<
2517 PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
2518 std::move(query_scoring_results.scored_document_hits),
2519 /*is_descending=*/scoring_spec.order_by() ==
2520 ScoringSpecProto::Order::DESC);
2521 query_stats->set_ranking_latency_ms(
2522 component_timer->GetElapsedMilliseconds());
2523 }
2524
2525 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2526 // CacheAndRetrieveFirstPage and retrieves the document protos and snippets if
2527 // requested
2528 auto result_retriever_or =
2529 ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
2530 language_segmenter_.get(), normalizer_.get());
2531 if (!result_retriever_or.ok()) {
2532 TransformStatus(result_retriever_or.status(), result_status);
2533 query_stats->set_document_retrieval_latency_ms(
2534 component_timer->GetElapsedMilliseconds());
2535 return result_proto;
2536 }
2537 std::unique_ptr<ResultRetrieverV2> result_retriever =
2538 std::move(result_retriever_or).ValueOrDie();
2539
2540 libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
2541 page_result_info_or = result_state_manager_->CacheAndRetrieveFirstPage(
2542 std::move(ranker), std::move(parent_result_adjustment_info),
2543 std::move(child_result_adjustment_info), result_spec,
2544 *document_store_, *result_retriever, current_time_ms);
2545 if (!page_result_info_or.ok()) {
2546 TransformStatus(page_result_info_or.status(), result_status);
2547 query_stats->set_document_retrieval_latency_ms(
2548 component_timer->GetElapsedMilliseconds());
2549 return result_proto;
2550 }
2551 std::pair<uint64_t, PageResult> page_result_info =
2552 std::move(page_result_info_or).ValueOrDie();
2553
2554 // Assembles the final search result proto
2555 result_proto.mutable_results()->Reserve(
2556 page_result_info.second.results.size());
2557
2558 int32_t child_count = 0;
2559 for (SearchResultProto::ResultProto& result :
2560 page_result_info.second.results) {
2561 child_count += result.joined_results_size();
2562 result_proto.mutable_results()->Add(std::move(result));
2563 }
2564
2565 result_status->set_code(StatusProto::OK);
2566 if (page_result_info.first != kInvalidNextPageToken) {
2567 result_proto.set_next_page_token(page_result_info.first);
2568 }
2569
2570 query_stats->set_document_retrieval_latency_ms(
2571 component_timer->GetElapsedMilliseconds());
2572 query_stats->set_num_results_returned_current_page(
2573 result_proto.results_size());
2574
2575 query_stats->set_num_joined_results_returned_current_page(child_count);
2576
2577 query_stats->set_num_results_with_snippets(
2578 page_result_info.second.num_results_with_snippets);
2579 return result_proto;
2580 }
2581
ProcessQueryAndScore(const SearchSpecProto & search_spec,const ScoringSpecProto & scoring_spec,const ResultSpecProto & result_spec,const JoinChildrenFetcher * join_children_fetcher,int64_t current_time_ms,QueryStatsProto::SearchStats * search_stats)2582 IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore(
2583 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
2584 const ResultSpecProto& result_spec,
2585 const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
2586 QueryStatsProto::SearchStats* search_stats) {
2587 search_stats->set_num_namespaces_filtered(
2588 search_spec.namespace_filters_size());
2589 search_stats->set_num_schema_types_filtered(
2590 search_spec.schema_type_filters_size());
2591 search_stats->set_query_length(search_spec.query().length());
2592 search_stats->set_ranking_strategy(scoring_spec.rank_by());
2593
2594 std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
2595
2596 // Gets unordered results from query processor
2597 auto query_processor_or = QueryProcessor::Create(
2598 index_.get(), integer_index_.get(), embedding_index_.get(),
2599 language_segmenter_.get(), normalizer_.get(), document_store_.get(),
2600 schema_store_.get(), join_children_fetcher, clock_.get(),
2601 &feature_flags_);
2602 if (!query_processor_or.ok()) {
2603 search_stats->set_parse_query_latency_ms(
2604 component_timer->GetElapsedMilliseconds());
2605 return QueryScoringResults(std::move(query_processor_or).status(),
2606 /*query_terms_in=*/{},
2607 /*scored_document_hits_in=*/{});
2608 }
2609 std::unique_ptr<QueryProcessor> query_processor =
2610 std::move(query_processor_or).ValueOrDie();
2611
2612 auto ranking_strategy_or = GetRankingStrategyFromScoringSpec(scoring_spec);
2613 libtextclassifier3::StatusOr<QueryResults> query_results_or;
2614 if (ranking_strategy_or.ok()) {
2615 query_results_or = query_processor->ParseSearch(
2616 search_spec, ranking_strategy_or.ValueOrDie(), current_time_ms,
2617 search_stats);
2618 } else {
2619 query_results_or = ranking_strategy_or.status();
2620 }
2621 search_stats->set_parse_query_latency_ms(
2622 component_timer->GetElapsedMilliseconds());
2623 if (!query_results_or.ok()) {
2624 return QueryScoringResults(std::move(query_results_or).status(),
2625 /*query_terms_in=*/{},
2626 /*scored_document_hits_in=*/{});
2627 }
2628 QueryResults query_results = std::move(query_results_or).ValueOrDie();
2629
2630 // Set SearchStats related to QueryResults.
2631 int term_count = 0;
2632 for (const auto& section_and_terms : query_results.query_terms) {
2633 term_count += section_and_terms.second.size();
2634 }
2635 search_stats->set_num_terms(term_count);
2636
2637 if (query_results.features_in_use.count(kNumericSearchFeature)) {
2638 search_stats->set_is_numeric_query(true);
2639 }
2640
2641 component_timer = clock_->GetNewTimer();
2642 // Scores but does not rank the results.
2643 libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
2644 scoring_processor_or = ScoringProcessor::Create(
2645 scoring_spec, /*default_semantic_metric_type=*/
2646 search_spec.embedding_query_metric_type(), document_store_.get(),
2647 schema_store_.get(), current_time_ms, join_children_fetcher,
2648 &query_results.embedding_query_results, &feature_flags_);
2649 if (!scoring_processor_or.ok()) {
2650 return QueryScoringResults(std::move(scoring_processor_or).status(),
2651 std::move(query_results.query_terms),
2652 /*scored_document_hits_in=*/{});
2653 }
2654 std::unique_ptr<ScoringProcessor> scoring_processor =
2655 std::move(scoring_processor_or).ValueOrDie();
2656 std::vector<ScoredDocumentHit> scored_document_hits =
2657 scoring_processor->Score(
2658 std::move(query_results.root_iterator), result_spec.num_to_score(),
2659 &query_results.query_term_iterators, search_stats);
2660 search_stats->set_scoring_latency_ms(
2661 component_timer->GetElapsedMilliseconds());
2662
2663 return QueryScoringResults(libtextclassifier3::Status::OK,
2664 std::move(query_results.query_terms),
2665 std::move(scored_document_hits));
2666 }
2667
GetNextPage(uint64_t next_page_token)2668 SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
2669 SearchResultProto result_proto;
2670 StatusProto* result_status = result_proto.mutable_status();
2671
2672 QueryStatsProto* query_stats = result_proto.mutable_query_stats();
2673 query_stats->set_is_first_page(false);
2674 std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
2675 // ResultStateManager has its own writer lock, so here we only need a reader
2676 // lock for other components.
2677 absl_ports::shared_lock l(&mutex_);
2678 query_stats->set_lock_acquisition_latency_ms(
2679 overall_timer->GetElapsedMilliseconds());
2680 if (!initialized_) {
2681 result_status->set_code(StatusProto::FAILED_PRECONDITION);
2682 result_status->set_message("IcingSearchEngine has not been initialized!");
2683 return result_proto;
2684 }
2685
2686 auto result_retriever_or =
2687 ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
2688 language_segmenter_.get(), normalizer_.get());
2689 if (!result_retriever_or.ok()) {
2690 TransformStatus(result_retriever_or.status(), result_status);
2691 return result_proto;
2692 }
2693 std::unique_ptr<ResultRetrieverV2> result_retriever =
2694 std::move(result_retriever_or).ValueOrDie();
2695
2696 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
2697 libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
2698 page_result_info_or = result_state_manager_->GetNextPage(
2699 next_page_token, *result_retriever, current_time_ms);
2700 if (!page_result_info_or.ok()) {
2701 if (absl_ports::IsNotFound(page_result_info_or.status())) {
2702 // NOT_FOUND means an empty result.
2703 result_status->set_code(StatusProto::OK);
2704 } else {
2705 // Real error, pass up.
2706 TransformStatus(page_result_info_or.status(), result_status);
2707 }
2708 return result_proto;
2709 }
2710
2711 std::pair<uint64_t, PageResult> page_result_info =
2712 std::move(page_result_info_or).ValueOrDie();
2713 query_stats->set_requested_page_size(
2714 page_result_info.second.requested_page_size);
2715
2716 // Assembles the final search result proto
2717 result_proto.mutable_results()->Reserve(
2718 page_result_info.second.results.size());
2719
2720 int32_t child_count = 0;
2721 for (SearchResultProto::ResultProto& result :
2722 page_result_info.second.results) {
2723 child_count += result.joined_results_size();
2724 result_proto.mutable_results()->Add(std::move(result));
2725 }
2726
2727 result_status->set_code(StatusProto::OK);
2728 if (page_result_info.first != kInvalidNextPageToken) {
2729 result_proto.set_next_page_token(page_result_info.first);
2730 }
2731
2732 // The only thing that we're doing is document retrieval. So document
2733 // retrieval latency and overall latency are the same and can use the same
2734 // timer.
2735 query_stats->set_document_retrieval_latency_ms(
2736 overall_timer->GetElapsedMilliseconds());
2737 query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds());
2738 query_stats->set_num_results_returned_current_page(
2739 result_proto.results_size());
2740 query_stats->set_num_results_with_snippets(
2741 page_result_info.second.num_results_with_snippets);
2742 query_stats->set_num_joined_results_returned_current_page(child_count);
2743
2744 return result_proto;
2745 }
2746
InvalidateNextPageToken(uint64_t next_page_token)2747 void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
2748 absl_ports::shared_lock l(&mutex_);
2749 if (!initialized_) {
2750 ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!";
2751 return;
2752 }
2753 result_state_manager_->InvalidateResultState(next_page_token);
2754 }
2755
OpenWriteBlob(const PropertyProto::BlobHandleProto & blob_handle)2756 BlobProto IcingSearchEngine::OpenWriteBlob(
2757 const PropertyProto::BlobHandleProto& blob_handle) {
2758 BlobProto blob_proto;
2759 StatusProto* status = blob_proto.mutable_status();
2760
2761 absl_ports::unique_lock l(&mutex_);
2762 if (blob_store_ == nullptr) {
2763 status->set_code(StatusProto::FAILED_PRECONDITION);
2764 status->set_message(
2765 "Open write blob is not supported in this Icing instance!");
2766 return blob_proto;
2767 }
2768
2769 if (!initialized_) {
2770 status->set_code(StatusProto::FAILED_PRECONDITION);
2771 status->set_message("IcingSearchEngine has not been initialized!");
2772 return blob_proto;
2773 }
2774
2775 libtextclassifier3::StatusOr<int> write_fd_or =
2776 blob_store_->OpenWrite(blob_handle);
2777 if (!write_fd_or.ok()) {
2778 TransformStatus(write_fd_or.status(), status);
2779 return blob_proto;
2780 }
2781 blob_proto.set_file_descriptor(write_fd_or.ValueOrDie());
2782 status->set_code(StatusProto::OK);
2783 return blob_proto;
2784 }
2785
RemoveBlob(const PropertyProto::BlobHandleProto & blob_handle)2786 BlobProto IcingSearchEngine::RemoveBlob(
2787 const PropertyProto::BlobHandleProto& blob_handle) {
2788 BlobProto blob_proto;
2789 StatusProto* status = blob_proto.mutable_status();
2790
2791 absl_ports::unique_lock l(&mutex_);
2792 if (blob_store_ == nullptr) {
2793 status->set_code(StatusProto::FAILED_PRECONDITION);
2794 status->set_message("Remove blob is not supported in this Icing instance!");
2795 return blob_proto;
2796 }
2797
2798 if (!initialized_) {
2799 status->set_code(StatusProto::FAILED_PRECONDITION);
2800 status->set_message("IcingSearchEngine has not been initialized!");
2801 return blob_proto;
2802 }
2803
2804 auto remove_result = blob_store_->RemoveBlob(blob_handle);
2805 if (!remove_result.ok()) {
2806 TransformStatus(remove_result, status);
2807 return blob_proto;
2808 }
2809 status->set_code(StatusProto::OK);
2810 return blob_proto;
2811 }
2812
OpenReadBlob(const PropertyProto::BlobHandleProto & blob_handle)2813 BlobProto IcingSearchEngine::OpenReadBlob(
2814 const PropertyProto::BlobHandleProto& blob_handle) {
2815 BlobProto blob_proto;
2816 StatusProto* status = blob_proto.mutable_status();
2817 absl_ports::shared_lock l(&mutex_);
2818 if (blob_store_ == nullptr) {
2819 status->set_code(StatusProto::FAILED_PRECONDITION);
2820 status->set_message(
2821 "Open read blob is not supported in this Icing instance!");
2822 return blob_proto;
2823 }
2824
2825 if (!initialized_) {
2826 status->set_code(StatusProto::FAILED_PRECONDITION);
2827 status->set_message("IcingSearchEngine has not been initialized!");
2828 ICING_LOG(ERROR) << status->message();
2829 return blob_proto;
2830 }
2831
2832 auto read_fd_or = blob_store_->OpenRead(blob_handle);
2833 if (!read_fd_or.ok()) {
2834 TransformStatus(read_fd_or.status(), status);
2835 return blob_proto;
2836 }
2837 blob_proto.set_file_descriptor(read_fd_or.ValueOrDie());
2838 status->set_code(StatusProto::OK);
2839 return blob_proto;
2840 }
2841
CommitBlob(const PropertyProto::BlobHandleProto & blob_handle)2842 BlobProto IcingSearchEngine::CommitBlob(
2843 const PropertyProto::BlobHandleProto& blob_handle) {
2844 BlobProto blob_proto;
2845 StatusProto* status = blob_proto.mutable_status();
2846 absl_ports::unique_lock l(&mutex_);
2847 if (blob_store_ == nullptr) {
2848 status->set_code(StatusProto::FAILED_PRECONDITION);
2849 status->set_message("Commit blob is not supported in this Icing instance!");
2850 return blob_proto;
2851 }
2852
2853 if (!initialized_) {
2854 status->set_code(StatusProto::FAILED_PRECONDITION);
2855 status->set_message("IcingSearchEngine has not been initialized!");
2856 ICING_LOG(ERROR) << status->message();
2857 return blob_proto;
2858 }
2859
2860 auto commit_result_or = blob_store_->CommitBlob(blob_handle);
2861 if (!commit_result_or.ok()) {
2862 TransformStatus(commit_result_or, status);
2863 return blob_proto;
2864 }
2865 status->set_code(StatusProto::OK);
2866 return blob_proto;
2867 }
2868
2869 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
OptimizeDocumentStore(std::unordered_set<std::string> && potentially_optimizable_blob_handles,OptimizeStatsProto * optimize_stats)2870 IcingSearchEngine::OptimizeDocumentStore(
2871 std::unordered_set<std::string>&& potentially_optimizable_blob_handles,
2872 OptimizeStatsProto* optimize_stats) {
2873 // Gets the current directory path and an empty tmp directory path for
2874 // document store optimization.
2875 const std::string current_document_dir =
2876 MakeDocumentDirectoryPath(options_.base_dir());
2877 const std::string temporary_document_dir =
2878 MakeDocumentTemporaryDirectoryPath(options_.base_dir());
2879 if (!filesystem_->DeleteDirectoryRecursively(
2880 temporary_document_dir.c_str()) ||
2881 !filesystem_->CreateDirectoryRecursively(
2882 temporary_document_dir.c_str())) {
2883 return absl_ports::AbortedError(absl_ports::StrCat(
2884 "Failed to create a tmp directory: ", temporary_document_dir));
2885 }
2886
2887 // Copies valid document data to tmp directory
2888 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
2889 optimize_result_or = document_store_->OptimizeInto(
2890 temporary_document_dir, language_segmenter_.get(),
2891 std::move(potentially_optimizable_blob_handles), optimize_stats);
2892
2893 // Handles error if any
2894 if (!optimize_result_or.ok()) {
2895 filesystem_->DeleteDirectoryRecursively(temporary_document_dir.c_str());
2896 return absl_ports::Annotate(
2897 absl_ports::AbortedError("Failed to optimize document store"),
2898 optimize_result_or.status().error_message());
2899 }
2900
2901 // result_state_manager_ depends on document_store_. So we need to reset it at
2902 // the same time that we reset the document_store_.
2903 result_state_manager_.reset();
2904 document_store_.reset();
2905
2906 // When swapping files, always put the current working directory at the
2907 // second place because it is renamed at the latter position so we're less
2908 // vulnerable to errors.
2909 if (!filesystem_->SwapFiles(temporary_document_dir.c_str(),
2910 current_document_dir.c_str())) {
2911 ICING_LOG(ERROR) << "Failed to swap files";
2912
2913 // Ensures that current directory is still present.
2914 if (!filesystem_->CreateDirectoryRecursively(
2915 current_document_dir.c_str())) {
2916 // Can't even create the old directory. Mark as uninitialized and return
2917 // INTERNAL.
2918 initialized_ = false;
2919 return absl_ports::InternalError(
2920 "Failed to create file directory for document store");
2921 }
2922
2923 // Tries to rebuild document store if swapping fails, to avoid leaving the
2924 // system in the broken state for future operations.
2925 auto create_result_or = DocumentStore::Create(
2926 filesystem_.get(), current_document_dir, clock_.get(),
2927 schema_store_.get(), &feature_flags_,
2928 /*force_recovery_and_revalidate_documents=*/false,
2929 /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
2930 options_.compression_level(), /*initialize_stats=*/nullptr);
2931 // TODO(b/144458732): Implement a more robust version of
2932 // TC_ASSIGN_OR_RETURN that can support error logging.
2933 if (!create_result_or.ok()) {
2934 // Unable to create DocumentStore from the old file. Mark as uninitialized
2935 // and return INTERNAL.
2936 initialized_ = false;
2937 ICING_LOG(ERROR) << "Failed to create document store instance";
2938 return absl_ports::Annotate(
2939 absl_ports::InternalError("Failed to create document store instance"),
2940 create_result_or.status().error_message());
2941 }
2942 document_store_ = std::move(create_result_or.ValueOrDie().document_store);
2943 result_state_manager_ = std::make_unique<ResultStateManager>(
2944 performance_configuration_.max_num_total_hits, *document_store_);
2945
2946 // Potential data loss
2947 // TODO(b/147373249): Find a way to detect true data loss error
2948 return absl_ports::DataLossError(
2949 "Failed to optimize document store, there might be data loss");
2950 }
2951
2952 // Recreates the doc store instance
2953 auto create_result_or = DocumentStore::Create(
2954 filesystem_.get(), current_document_dir, clock_.get(),
2955 schema_store_.get(), &feature_flags_,
2956 /*force_recovery_and_revalidate_documents=*/false,
2957 /*pre_mapping_fbv=*/false, /*use_persistent_hash_map=*/true,
2958 options_.compression_level(), /*initialize_stats=*/nullptr);
2959 if (!create_result_or.ok()) {
2960 // Unable to create DocumentStore from the new file. Mark as uninitialized
2961 // and return INTERNAL.
2962 initialized_ = false;
2963 return absl_ports::InternalError(
2964 "Document store has been optimized, but a valid document store "
2965 "instance can't be created");
2966 }
2967 DocumentStore::CreateResult create_result =
2968 std::move(create_result_or).ValueOrDie();
2969 document_store_ = std::move(create_result.document_store);
2970 result_state_manager_ = std::make_unique<ResultStateManager>(
2971 performance_configuration_.max_num_total_hits, *document_store_);
2972
2973 // Deletes tmp directory
2974 if (!filesystem_->DeleteDirectoryRecursively(
2975 temporary_document_dir.c_str())) {
2976 ICING_LOG(ERROR) << "Document store has been optimized, but it failed to "
2977 "delete temporary file directory";
2978 }
2979
2980 // Since we created new (optimized) document store with correct PersistToDisk
2981 // call, we shouldn't have data loss or regenerate derived files. Therefore,
2982 // if we really encounter any of these situations, then return DataLossError
2983 // to let the caller rebuild index.
2984 if (create_result.data_loss != DataLoss::NONE ||
2985 create_result.derived_files_regenerated) {
2986 return absl_ports::DataLossError(
2987 "Unexpected data loss or derived files regenerated for new document "
2988 "store");
2989 }
2990
2991 return optimize_result_or;
2992 }
2993
2994 IcingSearchEngine::IndexRestorationResult
RestoreIndexIfNeeded()2995 IcingSearchEngine::RestoreIndexIfNeeded() {
2996 DocumentId last_stored_document_id =
2997 document_store_->last_added_document_id();
2998 if (last_stored_document_id == index_->last_added_document_id() &&
2999 last_stored_document_id == integer_index_->last_added_document_id() &&
3000 last_stored_document_id ==
3001 qualified_id_join_index_->last_added_document_id() &&
3002 last_stored_document_id == embedding_index_->last_added_document_id()) {
3003 // No need to recover.
3004 return {libtextclassifier3::Status::OK, false, false, false, false};
3005 }
3006
3007 if (last_stored_document_id == kInvalidDocumentId) {
3008 // Document store is empty but index is not. Clear the index.
3009 return {ClearAllIndices(), false, false, false, false};
3010 }
3011
3012 // Truncate indices first.
3013 auto truncate_result_or = TruncateIndicesTo(last_stored_document_id);
3014 if (!truncate_result_or.ok()) {
3015 return {std::move(truncate_result_or).status(), false, false, false, false};
3016 }
3017 TruncateIndexResult truncate_result =
3018 std::move(truncate_result_or).ValueOrDie();
3019
3020 if (truncate_result.first_document_to_reindex > last_stored_document_id) {
3021 // Nothing to restore. Just return.
3022 return {libtextclassifier3::Status::OK, false, false, false, false};
3023 }
3024
3025 auto data_indexing_handlers_or = CreateDataIndexingHandlers();
3026 if (!data_indexing_handlers_or.ok()) {
3027 return {data_indexing_handlers_or.status(),
3028 truncate_result.index_needed_restoration,
3029 truncate_result.integer_index_needed_restoration,
3030 truncate_result.qualified_id_join_index_needed_restoration,
3031 truncate_result.embedding_index_needed_restoration};
3032 }
3033 // By using recovery_mode for IndexProcessor, we're able to replay documents
3034 // from smaller document id and it will skip documents that are already been
3035 // indexed.
3036 IndexProcessor index_processor(
3037 std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get(),
3038 /*recovery_mode=*/true);
3039
3040 ICING_VLOG(1) << "Restoring index by replaying documents from document id "
3041 << truncate_result.first_document_to_reindex
3042 << " to document id " << last_stored_document_id;
3043 libtextclassifier3::Status overall_status;
3044 for (DocumentId document_id = truncate_result.first_document_to_reindex;
3045 document_id <= last_stored_document_id; ++document_id) {
3046 libtextclassifier3::StatusOr<DocumentProto> document_or =
3047 document_store_->Get(document_id);
3048
3049 if (!document_or.ok()) {
3050 if (absl_ports::IsInvalidArgument(document_or.status()) ||
3051 absl_ports::IsNotFound(document_or.status())) {
3052 // Skips invalid and non-existing documents.
3053 continue;
3054 } else {
3055 // Returns other errors
3056 return {document_or.status(), truncate_result.index_needed_restoration,
3057 truncate_result.integer_index_needed_restoration,
3058 truncate_result.qualified_id_join_index_needed_restoration,
3059 truncate_result.embedding_index_needed_restoration};
3060 }
3061 }
3062 DocumentProto document(std::move(document_or).ValueOrDie());
3063
3064 libtextclassifier3::StatusOr<TokenizedDocument> tokenized_document_or =
3065 TokenizedDocument::Create(schema_store_.get(),
3066 language_segmenter_.get(),
3067 std::move(document));
3068 if (!tokenized_document_or.ok()) {
3069 return {tokenized_document_or.status(),
3070 truncate_result.index_needed_restoration,
3071 truncate_result.integer_index_needed_restoration,
3072 truncate_result.qualified_id_join_index_needed_restoration,
3073 truncate_result.embedding_index_needed_restoration};
3074 }
3075 TokenizedDocument tokenized_document(
3076 std::move(tokenized_document_or).ValueOrDie());
3077
3078 // No valid old_document_id should be used here since we're in recovery mode
3079 // and there is no "existing document replacement/update".
3080 libtextclassifier3::Status status =
3081 index_processor.IndexDocument(tokenized_document, document_id,
3082 /*old_document_id=*/kInvalidDocumentId);
3083 if (!status.ok()) {
3084 if (!absl_ports::IsDataLoss(status)) {
3085 // Real error. Stop recovering and pass it up.
3086 return {status, truncate_result.index_needed_restoration,
3087 truncate_result.integer_index_needed_restoration,
3088 truncate_result.qualified_id_join_index_needed_restoration,
3089 truncate_result.embedding_index_needed_restoration};
3090 }
3091 // FIXME: why can we skip data loss error here?
3092 // Just a data loss. Keep trying to add the remaining docs, but report the
3093 // data loss when we're done.
3094 overall_status = status;
3095 }
3096 }
3097
3098 return {overall_status, truncate_result.index_needed_restoration,
3099 truncate_result.integer_index_needed_restoration,
3100 truncate_result.qualified_id_join_index_needed_restoration,
3101 truncate_result.embedding_index_needed_restoration};
3102 }
3103
LostPreviousSchema()3104 libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
3105 auto status_or = schema_store_->GetSchema();
3106 if (status_or.ok()) {
3107 // Found a schema.
3108 return false;
3109 }
3110
3111 if (!absl_ports::IsNotFound(status_or.status())) {
3112 // Any other type of error
3113 return status_or.status();
3114 }
3115
3116 // We know: We don't have a schema now.
3117 //
3118 // We know: If no documents have been added, then the last_added_document_id
3119 // will be invalid.
3120 //
3121 // So: If documents have been added before and we don't have a schema now,
3122 // then that means we must have had a schema at some point. Since we wouldn't
3123 // accept documents without a schema to validate them against.
3124 return document_store_->last_added_document_id() != kInvalidDocumentId;
3125 }
3126
3127 libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
CreateDataIndexingHandlers()3128 IcingSearchEngine::CreateDataIndexingHandlers() {
3129 std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
3130
3131 // Term index handler
3132 ICING_ASSIGN_OR_RETURN(
3133 std::unique_ptr<TermIndexingHandler> term_indexing_handler,
3134 TermIndexingHandler::Create(
3135 clock_.get(), normalizer_.get(), index_.get(),
3136 options_.build_property_existence_metadata_hits()));
3137 handlers.push_back(std::move(term_indexing_handler));
3138
3139 // Integer index handler
3140 ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler>
3141 integer_section_indexing_handler,
3142 IntegerSectionIndexingHandler::Create(
3143 clock_.get(), integer_index_.get()));
3144 handlers.push_back(std::move(integer_section_indexing_handler));
3145
3146 // Qualified id join index handler
3147 ICING_ASSIGN_OR_RETURN(
3148 std::unique_ptr<QualifiedIdJoinIndexingHandler>
3149 qualified_id_join_indexing_handler,
3150 QualifiedIdJoinIndexingHandler::Create(
3151 clock_.get(), document_store_.get(), qualified_id_join_index_.get()));
3152 handlers.push_back(std::move(qualified_id_join_indexing_handler));
3153
3154 // Embedding index handler
3155 ICING_ASSIGN_OR_RETURN(
3156 std::unique_ptr<EmbeddingIndexingHandler> embedding_indexing_handler,
3157 EmbeddingIndexingHandler::Create(clock_.get(), embedding_index_.get(),
3158 options_.enable_embedding_index()));
3159 handlers.push_back(std::move(embedding_indexing_handler));
3160 return handlers;
3161 }
3162
3163 libtextclassifier3::StatusOr<IcingSearchEngine::TruncateIndexResult>
TruncateIndicesTo(DocumentId last_stored_document_id)3164 IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) {
3165 // Attempt to truncate term index.
3166 // TruncateTo ensures that the index does not hold any data that is not
3167 // present in the ground truth. If the document store lost some documents,
3168 // TruncateTo will ensure that the index does not contain any hits from those
3169 // lost documents. If the index does not contain any hits for documents with
3170 // document id greater than last_stored_document_id, then TruncateTo will have
3171 // no effect.
3172 ICING_RETURN_IF_ERROR(index_->TruncateTo(last_stored_document_id));
3173
3174 // Get last indexed document id for term index after truncating.
3175 DocumentId term_index_last_added_document_id =
3176 index_->last_added_document_id();
3177 DocumentId first_document_to_reindex =
3178 (term_index_last_added_document_id != kInvalidDocumentId)
3179 ? term_index_last_added_document_id + 1
3180 : kMinDocumentId;
3181 bool index_needed_restoration =
3182 (last_stored_document_id != term_index_last_added_document_id);
3183
3184 // Attempt to truncate integer index.
3185 bool integer_index_needed_restoration = false;
3186 DocumentId integer_index_last_added_document_id =
3187 integer_index_->last_added_document_id();
3188 if (integer_index_last_added_document_id == kInvalidDocumentId ||
3189 last_stored_document_id > integer_index_last_added_document_id) {
3190 // If last_stored_document_id is greater than
3191 // integer_index_last_added_document_id, then we only have to replay docs
3192 // starting from integer_index_last_added_document_id + 1. Also use std::min
3193 // since we might need to replay even smaller doc ids for term index.
3194 integer_index_needed_restoration = true;
3195 if (integer_index_last_added_document_id != kInvalidDocumentId) {
3196 first_document_to_reindex = std::min(
3197 first_document_to_reindex, integer_index_last_added_document_id + 1);
3198 } else {
3199 first_document_to_reindex = kMinDocumentId;
3200 }
3201 } else if (last_stored_document_id < integer_index_last_added_document_id) {
3202 // Clear the entire integer index if last_stored_document_id is smaller than
3203 // integer_index_last_added_document_id, because there is no way to remove
3204 // data with doc_id > last_stored_document_id from integer index and we have
3205 // to rebuild.
3206 ICING_RETURN_IF_ERROR(integer_index_->Clear());
3207
3208 // Since the entire integer index is discarded, we start to rebuild it by
3209 // setting first_document_to_reindex to kMinDocumentId.
3210 integer_index_needed_restoration = true;
3211 first_document_to_reindex = kMinDocumentId;
3212 }
3213
3214 // Attempt to truncate qualified id join index
3215 bool qualified_id_join_index_needed_restoration = false;
3216 DocumentId qualified_id_join_index_last_added_document_id =
3217 qualified_id_join_index_->last_added_document_id();
3218 if (qualified_id_join_index_last_added_document_id == kInvalidDocumentId ||
3219 last_stored_document_id >
3220 qualified_id_join_index_last_added_document_id) {
3221 // If last_stored_document_id is greater than
3222 // qualified_id_join_index_last_added_document_id, then we only have to
3223 // replay docs starting from (qualified_id_join_index_last_added_document_id
3224 // + 1). Also use std::min since we might need to replay even smaller doc
3225 // ids for other components.
3226 qualified_id_join_index_needed_restoration = true;
3227 if (qualified_id_join_index_last_added_document_id != kInvalidDocumentId) {
3228 first_document_to_reindex =
3229 std::min(first_document_to_reindex,
3230 qualified_id_join_index_last_added_document_id + 1);
3231 } else {
3232 first_document_to_reindex = kMinDocumentId;
3233 }
3234 } else if (last_stored_document_id <
3235 qualified_id_join_index_last_added_document_id) {
3236 // Clear the entire qualified id join index if last_stored_document_id is
3237 // smaller than qualified_id_join_index_last_added_document_id, because
3238 // there is no way to remove data with doc_id > last_stored_document_id from
3239 // join index efficiently and we have to rebuild.
3240 ICING_RETURN_IF_ERROR(qualified_id_join_index_->Clear());
3241
3242 // Since the entire qualified id join index is discarded, we start to
3243 // rebuild it by setting first_document_to_reindex to kMinDocumentId.
3244 qualified_id_join_index_needed_restoration = true;
3245 first_document_to_reindex = kMinDocumentId;
3246 }
3247
3248 // Attempt to truncate embedding index
3249 bool embedding_index_needed_restoration = false;
3250 DocumentId embedding_index_last_added_document_id =
3251 embedding_index_->last_added_document_id();
3252 if (embedding_index_last_added_document_id == kInvalidDocumentId ||
3253 last_stored_document_id > embedding_index_last_added_document_id) {
3254 // If last_stored_document_id is greater than
3255 // embedding_index_last_added_document_id, then we only have to replay docs
3256 // starting from (embedding_index_last_added_document_id + 1). Also use
3257 // std::min since we might need to replay even smaller doc ids for other
3258 // components.
3259 embedding_index_needed_restoration = true;
3260 if (embedding_index_last_added_document_id != kInvalidDocumentId) {
3261 first_document_to_reindex =
3262 std::min(first_document_to_reindex,
3263 embedding_index_last_added_document_id + 1);
3264 } else {
3265 first_document_to_reindex = kMinDocumentId;
3266 }
3267 } else if (last_stored_document_id < embedding_index_last_added_document_id) {
3268 // Clear the entire embedding index if last_stored_document_id is
3269 // smaller than embedding_index_last_added_document_id, because
3270 // there is no way to remove data with doc_id > last_stored_document_id from
3271 // embedding index efficiently and we have to rebuild.
3272 ICING_RETURN_IF_ERROR(embedding_index_->Clear());
3273
3274 // Since the entire embedding index is discarded, we start to
3275 // rebuild it by setting first_document_to_reindex to kMinDocumentId.
3276 embedding_index_needed_restoration = true;
3277 first_document_to_reindex = kMinDocumentId;
3278 }
3279
3280 return TruncateIndexResult(first_document_to_reindex,
3281 index_needed_restoration,
3282 integer_index_needed_restoration,
3283 qualified_id_join_index_needed_restoration,
3284 embedding_index_needed_restoration);
3285 }
3286
DiscardDerivedFiles(const version_util::DerivedFilesRebuildResult & rebuild_result)3287 libtextclassifier3::Status IcingSearchEngine::DiscardDerivedFiles(
3288 const version_util::DerivedFilesRebuildResult& rebuild_result) {
3289 if (!rebuild_result.IsRebuildNeeded()) {
3290 return libtextclassifier3::Status::OK;
3291 }
3292
3293 if (schema_store_ != nullptr || document_store_ != nullptr ||
3294 index_ != nullptr || integer_index_ != nullptr ||
3295 qualified_id_join_index_ != nullptr || embedding_index_ != nullptr) {
3296 return absl_ports::FailedPreconditionError(
3297 "Cannot discard derived files while having valid instances");
3298 }
3299
3300 // Schema store
3301 if (rebuild_result.needs_schema_store_derived_files_rebuild) {
3302 ICING_RETURN_IF_ERROR(SchemaStore::DiscardDerivedFiles(
3303 filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir())));
3304 }
3305
3306 // Document store
3307 if (rebuild_result.needs_document_store_derived_files_rebuild) {
3308 ICING_RETURN_IF_ERROR(DocumentStore::DiscardDerivedFiles(
3309 filesystem_.get(), MakeDocumentDirectoryPath(options_.base_dir())));
3310 }
3311
3312 // Term index
3313 if (rebuild_result.needs_term_index_rebuild) {
3314 if (!filesystem_->DeleteDirectoryRecursively(
3315 MakeIndexDirectoryPath(options_.base_dir()).c_str())) {
3316 return absl_ports::InternalError("Failed to discard index");
3317 }
3318 }
3319
3320 // Integer index
3321 if (rebuild_result.needs_integer_index_rebuild) {
3322 if (!filesystem_->DeleteDirectoryRecursively(
3323 MakeIntegerIndexWorkingPath(options_.base_dir()).c_str())) {
3324 return absl_ports::InternalError("Failed to discard integer index");
3325 }
3326 }
3327
3328 // Qualified id join index
3329 if (rebuild_result.needs_qualified_id_join_index_rebuild) {
3330 if (!filesystem_->DeleteDirectoryRecursively(
3331 MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()).c_str())) {
3332 return absl_ports::InternalError(
3333 "Failed to discard qualified id join index");
3334 }
3335 }
3336
3337 // Embedding index.
3338 if (rebuild_result.needs_embedding_index_rebuild) {
3339 ICING_RETURN_IF_ERROR(EmbeddingIndex::Discard(
3340 *filesystem_, MakeEmbeddingIndexWorkingPath(options_.base_dir())));
3341 }
3342
3343 return libtextclassifier3::Status::OK;
3344 }
3345
ClearSearchIndices()3346 libtextclassifier3::Status IcingSearchEngine::ClearSearchIndices() {
3347 ICING_RETURN_IF_ERROR(index_->Reset());
3348 ICING_RETURN_IF_ERROR(integer_index_->Clear());
3349 ICING_RETURN_IF_ERROR(embedding_index_->Clear());
3350 return libtextclassifier3::Status::OK;
3351 }
3352
ClearJoinIndices()3353 libtextclassifier3::Status IcingSearchEngine::ClearJoinIndices() {
3354 return qualified_id_join_index_->Clear();
3355 }
3356
ClearAllIndices()3357 libtextclassifier3::Status IcingSearchEngine::ClearAllIndices() {
3358 ICING_RETURN_IF_ERROR(ClearSearchIndices());
3359 ICING_RETURN_IF_ERROR(ClearJoinIndices());
3360 return libtextclassifier3::Status::OK;
3361 }
3362
Reset()3363 ResetResultProto IcingSearchEngine::Reset() {
3364 absl_ports::unique_lock l(&mutex_);
3365 return ResetInternal();
3366 }
3367
ResetInternal()3368 ResetResultProto IcingSearchEngine::ResetInternal() {
3369 ICING_VLOG(1) << "Resetting IcingSearchEngine";
3370
3371 ResetResultProto result_proto;
3372 StatusProto* result_status = result_proto.mutable_status();
3373
3374 initialized_ = false;
3375 ResetMembers();
3376 if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) {
3377 result_status->set_code(StatusProto::INTERNAL);
3378 return result_proto;
3379 }
3380
3381 if (InternalInitialize().status().code() != StatusProto::OK) {
3382 // We shouldn't hit the following Initialize errors:
3383 // NOT_FOUND: all data was cleared, we aren't expecting anything
3384 // DATA_LOSS: all data was cleared, we aren't expecting anything
3385 // RESOURCE_EXHAUSTED: just deleted files, shouldn't run out of space
3386 //
3387 // We can't tell if Initialize failed and left Icing in an inconsistent
3388 // state or if it was a temporary I/O error. Group everything under INTERNAL
3389 // to be safe.
3390 //
3391 // TODO(b/147699081): Once Initialize returns the proper ABORTED/INTERNAL
3392 // status code, we can just propagate it up from here.
3393 result_status->set_code(StatusProto::INTERNAL);
3394 return result_proto;
3395 }
3396
3397 result_status->set_code(StatusProto::OK);
3398 return result_proto;
3399 }
3400
SearchSuggestions(const SuggestionSpecProto & suggestion_spec)3401 SuggestionResponse IcingSearchEngine::SearchSuggestions(
3402 const SuggestionSpecProto& suggestion_spec) {
3403 // TODO(b/146008613) Explore ideas to make this function read-only.
3404 absl_ports::unique_lock l(&mutex_);
3405 SuggestionResponse response;
3406 StatusProto* response_status = response.mutable_status();
3407 if (!initialized_) {
3408 response_status->set_code(StatusProto::FAILED_PRECONDITION);
3409 response_status->set_message("IcingSearchEngine has not been initialized!");
3410 return response;
3411 }
3412
3413 libtextclassifier3::Status status =
3414 ValidateSuggestionSpec(suggestion_spec, performance_configuration_);
3415 if (!status.ok()) {
3416 TransformStatus(status, response_status);
3417 return response;
3418 }
3419
3420 // Create the suggestion processor.
3421 auto suggestion_processor_or = SuggestionProcessor::Create(
3422 index_.get(), integer_index_.get(), embedding_index_.get(),
3423 language_segmenter_.get(), normalizer_.get(), document_store_.get(),
3424 schema_store_.get(), clock_.get(), &feature_flags_);
3425 if (!suggestion_processor_or.ok()) {
3426 TransformStatus(suggestion_processor_or.status(), response_status);
3427 return response;
3428 }
3429 std::unique_ptr<SuggestionProcessor> suggestion_processor =
3430 std::move(suggestion_processor_or).ValueOrDie();
3431
3432 // Run suggestion based on given SuggestionSpec.
3433 int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
3434 libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or =
3435 suggestion_processor->QuerySuggestions(suggestion_spec, current_time_ms);
3436 if (!terms_or.ok()) {
3437 TransformStatus(terms_or.status(), response_status);
3438 return response;
3439 }
3440
3441 // Convert vector<TermMetaData> into final SuggestionResponse proto.
3442 for (TermMetadata& term : terms_or.ValueOrDie()) {
3443 SuggestionResponse::Suggestion suggestion;
3444 suggestion.set_query(std::move(term.content));
3445 response.mutable_suggestions()->Add(std::move(suggestion));
3446 }
3447 response_status->set_code(StatusProto::OK);
3448 return response;
3449 }
3450
3451 } // namespace lib
3452 } // namespace icing
3453