// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_ICING_SEARCH_ENGINE_H_
#define ICING_ICING_SEARCH_ENGINE_H_

#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_set>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/thread_annotations.h"
#include "icing/feature-flags.h"
#include "icing/file/filesystem.h"
#include "icing/file/version-util.h"
#include "icing/index/data-indexing-handler.h"
#include "icing/index/embed/embedding-index.h"
#include "icing/index/index.h"
#include "icing/index/numeric/numeric-index.h"
#include "icing/jni/jni-cache.h"
#include "icing/join/join-children-fetcher.h"
#include "icing/join/qualified-id-join-index.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/performance-configuration.h"
#include "icing/proto/blob.pb.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/logging.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/usage.pb.h"
#include "icing/query/query-terms.h"
#include "icing/result/result-state-manager.h"
#include "icing/schema/schema-store.h"
#include "icing/scoring/scored-document-hit.h"
#include "icing/store/blob-store.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
#include "icing/util/clock.h"

namespace icing {
namespace lib {

// Declares the public API of the Icing search engine: initialization, schema
// management, document put/get/delete, search and pagination, blob access,
// persistence, optimization and reset.
//
// TODO(cassiewang) Top-level comments and links to design-doc.
class IcingSearchEngine {
 public:
  // Constructs an engine from the given options.
  //
  // Note: It is only required to provide a pointer to a valid instance of
  // JniCache if this instance needs to perform reverse-jni calls. Users on
  // Linux and iOS should always provide a nullptr.
  explicit IcingSearchEngine(
      const IcingSearchEngineOptions& options,
      std::unique_ptr<const JniCache> jni_cache = nullptr);

  // Calculates integrity checks and persists files to disk.
  ~IcingSearchEngine();

  // Loads & verifies the contents previously indexed from disk and gets ready
  // to handle read/write requests.
  //
  // WARNING: This is expected to be fast if Icing had a clean shutdown.
  // Otherwise, it can take longer as it runs integrity checks and attempts
  // to bring the index to a consistent state. If the data on disk is not
  // consistent, it restores the state when PersistToDisk() was last called.
  //
  // TODO(cassiewang): We shouldn't return NOT_FOUND here, this is a symptom
  // of some other error. We should return a broader error group, i.e. data
  // inconsistency or something
  //
  // Returns:
  //   OK on success
  //   DATA_LOSS if encountered any inconsistencies in data and had to restore
  //     its state back to the last time PersistToDisk was called.
//     Or if any persisted data was lost and could not be recovered.
//   INTERNAL if any internal state was left in an inconsistent state. The
//     instance of IcingSearchEngine is unusable if this happens. It's
//     recommended to clear the underlying directory provided in
//     IcingSearchEngineOptions.base_dir and reinitialize.
//   RESOURCE_EXHAUSTED if there is not enough storage space
//   NOT_FOUND if some internal data is missing
InitializeResultProto Initialize() ICING_LOCKS_EXCLUDED(mutex_);

// Specifies the schema to be applied on all Documents that are already
// stored as well as future documents. A schema can be 'invalid' and/or
// 'incompatible'. These are two independent concepts.
//
// An 'invalid' schema is one that is not constructed properly. For example,
// a PropertyConfigProto is missing the property name field. A schema can be
// 'invalid' even if there is no previously existing schema.
//
// An 'incompatible' schema is one that is incompatible with a previously
// existing schema. If there is no previously existing schema, then a new
// schema cannot be incompatible. An incompatible schema is one that
// invalidates pre-existing data. For example, a previously OPTIONAL field is
// now REQUIRED in the new schema, and pre-existing data is considered invalid
// against the new schema now.
//
// Default behavior will not allow a new schema to be set if it is invalid or
// incompatible.
//
// The argument 'ignore_errors_and_delete_documents' can be set to true to
// force set an incompatible schema. In that case, documents that are
// invalidated by the new schema would be deleted from Icing. This cannot be
// used to force set an invalid schema.
//
// This schema is persisted to disk and used across multiple instances.
// So, callers should only have to call this if the schema changed.
// However, calling it multiple times with the same schema is a no-op.
//
// On some errors, Icing will keep using the older schema, but on
// INTERNAL_ERROR, it is undefined to continue using Icing.
//
// Returns:
//   OK on success
//   ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same
//     type or contains a type that has multiple properties with the same
//     name.
//   INVALID_ARGUMENT if 'new_schema' is invalid
//   FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
//     has not been initialized yet.
//   INTERNAL_ERROR if Icing failed to store the new schema or upgrade
//     existing data based on the new schema. Using Icing beyond this error is
//     undefined and may cause crashes.
//   DATA_LOSS_ERROR if 'new_schema' requires the index to be rebuilt and an
//     IO error leads to some documents being excluded from the index. These
//     documents will still be retrievable via Get, but won't match queries.
//
// TODO(cassiewang) Figure out, document (and maybe even enforce) the best
// ordering of calls between Initialize() and SetSchema(), both when
// the caller is creating an instance of IcingSearchEngine for the first
// time and when the caller is reinitializing an existing index on disk.
SetSchemaResultProto SetSchema(
    SchemaProto&& new_schema, bool ignore_errors_and_delete_documents = false)
    ICING_LOCKS_EXCLUDED(mutex_);

// This function makes a copy of the schema and calls SetSchema(SchemaProto&&
// new_schema, bool ignore_errors_and_delete_documents).
//
// NOTE: It's recommended to call SetSchema(SchemaProto&& new_schema, bool
// ignore_errors_and_delete_documents) directly to avoid a copy if the caller
// can make an rvalue SchemaProto.
SetSchemaResultProto SetSchema(const SchemaProto& new_schema,
                               bool ignore_errors_and_delete_documents =
                                   false) ICING_LOCKS_EXCLUDED(mutex_);

// Get Icing's current copy of the schema.
//
// Returns:
//   SchemaProto on success
//   NOT_FOUND if a schema has not been set yet
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet.
//   INTERNAL_ERROR on IO error
GetSchemaResultProto GetSchema() ICING_LOCKS_EXCLUDED(mutex_);

// Get Icing's current copy of the schema for the given database.
//
// NOTE: This is an expensive operation. It is recommended to call GetSchema()
// instead if you do not need to filter the schema by database, or if you're
// retrieving the only database in the schema.
//
// Returns:
//   SchemaProto on success
//   NOT_FOUND if a schema has not been set yet, or if the database is not
//     present in the schema
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet.
//   INTERNAL_ERROR on IO error
GetSchemaResultProto GetSchema(std::string_view database)
    ICING_LOCKS_EXCLUDED(mutex_);

// Get Icing's copy of the SchemaTypeConfigProto named schema_type.
//
// Returns:
//   SchemaTypeConfigProto on success
//   FAILED_PRECONDITION if a schema has not been set yet, or IcingSearchEngine
//     has not been initialized yet.
//   NOT_FOUND if there is no SchemaTypeConfig of schema_type in the
//     SchemaProto
//   INTERNAL_ERROR on IO error
GetSchemaTypeResultProto GetSchemaType(std::string_view schema_type)
    ICING_LOCKS_EXCLUDED(mutex_);

// Puts the document into icing search engine so that it's stored and
// indexed. Documents are automatically written to disk, callers can also
// call PersistToDisk() to flush changes immediately.
//
// Returns:
//   OK on success
//   OUT_OF_SPACE if the maximum number of allowed documents is exceeded
//   FAILED_PRECONDITION if a schema has not been set yet, or IcingSearchEngine
//     has not been initialized yet.
//   NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches
//     the document's schema
//   DATA_LOSS if an IO error occurs while merging document into the index and
//     the index is lost. These documents will still be retrievable via Get,
//     but won't match queries.
//   INTERNAL_ERROR on IO error
PutResultProto Put(DocumentProto&& document) ICING_LOCKS_EXCLUDED(mutex_);

// This function makes a copy of document and calls Put(DocumentProto&&
// document).
//
// NOTE: It's recommended to call Put(DocumentProto&& document) directly to
// avoid a copy if the caller can make an rvalue DocumentProto.
PutResultProto Put(const DocumentProto& document)
    ICING_LOCKS_EXCLUDED(mutex_);

// Finds and returns the document identified by the given key (namespace +
// uri).
//
// Returns:
//   The document found on success
//   NOT_FOUND if the key doesn't exist or doc has been deleted
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on IO error
//
// NOTE(review): unlike most public methods in this class, this declaration
// carries no ICING_LOCKS_EXCLUDED(mutex_) annotation - confirm whether that
// is intentional.
GetResultProto Get(std::string_view name_space, std::string_view uri,
                   const GetResultSpecProto& result_spec);

// Reports usage. The corresponding usage scores of the specified document in
// the report will be updated.
//
// Returns:
//   OK on success
//   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
//   INTERNAL_ERROR on I/O errors.
//
// NOTE(review): no ICING_LOCKS_EXCLUDED(mutex_) annotation here either -
// confirm whether that is intentional.
ReportUsageResultProto ReportUsage(const UsageReport& usage_report);

// Returns all the namespaces that have at least one valid document in it.
//
// Returns:
//   All namespaces on success
//
// NOTE(review): unlike most public methods in this class, this declaration
// carries no ICING_LOCKS_EXCLUDED(mutex_) annotation - confirm whether that
// is intentional.
GetAllNamespacesResultProto GetAllNamespaces();

// Deletes the Document specified by the given namespace / uri pair from the
// search engine. Delete changes are automatically applied to disk, callers
// can also call PersistToDisk() to flush changes immediately.
//
// NOTE: Space is not reclaimed for deleted documents until Optimize() is
// called.
//
// Returns:
//   OK on success
//   NOT_FOUND if no document exists with namespace, uri
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on IO error
DeleteResultProto Delete(std::string_view name_space, std::string_view uri)
    ICING_LOCKS_EXCLUDED(mutex_);

// Deletes all Documents belonging to the specified namespace from the search
// engine. Delete changes are automatically applied to disk, callers can also
// call PersistToDisk() to flush changes immediately.
//
// NOTE: Space is not reclaimed for deleted documents until Optimize() is
// called.
//
// Returns:
//   OK on success
//   NOT_FOUND if namespace doesn't exist
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on IO error
DeleteByNamespaceResultProto DeleteByNamespace(std::string_view name_space)
    ICING_LOCKS_EXCLUDED(mutex_);

// Deletes all Documents belonging to the specified type from the search
// engine. Delete changes are automatically applied to disk, callers can also
// call PersistToDisk() to flush changes immediately.
//
// NOTE: Space is not reclaimed for deleted documents until Optimize() is
// called.
//
// Returns:
//   OK on success
//   NOT_FOUND if schema type doesn't exist
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on IO error
DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
    ICING_LOCKS_EXCLUDED(mutex_);

// Deletes all Documents that match the query specified in search_spec. Delete
// changes are automatically applied to disk, callers can also call
// PersistToDisk() to flush changes immediately.
//
// NOTE: Space is not reclaimed for deleted documents until Optimize() is
// called.
//
// NOTE(review): return_deleted_document_info is not described here;
// presumably it controls whether information about the deleted documents is
// included in the result - confirm against the implementation.
//
// Returns:
//   OK on success
//   NOT_FOUND if the query doesn't match any documents
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on IO error
DeleteByQueryResultProto DeleteByQuery(
    const SearchSpecProto& search_spec,
    bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);

// Retrieves, scores, ranks, and returns the results according to the specs.
// Results can be empty. If there are multiple pages of results,
// SearchResultProto.next_page_token will be set to a non-zero token and can
// be used to fetch more pages via GetNextPage() method. Clients should call
// InvalidateNextPageToken() after they get the pages they need to release
// result cache in memory. Please refer to each proto file for spec
// definitions.
//
// Returns a SearchResultProto with status:
//   OK with results on success
//   INVALID_ARGUMENT if any of specs is invalid
//   ABORTED if failed to perform search but existing data is not affected
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on any other errors
SearchResultProto Search(const SearchSpecProto& search_spec,
                         const ScoringSpecProto& scoring_spec,
                         const ResultSpecProto& result_spec)
    ICING_LOCKS_EXCLUDED(mutex_);

// Retrieves, scores, ranks and returns the suggested query string according
// to the specs. Results can be empty.
//
// Returns a SuggestionResponse with status:
//   OK with results on success
//   INVALID_ARGUMENT if any of specs is invalid
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on any other errors
SuggestionResponse SearchSuggestions(
    const SuggestionSpecProto& suggestion_spec) ICING_LOCKS_EXCLUDED(mutex_);

// Fetches the next page of results of a previously executed query. Results
// can be empty if next-page token is invalid. Invalid next page tokens are
// tokens that are either zero or were previously passed to
// InvalidateNextPageToken. If there are pages of results remaining after the
// one retrieved by this call, SearchResultProto.next_page_token will be
// set to a non-zero token and can be used to fetch more pages via
// GetNextPage() method.
//
// Returns a SearchResultProto with status:
//   OK with results on success
//   ABORTED if failed to get results but existing data is not affected
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on any other errors
SearchResultProto GetNextPage(uint64_t next_page_token)
    ICING_LOCKS_EXCLUDED(mutex_);

// Invalidates the next-page token so that no more results of the related
// query can be returned.
void InvalidateNextPageToken(uint64_t next_page_token)
    ICING_LOCKS_EXCLUDED(mutex_);

// Gets or creates a file for write only purpose for the given blob handle.
// To mark the blob as completely written, CommitBlob must be called. Once
// CommitBlob is called, the blob is sealed and rewrite is not allowed.
//
// Returns:
//   File descriptor on success
//   InvalidArgumentError on invalid blob handle
//   FailedPreconditionError if the blob is already opened for write
//   AlreadyExistsError if the blob is committed
//   INTERNAL_ERROR on IO error
//
// NOTE(review): none of the four blob methods declared here carries an
// ICING_LOCKS_EXCLUDED(mutex_) annotation, unlike most public methods in
// this class - confirm whether that is intentional.
BlobProto OpenWriteBlob(const PropertyProto::BlobHandleProto& blob_handle);

// Removes a blob file and blob handle from the blob store.
//
// This will remove the blob in any state, no matter whether it is committed
// or not, or whether it has reference document links or not.
//
// Returns:
//   InvalidArgumentError on invalid blob handle
//   NotFoundError if the blob is not found
//   InternalError on IO error
BlobProto RemoveBlob(const PropertyProto::BlobHandleProto& blob_handle);

// Gets or creates a file for read only purpose for the given blob handle.
// The blob must be committed by calling CommitBlob, otherwise it is not
// accessible.
//
// Returns:
//   File descriptor on success
//   InvalidArgumentError on invalid blob handle
//   NotFoundError if the blob is not found or is not committed
BlobProto OpenReadBlob(const PropertyProto::BlobHandleProto& blob_handle);

// Commits the given blob, which was opened for write via OpenWriteBlob.
// Before the blob is committed, it is not visible to any reader via
// OpenReadBlob. After the blob is committed, it is not allowed to rewrite or
// update the content.
//
// Returns:
//   True if the blob is successfully committed.
//   False if the blob is already committed.
//   InvalidArgumentError on invalid blob handle or if the digest mismatches
//     the file content.
//   NotFoundError if the blob is not found.
BlobProto CommitBlob(const PropertyProto::BlobHandleProto& blob_handle);

// Makes sure that every update/delete received till this point is flushed
// to disk. If the app crashes after a call to PersistToDisk(), Icing
// would be able to fully recover all data written up to this point.
//
// If persist_type is PersistType::LITE, then only the ground truth will be
// synced. This should be relatively lightweight to do (order of microseconds)
// and ensures that there will be no data loss. At worst, Icing may need to
// recover internal data structures by replaying the document log upon the
// next startup. Clients should call PersistToDisk(LITE) after each batch of
// mutations.
//
// If persist_type is PersistType::FULL, then all internal data structures in
// Icing will be synced. This is a heavier operation (order of milliseconds).
// It ensures that Icing will not need to recover internal data structures
// upon the next startup. Clients should call PersistToDisk(FULL) before their
// process dies.
//
// NOTE: It is not necessary to call PersistToDisk() to read back data
// that was recently written. All read APIs will include the most recent
// updates/deletes regardless of the data being flushed to disk.
//
// Returns:
//   OK on success
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL on I/O error
PersistToDiskResultProto PersistToDisk(PersistType::Code persist_type)
    ICING_LOCKS_EXCLUDED(mutex_);

// Allows Icing to run tasks that are too expensive and/or unnecessary to be
// executed in real-time, but are useful to keep it fast and be
// resource-efficient.
// This method purely optimizes the internal files and has no functional
// impact on what gets accepted/returned.
//
// WARNING: This method is CPU and IO intensive and depending on the
// contents stored, it can take from a few seconds to a few minutes.
// This call also blocks all read/write operations on Icing.
//
// SUGGESTION: Assuming the client has no restrictions on their side, it's
// recommended to call this method about once every 24 hours when the
// device is idle and charging. It can also be called when the system needs
// to free up extra disk-space.
//
// Returns:
//   OK on success
//   ABORTED_ERROR if optimization is aborted due to non-fatal errors before
//     actual modifications are made.
//   DATA_LOSS_ERROR on errors that could potentially cause data loss,
//     IcingSearchEngine is still functioning.
//   INTERNAL_ERROR on any IO errors or other unrecoverable errors. Continued
//     use of Icing is undefined.
//     Clients could clear and reinitialize IcingSearchEngine.
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
OptimizeResultProto Optimize() ICING_LOCKS_EXCLUDED(mutex_);

// Returns potential size and document savings if Optimize were called.
//
// Returns:
//   OK on success
//   FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
//   INTERNAL_ERROR on IO error
GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_);

// Calculates the StorageInfo for Icing.
//
// If an IO error occurs while trying to calculate the value for a field, then
// that field will be set to -1.
StorageInfoResultProto GetStorageInfo() ICING_LOCKS_EXCLUDED(mutex_);

// Gets debug information for Icing at the requested verbosity.
DebugInfoResultProto GetDebugInfo(DebugInfoVerbosity::Code verbosity)
    ICING_LOCKS_EXCLUDED(mutex_);

// Clears all data from Icing and re-initializes.
// Clients DO NOT need to call Initialize again.
//
// Returns:
//   OK on success
//   ABORTED_ERROR if failed to delete underlying files
//   INTERNAL_ERROR if internal state is no longer consistent
ResetResultProto Reset() ICING_LOCKS_EXCLUDED(mutex_);

// Disallow copy and move. (Only the copy operations are deleted explicitly;
// with them user-declared, no move operations are implicitly generated.)
IcingSearchEngine(const IcingSearchEngine&) = delete;
IcingSearchEngine& operator=(const IcingSearchEngine&) = delete;

protected:
// Constructor that additionally takes the filesystem, legacy filesystem and
// clock instances to use.
// NOTE(review): presumably exposed as protected so that subclasses (e.g.
// test fixtures) can inject these dependencies - confirm.
IcingSearchEngine(IcingSearchEngineOptions options,
                  std::unique_ptr<const Filesystem> filesystem,
                  std::unique_ptr<const IcingFilesystem> icing_filesystem,
                  std::unique_ptr<Clock> clock,
                  std::unique_ptr<const JniCache> jni_cache = nullptr);

private:
const IcingSearchEngineOptions options_;
const FeatureFlags feature_flags_;
const std::unique_ptr<const Filesystem> filesystem_;
const std::unique_ptr<const IcingFilesystem> icing_filesystem_;
// False until set elsewhere; the public API documents FAILED_PRECONDITION
// for calls made before initialization.
bool initialized_ ICING_GUARDED_BY(mutex_) = false;

// Abstraction for accessing time values.
const std::unique_ptr<const Clock> clock_;

// Provides key thresholds that affect the running time and memory of major
// components in Icing search engine.
const PerformanceConfiguration performance_configuration_;

// Used to provide reader and writer locks
absl_ports::shared_mutex mutex_;

// Stores and processes the schema
std::unique_ptr<SchemaStore> schema_store_ ICING_GUARDED_BY(mutex_);

// Used to store all valid documents
//
// Dependencies: schema_store_
std::unique_ptr<DocumentStore> document_store_ ICING_GUARDED_BY(mutex_);

// Used to manage pagination state of query results. Even though
// ResultStateManager has its own reader-writer lock, mutex_ must still be
// acquired first in order to adhere to the global lock ordering:
//   1. mutex_
//   2.
//      result_state_manager_.lock_
//
// Dependencies: document_store_
std::unique_ptr<ResultStateManager> result_state_manager_
    ICING_GUARDED_BY(mutex_);

// Used to store all valid blob data
std::unique_ptr<BlobStore> blob_store_ ICING_GUARDED_BY(mutex_);

// Language segmenter instance.
// NOTE(review): presumably used for tokenization, per its header path -
// confirm.
std::unique_ptr<const LanguageSegmenter> language_segmenter_
    ICING_GUARDED_BY(mutex_);

// Normalizer instance.
// NOTE(review): presumably used for term normalization - confirm.
std::unique_ptr<const Normalizer> normalizer_ ICING_GUARDED_BY(mutex_);

// Storage for all hits of string contents from the document store.
std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_);

// Storage for all hits of numeric contents from the document store.
std::unique_ptr<NumericIndex<int64_t>> integer_index_
    ICING_GUARDED_BY(mutex_);

// Storage for all join qualified ids from the document store.
std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_
    ICING_GUARDED_BY(mutex_);

// Storage for all hits of embedding contents from the document store.
std::unique_ptr<EmbeddingIndex> embedding_index_ ICING_GUARDED_BY(mutex_);

// Pointer to JNI class references
const std::unique_ptr<const JniCache> jni_cache_;

// Resets all members that are created during Initialize.
void ResetMembers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Resets all members that are created during Initialize, deletes all
// underlying files and initializes a fresh index.
ResetResultProto ResetInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Checks for the existence of the init marker file. If the failed init count
// exceeds kMaxUnsuccessfulInitAttempts, all data is deleted and the index is
// initialized from scratch. The updated count (original failed init count +
// 1) is written to the marker file.
//
// Returns:
//   OK on success
//   INTERNAL if an IO error occurs while trying to update the marker file.
libtextclassifier3::Status CheckInitMarkerFile(
    InitializeStatsProto* initialize_stats)
    ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Helper method to do the actual work to persist data to disk. We need this
// separate method so that other public methods don't need to call
// PersistToDisk(). Public methods calling each other may cause deadlock
// issues.
libtextclassifier3::Status InternalPersistToDisk(
    PersistType::Code persist_type) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Helper method to do the actual work to Initialize. We need this separate
// method so that other public methods don't need to call Initialize(). Public
// methods calling each other may cause deadlock issues.
InitializeResultProto InternalInitialize()
    ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Helper method to initialize member variables.
//
// Returns:
//   OK on success
//   FAILED_PRECONDITION if initialize_stats is null
//   RESOURCE_EXHAUSTED if the index runs out of storage
//   NOT_FOUND if some Document's schema type is not in the SchemaStore
//   INTERNAL on any I/O errors
libtextclassifier3::Status InitializeMembers(
    InitializeStatsProto* initialize_stats)
    ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Do any initialization/recovery necessary to create a SchemaStore instance.
//
// Returns:
//   OK on success
//   FAILED_PRECONDITION if initialize_stats is null
//   INTERNAL on I/O error
libtextclassifier3::Status InitializeSchemaStore(
    InitializeStatsProto* initialize_stats)
    ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Do any initialization/recovery necessary to create a DocumentStore
// instance.
//
// See comments on DocumentStore::Create for explanation of
// force_recovery_and_revalidate_documents.
//
// Returns:
//   On success, a boolean flag indicating whether derived files of the
//     document store have been regenerated or not. If true, any other
//     components depending on them should also be rebuilt.
//   FAILED_PRECONDITION if initialize_stats is null
//   INTERNAL on I/O error
libtextclassifier3::StatusOr<bool> InitializeDocumentStore(
    bool force_recovery_and_revalidate_documents,
    InitializeStatsProto* initialize_stats)
    ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Do any initialization necessary to create a BlobStore instance.
//
// Returns:
//   OK on success
//   FAILED_PRECONDITION if initialize_stats is null
//
// NOTE(review): this declaration takes no initialize_stats parameter, so the
// FAILED_PRECONDITION entry above looks stale - confirm against the
// implementation. The two parameters are not described here either.
libtextclassifier3::Status InitializeBlobStore(
    int32_t orphan_blob_time_to_live_ms, int32_t compression_level)
    ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Do any initialization/recovery necessary to create term index, integer
// index, and qualified id join index instances.
//
// If document_store_derived_files_regenerated is true, then we have to
// rebuild qualified id join index since NamespaceIds were reassigned.
//
// Returns:
//   OK on success
//   FAILED_PRECONDITION if initialize_stats is null
//   RESOURCE_EXHAUSTED if the index runs out of storage
//   NOT_FOUND if some Document's schema type is not in the SchemaStore
//   INTERNAL on I/O error
libtextclassifier3::Status InitializeIndex(
    bool document_store_derived_files_regenerated,
    InitializeStatsProto* initialize_stats)
    ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);

// Implementation of IcingSearchEngine::Search that only grabs the overall
// read-lock, allowing for parallel non-exclusive operations.
// This implementation is used if search_spec.use_read_only_search is true.
SearchResultProto SearchLockedShared(const SearchSpecProto& search_spec,
                                     const ScoringSpecProto& scoring_spec,
                                     const ResultSpecProto& result_spec)
    ICING_LOCKS_EXCLUDED(mutex_);

// Implementation of IcingSearchEngine::Search that requires the overall
// write lock. No other operations of any kind can be executed in parallel if
// this version is used.
// This implementation is used if search_spec.use_read_only_search is false.
SearchResultProto SearchLockedExclusive(const SearchSpecProto& search_spec,
                                        const ScoringSpecProto& scoring_spec,
                                        const ResultSpecProto& result_spec)
    ICING_LOCKS_EXCLUDED(mutex_);

// Helper method for the actual work to Search. We need this separate
// method to manage locking for Search. The caller must already hold mutex_
// (at least shared), per the annotation below.
SearchResultProto InternalSearch(const SearchSpecProto& search_spec,
                                 const ScoringSpecProto& scoring_spec,
                                 const ResultSpecProto& result_spec)
    ICING_SHARED_LOCKS_REQUIRED(mutex_);

// Processes query and scores according to the specs. It is a helper function
// (called by Search) to process and score normal query and the nested child
// query for join search.
//
// Returns a QueryScoringResults
//   OK on success with a vector of ScoredDocumentHits,
//     SectionRestrictQueryTermsMap, and other stats fields for logging.
694 // Any other errors when processing the query or scoring 695 struct QueryScoringResults { 696 libtextclassifier3::Status status; 697 SectionRestrictQueryTermsMap query_terms; 698 std::vector<ScoredDocumentHit> scored_document_hits; 699 QueryScoringResultsQueryScoringResults700 explicit QueryScoringResults( 701 libtextclassifier3::Status status_in, 702 SectionRestrictQueryTermsMap&& query_terms_in, 703 std::vector<ScoredDocumentHit>&& scored_document_hits_in) 704 : status(std::move(status_in)), 705 query_terms(std::move(query_terms_in)), 706 scored_document_hits(std::move(scored_document_hits_in)) {} 707 }; 708 QueryScoringResults ProcessQueryAndScore( 709 const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec, 710 const ResultSpecProto& result_spec, 711 const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms, 712 QueryStatsProto::SearchStats* search_stats) 713 ICING_SHARED_LOCKS_REQUIRED(mutex_); 714 715 // Deletes documents propagated from the given deleted document ids via 716 // joinable properties with delete propagation enabled. 717 // 718 // Returns: 719 // Number of propagated documents deleted on success 720 // INTERNAL_ERROR on any I/O errors 721 libtextclassifier3::StatusOr<int> PropagateDelete( 722 const std::unordered_set<DocumentId>& deleted_document_ids, 723 int64_t current_time_ms) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 724 725 // Discards derived data that requires rebuild based on rebuild_result. 726 // 727 // Returns: 728 // OK on success 729 // FAILED_PRECONDITION_ERROR if those instances are valid (non nullptr) 730 // INTERNAL_ERROR on any I/O errors 731 libtextclassifier3::Status DiscardDerivedFiles( 732 const version_util::DerivedFilesRebuildResult& rebuild_result) 733 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 734 735 // Repopulates derived data off our ground truths. 
736 // 737 // Returns: 738 // OK on success 739 // INTERNAL_ERROR on any IO errors 740 libtextclassifier3::Status RegenerateDerivedFiles( 741 InitializeStatsProto* initialize_stats = nullptr, 742 bool log_document_store_stats = false) 743 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 744 745 // Optimizes the DocumentStore by removing any unneeded documents (i.e. 746 // deleted, expired, etc.) from the filesystem storage. 747 // 748 // NOTE: This may leave the DocumentStore in an invalid/uncreated state. Users 749 // would need call Initialize() to reinitialize everything into a valid state. 750 // 751 // Returns: 752 // On success, OptimizeResult which contains a vector mapping from old 753 // document id to new document id and another vector mapping from old 754 // namespace id to new namespace id. A value of kInvalidDocumentId indicates 755 // that the old document id has been deleted. 756 // ABORTED_ERROR if any error happens before the actual optimization, the 757 // original document store should be still available 758 // DATA_LOSS_ERROR on errors that could potentially cause data loss, 759 // document store is still available 760 // INTERNAL_ERROR on any IO errors or other errors that we can't recover 761 // from 762 libtextclassifier3::StatusOr<DocumentStore::OptimizeResult> 763 OptimizeDocumentStore(std::unordered_set<std::string>&& mature_blob_handles, 764 OptimizeStatsProto* optimize_stats) 765 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 766 767 // Helper method to restore missing document data in index_, integer_index_, 768 // and qualified_id_join_index_. All documents will be reindexed. This does 769 // not clear the index, so it is recommended to call ClearAllIndices, 770 // ClearSearchIndices, or ClearJoinIndices first if needed. 771 // 772 // Returns: 773 // On success, OK and a bool indicating whether or not restoration was 774 // needed. 775 // DATA_LOSS, if an error during index merging caused us to lose indexed 776 // data in the main index. 
Despite the data loss, this is still considered 777 // a successful run and needed_restoration will be set to true. 778 // RESOURCE_EXHAUSTED if the index fills up before finishing indexing 779 // NOT_FOUND if some Document's schema type is not in the SchemaStore 780 // INTERNAL_ERROR on any IO errors 781 struct IndexRestorationResult { 782 libtextclassifier3::Status status; 783 bool index_needed_restoration; 784 bool integer_index_needed_restoration; 785 bool qualified_id_join_index_needed_restoration; 786 bool embedding_index_needed_restoration; 787 }; 788 IndexRestorationResult RestoreIndexIfNeeded() 789 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 790 791 // If we lost the schema during a previous failure, it may "look" the same as 792 // not having a schema set before: we don't have a schema proto file. So do 793 // some extra checks to differentiate between having-lost the schema, and 794 // never having a schema before. This may determine if we need to do extra 795 // recovery steps. 796 // 797 // Returns: 798 // bool indicating if we had a schema and unintentionally lost it 799 // INTERNAL_ERROR on I/O error 800 libtextclassifier3::StatusOr<bool> LostPreviousSchema() 801 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 802 803 // Helper method to create all types of data indexing handlers to index term, 804 // integer, and join qualified ids. 805 libtextclassifier3::StatusOr< 806 std::vector<std::unique_ptr<DataIndexingHandler>>> 807 CreateDataIndexingHandlers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 808 809 // Helper method to discard parts of (term, integer, qualified id join) 810 // indices if they contain data for document ids greater than 811 // last_stored_document_id. 812 // 813 // REQUIRES: last_stored_document_id is valid (!= kInvalidDocumentId). Note: 814 // if we want to truncate everything in the index, then please call 815 // ClearSearchIndices/ClearJoinIndices/ClearAllIndices instead. 
816 // 817 // Returns: 818 // On success, a DocumentId indicating the first document to start for 819 // reindexing and 2 bool flags indicating whether term or integer index 820 // needs restoration. 821 // INTERNAL on any I/O errors 822 struct TruncateIndexResult { 823 DocumentId first_document_to_reindex; 824 bool index_needed_restoration; 825 bool integer_index_needed_restoration; 826 bool qualified_id_join_index_needed_restoration; 827 bool embedding_index_needed_restoration; 828 TruncateIndexResultTruncateIndexResult829 explicit TruncateIndexResult( 830 DocumentId first_document_to_reindex_in, 831 bool index_needed_restoration_in, 832 bool integer_index_needed_restoration_in, 833 bool qualified_id_join_index_needed_restoration_in, 834 bool embedding_index_needed_restoration_in) 835 : first_document_to_reindex(first_document_to_reindex_in), 836 index_needed_restoration(index_needed_restoration_in), 837 integer_index_needed_restoration(integer_index_needed_restoration_in), 838 qualified_id_join_index_needed_restoration( 839 qualified_id_join_index_needed_restoration_in), 840 embedding_index_needed_restoration( 841 embedding_index_needed_restoration_in) {} 842 }; 843 libtextclassifier3::StatusOr<TruncateIndexResult> TruncateIndicesTo( 844 DocumentId last_stored_document_id) 845 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 846 847 // Helper method to discard search (term, integer) indices. 848 // 849 // Returns: 850 // OK on success 851 // INTERNAL_ERROR on any I/O errors 852 libtextclassifier3::Status ClearSearchIndices() 853 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 854 855 // Helper method to discard join (qualified id) indices. 856 // 857 // Returns: 858 // OK on success 859 // INTERNAL_ERROR on any I/O errors 860 libtextclassifier3::Status ClearJoinIndices() 861 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 862 863 // Helper method to discard all search and join indices. 
864 // 865 // Returns: 866 // OK on success 867 // INTERNAL_ERROR on any I/O errors 868 libtextclassifier3::Status ClearAllIndices() 869 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 870 }; 871 872 } // namespace lib 873 } // namespace icing 874 875 #endif // ICING_ICING_SEARCH_ENGINE_H_ 876