1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_ 16 #define ICING_SCHEMA_SCHEMA_STORE_H_ 17 18 #include <cstdint> 19 #include <cstring> 20 #include <limits> 21 #include <memory> 22 #include <optional> 23 #include <string> 24 #include <string_view> 25 #include <unordered_map> 26 #include <unordered_set> 27 #include <utility> 28 #include <vector> 29 30 #include "icing/text_classifier/lib3/utils/base/status.h" 31 #include "icing/text_classifier/lib3/utils/base/statusor.h" 32 #include "icing/absl_ports/canonical_errors.h" 33 #include "icing/feature-flags.h" 34 #include "icing/file/file-backed-proto.h" 35 #include "icing/file/filesystem.h" 36 #include "icing/file/version-util.h" 37 #include "icing/proto/debug.pb.h" 38 #include "icing/proto/document.pb.h" 39 #include "icing/proto/logging.pb.h" 40 #include "icing/proto/schema.pb.h" 41 #include "icing/proto/search.pb.h" 42 #include "icing/proto/storage.pb.h" 43 #include "icing/schema/joinable-property.h" 44 #include "icing/schema/schema-type-manager.h" 45 #include "icing/schema/schema-util.h" 46 #include "icing/schema/scorable_property_manager.h" 47 #include "icing/schema/section.h" 48 #include "icing/store/document-filter-data.h" 49 #include "icing/store/key-mapper.h" 50 #include "icing/util/clock.h" 51 #include "icing/util/crc32.h" 52 #include "icing/util/status-macros.h" 53 54 namespace icing { 55 namespace lib { 56 57 
// Holds the ground truth schema proto. Tracks compatible changes to the schema 58 // and will update any derived data based on the schema proto, such as Sections, 59 // SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have 60 // the most up-to-date data, callers should not save instances themselves and 61 // should always call Get* from the SchemaStore. 62 class SchemaStore { 63 public: 64 struct LegacyHeader { 65 // Holds the magic as a quick sanity check against file corruption. 66 int32_t magic; 67 68 // Checksum of the SchemaStore's sub-component's checksums. 69 uint32_t checksum; 70 }; 71 72 class Header { 73 public: 74 static constexpr int32_t kMagic = 0x72650d0a; 75 Header(const Filesystem * filesystem,std::string path)76 explicit Header(const Filesystem* filesystem, std::string path) 77 : path_(std::move(path)), filesystem_(filesystem) {} 78 Header(Header && other)79 Header(Header&& other) 80 : serialized_header_(std::move(other.serialized_header_)), 81 path_(std::move(other.path_)), 82 header_fd_(std::move(other.header_fd_)), 83 filesystem_(other.filesystem_), 84 dirty_(other.dirty_) {} 85 86 Header& operator=(Header&& other) { 87 serialized_header_ = std::move(other.serialized_header_); 88 path_ = std::move(other.path_); 89 header_fd_ = std::move(other.header_fd_); 90 filesystem_ = other.filesystem_; 91 dirty_ = other.dirty_; 92 return *this; 93 } 94 95 struct SerializedHeader { SerializedHeaderSerializedHeader96 explicit SerializedHeader() 97 : magic(kMagic), 98 checksum(0), 99 overlay_created(false), 100 min_overlay_version_compatibility( 101 std::numeric_limits<int32_t>::max()) { 102 memset(overlay_created_padding, 0, kOverlayCreatedPaddingSize); 103 memset(padding, 0, kPaddingSize); 104 } 105 // Holds the magic as a quick sanity check against file corruption. 106 int32_t magic; 107 108 // Checksum of the SchemaStore's sub-component's checksums. 
109 uint32_t checksum; 110 111 bool overlay_created; 112 // Three bytes of padding due to the fact that 113 // min_overlay_version_compatibility_ has an alignof() == 4 and the offset 114 // of overlay_created_padding_ == 9. 115 static constexpr int kOverlayCreatedPaddingSize = 3; 116 uint8_t overlay_created_padding[kOverlayCreatedPaddingSize]; 117 118 int32_t min_overlay_version_compatibility; 119 120 static constexpr int kPaddingSize = 1008; 121 // Padding exists just to reserve space for additional values. 122 uint8_t padding[kPaddingSize]; 123 }; 124 static_assert(sizeof(SerializedHeader) == 1024); 125 126 // RETURNS: 127 // - On success, a valid Header instance 128 // - NOT_FOUND if header file doesn't exist 129 // - INTERNAL if unable to read header 130 static libtextclassifier3::StatusOr<Header> Read( 131 const Filesystem* filesystem, std::string path); 132 133 libtextclassifier3::Status Write(); 134 135 libtextclassifier3::Status PersistToDisk(); 136 magic()137 int32_t magic() const { return serialized_header_.magic; } 138 checksum()139 uint32_t checksum() const { return serialized_header_.checksum; } set_checksum(uint32_t checksum)140 void set_checksum(uint32_t checksum) { 141 dirty_ = true; 142 serialized_header_.checksum = checksum; 143 } 144 overlay_created()145 bool overlay_created() const { return serialized_header_.overlay_created; } 146 min_overlay_version_compatibility()147 int32_t min_overlay_version_compatibility() const { 148 return serialized_header_.min_overlay_version_compatibility; 149 } 150 SetOverlayInfo(bool overlay_created,int32_t min_overlay_version_compatibility)151 void SetOverlayInfo(bool overlay_created, 152 int32_t min_overlay_version_compatibility) { 153 dirty_ = true; 154 serialized_header_.overlay_created = overlay_created; 155 serialized_header_.min_overlay_version_compatibility = 156 min_overlay_version_compatibility; 157 } 158 159 private: Header(SerializedHeader serialized_header,std::string path,ScopedFd header_fd,const 
Filesystem * filesystem)160 explicit Header(SerializedHeader serialized_header, std::string path, 161 ScopedFd header_fd, const Filesystem* filesystem) 162 : serialized_header_(std::move(serialized_header)), 163 path_(std::move(path)), 164 header_fd_(std::move(header_fd)), 165 filesystem_(filesystem), 166 dirty_(false) {} 167 168 SerializedHeader serialized_header_; 169 std::string path_; 170 ScopedFd header_fd_; 171 const Filesystem* filesystem_; // Not owned. 172 bool dirty_; 173 }; 174 175 // Holds information on what may have been affected by the new schema. This is 176 // generally data that other classes may depend on from the SchemaStore, 177 // so that we can know if we should go update those classes as well. 178 struct SetSchemaResult { 179 // Whether we are able to write the schema as determined by SetSchema's 180 // arguments. This boolean reflects SetSchema's logic, and does not reflect 181 // any system level IO errors that may prevent the schema from being written 182 // to file. 183 bool success = false; 184 185 // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if: 186 // 1. Schema types are added in the middle of the SchemaProto 187 // 2. Schema types are removed from the middle of the SchemaProto 188 // 3. Schema types are reordered in the SchemaProto 189 // 190 // SchemaTypeIds are not changed if schema types are added/removed to the 191 // end of the SchemaProto. 192 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed; 193 194 // Schema types that have been removed from the new schema. Represented by 195 // the `schema_type` field in the SchemaTypeConfigProto. 196 std::unordered_set<std::string> schema_types_deleted_by_name; 197 198 // Schema types that have been removed from the new schema. Represented by 199 // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old* 200 // schema. 
201 std::unordered_set<SchemaTypeId> schema_types_deleted_by_id; 202 203 // Schema types whose SchemaTypeConfigProto has changed in an incompatible 204 // manner in the new schema. Compatibility determined in 205 // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type` 206 // field in the SchemaTypeConfigProto. 207 std::unordered_set<std::string> schema_types_incompatible_by_name; 208 209 // Schema types whose SchemaTypeConfigProto has changed in an incompatible 210 // manner in the new schema. Compatibility determined in 211 // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId 212 // assigned to this SchemaTypeConfigProto in the *old* schema. 213 std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id; 214 215 // Schema types that were added in the new schema. Represented by the 216 // `schema_type` field in the SchemaTypeConfigProto. 217 std::unordered_set<std::string> schema_types_new_by_name; 218 219 // Schema types that were changed in a way that was backwards compatible and 220 // didn't invalidate the index. Represented by the `schema_type` field in 221 // the SchemaTypeConfigProto. 222 std::unordered_set<std::string> 223 schema_types_changed_fully_compatible_by_name; 224 225 // Schema types that were changed in a way that was backwards compatible, 226 // but invalidated the index. Represented by the `schema_type` field in the 227 // SchemaTypeConfigProto. 228 std::unordered_set<std::string> schema_types_index_incompatible_by_name; 229 230 // Schema types that were changed in a way that was backwards compatible, 231 // but invalidated the joinable cache. Represented by the `schema_type` 232 // field in the SchemaTypeConfigProto. 233 std::unordered_set<std::string> schema_types_join_incompatible_by_name; 234 235 // Schema types that were changed in a way that was backwards compatible, 236 // but inconsistent with the old schema so that the scorable property cache 237 // needs to be re-generated. 
238 std::unordered_set<SchemaTypeId> 239 schema_types_scorable_property_inconsistent_by_id; 240 241 // Schema types that were changed in a way that was backwards compatible, 242 // but inconsistent with the old schema so that the scorable property cache 243 // needs to be re-generated. 244 std::unordered_set<std::string> 245 schema_types_scorable_property_inconsistent_by_name; 246 }; 247 248 struct ExpandedTypePropertyMask { 249 std::string schema_type; 250 std::unordered_set<std::string> paths; 251 }; 252 253 static constexpr std::string_view kSchemaTypeWildcard = "*"; 254 255 // Factory function to create a SchemaStore which does not take ownership 256 // of any input components, and all pointers must refer to valid objects that 257 // outlive the created SchemaStore instance. The base_dir must already exist. 258 // There does not need to be an existing schema already. 259 // 260 // If initialize_stats is present, the fields related to SchemaStore will be 261 // populated. 262 // 263 // Returns: 264 // A SchemaStore on success 265 // FAILED_PRECONDITION on any null pointer input 266 // INTERNAL_ERROR on any IO errors 267 static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( 268 const Filesystem* filesystem, const std::string& base_dir, 269 const Clock* clock, const FeatureFlags* feature_flags, 270 bool enable_schema_database = false, 271 InitializeStatsProto* initialize_stats = nullptr); 272 273 // Migrates schema files (backup v.s. new schema) according to version state 274 // change. Also performs schema database migration and populates the database 275 // fields in the persisted schema file if necessary. 
276 // 277 // Returns: 278 // OK on success or nothing to migrate 279 static libtextclassifier3::Status MigrateSchema( 280 const Filesystem* filesystem, const std::string& base_dir, 281 version_util::StateChange version_state_change, int32_t new_version, 282 bool perform_schema_database_migration); 283 284 // Discards all derived data in the schema store. 285 // 286 // Returns: 287 // OK on success or nothing to discard 288 // INTERNAL_ERROR on any I/O errors 289 static libtextclassifier3::Status DiscardDerivedFiles( 290 const Filesystem* filesystem, const std::string& base_dir); 291 292 SchemaStore(SchemaStore&&) = default; 293 SchemaStore& operator=(SchemaStore&&) = default; 294 295 SchemaStore(const SchemaStore&) = delete; 296 SchemaStore& operator=(const SchemaStore&) = delete; 297 298 // Persists and updates checksum of subcomponents. 299 ~SchemaStore(); 300 301 // Retrieve the current schema if it exists. 302 // 303 // Returns: 304 // - SchemaProto* if exists 305 // - INTERNAL_ERROR on any IO errors 306 // - NOT_FOUND_ERROR if a schema hasn't been set before 307 libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const; 308 309 // Retrieve the current schema for a given database if it exists. 310 // 311 // This is an expensive operation. Use GetSchema() when retrieving the entire 312 // schema, or if there is only a single database in the schema store. 313 // 314 // Returns: 315 // - SchemaProto* containing only schema types from the database, if exists 316 // - INTERNAL_ERROR on any IO errors 317 // - NOT_FOUND_ERROR if the database doesn't exist in the schema, or if a 318 // schema hasn't been set before 319 libtextclassifier3::StatusOr<SchemaProto> GetSchema( 320 const std::string& database) const; 321 322 // Update our current schema if it's compatible. Does not accept incompatible 323 // schema or schema with types from multiple databases. Compatibility rules 324 // defined by SchemaUtil::ComputeCompatibilityDelta. 
325 // 326 // The schema types in the new schema proto must all be from a single 327 // database. Does not support setting schema types across multiple databases 328 // at once. 329 // 330 // If ignore_errors_and_delete_documents is set to true, then incompatible 331 // schema are allowed and we'll force set the schema, meaning 332 // SetSchemaResult.success will always be true. 333 // 334 // Returns: 335 // - SetSchemaResult that encapsulates the differences between the old and 336 // new schema, as well as if the new schema can be set. 337 // - INTERNAL_ERROR on any IO errors 338 // - ALREADY_EXISTS_ERROR if type names in the new schema are already in use 339 // by a different database. 340 // - INVALID_ARGUMENT_ERROR if the schema is invalid, or if the schema types 341 // are from multiple databases (once schema database is enabled). 342 libtextclassifier3::StatusOr<SetSchemaResult> SetSchema( 343 const SchemaProto& new_schema, bool ignore_errors_and_delete_documents, 344 bool allow_circular_schema_definitions); 345 libtextclassifier3::StatusOr<SetSchemaResult> SetSchema( 346 SchemaProto&& new_schema, bool ignore_errors_and_delete_documents, 347 bool allow_circular_schema_definitions); 348 349 // Get the SchemaTypeConfigProto of schema_type name. 350 // 351 // Returns: 352 // SchemaTypeConfigProto on success 353 // FAILED_PRECONDITION if schema hasn't been set yet 354 // NOT_FOUND if schema type name doesn't exist 355 // INTERNAL on any I/O errors 356 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*> 357 GetSchemaTypeConfig(std::string_view schema_type) const; 358 359 // Get a map contains all schema_type name to its blob property paths. 
360 // 361 // Returns: 362 // A map contains all schema_type name to its blob property paths on success 363 // FAILED_PRECONDITION if schema hasn't been set yet 364 // INTERNAL on any I/O errors 365 libtextclassifier3::StatusOr< 366 std::unordered_map<std::string, std::vector<std::string>>> 367 ConstructBlobPropertyMap() const; 368 369 // Returns the schema type of the passed in SchemaTypeId 370 // 371 // Returns: 372 // schema type on success 373 // FAILED_PRECONDITION if schema hasn't been set yet 374 // INVALID_ARGUMENT if schema type id is invalid 375 libtextclassifier3::StatusOr<const std::string*> GetSchemaType( 376 SchemaTypeId schema_type_id) const; 377 378 // Returns the SchemaTypeId of the passed in schema type 379 // 380 // Returns: 381 // SchemaTypeId on success 382 // FAILED_PRECONDITION if schema hasn't been set yet 383 // NOT_FOUND_ERROR if we don't know about the schema type 384 // INTERNAL_ERROR on IO error 385 libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId( 386 std::string_view schema_type) const; 387 388 // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also 389 // include child types. 390 // 391 // Returns: 392 // A set of SchemaTypeId on success 393 // FAILED_PRECONDITION if schema hasn't been set yet 394 // NOT_FOUND_ERROR if we don't know about the schema type 395 // INTERNAL_ERROR on IO error 396 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*> 397 GetSchemaTypeIdsWithChildren(std::string_view schema_type) const; 398 399 // Returns the SectionMetadata associated with the SectionId that's in the 400 // SchemaTypeId. 
401 // 402 // Returns: 403 // Valid pointer to SectionMetadata on success 404 // FAILED_PRECONDITION if schema hasn't been set yet 405 // INVALID_ARGUMENT if schema type id or section id is invalid 406 libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( 407 SchemaTypeId schema_type_id, SectionId section_id) const; 408 409 // Returns true if a property is defined in the said schema, regardless of 410 // whether it is indexed or not. 411 bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id, 412 const std::string& property) const; 413 414 // Extracts all sections of different types from the given document and group 415 // them by type. 416 // - Each Section vector is sorted by section Id in ascending order. The 417 // sorted section ids may not be continuous, since not all sections are 418 // present in the document. 419 // - Sections with empty content won't be returned. 420 // - For example, we may extract: 421 // string_sections: [2, 7, 10] 422 // integer_sections: [3, 5, 8] 423 // 424 // Returns: 425 // A SectionGroup instance on success 426 // FAILED_PRECONDITION if schema hasn't been set yet 427 // NOT_FOUND if type config name of document not found 428 libtextclassifier3::StatusOr<SectionGroup> ExtractSections( 429 const DocumentProto& document) const; 430 431 // Returns the JoinablePropertyMetadata associated with property_path that's 432 // in the SchemaTypeId. 
433 // 434 // Returns: 435 // Valid pointer to JoinablePropertyMetadata on success 436 // nullptr if property_path doesn't exist (or is not joinable) in the 437 // joinable metadata list of the schema 438 // FAILED_PRECONDITION if schema hasn't been set yet 439 // INVALID_ARGUMENT if schema type id is invalid 440 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> 441 GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, 442 const std::string& property_path) const; 443 444 // Returns the JoinablePropertyMetadata associated with joinable_property_id 445 // that's in the SchemaTypeId. 446 // 447 // Returns: 448 // Valid pointer to JoinablePropertyMetadata on success 449 // FAILED_PRECONDITION if schema hasn't been set yet 450 // INVALID_ARGUMENT if schema type id or joinable property id is invalid 451 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> 452 GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, 453 JoinablePropertyId joinable_property_id) const; 454 455 // Extracts all joinable property contents of different types from the given 456 // document and group them by joinable value type. 457 // - Joinable properties are sorted by joinable property id in ascending 458 // order. The sorted joinable property ids may not be continuous, since not 459 // all joinable properties are present in the document. 460 // - Joinable property ids start from 0. 461 // - Joinable properties with empty content won't be returned. 462 // 463 // Returns: 464 // A JoinablePropertyGroup instance on success 465 // FAILED_PRECONDITION if schema hasn't been set yet 466 // NOT_FOUND if the type config name of document not found 467 libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties( 468 const DocumentProto& document) const; 469 470 // Returns the quantization type for the given schema_type_id and section_id. 471 // 472 // Returns: 473 // - The quantization type on success. 
474 // - INVALID_ARGUMENT_ERROR if schema_type_id or section_id is invalid. 475 // - Any error from schema store. 476 libtextclassifier3::StatusOr<EmbeddingIndexingConfig::QuantizationType::Code> GetQuantizationType(SchemaTypeId schema_type_id,SectionId section_id)477 GetQuantizationType(SchemaTypeId schema_type_id, SectionId section_id) const { 478 ICING_ASSIGN_OR_RETURN(const SectionMetadata* section_metadata, 479 GetSectionMetadata(schema_type_id, section_id)); 480 return section_metadata->quantization_type; 481 } 482 483 // Syncs all the data changes to disk. 484 // 485 // Returns: 486 // OK on success 487 // INTERNAL on I/O errors. 488 libtextclassifier3::Status PersistToDisk(); 489 490 // Recomputes the combined checksum of components of the schema store and 491 // updates the header. 492 // 493 // Returns: 494 // - the checksum on success 495 // - INTERNAL on I/O errors. 496 libtextclassifier3::StatusOr<Crc32> UpdateChecksum(); 497 498 // Recomputes the combined checksum of components of the schema store. Does 499 // NOT update the header. 500 // 501 // Returns: 502 // - the checksum on success 503 // - INTERNAL on I/O errors. 504 libtextclassifier3::StatusOr<Crc32> GetChecksum() const; 505 506 // Returns: 507 // - On success, the section metadata list for the specified schema type 508 // - NOT_FOUND if the schema type is not present in the schema 509 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> 510 GetSectionMetadata(const std::string& schema_type) const; 511 512 // Gets the index of the given |property_path|, where the index N means that 513 // it is the Nth scorable property path in the schema config of the given 514 // |schema_type_id|, in lexicographical order. 
515 // 516 // Returns: 517 // - Index on success 518 // - std::nullopt if the |property_path| doesn't point to a scorable 519 // property under the |schema_type_id| 520 // - FAILED_PRECONDITION if the schema hasn't been set yet 521 // - INVALID_ARGUMENT if |schema_type_id| is invalid 522 libtextclassifier3::StatusOr<std::optional<int>> GetScorablePropertyIndex( 523 SchemaTypeId schema_type_id, std::string_view property_path) const; 524 525 // Returns the list of ScorablePropertyInfo for the given |schema_type_id|, 526 // in lexicographical order of its property path. 527 // 528 // Returns: 529 // - Vector of scorable property info on success. The vector can be empty 530 // if no scorable property is found under the schema config of 531 // |schema_type_id|. 532 // - FAILED_PRECONDITION if the schema hasn't been set yet 533 // - INVALID_ARGUMENT if |schema_type_id| is invalid 534 libtextclassifier3::StatusOr< 535 const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*> 536 GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const; 537 538 // Calculates the StorageInfo for the Schema Store. 539 // 540 // If an IO error occurs while trying to calculate the value for a field, then 541 // that field will be set to -1. 542 SchemaStoreStorageInfoProto GetStorageInfo() const; 543 544 // Get debug information for the schema store. 545 // 546 // Returns: 547 // SchemaDebugInfoProto on success 548 // INTERNAL_ERROR on IO errors, crc compute error 549 libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const; 550 551 // Expands the provided type_property_masks into a vector of 552 // ExpandedTypePropertyMasks to account for polymorphism. If both a parent 553 // type and one of its child type appears in the masks, the parent type's 554 // paths will be merged into the child's. 
555 // 556 // For example, assume that we have two schema types A and B, and we have 557 // - A is the parent type of B 558 // - Paths of A: {P1, P2} 559 // - Paths of B: {P3} 560 // 561 // Then, we will have the following in the result. 562 // - Expanded paths of A: {P1, P2} 563 // - Expanded paths of B: {P1, P2, P3} 564 std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks( 565 const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks) 566 const; 567 568 private: 569 // Factory function to create a SchemaStore and set its schema. The created 570 // instance does not take ownership of any input components and all pointers 571 // must refer to valid objects that outlive the created SchemaStore instance. 572 // The base_dir must already exist. No schema must have set in base_dir prior 573 // to this. 574 // 575 // Returns: 576 // A SchemaStore on success 577 // FAILED_PRECONDITION on any null pointer input or if there has already 578 // been a schema set for this path. 579 // INTERNAL_ERROR on any IO errors 580 static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( 581 const Filesystem* filesystem, const std::string& base_dir, 582 const Clock* clock, const FeatureFlags* feature_flags, SchemaProto schema, 583 bool enable_schema_database); 584 585 // Use SchemaStore::Create instead. 586 explicit SchemaStore(const Filesystem* filesystem, std::string base_dir, 587 const Clock* clock, const FeatureFlags* feature_flags, 588 bool enable_schema_database); 589 590 // Deletes the overlay schema and ensures that the Header is correctly set. 591 // 592 // RETURNS: 593 // OK on success 594 // INTERNAL_ERROR on any IO errors 595 static libtextclassifier3::Status DiscardOverlaySchema( 596 const Filesystem* filesystem, const std::string& base_dir, 597 Header& header); 598 599 // Handles the overlay schema after a version change by deleting it if it is 600 // no longer compatible with the new version. 
601 // 602 // Requires: base_dir exists. 603 // 604 // Returns: 605 // OK on success 606 // INTERNAL_ERROR on any IO errors 607 static libtextclassifier3::Status HandleOverlaySchemaForVersionChange( 608 const Filesystem* filesystem, const std::string& base_dir, 609 version_util::StateChange version_state_change, int32_t new_version); 610 611 // Populates the schema database field in the schema proto that is stored in 612 // the input schema file. 613 // 614 // Returns: 615 // OK on success or nothing to migrate 616 // INTERNAL_ERROR on IO error 617 static libtextclassifier3::Status PopulateSchemaDatabaseFieldForSchemaFile( 618 const Filesystem* filesystem, const std::string& schema_filename); 619 620 // Verifies that there is no error retrieving a previously set schema. Then 621 // initializes like normal. 622 // 623 // Returns: 624 // OK on success 625 // INTERNAL_ERROR on IO error 626 libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats); 627 628 // First, blindly writes new_schema to the schema_file. Then initializes like 629 // normal. 630 // 631 // Returns: 632 // OK on success 633 // INTERNAL_ERROR on IO error 634 // FAILED_PRECONDITION if there is already a schema set for the schema_file. 635 libtextclassifier3::Status Initialize(SchemaProto new_schema); 636 637 // Handles initializing the SchemaStore and regenerating any data if needed. 638 // 639 // Returns: 640 // OK on success 641 // INTERNAL_ERROR on IO error 642 libtextclassifier3::Status InitializeInternal( 643 bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats); 644 645 // Creates sub-components and verifies the integrity of each sub-component. 646 // 647 // Returns: 648 // OK on success 649 // INTERNAL_ERROR on IO error 650 libtextclassifier3::Status InitializeDerivedFiles(); 651 652 // Populates any derived data structures off of the schema. 
653 // 654 // Returns: 655 // OK on success 656 // NOT_FOUND_ERROR if a schema proto has not been set 657 // INTERNAL_ERROR on any IO errors 658 libtextclassifier3::Status RegenerateDerivedFiles( 659 bool create_overlay_if_necessary); 660 661 // Build type_config_map_, schema_subtype_id_map_, and schema_type_manager_. 662 // 663 // Returns: 664 // OK on success 665 // NOT_FOUND_ERROR if a schema proto has not been set 666 // INTERNAL_ERROR on any IO errors 667 libtextclassifier3::Status BuildInMemoryCache(); 668 669 // Update and replace the header file. Creates the header file if it doesn't 670 // exist. 671 // 672 // Returns: 673 // OK on success 674 // INTERNAL on I/O error 675 libtextclassifier3::Status UpdateHeader(const Crc32& checksum); 676 677 // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying 678 // file, and re-creates a new instance of the schema_type_mapper_. Does not 679 // populate the schema_type_mapper_. 680 // 681 // Returns any IO errors. 682 libtextclassifier3::Status ResetSchemaTypeMapper(); 683 684 // Creates a new schema store with new_schema and then swaps that new schema 685 // store with the existing one. This function guarantees that either: this 686 // instance will be fully updated to the new schema or no changes will take 687 // effect. 688 // 689 // Returns: 690 // OK on success 691 // INTERNAL on I/O error. 692 libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema); 693 CheckSchemaSet()694 libtextclassifier3::Status CheckSchemaSet() const { 695 return has_schema_successfully_set_ 696 ? libtextclassifier3::Status::OK 697 : absl_ports::FailedPreconditionError("Schema not set yet."); 698 } 699 700 // Correctly loads the Header, schema_file_ and (if present) the 701 // overlay_schema_file_. 702 // RETURNS: 703 // - OK on success 704 // - INTERNAL if an IO error is encountered when reading the Header or 705 // schemas. 706 // Or an invalid schema configuration is present. 
libtextclassifier3::Status LoadSchema();

  // Sets the schema for a database for the first time.
  //
  // Note that when schema database is disabled, this function sets the entire
  // schema, with all types under the default empty database.
  //
  // NOTE(review): ignore_errors_and_delete_documents and
  // allow_circular_schema_definitions are not described in this part of the
  // header; presumably they are forwarded unchanged from the SetSchema call
  // -- confirm against the implementation.
  //
  // Requires:
  //   - All types in new_schema are from the same database.
  //   - new_schema does not contain type names that are already in use by a
  //     different database.
  //
  // Returns:
  //   - SetSchemaResult that indicates if the new schema can be set.
  //   - INTERNAL_ERROR on any IO errors.
  //   - INVALID_ARGUMENT_ERROR if the schema is invalid.
  libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
  SetInitialSchemaForDatabase(SchemaProto new_schema,
                              bool ignore_errors_and_delete_documents,
                              bool allow_circular_schema_definitions);

  // Sets the schema for a database, overriding any existing schema for that
  // database.
  //
  // Note that when schema database is disabled, this function sets and
  // overrides the entire schema.
  //
  // `old_schema` is the schema that `new_schema` is compared against to
  // produce the differences reported in the returned SetSchemaResult
  // (presumably the currently stored schema -- TODO confirm).
  //
  // Requires:
  //   - All types in new_schema are from the same database.
  //   - new_schema does not contain type names that are already in use by a
  //     different database.
  //
  // Returns:
  //   - SetSchemaResult that encapsulates the differences between the old and
  //     new schema, as well as if the new schema can be set.
  //   - INTERNAL_ERROR on any IO errors.
  //   - INVALID_ARGUMENT_ERROR if the schema is invalid.
  libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
  SetSchemaWithDatabaseOverride(SchemaProto new_schema,
                                const SchemaProto& old_schema,
                                bool ignore_errors_and_delete_documents,
                                bool allow_circular_schema_definitions);

  // Initial validation on the SchemaProto for SetSchema. This is intended as a
  // preliminary check before any expensive operations are performed during
  // `SetSchema::Validate`. Returns the schema's database if it's valid.
  // NOTE(review): `SetSchema::Validate` may be meant to read
  // `SchemaUtil::Validate` (referenced below) -- confirm.
  //
  // Note that when schema database is disabled, any schema input is valid and
  // an empty string is returned as the database.
  //
  // Checks that:
  // - The new schema only contains types from a single database.
  // - The schema's type names are not already in use in other databases. This
  //   is done outside of `SchemaUtil::Validate` because we need to know all
  //   existing type names, which are stored in the SchemaStore and not known
  //   to SchemaUtil.
  //
  // Returns:
  //   - new_schema's database on success
  //   - INVALID_ARGUMENT_ERROR if new_schema contains types from multiple
  //     databases
  //   - ALREADY_EXISTS_ERROR if new_schema's type names are not unique (i.e.
  //     a type name is already in use in another database)
  libtextclassifier3::StatusOr<std::string> ValidateAndGetDatabase(
      const SchemaProto& new_schema) const;

  // Returns a SchemaProto representing the full schema, which is a combination
  // of the existing schema and the input database schema.
  //
  // For the database being updated by the input database schema:
  //   - If the existing schema does not contain the database, the input types
  //     are appended to the end of the SchemaProto, without changing the order
  //     of the existing schema types.
  //   - Otherwise, the existing schema types are replaced with types from the
  //     input database schema in their original position in the existing
  //     SchemaProto.
  //     - Types from input_database_schema are added in the order in which
  //       they appear.
  //     - If more types are added to the database, the additional types are
  //       appended at the end of the SchemaProto, without changing the order
  //       of existing types from unaffected databases.
  //
  // Requires:
  //   - input_database_schema must not contain types from multiple databases.
  //
  // Returns:
  //   - SchemaProto on success
  //   - INTERNAL_ERROR on any IO errors, or if the schema store was not
  //     previously initialized properly.
  //   - INVALID_ARGUMENT_ERROR if the input schema contains types from multiple
  //     databases.
  libtextclassifier3::StatusOr<SchemaProto> GetFullSchemaProtoWithUpdatedDb(
      SchemaProto input_database_schema) const;

  const Filesystem* filesystem_;  // Presumably not owned -- TODO confirm.
  // Presumably the directory holding this store's files -- TODO confirm.
  std::string base_dir_;
  const Clock* clock_;  // Presumably not owned -- TODO confirm.
  const FeatureFlags* feature_flags_;  // Does not own.

  // Used internally to indicate whether the class has been successfully
  // initialized with a valid schema. Will be false if Initialize failed or no
  // schema has ever been set.
  bool has_schema_successfully_set_ = false;

  // Cached schema, held in a FileBackedProto.
  std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_;

  // This schema holds the definition of any schema types that are not
  // compatible with older versions of Icing code.
  std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_;

  // Maps schema types to a densely-assigned unique id.
  std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;

  // Maps schema type ids to the corresponding schema type. This is an inverse
  // map of schema_type_mapper_.
  std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_;

  // A hash map of (database -> vector of type config names in the database).
  //
  // We use a vector instead of a set because we need to preserve the order of
  // the types (i.e. the order in which they appear in the input SchemaProto
  // during SetSchema), so that we can return the correct SchemaProto for
  // GetSchema.
  //
  // This keeps track of the type configs defined in each database, which allows
  // schema operations to be performed on a per-database basis.
  std::unordered_map<std::string, std::vector<std::string>> database_type_map_;

  // A hash map of (type config name -> type config), allows faster lookup of
  // type config in schema. The O(1) type config access makes schema-related and
  // section-related operations faster.
  SchemaUtil::TypeConfigMap type_config_map_;

  // Maps from each type id to all of its subtype ids.
  // T2 is a subtype of T1, if and only if one of the following conditions is
  // met:
  // - T2 is T1
  // - T2 extends T1
  // - There exists a type U, such that T2 is a subtype of U, and U is a subtype
  //   of T1
  std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>>
      schema_subtype_id_map_;

  // Manager of section (indexable property) and joinable property related
  // metadata for all Schemas.
  std::unique_ptr<const SchemaTypeManager> schema_type_manager_;

  // Used to cache and manage the schema's scorable properties.
  std::unique_ptr<ScorablePropertyManager> scorable_property_manager_;

  // Header for this schema store; see the nested Header class, whose
  // serialized form carries the magic, checksum and overlay-related fields.
  std::unique_ptr<Header> header_;

  // Whether to use the database field for the schema.
  //
  // This is a temporary flag to control the rollout of the schema database. It
  // affects the `SetSchema` and `GetSchema(std::string database)` methods.
  // TODO - b/337913932: Remove this flag once the schema database is fully
  // rolled out.
  bool enable_schema_database_ = false;
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_SCHEMA_SCHEMA_STORE_H_