xref: /aosp_15_r20/external/icing/icing/schema/schema-store.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_SCHEMA_SCHEMA_STORE_H_
16 #define ICING_SCHEMA_SCHEMA_STORE_H_
17 
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21 #include <memory>
22 #include <optional>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <utility>
28 #include <vector>
29 
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/feature-flags.h"
34 #include "icing/file/file-backed-proto.h"
35 #include "icing/file/filesystem.h"
36 #include "icing/file/version-util.h"
37 #include "icing/proto/debug.pb.h"
38 #include "icing/proto/document.pb.h"
39 #include "icing/proto/logging.pb.h"
40 #include "icing/proto/schema.pb.h"
41 #include "icing/proto/search.pb.h"
42 #include "icing/proto/storage.pb.h"
43 #include "icing/schema/joinable-property.h"
44 #include "icing/schema/schema-type-manager.h"
45 #include "icing/schema/schema-util.h"
46 #include "icing/schema/scorable_property_manager.h"
47 #include "icing/schema/section.h"
48 #include "icing/store/document-filter-data.h"
49 #include "icing/store/key-mapper.h"
50 #include "icing/util/clock.h"
51 #include "icing/util/crc32.h"
52 #include "icing/util/status-macros.h"
53 
54 namespace icing {
55 namespace lib {
56 
57 // Holds the ground truth schema proto. Tracks compatible changes to the schema
58 // and will update any derived data based on the schema proto, such as Sections,
59 // SchemaTypeConfigs, PropertyConfigs, and SchemaTypeIds. To ensure they have
60 // the most up-to-date data, callers should not save instances themselves and
61 // should always call Get* from the SchemaStore.
62 class SchemaStore {
63  public:
  // Header layout consisting only of a magic value and a checksum.
  // NOTE(review): presumably the on-disk header format that predates
  // Header::SerializedHeader -- confirm against the implementation file. The
  // member order defines the struct layout, so do not reorder the fields.
  struct LegacyHeader {
    // Holds the magic as a quick sanity check against file corruption.
    int32_t magic;

    // Checksum of the SchemaStore's sub-components' checksums.
    uint32_t checksum;
  };
71 
72   class Header {
73    public:
74     static constexpr int32_t kMagic = 0x72650d0a;
75 
Header(const Filesystem * filesystem,std::string path)76     explicit Header(const Filesystem* filesystem, std::string path)
77         : path_(std::move(path)), filesystem_(filesystem) {}
78 
Header(Header && other)79     Header(Header&& other)
80         : serialized_header_(std::move(other.serialized_header_)),
81           path_(std::move(other.path_)),
82           header_fd_(std::move(other.header_fd_)),
83           filesystem_(other.filesystem_),
84           dirty_(other.dirty_) {}
85 
86     Header& operator=(Header&& other) {
87       serialized_header_ = std::move(other.serialized_header_);
88       path_ = std::move(other.path_);
89       header_fd_ = std::move(other.header_fd_);
90       filesystem_ = other.filesystem_;
91       dirty_ = other.dirty_;
92       return *this;
93     }
94 
95     struct SerializedHeader {
SerializedHeaderSerializedHeader96       explicit SerializedHeader()
97           : magic(kMagic),
98             checksum(0),
99             overlay_created(false),
100             min_overlay_version_compatibility(
101                 std::numeric_limits<int32_t>::max()) {
102         memset(overlay_created_padding, 0, kOverlayCreatedPaddingSize);
103         memset(padding, 0, kPaddingSize);
104       }
105       // Holds the magic as a quick sanity check against file corruption.
106       int32_t magic;
107 
108       // Checksum of the SchemaStore's sub-component's checksums.
109       uint32_t checksum;
110 
111       bool overlay_created;
112       // Three bytes of padding due to the fact that
113       // min_overlay_version_compatibility_ has an alignof() == 4 and the offset
114       // of overlay_created_padding_ == 9.
115       static constexpr int kOverlayCreatedPaddingSize = 3;
116       uint8_t overlay_created_padding[kOverlayCreatedPaddingSize];
117 
118       int32_t min_overlay_version_compatibility;
119 
120       static constexpr int kPaddingSize = 1008;
121       // Padding exists just to reserve space for additional values.
122       uint8_t padding[kPaddingSize];
123     };
124     static_assert(sizeof(SerializedHeader) == 1024);
125 
126     // RETURNS:
127     //   - On success, a valid Header instance
128     //   - NOT_FOUND if header file doesn't exist
129     //   - INTERNAL if unable to read header
130     static libtextclassifier3::StatusOr<Header> Read(
131         const Filesystem* filesystem, std::string path);
132 
133     libtextclassifier3::Status Write();
134 
135     libtextclassifier3::Status PersistToDisk();
136 
magic()137     int32_t magic() const { return serialized_header_.magic; }
138 
checksum()139     uint32_t checksum() const { return serialized_header_.checksum; }
set_checksum(uint32_t checksum)140     void set_checksum(uint32_t checksum) {
141       dirty_ = true;
142       serialized_header_.checksum = checksum;
143     }
144 
overlay_created()145     bool overlay_created() const { return serialized_header_.overlay_created; }
146 
min_overlay_version_compatibility()147     int32_t min_overlay_version_compatibility() const {
148       return serialized_header_.min_overlay_version_compatibility;
149     }
150 
SetOverlayInfo(bool overlay_created,int32_t min_overlay_version_compatibility)151     void SetOverlayInfo(bool overlay_created,
152                         int32_t min_overlay_version_compatibility) {
153       dirty_ = true;
154       serialized_header_.overlay_created = overlay_created;
155       serialized_header_.min_overlay_version_compatibility =
156           min_overlay_version_compatibility;
157     }
158 
159    private:
Header(SerializedHeader serialized_header,std::string path,ScopedFd header_fd,const Filesystem * filesystem)160     explicit Header(SerializedHeader serialized_header, std::string path,
161                     ScopedFd header_fd, const Filesystem* filesystem)
162         : serialized_header_(std::move(serialized_header)),
163           path_(std::move(path)),
164           header_fd_(std::move(header_fd)),
165           filesystem_(filesystem),
166           dirty_(false) {}
167 
168     SerializedHeader serialized_header_;
169     std::string path_;
170     ScopedFd header_fd_;
171     const Filesystem* filesystem_;  // Not owned.
172     bool dirty_;
173   };
174 
175   // Holds information on what may have been affected by the new schema. This is
176   // generally data that other classes may depend on from the SchemaStore,
177   // so that we can know if we should go update those classes as well.
  struct SetSchemaResult {
    // Whether we are able to write the schema as determined by SetSchema's
    // arguments. This boolean reflects SetSchema's logic, and does not reflect
    // any system level IO errors that may prevent the schema from being written
    // to file.
    bool success = false;

    // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
    //   1. Schema types are added in the middle of the SchemaProto
    //   2. Schema types are removed from the middle of the SchemaProto
    //   3. Schema types are reordered in the SchemaProto
    //
    // SchemaTypeIds are not changed if schema types are added/removed to the
    // end of the SchemaProto.
    //
    // NOTE(review): going by the field name, the ids stored here are
    // presumably the ones from the *old* schema -- confirm.
    std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;

    // Schema types that have been removed from the new schema. Represented by
    // the `schema_type` field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_deleted_by_name;

    // Schema types that have been removed from the new schema. Represented by
    // the SchemaTypeId assigned to this SchemaTypeConfigProto in the *old*
    // schema.
    std::unordered_set<SchemaTypeId> schema_types_deleted_by_id;

    // Schema types whose SchemaTypeConfigProto has changed in an incompatible
    // manner in the new schema. Compatibility determined in
    // SchemaUtil::ComputeCompatibilityDelta. Represented by the `schema_type`
    // field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_incompatible_by_name;

    // Schema types whose SchemaTypeConfigProto has changed in an incompatible
    // manner in the new schema. Compatibility determined in
    // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
    // assigned to this SchemaTypeConfigProto in the *old* schema.
    std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;

    // Schema types that were added in the new schema. Represented by the
    // `schema_type` field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_new_by_name;

    // Schema types that were changed in a way that was backwards compatible and
    // didn't invalidate the index. Represented by the `schema_type` field in
    // the SchemaTypeConfigProto.
    std::unordered_set<std::string>
        schema_types_changed_fully_compatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but invalidated the index. Represented by the `schema_type` field in the
    // SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_index_incompatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but invalidated the joinable cache. Represented by the `schema_type`
    // field in the SchemaTypeConfigProto.
    std::unordered_set<std::string> schema_types_join_incompatible_by_name;

    // Schema types that were changed in a way that was backwards compatible,
    // but inconsistent with the old schema so that the scorable property cache
    // needs to be re-generated.
    //
    // NOTE(review): unlike the other *_by_id sets, nothing here states whether
    // these SchemaTypeIds come from the old or the new schema -- confirm
    // against the SetSchema implementation.
    std::unordered_set<SchemaTypeId>
        schema_types_scorable_property_inconsistent_by_id;

    // Schema types that were changed in a way that was backwards compatible,
    // but inconsistent with the old schema so that the scorable property cache
    // needs to be re-generated.
    //
    // NOTE(review): presumably the same types as the *_by_id set above, keyed
    // by the `schema_type` name instead -- confirm.
    std::unordered_set<std::string>
        schema_types_scorable_property_inconsistent_by_name;
  };
247 
  // A schema type together with a set of property paths. This is the element
  // type of the vector returned by ExpandTypePropertyMasks().
  struct ExpandedTypePropertyMask {
    // Name of the schema type this entry refers to.
    std::string schema_type;
    // Property paths associated with schema_type.
    // NOTE(review): per the ExpandTypePropertyMasks() documentation these
    // presumably already include paths merged in from parent types -- confirm.
    std::unordered_set<std::string> paths;
  };
252 
253   static constexpr std::string_view kSchemaTypeWildcard = "*";
254 
255   // Factory function to create a SchemaStore which does not take ownership
256   // of any input components, and all pointers must refer to valid objects that
257   // outlive the created SchemaStore instance. The base_dir must already exist.
258   // There does not need to be an existing schema already.
259   //
260   // If initialize_stats is present, the fields related to SchemaStore will be
261   // populated.
262   //
263   // Returns:
264   //   A SchemaStore on success
265   //   FAILED_PRECONDITION on any null pointer input
266   //   INTERNAL_ERROR on any IO errors
267   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
268       const Filesystem* filesystem, const std::string& base_dir,
269       const Clock* clock, const FeatureFlags* feature_flags,
270       bool enable_schema_database = false,
271       InitializeStatsProto* initialize_stats = nullptr);
272 
  // Migrates schema files (backup vs. new schema) according to version state
  // change. Also performs schema database migration and populates the database
  // fields in the persisted schema file if necessary.
276   //
277   // Returns:
278   //   OK on success or nothing to migrate
279   static libtextclassifier3::Status MigrateSchema(
280       const Filesystem* filesystem, const std::string& base_dir,
281       version_util::StateChange version_state_change, int32_t new_version,
282       bool perform_schema_database_migration);
283 
284   // Discards all derived data in the schema store.
285   //
286   // Returns:
287   //   OK on success or nothing to discard
288   //   INTERNAL_ERROR on any I/O errors
289   static libtextclassifier3::Status DiscardDerivedFiles(
290       const Filesystem* filesystem, const std::string& base_dir);
291 
292   SchemaStore(SchemaStore&&) = default;
293   SchemaStore& operator=(SchemaStore&&) = default;
294 
295   SchemaStore(const SchemaStore&) = delete;
296   SchemaStore& operator=(const SchemaStore&) = delete;
297 
298   // Persists and updates checksum of subcomponents.
299   ~SchemaStore();
300 
301   // Retrieve the current schema if it exists.
302   //
303   // Returns:
304   //   - SchemaProto* if exists
305   //   - INTERNAL_ERROR on any IO errors
306   //   - NOT_FOUND_ERROR if a schema hasn't been set before
307   libtextclassifier3::StatusOr<const SchemaProto*> GetSchema() const;
308 
309   // Retrieve the current schema for a given database if it exists.
310   //
311   // This is an expensive operation. Use GetSchema() when retrieving the entire
312   // schema, or if there is only a single database in the schema store.
313   //
314   // Returns:
315   //   - SchemaProto* containing only schema types from the database, if exists
316   //   - INTERNAL_ERROR on any IO errors
317   //   - NOT_FOUND_ERROR if the database doesn't exist in the schema, or if a
318   //     schema hasn't been set before
319   libtextclassifier3::StatusOr<SchemaProto> GetSchema(
320       const std::string& database) const;
321 
322   // Update our current schema if it's compatible. Does not accept incompatible
323   // schema or schema with types from multiple databases. Compatibility rules
324   // defined by SchemaUtil::ComputeCompatibilityDelta.
325   //
326   // The schema types in the new schema proto must all be from a single
327   // database. Does not support setting schema types across multiple databases
328   // at once.
329   //
330   // If ignore_errors_and_delete_documents is set to true, then incompatible
331   // schema are allowed and we'll force set the schema, meaning
332   // SetSchemaResult.success will always be true.
333   //
334   // Returns:
335   //   - SetSchemaResult that encapsulates the differences between the old and
336   //     new schema, as well as if the new schema can be set.
337   //   - INTERNAL_ERROR on any IO errors
338   //   - ALREADY_EXISTS_ERROR if type names in the new schema are already in use
339   //     by a different database.
340   //   - INVALID_ARGUMENT_ERROR if the schema is invalid, or if the schema types
341   //     are from multiple databases (once schema database is enabled).
342   libtextclassifier3::StatusOr<SetSchemaResult> SetSchema(
343       const SchemaProto& new_schema, bool ignore_errors_and_delete_documents,
344       bool allow_circular_schema_definitions);
345   libtextclassifier3::StatusOr<SetSchemaResult> SetSchema(
346       SchemaProto&& new_schema, bool ignore_errors_and_delete_documents,
347       bool allow_circular_schema_definitions);
348 
349   // Get the SchemaTypeConfigProto of schema_type name.
350   //
351   // Returns:
352   //   SchemaTypeConfigProto on success
353   //   FAILED_PRECONDITION if schema hasn't been set yet
354   //   NOT_FOUND if schema type name doesn't exist
355   //   INTERNAL on any I/O errors
356   libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
357   GetSchemaTypeConfig(std::string_view schema_type) const;
358 
  // Gets a map from each schema_type name to its blob property paths.
  //
  // Returns:
  //   A map from each schema_type name to its blob property paths on success
  //   FAILED_PRECONDITION if schema hasn't been set yet
  //   INTERNAL on any I/O errors
365   libtextclassifier3::StatusOr<
366       std::unordered_map<std::string, std::vector<std::string>>>
367   ConstructBlobPropertyMap() const;
368 
369   // Returns the schema type of the passed in SchemaTypeId
370   //
371   // Returns:
372   //   schema type on success
373   //   FAILED_PRECONDITION if schema hasn't been set yet
374   //   INVALID_ARGUMENT if schema type id is invalid
375   libtextclassifier3::StatusOr<const std::string*> GetSchemaType(
376       SchemaTypeId schema_type_id) const;
377 
378   // Returns the SchemaTypeId of the passed in schema type
379   //
380   // Returns:
381   //   SchemaTypeId on success
382   //   FAILED_PRECONDITION if schema hasn't been set yet
383   //   NOT_FOUND_ERROR if we don't know about the schema type
384   //   INTERNAL_ERROR on IO error
385   libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId(
386       std::string_view schema_type) const;
387 
388   // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also
389   // include child types.
390   //
391   // Returns:
392   //   A set of SchemaTypeId on success
393   //   FAILED_PRECONDITION if schema hasn't been set yet
394   //   NOT_FOUND_ERROR if we don't know about the schema type
395   //   INTERNAL_ERROR on IO error
396   libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
397   GetSchemaTypeIdsWithChildren(std::string_view schema_type) const;
398 
399   // Returns the SectionMetadata associated with the SectionId that's in the
400   // SchemaTypeId.
401   //
402   // Returns:
403   //   Valid pointer to SectionMetadata on success
404   //   FAILED_PRECONDITION if schema hasn't been set yet
405   //   INVALID_ARGUMENT if schema type id or section id is invalid
406   libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
407       SchemaTypeId schema_type_id, SectionId section_id) const;
408 
409   // Returns true if a property is defined in the said schema, regardless of
410   // whether it is indexed or not.
411   bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,
412                                  const std::string& property) const;
413 
  // Extracts all sections of different types from the given document and groups
  // them by type.
416   // - Each Section vector is sorted by section Id in ascending order. The
417   //   sorted section ids may not be continuous, since not all sections are
418   //   present in the document.
419   // - Sections with empty content won't be returned.
420   // - For example, we may extract:
421   //   string_sections: [2, 7, 10]
422   //   integer_sections: [3, 5, 8]
423   //
424   // Returns:
425   //   A SectionGroup instance on success
426   //   FAILED_PRECONDITION if schema hasn't been set yet
427   //   NOT_FOUND if type config name of document not found
428   libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
429       const DocumentProto& document) const;
430 
431   // Returns the JoinablePropertyMetadata associated with property_path that's
432   // in the SchemaTypeId.
433   //
434   // Returns:
435   //   Valid pointer to JoinablePropertyMetadata on success
436   //   nullptr if property_path doesn't exist (or is not joinable) in the
437   //     joinable metadata list of the schema
438   //   FAILED_PRECONDITION if schema hasn't been set yet
439   //   INVALID_ARGUMENT if schema type id is invalid
440   libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
441   GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
442                               const std::string& property_path) const;
443 
444   // Returns the JoinablePropertyMetadata associated with joinable_property_id
445   // that's in the SchemaTypeId.
446   //
447   // Returns:
448   //   Valid pointer to JoinablePropertyMetadata on success
449   //   FAILED_PRECONDITION if schema hasn't been set yet
450   //   INVALID_ARGUMENT if schema type id or joinable property id is invalid
451   libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
452   GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
453                               JoinablePropertyId joinable_property_id) const;
454 
  // Extracts all joinable property contents of different types from the given
  // document and groups them by joinable value type.
457   // - Joinable properties are sorted by joinable property id in ascending
458   //   order. The sorted joinable property ids may not be continuous, since not
459   //   all joinable properties are present in the document.
460   // - Joinable property ids start from 0.
461   // - Joinable properties with empty content won't be returned.
462   //
463   // Returns:
464   //   A JoinablePropertyGroup instance on success
465   //   FAILED_PRECONDITION if schema hasn't been set yet
466   //   NOT_FOUND if the type config name of document not found
467   libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties(
468       const DocumentProto& document) const;
469 
470   // Returns the quantization type for the given schema_type_id and section_id.
471   //
472   // Returns:
473   //   - The quantization type on success.
474   //   - INVALID_ARGUMENT_ERROR if schema_type_id or section_id is invalid.
475   //   - Any error from schema store.
476   libtextclassifier3::StatusOr<EmbeddingIndexingConfig::QuantizationType::Code>
GetQuantizationType(SchemaTypeId schema_type_id,SectionId section_id)477   GetQuantizationType(SchemaTypeId schema_type_id, SectionId section_id) const {
478     ICING_ASSIGN_OR_RETURN(const SectionMetadata* section_metadata,
479                            GetSectionMetadata(schema_type_id, section_id));
480     return section_metadata->quantization_type;
481   }
482 
483   // Syncs all the data changes to disk.
484   //
485   // Returns:
486   //   OK on success
487   //   INTERNAL on I/O errors.
488   libtextclassifier3::Status PersistToDisk();
489 
490   // Recomputes the combined checksum of components of the schema store and
491   // updates the header.
492   //
493   // Returns:
494   //   - the checksum on success
495   //   - INTERNAL on I/O errors.
496   libtextclassifier3::StatusOr<Crc32> UpdateChecksum();
497 
498   // Recomputes the combined checksum of components of the schema store. Does
499   // NOT update the header.
500   //
501   // Returns:
502   //   - the checksum on success
503   //   - INTERNAL on I/O errors.
504   libtextclassifier3::StatusOr<Crc32> GetChecksum() const;
505 
506   // Returns:
507   //   - On success, the section metadata list for the specified schema type
508   //   - NOT_FOUND if the schema type is not present in the schema
509   libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
510   GetSectionMetadata(const std::string& schema_type) const;
511 
512   // Gets the index of the given |property_path|, where the index N means that
513   // it is the Nth scorable property path in the schema config of the given
514   // |schema_type_id|, in lexicographical order.
515   //
516   // Returns:
517   //   - Index on success
518   //   - std::nullopt if the |property_path| doesn't point to a scorable
519   //     property under the |schema_type_id|
520   //   - FAILED_PRECONDITION if the schema hasn't been set yet
521   //   - INVALID_ARGUMENT if |schema_type_id| is invalid
522   libtextclassifier3::StatusOr<std::optional<int>> GetScorablePropertyIndex(
523       SchemaTypeId schema_type_id, std::string_view property_path) const;
524 
525   // Returns the list of ScorablePropertyInfo for the given |schema_type_id|,
526   // in lexicographical order of its property path.
527   //
528   // Returns:
529   //   - Vector of scorable property info on success. The vector can be empty
530   //     if no scorable property is found under the schema config of
531   //     |schema_type_id|.
532   //   - FAILED_PRECONDITION if the schema hasn't been set yet
533   //   - INVALID_ARGUMENT if |schema_type_id| is invalid
534   libtextclassifier3::StatusOr<
535       const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*>
536   GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const;
537 
538   // Calculates the StorageInfo for the Schema Store.
539   //
540   // If an IO error occurs while trying to calculate the value for a field, then
541   // that field will be set to -1.
542   SchemaStoreStorageInfoProto GetStorageInfo() const;
543 
544   // Get debug information for the schema store.
545   //
546   // Returns:
547   //   SchemaDebugInfoProto on success
548   //   INTERNAL_ERROR on IO errors, crc compute error
549   libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const;
550 
551   // Expands the provided type_property_masks into a vector of
552   // ExpandedTypePropertyMasks to account for polymorphism. If both a parent
553   // type and one of its child type appears in the masks, the parent type's
554   // paths will be merged into the child's.
555   //
556   // For example, assume that we have two schema types A and B, and we have
557   // - A is the parent type of B
558   // - Paths of A: {P1, P2}
559   // - Paths of B: {P3}
560   //
561   // Then, we will have the following in the result.
562   // - Expanded paths of A: {P1, P2}
563   // - Expanded paths of B: {P1, P2, P3}
564   std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks(
565       const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
566       const;
567 
568  private:
569   // Factory function to create a SchemaStore and set its schema. The created
570   // instance does not take ownership of any input components and all pointers
571   // must refer to valid objects that outlive the created SchemaStore instance.
  // The base_dir must already exist. No schema may have been set in base_dir
  // prior to this.
574   //
575   // Returns:
576   //   A SchemaStore on success
577   //   FAILED_PRECONDITION on any null pointer input or if there has already
578   //       been a schema set for this path.
579   //   INTERNAL_ERROR on any IO errors
580   static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
581       const Filesystem* filesystem, const std::string& base_dir,
582       const Clock* clock, const FeatureFlags* feature_flags, SchemaProto schema,
583       bool enable_schema_database);
584 
585   // Use SchemaStore::Create instead.
586   explicit SchemaStore(const Filesystem* filesystem, std::string base_dir,
587                        const Clock* clock, const FeatureFlags* feature_flags,
588                        bool enable_schema_database);
589 
590   // Deletes the overlay schema and ensures that the Header is correctly set.
591   //
592   // RETURNS:
593   //   OK on success
594   //   INTERNAL_ERROR on any IO errors
595   static libtextclassifier3::Status DiscardOverlaySchema(
596       const Filesystem* filesystem, const std::string& base_dir,
597       Header& header);
598 
599   // Handles the overlay schema after a version change by deleting it if it is
600   // no longer compatible with the new version.
601   //
602   // Requires: base_dir exists.
603   //
604   // Returns:
605   //   OK on success
606   //   INTERNAL_ERROR on any IO errors
607   static libtextclassifier3::Status HandleOverlaySchemaForVersionChange(
608       const Filesystem* filesystem, const std::string& base_dir,
609       version_util::StateChange version_state_change, int32_t new_version);
610 
611   // Populates the schema database field in the schema proto that is stored in
612   // the input schema file.
613   //
614   // Returns:
615   //   OK on success or nothing to migrate
616   //   INTERNAL_ERROR on IO error
617   static libtextclassifier3::Status PopulateSchemaDatabaseFieldForSchemaFile(
618       const Filesystem* filesystem, const std::string& schema_filename);
619 
620   // Verifies that there is no error retrieving a previously set schema. Then
621   // initializes like normal.
622   //
623   // Returns:
624   //   OK on success
625   //   INTERNAL_ERROR on IO error
626   libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats);
627 
628   // First, blindly writes new_schema to the schema_file. Then initializes like
629   // normal.
630   //
631   // Returns:
632   //   OK on success
633   //   INTERNAL_ERROR on IO error
634   //   FAILED_PRECONDITION if there is already a schema set for the schema_file.
635   libtextclassifier3::Status Initialize(SchemaProto new_schema);
636 
637   // Handles initializing the SchemaStore and regenerating any data if needed.
638   //
639   // Returns:
640   //   OK on success
641   //   INTERNAL_ERROR on IO error
642   libtextclassifier3::Status InitializeInternal(
643       bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats);
644 
645   // Creates sub-components and verifies the integrity of each sub-component.
646   //
647   // Returns:
648   //   OK on success
649   //   INTERNAL_ERROR on IO error
650   libtextclassifier3::Status InitializeDerivedFiles();
651 
652   // Populates any derived data structures off of the schema.
653   //
654   // Returns:
655   //   OK on success
656   //   NOT_FOUND_ERROR if a schema proto has not been set
657   //   INTERNAL_ERROR on any IO errors
658   libtextclassifier3::Status RegenerateDerivedFiles(
659       bool create_overlay_if_necessary);
660 
661   // Build type_config_map_, schema_subtype_id_map_, and schema_type_manager_.
662   //
663   // Returns:
664   //   OK on success
665   //   NOT_FOUND_ERROR if a schema proto has not been set
666   //   INTERNAL_ERROR on any IO errors
667   libtextclassifier3::Status BuildInMemoryCache();
668 
669   // Update and replace the header file. Creates the header file if it doesn't
670   // exist.
671   //
672   // Returns:
673   //   OK on success
674   //   INTERNAL on I/O error
675   libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
676 
677   // Resets the unique_ptr to the schema_type_mapper_, deletes the underlying
678   // file, and re-creates a new instance of the schema_type_mapper_. Does not
679   // populate the schema_type_mapper_.
680   //
681   // Returns any IO errors.
682   libtextclassifier3::Status ResetSchemaTypeMapper();
683 
684   // Creates a new schema store with new_schema and then swaps that new schema
685   // store with the existing one. This function guarantees that either: this
686   // instance will be fully updated to the new schema or no changes will take
687   // effect.
688   //
689   // Returns:
690   //   OK on success
691   //   INTERNAL on I/O error.
692   libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema);
693 
CheckSchemaSet()694   libtextclassifier3::Status CheckSchemaSet() const {
695     return has_schema_successfully_set_
696                ? libtextclassifier3::Status::OK
697                : absl_ports::FailedPreconditionError("Schema not set yet.");
698   }
699 
  // Loads the Header, schema_file_ and (if present) the overlay_schema_file_.
  //
  // Returns:
  //   - OK on success
  //   - INTERNAL if an IO error is encountered when reading the Header or
  //     schemas, or if an invalid schema configuration is present.
  //
707   libtextclassifier3::Status LoadSchema();
708 
709   // Sets the schema for a database for the first time.
710   //
711   // Note that when schema database is disabled, this function sets the entire
712   // schema, with all types placed under the default (empty) database.
713   //
714   // Requires:
715   //   - All types in new_schema are from the same database.
716   //   - new_schema does not contain type names that are already in use by a
717   //     different database.
718   //
719   // Returns:
720   //   - SetSchemaResult that indicates whether the new schema can be set.
721   //   - INTERNAL_ERROR on any IO errors.
722   //   - INVALID_ARGUMENT_ERROR if the schema is invalid.
723   libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
724   SetInitialSchemaForDatabase(SchemaProto new_schema,
725                               bool ignore_errors_and_delete_documents,
726                               bool allow_circular_schema_definitions);
727 
728   // Sets the schema for a database, overriding any existing schema for that
729   // database.
730   //
731   // Note that when schema database is disabled, this function sets and
732   // overrides the entire schema.
733   //
734   // Requires:
735   //   - All types in new_schema are from the same database.
736   //   - new_schema does not contain type names that are already in use by a
737   //     different database.
738   //
739   // Returns:
740   //   - SetSchemaResult that encapsulates the differences between the old and
741   //     new schema, as well as whether the new schema can be set.
742   //   - INTERNAL_ERROR on any IO errors.
743   //   - INVALID_ARGUMENT_ERROR if the schema is invalid.
744   libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
745   SetSchemaWithDatabaseOverride(SchemaProto new_schema,
746                                 const SchemaProto& old_schema,
747                                 bool ignore_errors_and_delete_documents,
748                                 bool allow_circular_schema_definitions);
749 
750   // Initial validation of the SchemaProto for SetSchema. This is intended as a
751   // preliminary check before any expensive SetSchema work is performed (e.g.
752   // `SchemaUtil::Validate`). Returns the schema's database if it's valid.
753   //
754   // Note that when schema database is disabled, any schema input is valid and
755   // an empty string is returned as the database.
756   //
757   // Checks that:
758   // - The new schema only contains types from a single database.
759   // - The schema's type names are not already in use in other databases. This
760   //   is done outside of `SchemaUtil::Validate` because we need to know all
761   //   existing type names, which are stored in the SchemaStore and not known to
762   //   SchemaUtil.
763   //
764   // Returns:
765   //   - new_schema's database on success
766   //   - INVALID_ARGUMENT_ERROR if new_schema contains types from multiple
767   //     databases
768   //   - ALREADY_EXISTS_ERROR if new_schema's type names are not unique
769   libtextclassifier3::StatusOr<std::string> ValidateAndGetDatabase(
770       const SchemaProto& new_schema) const;
771 
772   // Returns a SchemaProto representing the full schema, i.e. the combination of
773   // the existing schema and input_database_schema.
774   //
775   // For the database being updated by input_database_schema:
776   // - If the existing schema does not contain the database, the input types
777   //   are appended to the end of the SchemaProto, without changing the order
778   //   of the existing schema types.
779   // - Otherwise, that database's existing types are replaced with types from
780   //   input_database_schema in their original position in the existing
781   //   SchemaProto.
782   //   - Types from input_database_schema are added in the order in which they
783   //     appear.
784   //   - If more types are added to the database, the additional types are
785   //     appended at the end of the SchemaProto, without changing the order of
786   //     existing types from unaffected databases.
787   //
788   // Requires:
789   //   - input_database_schema must not contain types from multiple databases.
790   //
791   // Returns:
792   //   - SchemaProto on success
793   //   - INTERNAL_ERROR on any IO errors, or if the schema store was not
794   //     previously initialized properly.
795   //   - INVALID_ARGUMENT_ERROR if input_database_schema contains types from
796   //     multiple databases.
797   libtextclassifier3::StatusOr<SchemaProto> GetFullSchemaProtoWithUpdatedDb(
798       SchemaProto input_database_schema) const;
799 
800   const Filesystem* filesystem_;  // NOTE(review): presumably not owned.
801   std::string base_dir_;
802   const Clock* clock_;  // NOTE(review): presumably not owned.
803   const FeatureFlags* feature_flags_;  // Does not own.
804 
805   // Whether this instance has been successfully initialized with a valid
806   // schema. False if Initialize failed or no schema has ever been set. Read by
807   // CheckSchemaSet().
808   bool has_schema_successfully_set_ = false;
809 
810   // The cached schema, held in a file-backed SchemaProto.
811   std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_;
812 
813   // Overlay schema: holds the definitions of any schema types that are not
814   // compatible with older versions of Icing code.
815   std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_;
816 
817   // Maps schema types to a densely-assigned unique id (SchemaTypeId).
818   std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
819 
820   // Maps schema type ids back to the corresponding schema type. This is the
821   // inverse map of schema_type_mapper_.
822   std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_;
823 
824   // A hash map of (database -> vector of type config names in the database).
825   //
826   // We use a vector instead of a set because we need to preserve the order of
827   // the types (i.e. the order in which they appear in the input SchemaProto
828   // during SetSchema), so that we can return the correct SchemaProto for
829   // GetSchema.
830   //
831   // This keeps track of the type configs defined in each database, which allows
832   // schema operations to be performed on a per-database basis.
833   std::unordered_map<std::string, std::vector<std::string>> database_type_map_;
834 
835   // A hash map of (type config name -> type config) that allows fast lookup of
836   // a type config in the schema. The O(1) type config access makes
837   // schema-related and section-related operations faster.
838   SchemaUtil::TypeConfigMap type_config_map_;
839 
840   // Maps from each type id to all of its subtype ids.
841   // T2 is a subtype of T1 if and only if one of the following conditions is
842   // met:
843   // - T2 is T1
844   // - T2 extends T1
845   // - There exists a type U such that T2 is a subtype of U, and U is a subtype
846   //   of T1
847   std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>>
848       schema_subtype_id_map_;
849 
850   // Manager of section (indexable property) and joinable property related
851   // metadata for all schema types.
852   std::unique_ptr<const SchemaTypeManager> schema_type_manager_;
853 
854   // Caches and manages the schema's scorable properties.
855   std::unique_ptr<ScorablePropertyManager> scorable_property_manager_;
856 
857   std::unique_ptr<Header> header_;  // NOTE(review): undocumented; add a comment.
858 
859   // Whether to use the database field for the schema.
860   //
861   // This is a temporary flag to control the rollout of the schema database. It
862   // affects the `SetSchema` and `GetSchema(std::string database)` methods.
863   // TODO - b/337913932: Remove this flag once the schema database is fully
864   // rolled out.
865   bool enable_schema_database_ = false;
866 };
867 
868 }  // namespace lib
869 }  // namespace icing
870 
871 #endif  // ICING_SCHEMA_SCHEMA_STORE_H_
872