xref: /aosp_15_r20/external/icing/icing/schema/schema-store.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/schema/schema-store.h"
16 
17 #include <cinttypes>
18 #include <cstddef>
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <optional>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <utility>
28 #include <vector>
29 
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/str_cat.h"
34 #include "icing/feature-flags.h"
35 #include "icing/file/destructible-directory.h"
36 #include "icing/file/file-backed-proto.h"
37 #include "icing/file/filesystem.h"
38 #include "icing/file/version-util.h"
39 #include "icing/legacy/core/icing-string-util.h"
40 #include "icing/proto/debug.pb.h"
41 #include "icing/proto/document.pb.h"
42 #include "icing/proto/logging.pb.h"
43 #include "icing/proto/schema.pb.h"
44 #include "icing/proto/search.pb.h"
45 #include "icing/proto/storage.pb.h"
46 #include "icing/schema/backup-schema-producer.h"
47 #include "icing/schema/joinable-property.h"
48 #include "icing/schema/property-util.h"
49 #include "icing/schema/schema-property-iterator.h"
50 #include "icing/schema/schema-type-manager.h"
51 #include "icing/schema/schema-util.h"
52 #include "icing/schema/scorable_property_manager.h"
53 #include "icing/schema/section.h"
54 #include "icing/store/document-filter-data.h"
55 #include "icing/store/dynamic-trie-key-mapper.h"
56 #include "icing/util/clock.h"
57 #include "icing/util/crc32.h"
58 #include "icing/util/logging.h"
59 #include "icing/util/status-macros.h"
60 
61 namespace icing {
62 namespace lib {
63 
64 namespace {
65 
// Names of the files that SchemaStore keeps under its base directory. The
// Make*Filename helpers in this file join each of them onto the base directory.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// Character that separates the database prefix from the actual type name in a
// schema type name (<database><delimiter><actual_type_name>).
//
// This should be kept consistent with the delimiter used in AppSearch.
// See:
// https://cs.android.com/androidx/platform/frameworks/support/+/androidx-main:appsearch/appsearch-local-storage/src/main/java/androidx/appsearch/localstorage/util/PrefixUtil.java;l=42;drc=ffaf979c6f0cbd26caafd7a9d07a6bad12fe3a2a

constexpr char kAppSearchDatabaseDelimiter = '/';

// Size limit handed to DynamicTrieKeyMapper::Create for the schema type mapper.
//
// A DynamicTrieKeyMapper stores its data across 3 arrays internally. Giving
// each array 128KiB for storage means the entire DynamicTrieKeyMapper requires
// 384KiB.
constexpr int32_t kSchemaTypeMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
81 
MakeHeaderFilename(const std::string & base_dir)82 std::string MakeHeaderFilename(const std::string& base_dir) {
83   return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
84 }
85 
MakeSchemaFilename(const std::string & base_dir)86 std::string MakeSchemaFilename(const std::string& base_dir) {
87   return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
88 }
89 
MakeOverlaySchemaFilename(const std::string & base_dir)90 std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
91   return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
92 }
93 
MakeSchemaTypeMapperFilename(const std::string & base_dir)94 std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
95   return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
96 }
97 
98 // Assuming that SchemaTypeIds are assigned to schema types based on their order
99 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
100 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)101 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
102     const SchemaProto& old_schema, const SchemaProto& new_schema) {
103   std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
104 
105   std::unordered_map<std::string, int> old_types_and_index;
106   for (int i = 0; i < old_schema.types().size(); ++i) {
107     old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
108   }
109 
110   std::unordered_map<std::string, int> new_types_and_index;
111   for (int i = 0; i < new_schema.types().size(); ++i) {
112     new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
113   }
114 
115   for (const auto& old_type_index : old_types_and_index) {
116     const auto& iter = new_types_and_index.find(old_type_index.first);
117     // We only care if the type exists in both the old and new schema. If the
118     // type has been deleted, then it'll be captured in
119     // SetSchemaResult.schema_types_deleted*. If the type has been added in the
120     // new schema then we also don't care because nothing needs to be updated.
121     if (iter != new_types_and_index.end()) {
122       // Since the SchemaTypeId of the schema type is just the index of it in
123       // the SchemaProto, compare the index and save it if it's not the same
124       if (old_type_index.second != iter->second) {
125         old_schema_type_ids_changed.emplace(old_type_index.second);
126       }
127     }
128   }
129 
130   return old_schema_type_ids_changed;
131 }
132 
// Extracts the database prefix from a schema type name.
//
// A database-qualified type name has the form
// <database><delimiter><actual_type_name>; everything before the first
// occurrence of `database_delimeter` is treated as the database.
//
// Returns the database, or an empty string when `schema_type` does not contain
// the delimiter at all (or starts with it).
std::string GetDatabaseFromSchemaType(const std::string& schema_type,
                                      char database_delimeter) {
  const std::string::size_type delimiter_pos =
      schema_type.find(database_delimeter);
  if (delimiter_pos == std::string::npos) {
    return std::string();
  }
  return schema_type.substr(0, delimiter_pos);
}
149 
150 // For each schema type in the schema proto, parses out the database from the
151 // type name, and sets it as the database field in the input proto in
152 // place. The schema_type name field itself is not modified.
153 //
154 // If the schema type name does not contain an AppSearch database, then
155 // SchemaTypeConfigProto is not modified.
156 //
157 // Returns:
158 //   - True if any SchemaTypeConfigProto in the schema proto is rewritten.
159 //   - False otherwise.
ParseAndPopulateAppSearchDatabaseField(SchemaProto & schema_proto)160 bool ParseAndPopulateAppSearchDatabaseField(SchemaProto& schema_proto) {
161   bool populated_database_field = false;
162   for (auto& type : *schema_proto.mutable_types()) {
163     std::string database = GetDatabaseFromSchemaType(
164         type.schema_type(), kAppSearchDatabaseDelimiter);
165     if (type.database() != database) {
166       type.set_database(std::move(database));
167       populated_database_field = true;
168     }
169   }
170   return populated_database_field;
171 }
172 
173 }  // namespace
174 
175 /* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
Read(const Filesystem * filesystem,std::string path)176 SchemaStore::Header::Read(const Filesystem* filesystem, std::string path) {
177   if (!filesystem->FileExists(path.c_str())) {
178     return absl_ports::NotFoundError(
179         absl_ports::StrCat("Header file is empty: ", path));
180   }
181 
182   SerializedHeader serialized_header;
183   ScopedFd sfd(filesystem->OpenForWrite(path.c_str()));
184   if (!sfd.is_valid()) {
185     return absl_ports::InternalError("Unable to open or create header file.");
186   }
187 
188   // If file is sizeof(LegacyHeader), then it must be LegacyHeader.
189   int64_t file_size = filesystem->GetFileSize(sfd.get());
190   if (file_size == sizeof(LegacyHeader)) {
191     LegacyHeader legacy_header;
192     if (!filesystem->Read(sfd.get(), &legacy_header, sizeof(legacy_header))) {
193       return absl_ports::InternalError(
194           absl_ports::StrCat("Couldn't read: ", path));
195     }
196     if (legacy_header.magic != Header::kMagic) {
197       return absl_ports::InternalError(
198           absl_ports::StrCat("Invalid header kMagic for file: ", path));
199     }
200     serialized_header.checksum = legacy_header.checksum;
201   } else if (file_size == sizeof(SerializedHeader)) {
202     if (!filesystem->Read(sfd.get(), &serialized_header,
203                           sizeof(serialized_header))) {
204       return absl_ports::InternalError(
205           absl_ports::StrCat("Couldn't read: ", path));
206     }
207     if (serialized_header.magic != Header::kMagic) {
208       return absl_ports::InternalError(
209           absl_ports::StrCat("Invalid header kMagic for file: ", path));
210     }
211   } else if (file_size != 0) {
212     // file is neither the legacy header, the new header nor empty. Something is
213     // wrong here.
214     int legacy_header_size = sizeof(LegacyHeader);
215     int header_size = sizeof(SerializedHeader);
216     return absl_ports::InternalError(IcingStringUtil::StringPrintf(
217         "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
218         legacy_header_size, header_size));
219   }
220   return Header(serialized_header, std::move(path), std::move(sfd), filesystem);
221 }
222 
Write()223 libtextclassifier3::Status SchemaStore::Header::Write() {
224   if (!dirty_) {
225     return libtextclassifier3::Status::OK;
226   }
227   if (!header_fd_.is_valid() && !filesystem_->FileExists(path_.c_str())) {
228     header_fd_.reset(filesystem_->OpenForWrite(path_.c_str()));
229   }
230   // This should overwrite the header.
231   if (!header_fd_.is_valid() ||
232       !filesystem_->PWrite(header_fd_.get(), /*offset=*/0, &serialized_header_,
233                            sizeof(serialized_header_))) {
234     return absl_ports::InternalError(
235         absl_ports::StrCat("Failed to write SchemaStore header"));
236   }
237   dirty_ = false;
238   return libtextclassifier3::Status::OK;
239 }
240 
PersistToDisk()241 libtextclassifier3::Status SchemaStore::Header::PersistToDisk() {
242   if (dirty_) {
243     ICING_RETURN_IF_ERROR(Write());
244   }
245   // This should overwrite the header.
246   if (!header_fd_.is_valid() || !filesystem_->DataSync(header_fd_.get())) {
247     return absl_ports::InternalError(
248         absl_ports::StrCat("Failed to sync SchemaStore header."));
249   }
250   return libtextclassifier3::Status::OK;
251 }
252 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const FeatureFlags * feature_flags,bool enable_schema_database,InitializeStatsProto * initialize_stats)253 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
254     const Filesystem* filesystem, const std::string& base_dir,
255     const Clock* clock, const FeatureFlags* feature_flags,
256     bool enable_schema_database, InitializeStatsProto* initialize_stats) {
257   ICING_RETURN_ERROR_IF_NULL(filesystem);
258   ICING_RETURN_ERROR_IF_NULL(clock);
259   ICING_RETURN_ERROR_IF_NULL(feature_flags);
260 
261   if (!filesystem->DirectoryExists(base_dir.c_str())) {
262     return absl_ports::FailedPreconditionError(
263         "Schema store base directory does not exist!");
264   }
265   std::unique_ptr<SchemaStore> schema_store =
266       std::unique_ptr<SchemaStore>(new SchemaStore(
267           filesystem, base_dir, clock, feature_flags, enable_schema_database));
268   ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
269   return schema_store;
270 }
271 
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const FeatureFlags * feature_flags,SchemaProto schema,bool enable_schema_database)272 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
273     const Filesystem* filesystem, const std::string& base_dir,
274     const Clock* clock, const FeatureFlags* feature_flags, SchemaProto schema,
275     bool enable_schema_database) {
276   ICING_RETURN_ERROR_IF_NULL(filesystem);
277   ICING_RETURN_ERROR_IF_NULL(clock);
278   ICING_RETURN_ERROR_IF_NULL(feature_flags);
279 
280   if (!filesystem->DirectoryExists(base_dir.c_str())) {
281     return absl_ports::FailedPreconditionError(
282         "Schema store base directory does not exist!");
283   }
284   std::unique_ptr<SchemaStore> schema_store =
285       std::unique_ptr<SchemaStore>(new SchemaStore(
286           filesystem, base_dir, clock, feature_flags, enable_schema_database));
287   ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
288   return schema_store;
289 }
290 
291 /* static */ libtextclassifier3::Status
PopulateSchemaDatabaseFieldForSchemaFile(const Filesystem * filesystem,const std::string & schema_filename)292 SchemaStore::PopulateSchemaDatabaseFieldForSchemaFile(
293     const Filesystem* filesystem, const std::string& schema_filename) {
294   FileBackedProto<SchemaProto> schema_file(*filesystem, schema_filename);
295   auto schema_proto_or = schema_file.Read();
296   if (absl_ports::IsNotFound(schema_proto_or.status())) {
297     // Don't have an existing schema proto, that's fine
298     return libtextclassifier3::Status::OK;
299   } else if (!schema_proto_or.ok()) {
300     // Real error when trying to read the existing schema
301     return schema_proto_or.status();
302   }
303 
304   SchemaProto schema_proto_copy = *schema_proto_or.ValueOrDie();
305   bool schema_changed =
306       ParseAndPopulateAppSearchDatabaseField(schema_proto_copy);
307   if (!schema_changed) {
308     // Nothing to do if the schema is not changed.
309     return libtextclassifier3::Status::OK;
310   }
311 
312   // Create a temporary schema file and schema proto copy to update the
313   // schema.
314   std::string temp_schema_filename = schema_filename + ".tmp";
315   if (!filesystem->DeleteFile(temp_schema_filename.c_str())) {
316     return absl_ports::InternalError(
317         "Unable to delete temp schema file to prepare for schema database "
318         "migration.");
319   }
320 
321   {
322     FileBackedProto<SchemaProto> temp_schema_file(*filesystem,
323                                                   temp_schema_filename);
324     ICING_RETURN_IF_ERROR(temp_schema_file.Write(
325         std::make_unique<SchemaProto>(schema_proto_copy)));
326   }
327 
328   // Swap the temp schema file with the original schema file.
329   if (!filesystem->SwapFiles(temp_schema_filename.c_str(),
330                              schema_filename.c_str())) {
331     return absl_ports::InternalError(
332         "Unable to apply migrated schema with database due to failed swap!");
333   }
334   // Clean up the temp schema file.
335   if (!filesystem->DeleteFile(temp_schema_filename.c_str())) {
336     return absl_ports::InternalError(
337         "Unable to delete temp schema file after schema database migration.");
338   }
339 
340   return libtextclassifier3::Status::OK;
341 }
342 
DiscardOverlaySchema(const Filesystem * filesystem,const std::string & base_dir,Header & header)343 /* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
344     const Filesystem* filesystem, const std::string& base_dir, Header& header) {
345   std::string header_filename = MakeHeaderFilename(base_dir);
346   if (header.overlay_created()) {
347     header.SetOverlayInfo(
348         /*overlay_created=*/false,
349         /*min_overlay_version_compatibility=*/std::numeric_limits<
350             int32_t>::max());
351     ICING_RETURN_IF_ERROR(header.Write());
352   }
353   std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
354   if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
355     return absl_ports::InternalError(
356         "Unable to delete stale schema overlay file.");
357   }
358   return libtextclassifier3::Status::OK;
359 }
360 
MigrateSchema(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version,bool perform_schema_database_migration)361 /* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
362     const Filesystem* filesystem, const std::string& base_dir,
363     version_util::StateChange version_state_change, int32_t new_version,
364     bool perform_schema_database_migration) {
365   if (!filesystem->DirectoryExists(base_dir.c_str())) {
366     // Situations when schema store directory doesn't exist:
367     // - Initializing new Icing instance: don't have to do anything now. The
368     //   directory will be created later.
369     // - Lose schema store: there is nothing we can do now. The logic will be
370     //   handled later by initializing.
371     //
372     // Therefore, just simply return OK here.
373     return libtextclassifier3::Status::OK;
374   }
375 
376   ICING_RETURN_IF_ERROR(HandleOverlaySchemaForVersionChange(
377       filesystem, base_dir, version_state_change, new_version));
378 
379   // Perform schema database migration if needed.
380   // - This populates the the database field in the schema proto and writes it
381   //   to the schema file.
382   // - If the overlay schema file exists at this point, does the same for the
383   //   overlay schema.
384   if (perform_schema_database_migration) {
385     std::string base_schema_filename = MakeSchemaFilename(base_dir);
386     ICING_RETURN_IF_ERROR(PopulateSchemaDatabaseFieldForSchemaFile(
387         filesystem, base_schema_filename));
388 
389     std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
390     if (filesystem->FileExists(overlay_schema_filename.c_str())) {
391       ICING_RETURN_IF_ERROR(PopulateSchemaDatabaseFieldForSchemaFile(
392           filesystem, overlay_schema_filename));
393     }
394   }
395 
396   return libtextclassifier3::Status::OK;
397 }
398 
399 /* static */ libtextclassifier3::Status
HandleOverlaySchemaForVersionChange(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version)400 SchemaStore::HandleOverlaySchemaForVersionChange(
401     const Filesystem* filesystem, const std::string& base_dir,
402     version_util::StateChange version_state_change, int32_t new_version) {
403   std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
404   if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
405     // The overlay doesn't exist. So there should be nothing particularly
406     // interesting to worry about.
407     return libtextclassifier3::Status::OK;
408   }
409 
410   std::string header_filename = MakeHeaderFilename(base_dir);
411   libtextclassifier3::StatusOr<Header> header_or;
412   switch (version_state_change) {
413     // No necessary actions for normal upgrades or no version change. The data
414     // that was produced by the previous version is fully compatible with this
415     // version and there's no stale data for us to clean up.
416     // The same is true for a normal rollforward. A normal rollforward implies
417     // that the previous version was one that understood the concept of the
418     // overlay schema and would have already discarded it if it was unusable.
419     case version_util::StateChange::kVersionZeroUpgrade:
420       // fallthrough
421     case version_util::StateChange::kUpgrade:
422       // fallthrough
423     case version_util::StateChange::kRollForward:
424       // fallthrough
425     case version_util::StateChange::kCompatible:
426       return libtextclassifier3::Status::OK;
427     case version_util::StateChange::kVersionZeroRollForward: {
428       // We've rolled forward. The schema overlay file, if it exists, is
429       // possibly stale. We must throw it out.
430       header_or = Header::Read(filesystem, header_filename);
431       ICING_RETURN_IF_ERROR(header_or.status());
432       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
433                                                header_or.ValueOrDie());
434     }
435     case version_util::StateChange::kRollBack: {
436       header_or = Header::Read(filesystem, header_filename);
437       ICING_RETURN_IF_ERROR(header_or.status());
438       if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
439           new_version) {
440         // We've been rolled back, but the overlay schema claims that it
441         // supports this version. So we can safely return.
442         return libtextclassifier3::Status::OK;
443       }
444       // We've been rolled back to a version that the overlay schema doesn't
445       // support. We must throw it out.
446       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
447                                                header_or.ValueOrDie());
448     }
449     case version_util::StateChange::kUndetermined:
450       // It's not clear what version we're on, but the base schema should always
451       // be safe to use. Throw out the overlay.
452       header_or = Header::Read(filesystem, header_filename);
453       ICING_RETURN_IF_ERROR(header_or.status());
454       return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
455                                                header_or.ValueOrDie());
456   }
457   return libtextclassifier3::Status::OK;
458 }
459 
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)460 /* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
461     const Filesystem* filesystem, const std::string& base_dir) {
462   // Schema type mapper
463   return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
464       *filesystem, MakeSchemaTypeMapperFilename(base_dir));
465 }
466 
// Stores the given dependencies and binds schema_file_ to the base schema file
// under the base directory. The schema itself is loaded separately by
// Initialize().
//
// `filesystem` is dereferenced here, so it must not be null.
SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
                         const Clock* clock, const FeatureFlags* feature_flags,
                         bool enable_schema_database)
    : filesystem_(filesystem),
      base_dir_(std::move(base_dir)),
      clock_(clock),
      feature_flags_(feature_flags),
      // Uses the member base_dir_, not the moved-from parameter.
      // NOTE(review): this relies on base_dir_ being declared before
      // schema_file_ in the class - confirm against the header.
      schema_file_(std::make_unique<FileBackedProto<SchemaProto>>(
          *filesystem, MakeSchemaFilename(base_dir_))),
      enable_schema_database_(enable_schema_database) {}
477 
~SchemaStore()478 SchemaStore::~SchemaStore() {
479   if (has_schema_successfully_set_ && schema_file_ != nullptr &&
480       schema_type_mapper_ != nullptr && schema_type_manager_ != nullptr) {
481     if (!PersistToDisk().ok()) {
482       ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
483     }
484   }
485 }
486 
Initialize(SchemaProto new_schema)487 libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
488   ICING_RETURN_IF_ERROR(LoadSchema());
489   if (!absl_ports::IsNotFound(GetSchema().status())) {
490     return absl_ports::FailedPreconditionError(
491         "Incorrectly tried to initialize schema store with a new schema, when "
492         "one is already set!");
493   }
494   ICING_RETURN_IF_ERROR(schema_file_->Write(
495       std::make_unique<SchemaProto>(std::move(new_schema))));
496   return InitializeInternal(/*create_overlay_if_necessary=*/true,
497                             /*initialize_stats=*/nullptr);
498 }
499 
Initialize(InitializeStatsProto * initialize_stats)500 libtextclassifier3::Status SchemaStore::Initialize(
501     InitializeStatsProto* initialize_stats) {
502   ICING_RETURN_IF_ERROR(LoadSchema());
503   auto schema_proto_or = GetSchema();
504   if (absl_ports::IsNotFound(schema_proto_or.status())) {
505     // Don't have an existing schema proto, that's fine
506     return libtextclassifier3::Status::OK;
507   } else if (!schema_proto_or.ok()) {
508     // Real error when trying to read the existing schema
509     return schema_proto_or.status();
510   }
511   return InitializeInternal(/*create_overlay_if_necessary=*/false,
512                             initialize_stats);
513 }
514 
LoadSchema()515 libtextclassifier3::Status SchemaStore::LoadSchema() {
516   libtextclassifier3::StatusOr<Header> header_or =
517       Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
518   bool header_exists = false;
519   if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
520     return header_or.status();
521   } else if (!header_or.ok()) {
522     header_ =
523         std::make_unique<Header>(filesystem_, MakeHeaderFilename(base_dir_));
524   } else {
525     header_exists = true;
526     header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
527   }
528 
529   std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
530   bool overlay_schema_file_exists =
531       filesystem_->FileExists(overlay_schema_filename.c_str());
532 
533   libtextclassifier3::Status base_schema_state = schema_file_->Read().status();
534   if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
535     return base_schema_state;
536   }
537 
538   // There are three valid cases:
539   // 1. Everything is missing. This is an empty schema store.
540   if (!base_schema_state.ok() && !overlay_schema_file_exists &&
541       !header_exists) {
542     return libtextclassifier3::Status::OK;
543   }
544 
545   // 2. There never was a overlay schema. The header exists, the base schema
546   //    exists and the header says the overlay schema shouldn't exist
547   if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
548       !header_->overlay_created()) {
549     // Nothing else to do. Just return safely.
550     return libtextclassifier3::Status::OK;
551   }
552 
553   // 3. There is an overlay schema and a base schema and a header. The header
554   // says that the overlay schema should exist.
555   if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
556       header_->overlay_created()) {
557     overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
558         *filesystem_, MakeOverlaySchemaFilename(base_dir_));
559     return libtextclassifier3::Status::OK;
560   }
561 
562   // Something has gone wrong. We've lost part of the schema ground truth.
563   // Return an error.
564   bool overlay_created = header_->overlay_created();
565   bool base_schema_exists = base_schema_state.ok();
566   return absl_ports::InternalError(IcingStringUtil::StringPrintf(
567       "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
568       "base schema exists: %d, overlay_schema_exists: %d",
569       header_exists, overlay_created, base_schema_exists,
570       overlay_schema_file_exists));
571 }
572 
InitializeInternal(bool create_overlay_if_necessary,InitializeStatsProto * initialize_stats)573 libtextclassifier3::Status SchemaStore::InitializeInternal(
574     bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
575   if (!InitializeDerivedFiles().ok()) {
576     ICING_VLOG(3)
577         << "Couldn't find derived files or failed to initialize them, "
578            "regenerating derived files for SchemaStore.";
579     std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
580     if (initialize_stats != nullptr) {
581       initialize_stats->set_schema_store_recovery_cause(
582           InitializeStatsProto::IO_ERROR);
583     }
584     ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
585     if (initialize_stats != nullptr) {
586       initialize_stats->set_schema_store_recovery_latency_ms(
587           regenerate_timer->GetElapsedMilliseconds());
588     }
589   }
590 
591   if (initialize_stats != nullptr) {
592     initialize_stats->set_num_schema_types(type_config_map_.size());
593   }
594   has_schema_successfully_set_ = true;
595 
596   return libtextclassifier3::Status::OK;
597 }
598 
InitializeDerivedFiles()599 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
600   ICING_ASSIGN_OR_RETURN(
601       schema_type_mapper_,
602       DynamicTrieKeyMapper<SchemaTypeId>::Create(
603           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
604           kSchemaTypeMapperMaxSize));
605 
606   Crc32 expected_checksum(header_->checksum());
607   ICING_ASSIGN_OR_RETURN(Crc32 checksum, GetChecksum());
608   if (checksum != expected_checksum) {
609     return absl_ports::InternalError(
610         "Combined checksum of SchemaStore was inconsistent");
611   }
612 
613   ICING_RETURN_IF_ERROR(BuildInMemoryCache());
614   return libtextclassifier3::Status::OK;
615 }
616 
RegenerateDerivedFiles(bool create_overlay_if_necessary)617 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
618     bool create_overlay_if_necessary) {
619   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
620 
621   ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
622 
623   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
624     // Assign a SchemaTypeId to the type
625     ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
626         type_config.schema_type(), schema_type_mapper_->num_keys()));
627   }
628   ICING_RETURN_IF_ERROR(BuildInMemoryCache());
629 
630   if (create_overlay_if_necessary) {
631     ICING_ASSIGN_OR_RETURN(
632         BackupSchemaProducer producer,
633         BackupSchemaProducer::Create(*schema_proto,
634                                      schema_type_manager_->section_manager()));
635 
636     if (producer.is_backup_necessary()) {
637       SchemaProto base_schema = std::move(producer).Produce();
638 
639       // The overlay schema should be written to the overlay file location.
640       overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
641           *filesystem_, MakeOverlaySchemaFilename(base_dir_));
642       auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
643       ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
644 
645       // The base schema should be written to the original file
646       auto base_schema_ptr =
647           std::make_unique<SchemaProto>(std::move(base_schema));
648       ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr)));
649 
650       // LINT.IfChange(min_overlay_version_compatibility)
651       // Although the current version is 5, the schema is compatible with
652       // version 1, so min_overlay_version_compatibility should be 1.
653       int32_t min_overlay_version_compatibility = version_util::kVersionOne;
654       // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion)
655       header_->SetOverlayInfo(
656           /*overlay_created=*/true, min_overlay_version_compatibility);
657       // Rebuild in memory data - references to the old schema will be invalid
658       // now.
659       ICING_RETURN_IF_ERROR(BuildInMemoryCache());
660     }
661   }
662 
663   // Write the header
664   ICING_RETURN_IF_ERROR(UpdateChecksum());
665   return libtextclassifier3::Status::OK;
666 }
667 
BuildInMemoryCache()668 libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
669   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
670   ICING_ASSIGN_OR_RETURN(
671       SchemaUtil::InheritanceMap inheritance_map,
672       SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
673 
674   reverse_schema_type_mapper_.clear();
675   database_type_map_.clear();
676   type_config_map_.clear();
677   schema_subtype_id_map_.clear();
678   for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
679     const std::string& database = type_config.database();
680     const std::string& type_name = type_config.schema_type();
681     ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
682                            schema_type_mapper_->Get(type_name));
683 
684     // Build reverse_schema_type_mapper_
685     reverse_schema_type_mapper_.insert({type_id, type_name});
686 
687     // Build database_type_map_
688     database_type_map_[database].push_back(type_name);
689 
690     // Build type_config_map_
691     type_config_map_.insert({type_name, type_config});
692 
693     // Build schema_subtype_id_map_
694     std::unordered_set<SchemaTypeId>& subtype_id_set =
695         schema_subtype_id_map_[type_id];
696     // Find all child types
697     auto child_types_names = inheritance_map.find(type_name);
698     if (child_types_names != inheritance_map.end()) {
699       subtype_id_set.reserve(child_types_names->second.size() + 1);
700       for (const auto& [child_type_name, is_direct_child] :
701            child_types_names->second) {
702         ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
703                                schema_type_mapper_->Get(child_type_name));
704         subtype_id_set.insert(child_type_id);
705       }
706     }
707     // Every type is a subtype of itself.
708     subtype_id_set.insert(type_id);
709   }
710 
711   // Build schema_type_manager_
712   ICING_ASSIGN_OR_RETURN(
713       schema_type_manager_,
714       SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
715 
716   scorable_property_manager_ = std::make_unique<ScorablePropertyManager>();
717 
718   return libtextclassifier3::Status::OK;
719 }
720 
ResetSchemaTypeMapper()721 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
722   // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
723   schema_type_mapper_.reset();
724   // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
725   // that can support error logging.
726   libtextclassifier3::Status status =
727       DynamicTrieKeyMapper<SchemaTypeId>::Delete(
728           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
729   if (!status.ok()) {
730     ICING_LOG(ERROR) << status.error_message()
731                      << "Failed to delete old schema_type mapper";
732     return status;
733   }
734   ICING_ASSIGN_OR_RETURN(
735       schema_type_mapper_,
736       DynamicTrieKeyMapper<SchemaTypeId>::Create(
737           *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
738           kSchemaTypeMapperMaxSize));
739 
740   return libtextclassifier3::Status::OK;
741 }
742 
GetChecksum() const743 libtextclassifier3::StatusOr<Crc32> SchemaStore::GetChecksum() const {
744   ICING_ASSIGN_OR_RETURN(Crc32 schema_checksum, schema_file_->GetChecksum());
745   // We've gotten the schema_checksum successfully. This means that
746   // schema_file_->Read() will only return either a schema or NOT_FOUND.
747   // Sadly, we actually need to differentiate between an existing, but empty
748   // schema and a non-existent schema (both of which will have a checksum of 0).
749   // For existing, but empty schemas, we need to continue with the checksum
750   // calculation of the other components.
751   if (schema_checksum == Crc32() &&
752       absl_ports::IsNotFound(schema_file_->Read().status())) {
753     return schema_checksum;
754   }
755 
756   Crc32 total_checksum;
757   total_checksum.Append(std::to_string(schema_checksum.Get()));
758   if (overlay_schema_file_ != nullptr) {
759     ICING_ASSIGN_OR_RETURN(Crc32 overlay_schema_checksum,
760                            overlay_schema_file_->GetChecksum());
761     total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
762   }
763 
764   ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
765                          schema_type_mapper_->GetChecksum());
766   total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
767   return total_checksum;
768 }
769 
UpdateChecksum()770 libtextclassifier3::StatusOr<Crc32> SchemaStore::UpdateChecksum() {
771   // FileBackedProto always keeps its checksum up to date. So we just need to
772   // retrieve the checksum.
773   ICING_ASSIGN_OR_RETURN(Crc32 schema_checksum, schema_file_->GetChecksum());
774   // We've gotten the schema_checksum successfully. This means that
775   // schema_file_->Read() will only return either a schema or NOT_FOUND.
776   // Sadly, we actually need to differentiate between an existing, but empty
777   // schema and a non-existent schema (both of which will have a checksum of 0).
778   // For existing, but empty schemas, we need to continue with the checksum
779   // calculation of the other components so that we will correctly write the
780   // header.
781   if (schema_checksum == Crc32() &&
782       absl_ports::IsNotFound(schema_file_->Read().status())) {
783     return schema_checksum;
784   }
785   Crc32 total_checksum;
786   total_checksum.Append(std::to_string(schema_checksum.Get()));
787 
788   if (overlay_schema_file_ != nullptr) {
789     ICING_ASSIGN_OR_RETURN(Crc32 overlay_schema_checksum,
790                            overlay_schema_file_->GetChecksum());
791     total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
792   }
793 
794   ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
795                          schema_type_mapper_->UpdateChecksum());
796   total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
797 
798   header_->set_checksum(total_checksum.Get());
799   ICING_RETURN_IF_ERROR(header_->Write());
800   return total_checksum;
801 }
802 
GetSchema() const803 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
804     const {
805   if (overlay_schema_file_ != nullptr) {
806     return overlay_schema_file_->Read();
807   }
808   return schema_file_->Read();
809 }
810 
GetSchema(const std::string & database) const811 libtextclassifier3::StatusOr<SchemaProto> SchemaStore::GetSchema(
812     const std::string& database) const {
813   if (!has_schema_successfully_set_) {
814     return absl_ports::NotFoundError("No schema found.");
815   }
816 
817   const auto database_type_map_itr_ = database_type_map_.find(database);
818   if (database_type_map_itr_ == database_type_map_.end()) {
819     return absl_ports::NotFoundError(
820         absl_ports::StrCat("No schema found for database '", database, "'."));
821   }
822 
823   SchemaProto schema_proto;
824   for (const std::string& type_name : database_type_map_itr_->second) {
825     ICING_ASSIGN_OR_RETURN(const SchemaTypeConfigProto* type_config,
826                            GetSchemaTypeConfig(type_name));
827     *schema_proto.add_types() = *type_config;
828   }
829   return schema_proto;
830 }
831 
832 // TODO(cassiewang): Consider removing this definition of SetSchema if it's not
833 // needed by production code. It's currently being used by our tests, but maybe
834 // it's trivial to change our test code to also use the
835 // SetSchema(SchemaProto&& new_schema)
836 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)837 SchemaStore::SetSchema(const SchemaProto& new_schema,
838                        bool ignore_errors_and_delete_documents,
839                        bool allow_circular_schema_definitions) {
840   return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents,
841                    allow_circular_schema_definitions);
842 }
843 
844 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)845 SchemaStore::SetSchema(SchemaProto&& new_schema,
846                        bool ignore_errors_and_delete_documents,
847                        bool allow_circular_schema_definitions) {
848   if (enable_schema_database_) {
849     // Step 1: (Only required if schema database is enabled)
850     // Do some preliminary checks on the new schema before formal validation and
851     // delta computation. This checks that:
852     // - The new schema only contains types from a single database.
853     // - The new schema's type names are not already in use from other
854     // databases.
855     ICING_ASSIGN_OR_RETURN(std::string database,
856                            ValidateAndGetDatabase(new_schema));
857 
858     // Step 2: Schema validation and delta computation -- try to get the
859     // existing schema for the database to compare to the new schema.
860     libtextclassifier3::StatusOr<SchemaProto> schema_proto =
861         GetSchema(database);
862     if (absl_ports::IsNotFound(schema_proto.status())) {
863       // Case 1: No preexisting schema for this database.
864       return SetInitialSchemaForDatabase(std::move(new_schema),
865                                          ignore_errors_and_delete_documents,
866                                          allow_circular_schema_definitions);
867     }
868 
869     if (!schema_proto.ok()) {
870       // Case 2: Real error
871       return schema_proto.status();
872     }
873 
874     // Case 3: At this point, we're guaranteed that we have an existing schema
875     // for this database.
876     const SchemaProto& old_schema = schema_proto.ValueOrDie();
877     return SetSchemaWithDatabaseOverride(std::move(new_schema), old_schema,
878                                          ignore_errors_and_delete_documents,
879                                          allow_circular_schema_definitions);
880   }
881 
882   // Get the full schema if schema database is disabled.
883   libtextclassifier3::StatusOr<const SchemaProto*> schema_proto = GetSchema();
884   if (absl_ports::IsNotFound(schema_proto.status())) {
885     // Case 1: No preexisting schema
886     return SetInitialSchemaForDatabase(std::move(new_schema),
887                                        ignore_errors_and_delete_documents,
888                                        allow_circular_schema_definitions);
889   }
890 
891   if (!schema_proto.ok()) {
892     // Case 2: Real error
893     return schema_proto.status();
894   }
895 
896   // Case 3: At this point, we're guaranteed that we have an existing schema
897   const SchemaProto& old_schema = *schema_proto.ValueOrDie();
898   return SetSchemaWithDatabaseOverride(std::move(new_schema), old_schema,
899                                        ignore_errors_and_delete_documents,
900                                        allow_circular_schema_definitions);
901 }
902 
903 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetInitialSchemaForDatabase(SchemaProto new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)904 SchemaStore::SetInitialSchemaForDatabase(
905     SchemaProto new_schema, bool ignore_errors_and_delete_documents,
906     bool allow_circular_schema_definitions) {
907   SetSchemaResult result;
908 
909   ICING_RETURN_IF_ERROR(SchemaUtil::Validate(
910       new_schema, *feature_flags_, allow_circular_schema_definitions));
911 
912   result.success = true;
913   for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
914     result.schema_types_new_by_name.insert(type_config.schema_type());
915   }
916   // Get the full new SchemaProto that is a combination of the existing schema
917   // and new_schema. This is needed as we can only write the full proto to the
918   // schema file.
919   ICING_ASSIGN_OR_RETURN(
920       SchemaProto full_new_schema,
921       GetFullSchemaProtoWithUpdatedDb(std::move(new_schema)));
922   ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(full_new_schema)));
923   has_schema_successfully_set_ = true;
924 
925   return result;
926 }
927 
928 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchemaWithDatabaseOverride(SchemaProto new_schema,const SchemaProto & old_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)929 SchemaStore::SetSchemaWithDatabaseOverride(
930     SchemaProto new_schema, const SchemaProto& old_schema,
931     bool ignore_errors_and_delete_documents,
932     bool allow_circular_schema_definitions) {
933   // Assume we can set the schema unless proven otherwise.
934   SetSchemaResult result;
935   result.success = true;
936 
937   if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
938     // Same schema as before. No need to update anything
939     return result;
940   }
941 
942   // Different schema -- we need to validate the schema and track the
943   // differences to see if we can still write it.
944   //
945   // Validate the new schema and compute the delta between the old and new
946   // schema.
947   ICING_ASSIGN_OR_RETURN(
948       SchemaUtil::DependentMap new_dependent_map,
949       SchemaUtil::Validate(new_schema, *feature_flags_,
950                            allow_circular_schema_definitions));
951   SchemaUtil::SchemaDelta schema_delta = SchemaUtil::ComputeCompatibilityDelta(
952       old_schema, new_schema, new_dependent_map, *feature_flags_);
953 
954   result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
955   result.schema_types_changed_fully_compatible_by_name =
956       std::move(schema_delta.schema_types_changed_fully_compatible);
957   result.schema_types_index_incompatible_by_name =
958       std::move(schema_delta.schema_types_index_incompatible);
959   result.schema_types_join_incompatible_by_name =
960       std::move(schema_delta.schema_types_join_incompatible);
961   result.schema_types_scorable_property_inconsistent_by_name =
962       std::move(schema_delta.schema_types_scorable_property_inconsistent);
963 
964   for (const std::string& schema_type : schema_delta.schema_types_deleted) {
965     // We currently don't support deletions, so mark this as not possible.
966     // This will change once we allow force-set schemas.
967     result.success = false;
968 
969     result.schema_types_deleted_by_name.emplace(schema_type);
970 
971     ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
972                            GetSchemaTypeId(schema_type));
973     result.schema_types_deleted_by_id.emplace(schema_type_id);
974   }
975 
976   for (const std::string& schema_type :
977        schema_delta.schema_types_incompatible) {
978     // We currently don't support incompatible schemas, so mark this as
979     // not possible. This will change once we allow force-set schemas.
980     result.success = false;
981 
982     result.schema_types_incompatible_by_name.emplace(schema_type);
983 
984     ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
985                            GetSchemaTypeId(schema_type));
986     result.schema_types_incompatible_by_id.emplace(schema_type_id);
987   }
988 
989   // Get the full new SchemaProto that is a combination of the existing schema
990   // and new_schema. This is needed to calculate the updated SchemaTypeIds, and
991   // for writing the full proto to the schema file.
992   ICING_ASSIGN_OR_RETURN(
993       SchemaProto full_new_schema,
994       GetFullSchemaProtoWithUpdatedDb(std::move(new_schema)));
995 
996   // We still need to update old_schema_type_ids_changed. We need to retrieve
997   // the entire old schema for this, as type ids are assigned for the entire
998   // schema, and not on a per-database level.
999   //
1000   // SchemaTypeIds changing is fine, we can update the DocumentStore.
1001   ICING_ASSIGN_OR_RETURN(const SchemaProto* full_old_schema, GetSchema());
1002   result.old_schema_type_ids_changed =
1003       SchemaTypeIdsChanged(*full_old_schema, full_new_schema);
1004 
1005   // We can force set the schema if the caller has told us to ignore any errors
1006   result.success = result.success || ignore_errors_and_delete_documents;
1007 
1008   // Step 3: Apply the schema change if success. This updates persisted files
1009   // and derived data structures.
1010   if (result.success) {
1011     ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(full_new_schema)));
1012     has_schema_successfully_set_ = true;
1013   }
1014 
1015   // Convert schema types to SchemaTypeIds after the new schema is applied.
1016   if (feature_flags_->enable_scorable_properties()) {
1017     for (const std::string& schema_type :
1018          result.schema_types_scorable_property_inconsistent_by_name) {
1019       libtextclassifier3::StatusOr<SchemaTypeId> schema_type_id_or =
1020           GetSchemaTypeId(schema_type);
1021       if (!schema_type_id_or.ok()) {
1022         if (absl_ports::IsNotFound(schema_type_id_or.status())) {
1023           continue;
1024         }
1025         return schema_type_id_or.status();
1026       }
1027       result.schema_types_scorable_property_inconsistent_by_id.insert(
1028           schema_type_id_or.ValueOrDie());
1029     }
1030   }
1031 
1032   return result;
1033 }
1034 
ApplySchemaChange(SchemaProto new_schema)1035 libtextclassifier3::Status SchemaStore::ApplySchemaChange(
1036     SchemaProto new_schema) {
1037   // We need to ensure that we either 1) successfully set the schema and
1038   // update all derived data structures or 2) fail and leave the schema store
1039   // unchanged.
1040   // So, first, we create an empty temporary directory to build a new schema
1041   // store in.
1042   std::string temp_schema_store_dir_path = base_dir_ + "_temp";
1043   if (!filesystem_->DeleteDirectoryRecursively(
1044           temp_schema_store_dir_path.c_str())) {
1045     ICING_LOG(ERROR) << "Recursively deleting "
1046                      << temp_schema_store_dir_path.c_str();
1047     return absl_ports::InternalError(
1048         "Unable to delete temp directory to prepare to build new schema "
1049         "store.");
1050   }
1051 
1052   DestructibleDirectory temp_schema_store_dir(
1053       filesystem_, std::move(temp_schema_store_dir_path));
1054   if (!temp_schema_store_dir.is_valid()) {
1055     return absl_ports::InternalError(
1056         "Unable to create temp directory to build new schema store.");
1057   }
1058 
1059   // Then we create our new schema store with the new schema.
1060   ICING_ASSIGN_OR_RETURN(
1061       std::unique_ptr<SchemaStore> new_schema_store,
1062       SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
1063                           feature_flags_, std::move(new_schema),
1064                           enable_schema_database_));
1065 
1066   // Then we swap the new schema file + new derived files with the old files.
1067   if (!filesystem_->SwapFiles(base_dir_.c_str(),
1068                               temp_schema_store_dir.dir().c_str())) {
1069     return absl_ports::InternalError(
1070         "Unable to apply new schema due to failed swap!");
1071   }
1072 
1073   std::string old_base_dir = std::move(base_dir_);
1074   *this = std::move(*new_schema_store);
1075 
1076   // After the std::move, the filepaths saved in this instance and in the
1077   // schema_file_ instance will still be the one from temp_schema_store_dir
1078   // even though they now point to files that are within old_base_dir.
1079   // Manually set them to the correct paths.
1080   base_dir_ = std::move(old_base_dir);
1081   schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_));
1082   if (overlay_schema_file_ != nullptr) {
1083     overlay_schema_file_->SetSwappedFilepath(
1084         MakeOverlaySchemaFilename(base_dir_));
1085   }
1086 
1087   return libtextclassifier3::Status::OK;
1088 }
1089 
1090 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const1091 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
1092   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1093   const auto& type_config_iter =
1094       type_config_map_.find(std::string(schema_type));
1095   if (type_config_iter == type_config_map_.end()) {
1096     return absl_ports::NotFoundError(
1097         absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
1098   }
1099   return &type_config_iter->second;
1100 }
1101 
GetSchemaTypeId(std::string_view schema_type) const1102 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
1103     std::string_view schema_type) const {
1104   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1105   return schema_type_mapper_->Get(schema_type);
1106 }
1107 
GetSchemaType(SchemaTypeId schema_type_id) const1108 libtextclassifier3::StatusOr<const std::string*> SchemaStore::GetSchemaType(
1109     SchemaTypeId schema_type_id) const {
1110   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1111   if (const auto it = reverse_schema_type_mapper_.find(schema_type_id);
1112       it == reverse_schema_type_mapper_.end()) {
1113     return absl_ports::InvalidArgumentError("Invalid schema type id");
1114   } else {
1115     return &it->second;
1116   }
1117 }
1118 
1119 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
GetSchemaTypeIdsWithChildren(std::string_view schema_type) const1120 SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
1121   ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1122                          GetSchemaTypeId(schema_type));
1123   auto iter = schema_subtype_id_map_.find(schema_type_id);
1124   if (iter == schema_subtype_id_map_.end()) {
1125     // This should never happen, unless there is an inconsistency or IO error.
1126     return absl_ports::InternalError(absl_ports::StrCat(
1127         "Schema type '", schema_type, "' is not found in the subtype map."));
1128   }
1129   return &iter->second;
1130 }
1131 
1132 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const1133 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
1134                                 SectionId section_id) const {
1135   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1136   return schema_type_manager_->section_manager().GetSectionMetadata(
1137       schema_type_id, section_id);
1138 }
1139 
ExtractSections(const DocumentProto & document) const1140 libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
1141     const DocumentProto& document) const {
1142   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1143   return schema_type_manager_->section_manager().ExtractSections(document);
1144 }
1145 
1146 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,const std::string & property_path) const1147 SchemaStore::GetJoinablePropertyMetadata(
1148     SchemaTypeId schema_type_id, const std::string& property_path) const {
1149   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1150   return schema_type_manager_->joinable_property_manager()
1151       .GetJoinablePropertyMetadata(schema_type_id, property_path);
1152 }
1153 
1154 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id) const1155 SchemaStore::GetJoinablePropertyMetadata(
1156     SchemaTypeId schema_type_id,
1157     JoinablePropertyId joinable_property_id) const {
1158   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1159   return schema_type_manager_->joinable_property_manager()
1160       .GetJoinablePropertyMetadata(schema_type_id, joinable_property_id);
1161 }
1162 
1163 libtextclassifier3::StatusOr<JoinablePropertyGroup>
ExtractJoinableProperties(const DocumentProto & document) const1164 SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
1165   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1166   return schema_type_manager_->joinable_property_manager()
1167       .ExtractJoinableProperties(document);
1168 }
1169 
1170 libtextclassifier3::StatusOr<std::optional<int>>
GetScorablePropertyIndex(SchemaTypeId schema_type_id,std::string_view property_path) const1171 SchemaStore::GetScorablePropertyIndex(SchemaTypeId schema_type_id,
1172                                       std::string_view property_path) const {
1173   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1174   if (!feature_flags_->enable_scorable_properties()) {
1175     return std::nullopt;
1176   }
1177   return scorable_property_manager_->GetScorablePropertyIndex(
1178       schema_type_id, property_path, type_config_map_,
1179       reverse_schema_type_mapper_);
1180 }
1181 
1182 libtextclassifier3::StatusOr<
1183     const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*>
GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const1184 SchemaStore::GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const {
1185   ICING_RETURN_IF_ERROR(CheckSchemaSet());
1186   if (!feature_flags_->enable_scorable_properties()) {
1187     return nullptr;
1188   }
1189   return scorable_property_manager_->GetOrderedScorablePropertyInfo(
1190       schema_type_id, type_config_map_, reverse_schema_type_mapper_);
1191 }
1192 
PersistToDisk()1193 libtextclassifier3::Status SchemaStore::PersistToDisk() {
1194   if (!has_schema_successfully_set_) {
1195     return libtextclassifier3::Status::OK;
1196   }
1197   ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
1198   ICING_RETURN_IF_ERROR(UpdateChecksum());
1199   ICING_RETURN_IF_ERROR(header_->PersistToDisk());
1200   return libtextclassifier3::Status::OK;
1201 }
1202 
GetStorageInfo() const1203 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
1204   SchemaStoreStorageInfoProto storage_info;
1205   int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1206   storage_info.set_schema_store_size(
1207       Filesystem::SanitizeFileSize(directory_size));
1208   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
1209   storage_info.set_num_schema_types(schema->types().size());
1210   int total_sections = 0;
1211   int num_types_sections_exhausted = 0;
1212   for (const SchemaTypeConfigProto& type : schema->types()) {
1213     auto sections_list_or =
1214         schema_type_manager_->section_manager().GetMetadataList(
1215             type.schema_type());
1216     if (!sections_list_or.ok()) {
1217       continue;
1218     }
1219     total_sections += sections_list_or.ValueOrDie()->size();
1220     if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
1221       ++num_types_sections_exhausted;
1222     }
1223   }
1224 
1225   storage_info.set_num_total_sections(total_sections);
1226   storage_info.set_num_schema_types_sections_exhausted(
1227       num_types_sections_exhausted);
1228   return storage_info;
1229 }
1230 
1231 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetSectionMetadata(const std::string & schema_type) const1232 SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
1233   return schema_type_manager_->section_manager().GetMetadataList(schema_type);
1234 }
1235 
IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,const std::string & property_path) const1236 bool SchemaStore::IsPropertyDefinedInSchema(
1237     SchemaTypeId schema_type_id, const std::string& property_path) const {
1238   auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
1239   if (schema_name_itr == reverse_schema_type_mapper_.end()) {
1240     return false;
1241   }
1242   const std::string* current_type_name = &schema_name_itr->second;
1243 
1244   std::vector<std::string_view> property_path_parts =
1245       property_util::SplitPropertyPathExpr(property_path);
1246   for (int i = 0; i < property_path_parts.size(); ++i) {
1247     auto type_config_itr = type_config_map_.find(*current_type_name);
1248     if (type_config_itr == type_config_map_.end()) {
1249       return false;
1250     }
1251     std::string_view property_name = property_path_parts.at(i);
1252     const PropertyConfigProto* selected_property = nullptr;
1253     for (const PropertyConfigProto& property :
1254          type_config_itr->second.properties()) {
1255       if (property.property_name() == property_name) {
1256         selected_property = &property;
1257         break;
1258       }
1259     }
1260     if (selected_property == nullptr) {
1261       return false;
1262     }
1263     if (i == property_path_parts.size() - 1) {
1264       // We've found a property at the final part of the path.
1265       return true;
1266     }
1267     if (selected_property->data_type() !=
1268         PropertyConfigProto::DataType::DOCUMENT) {
1269       // If this isn't final part of the path, but this property isn't a
1270       // document, so we know that this path doesn't exist.
1271       return false;
1272     }
1273     current_type_name = &selected_property->schema_type();
1274   }
1275 
1276   // We should never reach this point.
1277   return false;
1278 }
1279 
GetDebugInfo() const1280 libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
1281     const {
1282   SchemaDebugInfoProto debug_info;
1283   if (has_schema_successfully_set_) {
1284     ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
1285     *debug_info.mutable_schema() = *schema;
1286   }
1287   ICING_ASSIGN_OR_RETURN(Crc32 crc, GetChecksum());
1288   debug_info.set_crc(crc.Get());
1289   return debug_info;
1290 }
1291 
1292 std::vector<SchemaStore::ExpandedTypePropertyMask>
ExpandTypePropertyMasks(const google::protobuf::RepeatedPtrField<TypePropertyMask> & type_property_masks) const1293 SchemaStore::ExpandTypePropertyMasks(
1294     const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
1295     const {
1296   std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
1297   for (const TypePropertyMask& type_field_mask : type_property_masks) {
1298     if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
1299       ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
1300                                      /*paths=*/{}};
1301       entry.paths.insert(type_field_mask.paths().begin(),
1302                          type_field_mask.paths().end());
1303       result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
1304     } else {
1305       auto schema_type_ids_or =
1306           GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
1307       // If we can't find the SchemaTypeIds, just throw it away
1308       if (!schema_type_ids_or.ok()) {
1309         continue;
1310       }
1311       const std::unordered_set<SchemaTypeId>* schema_type_ids =
1312           schema_type_ids_or.ValueOrDie();
1313       for (SchemaTypeId schema_type_id : *schema_type_ids) {
1314         auto schema_type_name_iter =
1315             reverse_schema_type_mapper_.find(schema_type_id);
1316         if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
1317           // This should never happen, unless there is an inconsistency or IO
1318           // error.
1319           ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
1320           continue;
1321         }
1322 
1323         auto iter = result_map.find(schema_type_id);
1324         if (iter == result_map.end()) {
1325           ExpandedTypePropertyMask entry{schema_type_name_iter->second,
1326                                          /*paths=*/{}};
1327           iter = result_map.insert({schema_type_id, std::move(entry)}).first;
1328         }
1329         iter->second.paths.insert(type_field_mask.paths().begin(),
1330                                   type_field_mask.paths().end());
1331       }
1332     }
1333   }
1334   std::vector<ExpandedTypePropertyMask> result;
1335   result.reserve(result_map.size());
1336   for (auto& entry : result_map) {
1337     result.push_back(std::move(entry.second));
1338   }
1339   return result;
1340 }
1341 
1342 libtextclassifier3::StatusOr<
1343     std::unordered_map<std::string, std::vector<std::string>>>
ConstructBlobPropertyMap() const1344 SchemaStore::ConstructBlobPropertyMap() const {
1345   ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
1346   std::unordered_map<std::string, std::vector<std::string>> blob_property_map;
1347   for (const SchemaTypeConfigProto& type_config : schema->types()) {
1348     SchemaPropertyIterator iterator(type_config, type_config_map_);
1349     std::vector<std::string> blob_properties;
1350 
1351     libtextclassifier3::Status status = iterator.Advance();
1352     while (status.ok()) {
1353       if (iterator.GetCurrentPropertyConfig().data_type() ==
1354           PropertyConfigProto::DataType::BLOB_HANDLE) {
1355         blob_properties.push_back(iterator.GetCurrentPropertyPath());
1356       }
1357       status = iterator.Advance();
1358     }
1359     if (!absl_ports::IsOutOfRange(status)) {
1360       return status;
1361     }
1362     if (!blob_properties.empty()) {
1363       blob_property_map.insert(
1364           {type_config.schema_type(), std::move(blob_properties)});
1365     }
1366   }
1367   return blob_property_map;
1368 }
1369 
ValidateAndGetDatabase(const SchemaProto & new_schema) const1370 libtextclassifier3::StatusOr<std::string> SchemaStore::ValidateAndGetDatabase(
1371     const SchemaProto& new_schema) const {
1372   std::string database;
1373 
1374   if (!enable_schema_database_ || new_schema.types().empty()) {
1375     return database;
1376   }
1377 
1378   database = new_schema.types(0).database();
1379   // Loop through new_schema's types and validate it. The input SchemaProto
1380   // contains a list of SchemaTypeConfigProtos without deduplication. We need to
1381   // check that:
1382   // 1. All SchemaTypeConfigProtos have the same database value.
1383   // 2. The SchemaTypeConfigProtos's schema_type field is unique within both
1384   //    new_schema, as well as the existing schema (recorded in
1385   //    type_config_map_).
1386   for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
1387     // Check database consistency.
1388     if (database != type_config.database()) {
1389       return absl_ports::InvalidArgumentError(
1390           "SetSchema only accepts a SchemaProto with types from a single "
1391           "database at a time. Please make separate calls for each database if "
1392           "you need to set the schema for multiple databases.");
1393     }
1394 
1395     // Check type name uniqueness. This is only necessary if there is a
1396     // pre-existing schema.
1397     if (has_schema_successfully_set_) {
1398       auto iter = type_config_map_.find(type_config.schema_type());
1399       if (iter != type_config_map_.end() &&
1400           database != iter->second.database()) {
1401         return absl_ports::AlreadyExistsError(
1402             absl_ports::StrCat("schema_type name: '", type_config.schema_type(),
1403                                "' is already in use by a different database."));
1404       }
1405     }
1406   }
1407   return database;
1408 }
1409 
1410 libtextclassifier3::StatusOr<SchemaProto>
GetFullSchemaProtoWithUpdatedDb(SchemaProto input_database_schema) const1411 SchemaStore::GetFullSchemaProtoWithUpdatedDb(
1412     SchemaProto input_database_schema) const {
1413   if (!enable_schema_database_) {
1414     // If the schema database is not enabled, the input schema is already the
1415     // full schema, so we don't need to do any merges.
1416     return input_database_schema;
1417   }
1418 
1419   libtextclassifier3::StatusOr<const SchemaProto*> schema_proto = GetSchema();
1420   if (absl_ports::IsNotFound(schema_proto.status())) {
1421     // We don't have a pre-existing schema -- we can return the input database
1422     // schema as it's already the full schema.
1423     return input_database_schema;
1424   }
1425 
1426   if (!schema_proto.ok()) {
1427     // Real error.
1428     return schema_proto.status();
1429   }
1430 
1431   if (!has_schema_successfully_set_) {
1432     return absl_ports::InternalError(
1433         "Schema store was not initialized properly.");
1434   }
1435 
1436   // At this point, we have a pre-existing schema -- we need to merge the
1437   // updated database with the existing schema.
1438   if (input_database_schema.types().empty()) {
1439     return *schema_proto.ValueOrDie();
1440   }
1441 
1442   std::string input_database = input_database_schema.types(0).database();
1443   if (database_type_map_.size() == 1 &&
1444       database_type_map_.find(input_database) != database_type_map_.end()) {
1445     // No other databases in the schema -- we can return the input database
1446     // schema.
1447     return input_database_schema;
1448   }
1449 
1450   const SchemaProto* existing_schema = schema_proto.ValueOrDie();
1451   SchemaProto full_schema;
1452 
1453   // 1. Add types from the existing schema, replacing existing types with the
1454   // input types if the database is the one being updated by the input schema.
1455   // - For the input_database, we replace the existing types with the input
1456   //   types. An exisiting type is deleted if it's not included in
1457   //   input_database.
1458   // - If there are more input types than existing types for the input_database,
1459   //   the rest of the input types are appended to the end of the full_schema.
1460   // - If there are fewer input types than existing types for the
1461   //   input_database, we shift all existing that come after input_database
1462   //   forward.
1463   // - For existing types from other databases, we add the types in their
1464   //   original order to full_schema. Note that the type-ids of existing types
1465   //   might still change if some types deleted in input_database as this will
1466   //   cause all subsequent types ids to shift forward.
1467   int input_schema_index = 0, existing_schema_index = 0;
1468   while (input_schema_index < input_database_schema.types().size() &&
1469          existing_schema_index < existing_schema->types().size()) {
1470     const SchemaTypeConfigProto& existing_type_config =
1471         existing_schema->types(existing_schema_index);
1472     SchemaTypeConfigProto& input_type_config =
1473         *input_database_schema.mutable_types(input_schema_index);
1474 
1475     if (input_type_config.database() != input_database) {
1476       return absl_ports::InvalidArgumentError(
1477           "Can only update a single database at a time.");
1478     }
1479 
1480     if (existing_type_config.database() == input_database) {
1481       // If the database is the one being updated by the input schema, replace
1482       // the existing type with a type from the input schema.
1483       *full_schema.add_types() = std::move(input_type_config);
1484       ++input_schema_index;
1485     } else {
1486       *full_schema.add_types() = existing_type_config;
1487     }
1488     ++existing_schema_index;
1489   }
1490 
1491   // 2. Append remaining types to the end of the SchemaProto.
1492   for (; input_schema_index < input_database_schema.types().size();
1493        ++input_schema_index) {
1494     // Case 1: Append all remaining types from the input schema. This happens
1495     // when more types are added in input_database_schema than what's in the
1496     // existing schema. In this case, we've used up the space for the database
1497     // in the existing schema, so we can just append the rest of the types to
1498     // the end.
1499     SchemaTypeConfigProto& input_type_config =
1500         *input_database_schema.mutable_types(input_schema_index);
1501     *full_schema.add_types() = std::move(input_type_config);
1502   }
1503   for (; existing_schema_index < existing_schema->types().size();
1504        ++existing_schema_index) {
1505     // Case 2: Add remaining types from the existing schema, but skip the ones
1506     // that are from input_database, since existing types from input_database
1507     // are replaced with input_database_schema.
1508     if (existing_schema->types(existing_schema_index).database() !=
1509         input_database) {
1510       *full_schema.add_types() = existing_schema->types(existing_schema_index);
1511     }
1512   }
1513 
1514   return full_schema;
1515 }
1516 
1517 }  // namespace lib
1518 }  // namespace icing
1519