1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/schema/schema-store.h"
16
17 #include <cinttypes>
18 #include <cstddef>
19 #include <cstdint>
20 #include <limits>
21 #include <memory>
22 #include <optional>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <utility>
28 #include <vector>
29
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/str_cat.h"
34 #include "icing/feature-flags.h"
35 #include "icing/file/destructible-directory.h"
36 #include "icing/file/file-backed-proto.h"
37 #include "icing/file/filesystem.h"
38 #include "icing/file/version-util.h"
39 #include "icing/legacy/core/icing-string-util.h"
40 #include "icing/proto/debug.pb.h"
41 #include "icing/proto/document.pb.h"
42 #include "icing/proto/logging.pb.h"
43 #include "icing/proto/schema.pb.h"
44 #include "icing/proto/search.pb.h"
45 #include "icing/proto/storage.pb.h"
46 #include "icing/schema/backup-schema-producer.h"
47 #include "icing/schema/joinable-property.h"
48 #include "icing/schema/property-util.h"
49 #include "icing/schema/schema-property-iterator.h"
50 #include "icing/schema/schema-type-manager.h"
51 #include "icing/schema/schema-util.h"
52 #include "icing/schema/scorable_property_manager.h"
53 #include "icing/schema/section.h"
54 #include "icing/store/document-filter-data.h"
55 #include "icing/store/dynamic-trie-key-mapper.h"
56 #include "icing/util/clock.h"
57 #include "icing/util/crc32.h"
58 #include "icing/util/logging.h"
59 #include "icing/util/status-macros.h"
60
61 namespace icing {
62 namespace lib {
63
64 namespace {
65
// Names of the files that SchemaStore keeps inside its base directory. The
// Make*Filename() helpers below join them onto the base directory path.
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";

// Delimiter that separates the database prefix from the actual type name in a
// schema type name of the form <database><delimiter><actual_type_name>.
//
// This should be kept consistent with the delimiter used in AppSearch.
// See:
// https://cs.android.com/androidx/platform/frameworks/support/+/androidx-main:appsearch/appsearch-local-storage/src/main/java/androidx/appsearch/localstorage/util/PrefixUtil.java;l=42;drc=ffaf979c6f0cbd26caafd7a9d07a6bad12fe3a2a

constexpr char kAppSearchDatabaseDelimiter = '/';

// A DynamicTrieKeyMapper stores its data across 3 arrays internally. Giving
// each array 128KiB for storage means the entire DynamicTrieKeyMapper requires
// 384KiB.
constexpr int32_t kSchemaTypeMapperMaxSize = 3 * 128 * 1024;  // 384 KiB
81
MakeHeaderFilename(const std::string & base_dir)82 std::string MakeHeaderFilename(const std::string& base_dir) {
83 return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
84 }
85
MakeSchemaFilename(const std::string & base_dir)86 std::string MakeSchemaFilename(const std::string& base_dir) {
87 return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
88 }
89
MakeOverlaySchemaFilename(const std::string & base_dir)90 std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
91 return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
92 }
93
MakeSchemaTypeMapperFilename(const std::string & base_dir)94 std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
95 return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
96 }
97
98 // Assuming that SchemaTypeIds are assigned to schema types based on their order
99 // in the SchemaProto. Check if the schema type->SchemaTypeId mapping would
100 // change with the new schema.
SchemaTypeIdsChanged(const SchemaProto & old_schema,const SchemaProto & new_schema)101 std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
102 const SchemaProto& old_schema, const SchemaProto& new_schema) {
103 std::unordered_set<SchemaTypeId> old_schema_type_ids_changed;
104
105 std::unordered_map<std::string, int> old_types_and_index;
106 for (int i = 0; i < old_schema.types().size(); ++i) {
107 old_types_and_index.emplace(old_schema.types(i).schema_type(), i);
108 }
109
110 std::unordered_map<std::string, int> new_types_and_index;
111 for (int i = 0; i < new_schema.types().size(); ++i) {
112 new_types_and_index.emplace(new_schema.types(i).schema_type(), i);
113 }
114
115 for (const auto& old_type_index : old_types_and_index) {
116 const auto& iter = new_types_and_index.find(old_type_index.first);
117 // We only care if the type exists in both the old and new schema. If the
118 // type has been deleted, then it'll be captured in
119 // SetSchemaResult.schema_types_deleted*. If the type has been added in the
120 // new schema then we also don't care because nothing needs to be updated.
121 if (iter != new_types_and_index.end()) {
122 // Since the SchemaTypeId of the schema type is just the index of it in
123 // the SchemaProto, compare the index and save it if it's not the same
124 if (old_type_index.second != iter->second) {
125 old_schema_type_ids_changed.emplace(old_type_index.second);
126 }
127 }
128 }
129
130 return old_schema_type_ids_changed;
131 }
132
// Extracts the database prefix from a schema type name.
//
// The schema type name is expected to look like
// <database><delimiter><actual_type_name>; everything before the first
// occurrence of database_delimeter is treated as the database.
//
// Returns an empty string if the delimiter does not occur in schema_type (or
// if schema_type starts with the delimiter, in which case the prefix itself is
// empty).
std::string GetDatabaseFromSchemaType(const std::string& schema_type,
                                      char database_delimeter) {
  const size_t delimiter_pos = schema_type.find(database_delimeter);
  if (delimiter_pos == std::string::npos) {
    return std::string();
  }
  return schema_type.substr(0, delimiter_pos);
}
149
150 // For each schema type in the schema proto, parses out the database from the
151 // type name, and sets it as the database field in the input proto in
152 // place. The schema_type name field itself is not modified.
153 //
154 // If the schema type name does not contain an AppSearch database, then
155 // SchemaTypeConfigProto is not modified.
156 //
157 // Returns:
158 // - True if any SchemaTypeConfigProto in the schema proto is rewritten.
159 // - False otherwise.
ParseAndPopulateAppSearchDatabaseField(SchemaProto & schema_proto)160 bool ParseAndPopulateAppSearchDatabaseField(SchemaProto& schema_proto) {
161 bool populated_database_field = false;
162 for (auto& type : *schema_proto.mutable_types()) {
163 std::string database = GetDatabaseFromSchemaType(
164 type.schema_type(), kAppSearchDatabaseDelimiter);
165 if (type.database() != database) {
166 type.set_database(std::move(database));
167 populated_database_field = true;
168 }
169 }
170 return populated_database_field;
171 }
172
173 } // namespace
174
175 /* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
Read(const Filesystem * filesystem,std::string path)176 SchemaStore::Header::Read(const Filesystem* filesystem, std::string path) {
177 if (!filesystem->FileExists(path.c_str())) {
178 return absl_ports::NotFoundError(
179 absl_ports::StrCat("Header file is empty: ", path));
180 }
181
182 SerializedHeader serialized_header;
183 ScopedFd sfd(filesystem->OpenForWrite(path.c_str()));
184 if (!sfd.is_valid()) {
185 return absl_ports::InternalError("Unable to open or create header file.");
186 }
187
188 // If file is sizeof(LegacyHeader), then it must be LegacyHeader.
189 int64_t file_size = filesystem->GetFileSize(sfd.get());
190 if (file_size == sizeof(LegacyHeader)) {
191 LegacyHeader legacy_header;
192 if (!filesystem->Read(sfd.get(), &legacy_header, sizeof(legacy_header))) {
193 return absl_ports::InternalError(
194 absl_ports::StrCat("Couldn't read: ", path));
195 }
196 if (legacy_header.magic != Header::kMagic) {
197 return absl_ports::InternalError(
198 absl_ports::StrCat("Invalid header kMagic for file: ", path));
199 }
200 serialized_header.checksum = legacy_header.checksum;
201 } else if (file_size == sizeof(SerializedHeader)) {
202 if (!filesystem->Read(sfd.get(), &serialized_header,
203 sizeof(serialized_header))) {
204 return absl_ports::InternalError(
205 absl_ports::StrCat("Couldn't read: ", path));
206 }
207 if (serialized_header.magic != Header::kMagic) {
208 return absl_ports::InternalError(
209 absl_ports::StrCat("Invalid header kMagic for file: ", path));
210 }
211 } else if (file_size != 0) {
212 // file is neither the legacy header, the new header nor empty. Something is
213 // wrong here.
214 int legacy_header_size = sizeof(LegacyHeader);
215 int header_size = sizeof(SerializedHeader);
216 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
217 "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
218 legacy_header_size, header_size));
219 }
220 return Header(serialized_header, std::move(path), std::move(sfd), filesystem);
221 }
222
Write()223 libtextclassifier3::Status SchemaStore::Header::Write() {
224 if (!dirty_) {
225 return libtextclassifier3::Status::OK;
226 }
227 if (!header_fd_.is_valid() && !filesystem_->FileExists(path_.c_str())) {
228 header_fd_.reset(filesystem_->OpenForWrite(path_.c_str()));
229 }
230 // This should overwrite the header.
231 if (!header_fd_.is_valid() ||
232 !filesystem_->PWrite(header_fd_.get(), /*offset=*/0, &serialized_header_,
233 sizeof(serialized_header_))) {
234 return absl_ports::InternalError(
235 absl_ports::StrCat("Failed to write SchemaStore header"));
236 }
237 dirty_ = false;
238 return libtextclassifier3::Status::OK;
239 }
240
PersistToDisk()241 libtextclassifier3::Status SchemaStore::Header::PersistToDisk() {
242 if (dirty_) {
243 ICING_RETURN_IF_ERROR(Write());
244 }
245 // This should overwrite the header.
246 if (!header_fd_.is_valid() || !filesystem_->DataSync(header_fd_.get())) {
247 return absl_ports::InternalError(
248 absl_ports::StrCat("Failed to sync SchemaStore header."));
249 }
250 return libtextclassifier3::Status::OK;
251 }
252
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const FeatureFlags * feature_flags,bool enable_schema_database,InitializeStatsProto * initialize_stats)253 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
254 const Filesystem* filesystem, const std::string& base_dir,
255 const Clock* clock, const FeatureFlags* feature_flags,
256 bool enable_schema_database, InitializeStatsProto* initialize_stats) {
257 ICING_RETURN_ERROR_IF_NULL(filesystem);
258 ICING_RETURN_ERROR_IF_NULL(clock);
259 ICING_RETURN_ERROR_IF_NULL(feature_flags);
260
261 if (!filesystem->DirectoryExists(base_dir.c_str())) {
262 return absl_ports::FailedPreconditionError(
263 "Schema store base directory does not exist!");
264 }
265 std::unique_ptr<SchemaStore> schema_store =
266 std::unique_ptr<SchemaStore>(new SchemaStore(
267 filesystem, base_dir, clock, feature_flags, enable_schema_database));
268 ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
269 return schema_store;
270 }
271
Create(const Filesystem * filesystem,const std::string & base_dir,const Clock * clock,const FeatureFlags * feature_flags,SchemaProto schema,bool enable_schema_database)272 libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
273 const Filesystem* filesystem, const std::string& base_dir,
274 const Clock* clock, const FeatureFlags* feature_flags, SchemaProto schema,
275 bool enable_schema_database) {
276 ICING_RETURN_ERROR_IF_NULL(filesystem);
277 ICING_RETURN_ERROR_IF_NULL(clock);
278 ICING_RETURN_ERROR_IF_NULL(feature_flags);
279
280 if (!filesystem->DirectoryExists(base_dir.c_str())) {
281 return absl_ports::FailedPreconditionError(
282 "Schema store base directory does not exist!");
283 }
284 std::unique_ptr<SchemaStore> schema_store =
285 std::unique_ptr<SchemaStore>(new SchemaStore(
286 filesystem, base_dir, clock, feature_flags, enable_schema_database));
287 ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
288 return schema_store;
289 }
290
291 /* static */ libtextclassifier3::Status
PopulateSchemaDatabaseFieldForSchemaFile(const Filesystem * filesystem,const std::string & schema_filename)292 SchemaStore::PopulateSchemaDatabaseFieldForSchemaFile(
293 const Filesystem* filesystem, const std::string& schema_filename) {
294 FileBackedProto<SchemaProto> schema_file(*filesystem, schema_filename);
295 auto schema_proto_or = schema_file.Read();
296 if (absl_ports::IsNotFound(schema_proto_or.status())) {
297 // Don't have an existing schema proto, that's fine
298 return libtextclassifier3::Status::OK;
299 } else if (!schema_proto_or.ok()) {
300 // Real error when trying to read the existing schema
301 return schema_proto_or.status();
302 }
303
304 SchemaProto schema_proto_copy = *schema_proto_or.ValueOrDie();
305 bool schema_changed =
306 ParseAndPopulateAppSearchDatabaseField(schema_proto_copy);
307 if (!schema_changed) {
308 // Nothing to do if the schema is not changed.
309 return libtextclassifier3::Status::OK;
310 }
311
312 // Create a temporary schema file and schema proto copy to update the
313 // schema.
314 std::string temp_schema_filename = schema_filename + ".tmp";
315 if (!filesystem->DeleteFile(temp_schema_filename.c_str())) {
316 return absl_ports::InternalError(
317 "Unable to delete temp schema file to prepare for schema database "
318 "migration.");
319 }
320
321 {
322 FileBackedProto<SchemaProto> temp_schema_file(*filesystem,
323 temp_schema_filename);
324 ICING_RETURN_IF_ERROR(temp_schema_file.Write(
325 std::make_unique<SchemaProto>(schema_proto_copy)));
326 }
327
328 // Swap the temp schema file with the original schema file.
329 if (!filesystem->SwapFiles(temp_schema_filename.c_str(),
330 schema_filename.c_str())) {
331 return absl_ports::InternalError(
332 "Unable to apply migrated schema with database due to failed swap!");
333 }
334 // Clean up the temp schema file.
335 if (!filesystem->DeleteFile(temp_schema_filename.c_str())) {
336 return absl_ports::InternalError(
337 "Unable to delete temp schema file after schema database migration.");
338 }
339
340 return libtextclassifier3::Status::OK;
341 }
342
DiscardOverlaySchema(const Filesystem * filesystem,const std::string & base_dir,Header & header)343 /* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
344 const Filesystem* filesystem, const std::string& base_dir, Header& header) {
345 std::string header_filename = MakeHeaderFilename(base_dir);
346 if (header.overlay_created()) {
347 header.SetOverlayInfo(
348 /*overlay_created=*/false,
349 /*min_overlay_version_compatibility=*/std::numeric_limits<
350 int32_t>::max());
351 ICING_RETURN_IF_ERROR(header.Write());
352 }
353 std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
354 if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
355 return absl_ports::InternalError(
356 "Unable to delete stale schema overlay file.");
357 }
358 return libtextclassifier3::Status::OK;
359 }
360
MigrateSchema(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version,bool perform_schema_database_migration)361 /* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
362 const Filesystem* filesystem, const std::string& base_dir,
363 version_util::StateChange version_state_change, int32_t new_version,
364 bool perform_schema_database_migration) {
365 if (!filesystem->DirectoryExists(base_dir.c_str())) {
366 // Situations when schema store directory doesn't exist:
367 // - Initializing new Icing instance: don't have to do anything now. The
368 // directory will be created later.
369 // - Lose schema store: there is nothing we can do now. The logic will be
370 // handled later by initializing.
371 //
372 // Therefore, just simply return OK here.
373 return libtextclassifier3::Status::OK;
374 }
375
376 ICING_RETURN_IF_ERROR(HandleOverlaySchemaForVersionChange(
377 filesystem, base_dir, version_state_change, new_version));
378
379 // Perform schema database migration if needed.
380 // - This populates the the database field in the schema proto and writes it
381 // to the schema file.
382 // - If the overlay schema file exists at this point, does the same for the
383 // overlay schema.
384 if (perform_schema_database_migration) {
385 std::string base_schema_filename = MakeSchemaFilename(base_dir);
386 ICING_RETURN_IF_ERROR(PopulateSchemaDatabaseFieldForSchemaFile(
387 filesystem, base_schema_filename));
388
389 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
390 if (filesystem->FileExists(overlay_schema_filename.c_str())) {
391 ICING_RETURN_IF_ERROR(PopulateSchemaDatabaseFieldForSchemaFile(
392 filesystem, overlay_schema_filename));
393 }
394 }
395
396 return libtextclassifier3::Status::OK;
397 }
398
399 /* static */ libtextclassifier3::Status
HandleOverlaySchemaForVersionChange(const Filesystem * filesystem,const std::string & base_dir,version_util::StateChange version_state_change,int32_t new_version)400 SchemaStore::HandleOverlaySchemaForVersionChange(
401 const Filesystem* filesystem, const std::string& base_dir,
402 version_util::StateChange version_state_change, int32_t new_version) {
403 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
404 if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
405 // The overlay doesn't exist. So there should be nothing particularly
406 // interesting to worry about.
407 return libtextclassifier3::Status::OK;
408 }
409
410 std::string header_filename = MakeHeaderFilename(base_dir);
411 libtextclassifier3::StatusOr<Header> header_or;
412 switch (version_state_change) {
413 // No necessary actions for normal upgrades or no version change. The data
414 // that was produced by the previous version is fully compatible with this
415 // version and there's no stale data for us to clean up.
416 // The same is true for a normal rollforward. A normal rollforward implies
417 // that the previous version was one that understood the concept of the
418 // overlay schema and would have already discarded it if it was unusable.
419 case version_util::StateChange::kVersionZeroUpgrade:
420 // fallthrough
421 case version_util::StateChange::kUpgrade:
422 // fallthrough
423 case version_util::StateChange::kRollForward:
424 // fallthrough
425 case version_util::StateChange::kCompatible:
426 return libtextclassifier3::Status::OK;
427 case version_util::StateChange::kVersionZeroRollForward: {
428 // We've rolled forward. The schema overlay file, if it exists, is
429 // possibly stale. We must throw it out.
430 header_or = Header::Read(filesystem, header_filename);
431 ICING_RETURN_IF_ERROR(header_or.status());
432 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
433 header_or.ValueOrDie());
434 }
435 case version_util::StateChange::kRollBack: {
436 header_or = Header::Read(filesystem, header_filename);
437 ICING_RETURN_IF_ERROR(header_or.status());
438 if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
439 new_version) {
440 // We've been rolled back, but the overlay schema claims that it
441 // supports this version. So we can safely return.
442 return libtextclassifier3::Status::OK;
443 }
444 // We've been rolled back to a version that the overlay schema doesn't
445 // support. We must throw it out.
446 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
447 header_or.ValueOrDie());
448 }
449 case version_util::StateChange::kUndetermined:
450 // It's not clear what version we're on, but the base schema should always
451 // be safe to use. Throw out the overlay.
452 header_or = Header::Read(filesystem, header_filename);
453 ICING_RETURN_IF_ERROR(header_or.status());
454 return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
455 header_or.ValueOrDie());
456 }
457 return libtextclassifier3::Status::OK;
458 }
459
DiscardDerivedFiles(const Filesystem * filesystem,const std::string & base_dir)460 /* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
461 const Filesystem* filesystem, const std::string& base_dir) {
462 // Schema type mapper
463 return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
464 *filesystem, MakeSchemaTypeMapperFilename(base_dir));
465 }
466
SchemaStore(const Filesystem * filesystem,std::string base_dir,const Clock * clock,const FeatureFlags * feature_flags,bool enable_schema_database)467 SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
468 const Clock* clock, const FeatureFlags* feature_flags,
469 bool enable_schema_database)
470 : filesystem_(filesystem),
471 base_dir_(std::move(base_dir)),
472 clock_(clock),
473 feature_flags_(feature_flags),
474 schema_file_(std::make_unique<FileBackedProto<SchemaProto>>(
475 *filesystem, MakeSchemaFilename(base_dir_))),
476 enable_schema_database_(enable_schema_database) {}
477
~SchemaStore()478 SchemaStore::~SchemaStore() {
479 if (has_schema_successfully_set_ && schema_file_ != nullptr &&
480 schema_type_mapper_ != nullptr && schema_type_manager_ != nullptr) {
481 if (!PersistToDisk().ok()) {
482 ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
483 }
484 }
485 }
486
Initialize(SchemaProto new_schema)487 libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
488 ICING_RETURN_IF_ERROR(LoadSchema());
489 if (!absl_ports::IsNotFound(GetSchema().status())) {
490 return absl_ports::FailedPreconditionError(
491 "Incorrectly tried to initialize schema store with a new schema, when "
492 "one is already set!");
493 }
494 ICING_RETURN_IF_ERROR(schema_file_->Write(
495 std::make_unique<SchemaProto>(std::move(new_schema))));
496 return InitializeInternal(/*create_overlay_if_necessary=*/true,
497 /*initialize_stats=*/nullptr);
498 }
499
Initialize(InitializeStatsProto * initialize_stats)500 libtextclassifier3::Status SchemaStore::Initialize(
501 InitializeStatsProto* initialize_stats) {
502 ICING_RETURN_IF_ERROR(LoadSchema());
503 auto schema_proto_or = GetSchema();
504 if (absl_ports::IsNotFound(schema_proto_or.status())) {
505 // Don't have an existing schema proto, that's fine
506 return libtextclassifier3::Status::OK;
507 } else if (!schema_proto_or.ok()) {
508 // Real error when trying to read the existing schema
509 return schema_proto_or.status();
510 }
511 return InitializeInternal(/*create_overlay_if_necessary=*/false,
512 initialize_stats);
513 }
514
LoadSchema()515 libtextclassifier3::Status SchemaStore::LoadSchema() {
516 libtextclassifier3::StatusOr<Header> header_or =
517 Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
518 bool header_exists = false;
519 if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
520 return header_or.status();
521 } else if (!header_or.ok()) {
522 header_ =
523 std::make_unique<Header>(filesystem_, MakeHeaderFilename(base_dir_));
524 } else {
525 header_exists = true;
526 header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
527 }
528
529 std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
530 bool overlay_schema_file_exists =
531 filesystem_->FileExists(overlay_schema_filename.c_str());
532
533 libtextclassifier3::Status base_schema_state = schema_file_->Read().status();
534 if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
535 return base_schema_state;
536 }
537
538 // There are three valid cases:
539 // 1. Everything is missing. This is an empty schema store.
540 if (!base_schema_state.ok() && !overlay_schema_file_exists &&
541 !header_exists) {
542 return libtextclassifier3::Status::OK;
543 }
544
545 // 2. There never was a overlay schema. The header exists, the base schema
546 // exists and the header says the overlay schema shouldn't exist
547 if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
548 !header_->overlay_created()) {
549 // Nothing else to do. Just return safely.
550 return libtextclassifier3::Status::OK;
551 }
552
553 // 3. There is an overlay schema and a base schema and a header. The header
554 // says that the overlay schema should exist.
555 if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
556 header_->overlay_created()) {
557 overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
558 *filesystem_, MakeOverlaySchemaFilename(base_dir_));
559 return libtextclassifier3::Status::OK;
560 }
561
562 // Something has gone wrong. We've lost part of the schema ground truth.
563 // Return an error.
564 bool overlay_created = header_->overlay_created();
565 bool base_schema_exists = base_schema_state.ok();
566 return absl_ports::InternalError(IcingStringUtil::StringPrintf(
567 "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
568 "base schema exists: %d, overlay_schema_exists: %d",
569 header_exists, overlay_created, base_schema_exists,
570 overlay_schema_file_exists));
571 }
572
InitializeInternal(bool create_overlay_if_necessary,InitializeStatsProto * initialize_stats)573 libtextclassifier3::Status SchemaStore::InitializeInternal(
574 bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
575 if (!InitializeDerivedFiles().ok()) {
576 ICING_VLOG(3)
577 << "Couldn't find derived files or failed to initialize them, "
578 "regenerating derived files for SchemaStore.";
579 std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
580 if (initialize_stats != nullptr) {
581 initialize_stats->set_schema_store_recovery_cause(
582 InitializeStatsProto::IO_ERROR);
583 }
584 ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
585 if (initialize_stats != nullptr) {
586 initialize_stats->set_schema_store_recovery_latency_ms(
587 regenerate_timer->GetElapsedMilliseconds());
588 }
589 }
590
591 if (initialize_stats != nullptr) {
592 initialize_stats->set_num_schema_types(type_config_map_.size());
593 }
594 has_schema_successfully_set_ = true;
595
596 return libtextclassifier3::Status::OK;
597 }
598
InitializeDerivedFiles()599 libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
600 ICING_ASSIGN_OR_RETURN(
601 schema_type_mapper_,
602 DynamicTrieKeyMapper<SchemaTypeId>::Create(
603 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
604 kSchemaTypeMapperMaxSize));
605
606 Crc32 expected_checksum(header_->checksum());
607 ICING_ASSIGN_OR_RETURN(Crc32 checksum, GetChecksum());
608 if (checksum != expected_checksum) {
609 return absl_ports::InternalError(
610 "Combined checksum of SchemaStore was inconsistent");
611 }
612
613 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
614 return libtextclassifier3::Status::OK;
615 }
616
RegenerateDerivedFiles(bool create_overlay_if_necessary)617 libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
618 bool create_overlay_if_necessary) {
619 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
620
621 ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
622
623 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
624 // Assign a SchemaTypeId to the type
625 ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
626 type_config.schema_type(), schema_type_mapper_->num_keys()));
627 }
628 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
629
630 if (create_overlay_if_necessary) {
631 ICING_ASSIGN_OR_RETURN(
632 BackupSchemaProducer producer,
633 BackupSchemaProducer::Create(*schema_proto,
634 schema_type_manager_->section_manager()));
635
636 if (producer.is_backup_necessary()) {
637 SchemaProto base_schema = std::move(producer).Produce();
638
639 // The overlay schema should be written to the overlay file location.
640 overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
641 *filesystem_, MakeOverlaySchemaFilename(base_dir_));
642 auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
643 ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
644
645 // The base schema should be written to the original file
646 auto base_schema_ptr =
647 std::make_unique<SchemaProto>(std::move(base_schema));
648 ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr)));
649
650 // LINT.IfChange(min_overlay_version_compatibility)
651 // Although the current version is 5, the schema is compatible with
652 // version 1, so min_overlay_version_compatibility should be 1.
653 int32_t min_overlay_version_compatibility = version_util::kVersionOne;
654 // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion)
655 header_->SetOverlayInfo(
656 /*overlay_created=*/true, min_overlay_version_compatibility);
657 // Rebuild in memory data - references to the old schema will be invalid
658 // now.
659 ICING_RETURN_IF_ERROR(BuildInMemoryCache());
660 }
661 }
662
663 // Write the header
664 ICING_RETURN_IF_ERROR(UpdateChecksum());
665 return libtextclassifier3::Status::OK;
666 }
667
BuildInMemoryCache()668 libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
669 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
670 ICING_ASSIGN_OR_RETURN(
671 SchemaUtil::InheritanceMap inheritance_map,
672 SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
673
674 reverse_schema_type_mapper_.clear();
675 database_type_map_.clear();
676 type_config_map_.clear();
677 schema_subtype_id_map_.clear();
678 for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
679 const std::string& database = type_config.database();
680 const std::string& type_name = type_config.schema_type();
681 ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
682 schema_type_mapper_->Get(type_name));
683
684 // Build reverse_schema_type_mapper_
685 reverse_schema_type_mapper_.insert({type_id, type_name});
686
687 // Build database_type_map_
688 database_type_map_[database].push_back(type_name);
689
690 // Build type_config_map_
691 type_config_map_.insert({type_name, type_config});
692
693 // Build schema_subtype_id_map_
694 std::unordered_set<SchemaTypeId>& subtype_id_set =
695 schema_subtype_id_map_[type_id];
696 // Find all child types
697 auto child_types_names = inheritance_map.find(type_name);
698 if (child_types_names != inheritance_map.end()) {
699 subtype_id_set.reserve(child_types_names->second.size() + 1);
700 for (const auto& [child_type_name, is_direct_child] :
701 child_types_names->second) {
702 ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
703 schema_type_mapper_->Get(child_type_name));
704 subtype_id_set.insert(child_type_id);
705 }
706 }
707 // Every type is a subtype of itself.
708 subtype_id_set.insert(type_id);
709 }
710
711 // Build schema_type_manager_
712 ICING_ASSIGN_OR_RETURN(
713 schema_type_manager_,
714 SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
715
716 scorable_property_manager_ = std::make_unique<ScorablePropertyManager>();
717
718 return libtextclassifier3::Status::OK;
719 }
720
ResetSchemaTypeMapper()721 libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
722 // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
723 schema_type_mapper_.reset();
724 // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
725 // that can support error logging.
726 libtextclassifier3::Status status =
727 DynamicTrieKeyMapper<SchemaTypeId>::Delete(
728 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
729 if (!status.ok()) {
730 ICING_LOG(ERROR) << status.error_message()
731 << "Failed to delete old schema_type mapper";
732 return status;
733 }
734 ICING_ASSIGN_OR_RETURN(
735 schema_type_mapper_,
736 DynamicTrieKeyMapper<SchemaTypeId>::Create(
737 *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
738 kSchemaTypeMapperMaxSize));
739
740 return libtextclassifier3::Status::OK;
741 }
742
GetChecksum() const743 libtextclassifier3::StatusOr<Crc32> SchemaStore::GetChecksum() const {
744 ICING_ASSIGN_OR_RETURN(Crc32 schema_checksum, schema_file_->GetChecksum());
745 // We've gotten the schema_checksum successfully. This means that
746 // schema_file_->Read() will only return either a schema or NOT_FOUND.
747 // Sadly, we actually need to differentiate between an existing, but empty
748 // schema and a non-existent schema (both of which will have a checksum of 0).
749 // For existing, but empty schemas, we need to continue with the checksum
750 // calculation of the other components.
751 if (schema_checksum == Crc32() &&
752 absl_ports::IsNotFound(schema_file_->Read().status())) {
753 return schema_checksum;
754 }
755
756 Crc32 total_checksum;
757 total_checksum.Append(std::to_string(schema_checksum.Get()));
758 if (overlay_schema_file_ != nullptr) {
759 ICING_ASSIGN_OR_RETURN(Crc32 overlay_schema_checksum,
760 overlay_schema_file_->GetChecksum());
761 total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
762 }
763
764 ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
765 schema_type_mapper_->GetChecksum());
766 total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
767 return total_checksum;
768 }
769
UpdateChecksum()770 libtextclassifier3::StatusOr<Crc32> SchemaStore::UpdateChecksum() {
771 // FileBackedProto always keeps its checksum up to date. So we just need to
772 // retrieve the checksum.
773 ICING_ASSIGN_OR_RETURN(Crc32 schema_checksum, schema_file_->GetChecksum());
774 // We've gotten the schema_checksum successfully. This means that
775 // schema_file_->Read() will only return either a schema or NOT_FOUND.
776 // Sadly, we actually need to differentiate between an existing, but empty
777 // schema and a non-existent schema (both of which will have a checksum of 0).
778 // For existing, but empty schemas, we need to continue with the checksum
779 // calculation of the other components so that we will correctly write the
780 // header.
781 if (schema_checksum == Crc32() &&
782 absl_ports::IsNotFound(schema_file_->Read().status())) {
783 return schema_checksum;
784 }
785 Crc32 total_checksum;
786 total_checksum.Append(std::to_string(schema_checksum.Get()));
787
788 if (overlay_schema_file_ != nullptr) {
789 ICING_ASSIGN_OR_RETURN(Crc32 overlay_schema_checksum,
790 overlay_schema_file_->GetChecksum());
791 total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
792 }
793
794 ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
795 schema_type_mapper_->UpdateChecksum());
796 total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
797
798 header_->set_checksum(total_checksum.Get());
799 ICING_RETURN_IF_ERROR(header_->Write());
800 return total_checksum;
801 }
802
GetSchema() const803 libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
804 const {
805 if (overlay_schema_file_ != nullptr) {
806 return overlay_schema_file_->Read();
807 }
808 return schema_file_->Read();
809 }
810
GetSchema(const std::string & database) const811 libtextclassifier3::StatusOr<SchemaProto> SchemaStore::GetSchema(
812 const std::string& database) const {
813 if (!has_schema_successfully_set_) {
814 return absl_ports::NotFoundError("No schema found.");
815 }
816
817 const auto database_type_map_itr_ = database_type_map_.find(database);
818 if (database_type_map_itr_ == database_type_map_.end()) {
819 return absl_ports::NotFoundError(
820 absl_ports::StrCat("No schema found for database '", database, "'."));
821 }
822
823 SchemaProto schema_proto;
824 for (const std::string& type_name : database_type_map_itr_->second) {
825 ICING_ASSIGN_OR_RETURN(const SchemaTypeConfigProto* type_config,
826 GetSchemaTypeConfig(type_name));
827 *schema_proto.add_types() = *type_config;
828 }
829 return schema_proto;
830 }
831
832 // TODO(cassiewang): Consider removing this definition of SetSchema if it's not
833 // needed by production code. It's currently being used by our tests, but maybe
834 // it's trivial to change our test code to also use the
835 // SetSchema(SchemaProto&& new_schema)
836 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchema(const SchemaProto & new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)837 SchemaStore::SetSchema(const SchemaProto& new_schema,
838 bool ignore_errors_and_delete_documents,
839 bool allow_circular_schema_definitions) {
840 return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents,
841 allow_circular_schema_definitions);
842 }
843
844 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchema(SchemaProto && new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)845 SchemaStore::SetSchema(SchemaProto&& new_schema,
846 bool ignore_errors_and_delete_documents,
847 bool allow_circular_schema_definitions) {
848 if (enable_schema_database_) {
849 // Step 1: (Only required if schema database is enabled)
850 // Do some preliminary checks on the new schema before formal validation and
851 // delta computation. This checks that:
852 // - The new schema only contains types from a single database.
853 // - The new schema's type names are not already in use from other
854 // databases.
855 ICING_ASSIGN_OR_RETURN(std::string database,
856 ValidateAndGetDatabase(new_schema));
857
858 // Step 2: Schema validation and delta computation -- try to get the
859 // existing schema for the database to compare to the new schema.
860 libtextclassifier3::StatusOr<SchemaProto> schema_proto =
861 GetSchema(database);
862 if (absl_ports::IsNotFound(schema_proto.status())) {
863 // Case 1: No preexisting schema for this database.
864 return SetInitialSchemaForDatabase(std::move(new_schema),
865 ignore_errors_and_delete_documents,
866 allow_circular_schema_definitions);
867 }
868
869 if (!schema_proto.ok()) {
870 // Case 2: Real error
871 return schema_proto.status();
872 }
873
874 // Case 3: At this point, we're guaranteed that we have an existing schema
875 // for this database.
876 const SchemaProto& old_schema = schema_proto.ValueOrDie();
877 return SetSchemaWithDatabaseOverride(std::move(new_schema), old_schema,
878 ignore_errors_and_delete_documents,
879 allow_circular_schema_definitions);
880 }
881
882 // Get the full schema if schema database is disabled.
883 libtextclassifier3::StatusOr<const SchemaProto*> schema_proto = GetSchema();
884 if (absl_ports::IsNotFound(schema_proto.status())) {
885 // Case 1: No preexisting schema
886 return SetInitialSchemaForDatabase(std::move(new_schema),
887 ignore_errors_and_delete_documents,
888 allow_circular_schema_definitions);
889 }
890
891 if (!schema_proto.ok()) {
892 // Case 2: Real error
893 return schema_proto.status();
894 }
895
896 // Case 3: At this point, we're guaranteed that we have an existing schema
897 const SchemaProto& old_schema = *schema_proto.ValueOrDie();
898 return SetSchemaWithDatabaseOverride(std::move(new_schema), old_schema,
899 ignore_errors_and_delete_documents,
900 allow_circular_schema_definitions);
901 }
902
903 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetInitialSchemaForDatabase(SchemaProto new_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)904 SchemaStore::SetInitialSchemaForDatabase(
905 SchemaProto new_schema, bool ignore_errors_and_delete_documents,
906 bool allow_circular_schema_definitions) {
907 SetSchemaResult result;
908
909 ICING_RETURN_IF_ERROR(SchemaUtil::Validate(
910 new_schema, *feature_flags_, allow_circular_schema_definitions));
911
912 result.success = true;
913 for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
914 result.schema_types_new_by_name.insert(type_config.schema_type());
915 }
916 // Get the full new SchemaProto that is a combination of the existing schema
917 // and new_schema. This is needed as we can only write the full proto to the
918 // schema file.
919 ICING_ASSIGN_OR_RETURN(
920 SchemaProto full_new_schema,
921 GetFullSchemaProtoWithUpdatedDb(std::move(new_schema)));
922 ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(full_new_schema)));
923 has_schema_successfully_set_ = true;
924
925 return result;
926 }
927
928 libtextclassifier3::StatusOr<SchemaStore::SetSchemaResult>
SetSchemaWithDatabaseOverride(SchemaProto new_schema,const SchemaProto & old_schema,bool ignore_errors_and_delete_documents,bool allow_circular_schema_definitions)929 SchemaStore::SetSchemaWithDatabaseOverride(
930 SchemaProto new_schema, const SchemaProto& old_schema,
931 bool ignore_errors_and_delete_documents,
932 bool allow_circular_schema_definitions) {
933 // Assume we can set the schema unless proven otherwise.
934 SetSchemaResult result;
935 result.success = true;
936
937 if (new_schema.SerializeAsString() == old_schema.SerializeAsString()) {
938 // Same schema as before. No need to update anything
939 return result;
940 }
941
942 // Different schema -- we need to validate the schema and track the
943 // differences to see if we can still write it.
944 //
945 // Validate the new schema and compute the delta between the old and new
946 // schema.
947 ICING_ASSIGN_OR_RETURN(
948 SchemaUtil::DependentMap new_dependent_map,
949 SchemaUtil::Validate(new_schema, *feature_flags_,
950 allow_circular_schema_definitions));
951 SchemaUtil::SchemaDelta schema_delta = SchemaUtil::ComputeCompatibilityDelta(
952 old_schema, new_schema, new_dependent_map, *feature_flags_);
953
954 result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
955 result.schema_types_changed_fully_compatible_by_name =
956 std::move(schema_delta.schema_types_changed_fully_compatible);
957 result.schema_types_index_incompatible_by_name =
958 std::move(schema_delta.schema_types_index_incompatible);
959 result.schema_types_join_incompatible_by_name =
960 std::move(schema_delta.schema_types_join_incompatible);
961 result.schema_types_scorable_property_inconsistent_by_name =
962 std::move(schema_delta.schema_types_scorable_property_inconsistent);
963
964 for (const std::string& schema_type : schema_delta.schema_types_deleted) {
965 // We currently don't support deletions, so mark this as not possible.
966 // This will change once we allow force-set schemas.
967 result.success = false;
968
969 result.schema_types_deleted_by_name.emplace(schema_type);
970
971 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
972 GetSchemaTypeId(schema_type));
973 result.schema_types_deleted_by_id.emplace(schema_type_id);
974 }
975
976 for (const std::string& schema_type :
977 schema_delta.schema_types_incompatible) {
978 // We currently don't support incompatible schemas, so mark this as
979 // not possible. This will change once we allow force-set schemas.
980 result.success = false;
981
982 result.schema_types_incompatible_by_name.emplace(schema_type);
983
984 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
985 GetSchemaTypeId(schema_type));
986 result.schema_types_incompatible_by_id.emplace(schema_type_id);
987 }
988
989 // Get the full new SchemaProto that is a combination of the existing schema
990 // and new_schema. This is needed to calculate the updated SchemaTypeIds, and
991 // for writing the full proto to the schema file.
992 ICING_ASSIGN_OR_RETURN(
993 SchemaProto full_new_schema,
994 GetFullSchemaProtoWithUpdatedDb(std::move(new_schema)));
995
996 // We still need to update old_schema_type_ids_changed. We need to retrieve
997 // the entire old schema for this, as type ids are assigned for the entire
998 // schema, and not on a per-database level.
999 //
1000 // SchemaTypeIds changing is fine, we can update the DocumentStore.
1001 ICING_ASSIGN_OR_RETURN(const SchemaProto* full_old_schema, GetSchema());
1002 result.old_schema_type_ids_changed =
1003 SchemaTypeIdsChanged(*full_old_schema, full_new_schema);
1004
1005 // We can force set the schema if the caller has told us to ignore any errors
1006 result.success = result.success || ignore_errors_and_delete_documents;
1007
1008 // Step 3: Apply the schema change if success. This updates persisted files
1009 // and derived data structures.
1010 if (result.success) {
1011 ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(full_new_schema)));
1012 has_schema_successfully_set_ = true;
1013 }
1014
1015 // Convert schema types to SchemaTypeIds after the new schema is applied.
1016 if (feature_flags_->enable_scorable_properties()) {
1017 for (const std::string& schema_type :
1018 result.schema_types_scorable_property_inconsistent_by_name) {
1019 libtextclassifier3::StatusOr<SchemaTypeId> schema_type_id_or =
1020 GetSchemaTypeId(schema_type);
1021 if (!schema_type_id_or.ok()) {
1022 if (absl_ports::IsNotFound(schema_type_id_or.status())) {
1023 continue;
1024 }
1025 return schema_type_id_or.status();
1026 }
1027 result.schema_types_scorable_property_inconsistent_by_id.insert(
1028 schema_type_id_or.ValueOrDie());
1029 }
1030 }
1031
1032 return result;
1033 }
1034
ApplySchemaChange(SchemaProto new_schema)1035 libtextclassifier3::Status SchemaStore::ApplySchemaChange(
1036 SchemaProto new_schema) {
1037 // We need to ensure that we either 1) successfully set the schema and
1038 // update all derived data structures or 2) fail and leave the schema store
1039 // unchanged.
1040 // So, first, we create an empty temporary directory to build a new schema
1041 // store in.
1042 std::string temp_schema_store_dir_path = base_dir_ + "_temp";
1043 if (!filesystem_->DeleteDirectoryRecursively(
1044 temp_schema_store_dir_path.c_str())) {
1045 ICING_LOG(ERROR) << "Recursively deleting "
1046 << temp_schema_store_dir_path.c_str();
1047 return absl_ports::InternalError(
1048 "Unable to delete temp directory to prepare to build new schema "
1049 "store.");
1050 }
1051
1052 DestructibleDirectory temp_schema_store_dir(
1053 filesystem_, std::move(temp_schema_store_dir_path));
1054 if (!temp_schema_store_dir.is_valid()) {
1055 return absl_ports::InternalError(
1056 "Unable to create temp directory to build new schema store.");
1057 }
1058
1059 // Then we create our new schema store with the new schema.
1060 ICING_ASSIGN_OR_RETURN(
1061 std::unique_ptr<SchemaStore> new_schema_store,
1062 SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
1063 feature_flags_, std::move(new_schema),
1064 enable_schema_database_));
1065
1066 // Then we swap the new schema file + new derived files with the old files.
1067 if (!filesystem_->SwapFiles(base_dir_.c_str(),
1068 temp_schema_store_dir.dir().c_str())) {
1069 return absl_ports::InternalError(
1070 "Unable to apply new schema due to failed swap!");
1071 }
1072
1073 std::string old_base_dir = std::move(base_dir_);
1074 *this = std::move(*new_schema_store);
1075
1076 // After the std::move, the filepaths saved in this instance and in the
1077 // schema_file_ instance will still be the one from temp_schema_store_dir
1078 // even though they now point to files that are within old_base_dir.
1079 // Manually set them to the correct paths.
1080 base_dir_ = std::move(old_base_dir);
1081 schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_));
1082 if (overlay_schema_file_ != nullptr) {
1083 overlay_schema_file_->SetSwappedFilepath(
1084 MakeOverlaySchemaFilename(base_dir_));
1085 }
1086
1087 return libtextclassifier3::Status::OK;
1088 }
1089
1090 libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const1091 SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
1092 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1093 const auto& type_config_iter =
1094 type_config_map_.find(std::string(schema_type));
1095 if (type_config_iter == type_config_map_.end()) {
1096 return absl_ports::NotFoundError(
1097 absl_ports::StrCat("Schema type config '", schema_type, "' not found"));
1098 }
1099 return &type_config_iter->second;
1100 }
1101
GetSchemaTypeId(std::string_view schema_type) const1102 libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
1103 std::string_view schema_type) const {
1104 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1105 return schema_type_mapper_->Get(schema_type);
1106 }
1107
GetSchemaType(SchemaTypeId schema_type_id) const1108 libtextclassifier3::StatusOr<const std::string*> SchemaStore::GetSchemaType(
1109 SchemaTypeId schema_type_id) const {
1110 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1111 if (const auto it = reverse_schema_type_mapper_.find(schema_type_id);
1112 it == reverse_schema_type_mapper_.end()) {
1113 return absl_ports::InvalidArgumentError("Invalid schema type id");
1114 } else {
1115 return &it->second;
1116 }
1117 }
1118
1119 libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
GetSchemaTypeIdsWithChildren(std::string_view schema_type) const1120 SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
1121 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
1122 GetSchemaTypeId(schema_type));
1123 auto iter = schema_subtype_id_map_.find(schema_type_id);
1124 if (iter == schema_subtype_id_map_.end()) {
1125 // This should never happen, unless there is an inconsistency or IO error.
1126 return absl_ports::InternalError(absl_ports::StrCat(
1127 "Schema type '", schema_type, "' is not found in the subtype map."));
1128 }
1129 return &iter->second;
1130 }
1131
1132 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const1133 SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
1134 SectionId section_id) const {
1135 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1136 return schema_type_manager_->section_manager().GetSectionMetadata(
1137 schema_type_id, section_id);
1138 }
1139
ExtractSections(const DocumentProto & document) const1140 libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
1141 const DocumentProto& document) const {
1142 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1143 return schema_type_manager_->section_manager().ExtractSections(document);
1144 }
1145
1146 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,const std::string & property_path) const1147 SchemaStore::GetJoinablePropertyMetadata(
1148 SchemaTypeId schema_type_id, const std::string& property_path) const {
1149 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1150 return schema_type_manager_->joinable_property_manager()
1151 .GetJoinablePropertyMetadata(schema_type_id, property_path);
1152 }
1153
1154 libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,JoinablePropertyId joinable_property_id) const1155 SchemaStore::GetJoinablePropertyMetadata(
1156 SchemaTypeId schema_type_id,
1157 JoinablePropertyId joinable_property_id) const {
1158 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1159 return schema_type_manager_->joinable_property_manager()
1160 .GetJoinablePropertyMetadata(schema_type_id, joinable_property_id);
1161 }
1162
1163 libtextclassifier3::StatusOr<JoinablePropertyGroup>
ExtractJoinableProperties(const DocumentProto & document) const1164 SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
1165 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1166 return schema_type_manager_->joinable_property_manager()
1167 .ExtractJoinableProperties(document);
1168 }
1169
1170 libtextclassifier3::StatusOr<std::optional<int>>
GetScorablePropertyIndex(SchemaTypeId schema_type_id,std::string_view property_path) const1171 SchemaStore::GetScorablePropertyIndex(SchemaTypeId schema_type_id,
1172 std::string_view property_path) const {
1173 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1174 if (!feature_flags_->enable_scorable_properties()) {
1175 return std::nullopt;
1176 }
1177 return scorable_property_manager_->GetScorablePropertyIndex(
1178 schema_type_id, property_path, type_config_map_,
1179 reverse_schema_type_mapper_);
1180 }
1181
1182 libtextclassifier3::StatusOr<
1183 const std::vector<ScorablePropertyManager::ScorablePropertyInfo>*>
GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const1184 SchemaStore::GetOrderedScorablePropertyInfo(SchemaTypeId schema_type_id) const {
1185 ICING_RETURN_IF_ERROR(CheckSchemaSet());
1186 if (!feature_flags_->enable_scorable_properties()) {
1187 return nullptr;
1188 }
1189 return scorable_property_manager_->GetOrderedScorablePropertyInfo(
1190 schema_type_id, type_config_map_, reverse_schema_type_mapper_);
1191 }
1192
PersistToDisk()1193 libtextclassifier3::Status SchemaStore::PersistToDisk() {
1194 if (!has_schema_successfully_set_) {
1195 return libtextclassifier3::Status::OK;
1196 }
1197 ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
1198 ICING_RETURN_IF_ERROR(UpdateChecksum());
1199 ICING_RETURN_IF_ERROR(header_->PersistToDisk());
1200 return libtextclassifier3::Status::OK;
1201 }
1202
GetStorageInfo() const1203 SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
1204 SchemaStoreStorageInfoProto storage_info;
1205 int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
1206 storage_info.set_schema_store_size(
1207 Filesystem::SanitizeFileSize(directory_size));
1208 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
1209 storage_info.set_num_schema_types(schema->types().size());
1210 int total_sections = 0;
1211 int num_types_sections_exhausted = 0;
1212 for (const SchemaTypeConfigProto& type : schema->types()) {
1213 auto sections_list_or =
1214 schema_type_manager_->section_manager().GetMetadataList(
1215 type.schema_type());
1216 if (!sections_list_or.ok()) {
1217 continue;
1218 }
1219 total_sections += sections_list_or.ValueOrDie()->size();
1220 if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
1221 ++num_types_sections_exhausted;
1222 }
1223 }
1224
1225 storage_info.set_num_total_sections(total_sections);
1226 storage_info.set_num_schema_types_sections_exhausted(
1227 num_types_sections_exhausted);
1228 return storage_info;
1229 }
1230
1231 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetSectionMetadata(const std::string & schema_type) const1232 SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
1233 return schema_type_manager_->section_manager().GetMetadataList(schema_type);
1234 }
1235
IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,const std::string & property_path) const1236 bool SchemaStore::IsPropertyDefinedInSchema(
1237 SchemaTypeId schema_type_id, const std::string& property_path) const {
1238 auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
1239 if (schema_name_itr == reverse_schema_type_mapper_.end()) {
1240 return false;
1241 }
1242 const std::string* current_type_name = &schema_name_itr->second;
1243
1244 std::vector<std::string_view> property_path_parts =
1245 property_util::SplitPropertyPathExpr(property_path);
1246 for (int i = 0; i < property_path_parts.size(); ++i) {
1247 auto type_config_itr = type_config_map_.find(*current_type_name);
1248 if (type_config_itr == type_config_map_.end()) {
1249 return false;
1250 }
1251 std::string_view property_name = property_path_parts.at(i);
1252 const PropertyConfigProto* selected_property = nullptr;
1253 for (const PropertyConfigProto& property :
1254 type_config_itr->second.properties()) {
1255 if (property.property_name() == property_name) {
1256 selected_property = &property;
1257 break;
1258 }
1259 }
1260 if (selected_property == nullptr) {
1261 return false;
1262 }
1263 if (i == property_path_parts.size() - 1) {
1264 // We've found a property at the final part of the path.
1265 return true;
1266 }
1267 if (selected_property->data_type() !=
1268 PropertyConfigProto::DataType::DOCUMENT) {
1269 // If this isn't final part of the path, but this property isn't a
1270 // document, so we know that this path doesn't exist.
1271 return false;
1272 }
1273 current_type_name = &selected_property->schema_type();
1274 }
1275
1276 // We should never reach this point.
1277 return false;
1278 }
1279
GetDebugInfo() const1280 libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
1281 const {
1282 SchemaDebugInfoProto debug_info;
1283 if (has_schema_successfully_set_) {
1284 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
1285 *debug_info.mutable_schema() = *schema;
1286 }
1287 ICING_ASSIGN_OR_RETURN(Crc32 crc, GetChecksum());
1288 debug_info.set_crc(crc.Get());
1289 return debug_info;
1290 }
1291
1292 std::vector<SchemaStore::ExpandedTypePropertyMask>
ExpandTypePropertyMasks(const google::protobuf::RepeatedPtrField<TypePropertyMask> & type_property_masks) const1293 SchemaStore::ExpandTypePropertyMasks(
1294 const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
1295 const {
1296 std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
1297 for (const TypePropertyMask& type_field_mask : type_property_masks) {
1298 if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
1299 ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
1300 /*paths=*/{}};
1301 entry.paths.insert(type_field_mask.paths().begin(),
1302 type_field_mask.paths().end());
1303 result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
1304 } else {
1305 auto schema_type_ids_or =
1306 GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
1307 // If we can't find the SchemaTypeIds, just throw it away
1308 if (!schema_type_ids_or.ok()) {
1309 continue;
1310 }
1311 const std::unordered_set<SchemaTypeId>* schema_type_ids =
1312 schema_type_ids_or.ValueOrDie();
1313 for (SchemaTypeId schema_type_id : *schema_type_ids) {
1314 auto schema_type_name_iter =
1315 reverse_schema_type_mapper_.find(schema_type_id);
1316 if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
1317 // This should never happen, unless there is an inconsistency or IO
1318 // error.
1319 ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
1320 continue;
1321 }
1322
1323 auto iter = result_map.find(schema_type_id);
1324 if (iter == result_map.end()) {
1325 ExpandedTypePropertyMask entry{schema_type_name_iter->second,
1326 /*paths=*/{}};
1327 iter = result_map.insert({schema_type_id, std::move(entry)}).first;
1328 }
1329 iter->second.paths.insert(type_field_mask.paths().begin(),
1330 type_field_mask.paths().end());
1331 }
1332 }
1333 }
1334 std::vector<ExpandedTypePropertyMask> result;
1335 result.reserve(result_map.size());
1336 for (auto& entry : result_map) {
1337 result.push_back(std::move(entry.second));
1338 }
1339 return result;
1340 }
1341
1342 libtextclassifier3::StatusOr<
1343 std::unordered_map<std::string, std::vector<std::string>>>
ConstructBlobPropertyMap() const1344 SchemaStore::ConstructBlobPropertyMap() const {
1345 ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
1346 std::unordered_map<std::string, std::vector<std::string>> blob_property_map;
1347 for (const SchemaTypeConfigProto& type_config : schema->types()) {
1348 SchemaPropertyIterator iterator(type_config, type_config_map_);
1349 std::vector<std::string> blob_properties;
1350
1351 libtextclassifier3::Status status = iterator.Advance();
1352 while (status.ok()) {
1353 if (iterator.GetCurrentPropertyConfig().data_type() ==
1354 PropertyConfigProto::DataType::BLOB_HANDLE) {
1355 blob_properties.push_back(iterator.GetCurrentPropertyPath());
1356 }
1357 status = iterator.Advance();
1358 }
1359 if (!absl_ports::IsOutOfRange(status)) {
1360 return status;
1361 }
1362 if (!blob_properties.empty()) {
1363 blob_property_map.insert(
1364 {type_config.schema_type(), std::move(blob_properties)});
1365 }
1366 }
1367 return blob_property_map;
1368 }
1369
ValidateAndGetDatabase(const SchemaProto & new_schema) const1370 libtextclassifier3::StatusOr<std::string> SchemaStore::ValidateAndGetDatabase(
1371 const SchemaProto& new_schema) const {
1372 std::string database;
1373
1374 if (!enable_schema_database_ || new_schema.types().empty()) {
1375 return database;
1376 }
1377
1378 database = new_schema.types(0).database();
1379 // Loop through new_schema's types and validate it. The input SchemaProto
1380 // contains a list of SchemaTypeConfigProtos without deduplication. We need to
1381 // check that:
1382 // 1. All SchemaTypeConfigProtos have the same database value.
1383 // 2. The SchemaTypeConfigProtos's schema_type field is unique within both
1384 // new_schema, as well as the existing schema (recorded in
1385 // type_config_map_).
1386 for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
1387 // Check database consistency.
1388 if (database != type_config.database()) {
1389 return absl_ports::InvalidArgumentError(
1390 "SetSchema only accepts a SchemaProto with types from a single "
1391 "database at a time. Please make separate calls for each database if "
1392 "you need to set the schema for multiple databases.");
1393 }
1394
1395 // Check type name uniqueness. This is only necessary if there is a
1396 // pre-existing schema.
1397 if (has_schema_successfully_set_) {
1398 auto iter = type_config_map_.find(type_config.schema_type());
1399 if (iter != type_config_map_.end() &&
1400 database != iter->second.database()) {
1401 return absl_ports::AlreadyExistsError(
1402 absl_ports::StrCat("schema_type name: '", type_config.schema_type(),
1403 "' is already in use by a different database."));
1404 }
1405 }
1406 }
1407 return database;
1408 }
1409
1410 libtextclassifier3::StatusOr<SchemaProto>
GetFullSchemaProtoWithUpdatedDb(SchemaProto input_database_schema) const1411 SchemaStore::GetFullSchemaProtoWithUpdatedDb(
1412 SchemaProto input_database_schema) const {
1413 if (!enable_schema_database_) {
1414 // If the schema database is not enabled, the input schema is already the
1415 // full schema, so we don't need to do any merges.
1416 return input_database_schema;
1417 }
1418
1419 libtextclassifier3::StatusOr<const SchemaProto*> schema_proto = GetSchema();
1420 if (absl_ports::IsNotFound(schema_proto.status())) {
1421 // We don't have a pre-existing schema -- we can return the input database
1422 // schema as it's already the full schema.
1423 return input_database_schema;
1424 }
1425
1426 if (!schema_proto.ok()) {
1427 // Real error.
1428 return schema_proto.status();
1429 }
1430
1431 if (!has_schema_successfully_set_) {
1432 return absl_ports::InternalError(
1433 "Schema store was not initialized properly.");
1434 }
1435
1436 // At this point, we have a pre-existing schema -- we need to merge the
1437 // updated database with the existing schema.
1438 if (input_database_schema.types().empty()) {
1439 return *schema_proto.ValueOrDie();
1440 }
1441
1442 std::string input_database = input_database_schema.types(0).database();
1443 if (database_type_map_.size() == 1 &&
1444 database_type_map_.find(input_database) != database_type_map_.end()) {
1445 // No other databases in the schema -- we can return the input database
1446 // schema.
1447 return input_database_schema;
1448 }
1449
1450 const SchemaProto* existing_schema = schema_proto.ValueOrDie();
1451 SchemaProto full_schema;
1452
1453 // 1. Add types from the existing schema, replacing existing types with the
1454 // input types if the database is the one being updated by the input schema.
1455 // - For the input_database, we replace the existing types with the input
1456 // types. An exisiting type is deleted if it's not included in
1457 // input_database.
1458 // - If there are more input types than existing types for the input_database,
1459 // the rest of the input types are appended to the end of the full_schema.
1460 // - If there are fewer input types than existing types for the
1461 // input_database, we shift all existing types that come after
1462 // input_database forward.
1463 // - For existing types from other databases, we add the types in their
1464 // original order to full_schema. Note that the type-ids of existing types
1465 // might still change if some types are deleted in input_database, as this
1466 // will cause all subsequent type ids to shift forward.
1467 int input_schema_index = 0, existing_schema_index = 0;
1468 while (input_schema_index < input_database_schema.types().size() &&
1469 existing_schema_index < existing_schema->types().size()) {
1470 const SchemaTypeConfigProto& existing_type_config =
1471 existing_schema->types(existing_schema_index);
1472 SchemaTypeConfigProto& input_type_config =
1473 *input_database_schema.mutable_types(input_schema_index);
1474
1475 if (input_type_config.database() != input_database) {
1476 return absl_ports::InvalidArgumentError(
1477 "Can only update a single database at a time.");
1478 }
1479
1480 if (existing_type_config.database() == input_database) {
1481 // If the database is the one being updated by the input schema, replace
1482 // the existing type with a type from the input schema.
1483 *full_schema.add_types() = std::move(input_type_config);
1484 ++input_schema_index;
1485 } else {
1486 *full_schema.add_types() = existing_type_config;
1487 }
1488 ++existing_schema_index;
1489 }
1490
1491 // 2. Append remaining types to the end of the SchemaProto.
1492 for (; input_schema_index < input_database_schema.types().size();
1493 ++input_schema_index) {
1494 // Case 1: Append all remaining types from the input schema. This happens
1495 // when more types are added in input_database_schema than what's in the
1496 // existing schema. In this case, we've used up the space for the database
1497 // in the existing schema, so we can just append the rest of the types to
1498 // the end.
1499 SchemaTypeConfigProto& input_type_config =
1500 *input_database_schema.mutable_types(input_schema_index);
1501 *full_schema.add_types() = std::move(input_type_config);
1502 }
1503 for (; existing_schema_index < existing_schema->types().size();
1504 ++existing_schema_index) {
1505 // Case 2: Add remaining types from the existing schema, but skip the ones
1506 // that are from input_database, since existing types from input_database
1507 // are replaced with input_database_schema.
1508 if (existing_schema->types(existing_schema_index).database() !=
1509 input_database) {
1510 *full_schema.add_types() = existing_schema->types(existing_schema_index);
1511 }
1512 }
1513
1514 return full_schema;
1515 }
1516
1517 } // namespace lib
1518 } // namespace icing
1519