1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/embed/embedding-index.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <cstring>
20 #include <memory>
21 #include <string>
22 #include <string_view>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/feature-flags.h"
31 #include "icing/file/destructible-directory.h"
32 #include "icing/file/file-backed-vector.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/memory-mapped-file.h"
35 #include "icing/file/posting_list/flash-index-storage.h"
36 #include "icing/file/posting_list/posting-list-identifier.h"
37 #include "icing/index/embed/embedding-hit.h"
38 #include "icing/index/embed/embedding-scorer.h"
39 #include "icing/index/embed/posting-list-embedding-hit-accessor.h"
40 #include "icing/index/embed/quantizer.h"
41 #include "icing/index/hit/hit.h"
42 #include "icing/schema/schema-store.h"
43 #include "icing/store/document-filter-data.h"
44 #include "icing/store/document-id.h"
45 #include "icing/store/document-store.h"
46 #include "icing/store/dynamic-trie-key-mapper.h"
47 #include "icing/store/key-mapper.h"
48 #include "icing/util/clock.h"
49 #include "icing/util/crc32.h"
50 #include "icing/util/encode-util.h"
51 #include "icing/util/logging.h"
52 #include "icing/util/status-macros.h"
53
54 namespace icing {
55 namespace lib {
56
57 namespace {
58
// Maximum size handed to DynamicTrieKeyMapper::Create when creating
// embedding_posting_list_mapper_.
constexpr uint32_t kEmbeddingHitListMapperMaxSize =
    128 * 1024 * 1024;  // 128 MiB

// The maximum length returned by encode_util::EncodeIntToCString is 5 for
// uint32_t. Posting list keys always start with an encoded dimension padded to
// exactly this many bytes.
constexpr uint32_t kEncodedDimensionLength = 5;
65
GetMetadataFilePath(std::string_view working_path)66 std::string GetMetadataFilePath(std::string_view working_path) {
67 return absl_ports::StrCat(working_path, "/metadata");
68 }
69
GetFlashIndexStorageFilePath(std::string_view working_path)70 std::string GetFlashIndexStorageFilePath(std::string_view working_path) {
71 return absl_ports::StrCat(working_path, "/flash_index_storage");
72 }
73
GetEmbeddingHitListMapperPath(std::string_view working_path)74 std::string GetEmbeddingHitListMapperPath(std::string_view working_path) {
75 return absl_ports::StrCat(working_path, "/embedding_hit_list_mapper");
76 }
77
GetEmbeddingVectorsFilePath(std::string_view working_path)78 std::string GetEmbeddingVectorsFilePath(std::string_view working_path) {
79 return absl_ports::StrCat(working_path, "/embedding_vectors");
80 }
81
GetQuantizedEmbeddingVectorsFilePath(std::string_view working_path)82 std::string GetQuantizedEmbeddingVectorsFilePath(
83 std::string_view working_path) {
84 return absl_ports::StrCat(working_path, "/quantized_embedding_vectors");
85 }
86
87 // An injective function that maps the ordered pair (dimension, model_signature)
88 // to a string, which is used to form a key for embedding_posting_list_mapper_.
GetPostingListKey(uint32_t dimension,std::string_view model_signature)89 std::string GetPostingListKey(uint32_t dimension,
90 std::string_view model_signature) {
91 std::string encoded_dimension_str =
92 encode_util::EncodeIntToCString(dimension);
93 // Make encoded_dimension_str to fixed kEncodedDimensionLength bytes.
94 while (encoded_dimension_str.size() < kEncodedDimensionLength) {
95 // C string cannot contain 0 bytes, so we append it using 1, just like what
96 // we do in encode_util::EncodeIntToCString.
97 //
98 // The reason that this works is because DecodeIntToString decodes a byte
99 // value of 0x01 as 0x00. When EncodeIntToCString returns an encoded
100 // dimension that is less than 5 bytes, it means that the dimension contains
101 // unencoded leading 0x00. So here we're explicitly encoding those bytes as
102 // 0x01.
103 encoded_dimension_str.push_back(1);
104 }
105 return absl_ports::StrCat(encoded_dimension_str, model_signature);
106 }
107
GetPostingListKey(const PropertyProto::VectorProto & vector)108 std::string GetPostingListKey(const PropertyProto::VectorProto& vector) {
109 return GetPostingListKey(vector.values().size(), vector.model_signature());
110 }
111
CreateQuantizer(const PropertyProto::VectorProto & vector)112 libtextclassifier3::StatusOr<Quantizer> CreateQuantizer(
113 const PropertyProto::VectorProto& vector) {
114 if (vector.values().empty()) {
115 return absl_ports::InvalidArgumentError("Vector dimension is 0");
116 }
117 auto minmax_pair =
118 std::minmax_element(vector.values().begin(), vector.values().end());
119 return Quantizer::Create(*minmax_pair.first, *minmax_pair.second);
120 }
121
122 } // namespace
123
124 libtextclassifier3::StatusOr<std::unique_ptr<EmbeddingIndex>>
Create(const Filesystem * filesystem,std::string working_path,const Clock * clock,const FeatureFlags * feature_flags)125 EmbeddingIndex::Create(const Filesystem* filesystem, std::string working_path,
126 const Clock* clock, const FeatureFlags* feature_flags) {
127 ICING_RETURN_ERROR_IF_NULL(filesystem);
128 ICING_RETURN_ERROR_IF_NULL(clock);
129
130 std::unique_ptr<EmbeddingIndex> index =
131 std::unique_ptr<EmbeddingIndex>(new EmbeddingIndex(
132 *filesystem, std::move(working_path), clock, feature_flags));
133 ICING_RETURN_IF_ERROR(index->Initialize());
134 return index;
135 }
136
CreateStorageDataIfNonEmpty()137 libtextclassifier3::Status EmbeddingIndex::CreateStorageDataIfNonEmpty() {
138 if (is_empty()) {
139 return libtextclassifier3::Status::OK;
140 }
141
142 ICING_ASSIGN_OR_RETURN(FlashIndexStorage flash_index_storage,
143 FlashIndexStorage::Create(
144 GetFlashIndexStorageFilePath(working_path_),
145 &filesystem_, posting_list_hit_serializer_.get()));
146 flash_index_storage_ =
147 std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
148
149 ICING_ASSIGN_OR_RETURN(
150 embedding_posting_list_mapper_,
151 DynamicTrieKeyMapper<PostingListIdentifier>::Create(
152 filesystem_, GetEmbeddingHitListMapperPath(working_path_),
153 kEmbeddingHitListMapperMaxSize));
154
155 ICING_ASSIGN_OR_RETURN(
156 embedding_vectors_,
157 FileBackedVector<float>::Create(
158 filesystem_, GetEmbeddingVectorsFilePath(working_path_),
159 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
160
161 ICING_ASSIGN_OR_RETURN(
162 quantized_embedding_vectors_,
163 FileBackedVector<char>::Create(
164 filesystem_, GetQuantizedEmbeddingVectorsFilePath(working_path_),
165 MemoryMappedFile::READ_WRITE_AUTO_SYNC));
166
167 return libtextclassifier3::Status::OK;
168 }
169
MarkIndexNonEmpty()170 libtextclassifier3::Status EmbeddingIndex::MarkIndexNonEmpty() {
171 if (!is_empty()) {
172 return libtextclassifier3::Status::OK;
173 }
174 info().is_empty = false;
175 return CreateStorageDataIfNonEmpty();
176 }
177
Initialize()178 libtextclassifier3::Status EmbeddingIndex::Initialize() {
179 bool is_new = false;
180 if (!filesystem_.FileExists(GetMetadataFilePath(working_path_).c_str())) {
181 // Create working directory.
182 if (!filesystem_.CreateDirectoryRecursively(working_path_.c_str())) {
183 return absl_ports::InternalError(
184 absl_ports::StrCat("Failed to create directory: ", working_path_));
185 }
186 is_new = true;
187 }
188
189 ICING_ASSIGN_OR_RETURN(
190 MemoryMappedFile metadata_mmapped_file,
191 MemoryMappedFile::Create(filesystem_, GetMetadataFilePath(working_path_),
192 MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
193 /*max_file_size=*/kMetadataFileSize,
194 /*pre_mapping_file_offset=*/0,
195 /*pre_mapping_mmap_size=*/kMetadataFileSize));
196 metadata_mmapped_file_ =
197 std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file));
198
199 if (is_new) {
200 ICING_RETURN_IF_ERROR(metadata_mmapped_file_->GrowAndRemapIfNecessary(
201 /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
202 info().magic = Info::kMagic;
203 info().last_added_document_id = kInvalidDocumentId;
204 info().is_empty = true;
205 memset(Info().padding_, 0, Info::kPaddingSize);
206 ICING_RETURN_IF_ERROR(InitializeNewStorage());
207 } else {
208 if (metadata_mmapped_file_->available_size() != kMetadataFileSize) {
209 return absl_ports::FailedPreconditionError(
210 "Incorrect metadata file size");
211 }
212 if (info().magic != Info::kMagic) {
213 return absl_ports::FailedPreconditionError("Incorrect magic value");
214 }
215 ICING_RETURN_IF_ERROR(CreateStorageDataIfNonEmpty());
216 ICING_RETURN_IF_ERROR(InitializeExistingStorage());
217 }
218 return libtextclassifier3::Status::OK;
219 }
220
Clear()221 libtextclassifier3::Status EmbeddingIndex::Clear() {
222 pending_embedding_hits_.clear();
223 metadata_mmapped_file_.reset();
224 flash_index_storage_.reset();
225 embedding_posting_list_mapper_.reset();
226 embedding_vectors_.reset();
227 quantized_embedding_vectors_.reset();
228 if (filesystem_.DirectoryExists(working_path_.c_str())) {
229 ICING_RETURN_IF_ERROR(Discard(filesystem_, working_path_));
230 }
231 is_initialized_ = false;
232 return Initialize();
233 }
234
235 libtextclassifier3::StatusOr<std::unique_ptr<PostingListEmbeddingHitAccessor>>
GetAccessor(uint32_t dimension,std::string_view model_signature) const236 EmbeddingIndex::GetAccessor(uint32_t dimension,
237 std::string_view model_signature) const {
238 if (dimension == 0) {
239 return absl_ports::InvalidArgumentError("Dimension is 0");
240 }
241 if (is_empty()) {
242 return absl_ports::NotFoundError("EmbeddingIndex is empty");
243 }
244
245 std::string key = GetPostingListKey(dimension, model_signature);
246 ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id,
247 embedding_posting_list_mapper_->Get(key));
248 return PostingListEmbeddingHitAccessor::CreateFromExisting(
249 flash_index_storage_.get(), posting_list_hit_serializer_.get(),
250 posting_list_id);
251 }
252
AppendEmbeddingVector(const PropertyProto::VectorProto & vector,EmbeddingIndexingConfig::QuantizationType::Code quantization_type)253 libtextclassifier3::StatusOr<uint32_t> EmbeddingIndex::AppendEmbeddingVector(
254 const PropertyProto::VectorProto& vector,
255 EmbeddingIndexingConfig::QuantizationType::Code quantization_type) {
256 uint32_t dimension = vector.values().size();
257 uint32_t location;
258 if (!feature_flags_->enable_embedding_quantization() ||
259 quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
260 location = embedding_vectors_->num_elements();
261 ICING_ASSIGN_OR_RETURN(
262 FileBackedVector<float>::MutableArrayView mutable_arr,
263 embedding_vectors_->Allocate(dimension));
264 mutable_arr.SetArray(/*idx=*/0, vector.values().data(), dimension);
265 } else {
266 ICING_ASSIGN_OR_RETURN(Quantizer quantizer, CreateQuantizer(vector));
267 // Quantize the vector
268 std::vector<uint8_t> quantized_values;
269 quantized_values.reserve(vector.values().size());
270 for (float value : vector.values()) {
271 quantized_values.push_back(quantizer.Quantize(value));
272 }
273
274 // Store the quantizer and the quantized vector
275 location = quantized_embedding_vectors_->num_elements();
276 ICING_ASSIGN_OR_RETURN(
277 FileBackedVector<char>::MutableArrayView mutable_arr,
278 quantized_embedding_vectors_->Allocate(sizeof(Quantizer) + dimension));
279 mutable_arr.SetArray(/*idx=*/0, reinterpret_cast<char*>(&quantizer),
280 sizeof(Quantizer));
281 mutable_arr.SetArray(/*idx=*/sizeof(Quantizer),
282 reinterpret_cast<char*>(quantized_values.data()),
283 dimension);
284 }
285 return location;
286 }
287
BufferEmbedding(const BasicHit & basic_hit,const PropertyProto::VectorProto & vector,EmbeddingIndexingConfig::QuantizationType::Code quantization_type)288 libtextclassifier3::Status EmbeddingIndex::BufferEmbedding(
289 const BasicHit& basic_hit, const PropertyProto::VectorProto& vector,
290 EmbeddingIndexingConfig::QuantizationType::Code quantization_type) {
291 if (vector.values().empty()) {
292 return absl_ports::InvalidArgumentError("Vector dimension is 0");
293 }
294 ICING_RETURN_IF_ERROR(MarkIndexNonEmpty());
295
296 std::string key = GetPostingListKey(vector);
297 ICING_ASSIGN_OR_RETURN(uint32_t location,
298 AppendEmbeddingVector(vector, quantization_type));
299
300 // Buffer the embedding hit.
301 pending_embedding_hits_.push_back(
302 {std::move(key), EmbeddingHit(basic_hit, location)});
303 return libtextclassifier3::Status::OK;
304 }
305
CommitBufferToIndex()306 libtextclassifier3::Status EmbeddingIndex::CommitBufferToIndex() {
307 if (pending_embedding_hits_.empty()) {
308 return libtextclassifier3::Status::OK;
309 }
310 ICING_RETURN_IF_ERROR(MarkIndexNonEmpty());
311
312 std::sort(pending_embedding_hits_.begin(), pending_embedding_hits_.end());
313 auto iter_curr_key = pending_embedding_hits_.rbegin();
314 while (iter_curr_key != pending_embedding_hits_.rend()) {
315 // In order to batch putting embedding hits with the same key (dimension,
316 // model_signature) to the same posting list, we find the range
317 // [iter_curr_key, iter_next_key) of embedding hits with the same key and
318 // put them into their corresponding posting list together.
319 auto iter_next_key = iter_curr_key;
320 while (iter_next_key != pending_embedding_hits_.rend() &&
321 iter_next_key->first == iter_curr_key->first) {
322 iter_next_key++;
323 }
324
325 const std::string& key = iter_curr_key->first;
326 libtextclassifier3::StatusOr<PostingListIdentifier> posting_list_id_or =
327 embedding_posting_list_mapper_->Get(key);
328 std::unique_ptr<PostingListEmbeddingHitAccessor> pl_accessor;
329 if (posting_list_id_or.ok()) {
330 // Existing posting list.
331 ICING_ASSIGN_OR_RETURN(
332 pl_accessor,
333 PostingListEmbeddingHitAccessor::CreateFromExisting(
334 flash_index_storage_.get(), posting_list_hit_serializer_.get(),
335 posting_list_id_or.ValueOrDie()));
336 } else if (absl_ports::IsNotFound(posting_list_id_or.status())) {
337 // New posting list.
338 ICING_ASSIGN_OR_RETURN(
339 pl_accessor,
340 PostingListEmbeddingHitAccessor::Create(
341 flash_index_storage_.get(), posting_list_hit_serializer_.get()));
342 } else {
343 // Errors
344 return std::move(posting_list_id_or).status();
345 }
346
347 // Adding the embedding hits.
348 for (auto iter = iter_curr_key; iter != iter_next_key; ++iter) {
349 ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(iter->second));
350 }
351
352 // Finalize this posting list and add the posting list id in
353 // embedding_posting_list_mapper_.
354 PostingListEmbeddingHitAccessor::FinalizeResult result =
355 std::move(*pl_accessor).Finalize();
356 if (!result.id.is_valid()) {
357 return absl_ports::InternalError("Failed to finalize posting list");
358 }
359 ICING_RETURN_IF_ERROR(embedding_posting_list_mapper_->Put(key, result.id));
360
361 // Advance to the next key.
362 iter_curr_key = iter_next_key;
363 }
364 pending_embedding_hits_.clear();
365 return libtextclassifier3::Status::OK;
366 }
367
TransferEmbeddingVector(const EmbeddingHit & old_hit,uint32_t dimension,EmbeddingIndexingConfig::QuantizationType::Code quantization_type,EmbeddingIndex * new_index) const368 libtextclassifier3::StatusOr<uint32_t> EmbeddingIndex::TransferEmbeddingVector(
369 const EmbeddingHit& old_hit, uint32_t dimension,
370 EmbeddingIndexingConfig::QuantizationType::Code quantization_type,
371 EmbeddingIndex* new_index) const {
372 uint32_t new_location;
373 if (!feature_flags_->enable_embedding_quantization() ||
374 quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
375 ICING_ASSIGN_OR_RETURN(const float* old_vector,
376 GetEmbeddingVector(old_hit, dimension));
377 new_location = new_index->embedding_vectors_->num_elements();
378
379 // Copy the embedding vector of the hit to the new index.
380 ICING_ASSIGN_OR_RETURN(
381 FileBackedVector<float>::MutableArrayView mutable_arr,
382 new_index->embedding_vectors_->Allocate(dimension));
383 mutable_arr.SetArray(/*idx=*/0, old_vector, dimension);
384 } else {
385 ICING_ASSIGN_OR_RETURN(const char* old_data,
386 GetQuantizedEmbeddingVector(old_hit, dimension));
387 new_location = new_index->quantized_embedding_vectors_->num_elements();
388
389 // Copy the embedding vector of the hit to the new index.
390 ICING_ASSIGN_OR_RETURN(FileBackedVector<char>::MutableArrayView mutable_arr,
391 new_index->quantized_embedding_vectors_->Allocate(
392 sizeof(Quantizer) + dimension));
393 mutable_arr.SetArray(/*idx=*/0, old_data, sizeof(Quantizer) + dimension);
394 }
395 return new_location;
396 }
397
TransferIndex(const DocumentStore & document_store,const SchemaStore & schema_store,const std::vector<DocumentId> & document_id_old_to_new,EmbeddingIndex * new_index) const398 libtextclassifier3::Status EmbeddingIndex::TransferIndex(
399 const DocumentStore& document_store, const SchemaStore& schema_store,
400 const std::vector<DocumentId>& document_id_old_to_new,
401 EmbeddingIndex* new_index) const {
402 if (is_empty()) {
403 return absl_ports::FailedPreconditionError("EmbeddingIndex is empty");
404 }
405
406 const int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
407 std::unique_ptr<KeyMapper<PostingListIdentifier>::Iterator> itr =
408 embedding_posting_list_mapper_->GetIterator();
409 while (itr->Advance()) {
410 std::string_view key = itr->GetKey();
411 // This should never happen unless there is an inconsistency, or the index
412 // is corrupted.
413 if (key.size() < kEncodedDimensionLength) {
414 return absl_ports::InternalError(
415 "Got invalid key from embedding posting list mapper.");
416 }
417 uint32_t dimension = encode_util::DecodeIntFromCString(
418 std::string_view(key.begin(), kEncodedDimensionLength));
419
420 // Transfer hits
421 std::vector<EmbeddingHit> new_hits;
422 ICING_ASSIGN_OR_RETURN(
423 std::unique_ptr<PostingListEmbeddingHitAccessor> old_pl_accessor,
424 PostingListEmbeddingHitAccessor::CreateFromExisting(
425 flash_index_storage_.get(), posting_list_hit_serializer_.get(),
426 /*existing_posting_list_id=*/itr->GetValue()));
427 DocumentId last_new_document_id = kInvalidDocumentId;
428 SchemaTypeId schema_type_id = kInvalidSchemaTypeId;
429 while (true) {
430 ICING_ASSIGN_OR_RETURN(std::vector<EmbeddingHit> batch,
431 old_pl_accessor->GetNextHitsBatch());
432 if (batch.empty()) {
433 break;
434 }
435 for (EmbeddingHit& old_hit : batch) {
436 // Safety checks to add robustness to the codebase, so to make sure
437 // that we never access invalid memory, in case that hit from the
438 // posting list is corrupted.
439 if (old_hit.basic_hit().document_id() < 0 ||
440 old_hit.basic_hit().document_id() >=
441 document_id_old_to_new.size()) {
442 return absl_ports::InternalError(
443 "Embedding hit document id is out of bound. The provided map is "
444 "too small, or the index may have been corrupted.");
445 }
446
447 // Construct transferred hit and add the embedding vector to the new
448 // index.
449 DocumentId new_document_id =
450 document_id_old_to_new[old_hit.basic_hit().document_id()];
451 if (new_document_id == kInvalidDocumentId) {
452 continue;
453 }
454 if (new_document_id != last_new_document_id) {
455 schema_type_id =
456 document_store.GetSchemaTypeId(new_document_id, current_time_ms);
457 }
458 last_new_document_id = new_document_id;
459 if (schema_type_id == kInvalidSchemaTypeId) {
460 // This should not happen, since document store is optimized first,
461 // so that new_document_id here should be alive.
462 continue;
463 }
464 ICING_ASSIGN_OR_RETURN(
465 EmbeddingIndexingConfig::QuantizationType::Code quantization_type,
466 schema_store.GetQuantizationType(schema_type_id,
467 old_hit.basic_hit().section_id()));
468 ICING_RETURN_IF_ERROR(new_index->MarkIndexNonEmpty());
469
470 ICING_ASSIGN_OR_RETURN(
471 uint32_t new_location,
472 TransferEmbeddingVector(old_hit, dimension, quantization_type,
473 new_index));
474 new_hits.push_back(EmbeddingHit(
475 BasicHit(old_hit.basic_hit().section_id(), new_document_id),
476 new_location));
477 }
478 }
479 // No hit needs to be added to the new index.
480 if (new_hits.empty()) {
481 continue;
482 }
483 // Add transferred hits to the new index.
484 ICING_ASSIGN_OR_RETURN(
485 std::unique_ptr<PostingListEmbeddingHitAccessor> hit_accum,
486 PostingListEmbeddingHitAccessor::Create(
487 new_index->flash_index_storage_.get(),
488 new_index->posting_list_hit_serializer_.get()));
489 for (auto new_hit_itr = new_hits.rbegin(); new_hit_itr != new_hits.rend();
490 ++new_hit_itr) {
491 ICING_RETURN_IF_ERROR(hit_accum->PrependHit(*new_hit_itr));
492 }
493 PostingListEmbeddingHitAccessor::FinalizeResult result =
494 std::move(*hit_accum).Finalize();
495 if (!result.id.is_valid()) {
496 return absl_ports::InternalError("Failed to finalize posting list");
497 }
498 ICING_RETURN_IF_ERROR(
499 new_index->embedding_posting_list_mapper_->Put(key, result.id));
500 }
501 return libtextclassifier3::Status::OK;
502 }
503
// Rebuilds the index so that it only contains hits for documents that survive
// in `document_id_old_to_new`, and records `new_last_added_document_id`.
//
// The new index is built in a temporary directory next to working_path_
// ("<working_path_>_temp"), persisted, and then swapped with the current
// working directory, after which this instance is re-initialized from the
// swapped-in data.
//
// Returns:
//   - OK on success
//   - An error if `document_store` or `schema_store` is null
//   - INTERNAL error if the temporary directory cannot be deleted or created,
//     or if the directory swap fails
//   - Any error propagated from committing, transferring, persisting or
//     re-initializing
libtextclassifier3::Status EmbeddingIndex::Optimize(
    const DocumentStore* document_store, const SchemaStore* schema_store,
    const std::vector<DocumentId>& document_id_old_to_new,
    DocumentId new_last_added_document_id) {
  ICING_RETURN_ERROR_IF_NULL(document_store);
  ICING_RETURN_ERROR_IF_NULL(schema_store);
  // Nothing to rebuild for an empty index; only the metadata is updated.
  if (is_empty()) {
    info().last_added_document_id = new_last_added_document_id;
    return libtextclassifier3::Status::OK;
  }

  // This is just for completeness, but this should never be necessary, since we
  // should never have pending hits at the time when Optimize is run.
  ICING_RETURN_IF_ERROR(CommitBufferToIndex());

  // Remove any temp directory that is still present before building into it.
  std::string temporary_index_working_path = working_path_ + "_temp";
  if (!filesystem_.DeleteDirectoryRecursively(
          temporary_index_working_path.c_str())) {
    ICING_LOG(ERROR) << "Recursively deleting " << temporary_index_working_path;
    return absl_ports::InternalError(
        "Unable to delete temp directory to prepare to build new index.");
  }

  DestructibleDirectory temporary_index_dir(
      &filesystem_, std::move(temporary_index_working_path));
  if (!temporary_index_dir.is_valid()) {
    return absl_ports::InternalError(
        "Unable to create temp directory to build new index.");
  }

  // The new index lives in its own scope so that it is destroyed before the
  // directories are swapped below.
  {
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<EmbeddingIndex> new_index,
        EmbeddingIndex::Create(&filesystem_, temporary_index_dir.dir(), &clock_,
                               feature_flags_));
    ICING_RETURN_IF_ERROR(TransferIndex(*document_store, *schema_store,
                                        document_id_old_to_new,
                                        new_index.get()));
    new_index->set_last_added_document_id(new_last_added_document_id);
    ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
  }

  // Destruct current storage instances to safely swap directories.
  metadata_mmapped_file_.reset();
  flash_index_storage_.reset();
  embedding_posting_list_mapper_.reset();
  embedding_vectors_.reset();
  quantized_embedding_vectors_.reset();

  // NOTE(review): if the swap fails, the storage members reset above are not
  // restored before returning.
  if (!filesystem_.SwapFiles(temporary_index_dir.dir().c_str(),
                             working_path_.c_str())) {
    return absl_ports::InternalError(
        "Unable to apply new index due to failed swap!");
  }

  // Reinitialize the index.
  is_initialized_ = false;
  return Initialize();
}
563
ScoreEmbeddingHit(const EmbeddingScorer & scorer,const PropertyProto::VectorProto & query,const EmbeddingHit & hit,EmbeddingIndexingConfig::QuantizationType::Code quantization_type) const564 libtextclassifier3::StatusOr<float> EmbeddingIndex::ScoreEmbeddingHit(
565 const EmbeddingScorer& scorer, const PropertyProto::VectorProto& query,
566 const EmbeddingHit& hit,
567 EmbeddingIndexingConfig::QuantizationType::Code quantization_type) const {
568 int dimension = query.values().size();
569 float semantic_score;
570 if (!feature_flags_->enable_embedding_quantization() ||
571 quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
572 ICING_ASSIGN_OR_RETURN(const float* vector,
573 GetEmbeddingVector(hit, dimension));
574 semantic_score = scorer.Score(dimension,
575 /*v1=*/query.values().data(),
576 /*v2=*/vector);
577 } else {
578 ICING_ASSIGN_OR_RETURN(const char* data,
579 GetQuantizedEmbeddingVector(hit, dimension));
580 Quantizer quantizer(data);
581 const uint8_t* quantized_vector =
582 reinterpret_cast<const uint8_t*>(data + sizeof(Quantizer));
583 semantic_score = scorer.Score(dimension,
584 /*v1=*/query.values().data(),
585 /*v2=*/quantized_vector, quantizer);
586 }
587 return semantic_score;
588 }
589
PersistMetadataToDisk()590 libtextclassifier3::Status EmbeddingIndex::PersistMetadataToDisk() {
591 return metadata_mmapped_file_->PersistToDisk();
592 }
593
PersistStoragesToDisk()594 libtextclassifier3::Status EmbeddingIndex::PersistStoragesToDisk() {
595 if (is_empty()) {
596 return libtextclassifier3::Status::OK;
597 }
598 if (!flash_index_storage_->PersistToDisk()) {
599 return absl_ports::InternalError("Fail to persist flash index to disk");
600 }
601 ICING_RETURN_IF_ERROR(embedding_posting_list_mapper_->PersistToDisk());
602 ICING_RETURN_IF_ERROR(embedding_vectors_->PersistToDisk());
603 ICING_RETURN_IF_ERROR(quantized_embedding_vectors_->PersistToDisk());
604 return libtextclassifier3::Status::OK;
605 }
606
UpdateStoragesChecksum()607 libtextclassifier3::StatusOr<Crc32> EmbeddingIndex::UpdateStoragesChecksum() {
608 if (is_empty()) {
609 return Crc32(0);
610 }
611 ICING_ASSIGN_OR_RETURN(Crc32 embedding_posting_list_mapper_crc,
612 embedding_posting_list_mapper_->UpdateChecksum());
613 ICING_ASSIGN_OR_RETURN(Crc32 embedding_vectors_crc,
614 embedding_vectors_->UpdateChecksum());
615 ICING_ASSIGN_OR_RETURN(Crc32 quantized_embedding_vectors_crc,
616 quantized_embedding_vectors_->UpdateChecksum());
617 return Crc32(embedding_posting_list_mapper_crc.Get() ^
618 embedding_vectors_crc.Get() ^
619 quantized_embedding_vectors_crc.Get());
620 }
621
GetStoragesChecksum() const622 libtextclassifier3::StatusOr<Crc32> EmbeddingIndex::GetStoragesChecksum()
623 const {
624 if (is_empty()) {
625 return Crc32(0);
626 }
627 ICING_ASSIGN_OR_RETURN(Crc32 embedding_posting_list_mapper_crc,
628 embedding_posting_list_mapper_->GetChecksum());
629 Crc32 embedding_vectors_crc = embedding_vectors_->GetChecksum();
630 Crc32 quantized_embedding_vectors_crc =
631 quantized_embedding_vectors_->GetChecksum();
632 return Crc32(embedding_posting_list_mapper_crc.Get() ^
633 embedding_vectors_crc.Get() ^
634 quantized_embedding_vectors_crc.Get());
635 }
636
637 } // namespace lib
638 } // namespace icing
639