xref: /aosp_15_r20/external/icing/icing/index/embed/embedding-index.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/embed/embedding-index.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <cstring>
20 #include <memory>
21 #include <string>
22 #include <string_view>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/feature-flags.h"
31 #include "icing/file/destructible-directory.h"
32 #include "icing/file/file-backed-vector.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/memory-mapped-file.h"
35 #include "icing/file/posting_list/flash-index-storage.h"
36 #include "icing/file/posting_list/posting-list-identifier.h"
37 #include "icing/index/embed/embedding-hit.h"
38 #include "icing/index/embed/embedding-scorer.h"
39 #include "icing/index/embed/posting-list-embedding-hit-accessor.h"
40 #include "icing/index/embed/quantizer.h"
41 #include "icing/index/hit/hit.h"
42 #include "icing/schema/schema-store.h"
43 #include "icing/store/document-filter-data.h"
44 #include "icing/store/document-id.h"
45 #include "icing/store/document-store.h"
46 #include "icing/store/dynamic-trie-key-mapper.h"
47 #include "icing/store/key-mapper.h"
48 #include "icing/util/clock.h"
49 #include "icing/util/crc32.h"
50 #include "icing/util/encode-util.h"
51 #include "icing/util/logging.h"
52 #include "icing/util/status-macros.h"
53 
54 namespace icing {
55 namespace lib {
56 
57 namespace {
58 
// Size limit, in bytes, handed to the key mapper that backs the embedding
// posting list mapper: 128 MiB.
constexpr uint32_t kEmbeddingHitListMapperMaxSize = 128 * 1024 * 1024;

// Fixed width, in bytes, of the encoded dimension that prefixes every posting
// list key. encode_util::EncodeIntToCString returns at most 5 bytes for a
// uint32_t.
constexpr uint32_t kEncodedDimensionLength = 5;
65 
GetMetadataFilePath(std::string_view working_path)66 std::string GetMetadataFilePath(std::string_view working_path) {
67   return absl_ports::StrCat(working_path, "/metadata");
68 }
69 
GetFlashIndexStorageFilePath(std::string_view working_path)70 std::string GetFlashIndexStorageFilePath(std::string_view working_path) {
71   return absl_ports::StrCat(working_path, "/flash_index_storage");
72 }
73 
GetEmbeddingHitListMapperPath(std::string_view working_path)74 std::string GetEmbeddingHitListMapperPath(std::string_view working_path) {
75   return absl_ports::StrCat(working_path, "/embedding_hit_list_mapper");
76 }
77 
GetEmbeddingVectorsFilePath(std::string_view working_path)78 std::string GetEmbeddingVectorsFilePath(std::string_view working_path) {
79   return absl_ports::StrCat(working_path, "/embedding_vectors");
80 }
81 
GetQuantizedEmbeddingVectorsFilePath(std::string_view working_path)82 std::string GetQuantizedEmbeddingVectorsFilePath(
83     std::string_view working_path) {
84   return absl_ports::StrCat(working_path, "/quantized_embedding_vectors");
85 }
86 
87 // An injective function that maps the ordered pair (dimension, model_signature)
88 // to a string, which is used to form a key for embedding_posting_list_mapper_.
GetPostingListKey(uint32_t dimension,std::string_view model_signature)89 std::string GetPostingListKey(uint32_t dimension,
90                               std::string_view model_signature) {
91   std::string encoded_dimension_str =
92       encode_util::EncodeIntToCString(dimension);
93   // Make encoded_dimension_str to fixed kEncodedDimensionLength bytes.
94   while (encoded_dimension_str.size() < kEncodedDimensionLength) {
95     // C string cannot contain 0 bytes, so we append it using 1, just like what
96     // we do in encode_util::EncodeIntToCString.
97     //
98     // The reason that this works is because DecodeIntToString decodes a byte
99     // value of 0x01 as 0x00. When EncodeIntToCString returns an encoded
100     // dimension that is less than 5 bytes, it means that the dimension contains
101     // unencoded leading 0x00. So here we're explicitly encoding those bytes as
102     // 0x01.
103     encoded_dimension_str.push_back(1);
104   }
105   return absl_ports::StrCat(encoded_dimension_str, model_signature);
106 }
107 
GetPostingListKey(const PropertyProto::VectorProto & vector)108 std::string GetPostingListKey(const PropertyProto::VectorProto& vector) {
109   return GetPostingListKey(vector.values().size(), vector.model_signature());
110 }
111 
CreateQuantizer(const PropertyProto::VectorProto & vector)112 libtextclassifier3::StatusOr<Quantizer> CreateQuantizer(
113     const PropertyProto::VectorProto& vector) {
114   if (vector.values().empty()) {
115     return absl_ports::InvalidArgumentError("Vector dimension is 0");
116   }
117   auto minmax_pair =
118       std::minmax_element(vector.values().begin(), vector.values().end());
119   return Quantizer::Create(*minmax_pair.first, *minmax_pair.second);
120 }
121 
122 }  // namespace
123 
124 libtextclassifier3::StatusOr<std::unique_ptr<EmbeddingIndex>>
Create(const Filesystem * filesystem,std::string working_path,const Clock * clock,const FeatureFlags * feature_flags)125 EmbeddingIndex::Create(const Filesystem* filesystem, std::string working_path,
126                        const Clock* clock, const FeatureFlags* feature_flags) {
127   ICING_RETURN_ERROR_IF_NULL(filesystem);
128   ICING_RETURN_ERROR_IF_NULL(clock);
129 
130   std::unique_ptr<EmbeddingIndex> index =
131       std::unique_ptr<EmbeddingIndex>(new EmbeddingIndex(
132           *filesystem, std::move(working_path), clock, feature_flags));
133   ICING_RETURN_IF_ERROR(index->Initialize());
134   return index;
135 }
136 
CreateStorageDataIfNonEmpty()137 libtextclassifier3::Status EmbeddingIndex::CreateStorageDataIfNonEmpty() {
138   if (is_empty()) {
139     return libtextclassifier3::Status::OK;
140   }
141 
142   ICING_ASSIGN_OR_RETURN(FlashIndexStorage flash_index_storage,
143                          FlashIndexStorage::Create(
144                              GetFlashIndexStorageFilePath(working_path_),
145                              &filesystem_, posting_list_hit_serializer_.get()));
146   flash_index_storage_ =
147       std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
148 
149   ICING_ASSIGN_OR_RETURN(
150       embedding_posting_list_mapper_,
151       DynamicTrieKeyMapper<PostingListIdentifier>::Create(
152           filesystem_, GetEmbeddingHitListMapperPath(working_path_),
153           kEmbeddingHitListMapperMaxSize));
154 
155   ICING_ASSIGN_OR_RETURN(
156       embedding_vectors_,
157       FileBackedVector<float>::Create(
158           filesystem_, GetEmbeddingVectorsFilePath(working_path_),
159           MemoryMappedFile::READ_WRITE_AUTO_SYNC));
160 
161   ICING_ASSIGN_OR_RETURN(
162       quantized_embedding_vectors_,
163       FileBackedVector<char>::Create(
164           filesystem_, GetQuantizedEmbeddingVectorsFilePath(working_path_),
165           MemoryMappedFile::READ_WRITE_AUTO_SYNC));
166 
167   return libtextclassifier3::Status::OK;
168 }
169 
MarkIndexNonEmpty()170 libtextclassifier3::Status EmbeddingIndex::MarkIndexNonEmpty() {
171   if (!is_empty()) {
172     return libtextclassifier3::Status::OK;
173   }
174   info().is_empty = false;
175   return CreateStorageDataIfNonEmpty();
176 }
177 
Initialize()178 libtextclassifier3::Status EmbeddingIndex::Initialize() {
179   bool is_new = false;
180   if (!filesystem_.FileExists(GetMetadataFilePath(working_path_).c_str())) {
181     // Create working directory.
182     if (!filesystem_.CreateDirectoryRecursively(working_path_.c_str())) {
183       return absl_ports::InternalError(
184           absl_ports::StrCat("Failed to create directory: ", working_path_));
185     }
186     is_new = true;
187   }
188 
189   ICING_ASSIGN_OR_RETURN(
190       MemoryMappedFile metadata_mmapped_file,
191       MemoryMappedFile::Create(filesystem_, GetMetadataFilePath(working_path_),
192                                MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
193                                /*max_file_size=*/kMetadataFileSize,
194                                /*pre_mapping_file_offset=*/0,
195                                /*pre_mapping_mmap_size=*/kMetadataFileSize));
196   metadata_mmapped_file_ =
197       std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file));
198 
199   if (is_new) {
200     ICING_RETURN_IF_ERROR(metadata_mmapped_file_->GrowAndRemapIfNecessary(
201         /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
202     info().magic = Info::kMagic;
203     info().last_added_document_id = kInvalidDocumentId;
204     info().is_empty = true;
205     memset(Info().padding_, 0, Info::kPaddingSize);
206     ICING_RETURN_IF_ERROR(InitializeNewStorage());
207   } else {
208     if (metadata_mmapped_file_->available_size() != kMetadataFileSize) {
209       return absl_ports::FailedPreconditionError(
210           "Incorrect metadata file size");
211     }
212     if (info().magic != Info::kMagic) {
213       return absl_ports::FailedPreconditionError("Incorrect magic value");
214     }
215     ICING_RETURN_IF_ERROR(CreateStorageDataIfNonEmpty());
216     ICING_RETURN_IF_ERROR(InitializeExistingStorage());
217   }
218   return libtextclassifier3::Status::OK;
219 }
220 
Clear()221 libtextclassifier3::Status EmbeddingIndex::Clear() {
222   pending_embedding_hits_.clear();
223   metadata_mmapped_file_.reset();
224   flash_index_storage_.reset();
225   embedding_posting_list_mapper_.reset();
226   embedding_vectors_.reset();
227   quantized_embedding_vectors_.reset();
228   if (filesystem_.DirectoryExists(working_path_.c_str())) {
229     ICING_RETURN_IF_ERROR(Discard(filesystem_, working_path_));
230   }
231   is_initialized_ = false;
232   return Initialize();
233 }
234 
235 libtextclassifier3::StatusOr<std::unique_ptr<PostingListEmbeddingHitAccessor>>
GetAccessor(uint32_t dimension,std::string_view model_signature) const236 EmbeddingIndex::GetAccessor(uint32_t dimension,
237                             std::string_view model_signature) const {
238   if (dimension == 0) {
239     return absl_ports::InvalidArgumentError("Dimension is 0");
240   }
241   if (is_empty()) {
242     return absl_ports::NotFoundError("EmbeddingIndex is empty");
243   }
244 
245   std::string key = GetPostingListKey(dimension, model_signature);
246   ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id,
247                          embedding_posting_list_mapper_->Get(key));
248   return PostingListEmbeddingHitAccessor::CreateFromExisting(
249       flash_index_storage_.get(), posting_list_hit_serializer_.get(),
250       posting_list_id);
251 }
252 
AppendEmbeddingVector(const PropertyProto::VectorProto & vector,EmbeddingIndexingConfig::QuantizationType::Code quantization_type)253 libtextclassifier3::StatusOr<uint32_t> EmbeddingIndex::AppendEmbeddingVector(
254     const PropertyProto::VectorProto& vector,
255     EmbeddingIndexingConfig::QuantizationType::Code quantization_type) {
256   uint32_t dimension = vector.values().size();
257   uint32_t location;
258   if (!feature_flags_->enable_embedding_quantization() ||
259       quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
260     location = embedding_vectors_->num_elements();
261     ICING_ASSIGN_OR_RETURN(
262         FileBackedVector<float>::MutableArrayView mutable_arr,
263         embedding_vectors_->Allocate(dimension));
264     mutable_arr.SetArray(/*idx=*/0, vector.values().data(), dimension);
265   } else {
266     ICING_ASSIGN_OR_RETURN(Quantizer quantizer, CreateQuantizer(vector));
267     // Quantize the vector
268     std::vector<uint8_t> quantized_values;
269     quantized_values.reserve(vector.values().size());
270     for (float value : vector.values()) {
271       quantized_values.push_back(quantizer.Quantize(value));
272     }
273 
274     // Store the quantizer and the quantized vector
275     location = quantized_embedding_vectors_->num_elements();
276     ICING_ASSIGN_OR_RETURN(
277         FileBackedVector<char>::MutableArrayView mutable_arr,
278         quantized_embedding_vectors_->Allocate(sizeof(Quantizer) + dimension));
279     mutable_arr.SetArray(/*idx=*/0, reinterpret_cast<char*>(&quantizer),
280                          sizeof(Quantizer));
281     mutable_arr.SetArray(/*idx=*/sizeof(Quantizer),
282                          reinterpret_cast<char*>(quantized_values.data()),
283                          dimension);
284   }
285   return location;
286 }
287 
BufferEmbedding(const BasicHit & basic_hit,const PropertyProto::VectorProto & vector,EmbeddingIndexingConfig::QuantizationType::Code quantization_type)288 libtextclassifier3::Status EmbeddingIndex::BufferEmbedding(
289     const BasicHit& basic_hit, const PropertyProto::VectorProto& vector,
290     EmbeddingIndexingConfig::QuantizationType::Code quantization_type) {
291   if (vector.values().empty()) {
292     return absl_ports::InvalidArgumentError("Vector dimension is 0");
293   }
294   ICING_RETURN_IF_ERROR(MarkIndexNonEmpty());
295 
296   std::string key = GetPostingListKey(vector);
297   ICING_ASSIGN_OR_RETURN(uint32_t location,
298                          AppendEmbeddingVector(vector, quantization_type));
299 
300   // Buffer the embedding hit.
301   pending_embedding_hits_.push_back(
302       {std::move(key), EmbeddingHit(basic_hit, location)});
303   return libtextclassifier3::Status::OK;
304 }
305 
CommitBufferToIndex()306 libtextclassifier3::Status EmbeddingIndex::CommitBufferToIndex() {
307   if (pending_embedding_hits_.empty()) {
308     return libtextclassifier3::Status::OK;
309   }
310   ICING_RETURN_IF_ERROR(MarkIndexNonEmpty());
311 
312   std::sort(pending_embedding_hits_.begin(), pending_embedding_hits_.end());
313   auto iter_curr_key = pending_embedding_hits_.rbegin();
314   while (iter_curr_key != pending_embedding_hits_.rend()) {
315     // In order to batch putting embedding hits with the same key (dimension,
316     // model_signature) to the same posting list, we find the range
317     // [iter_curr_key, iter_next_key) of embedding hits with the same key and
318     // put them into their corresponding posting list together.
319     auto iter_next_key = iter_curr_key;
320     while (iter_next_key != pending_embedding_hits_.rend() &&
321            iter_next_key->first == iter_curr_key->first) {
322       iter_next_key++;
323     }
324 
325     const std::string& key = iter_curr_key->first;
326     libtextclassifier3::StatusOr<PostingListIdentifier> posting_list_id_or =
327         embedding_posting_list_mapper_->Get(key);
328     std::unique_ptr<PostingListEmbeddingHitAccessor> pl_accessor;
329     if (posting_list_id_or.ok()) {
330       // Existing posting list.
331       ICING_ASSIGN_OR_RETURN(
332           pl_accessor,
333           PostingListEmbeddingHitAccessor::CreateFromExisting(
334               flash_index_storage_.get(), posting_list_hit_serializer_.get(),
335               posting_list_id_or.ValueOrDie()));
336     } else if (absl_ports::IsNotFound(posting_list_id_or.status())) {
337       // New posting list.
338       ICING_ASSIGN_OR_RETURN(
339           pl_accessor,
340           PostingListEmbeddingHitAccessor::Create(
341               flash_index_storage_.get(), posting_list_hit_serializer_.get()));
342     } else {
343       // Errors
344       return std::move(posting_list_id_or).status();
345     }
346 
347     // Adding the embedding hits.
348     for (auto iter = iter_curr_key; iter != iter_next_key; ++iter) {
349       ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(iter->second));
350     }
351 
352     // Finalize this posting list and add the posting list id in
353     // embedding_posting_list_mapper_.
354     PostingListEmbeddingHitAccessor::FinalizeResult result =
355         std::move(*pl_accessor).Finalize();
356     if (!result.id.is_valid()) {
357       return absl_ports::InternalError("Failed to finalize posting list");
358     }
359     ICING_RETURN_IF_ERROR(embedding_posting_list_mapper_->Put(key, result.id));
360 
361     // Advance to the next key.
362     iter_curr_key = iter_next_key;
363   }
364   pending_embedding_hits_.clear();
365   return libtextclassifier3::Status::OK;
366 }
367 
TransferEmbeddingVector(const EmbeddingHit & old_hit,uint32_t dimension,EmbeddingIndexingConfig::QuantizationType::Code quantization_type,EmbeddingIndex * new_index) const368 libtextclassifier3::StatusOr<uint32_t> EmbeddingIndex::TransferEmbeddingVector(
369     const EmbeddingHit& old_hit, uint32_t dimension,
370     EmbeddingIndexingConfig::QuantizationType::Code quantization_type,
371     EmbeddingIndex* new_index) const {
372   uint32_t new_location;
373   if (!feature_flags_->enable_embedding_quantization() ||
374       quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
375     ICING_ASSIGN_OR_RETURN(const float* old_vector,
376                            GetEmbeddingVector(old_hit, dimension));
377     new_location = new_index->embedding_vectors_->num_elements();
378 
379     // Copy the embedding vector of the hit to the new index.
380     ICING_ASSIGN_OR_RETURN(
381         FileBackedVector<float>::MutableArrayView mutable_arr,
382         new_index->embedding_vectors_->Allocate(dimension));
383     mutable_arr.SetArray(/*idx=*/0, old_vector, dimension);
384   } else {
385     ICING_ASSIGN_OR_RETURN(const char* old_data,
386                            GetQuantizedEmbeddingVector(old_hit, dimension));
387     new_location = new_index->quantized_embedding_vectors_->num_elements();
388 
389     // Copy the embedding vector of the hit to the new index.
390     ICING_ASSIGN_OR_RETURN(FileBackedVector<char>::MutableArrayView mutable_arr,
391                            new_index->quantized_embedding_vectors_->Allocate(
392                                sizeof(Quantizer) + dimension));
393     mutable_arr.SetArray(/*idx=*/0, old_data, sizeof(Quantizer) + dimension);
394   }
395   return new_location;
396 }
397 
TransferIndex(const DocumentStore & document_store,const SchemaStore & schema_store,const std::vector<DocumentId> & document_id_old_to_new,EmbeddingIndex * new_index) const398 libtextclassifier3::Status EmbeddingIndex::TransferIndex(
399     const DocumentStore& document_store, const SchemaStore& schema_store,
400     const std::vector<DocumentId>& document_id_old_to_new,
401     EmbeddingIndex* new_index) const {
402   if (is_empty()) {
403     return absl_ports::FailedPreconditionError("EmbeddingIndex is empty");
404   }
405 
406   const int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
407   std::unique_ptr<KeyMapper<PostingListIdentifier>::Iterator> itr =
408       embedding_posting_list_mapper_->GetIterator();
409   while (itr->Advance()) {
410     std::string_view key = itr->GetKey();
411     // This should never happen unless there is an inconsistency, or the index
412     // is corrupted.
413     if (key.size() < kEncodedDimensionLength) {
414       return absl_ports::InternalError(
415           "Got invalid key from embedding posting list mapper.");
416     }
417     uint32_t dimension = encode_util::DecodeIntFromCString(
418         std::string_view(key.begin(), kEncodedDimensionLength));
419 
420     // Transfer hits
421     std::vector<EmbeddingHit> new_hits;
422     ICING_ASSIGN_OR_RETURN(
423         std::unique_ptr<PostingListEmbeddingHitAccessor> old_pl_accessor,
424         PostingListEmbeddingHitAccessor::CreateFromExisting(
425             flash_index_storage_.get(), posting_list_hit_serializer_.get(),
426             /*existing_posting_list_id=*/itr->GetValue()));
427     DocumentId last_new_document_id = kInvalidDocumentId;
428     SchemaTypeId schema_type_id = kInvalidSchemaTypeId;
429     while (true) {
430       ICING_ASSIGN_OR_RETURN(std::vector<EmbeddingHit> batch,
431                              old_pl_accessor->GetNextHitsBatch());
432       if (batch.empty()) {
433         break;
434       }
435       for (EmbeddingHit& old_hit : batch) {
436         // Safety checks to add robustness to the codebase, so to make sure
437         // that we never access invalid memory, in case that hit from the
438         // posting list is corrupted.
439         if (old_hit.basic_hit().document_id() < 0 ||
440             old_hit.basic_hit().document_id() >=
441                 document_id_old_to_new.size()) {
442           return absl_ports::InternalError(
443               "Embedding hit document id is out of bound. The provided map is "
444               "too small, or the index may have been corrupted.");
445         }
446 
447         // Construct transferred hit and add the embedding vector to the new
448         // index.
449         DocumentId new_document_id =
450             document_id_old_to_new[old_hit.basic_hit().document_id()];
451         if (new_document_id == kInvalidDocumentId) {
452           continue;
453         }
454         if (new_document_id != last_new_document_id) {
455           schema_type_id =
456               document_store.GetSchemaTypeId(new_document_id, current_time_ms);
457         }
458         last_new_document_id = new_document_id;
459         if (schema_type_id == kInvalidSchemaTypeId) {
460           // This should not happen, since document store is optimized first,
461           // so that new_document_id here should be alive.
462           continue;
463         }
464         ICING_ASSIGN_OR_RETURN(
465             EmbeddingIndexingConfig::QuantizationType::Code quantization_type,
466             schema_store.GetQuantizationType(schema_type_id,
467                                              old_hit.basic_hit().section_id()));
468         ICING_RETURN_IF_ERROR(new_index->MarkIndexNonEmpty());
469 
470         ICING_ASSIGN_OR_RETURN(
471             uint32_t new_location,
472             TransferEmbeddingVector(old_hit, dimension, quantization_type,
473                                     new_index));
474         new_hits.push_back(EmbeddingHit(
475             BasicHit(old_hit.basic_hit().section_id(), new_document_id),
476             new_location));
477       }
478     }
479     // No hit needs to be added to the new index.
480     if (new_hits.empty()) {
481       continue;
482     }
483     // Add transferred hits to the new index.
484     ICING_ASSIGN_OR_RETURN(
485         std::unique_ptr<PostingListEmbeddingHitAccessor> hit_accum,
486         PostingListEmbeddingHitAccessor::Create(
487             new_index->flash_index_storage_.get(),
488             new_index->posting_list_hit_serializer_.get()));
489     for (auto new_hit_itr = new_hits.rbegin(); new_hit_itr != new_hits.rend();
490          ++new_hit_itr) {
491       ICING_RETURN_IF_ERROR(hit_accum->PrependHit(*new_hit_itr));
492     }
493     PostingListEmbeddingHitAccessor::FinalizeResult result =
494         std::move(*hit_accum).Finalize();
495     if (!result.id.is_valid()) {
496       return absl_ports::InternalError("Failed to finalize posting list");
497     }
498     ICING_RETURN_IF_ERROR(
499         new_index->embedding_posting_list_mapper_->Put(key, result.id));
500   }
501   return libtextclassifier3::Status::OK;
502 }
503 
Optimize(const DocumentStore * document_store,const SchemaStore * schema_store,const std::vector<DocumentId> & document_id_old_to_new,DocumentId new_last_added_document_id)504 libtextclassifier3::Status EmbeddingIndex::Optimize(
505     const DocumentStore* document_store, const SchemaStore* schema_store,
506     const std::vector<DocumentId>& document_id_old_to_new,
507     DocumentId new_last_added_document_id) {
508   ICING_RETURN_ERROR_IF_NULL(document_store);
509   ICING_RETURN_ERROR_IF_NULL(schema_store);
510   if (is_empty()) {
511     info().last_added_document_id = new_last_added_document_id;
512     return libtextclassifier3::Status::OK;
513   }
514 
515   // This is just for completeness, but this should never be necessary, since we
516   // should never have pending hits at the time when Optimize is run.
517   ICING_RETURN_IF_ERROR(CommitBufferToIndex());
518 
519   std::string temporary_index_working_path = working_path_ + "_temp";
520   if (!filesystem_.DeleteDirectoryRecursively(
521           temporary_index_working_path.c_str())) {
522     ICING_LOG(ERROR) << "Recursively deleting " << temporary_index_working_path;
523     return absl_ports::InternalError(
524         "Unable to delete temp directory to prepare to build new index.");
525   }
526 
527   DestructibleDirectory temporary_index_dir(
528       &filesystem_, std::move(temporary_index_working_path));
529   if (!temporary_index_dir.is_valid()) {
530     return absl_ports::InternalError(
531         "Unable to create temp directory to build new index.");
532   }
533 
534   {
535     ICING_ASSIGN_OR_RETURN(
536         std::unique_ptr<EmbeddingIndex> new_index,
537         EmbeddingIndex::Create(&filesystem_, temporary_index_dir.dir(), &clock_,
538                                feature_flags_));
539     ICING_RETURN_IF_ERROR(TransferIndex(*document_store, *schema_store,
540                                         document_id_old_to_new,
541                                         new_index.get()));
542     new_index->set_last_added_document_id(new_last_added_document_id);
543     ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
544   }
545 
546   // Destruct current storage instances to safely swap directories.
547   metadata_mmapped_file_.reset();
548   flash_index_storage_.reset();
549   embedding_posting_list_mapper_.reset();
550   embedding_vectors_.reset();
551   quantized_embedding_vectors_.reset();
552 
553   if (!filesystem_.SwapFiles(temporary_index_dir.dir().c_str(),
554                              working_path_.c_str())) {
555     return absl_ports::InternalError(
556         "Unable to apply new index due to failed swap!");
557   }
558 
559   // Reinitialize the index.
560   is_initialized_ = false;
561   return Initialize();
562 }
563 
ScoreEmbeddingHit(const EmbeddingScorer & scorer,const PropertyProto::VectorProto & query,const EmbeddingHit & hit,EmbeddingIndexingConfig::QuantizationType::Code quantization_type) const564 libtextclassifier3::StatusOr<float> EmbeddingIndex::ScoreEmbeddingHit(
565     const EmbeddingScorer& scorer, const PropertyProto::VectorProto& query,
566     const EmbeddingHit& hit,
567     EmbeddingIndexingConfig::QuantizationType::Code quantization_type) const {
568   int dimension = query.values().size();
569   float semantic_score;
570   if (!feature_flags_->enable_embedding_quantization() ||
571       quantization_type == EmbeddingIndexingConfig::QuantizationType::NONE) {
572     ICING_ASSIGN_OR_RETURN(const float* vector,
573                            GetEmbeddingVector(hit, dimension));
574     semantic_score = scorer.Score(dimension,
575                                   /*v1=*/query.values().data(),
576                                   /*v2=*/vector);
577   } else {
578     ICING_ASSIGN_OR_RETURN(const char* data,
579                            GetQuantizedEmbeddingVector(hit, dimension));
580     Quantizer quantizer(data);
581     const uint8_t* quantized_vector =
582         reinterpret_cast<const uint8_t*>(data + sizeof(Quantizer));
583     semantic_score = scorer.Score(dimension,
584                                   /*v1=*/query.values().data(),
585                                   /*v2=*/quantized_vector, quantizer);
586   }
587   return semantic_score;
588 }
589 
PersistMetadataToDisk()590 libtextclassifier3::Status EmbeddingIndex::PersistMetadataToDisk() {
591   return metadata_mmapped_file_->PersistToDisk();
592 }
593 
PersistStoragesToDisk()594 libtextclassifier3::Status EmbeddingIndex::PersistStoragesToDisk() {
595   if (is_empty()) {
596     return libtextclassifier3::Status::OK;
597   }
598   if (!flash_index_storage_->PersistToDisk()) {
599     return absl_ports::InternalError("Fail to persist flash index to disk");
600   }
601   ICING_RETURN_IF_ERROR(embedding_posting_list_mapper_->PersistToDisk());
602   ICING_RETURN_IF_ERROR(embedding_vectors_->PersistToDisk());
603   ICING_RETURN_IF_ERROR(quantized_embedding_vectors_->PersistToDisk());
604   return libtextclassifier3::Status::OK;
605 }
606 
UpdateStoragesChecksum()607 libtextclassifier3::StatusOr<Crc32> EmbeddingIndex::UpdateStoragesChecksum() {
608   if (is_empty()) {
609     return Crc32(0);
610   }
611   ICING_ASSIGN_OR_RETURN(Crc32 embedding_posting_list_mapper_crc,
612                          embedding_posting_list_mapper_->UpdateChecksum());
613   ICING_ASSIGN_OR_RETURN(Crc32 embedding_vectors_crc,
614                          embedding_vectors_->UpdateChecksum());
615   ICING_ASSIGN_OR_RETURN(Crc32 quantized_embedding_vectors_crc,
616                          quantized_embedding_vectors_->UpdateChecksum());
617   return Crc32(embedding_posting_list_mapper_crc.Get() ^
618                embedding_vectors_crc.Get() ^
619                quantized_embedding_vectors_crc.Get());
620 }
621 
GetStoragesChecksum() const622 libtextclassifier3::StatusOr<Crc32> EmbeddingIndex::GetStoragesChecksum()
623     const {
624   if (is_empty()) {
625     return Crc32(0);
626   }
627   ICING_ASSIGN_OR_RETURN(Crc32 embedding_posting_list_mapper_crc,
628                          embedding_posting_list_mapper_->GetChecksum());
629   Crc32 embedding_vectors_crc = embedding_vectors_->GetChecksum();
630   Crc32 quantized_embedding_vectors_crc =
631       quantized_embedding_vectors_->GetChecksum();
632   return Crc32(embedding_posting_list_mapper_crc.Get() ^
633                embedding_vectors_crc.Get() ^
634                quantized_embedding_vectors_crc.Get());
635 }
636 
637 }  // namespace lib
638 }  // namespace icing
639