// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/lite/lite-index.h"

#include <sys/mman.h>

#include <algorithm>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_set>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index-header.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
#include "icing/legacy/index/icing-array-storage.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/namespace-id.h"
#include "icing/store/suggestion-result-checker.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

// Point at which we declare the trie full.
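// is_full() reports the lexicon as full once less than 5% of the trie's
// storage remains free, i.e. min_free_fraction() < (1.0 - kTrieFullFraction).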
constexpr double kTrieFullFraction = 0.95;

std::string MakeHitBufferFilename(const std::string& filename_base) {
  return filename_base + "hb";
}

size_t header_size() { return sizeof(LiteIndex_HeaderImpl::HeaderData); }

}  // namespace

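// A TermIdHitPair packs a term id and a Hit into a single sortable value, so
// an array of pairs orders first by term id and then by hit value.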
const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
    TermIdHitPair(0, Hit(Hit::kInvalidValue)).value();

libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
    const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);

  std::unique_ptr<LiteIndex> lite_index =
      std::unique_ptr<LiteIndex>(new LiteIndex(options, filesystem));
  ICING_RETURN_IF_ERROR(lite_index->Initialize());
  return std::move(lite_index);
}

// options.hit_buffer_size is the maximum size in elements. An appropriate
// lexicon and display mapping size will be chosen based on the hit buffer
// size.
LiteIndex::LiteIndex(const LiteIndex::Options& options,
                     const IcingFilesystem* filesystem)
    : hit_buffer_(*filesystem),
      hit_buffer_crc_(0),
      lexicon_(options.filename_base + "lexicon", MakeTrieRuntimeOptions(),
               filesystem),
      header_mmap_(false, MAP_SHARED),
      options_(options),
      filesystem_(filesystem) {}

LiteIndex::~LiteIndex() {
  if (initialized()) {
    libtextclassifier3::Status unused = PersistToDisk();
  }
}

IcingDynamicTrie::RuntimeOptions LiteIndex::MakeTrieRuntimeOptions() {
  return IcingDynamicTrie::RuntimeOptions().set_storage_policy(
      IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc);
}

libtextclassifier3::Status LiteIndex::Initialize() {
  // Size of hit buffer's header struct, rounded up to the nearest number of
  // system memory pages.
  const size_t header_padded_size =
      IcingMMapper::page_aligned_size(header_size());
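  // Resulting file layout (sketch): [0, header_padded_size) holds the mmapped
  // header; [header_padded_size, EOF) holds the hit buffer's array storage.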

  // Variable declarations cannot cross goto jumps, so declare these up top.
  libtextclassifier3::Status status;
  uint64_t file_size;
  IcingTimer timer;

  absl_ports::unique_lock l(&mutex_);
  if (!lexicon_.CreateIfNotExist(options_.lexicon_options) ||
      !lexicon_.Init()) {
    return absl_ports::InternalError("Failed to initialize lexicon trie");
  }

  hit_buffer_fd_.reset(filesystem_->OpenForWrite(
      MakeHitBufferFilename(options_.filename_base).c_str()));
  if (!hit_buffer_fd_.is_valid()) {
    status = absl_ports::InternalError("Failed to open hit buffer file");
    goto error;
  }

  file_size = filesystem_->GetFileSize(hit_buffer_fd_.get());
  if (file_size == IcingFilesystem::kBadFileSize) {
    status = absl_ports::InternalError("Failed to query hit buffer file size");
    goto error;
  }

  if (file_size < header_padded_size) {
    if (file_size != 0) {
      status = absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Hit buffer had unexpected size %" PRIu64, file_size));
      goto error;
    }

    ICING_VLOG(2) << "Creating new hit buffer";
    // Make sure files are fresh.
    if (!lexicon_.Remove() ||
        !lexicon_.CreateIfNotExist(options_.lexicon_options) ||
        !lexicon_.Init()) {
      status =
          absl_ports::InternalError("Failed to refresh lexicon during clear");
      goto error;
    }

    // Create fresh hit buffer by first emptying the hit buffer file and then
    // allocating header_padded_size of the cleared space.
    if (!filesystem_->Truncate(hit_buffer_fd_.get(), 0) ||
        !filesystem_->Truncate(hit_buffer_fd_.get(), header_padded_size)) {
      status = absl_ports::InternalError("Failed to truncate hit buffer file");
      goto error;
    }

    // Set up header.
    header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
    header_ = std::make_unique<LiteIndex_HeaderImpl>(
        reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));
    header_->Reset();

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status = absl_ports::InternalError("Failed to initialize new hit buffer");
      goto error;
    }

    UpdateChecksumInternal();
  } else {
    header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
    header_ = std::make_unique<LiteIndex_HeaderImpl>(
        reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(TermIdHitPair::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status = absl_ports::InternalError(
          "Failed to re-initialize existing hit buffer");
      goto error;
    }

    // Check integrity.
    if (!header_->check_magic()) {
      status = absl_ports::InternalError("Lite index header magic mismatch");
      goto error;
    }
    Crc32 expected_crc(header_->lite_index_crc());
    Crc32 crc = GetChecksumInternal();
    if (crc != expected_crc) {
      status = absl_ports::DataLossError(IcingStringUtil::StringPrintf(
          "Lite index crc check failed: %u vs %u", crc.Get(),
          header_->lite_index_crc().Get()));
      goto error;
    }
  }

  ICING_VLOG(2) << "Lite index init ok in " << timer.Elapsed() * 1000 << "ms";
  return status;

error:
  header_ = nullptr;
  header_mmap_.Unmap();
  lexicon_.Close();
  hit_buffer_crc_ = 0;
  hit_buffer_.Reset();
  hit_buffer_fd_.reset();
  if (status.ok()) {
    return absl_ports::InternalError(
        "Error handling code ran but status was ok");
  }
  return status;
}

libtextclassifier3::Status LiteIndex::Reset() {
  IcingTimer timer;

  absl_ports::unique_lock l(&mutex_);
  // TODO(b/140436942): When these components have been changed to return
  // errors, they should be propagated from here.
  lexicon_.Clear();
  hit_buffer_.Clear();
  header_->Reset();
  UpdateChecksumInternal();

  ICING_VLOG(2) << "Lite index clear in " << timer.Elapsed() * 1000 << "ms";
  return libtextclassifier3::Status::OK;
}

void LiteIndex::Warm() {
  absl_ports::shared_lock l(&mutex_);
  hit_buffer_.Warm();
  lexicon_.Warm();
}

libtextclassifier3::Status LiteIndex::PersistToDisk() {
  absl_ports::unique_lock l(&mutex_);
  bool success = true;
  if (!lexicon_.Sync()) {
    ICING_VLOG(1) << "Failed to sync the lexicon.";
    success = false;
  }
  hit_buffer_.Sync();
  UpdateChecksumInternal();
  header_mmap_.Sync();

  return (success) ? libtextclassifier3::Status::OK
                   : absl_ports::InternalError(
                         "Unable to sync lite index components.");
}

Crc32 LiteIndex::UpdateChecksum() {
  absl_ports::unique_lock l(&mutex_);
  return UpdateChecksumInternal();
}

Crc32 LiteIndex::UpdateChecksumInternal() {
  IcingTimer timer;

  // Update crcs.
  uint32_t dependent_crcs[2];
  hit_buffer_.UpdateCrc();
  dependent_crcs[0] = hit_buffer_crc_;
  dependent_crcs[1] = lexicon_.UpdateCrc().Get();

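  // Note: the combined checksum starts from the header's own crc and is
  // extended with the raw bytes of the two dependent crcs, so a change in the
  // hit buffer, the lexicon, or the header fields changes the final value.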
  // Update the header. The header is mmapped, so we don't need to write it
  // out explicitly.
  Crc32 all_crc(header_->GetHeaderCrc());
  all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs),
                                  sizeof(dependent_crcs)));
  header_->set_lite_index_crc(all_crc);
  ICING_VLOG(2) << "Lite index crc updated in " << timer.Elapsed() * 1000
                << "ms";
  return all_crc;
}

Crc32 LiteIndex::GetChecksum() const {
  absl_ports::unique_lock l(&mutex_);
  return GetChecksumInternal();
}

Crc32 LiteIndex::GetChecksumInternal() const {
  IcingTimer timer;

  uint32_t dependent_crcs[2];
  dependent_crcs[0] = hit_buffer_.GetCrc().Get();
  dependent_crcs[1] = lexicon_.GetCrc().Get();

  Crc32 all_crc(header_->GetHeaderCrc());
  all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs),
                                  sizeof(dependent_crcs)));
  ICING_VLOG(2) << "Lite index crc computed in " << timer.Elapsed() * 1000
                << "ms";
  return all_crc;
}

libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm(
    std::string_view term, TermMatchType::Code term_match_type,
    NamespaceId namespace_id) {
  absl_ports::unique_lock l(&mutex_);
  uint32_t tvi;
  libtextclassifier3::Status status = lexicon_.Insert(term, "", &tvi, false);
  if (!status.ok()) {
    ICING_LOG(DBG) << "Unable to add term " << term << " to lexicon!\n"
                   << status.error_message();
    return status;
  }
  ICING_RETURN_IF_ERROR(UpdateTermPropertiesImpl(
      tvi, term_match_type == TermMatchType::PREFIX, namespace_id));
  return tvi;
}

libtextclassifier3::Status LiteIndex::UpdateTermProperties(
    uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
  absl_ports::unique_lock l(&mutex_);
  return UpdateTermPropertiesImpl(tvi, hasPrefixHits, namespace_id);
}

libtextclassifier3::Status LiteIndex::UpdateTermPropertiesImpl(
    uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
  if (hasPrefixHits &&
      !lexicon_.SetProperty(tvi, GetHasHitsInPrefixSectionPropertyId())) {
    return absl_ports::ResourceExhaustedError(
        "Insufficient disk space to create prefix property!");
  }

  if (!lexicon_.SetProperty(tvi, GetNamespacePropertyId(namespace_id))) {
    return absl_ports::ResourceExhaustedError(
        "Insufficient disk space to create namespace property!");
  }

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
  absl_ports::unique_lock l(&mutex_);
  if (is_full()) {
    return absl_ports::ResourceExhaustedError("Hit buffer is full!");
  }

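  // Callers append hits in non-decreasing document id order (see the ordering
  // notes in FetchHits), so the unsorted tail past searchable_end stays
  // ordered by document id even though it is not yet sorted by term id.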
  TermIdHitPair term_id_hit_pair(term_id, hit);
  uint32_t cur_size = header_->cur_size();
  TermIdHitPair::Value* valp =
      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1);
  if (valp == nullptr) {
    return absl_ports::ResourceExhaustedError(
        "Allocating more space in hit buffer failed!");
  }
  *valp = term_id_hit_pair.value();
  header_->set_cur_size(cur_size + 1);

  return libtextclassifier3::Status::OK;
}

libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
    std::string_view term) const {
  absl_ports::shared_lock l(&mutex_);
  char dummy;
  uint32_t tvi;
  if (!lexicon_.Find(term, &dummy, &tvi)) {
    return absl_ports::NotFoundError(
        absl_ports::StrCat("Could not find ", term, " in the lexicon."));
  }
  return tvi;
}

void LiteIndex::ScoreAndAppendFetchedHit(
    const Hit& hit, SectionIdMask section_id_mask,
    bool only_from_prefix_sections,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker,
    DocumentId& last_document_id, bool& is_last_document_desired,
    int& total_score_out, std::vector<DocHitInfo>* hits_out,
    std::vector<Hit::TermFrequencyArray>* term_frequency_out) const {
  // Skip hits whose section bit is not set in section_id_mask.
  if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
    return;
  }
  // Skip hits outside prefix sections if only prefix sections are wanted.
  if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
    return;
  }
  // Check whether this Hit is desired.
  // TODO(b/230553264) Move common logic into helper function once we support
  // score term by prefix_hit in lite_index.
  DocumentId document_id = hit.document_id();
  bool is_new_document = document_id != last_document_id;
  if (is_new_document) {
    last_document_id = document_id;
    is_last_document_desired =
        suggestion_result_checker == nullptr ||
        suggestion_result_checker->BelongsToTargetResults(document_id,
                                                          hit.section_id());
  }
  if (!is_last_document_desired) {
    // The document is removed, expired, or not desired.
    return;
  }

  // Score the hit by the requested strategy.
  switch (score_by) {
    case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
      total_score_out = 1;
      break;
    case SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT:
      if (is_new_document) {
        ++total_score_out;
      }
      break;
    case SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY:
      if (hit.has_term_frequency()) {
        total_score_out += hit.term_frequency();
      } else {
        ++total_score_out;
      }
      break;
  }

  // Append a new DocHitInfo to the output vector, or update the section info
  // of the last one if this hit belongs to the same document.
  if (is_new_document && hits_out != nullptr) {
    hits_out->push_back(DocHitInfo(document_id));
    if (term_frequency_out != nullptr) {
      term_frequency_out->push_back(Hit::TermFrequencyArray());
    }
  }
  if (hits_out != nullptr) {
    hits_out->back().UpdateSection(hit.section_id());
    if (term_frequency_out != nullptr) {
      term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
    }
  }
}

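// A hypothetical caller (illustration only, not from this codebase): fetch
// all hits for term_id across every section, without namespace filtering.
//
//   std::vector<DocHitInfo> hits;
//   int score = lite_index->FetchHits(
//       term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false,
//       SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
//       /*suggestion_result_checker=*/nullptr, &hits,
//       /*term_frequency_out=*/nullptr);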
int LiteIndex::FetchHits(
    uint32_t term_id, SectionIdMask section_id_mask,
    bool only_from_prefix_sections,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker,
    std::vector<DocHitInfo>* hits_out,
    std::vector<Hit::TermFrequencyArray>* term_frequency_out) {
  bool need_sort_at_querying = false;
  {
    absl_ports::shared_lock l(&mutex_);

    // We sort here when:
    // 1. We don't enable sorting at indexing time (i.e. we sort at querying
    //    time), and there is an unsorted tail portion. OR
    // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold,
    //    regardless of whether or not hit_buffer_sort_at_indexing is enabled.
    //    This is more of a sanity check. We should not really be encountering
    //    this case.
    need_sort_at_querying = NeedSortAtQuerying();
  }
  if (need_sort_at_querying) {
    absl_ports::unique_lock l(&mutex_);
    IcingTimer timer;

    // The transition from shared_lock to unique_lock is safe here because it
    // doesn't hurt to sort again if sorting was already done by another thread
    // after need_sort_at_querying was evaluated.
    // We check need_sort_at_querying to improve query concurrency, as threads
    // can avoid acquiring the unique lock if no sorting is needed.
    SortHitsImpl();

    if (options_.hit_buffer_sort_at_indexing) {
      // This is the second case for sort. Log as this should be a very rare
      // occasion.
      ICING_LOG(WARNING) << "Sorting HitBuffer at querying time when "
                            "hit_buffer_sort_at_indexing is enabled. Sort and "
                            "merge HitBuffer in "
                         << timer.Elapsed() * 1000 << " ms.";
    }
  }

  // This downgrade from a unique_lock to a shared_lock is safe because we're
  // searching for the term in the searchable (sorted) section of the HitBuffer
  // only in Seek().
  // Any operations that might execute between releasing the unique lock and
  // acquiring the shared lock are guaranteed not to alter the searchable
  // section (or the LiteIndex) due to a global lock in IcingSearchEngine.
  absl_ports::shared_lock l(&mutex_);

  // Search the HitBuffer array for Hits with the given term_id.
  // Hits are added in increasing order of doc ids, so hits that get appended
  // later have larger docIds. This means that:
  // 1. Hits in the unsorted tail will have larger docIds than hits in the
  //    sorted portion.
  // 2. Hits at the end of the unsorted tail will have larger docIds than hits
  //    at the front of the tail.
  // We want to retrieve hits in descending order of docIds. Therefore we
  // search by doing:
  // 1. A linear search, in reverse iteration order, over the unsorted tail
  //    portion.
  // 2. Followed by a binary search on the sorted portion.
  const TermIdHitPair* array = hit_buffer_.array_cast<TermIdHitPair>();

  DocumentId last_document_id = kInvalidDocumentId;
  // Records whether the last document belongs to the given namespaces.
  bool is_last_document_desired = false;
  int total_score = 0;

  // Linear search over the unsorted tail in reverse iteration order.
  // This should only be performed when hit_buffer_sort_at_indexing is enabled.
  // When disabled, the entire HitBuffer should already be sorted, and only
  // binary search is needed.
  if (options_.hit_buffer_sort_at_indexing) {
    uint32_t unsorted_length = GetHitBufferUnsortedSizeImpl();
    for (uint32_t i = 1; i <= unsorted_length; ++i) {
      TermIdHitPair term_id_hit_pair = array[header_->cur_size() - i];
      if (term_id_hit_pair.term_id() == term_id) {
        // We've found a matching hit.
        const Hit& matched_hit = term_id_hit_pair.hit();
        // Score the hit and add it to total_score. Also add the hit and its
        // term frequency info to hits_out and term_frequency_out if the two
        // vectors are non-null.
        ScoreAndAppendFetchedHit(matched_hit, section_id_mask,
                                 only_from_prefix_sections, score_by,
                                 suggestion_result_checker, last_document_id,
                                 is_last_document_desired, total_score,
                                 hits_out, term_frequency_out);
      }
    }
  }

  // Do a binary search over the sorted section and repeat the above steps.
  TermIdHitPair target_term_id_hit_pair(
      term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kNoEnabledFlags,
                   Hit::kDefaultTermFrequency));
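  // Note: hit values are encoded so that, for a fixed term id, hits sort in
  // descending document id order; kMaxDocumentIdSortValue is the document id
  // that sorts first, so lower_bound lands on the first hit for term_id.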
  for (const TermIdHitPair* ptr = std::lower_bound(
           array, array + header_->searchable_end(), target_term_id_hit_pair);
       ptr < array + header_->searchable_end(); ++ptr) {
    if (ptr->term_id() != term_id) {
      // We've processed all matches. Stop iterating further.
      break;
    }

    const Hit& matched_hit = ptr->hit();
    // Score the hit and add it to total_score. Also add the hit and its term
    // frequency info to hits_out and term_frequency_out if the two vectors are
    // non-null.
    ScoreAndAppendFetchedHit(
        matched_hit, section_id_mask, only_from_prefix_sections, score_by,
        suggestion_result_checker, last_document_id, is_last_document_desired,
        total_score, hits_out, term_frequency_out);
  }
  return total_score;
}

libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
    uint32_t term_id,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker) {
  return FetchHits(term_id, kSectionIdMaskAll,
                   /*only_from_prefix_sections=*/false, score_by,
                   suggestion_result_checker,
                   /*hits_out=*/nullptr);
}

bool LiteIndex::is_full() const {
  return (header_->cur_size() == options_.hit_buffer_size ||
          lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
}

std::string LiteIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
  absl_ports::unique_lock l(&mutex_);
  std::string res;
  std::string lexicon_info;
  lexicon_.GetDebugInfo(verbosity, &lexicon_info);
  IcingStringUtil::SStringAppendF(
      &res, 0,
      "curr_size: %u\n"
      "hit_buffer_size: %u\n"
      "last_added_document_id: %u\n"
      "searchable_end: %u\n"
      "index_crc: %u\n"
      "\n"
      "lite_lexicon_info:\n%s\n",
      header_->cur_size(), options_.hit_buffer_size,
      header_->last_added_docid(), header_->searchable_end(),
      GetChecksumInternal().Get(), lexicon_info.c_str());
  return res;
}

libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
  IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
  if (storage_info.lite_index_hit_buffer_size() == -1 ||
      storage_info.lite_index_lexicon_size() == -1) {
    return absl_ports::AbortedError(
        "Failed to get size of LiteIndex's members.");
  }
  // On initialization, we grow the file to a padded size first, so this size
  // doesn't count towards the size taken up by elements.
  size_t header_padded_size = IcingMMapper::page_aligned_size(header_size());
  return storage_info.lite_index_hit_buffer_size() - header_padded_size +
         storage_info.lite_index_lexicon_size();
}

IndexStorageInfoProto LiteIndex::GetStorageInfo(
    IndexStorageInfoProto storage_info) const {
  absl_ports::shared_lock l(&mutex_);
  int64_t header_and_hit_buffer_file_size =
      filesystem_->GetFileSize(hit_buffer_fd_.get());
  storage_info.set_lite_index_hit_buffer_size(
      IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
  int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
  if (lexicon_disk_usage != Filesystem::kBadFileSize) {
    storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
  } else {
    storage_info.set_lite_index_lexicon_size(-1);
  }
  return storage_info;
}

void LiteIndex::SortHitsImpl() {
  // Make the hit buffer searchable by sorting its unsorted tail.
  uint32_t need_sort_len = GetHitBufferUnsortedSizeImpl();
  if (need_sort_len == 0) {
    return;
  }
  IcingTimer timer;

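  // Buffer layout at this point (sketch):
  //   [0, searchable_end)        sorted, searchable region
  //   [searchable_end, cur_size) unsorted tail appended by AddHit()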
  TermIdHitPair::Value* array_start =
      hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
  TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
  std::sort(sort_start, array_start + header_->cur_size());

  // Now merge with previous region. Since the previous region is already
  // sorted and deduplicated, optimize the merge by skipping everything less
  // than the new region's smallest value.
  if (header_->searchable_end() > 0) {
    std::inplace_merge(array_start, array_start + header_->searchable_end(),
                       array_start + header_->cur_size());
  }
  ICING_VLOG(2) << "Lite index sort and merge " << need_sort_len << " into "
                << header_->searchable_end() << " in " << timer.Elapsed() * 1000
                << "ms";

  // Now the entire array is sorted.
  header_->set_searchable_end(header_->cur_size());

  // Update crc in-line.
  UpdateChecksumInternal();
}

libtextclassifier3::Status LiteIndex::Optimize(
    const std::vector<DocumentId>& document_id_old_to_new,
    const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) {
  absl_ports::unique_lock l(&mutex_);
  header_->set_last_added_docid(new_last_added_document_id);
  if (header_->cur_size() == 0) {
    return libtextclassifier3::Status::OK;
  }
  // Sort the hits so that hits with the same term id are grouped together,
  // which makes it easy to determine which terms become unused after
  // compaction.
  SortHitsImpl();
  uint32_t new_size = 0;
  uint32_t curr_term_id = 0;
  uint32_t curr_tvi = 0;
  std::unordered_set<uint32_t> tvi_to_delete;
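  // The loop below is a two-pointer, in-place compaction: idx scans every
  // existing pair while new_size tracks the next slot to keep, so surviving
  // hits are packed to the front of the buffer without extra allocation.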
  for (uint32_t idx = 0; idx < header_->cur_size(); ++idx) {
    TermIdHitPair term_id_hit_pair(
        hit_buffer_.array_cast<TermIdHitPair>()[idx]);
    if (idx == 0 || term_id_hit_pair.term_id() != curr_term_id) {
      curr_term_id = term_id_hit_pair.term_id();
      ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo term_info,
                             term_id_codec->DecodeTermInfo(curr_term_id));
      curr_tvi = term_info.tvi;
      // Mark the property of the current term as not having hits in a prefix
      // section. The property will be set below if there are any valid hits
      // from a prefix section.
      lexicon_.ClearProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
      // Add curr_tvi to tvi_to_delete. It will be removed from tvi_to_delete
      // below if there are any valid hits pointing to that term id.
      tvi_to_delete.insert(curr_tvi);
    }
    DocumentId old_document_id = term_id_hit_pair.hit().document_id();
    DocumentId new_document_id =
        old_document_id >= 0 && old_document_id < document_id_old_to_new.size()
            ? document_id_old_to_new[old_document_id]
            : kInvalidDocumentId;
    if (new_document_id == kInvalidDocumentId) {
      continue;
    }
    if (term_id_hit_pair.hit().is_in_prefix_section()) {
      lexicon_.SetProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
    }
    tvi_to_delete.erase(curr_tvi);
    TermIdHitPair new_term_id_hit_pair(
        term_id_hit_pair.term_id(),
        Hit::TranslateHit(term_id_hit_pair.hit(), new_document_id));
    // Rewrite the hit_buffer in place. new_size is never greater than idx, so
    // we are okay to overwrite the entry at new_size, and valp should never be
    // nullptr since it is within the already allocated region of hit_buffer_.
    TermIdHitPair::Value* valp =
        hit_buffer_.GetMutableMem<TermIdHitPair::Value>(new_size++, 1);
    if (valp == nullptr) {
      // This really shouldn't happen since we are only writing to the already
      // allocated region of hit_buffer_. But just in case, we log and return an
      // error here.
      ICING_LOG(ERROR)
          << "GetMutableMem failed in Optimize. This should never happen.";
      return absl_ports::ResourceExhaustedError(
          "Allocating more space in hit buffer failed!");
    }
    *valp = new_term_id_hit_pair.value();
  }
  header_->set_cur_size(new_size);
  header_->set_searchable_end(new_size);

  // Delete unused terms.
  std::unordered_set<std::string> terms_to_delete;
  for (IcingDynamicTrie::Iterator term_iter(lexicon_, /*prefix=*/"");
       term_iter.IsValid(); term_iter.Advance()) {
    if (tvi_to_delete.find(term_iter.GetValueIndex()) != tvi_to_delete.end()) {
      terms_to_delete.insert(std::string(term_iter.GetKey()));
    }
  }
  for (const std::string& term : terms_to_delete) {
    // Mark "term" as deleted. This won't actually free space in the lexicon;
    // it simply makes it impossible to Find "term" in subsequent calls (which
    // saves an unnecessary search through the hit buffer). This is acceptable
    // because the free space will eventually be reclaimed the next time the
    // lite index is merged with the main index.
    if (!lexicon_.Delete(term)) {
      return absl_ports::InternalError(
          "Could not delete invalid terms in lite lexicon during compaction.");
    }
  }
  return libtextclassifier3::Status::OK;
}

}  // namespace lib
}  // namespace icing