xref: /aosp_15_r20/external/icing/icing/index/index.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/index.h"
16 
17 #include <algorithm>
18 #include <cstddef>
19 #include <cstdint>
20 #include <memory>
21 #include <string>
22 #include <string_view>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/file/filesystem.h"
31 #include "icing/index/hit/hit.h"
32 #include "icing/index/iterator/doc-hit-info-iterator-or.h"
33 #include "icing/index/iterator/doc-hit-info-iterator.h"
34 #include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
35 #include "icing/index/lite/lite-index.h"
36 #include "icing/index/main/doc-hit-info-iterator-term-main.h"
37 #include "icing/index/main/main-index.h"
38 #include "icing/index/term-id-codec.h"
39 #include "icing/index/term-metadata.h"
40 #include "icing/legacy/core/icing-string-util.h"
41 #include "icing/legacy/index/icing-dynamic-trie.h"
42 #include "icing/legacy/index/icing-filesystem.h"
43 #include "icing/proto/scoring.pb.h"
44 #include "icing/proto/storage.pb.h"
45 #include "icing/proto/term.pb.h"
46 #include "icing/schema/section.h"
47 #include "icing/scoring/ranker.h"
48 #include "icing/store/document-id.h"
49 #include "icing/store/suggestion-result-checker.h"
50 #include "icing/util/logging.h"
51 #include "icing/util/status-macros.h"
52 
53 namespace icing {
54 namespace lib {
55 
56 namespace {
57 
CreateLiteIndexOptions(const Index::Options & options)58 libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions(
59     const Index::Options& options) {
60   if (options.index_merge_size <= 0) {
61     return absl_ports::InvalidArgumentError(
62         "Requested hit buffer size must be greater than 0.");
63   }
64   if (options.index_merge_size > LiteIndex::max_hit_buffer_size()) {
65     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
66         "Requested hit buffer size %d is too large.",
67         options.index_merge_size));
68   }
69   return LiteIndex::Options(
70       options.base_dir + "/idx/lite.", options.index_merge_size,
71       options.lite_index_sort_at_indexing, options.lite_index_sort_size);
72 }
73 
MakeMainIndexFilepath(const std::string & base_dir)74 std::string MakeMainIndexFilepath(const std::string& base_dir) {
75   return base_dir + "/idx/main";
76 }
77 
GetMainLexiconOptions()78 IcingDynamicTrie::Options GetMainLexiconOptions() {
79   // The default values for IcingDynamicTrie::Options is fine for the main
80   // lexicon.
81   return IcingDynamicTrie::Options();
82 }
83 
84 enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms };
85 
86 // Merge the TermMetadata from lite index and main index. If the term exists in
87 // both index, sum up its hit count and push it to the term heap.
88 // The heap is a min-heap. So that we can avoid some push operation but the time
89 // complexity is O(NlgK) which N is total number of term and K is num_to_return.
MergeAndRankTermMetadatas(std::vector<TermMetadata> lite_term_metadata_list,std::vector<TermMetadata> main_term_metadata_list,int num_to_return)90 std::vector<TermMetadata> MergeAndRankTermMetadatas(
91     std::vector<TermMetadata> lite_term_metadata_list,
92     std::vector<TermMetadata> main_term_metadata_list, int num_to_return) {
93   std::vector<TermMetadata> merged_term_metadata_heap;
94   merged_term_metadata_heap.reserve(
95       std::min(lite_term_metadata_list.size() + main_term_metadata_list.size(),
96                static_cast<size_t>(num_to_return)));
97 
98   auto lite_term_itr = lite_term_metadata_list.begin();
99   auto main_term_itr = main_term_metadata_list.begin();
100   MergeAction merge_action;
101   while (lite_term_itr != lite_term_metadata_list.end() ||
102          main_term_itr != main_term_metadata_list.end()) {
103     // Get pointers to the next metadatas in each group, if available
104     // Determine how to merge.
105     if (main_term_itr == main_term_metadata_list.end()) {
106       merge_action = MergeAction::kTakeLiteTerm;
107     } else if (lite_term_itr == lite_term_metadata_list.end()) {
108       merge_action = MergeAction::kTakeMainTerm;
109     } else if (lite_term_itr->content < main_term_itr->content) {
110       merge_action = MergeAction::kTakeLiteTerm;
111     } else if (main_term_itr->content < lite_term_itr->content) {
112       merge_action = MergeAction::kTakeMainTerm;
113     } else {
114       // The next metadatas refer to the same term. Combine them.
115       merge_action = MergeAction::kMergeTerms;
116     }
117     switch (merge_action) {
118       case MergeAction::kTakeLiteTerm:
119         PushToTermHeap(std::move(*lite_term_itr), num_to_return,
120                        merged_term_metadata_heap);
121         ++lite_term_itr;
122         break;
123       case MergeAction::kTakeMainTerm:
124         PushToTermHeap(std::move(*main_term_itr), num_to_return,
125                        merged_term_metadata_heap);
126         ++main_term_itr;
127         break;
128       case MergeAction::kMergeTerms:
129         int total_est_hit_count = lite_term_itr->score + main_term_itr->score;
130         PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
131                                     total_est_hit_count),
132                        num_to_return, merged_term_metadata_heap);
133         ++lite_term_itr;
134         ++main_term_itr;
135         break;
136     }
137   }
138   // Reverse the list since we pop them from a min heap and we need to return in
139   // decreasing order.
140   std::vector<TermMetadata> merged_term_metadata_list =
141       PopAllTermsFromHeap(merged_term_metadata_heap);
142   std::reverse(merged_term_metadata_list.begin(),
143                merged_term_metadata_list.end());
144   return merged_term_metadata_list;
145 }
146 
147 }  // namespace
148 
Create(const Options & options,const Filesystem * filesystem,const IcingFilesystem * icing_filesystem)149 libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create(
150     const Options& options, const Filesystem* filesystem,
151     const IcingFilesystem* icing_filesystem) {
152   ICING_RETURN_ERROR_IF_NULL(filesystem);
153   ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
154 
155   ICING_ASSIGN_OR_RETURN(LiteIndex::Options lite_index_options,
156                          CreateLiteIndexOptions(options));
157   ICING_ASSIGN_OR_RETURN(
158       std::unique_ptr<TermIdCodec> term_id_codec,
159       TermIdCodec::Create(
160           IcingDynamicTrie::max_value_index(GetMainLexiconOptions()),
161           IcingDynamicTrie::max_value_index(
162               lite_index_options.lexicon_options)));
163 
164   ICING_ASSIGN_OR_RETURN(
165       std::unique_ptr<LiteIndex> lite_index,
166       LiteIndex::Create(lite_index_options, icing_filesystem));
167   // Sort the lite index if we've enabled sorting the HitBuffer at indexing
168   // time, and there's an unsorted tail exceeding the threshold.
169   if (options.lite_index_sort_at_indexing &&
170       lite_index->HasUnsortedHitsExceedingSortThreshold()) {
171     lite_index->SortHits();
172   }
173 
174   ICING_ASSIGN_OR_RETURN(
175       std::unique_ptr<MainIndex> main_index,
176       MainIndex::Create(MakeMainIndexFilepath(options.base_dir), filesystem,
177                         icing_filesystem));
178   return std::unique_ptr<Index>(new Index(options, std::move(term_id_codec),
179                                           std::move(lite_index),
180                                           std::move(main_index), filesystem));
181 }
182 
ReadFlashIndexMagic(const Filesystem * filesystem,const std::string & base_dir)183 /* static */ libtextclassifier3::StatusOr<int> Index::ReadFlashIndexMagic(
184     const Filesystem* filesystem, const std::string& base_dir) {
185   return MainIndex::ReadFlashIndexMagic(filesystem,
186                                         MakeMainIndexFilepath(base_dir));
187 }
188 
TruncateTo(DocumentId document_id)189 libtextclassifier3::Status Index::TruncateTo(DocumentId document_id) {
190   if (lite_index_->last_added_document_id() != kInvalidDocumentId &&
191       lite_index_->last_added_document_id() > document_id) {
192     ICING_VLOG(1) << "Clipping to " << document_id
193                   << ". Throwing out lite index which is at "
194                   << lite_index_->last_added_document_id();
195     ICING_RETURN_IF_ERROR(lite_index_->Reset());
196   }
197   if (main_index_->last_added_document_id() != kInvalidDocumentId &&
198       main_index_->last_added_document_id() > document_id) {
199     ICING_VLOG(1) << "Clipping to " << document_id
200                   << ". Throwing out lite index which is at "
201                   << main_index_->last_added_document_id();
202     ICING_RETURN_IF_ERROR(main_index_->Reset());
203   }
204   return libtextclassifier3::Status::OK;
205 }
206 
207 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
GetIterator(const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_id_mask,TermMatchType::Code term_match_type,bool need_hit_term_frequency)208 Index::GetIterator(const std::string& term, int term_start_index,
209                    int unnormalized_term_length, SectionIdMask section_id_mask,
210                    TermMatchType::Code term_match_type,
211                    bool need_hit_term_frequency) {
212   std::unique_ptr<DocHitInfoIterator> lite_itr;
213   std::unique_ptr<DocHitInfoIterator> main_itr;
214   switch (term_match_type) {
215     case TermMatchType::EXACT_ONLY:
216       lite_itr = std::make_unique<DocHitInfoIteratorTermLiteExact>(
217           term_id_codec_.get(), lite_index_.get(), term, term_start_index,
218           unnormalized_term_length, section_id_mask, need_hit_term_frequency);
219       main_itr = std::make_unique<DocHitInfoIteratorTermMainExact>(
220           main_index_.get(), term, term_start_index, unnormalized_term_length,
221           section_id_mask, need_hit_term_frequency);
222       break;
223     case TermMatchType::PREFIX:
224       lite_itr = std::make_unique<DocHitInfoIteratorTermLitePrefix>(
225           term_id_codec_.get(), lite_index_.get(), term, term_start_index,
226           unnormalized_term_length, section_id_mask, need_hit_term_frequency);
227       main_itr = std::make_unique<DocHitInfoIteratorTermMainPrefix>(
228           main_index_.get(), term, term_start_index, unnormalized_term_length,
229           section_id_mask, need_hit_term_frequency);
230       break;
231     default:
232       return absl_ports::InvalidArgumentError(
233           absl_ports::StrCat("Invalid TermMatchType: ",
234                              TermMatchType::Code_Name(term_match_type)));
235   }
236   return std::make_unique<DocHitInfoIteratorOr>(std::move(lite_itr),
237                                                 std::move(main_itr));
238 }
239 
240 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
FindLiteTermsByPrefix(const std::string & prefix,SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,const SuggestionResultChecker * suggestion_result_checker)241 Index::FindLiteTermsByPrefix(
242     const std::string& prefix,
243     SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
244     const SuggestionResultChecker* suggestion_result_checker) {
245   // Finds all the terms that start with the given prefix in the lexicon.
246   IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(), prefix);
247 
248   std::vector<TermMetadata> term_metadata_list;
249   while (term_iterator.IsValid()) {
250     uint32_t term_value_index = term_iterator.GetValueIndex();
251 
252     ICING_ASSIGN_OR_RETURN(
253         uint32_t term_id,
254         term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
255         absl_ports::InternalError("Failed to access terms in lexicon."));
256     ICING_ASSIGN_OR_RETURN(
257         int hit_score,
258         lite_index_->ScoreHits(term_id, score_by, suggestion_result_checker));
259     if (hit_score > 0) {
260       // There is at least one document in the given namespace has this term.
261       term_metadata_list.push_back(
262           TermMetadata(std::string(term_iterator.GetKey()), hit_score));
263     }
264 
265     term_iterator.Advance();
266   }
267   return term_metadata_list;
268 }
269 
270 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
FindTermsByPrefix(const std::string & prefix,int num_to_return,TermMatchType::Code scoring_match_type,SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,const SuggestionResultChecker * suggestion_result_checker)271 Index::FindTermsByPrefix(
272     const std::string& prefix, int num_to_return,
273     TermMatchType::Code scoring_match_type,
274     SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
275     const SuggestionResultChecker* suggestion_result_checker) {
276   std::vector<TermMetadata> term_metadata_list;
277   if (num_to_return <= 0) {
278     return term_metadata_list;
279   }
280   // Get results from the LiteIndex.
281   ICING_ASSIGN_OR_RETURN(
282       std::vector<TermMetadata> lite_term_metadata_list,
283       FindLiteTermsByPrefix(prefix, rank_by, suggestion_result_checker));
284   // Append results from the MainIndex.
285   ICING_ASSIGN_OR_RETURN(
286       std::vector<TermMetadata> main_term_metadata_list,
287       main_index_->FindTermsByPrefix(prefix, scoring_match_type, rank_by,
288                                      suggestion_result_checker));
289   return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list),
290                                    std::move(main_term_metadata_list),
291                                    num_to_return);
292 }
293 
GetStorageInfo() const294 IndexStorageInfoProto Index::GetStorageInfo() const {
295   IndexStorageInfoProto storage_info;
296   int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
297   storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
298   storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
299   return main_index_->GetStorageInfo(std::move(storage_info));
300 }
301 
Optimize(const std::vector<DocumentId> & document_id_old_to_new,DocumentId new_last_added_document_id)302 libtextclassifier3::Status Index::Optimize(
303     const std::vector<DocumentId>& document_id_old_to_new,
304     DocumentId new_last_added_document_id) {
305   if (main_index_->last_added_document_id() != kInvalidDocumentId) {
306     ICING_RETURN_IF_ERROR(main_index_->Optimize(document_id_old_to_new));
307   }
308   return lite_index_->Optimize(document_id_old_to_new, term_id_codec_.get(),
309                                new_last_added_document_id);
310 }
311 
BufferTerm(std::string_view term,TermMatchType::Code match_type)312 libtextclassifier3::Status Index::Editor::BufferTerm(
313     std::string_view term, TermMatchType::Code match_type) {
314   // Step 1: See if this term is already in the lexicon
315   uint32_t tvi;
316   auto tvi_or = lite_index_->GetTermId(term);
317 
318   // Step 2: Update the lexicon, either add the term or update its properties
319   if (tvi_or.ok()) {
320     tvi = tvi_or.ValueOrDie();
321     auto itr = seen_tokens_.find(tvi);
322     if (itr != seen_tokens_.end()) {
323       HitDetails& hit_details = itr->second;
324       if (hit_details.match_type == TermMatchType::EXACT_ONLY &&
325           match_type == TermMatchType::PREFIX) {
326         // If the same term is added multiple times, but with different match
327         // types, we will always update the match type to PREFIX.
328         ICING_VLOG(1) << "Updating term match type for term " << term;
329         hit_details.match_type = TermMatchType::PREFIX;
330       }
331       if (hit_details.term_frequency != Hit::kMaxTermFrequency) {
332         ICING_VLOG(1) << "Updating term frequency for term " << term;
333         ++hit_details.term_frequency;
334       }
335       return libtextclassifier3::Status::OK;
336     }
337     ICING_VLOG(1) << "Term " << term
338                   << " is already present in lexicon. Updating.";
339     // Already in the lexicon. Just update the properties.
340     ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties(
341         tvi, match_type == TermMatchType::PREFIX, namespace_id_));
342   } else {
343     ICING_VLOG(1) << "Term " << term << " is not in lexicon. Inserting.";
344     // Haven't seen this term before. Add it to the lexicon.
345     ICING_ASSIGN_OR_RETURN(
346         tvi, lite_index_->InsertTerm(term, match_type, namespace_id_));
347   }
348   // Token seen for the first time in the current document.
349   seen_tokens_[tvi] = {match_type, /*term_frequency=*/1};
350   return libtextclassifier3::Status::OK;
351 }
352 
IndexAllBufferedTerms()353 libtextclassifier3::Status Index::Editor::IndexAllBufferedTerms() {
354   for (auto itr = seen_tokens_.begin(); itr != seen_tokens_.end(); itr++) {
355     Hit hit(section_id_, document_id_, itr->second.term_frequency,
356             /*is_in_prefix_section=*/itr->second.match_type ==
357                 TermMatchType::PREFIX,
358             /*is_prefix_hit=*/false, /*is_stemmed_hit=*/false);
359     ICING_ASSIGN_OR_RETURN(
360         uint32_t term_id, term_id_codec_->EncodeTvi(itr->first, TviType::LITE));
361     ICING_RETURN_IF_ERROR(lite_index_->AddHit(term_id, hit));
362   }
363   return libtextclassifier3::Status::OK;
364 }
365 
366 }  // namespace lib
367 }  // namespace icing
368