1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/index.h"
16
17 #include <algorithm>
18 #include <cstddef>
19 #include <cstdint>
20 #include <memory>
21 #include <string>
22 #include <string_view>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/file/filesystem.h"
31 #include "icing/index/hit/hit.h"
32 #include "icing/index/iterator/doc-hit-info-iterator-or.h"
33 #include "icing/index/iterator/doc-hit-info-iterator.h"
34 #include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
35 #include "icing/index/lite/lite-index.h"
36 #include "icing/index/main/doc-hit-info-iterator-term-main.h"
37 #include "icing/index/main/main-index.h"
38 #include "icing/index/term-id-codec.h"
39 #include "icing/index/term-metadata.h"
40 #include "icing/legacy/core/icing-string-util.h"
41 #include "icing/legacy/index/icing-dynamic-trie.h"
42 #include "icing/legacy/index/icing-filesystem.h"
43 #include "icing/proto/scoring.pb.h"
44 #include "icing/proto/storage.pb.h"
45 #include "icing/proto/term.pb.h"
46 #include "icing/schema/section.h"
47 #include "icing/scoring/ranker.h"
48 #include "icing/store/document-id.h"
49 #include "icing/store/suggestion-result-checker.h"
50 #include "icing/util/logging.h"
51 #include "icing/util/status-macros.h"
52
53 namespace icing {
54 namespace lib {
55
56 namespace {
57
CreateLiteIndexOptions(const Index::Options & options)58 libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions(
59 const Index::Options& options) {
60 if (options.index_merge_size <= 0) {
61 return absl_ports::InvalidArgumentError(
62 "Requested hit buffer size must be greater than 0.");
63 }
64 if (options.index_merge_size > LiteIndex::max_hit_buffer_size()) {
65 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
66 "Requested hit buffer size %d is too large.",
67 options.index_merge_size));
68 }
69 return LiteIndex::Options(
70 options.base_dir + "/idx/lite.", options.index_merge_size,
71 options.lite_index_sort_at_indexing, options.lite_index_sort_size);
72 }
73
MakeMainIndexFilepath(const std::string & base_dir)74 std::string MakeMainIndexFilepath(const std::string& base_dir) {
75 return base_dir + "/idx/main";
76 }
77
GetMainLexiconOptions()78 IcingDynamicTrie::Options GetMainLexiconOptions() {
79 // The default values for IcingDynamicTrie::Options is fine for the main
80 // lexicon.
81 return IcingDynamicTrie::Options();
82 }
83
84 enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms };
85
86 // Merge the TermMetadata from lite index and main index. If the term exists in
87 // both index, sum up its hit count and push it to the term heap.
88 // The heap is a min-heap. So that we can avoid some push operation but the time
89 // complexity is O(NlgK) which N is total number of term and K is num_to_return.
MergeAndRankTermMetadatas(std::vector<TermMetadata> lite_term_metadata_list,std::vector<TermMetadata> main_term_metadata_list,int num_to_return)90 std::vector<TermMetadata> MergeAndRankTermMetadatas(
91 std::vector<TermMetadata> lite_term_metadata_list,
92 std::vector<TermMetadata> main_term_metadata_list, int num_to_return) {
93 std::vector<TermMetadata> merged_term_metadata_heap;
94 merged_term_metadata_heap.reserve(
95 std::min(lite_term_metadata_list.size() + main_term_metadata_list.size(),
96 static_cast<size_t>(num_to_return)));
97
98 auto lite_term_itr = lite_term_metadata_list.begin();
99 auto main_term_itr = main_term_metadata_list.begin();
100 MergeAction merge_action;
101 while (lite_term_itr != lite_term_metadata_list.end() ||
102 main_term_itr != main_term_metadata_list.end()) {
103 // Get pointers to the next metadatas in each group, if available
104 // Determine how to merge.
105 if (main_term_itr == main_term_metadata_list.end()) {
106 merge_action = MergeAction::kTakeLiteTerm;
107 } else if (lite_term_itr == lite_term_metadata_list.end()) {
108 merge_action = MergeAction::kTakeMainTerm;
109 } else if (lite_term_itr->content < main_term_itr->content) {
110 merge_action = MergeAction::kTakeLiteTerm;
111 } else if (main_term_itr->content < lite_term_itr->content) {
112 merge_action = MergeAction::kTakeMainTerm;
113 } else {
114 // The next metadatas refer to the same term. Combine them.
115 merge_action = MergeAction::kMergeTerms;
116 }
117 switch (merge_action) {
118 case MergeAction::kTakeLiteTerm:
119 PushToTermHeap(std::move(*lite_term_itr), num_to_return,
120 merged_term_metadata_heap);
121 ++lite_term_itr;
122 break;
123 case MergeAction::kTakeMainTerm:
124 PushToTermHeap(std::move(*main_term_itr), num_to_return,
125 merged_term_metadata_heap);
126 ++main_term_itr;
127 break;
128 case MergeAction::kMergeTerms:
129 int total_est_hit_count = lite_term_itr->score + main_term_itr->score;
130 PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
131 total_est_hit_count),
132 num_to_return, merged_term_metadata_heap);
133 ++lite_term_itr;
134 ++main_term_itr;
135 break;
136 }
137 }
138 // Reverse the list since we pop them from a min heap and we need to return in
139 // decreasing order.
140 std::vector<TermMetadata> merged_term_metadata_list =
141 PopAllTermsFromHeap(merged_term_metadata_heap);
142 std::reverse(merged_term_metadata_list.begin(),
143 merged_term_metadata_list.end());
144 return merged_term_metadata_list;
145 }
146
147 } // namespace
148
Create(const Options & options,const Filesystem * filesystem,const IcingFilesystem * icing_filesystem)149 libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create(
150 const Options& options, const Filesystem* filesystem,
151 const IcingFilesystem* icing_filesystem) {
152 ICING_RETURN_ERROR_IF_NULL(filesystem);
153 ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
154
155 ICING_ASSIGN_OR_RETURN(LiteIndex::Options lite_index_options,
156 CreateLiteIndexOptions(options));
157 ICING_ASSIGN_OR_RETURN(
158 std::unique_ptr<TermIdCodec> term_id_codec,
159 TermIdCodec::Create(
160 IcingDynamicTrie::max_value_index(GetMainLexiconOptions()),
161 IcingDynamicTrie::max_value_index(
162 lite_index_options.lexicon_options)));
163
164 ICING_ASSIGN_OR_RETURN(
165 std::unique_ptr<LiteIndex> lite_index,
166 LiteIndex::Create(lite_index_options, icing_filesystem));
167 // Sort the lite index if we've enabled sorting the HitBuffer at indexing
168 // time, and there's an unsorted tail exceeding the threshold.
169 if (options.lite_index_sort_at_indexing &&
170 lite_index->HasUnsortedHitsExceedingSortThreshold()) {
171 lite_index->SortHits();
172 }
173
174 ICING_ASSIGN_OR_RETURN(
175 std::unique_ptr<MainIndex> main_index,
176 MainIndex::Create(MakeMainIndexFilepath(options.base_dir), filesystem,
177 icing_filesystem));
178 return std::unique_ptr<Index>(new Index(options, std::move(term_id_codec),
179 std::move(lite_index),
180 std::move(main_index), filesystem));
181 }
182
ReadFlashIndexMagic(const Filesystem * filesystem,const std::string & base_dir)183 /* static */ libtextclassifier3::StatusOr<int> Index::ReadFlashIndexMagic(
184 const Filesystem* filesystem, const std::string& base_dir) {
185 return MainIndex::ReadFlashIndexMagic(filesystem,
186 MakeMainIndexFilepath(base_dir));
187 }
188
TruncateTo(DocumentId document_id)189 libtextclassifier3::Status Index::TruncateTo(DocumentId document_id) {
190 if (lite_index_->last_added_document_id() != kInvalidDocumentId &&
191 lite_index_->last_added_document_id() > document_id) {
192 ICING_VLOG(1) << "Clipping to " << document_id
193 << ". Throwing out lite index which is at "
194 << lite_index_->last_added_document_id();
195 ICING_RETURN_IF_ERROR(lite_index_->Reset());
196 }
197 if (main_index_->last_added_document_id() != kInvalidDocumentId &&
198 main_index_->last_added_document_id() > document_id) {
199 ICING_VLOG(1) << "Clipping to " << document_id
200 << ". Throwing out lite index which is at "
201 << main_index_->last_added_document_id();
202 ICING_RETURN_IF_ERROR(main_index_->Reset());
203 }
204 return libtextclassifier3::Status::OK;
205 }
206
207 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
GetIterator(const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_id_mask,TermMatchType::Code term_match_type,bool need_hit_term_frequency)208 Index::GetIterator(const std::string& term, int term_start_index,
209 int unnormalized_term_length, SectionIdMask section_id_mask,
210 TermMatchType::Code term_match_type,
211 bool need_hit_term_frequency) {
212 std::unique_ptr<DocHitInfoIterator> lite_itr;
213 std::unique_ptr<DocHitInfoIterator> main_itr;
214 switch (term_match_type) {
215 case TermMatchType::EXACT_ONLY:
216 lite_itr = std::make_unique<DocHitInfoIteratorTermLiteExact>(
217 term_id_codec_.get(), lite_index_.get(), term, term_start_index,
218 unnormalized_term_length, section_id_mask, need_hit_term_frequency);
219 main_itr = std::make_unique<DocHitInfoIteratorTermMainExact>(
220 main_index_.get(), term, term_start_index, unnormalized_term_length,
221 section_id_mask, need_hit_term_frequency);
222 break;
223 case TermMatchType::PREFIX:
224 lite_itr = std::make_unique<DocHitInfoIteratorTermLitePrefix>(
225 term_id_codec_.get(), lite_index_.get(), term, term_start_index,
226 unnormalized_term_length, section_id_mask, need_hit_term_frequency);
227 main_itr = std::make_unique<DocHitInfoIteratorTermMainPrefix>(
228 main_index_.get(), term, term_start_index, unnormalized_term_length,
229 section_id_mask, need_hit_term_frequency);
230 break;
231 default:
232 return absl_ports::InvalidArgumentError(
233 absl_ports::StrCat("Invalid TermMatchType: ",
234 TermMatchType::Code_Name(term_match_type)));
235 }
236 return std::make_unique<DocHitInfoIteratorOr>(std::move(lite_itr),
237 std::move(main_itr));
238 }
239
240 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
FindLiteTermsByPrefix(const std::string & prefix,SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,const SuggestionResultChecker * suggestion_result_checker)241 Index::FindLiteTermsByPrefix(
242 const std::string& prefix,
243 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
244 const SuggestionResultChecker* suggestion_result_checker) {
245 // Finds all the terms that start with the given prefix in the lexicon.
246 IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(), prefix);
247
248 std::vector<TermMetadata> term_metadata_list;
249 while (term_iterator.IsValid()) {
250 uint32_t term_value_index = term_iterator.GetValueIndex();
251
252 ICING_ASSIGN_OR_RETURN(
253 uint32_t term_id,
254 term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
255 absl_ports::InternalError("Failed to access terms in lexicon."));
256 ICING_ASSIGN_OR_RETURN(
257 int hit_score,
258 lite_index_->ScoreHits(term_id, score_by, suggestion_result_checker));
259 if (hit_score > 0) {
260 // There is at least one document in the given namespace has this term.
261 term_metadata_list.push_back(
262 TermMetadata(std::string(term_iterator.GetKey()), hit_score));
263 }
264
265 term_iterator.Advance();
266 }
267 return term_metadata_list;
268 }
269
270 libtextclassifier3::StatusOr<std::vector<TermMetadata>>
FindTermsByPrefix(const std::string & prefix,int num_to_return,TermMatchType::Code scoring_match_type,SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,const SuggestionResultChecker * suggestion_result_checker)271 Index::FindTermsByPrefix(
272 const std::string& prefix, int num_to_return,
273 TermMatchType::Code scoring_match_type,
274 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
275 const SuggestionResultChecker* suggestion_result_checker) {
276 std::vector<TermMetadata> term_metadata_list;
277 if (num_to_return <= 0) {
278 return term_metadata_list;
279 }
280 // Get results from the LiteIndex.
281 ICING_ASSIGN_OR_RETURN(
282 std::vector<TermMetadata> lite_term_metadata_list,
283 FindLiteTermsByPrefix(prefix, rank_by, suggestion_result_checker));
284 // Append results from the MainIndex.
285 ICING_ASSIGN_OR_RETURN(
286 std::vector<TermMetadata> main_term_metadata_list,
287 main_index_->FindTermsByPrefix(prefix, scoring_match_type, rank_by,
288 suggestion_result_checker));
289 return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list),
290 std::move(main_term_metadata_list),
291 num_to_return);
292 }
293
GetStorageInfo() const294 IndexStorageInfoProto Index::GetStorageInfo() const {
295 IndexStorageInfoProto storage_info;
296 int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
297 storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
298 storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
299 return main_index_->GetStorageInfo(std::move(storage_info));
300 }
301
Optimize(const std::vector<DocumentId> & document_id_old_to_new,DocumentId new_last_added_document_id)302 libtextclassifier3::Status Index::Optimize(
303 const std::vector<DocumentId>& document_id_old_to_new,
304 DocumentId new_last_added_document_id) {
305 if (main_index_->last_added_document_id() != kInvalidDocumentId) {
306 ICING_RETURN_IF_ERROR(main_index_->Optimize(document_id_old_to_new));
307 }
308 return lite_index_->Optimize(document_id_old_to_new, term_id_codec_.get(),
309 new_last_added_document_id);
310 }
311
BufferTerm(std::string_view term,TermMatchType::Code match_type)312 libtextclassifier3::Status Index::Editor::BufferTerm(
313 std::string_view term, TermMatchType::Code match_type) {
314 // Step 1: See if this term is already in the lexicon
315 uint32_t tvi;
316 auto tvi_or = lite_index_->GetTermId(term);
317
318 // Step 2: Update the lexicon, either add the term or update its properties
319 if (tvi_or.ok()) {
320 tvi = tvi_or.ValueOrDie();
321 auto itr = seen_tokens_.find(tvi);
322 if (itr != seen_tokens_.end()) {
323 HitDetails& hit_details = itr->second;
324 if (hit_details.match_type == TermMatchType::EXACT_ONLY &&
325 match_type == TermMatchType::PREFIX) {
326 // If the same term is added multiple times, but with different match
327 // types, we will always update the match type to PREFIX.
328 ICING_VLOG(1) << "Updating term match type for term " << term;
329 hit_details.match_type = TermMatchType::PREFIX;
330 }
331 if (hit_details.term_frequency != Hit::kMaxTermFrequency) {
332 ICING_VLOG(1) << "Updating term frequency for term " << term;
333 ++hit_details.term_frequency;
334 }
335 return libtextclassifier3::Status::OK;
336 }
337 ICING_VLOG(1) << "Term " << term
338 << " is already present in lexicon. Updating.";
339 // Already in the lexicon. Just update the properties.
340 ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties(
341 tvi, match_type == TermMatchType::PREFIX, namespace_id_));
342 } else {
343 ICING_VLOG(1) << "Term " << term << " is not in lexicon. Inserting.";
344 // Haven't seen this term before. Add it to the lexicon.
345 ICING_ASSIGN_OR_RETURN(
346 tvi, lite_index_->InsertTerm(term, match_type, namespace_id_));
347 }
348 // Token seen for the first time in the current document.
349 seen_tokens_[tvi] = {match_type, /*term_frequency=*/1};
350 return libtextclassifier3::Status::OK;
351 }
352
IndexAllBufferedTerms()353 libtextclassifier3::Status Index::Editor::IndexAllBufferedTerms() {
354 for (auto itr = seen_tokens_.begin(); itr != seen_tokens_.end(); itr++) {
355 Hit hit(section_id_, document_id_, itr->second.term_frequency,
356 /*is_in_prefix_section=*/itr->second.match_type ==
357 TermMatchType::PREFIX,
358 /*is_prefix_hit=*/false, /*is_stemmed_hit=*/false);
359 ICING_ASSIGN_OR_RETURN(
360 uint32_t term_id, term_id_codec_->EncodeTvi(itr->first, TviType::LITE));
361 ICING_RETURN_IF_ERROR(lite_index_->AddHit(term_id, hit));
362 }
363 return libtextclassifier3::Status::OK;
364 }
365
366 } // namespace lib
367 } // namespace icing
368