xref: /aosp_15_r20/external/icing/icing/index/index.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_INDEX_H_
16 #define ICING_INDEX_INDEX_H_
17 
18 #include <cstdint>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/file/filesystem.h"
29 #include "icing/index/hit/hit.h"
30 #include "icing/index/iterator/doc-hit-info-iterator.h"
31 #include "icing/index/lite/lite-index.h"
32 #include "icing/index/lite/term-id-hit-pair.h"
33 #include "icing/index/main/main-index-merger.h"
34 #include "icing/index/main/main-index.h"
35 #include "icing/index/term-id-codec.h"
36 #include "icing/index/term-metadata.h"
37 #include "icing/legacy/index/icing-filesystem.h"
38 #include "icing/proto/debug.pb.h"
39 #include "icing/proto/logging.pb.h"
40 #include "icing/proto/scoring.pb.h"
41 #include "icing/proto/storage.pb.h"
42 #include "icing/proto/term.pb.h"
43 #include "icing/schema/section.h"
44 #include "icing/store/document-id.h"
45 #include "icing/store/namespace-id.h"
46 #include "icing/store/suggestion-result-checker.h"
47 #include "icing/util/status-macros.h"
48 
49 namespace icing {
50 namespace lib {
51 
52 // The class representing the Icing search index. This index maps terms to hits
53 // (document_ids, section_ids).
54 // Content is added to the index through the Editor class - which also dedupes
55 // hits (calling Editor::AddHit with the same arguments will only result in the
56 // creation of a single hit).
57 // Ex.
58 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index,
59 // .                Index::Create(MakeIndexOptions()));
60 // Index::Editor editor = index->Edit(document_id, section_id,
61 //     TermMatchType::EXACT_ONLY); ICING_RETURN_IF_ERROR(editor.AddHit("foo"));
62 // ICING_RETURN_IF_ERROR(editor.AddHit("baz"));
63 //
64 // Content is retrieved from the index through the Iterator class.
65 // Ex.
66 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index,
67 // .                Index::Create(MakeIndexOptions()));
68 // ICING_ASSIGN_OR_RETURN(Index::Iterator iterator =
69 //     index->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
70 // while(iterator->Advance().ok())
71 //   ProcessResult(iterator->value());
72 class Index {
73  public:
74   struct Options {
OptionsOptions75     explicit Options(const std::string& base_dir, uint32_t index_merge_size,
76                      bool lite_index_sort_at_indexing,
77                      uint32_t lite_index_sort_size)
78         : base_dir(base_dir),
79           index_merge_size(index_merge_size),
80           lite_index_sort_at_indexing(lite_index_sort_at_indexing),
81           lite_index_sort_size(lite_index_sort_size) {}
82 
83     std::string base_dir;
84     int32_t index_merge_size;
85     bool lite_index_sort_at_indexing;
86     int32_t lite_index_sort_size;
87   };
88 
89   // Creates an instance of Index in the directory pointed by file_dir.
90   //
91   // Returns:
92   //   Valid Index on success
93   //   DATA_LOSS if the index was corrupt and had to be cleared
94   //   INVALID_ARGUMENT if options have invalid values
95   //   INTERNAL on I/O error
96   static libtextclassifier3::StatusOr<std::unique_ptr<Index>> Create(
97       const Options& options, const Filesystem* filesystem,
98       const IcingFilesystem* icing_filesystem);
99 
100   // Reads magic from existing flash (main) index file header. We need this
101   // during Icing initialization phase to determine the version.
102   //
103   // Returns
104   //   Valid magic on success
105   //   NOT_FOUND if the lite index doesn't exist
106   //   INTERNAL on I/O error
107   static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic(
108       const Filesystem* filesystem, const std::string& base_dir);
109 
110   // Clears all files created by the index. Returns OK if all files were
111   // cleared.
Reset()112   libtextclassifier3::Status Reset() {
113     ICING_RETURN_IF_ERROR(lite_index_->Reset());
114     return main_index_->Reset();
115   }
116 
117   // Brings components of the index into memory in anticipation of a query in
118   // order to reduce latency.
Warm()119   void Warm() {
120     lite_index_->Warm();
121     main_index_->Warm();
122   }
123 
124   // Syncs all the data and metadata changes to disk.
125   //
126   // Returns:
127   //   OK on success
128   //   INTERNAL on I/O errors
PersistToDisk()129   libtextclassifier3::Status PersistToDisk() {
130     ICING_RETURN_IF_ERROR(lite_index_->PersistToDisk());
131     return main_index_->PersistToDisk();
132   }
133 
134   // Updates all checksums in the index and returns the combined index checksum.
UpdateChecksum()135   Crc32 UpdateChecksum() {
136     Crc32 lite_crc = lite_index_->UpdateChecksum();
137     Crc32 main_crc = main_index_->UpdateChecksum();
138     main_crc.Append(std::to_string(lite_crc.Get()));
139     return main_crc;
140   }
141 
142   // Calculates and returns the combined index checksum.
GetChecksum()143   Crc32 GetChecksum() const {
144     Crc32 lite_crc = lite_index_->GetChecksum();
145     Crc32 main_crc = main_index_->GetChecksum();
146     main_crc.Append(std::to_string(lite_crc.Get()));
147     return main_crc;
148   }
149 
150   // Discard parts of the index if they contain data for document ids greater
151   // than document_id.
152   //
153   // NOTE: This means that TruncateTo(kInvalidDocumentId) will have no effect.
154   //
155   // Returns:
156   //   OK on success
157   //   INTERNAL on I/O errors
158   libtextclassifier3::Status TruncateTo(DocumentId document_id);
159 
160   // DocumentIds are always inserted in increasing order. Returns the largest
161   // document_id added to the index.
last_added_document_id()162   DocumentId last_added_document_id() const {
163     DocumentId lite_document_id = lite_index_->last_added_document_id();
164     if (lite_document_id != kInvalidDocumentId) {
165       return lite_document_id;
166     }
167     return main_index_->last_added_document_id();
168   }
169 
170   // Sets last_added_document_id to document_id so long as document_id >
171   // last_added_document_id()
set_last_added_document_id(DocumentId document_id)172   void set_last_added_document_id(DocumentId document_id) {
173     DocumentId lite_document_id = lite_index_->last_added_document_id();
174     if (lite_document_id == kInvalidDocumentId ||
175         document_id >= lite_document_id) {
176       lite_index_->set_last_added_document_id(document_id);
177     }
178   }
179 
180   // Returns debug information for the index in out.
181   // verbosity = BASIC, simplest debug information - just the lexicons and lite
182   //                    index.
183   // verbosity = DETAILED, more detailed debug information including raw
184   //                       postings lists.
GetDebugInfo(DebugInfoVerbosity::Code verbosity)185   IndexDebugInfoProto GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
186     IndexDebugInfoProto debug_info;
187     *debug_info.mutable_index_storage_info() = GetStorageInfo();
188     *debug_info.mutable_lite_index_info() =
189         lite_index_->GetDebugInfo(verbosity);
190     *debug_info.mutable_main_index_info() =
191         main_index_->GetDebugInfo(verbosity);
192     return debug_info;
193   }
194 
PublishQueryStats(QueryStatsProto * query_stats)195   void PublishQueryStats(QueryStatsProto* query_stats) const {
196     query_stats->set_lite_index_hit_buffer_byte_size(
197         lite_index_->GetHitBufferByteSize());
198     query_stats->set_lite_index_hit_buffer_unsorted_byte_size(
199         lite_index_->GetHitBufferUnsortedByteSize());
200   }
201 
202   // Returns the byte size of the all the elements held in the index. This
203   // excludes the size of any internal metadata of the index, e.g. the index's
204   // header.
205   //
206   // Returns:
207   //   Byte size on success
208   //   INTERNAL_ERROR on IO error
GetElementsSize()209   libtextclassifier3::StatusOr<int64_t> GetElementsSize() const {
210     ICING_ASSIGN_OR_RETURN(int64_t lite_index_size,
211                            lite_index_->GetElementsSize());
212     ICING_ASSIGN_OR_RETURN(int64_t main_index_size,
213                            main_index_->GetElementsSize());
214     return lite_index_size + main_index_size;
215   }
216 
217   // Calculates the StorageInfo for the Index.
218   //
219   // If an IO error occurs while trying to calculate the value for a field, then
220   // that field will be set to -1.
221   IndexStorageInfoProto GetStorageInfo() const;
222 
223   // Create an iterator to iterate through all doc hit infos in the index that
224   // match the term. term_start_index is the start index of the given term in
225   // the search query. unnormalized_term_length is the length of the given
226   // unnormalized term in the search query not listed in the mask.
227   // Eg. section_id_mask = 1U << 3; would only return hits that occur in
228   // section 3.
229   //
230   // Returns:
231   //   unique ptr to a valid DocHitInfoIterator that matches the term
232   //   INVALID_ARGUMENT if given an invalid term_match_type
233   libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
234       const std::string& term, int term_start_index,
235       int unnormalized_term_length, SectionIdMask section_id_mask,
236       TermMatchType::Code term_match_type, bool need_hit_term_frequency = true);
237 
238   // Finds terms with the given prefix in the given namespaces. If
239   // 'namespace_ids' is empty, returns results from all the namespaces. Results
240   // are sorted in decreasing order of hit count. Number of results are no more
241   // than 'num_to_return'.
242   //
243   // Returns:
244   //   A list of TermMetadata on success
245   //   INTERNAL_ERROR if failed to access term data.
246   libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
247       const std::string& prefix, int num_to_return,
248       TermMatchType::Code scoring_match_type,
249       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
250       const SuggestionResultChecker* suggestion_result_checker);
251 
252   // A class that can be used to add hits to the index.
253   //
254   // An editor groups hits from a particular section within a document together
255   // and dedupes hits for the same term within a section. This removes the
256   // burden of deduping from the caller and direct access to the index
257   // implementation allows for more efficient deduping.
258   class Editor {
259    public:
260     // Does not take any ownership, and all pointers must refer to valid objects
261     // that outlive the one constructed.
262     // TODO(b/141180665): Add nullptr checks for the raw pointers
Editor(const TermIdCodec * term_id_codec,LiteIndex * lite_index,DocumentId document_id,SectionId section_id,NamespaceId namespace_id)263     Editor(const TermIdCodec* term_id_codec, LiteIndex* lite_index,
264            DocumentId document_id, SectionId section_id,
265            NamespaceId namespace_id)
266         : term_id_codec_(term_id_codec),
267           lite_index_(lite_index),
268           document_id_(document_id),
269           namespace_id_(namespace_id),
270           section_id_(section_id) {}
271 
272     // Buffer the term in seen_tokens_.
273     libtextclassifier3::Status BufferTerm(std::string_view term,
274                                           TermMatchType::Code match_type);
275     // Index all the terms stored in seen_tokens_.
276     libtextclassifier3::Status IndexAllBufferedTerms();
277 
278    private:
279     // The Editor is able to store previously seen terms as TermIds. This is
280     // is more efficient than a client doing this externally because TermIds are
281     // not exposed to clients.
282     struct HitDetails {
283       TermMatchType::Code match_type;
284       Hit::TermFrequency term_frequency;
285     };
286     std::unordered_map<uint32_t, HitDetails> seen_tokens_;
287     const TermIdCodec* term_id_codec_;
288     LiteIndex* lite_index_;
289     DocumentId document_id_;
290     NamespaceId namespace_id_;
291     SectionId section_id_;
292   };
Edit(DocumentId document_id,SectionId section_id,NamespaceId namespace_id)293   Editor Edit(DocumentId document_id, SectionId section_id,
294               NamespaceId namespace_id) {
295     return Editor(term_id_codec_.get(), lite_index_.get(), document_id,
296                   section_id, namespace_id);
297   }
298 
WantsMerge()299   bool WantsMerge() const { return lite_index_->WantsMerge(); }
300 
301   // Merges newly-added hits in the LiteIndex into the MainIndex.
302   //
303   // RETURNS:
304   //  - INTERNAL on IO error while writing to the MainIndex.
305   //  - RESOURCE_EXHAUSTED error if unable to grow the index.
Merge()306   libtextclassifier3::Status Merge() {
307     ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
308                            main_index_->MergeLexicon(lite_index_->lexicon()));
309     ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> term_id_hit_pairs,
310                            MainIndexMerger::TranslateAndExpandLiteHits(
311                                *lite_index_, *term_id_codec_, outputs));
312     ICING_RETURN_IF_ERROR(main_index_->AddHits(
313         *term_id_codec_, std::move(outputs.backfill_map),
314         std::move(term_id_hit_pairs), lite_index_->last_added_document_id()));
315     ICING_RETURN_IF_ERROR(main_index_->PersistToDisk());
316     return lite_index_->Reset();
317   }
318 
319   // Whether the LiteIndex HitBuffer requires sorting. This is only true if
320   // Icing has enabled sorting during indexing time, and the HitBuffer's
321   // unsorted tail has exceeded the lite_index_sort_size.
LiteIndexNeedSort()322   bool LiteIndexNeedSort() const {
323     return options_.lite_index_sort_at_indexing &&
324            lite_index_->HasUnsortedHitsExceedingSortThreshold();
325   }
326 
327   // Sorts the LiteIndex HitBuffer.
SortLiteIndex()328   void SortLiteIndex() { lite_index_->SortHits(); }
329 
330   // Reduces internal file sizes by reclaiming space of deleted documents.
331   // new_last_added_document_id will be used to update the last added document
332   // id in the lite index.
333   //
334   // Returns:
335   //   OK on success
336   //   INTERNAL_ERROR on IO error, this indicates that the index may be in an
337   //                               invalid state and should be cleared.
338   libtextclassifier3::Status Optimize(
339       const std::vector<DocumentId>& document_id_old_to_new,
340       DocumentId new_last_added_document_id);
341 
342  private:
Index(const Options & options,std::unique_ptr<TermIdCodec> term_id_codec,std::unique_ptr<LiteIndex> lite_index,std::unique_ptr<MainIndex> main_index,const Filesystem * filesystem)343   Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec,
344         std::unique_ptr<LiteIndex> lite_index,
345         std::unique_ptr<MainIndex> main_index, const Filesystem* filesystem)
346       : lite_index_(std::move(lite_index)),
347         main_index_(std::move(main_index)),
348         options_(options),
349         term_id_codec_(std::move(term_id_codec)),
350         filesystem_(filesystem) {}
351 
352   libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix(
353       const std::string& prefix,
354       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
355       const SuggestionResultChecker* suggestion_result_checker);
356 
357   std::unique_ptr<LiteIndex> lite_index_;
358   std::unique_ptr<MainIndex> main_index_;
359   const Options options_;
360   std::unique_ptr<TermIdCodec> term_id_codec_;
361   const Filesystem* filesystem_;
362 };
363 
364 }  // namespace lib
365 }  // namespace icing
366 
367 #endif  // ICING_INDEX_INDEX_H_
368