xref: /aosp_15_r20/external/icing/icing/index/lite/lite-index.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // A small index with continuous updates (doesn't need explicit Flush
16 // to persiste) but has more possibility for corruption. It can always
17 // detect corruption reliably.
18 
19 #ifndef ICING_INDEX_LITE_INDEX_H_
20 #define ICING_INDEX_LITE_INDEX_H_
21 
22 #include <cstdint>
23 #include <iterator>
24 #include <limits>
25 #include <memory>
26 #include <string>
27 #include <string_view>
28 #include <vector>
29 
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/mutex.h"
33 #include "icing/absl_ports/thread_annotations.h"
34 #include "icing/file/filesystem.h"
35 #include "icing/index/hit/doc-hit-info.h"
36 #include "icing/index/hit/hit.h"
37 #include "icing/index/lite/lite-index-header.h"
38 #include "icing/index/lite/lite-index-options.h"
39 #include "icing/index/lite/term-id-hit-pair.h"
40 #include "icing/index/term-id-codec.h"
41 #include "icing/legacy/index/icing-array-storage.h"
42 #include "icing/legacy/index/icing-dynamic-trie.h"
43 #include "icing/legacy/index/icing-filesystem.h"
44 #include "icing/legacy/index/icing-mmapper.h"
45 #include "icing/proto/debug.pb.h"
46 #include "icing/proto/scoring.pb.h"
47 #include "icing/proto/storage.pb.h"
48 #include "icing/proto/term.pb.h"
49 #include "icing/schema/section.h"
50 #include "icing/store/document-id.h"
51 #include "icing/store/namespace-id.h"
52 #include "icing/store/suggestion-result-checker.h"
53 #include "icing/util/crc32.h"
54 
55 namespace icing {
56 namespace lib {
57 
58 // The LiteIndex is go/thread-compatible. Operations on the same data member
59 // object interfere with each other, unless they are guaranteed not to mutate
60 // the object (In the case of LiteIndex, this means all const methods,
61 // FetchHits and ScoreHits).
62 class LiteIndex {
63  public:
64   // An entry in the hit buffer.
65   using Options = LiteIndexOptions;
66 
67   // Offset for the LiteIndex_Header in the hit buffer mmap.
68   static constexpr uint32_t kHeaderFileOffset = 0;
69 
70   // Updates checksum of subcomponents.
71   ~LiteIndex();
72 
73   // Creates lite index from storage. The files will be created if they do not
74   // already exist.
75   //
76   // Returns:
77   //  OK on success
78   //  DATA_LOSS if the index was corrupted and cleared
79   //  INTERNAL on I/O error
80   static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create(
81       const Options& options, const IcingFilesystem* filesystem);
82 
83   // Resets all internal members of the index. Returns OK if all operations were
84   // successful.
85   libtextclassifier3::Status Reset() ICING_LOCKS_EXCLUDED(mutex_);
86 
87   // Advises the OS to cache pages in the index, which will be accessed for a
88   // query soon.
89   void Warm() ICING_LOCKS_EXCLUDED(mutex_);
90 
91   // Syncs all modified files in the index to disk.
92   //
93   // Returns:
94   //   OK on success
95   //   INTERNAL on I/O error
96   libtextclassifier3::Status PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_);
97 
98   // Returns term_id if term found, NOT_FOUND otherwise.
99   libtextclassifier3::StatusOr<uint32_t> GetTermId(std::string_view term) const
100       ICING_LOCKS_EXCLUDED(mutex_);
101 
102   // Returns an iterator for all terms for which 'prefix' is a prefix.
103   class PrefixIterator {
104    public:
PrefixIterator(const IcingDynamicTrie::Iterator & delegate)105     explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate)
106         : delegate_(delegate) {}
IsValid()107     bool IsValid() const { return delegate_.IsValid(); }
108 
Advance()109     void Advance() { delegate_.Advance(); }
110 
GetKey()111     std::string_view GetKey() const { return delegate_.GetKey(); }
112 
GetValueIndex()113     uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); }
114 
115    private:
116     IcingDynamicTrie::Iterator delegate_;
117   };
118 
119   // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate any
120   // previously returned PrefixIterator.
FindTermPrefixes(const std::string & prefix)121   PrefixIterator FindTermPrefixes(const std::string& prefix) const
122       ICING_LOCKS_EXCLUDED(mutex_) {
123     absl_ports::shared_lock l(&mutex_);
124     return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix));
125   }
126 
127   // Inserts a term with its properties.
128   //
129   // Returns:
130   //   A value index on success
131   //   RESOURCE_EXHAUSTED if lexicon is full or no disk space is available
132   libtextclassifier3::StatusOr<uint32_t> InsertTerm(
133       std::string_view term, TermMatchType::Code term_match_type,
134       NamespaceId namespace_id) ICING_LOCKS_EXCLUDED(mutex_);
135 
136   // Updates term properties by setting hasPrefixHits and namespace id of the
137   // term.
138   //
139   // Returns:
140   //   OK on success
141   //   RESOURCE_EXHAUSTED if no disk space is available
142   libtextclassifier3::Status UpdateTermProperties(uint32_t tvi,
143                                                   bool hasPrefixHits,
144                                                   NamespaceId namespace_id)
145       ICING_LOCKS_EXCLUDED(mutex_);
146 
147   // Append hit to buffer. term_id must be encoded using the same term_id_codec
148   // supplied to the index constructor.
149   // RETURNS:
150   //  - OK if hit was successfully added
151   //  - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer
152   //    or file system capacity reached).
153   libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit)
154       ICING_LOCKS_EXCLUDED(mutex_);
155 
156   // Add all hits with term_id from the sections specified in section_id_mask,
157   // skipping hits in non-prefix sections if only_from_prefix_sections is true,
158   // to hits_out. If hits_out is nullptr, no hits will be added. The
159   // corresponding hit term frequencies will also not be added if
160   // term_frequency_out is nullptr.
161   //
162   // Only those hits which belongs to the given namespaces will be counted and
163   // fetched. A nullptr namespace checker will disable this check.
164   //
165   // Returns the score of hits that would be added to hits_out according the
166   // given score_by.
167   int FetchHits(
168       uint32_t term_id, SectionIdMask section_id_mask,
169       bool only_from_prefix_sections,
170       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
171       const SuggestionResultChecker* suggestion_result_checker,
172       std::vector<DocHitInfo>* hits_out,
173       std::vector<Hit::TermFrequencyArray>* term_frequency_out = nullptr)
174       ICING_LOCKS_EXCLUDED(mutex_);
175 
176   // Returns the hit count of the term.
177   // Only those hits which belongs to the given namespaces will be counted.
178   libtextclassifier3::StatusOr<int> ScoreHits(
179       uint32_t term_id,
180       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
181       const SuggestionResultChecker* suggestion_result_checker)
182       ICING_LOCKS_EXCLUDED(mutex_);
183 
empty()184   bool empty() const ICING_LOCKS_EXCLUDED(mutex_) { return size() == 0; }
185 
size()186   uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) {
187     absl_ports::shared_lock l(&mutex_);
188     return size_impl();
189   }
190 
WantsMerge()191   bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) {
192     absl_ports::shared_lock l(&mutex_);
193     return is_full() || size_impl() >= (options_.hit_buffer_want_merge_bytes /
194                                         sizeof(TermIdHitPair::Value));
195   }
196 
197   // Whether or not the HitBuffer's unsorted tail size exceeds the sort
198   // threshold.
HasUnsortedHitsExceedingSortThreshold()199   bool HasUnsortedHitsExceedingSortThreshold() const
200       ICING_LOCKS_EXCLUDED(mutex_) {
201     absl_ports::shared_lock l(&mutex_);
202     return HasUnsortedHitsExceedingSortThresholdImpl();
203   }
204 
205   // Sort hits stored in the index.
SortHits()206   void SortHits() ICING_LOCKS_EXCLUDED(mutex_) {
207     absl_ports::unique_lock l(&mutex_);
208     SortHitsImpl();
209   };
210 
211   class const_iterator {
212     friend class LiteIndex;
213 
214    public:
215     using iterator_category = std::forward_iterator_tag;
216     using value_type = TermIdHitPair;
217     using reference = const value_type&;
218     using pointer = const value_type*;
219 
const_iterator()220     const_iterator() : const_iterator(nullptr, -1, -1) {}
221 
222     reference operator*() const { return start_[position_]; }
223 
224     pointer operator->() const { return start_ + position_; }
225 
226     const_iterator& operator++() {
227       if (++position_ >= end_position_) {
228         start_ = nullptr;
229         position_ = -1;
230         end_position_ = -1;
231       }
232       return *this;
233     }
234 
235     const_iterator operator++(int) {
236       auto tmp = *this;
237       ++*this;
238       return tmp;
239     }
240 
241     bool operator!=(const const_iterator& rhs) { return !(*this == rhs); }
242 
243     bool operator==(const const_iterator& rhs) {
244       return start_ == rhs.start_ && position_ == rhs.position_;
245     }
246 
247    private:
const_iterator(const TermIdHitPair * start,int position,int end_position)248     explicit const_iterator(const TermIdHitPair* start, int position,
249                             int end_position)
250         : start_(start), position_(position), end_position_(end_position) {}
251 
252     const TermIdHitPair* start_;
253     int position_;
254     int end_position_;
255   };
256 
begin()257   const_iterator begin() const ICING_LOCKS_EXCLUDED(mutex_) {
258     absl_ports::shared_lock l(&mutex_);
259     // If the LiteIndex is empty, just return end().
260     return empty_impl()
261                ? end()
262                : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0,
263                                 header_->cur_size());
264   }
265 
end()266   const_iterator end() const { return const_iterator(); }
267 
max_hit_buffer_size()268   constexpr static uint32_t max_hit_buffer_size() {
269     return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair);
270   }
271 
272   // We keep track of the last added document_id. This is always the largest
273   // document_id that has been added because hits can only be added in order of
274   // increasing document_id.
last_added_document_id()275   DocumentId last_added_document_id() const ICING_LOCKS_EXCLUDED(mutex_) {
276     absl_ports::shared_lock l(&mutex_);
277     return header_->last_added_docid();
278   }
set_last_added_document_id(DocumentId document_id)279   void set_last_added_document_id(DocumentId document_id)
280       ICING_LOCKS_EXCLUDED(mutex_) {
281     absl_ports::unique_lock l(&mutex_);
282     header_->set_last_added_docid(document_id);
283   }
284 
285   // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate the reference
286   // returned here.
lexicon()287   const IcingDynamicTrie& lexicon() const { return lexicon_; }
288 
289   // Returns debug information for the index in out.
290   // verbosity = BASIC, simplest debug information - size of lexicon, hit buffer
291   // verbosity = DETAILED, more detailed debug information from the lexicon.
292   std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) const
293       ICING_LOCKS_EXCLUDED(mutex_);
294 
295   // Returns the byte size of all the elements held in the index. This excludes
296   // the size of any internal metadata of the index, e.g. the index's header.
297   //
298   // Returns:
299   //   Byte size on success
300   //   INTERNAL_ERROR on IO error
301   libtextclassifier3::StatusOr<int64_t> GetElementsSize() const
302       ICING_LOCKS_EXCLUDED(mutex_);
303 
304   // Takes the provided storage_info, populates the fields related to the lite
305   // index and returns that storage_info.
306   //
307   // If an IO error occurs while trying to calculate the value for a field, then
308   // that field will be set to -1.
309   IndexStorageInfoProto GetStorageInfo(IndexStorageInfoProto storage_info) const
310       ICING_LOCKS_EXCLUDED(mutex_);
311 
312   // Returns the size of unsorted part of hit_buffer_.
GetHitBufferByteSize()313   uint32_t GetHitBufferByteSize() const ICING_LOCKS_EXCLUDED(mutex_) {
314     absl_ports::shared_lock l(&mutex_);
315     return size_impl() * sizeof(TermIdHitPair::Value);
316   }
317 
318   // Returns the size of unsorted part of hit_buffer_.
GetHitBufferUnsortedSize()319   uint32_t GetHitBufferUnsortedSize() const ICING_LOCKS_EXCLUDED(mutex_) {
320     absl_ports::shared_lock l(&mutex_);
321     return GetHitBufferUnsortedSizeImpl();
322   }
323 
324   // Returns the byte size of unsorted part of hit_buffer_.
GetHitBufferUnsortedByteSize()325   uint64_t GetHitBufferUnsortedByteSize() const ICING_LOCKS_EXCLUDED(mutex_) {
326     absl_ports::shared_lock l(&mutex_);
327     return GetHitBufferUnsortedSizeImpl() * sizeof(TermIdHitPair::Value);
328   }
329 
330   // Reduces internal file sizes by reclaiming space of deleted documents.
331   //
332   // This method also sets the last_added_docid of the index to
333   // new_last_added_document_id.
334   //
335   // Returns:
336   //   OK on success
337   //   INTERNAL_ERROR on IO error, this indicates that the index may be in an
338   //                               invalid state and should be cleared.
339   libtextclassifier3::Status Optimize(
340       const std::vector<DocumentId>& document_id_old_to_new,
341       const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id)
342       ICING_LOCKS_EXCLUDED(mutex_);
343 
344   // Updates the checksums of all index components, updates the combined
345   // checksum and returns it.
346   Crc32 UpdateChecksum() ICING_LOCKS_EXCLUDED(mutex_);
347 
348   // Calculates the checksum of the index components and returns the combined
349   // checksum.
350   Crc32 GetChecksum() const ICING_LOCKS_EXCLUDED(mutex_);
351 
352  private:
353   static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
354 
355   LiteIndex(const Options& options, const IcingFilesystem* filesystem);
356 
357   // Initializes lite index from storage. Must be called exactly once after
358   // object construction.
359   //
360   // Returns:
361   //  OK on success
362   //  DATA_LOSS if the index was corrupted and cleared
363   //  INTERNAL on I/O error
364   libtextclassifier3::Status Initialize() ICING_LOCKS_EXCLUDED(mutex_);
365 
initialized()366   bool initialized() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
367     return header_ != nullptr;
368   }
369 
370   // Check if the hit buffer has reached its capacity.
371   bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_);
372 
373   // Non-locking implementation for empty().
empty_impl()374   bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
375     return size_impl() == 0;
376   }
377 
378   // Non-locking implementation for size().
size_impl()379   uint32_t size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
380     return header_->cur_size();
381   }
382 
383   // Calculate the checksum of all sub-components of the LiteIndex and set it in
384   // the header.
385   Crc32 UpdateChecksumInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
386 
387   // Calculates the checksum of all sub-components of the LiteIndex. Does NOT
388   // update the header.
389   Crc32 GetChecksumInternal() const ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
390 
391   // Non-locking implementation for UpdateTermProperties.
392   libtextclassifier3::Status UpdateTermPropertiesImpl(uint32_t tvi,
393                                                       bool hasPrefixHits,
394                                                       NamespaceId namespace_id)
395       ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
396 
397   // We need to sort during querying time when:
398   // 1. Sorting at indexing time is not enabled and there is an unsorted tail
399   //    section in the HitBuffer.
400   // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, regardless
401   //    of whether or not hit_buffer_sort_at_indexing is enabled. This is to
402   //    prevent performing sequential search on a large unsorted tail section,
403   //    which would result in bad query performance.
404   //    This is more of a sanity check. We should not really be encountering
405   //    this case.
NeedSortAtQuerying()406   bool NeedSortAtQuerying() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
407     return HasUnsortedHitsExceedingSortThresholdImpl() ||
408            (!options_.hit_buffer_sort_at_indexing &&
409             GetHitBufferUnsortedSizeImpl() > 0);
410   }
411 
412   // Non-locking implementation for HasUnsortedHitsExceedingSortThresholdImpl().
HasUnsortedHitsExceedingSortThresholdImpl()413   bool HasUnsortedHitsExceedingSortThresholdImpl() const
414       ICING_SHARED_LOCKS_REQUIRED(mutex_) {
415     return GetHitBufferUnsortedSizeImpl() >=
416            (options_.hit_buffer_sort_threshold_bytes /
417             sizeof(TermIdHitPair::Value));
418   }
419 
420   // Non-locking implementation for SortHits().
421   void SortHitsImpl() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
422 
423   // Calculates and adds the score for a fetched hit to total_score_out, while
424   // updating last_document_id (which keeps track of the last added docId so
425   // far), and is_last_document_desired (which keeps track of whether that last
426   // added docId belongs to the query's desired namespace.)
427   //
428   // Also appends the hit to hits_out and term_frequency_out if the vectors are
429   // not null.
430   void ScoreAndAppendFetchedHit(
431       const Hit& hit, SectionIdMask section_id_mask,
432       bool only_from_prefix_sections,
433       SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
434       const SuggestionResultChecker* suggestion_result_checker,
435       DocumentId& last_document_id, bool& is_last_document_desired,
436       int& total_score_out, std::vector<DocHitInfo>* hits_out,
437       std::vector<Hit::TermFrequencyArray>* term_frequency_out) const
438       ICING_SHARED_LOCKS_REQUIRED(mutex_);
439 
440   // Returns the size of unsorted part of hit_buffer_.
GetHitBufferUnsortedSizeImpl()441   uint32_t GetHitBufferUnsortedSizeImpl() const
442       ICING_SHARED_LOCKS_REQUIRED(mutex_) {
443     return header_->cur_size() - header_->searchable_end();
444   }
445 
446   // File descriptor that points to where the header and hit buffer are written
447   // to.
448   ScopedFd hit_buffer_fd_ ICING_GUARDED_BY(mutex_);
449 
450   // Mmapped region past the header that stores the hits.
451   IcingArrayStorage hit_buffer_ ICING_GUARDED_BY(mutex_);
452 
453   // Crc checksum of the hits, excludes the header.
454   uint32_t hit_buffer_crc_ ICING_GUARDED_BY(mutex_);
455 
456   // Trie that maps indexed terms to their term id
457   IcingDynamicTrie lexicon_ ICING_GUARDED_BY(mutex_);
458 
459   // TODO(b/140437260): Port over to MemoryMappedFile
460   // Memory mapped region of the underlying file that reflects the header.
461   IcingMMapper header_mmap_ ICING_GUARDED_BY(mutex_);
462 
463   // Wrapper around the mmapped header that contains stats on the lite index.
464   std::unique_ptr<LiteIndex_Header> header_ ICING_GUARDED_BY(mutex_);
465 
466   // Options used to initialize the LiteIndex.
467   const Options options_;
468 
469   // TODO(b/139087650) Move to icing::Filesystem
470   const IcingFilesystem* const filesystem_;
471 
472   // Used to provide reader and writer locks
473   mutable absl_ports::shared_mutex mutex_;
474 };
475 
476 }  // namespace lib
477 }  // namespace icing
478 
479 #endif  // ICING_INDEX_LITE_INDEX_H_
480