1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // A small index with continuous updates (doesn't need explicit Flush 16 // to persiste) but has more possibility for corruption. It can always 17 // detect corruption reliably. 18 19 #ifndef ICING_INDEX_LITE_INDEX_H_ 20 #define ICING_INDEX_LITE_INDEX_H_ 21 22 #include <cstdint> 23 #include <iterator> 24 #include <limits> 25 #include <memory> 26 #include <string> 27 #include <string_view> 28 #include <vector> 29 30 #include "icing/text_classifier/lib3/utils/base/status.h" 31 #include "icing/text_classifier/lib3/utils/base/statusor.h" 32 #include "icing/absl_ports/mutex.h" 33 #include "icing/absl_ports/thread_annotations.h" 34 #include "icing/file/filesystem.h" 35 #include "icing/index/hit/doc-hit-info.h" 36 #include "icing/index/hit/hit.h" 37 #include "icing/index/lite/lite-index-header.h" 38 #include "icing/index/lite/lite-index-options.h" 39 #include "icing/index/lite/term-id-hit-pair.h" 40 #include "icing/index/term-id-codec.h" 41 #include "icing/legacy/index/icing-array-storage.h" 42 #include "icing/legacy/index/icing-dynamic-trie.h" 43 #include "icing/legacy/index/icing-filesystem.h" 44 #include "icing/legacy/index/icing-mmapper.h" 45 #include "icing/proto/debug.pb.h" 46 #include "icing/proto/scoring.pb.h" 47 #include "icing/proto/storage.pb.h" 48 #include "icing/proto/term.pb.h" 49 #include "icing/schema/section.h" 50 #include "icing/store/document-id.h" 51 #include "icing/store/namespace-id.h" 52 #include "icing/store/suggestion-result-checker.h" 53 #include "icing/util/crc32.h" 54 55 namespace icing { 56 namespace lib { 57 58 // The LiteIndex is go/thread-compatible. Operations on the same data member 59 // object interfere with each other, unless they are guaranteed not to mutate 60 // the object (In the case of LiteIndex, this means all const methods, 61 // FetchHits and ScoreHits). 62 class LiteIndex { 63 public: 64 // An entry in the hit buffer. 65 using Options = LiteIndexOptions; 66 67 // Offset for the LiteIndex_Header in the hit buffer mmap. 68 static constexpr uint32_t kHeaderFileOffset = 0; 69 70 // Updates checksum of subcomponents. 71 ~LiteIndex(); 72 73 // Creates lite index from storage. The files will be created if they do not 74 // already exist. 75 // 76 // Returns: 77 // OK on success 78 // DATA_LOSS if the index was corrupted and cleared 79 // INTERNAL on I/O error 80 static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create( 81 const Options& options, const IcingFilesystem* filesystem); 82 83 // Resets all internal members of the index. Returns OK if all operations were 84 // successful. 85 libtextclassifier3::Status Reset() ICING_LOCKS_EXCLUDED(mutex_); 86 87 // Advises the OS to cache pages in the index, which will be accessed for a 88 // query soon. 89 void Warm() ICING_LOCKS_EXCLUDED(mutex_); 90 91 // Syncs all modified files in the index to disk. 92 // 93 // Returns: 94 // OK on success 95 // INTERNAL on I/O error 96 libtextclassifier3::Status PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_); 97 98 // Returns term_id if term found, NOT_FOUND otherwise. 99 libtextclassifier3::StatusOr<uint32_t> GetTermId(std::string_view term) const 100 ICING_LOCKS_EXCLUDED(mutex_); 101 102 // Returns an iterator for all terms for which 'prefix' is a prefix. 103 class PrefixIterator { 104 public: PrefixIterator(const IcingDynamicTrie::Iterator & delegate)105 explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate) 106 : delegate_(delegate) {} IsValid()107 bool IsValid() const { return delegate_.IsValid(); } 108 Advance()109 void Advance() { delegate_.Advance(); } 110 GetKey()111 std::string_view GetKey() const { return delegate_.GetKey(); } 112 GetValueIndex()113 uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); } 114 115 private: 116 IcingDynamicTrie::Iterator delegate_; 117 }; 118 119 // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate any 120 // previously returned PrefixIterator. FindTermPrefixes(const std::string & prefix)121 PrefixIterator FindTermPrefixes(const std::string& prefix) const 122 ICING_LOCKS_EXCLUDED(mutex_) { 123 absl_ports::shared_lock l(&mutex_); 124 return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix)); 125 } 126 127 // Inserts a term with its properties. 128 // 129 // Returns: 130 // A value index on success 131 // RESOURCE_EXHAUSTED if lexicon is full or no disk space is available 132 libtextclassifier3::StatusOr<uint32_t> InsertTerm( 133 std::string_view term, TermMatchType::Code term_match_type, 134 NamespaceId namespace_id) ICING_LOCKS_EXCLUDED(mutex_); 135 136 // Updates term properties by setting hasPrefixHits and namespace id of the 137 // term. 138 // 139 // Returns: 140 // OK on success 141 // RESOURCE_EXHAUSTED if no disk space is available 142 libtextclassifier3::Status UpdateTermProperties(uint32_t tvi, 143 bool hasPrefixHits, 144 NamespaceId namespace_id) 145 ICING_LOCKS_EXCLUDED(mutex_); 146 147 // Append hit to buffer. term_id must be encoded using the same term_id_codec 148 // supplied to the index constructor. 149 // RETURNS: 150 // - OK if hit was successfully added 151 // - RESOURCE_EXHAUSTED if hit could not be added (either due to hit buffer 152 // or file system capacity reached). 153 libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit) 154 ICING_LOCKS_EXCLUDED(mutex_); 155 156 // Add all hits with term_id from the sections specified in section_id_mask, 157 // skipping hits in non-prefix sections if only_from_prefix_sections is true, 158 // to hits_out. If hits_out is nullptr, no hits will be added. The 159 // corresponding hit term frequencies will also not be added if 160 // term_frequency_out is nullptr. 161 // 162 // Only those hits which belongs to the given namespaces will be counted and 163 // fetched. A nullptr namespace checker will disable this check. 164 // 165 // Returns the score of hits that would be added to hits_out according the 166 // given score_by. 167 int FetchHits( 168 uint32_t term_id, SectionIdMask section_id_mask, 169 bool only_from_prefix_sections, 170 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 171 const SuggestionResultChecker* suggestion_result_checker, 172 std::vector<DocHitInfo>* hits_out, 173 std::vector<Hit::TermFrequencyArray>* term_frequency_out = nullptr) 174 ICING_LOCKS_EXCLUDED(mutex_); 175 176 // Returns the hit count of the term. 177 // Only those hits which belongs to the given namespaces will be counted. 178 libtextclassifier3::StatusOr<int> ScoreHits( 179 uint32_t term_id, 180 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 181 const SuggestionResultChecker* suggestion_result_checker) 182 ICING_LOCKS_EXCLUDED(mutex_); 183 empty()184 bool empty() const ICING_LOCKS_EXCLUDED(mutex_) { return size() == 0; } 185 size()186 uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) { 187 absl_ports::shared_lock l(&mutex_); 188 return size_impl(); 189 } 190 WantsMerge()191 bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) { 192 absl_ports::shared_lock l(&mutex_); 193 return is_full() || size_impl() >= (options_.hit_buffer_want_merge_bytes / 194 sizeof(TermIdHitPair::Value)); 195 } 196 197 // Whether or not the HitBuffer's unsorted tail size exceeds the sort 198 // threshold. HasUnsortedHitsExceedingSortThreshold()199 bool HasUnsortedHitsExceedingSortThreshold() const 200 ICING_LOCKS_EXCLUDED(mutex_) { 201 absl_ports::shared_lock l(&mutex_); 202 return HasUnsortedHitsExceedingSortThresholdImpl(); 203 } 204 205 // Sort hits stored in the index. SortHits()206 void SortHits() ICING_LOCKS_EXCLUDED(mutex_) { 207 absl_ports::unique_lock l(&mutex_); 208 SortHitsImpl(); 209 }; 210 211 class const_iterator { 212 friend class LiteIndex; 213 214 public: 215 using iterator_category = std::forward_iterator_tag; 216 using value_type = TermIdHitPair; 217 using reference = const value_type&; 218 using pointer = const value_type*; 219 const_iterator()220 const_iterator() : const_iterator(nullptr, -1, -1) {} 221 222 reference operator*() const { return start_[position_]; } 223 224 pointer operator->() const { return start_ + position_; } 225 226 const_iterator& operator++() { 227 if (++position_ >= end_position_) { 228 start_ = nullptr; 229 position_ = -1; 230 end_position_ = -1; 231 } 232 return *this; 233 } 234 235 const_iterator operator++(int) { 236 auto tmp = *this; 237 ++*this; 238 return tmp; 239 } 240 241 bool operator!=(const const_iterator& rhs) { return !(*this == rhs); } 242 243 bool operator==(const const_iterator& rhs) { 244 return start_ == rhs.start_ && position_ == rhs.position_; 245 } 246 247 private: const_iterator(const TermIdHitPair * start,int position,int end_position)248 explicit const_iterator(const TermIdHitPair* start, int position, 249 int end_position) 250 : start_(start), position_(position), end_position_(end_position) {} 251 252 const TermIdHitPair* start_; 253 int position_; 254 int end_position_; 255 }; 256 begin()257 const_iterator begin() const ICING_LOCKS_EXCLUDED(mutex_) { 258 absl_ports::shared_lock l(&mutex_); 259 // If the LiteIndex is empty, just return end(). 260 return empty_impl() 261 ? end() 262 : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0, 263 header_->cur_size()); 264 } 265 end()266 const_iterator end() const { return const_iterator(); } 267 max_hit_buffer_size()268 constexpr static uint32_t max_hit_buffer_size() { 269 return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair); 270 } 271 272 // We keep track of the last added document_id. This is always the largest 273 // document_id that has been added because hits can only be added in order of 274 // increasing document_id. last_added_document_id()275 DocumentId last_added_document_id() const ICING_LOCKS_EXCLUDED(mutex_) { 276 absl_ports::shared_lock l(&mutex_); 277 return header_->last_added_docid(); 278 } set_last_added_document_id(DocumentId document_id)279 void set_last_added_document_id(DocumentId document_id) 280 ICING_LOCKS_EXCLUDED(mutex_) { 281 absl_ports::unique_lock l(&mutex_); 282 header_->set_last_added_docid(document_id); 283 } 284 285 // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate the reference 286 // returned here. lexicon()287 const IcingDynamicTrie& lexicon() const { return lexicon_; } 288 289 // Returns debug information for the index in out. 290 // verbosity = BASIC, simplest debug information - size of lexicon, hit buffer 291 // verbosity = DETAILED, more detailed debug information from the lexicon. 292 std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) const 293 ICING_LOCKS_EXCLUDED(mutex_); 294 295 // Returns the byte size of all the elements held in the index. This excludes 296 // the size of any internal metadata of the index, e.g. the index's header. 297 // 298 // Returns: 299 // Byte size on success 300 // INTERNAL_ERROR on IO error 301 libtextclassifier3::StatusOr<int64_t> GetElementsSize() const 302 ICING_LOCKS_EXCLUDED(mutex_); 303 304 // Takes the provided storage_info, populates the fields related to the lite 305 // index and returns that storage_info. 306 // 307 // If an IO error occurs while trying to calculate the value for a field, then 308 // that field will be set to -1. 309 IndexStorageInfoProto GetStorageInfo(IndexStorageInfoProto storage_info) const 310 ICING_LOCKS_EXCLUDED(mutex_); 311 312 // Returns the size of unsorted part of hit_buffer_. GetHitBufferByteSize()313 uint32_t GetHitBufferByteSize() const ICING_LOCKS_EXCLUDED(mutex_) { 314 absl_ports::shared_lock l(&mutex_); 315 return size_impl() * sizeof(TermIdHitPair::Value); 316 } 317 318 // Returns the size of unsorted part of hit_buffer_. GetHitBufferUnsortedSize()319 uint32_t GetHitBufferUnsortedSize() const ICING_LOCKS_EXCLUDED(mutex_) { 320 absl_ports::shared_lock l(&mutex_); 321 return GetHitBufferUnsortedSizeImpl(); 322 } 323 324 // Returns the byte size of unsorted part of hit_buffer_. GetHitBufferUnsortedByteSize()325 uint64_t GetHitBufferUnsortedByteSize() const ICING_LOCKS_EXCLUDED(mutex_) { 326 absl_ports::shared_lock l(&mutex_); 327 return GetHitBufferUnsortedSizeImpl() * sizeof(TermIdHitPair::Value); 328 } 329 330 // Reduces internal file sizes by reclaiming space of deleted documents. 331 // 332 // This method also sets the last_added_docid of the index to 333 // new_last_added_document_id. 334 // 335 // Returns: 336 // OK on success 337 // INTERNAL_ERROR on IO error, this indicates that the index may be in an 338 // invalid state and should be cleared. 339 libtextclassifier3::Status Optimize( 340 const std::vector<DocumentId>& document_id_old_to_new, 341 const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) 342 ICING_LOCKS_EXCLUDED(mutex_); 343 344 // Updates the checksums of all index components, updates the combined 345 // checksum and returns it. 346 Crc32 UpdateChecksum() ICING_LOCKS_EXCLUDED(mutex_); 347 348 // Calculates the checksum of the index components and returns the combined 349 // checksum. 350 Crc32 GetChecksum() const ICING_LOCKS_EXCLUDED(mutex_); 351 352 private: 353 static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions(); 354 355 LiteIndex(const Options& options, const IcingFilesystem* filesystem); 356 357 // Initializes lite index from storage. Must be called exactly once after 358 // object construction. 359 // 360 // Returns: 361 // OK on success 362 // DATA_LOSS if the index was corrupted and cleared 363 // INTERNAL on I/O error 364 libtextclassifier3::Status Initialize() ICING_LOCKS_EXCLUDED(mutex_); 365 initialized()366 bool initialized() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 367 return header_ != nullptr; 368 } 369 370 // Check if the hit buffer has reached its capacity. 371 bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_); 372 373 // Non-locking implementation for empty(). empty_impl()374 bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 375 return size_impl() == 0; 376 } 377 378 // Non-locking implementation for size(). size_impl()379 uint32_t size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 380 return header_->cur_size(); 381 } 382 383 // Calculate the checksum of all sub-components of the LiteIndex and set it in 384 // the header. 385 Crc32 UpdateChecksumInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 386 387 // Calculates the checksum of all sub-components of the LiteIndex. Does NOT 388 // update the header. 389 Crc32 GetChecksumInternal() const ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 390 391 // Non-locking implementation for UpdateTermProperties. 392 libtextclassifier3::Status UpdateTermPropertiesImpl(uint32_t tvi, 393 bool hasPrefixHits, 394 NamespaceId namespace_id) 395 ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 396 397 // We need to sort during querying time when: 398 // 1. Sorting at indexing time is not enabled and there is an unsorted tail 399 // section in the HitBuffer. 400 // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, regardless 401 // of whether or not hit_buffer_sort_at_indexing is enabled. This is to 402 // prevent performing sequential search on a large unsorted tail section, 403 // which would result in bad query performance. 404 // This is more of a sanity check. We should not really be encountering 405 // this case. NeedSortAtQuerying()406 bool NeedSortAtQuerying() const ICING_SHARED_LOCKS_REQUIRED(mutex_) { 407 return HasUnsortedHitsExceedingSortThresholdImpl() || 408 (!options_.hit_buffer_sort_at_indexing && 409 GetHitBufferUnsortedSizeImpl() > 0); 410 } 411 412 // Non-locking implementation for HasUnsortedHitsExceedingSortThresholdImpl(). HasUnsortedHitsExceedingSortThresholdImpl()413 bool HasUnsortedHitsExceedingSortThresholdImpl() const 414 ICING_SHARED_LOCKS_REQUIRED(mutex_) { 415 return GetHitBufferUnsortedSizeImpl() >= 416 (options_.hit_buffer_sort_threshold_bytes / 417 sizeof(TermIdHitPair::Value)); 418 } 419 420 // Non-locking implementation for SortHits(). 421 void SortHitsImpl() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); 422 423 // Calculates and adds the score for a fetched hit to total_score_out, while 424 // updating last_document_id (which keeps track of the last added docId so 425 // far), and is_last_document_desired (which keeps track of whether that last 426 // added docId belongs to the query's desired namespace.) 427 // 428 // Also appends the hit to hits_out and term_frequency_out if the vectors are 429 // not null. 430 void ScoreAndAppendFetchedHit( 431 const Hit& hit, SectionIdMask section_id_mask, 432 bool only_from_prefix_sections, 433 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, 434 const SuggestionResultChecker* suggestion_result_checker, 435 DocumentId& last_document_id, bool& is_last_document_desired, 436 int& total_score_out, std::vector<DocHitInfo>* hits_out, 437 std::vector<Hit::TermFrequencyArray>* term_frequency_out) const 438 ICING_SHARED_LOCKS_REQUIRED(mutex_); 439 440 // Returns the size of unsorted part of hit_buffer_. GetHitBufferUnsortedSizeImpl()441 uint32_t GetHitBufferUnsortedSizeImpl() const 442 ICING_SHARED_LOCKS_REQUIRED(mutex_) { 443 return header_->cur_size() - header_->searchable_end(); 444 } 445 446 // File descriptor that points to where the header and hit buffer are written 447 // to. 448 ScopedFd hit_buffer_fd_ ICING_GUARDED_BY(mutex_); 449 450 // Mmapped region past the header that stores the hits. 451 IcingArrayStorage hit_buffer_ ICING_GUARDED_BY(mutex_); 452 453 // Crc checksum of the hits, excludes the header. 454 uint32_t hit_buffer_crc_ ICING_GUARDED_BY(mutex_); 455 456 // Trie that maps indexed terms to their term id 457 IcingDynamicTrie lexicon_ ICING_GUARDED_BY(mutex_); 458 459 // TODO(b/140437260): Port over to MemoryMappedFile 460 // Memory mapped region of the underlying file that reflects the header. 461 IcingMMapper header_mmap_ ICING_GUARDED_BY(mutex_); 462 463 // Wrapper around the mmapped header that contains stats on the lite index. 464 std::unique_ptr<LiteIndex_Header> header_ ICING_GUARDED_BY(mutex_); 465 466 // Options used to initialize the LiteIndex. 467 const Options options_; 468 469 // TODO(b/139087650) Move to icing::Filesystem 470 const IcingFilesystem* const filesystem_; 471 472 // Used to provide reader and writer locks 473 mutable absl_ports::shared_mutex mutex_; 474 }; 475 476 } // namespace lib 477 } // namespace icing 478 479 #endif // ICING_INDEX_LITE_INDEX_H_ 480