1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_INDEX_H_ 16 #define ICING_INDEX_INDEX_H_ 17 18 #include <cstdint> 19 #include <memory> 20 #include <string> 21 #include <string_view> 22 #include <unordered_map> 23 #include <utility> 24 #include <vector> 25 26 #include "icing/text_classifier/lib3/utils/base/status.h" 27 #include "icing/text_classifier/lib3/utils/base/statusor.h" 28 #include "icing/file/filesystem.h" 29 #include "icing/index/hit/hit.h" 30 #include "icing/index/iterator/doc-hit-info-iterator.h" 31 #include "icing/index/lite/lite-index.h" 32 #include "icing/index/lite/term-id-hit-pair.h" 33 #include "icing/index/main/main-index-merger.h" 34 #include "icing/index/main/main-index.h" 35 #include "icing/index/term-id-codec.h" 36 #include "icing/index/term-metadata.h" 37 #include "icing/legacy/index/icing-filesystem.h" 38 #include "icing/proto/debug.pb.h" 39 #include "icing/proto/logging.pb.h" 40 #include "icing/proto/scoring.pb.h" 41 #include "icing/proto/storage.pb.h" 42 #include "icing/proto/term.pb.h" 43 #include "icing/schema/section.h" 44 #include "icing/store/document-id.h" 45 #include "icing/store/namespace-id.h" 46 #include "icing/store/suggestion-result-checker.h" 47 #include "icing/util/status-macros.h" 48 49 namespace icing { 50 namespace lib { 51 52 // The class representing the Icing search index. This index maps terms to hits 53 // (document_ids, section_ids). 54 // Content is added to the index through the Editor class - which also dedupes 55 // hits (calling Editor::AddHit with the same arguments will only result in the 56 // creation of a single hit). 57 // Ex. 58 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index, 59 // . Index::Create(MakeIndexOptions())); 60 // Index::Editor editor = index->Edit(document_id, section_id, 61 // TermMatchType::EXACT_ONLY); ICING_RETURN_IF_ERROR(editor.AddHit("foo")); 62 // ICING_RETURN_IF_ERROR(editor.AddHit("baz")); 63 // 64 // Content is retrieved from the index through the Iterator class. 65 // Ex. 66 // ICING_ASSIGN_OR_RETURN(std::unique_ptr<Index> index, 67 // . Index::Create(MakeIndexOptions())); 68 // ICING_ASSIGN_OR_RETURN(Index::Iterator iterator = 69 // index->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); 70 // while(iterator->Advance().ok()) 71 // ProcessResult(iterator->value()); 72 class Index { 73 public: 74 struct Options { OptionsOptions75 explicit Options(const std::string& base_dir, uint32_t index_merge_size, 76 bool lite_index_sort_at_indexing, 77 uint32_t lite_index_sort_size) 78 : base_dir(base_dir), 79 index_merge_size(index_merge_size), 80 lite_index_sort_at_indexing(lite_index_sort_at_indexing), 81 lite_index_sort_size(lite_index_sort_size) {} 82 83 std::string base_dir; 84 int32_t index_merge_size; 85 bool lite_index_sort_at_indexing; 86 int32_t lite_index_sort_size; 87 }; 88 89 // Creates an instance of Index in the directory pointed by file_dir. 90 // 91 // Returns: 92 // Valid Index on success 93 // DATA_LOSS if the index was corrupt and had to be cleared 94 // INVALID_ARGUMENT if options have invalid values 95 // INTERNAL on I/O error 96 static libtextclassifier3::StatusOr<std::unique_ptr<Index>> Create( 97 const Options& options, const Filesystem* filesystem, 98 const IcingFilesystem* icing_filesystem); 99 100 // Reads magic from existing flash (main) index file header. We need this 101 // during Icing initialization phase to determine the version. 102 // 103 // Returns 104 // Valid magic on success 105 // NOT_FOUND if the lite index doesn't exist 106 // INTERNAL on I/O error 107 static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic( 108 const Filesystem* filesystem, const std::string& base_dir); 109 110 // Clears all files created by the index. Returns OK if all files were 111 // cleared. Reset()112 libtextclassifier3::Status Reset() { 113 ICING_RETURN_IF_ERROR(lite_index_->Reset()); 114 return main_index_->Reset(); 115 } 116 117 // Brings components of the index into memory in anticipation of a query in 118 // order to reduce latency. Warm()119 void Warm() { 120 lite_index_->Warm(); 121 main_index_->Warm(); 122 } 123 124 // Syncs all the data and metadata changes to disk. 125 // 126 // Returns: 127 // OK on success 128 // INTERNAL on I/O errors PersistToDisk()129 libtextclassifier3::Status PersistToDisk() { 130 ICING_RETURN_IF_ERROR(lite_index_->PersistToDisk()); 131 return main_index_->PersistToDisk(); 132 } 133 134 // Updates all checksums in the index and returns the combined index checksum. UpdateChecksum()135 Crc32 UpdateChecksum() { 136 Crc32 lite_crc = lite_index_->UpdateChecksum(); 137 Crc32 main_crc = main_index_->UpdateChecksum(); 138 main_crc.Append(std::to_string(lite_crc.Get())); 139 return main_crc; 140 } 141 142 // Calculates and returns the combined index checksum. GetChecksum()143 Crc32 GetChecksum() const { 144 Crc32 lite_crc = lite_index_->GetChecksum(); 145 Crc32 main_crc = main_index_->GetChecksum(); 146 main_crc.Append(std::to_string(lite_crc.Get())); 147 return main_crc; 148 } 149 150 // Discard parts of the index if they contain data for document ids greater 151 // than document_id. 152 // 153 // NOTE: This means that TruncateTo(kInvalidDocumentId) will have no effect. 154 // 155 // Returns: 156 // OK on success 157 // INTERNAL on I/O errors 158 libtextclassifier3::Status TruncateTo(DocumentId document_id); 159 160 // DocumentIds are always inserted in increasing order. Returns the largest 161 // document_id added to the index. last_added_document_id()162 DocumentId last_added_document_id() const { 163 DocumentId lite_document_id = lite_index_->last_added_document_id(); 164 if (lite_document_id != kInvalidDocumentId) { 165 return lite_document_id; 166 } 167 return main_index_->last_added_document_id(); 168 } 169 170 // Sets last_added_document_id to document_id so long as document_id > 171 // last_added_document_id() set_last_added_document_id(DocumentId document_id)172 void set_last_added_document_id(DocumentId document_id) { 173 DocumentId lite_document_id = lite_index_->last_added_document_id(); 174 if (lite_document_id == kInvalidDocumentId || 175 document_id >= lite_document_id) { 176 lite_index_->set_last_added_document_id(document_id); 177 } 178 } 179 180 // Returns debug information for the index in out. 181 // verbosity = BASIC, simplest debug information - just the lexicons and lite 182 // index. 183 // verbosity = DETAILED, more detailed debug information including raw 184 // postings lists. GetDebugInfo(DebugInfoVerbosity::Code verbosity)185 IndexDebugInfoProto GetDebugInfo(DebugInfoVerbosity::Code verbosity) const { 186 IndexDebugInfoProto debug_info; 187 *debug_info.mutable_index_storage_info() = GetStorageInfo(); 188 *debug_info.mutable_lite_index_info() = 189 lite_index_->GetDebugInfo(verbosity); 190 *debug_info.mutable_main_index_info() = 191 main_index_->GetDebugInfo(verbosity); 192 return debug_info; 193 } 194 PublishQueryStats(QueryStatsProto * query_stats)195 void PublishQueryStats(QueryStatsProto* query_stats) const { 196 query_stats->set_lite_index_hit_buffer_byte_size( 197 lite_index_->GetHitBufferByteSize()); 198 query_stats->set_lite_index_hit_buffer_unsorted_byte_size( 199 lite_index_->GetHitBufferUnsortedByteSize()); 200 } 201 202 // Returns the byte size of the all the elements held in the index. This 203 // excludes the size of any internal metadata of the index, e.g. the index's 204 // header. 205 // 206 // Returns: 207 // Byte size on success 208 // INTERNAL_ERROR on IO error GetElementsSize()209 libtextclassifier3::StatusOr<int64_t> GetElementsSize() const { 210 ICING_ASSIGN_OR_RETURN(int64_t lite_index_size, 211 lite_index_->GetElementsSize()); 212 ICING_ASSIGN_OR_RETURN(int64_t main_index_size, 213 main_index_->GetElementsSize()); 214 return lite_index_size + main_index_size; 215 } 216 217 // Calculates the StorageInfo for the Index. 218 // 219 // If an IO error occurs while trying to calculate the value for a field, then 220 // that field will be set to -1. 221 IndexStorageInfoProto GetStorageInfo() const; 222 223 // Create an iterator to iterate through all doc hit infos in the index that 224 // match the term. term_start_index is the start index of the given term in 225 // the search query. unnormalized_term_length is the length of the given 226 // unnormalized term in the search query not listed in the mask. 227 // Eg. section_id_mask = 1U << 3; would only return hits that occur in 228 // section 3. 229 // 230 // Returns: 231 // unique ptr to a valid DocHitInfoIterator that matches the term 232 // INVALID_ARGUMENT if given an invalid term_match_type 233 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( 234 const std::string& term, int term_start_index, 235 int unnormalized_term_length, SectionIdMask section_id_mask, 236 TermMatchType::Code term_match_type, bool need_hit_term_frequency = true); 237 238 // Finds terms with the given prefix in the given namespaces. If 239 // 'namespace_ids' is empty, returns results from all the namespaces. Results 240 // are sorted in decreasing order of hit count. Number of results are no more 241 // than 'num_to_return'. 242 // 243 // Returns: 244 // A list of TermMetadata on success 245 // INTERNAL_ERROR if failed to access term data. 246 libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix( 247 const std::string& prefix, int num_to_return, 248 TermMatchType::Code scoring_match_type, 249 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by, 250 const SuggestionResultChecker* suggestion_result_checker); 251 252 // A class that can be used to add hits to the index. 253 // 254 // An editor groups hits from a particular section within a document together 255 // and dedupes hits for the same term within a section. This removes the 256 // burden of deduping from the caller and direct access to the index 257 // implementation allows for more efficient deduping. 258 class Editor { 259 public: 260 // Does not take any ownership, and all pointers must refer to valid objects 261 // that outlive the one constructed. 262 // TODO(b/141180665): Add nullptr checks for the raw pointers Editor(const TermIdCodec * term_id_codec,LiteIndex * lite_index,DocumentId document_id,SectionId section_id,NamespaceId namespace_id)263 Editor(const TermIdCodec* term_id_codec, LiteIndex* lite_index, 264 DocumentId document_id, SectionId section_id, 265 NamespaceId namespace_id) 266 : term_id_codec_(term_id_codec), 267 lite_index_(lite_index), 268 document_id_(document_id), 269 namespace_id_(namespace_id), 270 section_id_(section_id) {} 271 272 // Buffer the term in seen_tokens_. 273 libtextclassifier3::Status BufferTerm(std::string_view term, 274 TermMatchType::Code match_type); 275 // Index all the terms stored in seen_tokens_. 276 libtextclassifier3::Status IndexAllBufferedTerms(); 277 278 private: 279 // The Editor is able to store previously seen terms as TermIds. This is 280 // is more efficient than a client doing this externally because TermIds are 281 // not exposed to clients. 282 struct HitDetails { 283 TermMatchType::Code match_type; 284 Hit::TermFrequency term_frequency; 285 }; 286 std::unordered_map<uint32_t, HitDetails> seen_tokens_; 287 const TermIdCodec* term_id_codec_; 288 LiteIndex* lite_index_; 289 DocumentId document_id_; 290 NamespaceId namespace_id_; 291 SectionId section_id_; 292 }; Edit(DocumentId document_id,SectionId section_id,NamespaceId namespace_id)293 Editor Edit(DocumentId document_id, SectionId section_id, 294 NamespaceId namespace_id) { 295 return Editor(term_id_codec_.get(), lite_index_.get(), document_id, 296 section_id, namespace_id); 297 } 298 WantsMerge()299 bool WantsMerge() const { return lite_index_->WantsMerge(); } 300 301 // Merges newly-added hits in the LiteIndex into the MainIndex. 302 // 303 // RETURNS: 304 // - INTERNAL on IO error while writing to the MainIndex. 305 // - RESOURCE_EXHAUSTED error if unable to grow the index. Merge()306 libtextclassifier3::Status Merge() { 307 ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs, 308 main_index_->MergeLexicon(lite_index_->lexicon())); 309 ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> term_id_hit_pairs, 310 MainIndexMerger::TranslateAndExpandLiteHits( 311 *lite_index_, *term_id_codec_, outputs)); 312 ICING_RETURN_IF_ERROR(main_index_->AddHits( 313 *term_id_codec_, std::move(outputs.backfill_map), 314 std::move(term_id_hit_pairs), lite_index_->last_added_document_id())); 315 ICING_RETURN_IF_ERROR(main_index_->PersistToDisk()); 316 return lite_index_->Reset(); 317 } 318 319 // Whether the LiteIndex HitBuffer requires sorting. This is only true if 320 // Icing has enabled sorting during indexing time, and the HitBuffer's 321 // unsorted tail has exceeded the lite_index_sort_size. LiteIndexNeedSort()322 bool LiteIndexNeedSort() const { 323 return options_.lite_index_sort_at_indexing && 324 lite_index_->HasUnsortedHitsExceedingSortThreshold(); 325 } 326 327 // Sorts the LiteIndex HitBuffer. SortLiteIndex()328 void SortLiteIndex() { lite_index_->SortHits(); } 329 330 // Reduces internal file sizes by reclaiming space of deleted documents. 331 // new_last_added_document_id will be used to update the last added document 332 // id in the lite index. 333 // 334 // Returns: 335 // OK on success 336 // INTERNAL_ERROR on IO error, this indicates that the index may be in an 337 // invalid state and should be cleared. 338 libtextclassifier3::Status Optimize( 339 const std::vector<DocumentId>& document_id_old_to_new, 340 DocumentId new_last_added_document_id); 341 342 private: Index(const Options & options,std::unique_ptr<TermIdCodec> term_id_codec,std::unique_ptr<LiteIndex> lite_index,std::unique_ptr<MainIndex> main_index,const Filesystem * filesystem)343 Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec, 344 std::unique_ptr<LiteIndex> lite_index, 345 std::unique_ptr<MainIndex> main_index, const Filesystem* filesystem) 346 : lite_index_(std::move(lite_index)), 347 main_index_(std::move(main_index)), 348 options_(options), 349 term_id_codec_(std::move(term_id_codec)), 350 filesystem_(filesystem) {} 351 352 libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix( 353 const std::string& prefix, 354 SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by, 355 const SuggestionResultChecker* suggestion_result_checker); 356 357 std::unique_ptr<LiteIndex> lite_index_; 358 std::unique_ptr<MainIndex> main_index_; 359 const Options options_; 360 std::unique_ptr<TermIdCodec> term_id_codec_; 361 const Filesystem* filesystem_; 362 }; 363 364 } // namespace lib 365 } // namespace icing 366 367 #endif // ICING_INDEX_INDEX_H_ 368