1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ 16 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ 17 18 #include <array> 19 #include <string> 20 #include <utility> 21 #include <vector> 22 23 #include "icing/text_classifier/lib3/utils/base/status.h" 24 #include "icing/text_classifier/lib3/utils/base/statusor.h" 25 #include "icing/index/hit/doc-hit-info.h" 26 #include "icing/index/hit/hit.h" 27 #include "icing/index/iterator/doc-hit-info-iterator.h" 28 #include "icing/index/lite/lite-index.h" 29 #include "icing/index/term-id-codec.h" 30 #include "icing/schema/section.h" 31 32 namespace icing { 33 namespace lib { 34 35 class DocHitInfoIteratorTermLite : public DocHitInfoLeafIterator { 36 public: DocHitInfoIteratorTermLite(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_restrict_mask,bool need_hit_term_frequency)37 explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec, 38 LiteIndex* lite_index, 39 const std::string& term, 40 int term_start_index, 41 int unnormalized_term_length, 42 SectionIdMask section_restrict_mask, 43 bool need_hit_term_frequency) 44 : term_(term), 45 term_start_index_(term_start_index), 46 unnormalized_term_length_(unnormalized_term_length), 47 lite_index_(lite_index), 48 cached_hits_idx_(-1), 49 term_id_codec_(term_id_codec), 50 num_advance_calls_(0), 51 section_restrict_mask_(section_restrict_mask), 52 need_hit_term_frequency_(need_hit_term_frequency) {} 53 54 libtextclassifier3::Status Advance() override; 55 56 libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override; 57 GetCallStats()58 CallStats GetCallStats() const override { 59 return CallStats( 60 /*num_leaf_advance_calls_lite_index_in=*/num_advance_calls_, 61 /*num_leaf_advance_calls_main_index_in=*/0, 62 /*num_leaf_advance_calls_integer_index_in=*/0, 63 /*num_leaf_advance_calls_no_index_in=*/0, 64 /*num_blocks_inspected_in=*/0); 65 } 66 67 void PopulateMatchedTermsStats( 68 std::vector<TermMatchInfo>* matched_terms_stats, 69 SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { 70 if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) { 71 // Current hit isn't valid, return. 72 return; 73 } 74 SectionIdMask section_mask = 75 doc_hit_info_.hit_section_ids_mask() & filtering_section_mask; 76 SectionIdMask section_mask_copy = section_mask; 77 std::array<Hit::TermFrequency, kTotalNumSections> section_term_frequencies = 78 {Hit::kNoTermFrequency}; 79 while (section_mask_copy) { 80 SectionId section_id = __builtin_ctzll(section_mask_copy); 81 if (need_hit_term_frequency_) { 82 section_term_frequencies.at(section_id) = 83 cached_hit_term_frequency_.at(cached_hits_idx_)[section_id]; 84 } 85 section_mask_copy &= ~(UINT64_C(1) << section_id); 86 } 87 TermMatchInfo term_stats(term_, section_mask, 88 std::move(section_term_frequencies)); 89 90 for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) { 91 if (cur_term_stats.term == term_stats.term) { 92 // Same docId and same term, we don't need to add the term and the term 93 // frequency should always be the same 94 return; 95 } 96 } 97 matched_terms_stats->push_back(std::move(term_stats)); 98 } 99 100 protected: 101 // Add DocHitInfos corresponding to term_ to cached_hits_. 102 // 103 // Returns: 104 // - OK, on success 105 // - NOT_FOUND if no term matching term_ was found in the lexicon. 106 // - INVALID_ARGUMENT if unable to properly encode the termid 107 virtual libtextclassifier3::Status RetrieveMoreHits() = 0; 108 109 const std::string term_; 110 // The start index of the given term in the search query 111 int term_start_index_; 112 // The length of the given unnormalized term in the search query 113 int unnormalized_term_length_; 114 LiteIndex* const lite_index_; 115 // Stores hits retrieved from the index. This may only be a subset of the hits 116 // that are present in the index. Current value pointed to by the Iterator is 117 // tracked by cached_hits_idx_. 118 std::vector<DocHitInfo> cached_hits_; 119 std::vector<Hit::TermFrequencyArray> cached_hit_term_frequency_; 120 int cached_hits_idx_; 121 const TermIdCodec* term_id_codec_; 122 int num_advance_calls_; 123 // Mask indicating which sections hits should be considered for. 124 // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired. 125 const SectionIdMask section_restrict_mask_; 126 const bool need_hit_term_frequency_; 127 }; 128 129 class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite { 130 public: DocHitInfoIteratorTermLiteExact(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_id_mask,bool need_hit_term_frequency)131 explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec, 132 LiteIndex* lite_index, 133 const std::string& term, 134 int term_start_index, 135 int unnormalized_term_length, 136 SectionIdMask section_id_mask, 137 bool need_hit_term_frequency) 138 : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term, 139 term_start_index, unnormalized_term_length, 140 section_id_mask, need_hit_term_frequency) {} 141 142 std::string ToString() const override; 143 144 protected: 145 libtextclassifier3::Status RetrieveMoreHits() override; 146 }; 147 148 class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite { 149 public: DocHitInfoIteratorTermLitePrefix(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_id_mask,bool need_hit_term_frequency)150 explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec, 151 LiteIndex* lite_index, 152 const std::string& term, 153 int term_start_index, 154 int unnormalized_term_length, 155 SectionIdMask section_id_mask, 156 bool need_hit_term_frequency) 157 : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term, 158 term_start_index, unnormalized_term_length, 159 section_id_mask, need_hit_term_frequency) {} 160 161 std::string ToString() const override; 162 163 protected: 164 libtextclassifier3::Status RetrieveMoreHits() override; 165 166 private: 167 // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and 168 // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be 169 // merged. 170 void SortDocumentIds(); 171 void SortAndDedupeDocumentIds(); 172 }; 173 174 } // namespace lib 175 } // namespace icing 176 177 #endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_ 178