xref: /aosp_15_r20/external/icing/icing/index/lite/doc-hit-info-iterator-term-lite.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
16 #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
17 
18 #include <array>
19 #include <string>
20 #include <utility>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "icing/text_classifier/lib3/utils/base/statusor.h"
25 #include "icing/index/hit/doc-hit-info.h"
26 #include "icing/index/hit/hit.h"
27 #include "icing/index/iterator/doc-hit-info-iterator.h"
28 #include "icing/index/lite/lite-index.h"
29 #include "icing/index/term-id-codec.h"
30 #include "icing/schema/section.h"
31 
32 namespace icing {
33 namespace lib {
34 
35 class DocHitInfoIteratorTermLite : public DocHitInfoLeafIterator {
36  public:
DocHitInfoIteratorTermLite(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_restrict_mask,bool need_hit_term_frequency)37   explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
38                                       LiteIndex* lite_index,
39                                       const std::string& term,
40                                       int term_start_index,
41                                       int unnormalized_term_length,
42                                       SectionIdMask section_restrict_mask,
43                                       bool need_hit_term_frequency)
44       : term_(term),
45         term_start_index_(term_start_index),
46         unnormalized_term_length_(unnormalized_term_length),
47         lite_index_(lite_index),
48         cached_hits_idx_(-1),
49         term_id_codec_(term_id_codec),
50         num_advance_calls_(0),
51         section_restrict_mask_(section_restrict_mask),
52         need_hit_term_frequency_(need_hit_term_frequency) {}
53 
54   libtextclassifier3::Status Advance() override;
55 
56   libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
57 
GetCallStats()58   CallStats GetCallStats() const override {
59     return CallStats(
60         /*num_leaf_advance_calls_lite_index_in=*/num_advance_calls_,
61         /*num_leaf_advance_calls_main_index_in=*/0,
62         /*num_leaf_advance_calls_integer_index_in=*/0,
63         /*num_leaf_advance_calls_no_index_in=*/0,
64         /*num_blocks_inspected_in=*/0);
65   }
66 
67   void PopulateMatchedTermsStats(
68       std::vector<TermMatchInfo>* matched_terms_stats,
69       SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
70     if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
71       // Current hit isn't valid, return.
72       return;
73     }
74     SectionIdMask section_mask =
75         doc_hit_info_.hit_section_ids_mask() & filtering_section_mask;
76     SectionIdMask section_mask_copy = section_mask;
77     std::array<Hit::TermFrequency, kTotalNumSections> section_term_frequencies =
78         {Hit::kNoTermFrequency};
79     while (section_mask_copy) {
80       SectionId section_id = __builtin_ctzll(section_mask_copy);
81       if (need_hit_term_frequency_) {
82         section_term_frequencies.at(section_id) =
83             cached_hit_term_frequency_.at(cached_hits_idx_)[section_id];
84       }
85       section_mask_copy &= ~(UINT64_C(1) << section_id);
86     }
87     TermMatchInfo term_stats(term_, section_mask,
88                              std::move(section_term_frequencies));
89 
90     for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
91       if (cur_term_stats.term == term_stats.term) {
92         // Same docId and same term, we don't need to add the term and the term
93         // frequency should always be the same
94         return;
95       }
96     }
97     matched_terms_stats->push_back(std::move(term_stats));
98   }
99 
100  protected:
101   // Add DocHitInfos corresponding to term_ to cached_hits_.
102   //
103   // Returns:
104   //   - OK, on success
105   //   - NOT_FOUND if no term matching term_ was found in the lexicon.
106   //   - INVALID_ARGUMENT if unable to properly encode the termid
107   virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
108 
109   const std::string term_;
110   // The start index of the given term in the search query
111   int term_start_index_;
112   // The length of the given unnormalized term in the search query
113   int unnormalized_term_length_;
114   LiteIndex* const lite_index_;
115   // Stores hits retrieved from the index. This may only be a subset of the hits
116   // that are present in the index. Current value pointed to by the Iterator is
117   // tracked by cached_hits_idx_.
118   std::vector<DocHitInfo> cached_hits_;
119   std::vector<Hit::TermFrequencyArray> cached_hit_term_frequency_;
120   int cached_hits_idx_;
121   const TermIdCodec* term_id_codec_;
122   int num_advance_calls_;
123   // Mask indicating which sections hits should be considered for.
124   // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
125   const SectionIdMask section_restrict_mask_;
126   const bool need_hit_term_frequency_;
127 };
128 
129 class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
130  public:
DocHitInfoIteratorTermLiteExact(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_id_mask,bool need_hit_term_frequency)131   explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec,
132                                            LiteIndex* lite_index,
133                                            const std::string& term,
134                                            int term_start_index,
135                                            int unnormalized_term_length,
136                                            SectionIdMask section_id_mask,
137                                            bool need_hit_term_frequency)
138       : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
139                                    term_start_index, unnormalized_term_length,
140                                    section_id_mask, need_hit_term_frequency) {}
141 
142   std::string ToString() const override;
143 
144  protected:
145   libtextclassifier3::Status RetrieveMoreHits() override;
146 };
147 
148 class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
149  public:
DocHitInfoIteratorTermLitePrefix(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_id_mask,bool need_hit_term_frequency)150   explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec,
151                                             LiteIndex* lite_index,
152                                             const std::string& term,
153                                             int term_start_index,
154                                             int unnormalized_term_length,
155                                             SectionIdMask section_id_mask,
156                                             bool need_hit_term_frequency)
157       : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
158                                    term_start_index, unnormalized_term_length,
159                                    section_id_mask, need_hit_term_frequency) {}
160 
161   std::string ToString() const override;
162 
163  protected:
164   libtextclassifier3::Status RetrieveMoreHits() override;
165 
166  private:
167   // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and
168   // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be
169   // merged.
170   void SortDocumentIds();
171   void SortAndDedupeDocumentIds();
172 };
173 
174 }  // namespace lib
175 }  // namespace icing
176 
177 #endif  // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
178