xref: /aosp_15_r20/external/icing/icing/index/lite/doc-hit-info-iterator-term-lite.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1*8b6cd535SAndroid Build Coastguard Worker // Copyright (C) 2019 Google LLC
2*8b6cd535SAndroid Build Coastguard Worker //
3*8b6cd535SAndroid Build Coastguard Worker // Licensed under the Apache License, Version 2.0 (the "License");
4*8b6cd535SAndroid Build Coastguard Worker // you may not use this file except in compliance with the License.
5*8b6cd535SAndroid Build Coastguard Worker // You may obtain a copy of the License at
6*8b6cd535SAndroid Build Coastguard Worker //
7*8b6cd535SAndroid Build Coastguard Worker //      http://www.apache.org/licenses/LICENSE-2.0
8*8b6cd535SAndroid Build Coastguard Worker //
9*8b6cd535SAndroid Build Coastguard Worker // Unless required by applicable law or agreed to in writing, software
10*8b6cd535SAndroid Build Coastguard Worker // distributed under the License is distributed on an "AS IS" BASIS,
11*8b6cd535SAndroid Build Coastguard Worker // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*8b6cd535SAndroid Build Coastguard Worker // See the License for the specific language governing permissions and
13*8b6cd535SAndroid Build Coastguard Worker // limitations under the License.
14*8b6cd535SAndroid Build Coastguard Worker 
15*8b6cd535SAndroid Build Coastguard Worker #ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
16*8b6cd535SAndroid Build Coastguard Worker #define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
17*8b6cd535SAndroid Build Coastguard Worker 
18*8b6cd535SAndroid Build Coastguard Worker #include <array>
19*8b6cd535SAndroid Build Coastguard Worker #include <string>
20*8b6cd535SAndroid Build Coastguard Worker #include <utility>
21*8b6cd535SAndroid Build Coastguard Worker #include <vector>
22*8b6cd535SAndroid Build Coastguard Worker 
23*8b6cd535SAndroid Build Coastguard Worker #include "icing/text_classifier/lib3/utils/base/status.h"
24*8b6cd535SAndroid Build Coastguard Worker #include "icing/text_classifier/lib3/utils/base/statusor.h"
25*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/hit/doc-hit-info.h"
26*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/hit/hit.h"
27*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/iterator/doc-hit-info-iterator.h"
28*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/lite/lite-index.h"
29*8b6cd535SAndroid Build Coastguard Worker #include "icing/index/term-id-codec.h"
30*8b6cd535SAndroid Build Coastguard Worker #include "icing/schema/section.h"
31*8b6cd535SAndroid Build Coastguard Worker 
32*8b6cd535SAndroid Build Coastguard Worker namespace icing {
33*8b6cd535SAndroid Build Coastguard Worker namespace lib {
34*8b6cd535SAndroid Build Coastguard Worker 
35*8b6cd535SAndroid Build Coastguard Worker class DocHitInfoIteratorTermLite : public DocHitInfoLeafIterator {
36*8b6cd535SAndroid Build Coastguard Worker  public:
DocHitInfoIteratorTermLite(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_restrict_mask,bool need_hit_term_frequency)37*8b6cd535SAndroid Build Coastguard Worker   explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
38*8b6cd535SAndroid Build Coastguard Worker                                       LiteIndex* lite_index,
39*8b6cd535SAndroid Build Coastguard Worker                                       const std::string& term,
40*8b6cd535SAndroid Build Coastguard Worker                                       int term_start_index,
41*8b6cd535SAndroid Build Coastguard Worker                                       int unnormalized_term_length,
42*8b6cd535SAndroid Build Coastguard Worker                                       SectionIdMask section_restrict_mask,
43*8b6cd535SAndroid Build Coastguard Worker                                       bool need_hit_term_frequency)
44*8b6cd535SAndroid Build Coastguard Worker       : term_(term),
45*8b6cd535SAndroid Build Coastguard Worker         term_start_index_(term_start_index),
46*8b6cd535SAndroid Build Coastguard Worker         unnormalized_term_length_(unnormalized_term_length),
47*8b6cd535SAndroid Build Coastguard Worker         lite_index_(lite_index),
48*8b6cd535SAndroid Build Coastguard Worker         cached_hits_idx_(-1),
49*8b6cd535SAndroid Build Coastguard Worker         term_id_codec_(term_id_codec),
50*8b6cd535SAndroid Build Coastguard Worker         num_advance_calls_(0),
51*8b6cd535SAndroid Build Coastguard Worker         section_restrict_mask_(section_restrict_mask),
52*8b6cd535SAndroid Build Coastguard Worker         need_hit_term_frequency_(need_hit_term_frequency) {}
53*8b6cd535SAndroid Build Coastguard Worker 
54*8b6cd535SAndroid Build Coastguard Worker   libtextclassifier3::Status Advance() override;
55*8b6cd535SAndroid Build Coastguard Worker 
56*8b6cd535SAndroid Build Coastguard Worker   libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
57*8b6cd535SAndroid Build Coastguard Worker 
GetCallStats()58*8b6cd535SAndroid Build Coastguard Worker   CallStats GetCallStats() const override {
59*8b6cd535SAndroid Build Coastguard Worker     return CallStats(
60*8b6cd535SAndroid Build Coastguard Worker         /*num_leaf_advance_calls_lite_index_in=*/num_advance_calls_,
61*8b6cd535SAndroid Build Coastguard Worker         /*num_leaf_advance_calls_main_index_in=*/0,
62*8b6cd535SAndroid Build Coastguard Worker         /*num_leaf_advance_calls_integer_index_in=*/0,
63*8b6cd535SAndroid Build Coastguard Worker         /*num_leaf_advance_calls_no_index_in=*/0,
64*8b6cd535SAndroid Build Coastguard Worker         /*num_blocks_inspected_in=*/0);
65*8b6cd535SAndroid Build Coastguard Worker   }
66*8b6cd535SAndroid Build Coastguard Worker 
67*8b6cd535SAndroid Build Coastguard Worker   void PopulateMatchedTermsStats(
68*8b6cd535SAndroid Build Coastguard Worker       std::vector<TermMatchInfo>* matched_terms_stats,
69*8b6cd535SAndroid Build Coastguard Worker       SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
70*8b6cd535SAndroid Build Coastguard Worker     if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
71*8b6cd535SAndroid Build Coastguard Worker       // Current hit isn't valid, return.
72*8b6cd535SAndroid Build Coastguard Worker       return;
73*8b6cd535SAndroid Build Coastguard Worker     }
74*8b6cd535SAndroid Build Coastguard Worker     SectionIdMask section_mask =
75*8b6cd535SAndroid Build Coastguard Worker         doc_hit_info_.hit_section_ids_mask() & filtering_section_mask;
76*8b6cd535SAndroid Build Coastguard Worker     SectionIdMask section_mask_copy = section_mask;
77*8b6cd535SAndroid Build Coastguard Worker     std::array<Hit::TermFrequency, kTotalNumSections> section_term_frequencies =
78*8b6cd535SAndroid Build Coastguard Worker         {Hit::kNoTermFrequency};
79*8b6cd535SAndroid Build Coastguard Worker     while (section_mask_copy) {
80*8b6cd535SAndroid Build Coastguard Worker       SectionId section_id = __builtin_ctzll(section_mask_copy);
81*8b6cd535SAndroid Build Coastguard Worker       if (need_hit_term_frequency_) {
82*8b6cd535SAndroid Build Coastguard Worker         section_term_frequencies.at(section_id) =
83*8b6cd535SAndroid Build Coastguard Worker             cached_hit_term_frequency_.at(cached_hits_idx_)[section_id];
84*8b6cd535SAndroid Build Coastguard Worker       }
85*8b6cd535SAndroid Build Coastguard Worker       section_mask_copy &= ~(UINT64_C(1) << section_id);
86*8b6cd535SAndroid Build Coastguard Worker     }
87*8b6cd535SAndroid Build Coastguard Worker     TermMatchInfo term_stats(term_, section_mask,
88*8b6cd535SAndroid Build Coastguard Worker                              std::move(section_term_frequencies));
89*8b6cd535SAndroid Build Coastguard Worker 
90*8b6cd535SAndroid Build Coastguard Worker     for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
91*8b6cd535SAndroid Build Coastguard Worker       if (cur_term_stats.term == term_stats.term) {
92*8b6cd535SAndroid Build Coastguard Worker         // Same docId and same term, we don't need to add the term and the term
93*8b6cd535SAndroid Build Coastguard Worker         // frequency should always be the same
94*8b6cd535SAndroid Build Coastguard Worker         return;
95*8b6cd535SAndroid Build Coastguard Worker       }
96*8b6cd535SAndroid Build Coastguard Worker     }
97*8b6cd535SAndroid Build Coastguard Worker     matched_terms_stats->push_back(std::move(term_stats));
98*8b6cd535SAndroid Build Coastguard Worker   }
99*8b6cd535SAndroid Build Coastguard Worker 
100*8b6cd535SAndroid Build Coastguard Worker  protected:
101*8b6cd535SAndroid Build Coastguard Worker   // Add DocHitInfos corresponding to term_ to cached_hits_.
102*8b6cd535SAndroid Build Coastguard Worker   //
103*8b6cd535SAndroid Build Coastguard Worker   // Returns:
104*8b6cd535SAndroid Build Coastguard Worker   //   - OK, on success
105*8b6cd535SAndroid Build Coastguard Worker   //   - NOT_FOUND if no term matching term_ was found in the lexicon.
106*8b6cd535SAndroid Build Coastguard Worker   //   - INVALID_ARGUMENT if unable to properly encode the termid
107*8b6cd535SAndroid Build Coastguard Worker   virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
108*8b6cd535SAndroid Build Coastguard Worker 
109*8b6cd535SAndroid Build Coastguard Worker   const std::string term_;
110*8b6cd535SAndroid Build Coastguard Worker   // The start index of the given term in the search query
111*8b6cd535SAndroid Build Coastguard Worker   int term_start_index_;
112*8b6cd535SAndroid Build Coastguard Worker   // The length of the given unnormalized term in the search query
113*8b6cd535SAndroid Build Coastguard Worker   int unnormalized_term_length_;
114*8b6cd535SAndroid Build Coastguard Worker   LiteIndex* const lite_index_;
115*8b6cd535SAndroid Build Coastguard Worker   // Stores hits retrieved from the index. This may only be a subset of the hits
116*8b6cd535SAndroid Build Coastguard Worker   // that are present in the index. Current value pointed to by the Iterator is
117*8b6cd535SAndroid Build Coastguard Worker   // tracked by cached_hits_idx_.
118*8b6cd535SAndroid Build Coastguard Worker   std::vector<DocHitInfo> cached_hits_;
119*8b6cd535SAndroid Build Coastguard Worker   std::vector<Hit::TermFrequencyArray> cached_hit_term_frequency_;
120*8b6cd535SAndroid Build Coastguard Worker   int cached_hits_idx_;
121*8b6cd535SAndroid Build Coastguard Worker   const TermIdCodec* term_id_codec_;
122*8b6cd535SAndroid Build Coastguard Worker   int num_advance_calls_;
123*8b6cd535SAndroid Build Coastguard Worker   // Mask indicating which sections hits should be considered for.
124*8b6cd535SAndroid Build Coastguard Worker   // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
125*8b6cd535SAndroid Build Coastguard Worker   const SectionIdMask section_restrict_mask_;
126*8b6cd535SAndroid Build Coastguard Worker   const bool need_hit_term_frequency_;
127*8b6cd535SAndroid Build Coastguard Worker };
128*8b6cd535SAndroid Build Coastguard Worker 
129*8b6cd535SAndroid Build Coastguard Worker class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
130*8b6cd535SAndroid Build Coastguard Worker  public:
DocHitInfoIteratorTermLiteExact(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_id_mask,bool need_hit_term_frequency)131*8b6cd535SAndroid Build Coastguard Worker   explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec,
132*8b6cd535SAndroid Build Coastguard Worker                                            LiteIndex* lite_index,
133*8b6cd535SAndroid Build Coastguard Worker                                            const std::string& term,
134*8b6cd535SAndroid Build Coastguard Worker                                            int term_start_index,
135*8b6cd535SAndroid Build Coastguard Worker                                            int unnormalized_term_length,
136*8b6cd535SAndroid Build Coastguard Worker                                            SectionIdMask section_id_mask,
137*8b6cd535SAndroid Build Coastguard Worker                                            bool need_hit_term_frequency)
138*8b6cd535SAndroid Build Coastguard Worker       : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
139*8b6cd535SAndroid Build Coastguard Worker                                    term_start_index, unnormalized_term_length,
140*8b6cd535SAndroid Build Coastguard Worker                                    section_id_mask, need_hit_term_frequency) {}
141*8b6cd535SAndroid Build Coastguard Worker 
142*8b6cd535SAndroid Build Coastguard Worker   std::string ToString() const override;
143*8b6cd535SAndroid Build Coastguard Worker 
144*8b6cd535SAndroid Build Coastguard Worker  protected:
145*8b6cd535SAndroid Build Coastguard Worker   libtextclassifier3::Status RetrieveMoreHits() override;
146*8b6cd535SAndroid Build Coastguard Worker };
147*8b6cd535SAndroid Build Coastguard Worker 
148*8b6cd535SAndroid Build Coastguard Worker class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
149*8b6cd535SAndroid Build Coastguard Worker  public:
DocHitInfoIteratorTermLitePrefix(const TermIdCodec * term_id_codec,LiteIndex * lite_index,const std::string & term,int term_start_index,int unnormalized_term_length,SectionIdMask section_id_mask,bool need_hit_term_frequency)150*8b6cd535SAndroid Build Coastguard Worker   explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec,
151*8b6cd535SAndroid Build Coastguard Worker                                             LiteIndex* lite_index,
152*8b6cd535SAndroid Build Coastguard Worker                                             const std::string& term,
153*8b6cd535SAndroid Build Coastguard Worker                                             int term_start_index,
154*8b6cd535SAndroid Build Coastguard Worker                                             int unnormalized_term_length,
155*8b6cd535SAndroid Build Coastguard Worker                                             SectionIdMask section_id_mask,
156*8b6cd535SAndroid Build Coastguard Worker                                             bool need_hit_term_frequency)
157*8b6cd535SAndroid Build Coastguard Worker       : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
158*8b6cd535SAndroid Build Coastguard Worker                                    term_start_index, unnormalized_term_length,
159*8b6cd535SAndroid Build Coastguard Worker                                    section_id_mask, need_hit_term_frequency) {}
160*8b6cd535SAndroid Build Coastguard Worker 
161*8b6cd535SAndroid Build Coastguard Worker   std::string ToString() const override;
162*8b6cd535SAndroid Build Coastguard Worker 
163*8b6cd535SAndroid Build Coastguard Worker  protected:
164*8b6cd535SAndroid Build Coastguard Worker   libtextclassifier3::Status RetrieveMoreHits() override;
165*8b6cd535SAndroid Build Coastguard Worker 
166*8b6cd535SAndroid Build Coastguard Worker  private:
167*8b6cd535SAndroid Build Coastguard Worker   // After retrieving DocHitInfos from the index, a DocHitInfo for docid 1 and
168*8b6cd535SAndroid Build Coastguard Worker   // "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos should be
169*8b6cd535SAndroid Build Coastguard Worker   // merged.
170*8b6cd535SAndroid Build Coastguard Worker   void SortDocumentIds();
171*8b6cd535SAndroid Build Coastguard Worker   void SortAndDedupeDocumentIds();
172*8b6cd535SAndroid Build Coastguard Worker };
173*8b6cd535SAndroid Build Coastguard Worker 
174*8b6cd535SAndroid Build Coastguard Worker }  // namespace lib
175*8b6cd535SAndroid Build Coastguard Worker }  // namespace icing
176*8b6cd535SAndroid Build Coastguard Worker 
177*8b6cd535SAndroid Build Coastguard Worker #endif  // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
178