// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/main/main-index-merger.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/posting_list/posting-list-common.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

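// Accumulates the hits to keep for a single (term_id, document_id,
// section_id) group in a sorted hit stream: at most one exact hit and one
// prefix hit, with duplicate hits merged by summing their term frequencies
// (capped at Hit::kMaxTermFrequency). Used by DedupeHits below.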
class HitSelector {
 public:
  // Returns whether or not term_id_hit_pair has the same term_id, document_id
  // and section_id as the previously selected hits.
  bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) {
    return prev_.term_id() == term_id_hit_pair.term_id() &&
           prev_.hit().document_id() == term_id_hit_pair.hit().document_id() &&
           prev_.hit().section_id() == term_id_hit_pair.hit().section_id();
  }

  // Merges term_id_hit_pair with previously added hits.
  void SelectIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (term_id_hit_pair.hit().is_prefix_hit()) {
      SelectPrefixHitIfBetter(term_id_hit_pair);
    } else {
      SelectExactHitIfBetter(term_id_hit_pair);
    }
    prev_ = term_id_hit_pair;
  }

  // Adds all valid, selected hits to hits starting at position pos in hits.
  // Returns the offset in hits after the position of the last added hit.
  // This function may add zero, one, or two hits depending on whether the
  // HitSelector holds a valid exact hit, a valid prefix hit, both, or neither.
  size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) {
    // Given the prefix/exact hits for a given term+docid+sectionid, push the
    // needed hits into the hits array at offset pos. Return the new pos.
    if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_exact_hit_;
      const Hit& prefix_hit = best_prefix_hit_.hit();
      // The prefix hit has term_frequency equal to the sum of the term
      // frequencies, capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          prefix_hit.term_frequency() + best_exact_hit_.hit().term_frequency());
      best_prefix_hit_ = TermIdHitPair(
          best_prefix_hit_.term_id(),
          Hit(prefix_hit.section_id(), prefix_hit.document_id(),
              final_term_frequency, prefix_hit.is_in_prefix_section(),
              prefix_hit.is_prefix_hit(), prefix_hit.is_stemmed_hit()));
      (*hits)[pos++] = best_prefix_hit_;
      // Ensure sorted.
      if (best_prefix_hit_.hit() < best_exact_hit_.hit()) {
        std::swap((*hits)[pos - 1], (*hits)[pos - 2]);
      }
    } else if (best_prefix_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_prefix_hit_;
    } else if (best_exact_hit_.hit().is_valid()) {
      (*hits)[pos++] = best_exact_hit_;
    }

    return pos;
  }

  void Reset() {
    best_prefix_hit_ = TermIdHitPair();
    best_exact_hit_ = TermIdHitPair();
    prev_ = TermIdHitPair();
  }

 private:
  void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (!best_prefix_hit_.hit().is_valid()) {
      best_prefix_hit_ = term_id_hit_pair;
    } else {
      const Hit& hit = term_id_hit_pair.hit();
      // Create a new prefix hit with term_frequency as the sum of the term
      // frequencies. The term frequency is capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          hit.term_frequency() + best_prefix_hit_.hit().term_frequency());
      best_prefix_hit_ = TermIdHitPair(
          term_id_hit_pair.term_id(),
          Hit(hit.section_id(), hit.document_id(), final_term_frequency,
              best_prefix_hit_.hit().is_in_prefix_section(),
              best_prefix_hit_.hit().is_prefix_hit(),
              best_prefix_hit_.hit().is_stemmed_hit()));
    }
  }

  void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
    if (!best_exact_hit_.hit().is_valid()) {
      best_exact_hit_ = term_id_hit_pair;
    } else {
      const Hit& hit = term_id_hit_pair.hit();
      // Create a new exact hit with term_frequency as the sum of the term
      // frequencies. The term frequency is capped at kMaxTermFrequency.
      Hit::TermFrequency final_term_frequency = std::min(
          static_cast<int>(Hit::kMaxTermFrequency),
          hit.term_frequency() + best_exact_hit_.hit().term_frequency());
      best_exact_hit_ = TermIdHitPair(
          term_id_hit_pair.term_id(),
          Hit(hit.section_id(), hit.document_id(), final_term_frequency,
              best_exact_hit_.hit().is_in_prefix_section(),
              best_exact_hit_.hit().is_prefix_hit(),
              best_exact_hit_.hit().is_stemmed_hit()));
    }
  }

  TermIdHitPair best_prefix_hit_;
  TermIdHitPair best_exact_hit_;
  TermIdHitPair prev_;
};

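// Orders hits primarily by the index block of their term's main posting list
// and secondarily by term_id and hit value. See the comment in operator() for
// why the block ordering matters.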
class HitComparator {
 public:
  explicit HitComparator(
      const TermIdCodec& term_id_codec,
      const std::unordered_map<uint32_t, int>& main_tvi_to_block_index)
      : term_id_codec_(&term_id_codec),
        main_tvi_to_block_index_(&main_tvi_to_block_index) {}

  bool operator()(const TermIdHitPair& lhs, const TermIdHitPair& rhs) const {
    // Primary sort by index block. This achieves two things:
    // 1. It reduces the number of flash writes by grouping together new hits
    // for terms whose posting lists might share the same index block.
    // 2. More importantly, this ensures that newly added backfill branch points
    // will be populated first (because all newly added terms have an invalid
    // block index of 0) before any new hits are added to the postings lists
    // that they backfill from.
    int lhs_index_block = GetIndexBlock(lhs.term_id());
    int rhs_index_block = GetIndexBlock(rhs.term_id());
    if (lhs_index_block == rhs_index_block) {
      // Secondary sort by term_id and hit.
      return lhs.value() < rhs.value();
    }
    return lhs_index_block < rhs_index_block;
  }

 private:
  int GetIndexBlock(uint32_t term_id) const {
    auto term_info_or = term_id_codec_->DecodeTermInfo(term_id);
    if (!term_info_or.ok()) {
      ICING_LOG(WARNING)
          << "Unable to decode term-info during merge. This shouldn't happen.";
      return kInvalidBlockIndex;
    }
    TermIdCodec::DecodedTermInfo term_info =
        std::move(term_info_or).ValueOrDie();
    auto itr = main_tvi_to_block_index_->find(term_info.tvi);
    if (itr == main_tvi_to_block_index_->end()) {
      return kInvalidBlockIndex;
    }
    return itr->second;
  }

  const TermIdCodec* term_id_codec_;
  const std::unordered_map<uint32_t, int>* main_tvi_to_block_index_;
};

// A helper function to dedupe hits stored in hits. Suppose that the lite index
// contained a single document with two hits in a single prefix section: "foot"
// and "fool". When expanded, there would be four hits:
// {"fo", docid0, sectionid0}
// {"fo", docid0, sectionid0}
// {"foot", docid0, sectionid0}
// {"fool", docid0, sectionid0}
//
// The first two are duplicates of each other. So, this function will dedupe
// and shrink hits to be:
// {"fo", docid0, sectionid0}
// {"foot", docid0, sectionid0}
// {"fool", docid0, sectionid0}
//
// When two or more prefix hits are duplicates, merge them into one hit with
// term frequency as the sum of the term frequencies. If there is both an exact
// and a prefix hit for the same term, keep the exact hit as it is and update
// the prefix hit so that its term frequency is the sum of the term
// frequencies.
void DedupeHits(
    std::vector<TermIdHitPair>* hits, const TermIdCodec& term_id_codec,
    const std::unordered_map<uint32_t, int>& main_tvi_to_block_index) {
  // Now all terms are grouped together and all hits for a term are sorted.
  // Merge equivalent hits into one.
  std::sort(hits->begin(), hits->end(),
            HitComparator(term_id_codec, main_tvi_to_block_index));
  size_t current_offset = 0;
  HitSelector hit_selector;
  for (const TermIdHitPair& term_id_hit_pair : *hits) {
    if (!hit_selector.IsEquivalentHit(term_id_hit_pair)) {
      // We've reached a new hit. Insert the previously selected hits that we
      // had accumulated and reset to add this new hit.
      current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
      hit_selector.Reset();
    }
    // Update best exact and prefix hit.
    hit_selector.SelectIfBetter(term_id_hit_pair);
  }

  // Push last.
  current_offset = hit_selector.InsertSelectedHits(current_offset, hits);

  hits->resize(current_offset);
}

// Based on experiments with full prefix expansion, the multiplier
// is ~4x.
constexpr int kAvgPrefixesPerTerm = 4;

}  // namespace

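// Translates each lite-index hit to its main-index term id, expands hits in
// prefix sections into additional prefix hits for every prefix term present
// in the main lexicon, and then dedupes the result with DedupeHits.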
libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
MainIndexMerger::TranslateAndExpandLiteHits(
    const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
    const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs) {
  std::vector<TermIdHitPair> hits;
  if (lite_index.empty()) {
    return hits;
  }
  // Reserve enough space for the average number of prefixes per term and the
  // terms themselves.
  hits.reserve(lite_index.size() * (kAvgPrefixesPerTerm + 1));

  // Translate lite tvis to main tvis.
  for (const TermIdHitPair& term_id_hit_pair : lite_index) {
    uint32_t cur_term_id = term_id_hit_pair.term_id();
    ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
                           term_id_codec.DecodeTermInfo(cur_term_id));
    Hit hit(term_id_hit_pair.hit());

    // 1. Translate and push original.
    auto itr =
        lexicon_merge_outputs.other_tvi_to_main_tvi.find(cur_decoded_term.tvi);
    if (itr == lexicon_merge_outputs.other_tvi_to_main_tvi.cend()) {
      // b/37273773
      return absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Trying to translate lite tvi %u that was never added to the lexicon",
          cur_decoded_term.tvi));
    }
    ICING_ASSIGN_OR_RETURN(uint32_t term_id,
                           term_id_codec.EncodeTvi(itr->second, TviType::MAIN));
    hits.emplace_back(term_id, hit);

    // 2. Expand hits in prefix sections.
    if (hit.is_in_prefix_section()) {
      // Hit was in a prefix section. Push prefixes. Turn on prefix bit.
      auto itr_prefixes =
          lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.find(
              cur_decoded_term.tvi);
      if (itr_prefixes ==
          lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.end()) {
        ICING_VLOG(1) << "No necessary prefix expansion for "
                      << cur_decoded_term.tvi;
        continue;
      }
      // The tvis of all prefixes of this hit's term that appear in the main
      // lexicon are between [prefix_tvis_buf[offset],
      // prefix_tvis_buf[offset+len]).
      size_t offset = itr_prefixes->second.first;
      size_t len = itr_prefixes->second.second;
      size_t offset_end_exclusive = offset + len;
      Hit prefix_hit(hit.section_id(), hit.document_id(), hit.term_frequency(),
                     /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true,
                     /*is_stemmed_hit=*/false);
      for (; offset < offset_end_exclusive; ++offset) {
        // Take the tvi (in the main lexicon) of each prefix term.
        uint32_t prefix_main_tvi =
            lexicon_merge_outputs.prefix_tvis_buf[offset];
        // Convert it to a term_id.
        ICING_ASSIGN_OR_RETURN(
            uint32_t prefix_term_id,
            term_id_codec.EncodeTvi(prefix_main_tvi, TviType::MAIN));
        // Add an element for this prefix TermId and prefix Hit to hits.
        hits.emplace_back(prefix_term_id, prefix_hit);
      }
    }
  }
  // 3. Remove any duplicate hits.
  DedupeHits(&hits, term_id_codec,
             lexicon_merge_outputs.main_tvi_to_block_index);
  return hits;
}

}  // namespace lib
}  // namespace icing