xref: /aosp_15_r20/external/icing/icing/index/hit/hit.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_INDEX_HIT_HIT_H_
16 #define ICING_INDEX_HIT_HIT_H_
17 
18 #include <array>
19 #include <cstdint>
20 #include <cstring>
21 #include <limits>
22 
23 #include "icing/legacy/core/icing-packed-pod.h"
24 #include "icing/schema/section.h"
25 #include "icing/store/document-id.h"
26 
27 namespace icing {
28 namespace lib {
29 
30 // BasicHit is a specific encoding that refers to content within a document. A
31 // basic hit consists of:
32 // - a DocumentId
33 // - a SectionId
34 // referring to the document and section that the hit corresponds to.
35 //
36 // The hit is the most basic unit of the index and, when grouped together by
37 // term, can be used to encode what terms appear in what documents.
38 //
39 // BasicHit is for indices (e.g. numeric index) that don't require term
40 // frequency.
41 class BasicHit {
42  public:
43   // The datatype used to encode BasicHit information: the document_id and
44   // section_id.
45   using Value = uint32_t;
46 
47   // WARNING: Changing this value will invalidate any pre-existing posting lists
48   // on user devices.
49   //
50   // kInvalidValue contains:
51   // - 0 for unused bits. Note that unused bits are always 0 for both valid and
52   //   invalid BasicHit values.
53   // - Inverted kInvalidDocumentId
54   // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has
55   //   already invalidated the value. In fact, we currently use all 2^6 section
56   //   ids and there is no "invalid section id", so it doesn't matter what
57   //   SectionId we set for kInvalidValue.
58   static constexpr Value kInvalidValue = 0;
59 
60   explicit BasicHit(SectionId section_id, DocumentId document_id);
61 
BasicHit()62   explicit BasicHit() : value_(kInvalidValue) {}
BasicHit(Value value)63   explicit BasicHit(Value value) : value_(value) {}
64 
is_valid()65   bool is_valid() const { return value_ != kInvalidValue; }
value()66   Value value() const { return value_; }
67   DocumentId document_id() const;
68   SectionId section_id() const;
69 
70   bool operator<(const BasicHit& h2) const { return value_ < h2.value_; }
71   bool operator==(const BasicHit& h2) const { return value_ == h2.value_; }
72 
73  private:
74   // Value bits layout: 4 unused + 22 document_id + 6 section id.
75   //
76   // The Value is guaranteed to be an unsigned integer, but its size and the
77   // information it stores may change if the Hit's encoding format is changed.
78   Value value_;
79 } __attribute__((packed));
80 static_assert(sizeof(BasicHit) == 4, "");
81 
82 // Hit is a specific encoding that refers to content within a document. A hit
83 // consists of:
84 // - a DocumentId
85 // - a SectionId
86 //   - referring to the document and section that the hit corresponds to
87 // - Metadata about the hit:
88 //   - whether the Hit does not appear exactly in the document, but instead
89 //     represents a term that is a prefix of a term in the document
90 //     (is_prefix_hit)
91 //   - whether the Hit came from a section that has prefix expansion enabled
92 //     (is_in_prefix_section)
93 //   - whether the Hit has set any bitmask flags (has_flags)
94 //   - bitmasks in flags fields:
95 //     - whether the Hit has a TermFrequency other than the default value
96 //       (has_term_frequency)
97 //     - whether the Hit is a stemmed hit derived by stemming an original token
98 //       in the document (is_stemmed_hit)
99 //   - a term frequency for the hit
100 //
101 // The hit is the most basic unit of the index and, when grouped together by
102 // term, can be used to encode what terms appear in what documents.
103 class Hit {
104  public:
105   // The datatype used to encode Hit information: the document_id, section_id,
106   // and 3 flags: is_prefix_hit, is_hit_in_prefix_section and has_flags flag.
107   //
108   // The Value is guaranteed to be an unsigned integer, but its size and the
109   // information it stores may change if the Hit's encoding format is changed.
110   using Value = uint32_t;
111 
112   // WARNING: Changing this value will invalidate any pre-existing posting lists
113   // on user devices.
114   //
115   // kInvalidValue contains:
116   // - 0 for unused bits. Note that unused bits are always 0 for both valid and
117   //   invalid Hit values.
118   // - Inverted kInvalidDocumentId
119   // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has
120   //   already invalidated the value. In fact, we currently use all 2^6 section
121   //   ids and there is no "invalid section id", so it doesn't matter what
122   //   SectionId we set for kInvalidValue.
123   static constexpr Value kInvalidValue = 0;
124   // Docs are sorted in reverse, and 0 is never used as the inverted
125   // DocumentId (because it is the inverse of kInvalidValue), so it is always
126   // the max in a descending sort.
127   static constexpr Value kMaxDocumentIdSortValue = 0;
128 
129   enum FlagOffsetsInFlagsField {
130     // Whether or not the hit has a term_frequency other than
131     // kDefaultTermFrequency.
132     kHasTermFrequency = 0,
133     // Whether or not the hit is a stemmed hit.
134     kIsStemmedHit = 1,
135     kNumFlagsInFlagsField = 2,
136   };
137 
138   enum FlagOffsetsInValueField {
139     // Whether or not the hit has a flags value other than kNoEnabledFlags (i.e.
140     // it has flags enabled in the flags field)
141     kHasFlags = 0,
142     // This hit, whether exact or not, came from a prefixed section and will
143     // need to be backfilled into branching posting lists if/when those are
144     // created.
145     kInPrefixSection = 1,
146     // This hit represents a prefix of a longer term. If exact matches are
147     // required, then this hit should be ignored.
148     kPrefixHit = 2,
149     kNumFlagsInValueField = 3,
150   };
151   static_assert(kDocumentIdBits + kSectionIdBits + kNumFlagsInValueField <=
152                     sizeof(Hit::Value) * 8,
153                 "HitOverflow");
154   static_assert(kDocumentIdBits == 22, "");
155   static_assert(kSectionIdBits == 6, "");
156   static_assert(kNumFlagsInValueField == 3, "");
157 
158   // The datatype used to encode additional bit-flags in the Hit.
159   // This is guaranteed to be an unsigned integer, but its size may change if
160   // more flags are introduced in the future and require more bits to encode.
161   using Flags = uint8_t;
162   static constexpr Flags kNoEnabledFlags = 0;
163 
164   // The Term Frequency of a Hit.
165   // This is guaranteed to be an unsigned integer, but its size may change if we
166   // need to expand the max term-frequency.
167   using TermFrequency = uint8_t;
168   using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>;
169   // Max TermFrequency is 255.
170   static constexpr TermFrequency kMaxTermFrequency =
171       std::numeric_limits<TermFrequency>::max();
172   static constexpr TermFrequency kDefaultTermFrequency = 1;
173   static constexpr TermFrequency kNoTermFrequency = 0;
174 
Hit(Value value)175   explicit Hit(Value value)
176       : Hit(value, kNoEnabledFlags, kDefaultTermFrequency) {}
177   explicit Hit(Value value, Flags flags, TermFrequency term_frequency);
178   explicit Hit(SectionId section_id, DocumentId document_id,
179                TermFrequency term_frequency, bool is_in_prefix_section,
180                bool is_prefix_hit, bool is_stemmed_hit);
181 
is_valid()182   bool is_valid() const { return value() != kInvalidValue; }
183 
value()184   Value value() const {
185     Value value;
186     memcpy(&value, value_.data(), sizeof(value));
187     return value;
188   }
189 
190   DocumentId document_id() const;
191   SectionId section_id() const;
192   bool is_prefix_hit() const;
193   bool is_in_prefix_section() const;
194   // Whether or not the hit has any flags set to true.
195   bool has_flags() const;
196 
flags()197   Flags flags() const { return flags_; }
198   // Whether or not the hit is a stemmed hit.
199   bool is_stemmed_hit() const;
200   // Whether or not the hit contains a valid term frequency.
201   bool has_term_frequency() const;
202 
term_frequency()203   TermFrequency term_frequency() const { return term_frequency_; }
204 
205   // Returns true if the flags values across the Hit's value_, term_frequency_
206   // and flags_ fields are consistent.
207   bool CheckFlagsAreConsistent() const;
208 
209   // Creates a new hit based on old_hit but with new_document_id set.
210   static Hit TranslateHit(Hit old_hit, DocumentId new_document_id);
211 
212   bool operator<(const Hit& h2) const {
213     if (value() != h2.value()) {
214       return value() < h2.value();
215     }
216     return flags() < h2.flags();
217   }
218 
219   struct EqualsDocumentIdAndSectionId {
operatorEqualsDocumentIdAndSectionId220     bool operator()(const Hit& hit1, const Hit& hit2) const {
221       return (hit1.value() >> kNumFlagsInValueField) ==
222              (hit2.value() >> kNumFlagsInValueField);
223     }
224   };
225 
226   struct EqualsValueAndFlags {
operatorEqualsValueAndFlags227     bool operator()(const Hit& hit1, const Hit& hit2) const {
228       return hit1.value() == hit2.value() && hit1.flags() == hit2.flags();
229     }
230   };
231 
232  private:
233   // Value, Flags and TermFrequency must be in this order.
234   // Value bits layout: 1 unused + 22 document_id + 6 section_id + 1
235   // is_prefix_hit + 1 is_in_prefix_section + 1 has_flags.
236   std::array<char, sizeof(Value)> value_;
237   // Flags bits layout: 1 reserved + 5 unused + 1 is_stemmed_hit + 1
238   // has_term_frequency.
239   // The left-most bit is reserved for chaining additional fields in case of
240   // future hit expansions.
241   Flags flags_;
242   TermFrequency term_frequency_;
243 };
244 static_assert(sizeof(Hit) == 6, "");
245 // TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions.
246 static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan");
247 
248 }  // namespace lib
249 }  // namespace icing
250 
251 #endif  // ICING_INDEX_HIT_HIT_H_
252