1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_INDEX_HIT_HIT_H_ 16 #define ICING_INDEX_HIT_HIT_H_ 17 18 #include <array> 19 #include <cstdint> 20 #include <cstring> 21 #include <limits> 22 23 #include "icing/legacy/core/icing-packed-pod.h" 24 #include "icing/schema/section.h" 25 #include "icing/store/document-id.h" 26 27 namespace icing { 28 namespace lib { 29 30 // BasicHit is a specific encoding that refers to content within a document. A 31 // basic hit consists of: 32 // - a DocumentId 33 // - a SectionId 34 // referring to the document and section that the hit corresponds to. 35 // 36 // The hit is the most basic unit of the index and, when grouped together by 37 // term, can be used to encode what terms appear in what documents. 38 // 39 // BasicHit is for indices (e.g. numeric index) that don't require term 40 // frequency. 41 class BasicHit { 42 public: 43 // The datatype used to encode BasicHit information: the document_id and 44 // section_id. 45 using Value = uint32_t; 46 47 // WARNING: Changing this value will invalidate any pre-existing posting lists 48 // on user devices. 49 // 50 // kInvalidValue contains: 51 // - 0 for unused bits. Note that unused bits are always 0 for both valid and 52 // invalid BasicHit values. 53 // - Inverted kInvalidDocumentId 54 // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has 55 // already invalidated the value. In fact, we currently use all 2^6 section 56 // ids and there is no "invalid section id", so it doesn't matter what 57 // SectionId we set for kInvalidValue. 58 static constexpr Value kInvalidValue = 0; 59 60 explicit BasicHit(SectionId section_id, DocumentId document_id); 61 BasicHit()62 explicit BasicHit() : value_(kInvalidValue) {} BasicHit(Value value)63 explicit BasicHit(Value value) : value_(value) {} 64 is_valid()65 bool is_valid() const { return value_ != kInvalidValue; } value()66 Value value() const { return value_; } 67 DocumentId document_id() const; 68 SectionId section_id() const; 69 70 bool operator<(const BasicHit& h2) const { return value_ < h2.value_; } 71 bool operator==(const BasicHit& h2) const { return value_ == h2.value_; } 72 73 private: 74 // Value bits layout: 4 unused + 22 document_id + 6 section id. 75 // 76 // The Value is guaranteed to be an unsigned integer, but its size and the 77 // information it stores may change if the Hit's encoding format is changed. 78 Value value_; 79 } __attribute__((packed)); 80 static_assert(sizeof(BasicHit) == 4, ""); 81 82 // Hit is a specific encoding that refers to content within a document. A hit 83 // consists of: 84 // - a DocumentId 85 // - a SectionId 86 // - referring to the document and section that the hit corresponds to 87 // - Metadata about the hit: 88 // - whether the Hit does not appear exactly in the document, but instead 89 // represents a term that is a prefix of a term in the document 90 // (is_prefix_hit) 91 // - whether the Hit came from a section that has prefix expansion enabled 92 // (is_in_prefix_section) 93 // - whether the Hit has set any bitmask flags (has_flags) 94 // - bitmasks in flags fields: 95 // - whether the Hit has a TermFrequency other than the default value 96 // (has_term_frequency) 97 // - whether the Hit is a stemmed hit derived by stemming an original token 98 // in the document (is_stemmed_hit) 99 // - a term frequency for the hit 100 // 101 // The hit is the most basic unit of the index and, when grouped together by 102 // term, can be used to encode what terms appear in what documents. 103 class Hit { 104 public: 105 // The datatype used to encode Hit information: the document_id, section_id, 106 // and 3 flags: is_prefix_hit, is_hit_in_prefix_section and has_flags flag. 107 // 108 // The Value is guaranteed to be an unsigned integer, but its size and the 109 // information it stores may change if the Hit's encoding format is changed. 110 using Value = uint32_t; 111 112 // WARNING: Changing this value will invalidate any pre-existing posting lists 113 // on user devices. 114 // 115 // kInvalidValue contains: 116 // - 0 for unused bits. Note that unused bits are always 0 for both valid and 117 // invalid Hit values. 118 // - Inverted kInvalidDocumentId 119 // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has 120 // already invalidated the value. In fact, we currently use all 2^6 section 121 // ids and there is no "invalid section id", so it doesn't matter what 122 // SectionId we set for kInvalidValue. 123 static constexpr Value kInvalidValue = 0; 124 // Docs are sorted in reverse, and 0 is never used as the inverted 125 // DocumentId (because it is the inverse of kInvalidValue), so it is always 126 // the max in a descending sort. 127 static constexpr Value kMaxDocumentIdSortValue = 0; 128 129 enum FlagOffsetsInFlagsField { 130 // Whether or not the hit has a term_frequency other than 131 // kDefaultTermFrequency. 132 kHasTermFrequency = 0, 133 // Whether or not the hit is a stemmed hit. 134 kIsStemmedHit = 1, 135 kNumFlagsInFlagsField = 2, 136 }; 137 138 enum FlagOffsetsInValueField { 139 // Whether or not the hit has a flags value other than kNoEnabledFlags (i.e. 140 // it has flags enabled in the flags field) 141 kHasFlags = 0, 142 // This hit, whether exact or not, came from a prefixed section and will 143 // need to be backfilled into branching posting lists if/when those are 144 // created. 145 kInPrefixSection = 1, 146 // This hit represents a prefix of a longer term. If exact matches are 147 // required, then this hit should be ignored. 148 kPrefixHit = 2, 149 kNumFlagsInValueField = 3, 150 }; 151 static_assert(kDocumentIdBits + kSectionIdBits + kNumFlagsInValueField <= 152 sizeof(Hit::Value) * 8, 153 "HitOverflow"); 154 static_assert(kDocumentIdBits == 22, ""); 155 static_assert(kSectionIdBits == 6, ""); 156 static_assert(kNumFlagsInValueField == 3, ""); 157 158 // The datatype used to encode additional bit-flags in the Hit. 159 // This is guaranteed to be an unsigned integer, but its size may change if 160 // more flags are introduced in the future and require more bits to encode. 161 using Flags = uint8_t; 162 static constexpr Flags kNoEnabledFlags = 0; 163 164 // The Term Frequency of a Hit. 165 // This is guaranteed to be an unsigned integer, but its size may change if we 166 // need to expand the max term-frequency. 167 using TermFrequency = uint8_t; 168 using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>; 169 // Max TermFrequency is 255. 170 static constexpr TermFrequency kMaxTermFrequency = 171 std::numeric_limits<TermFrequency>::max(); 172 static constexpr TermFrequency kDefaultTermFrequency = 1; 173 static constexpr TermFrequency kNoTermFrequency = 0; 174 Hit(Value value)175 explicit Hit(Value value) 176 : Hit(value, kNoEnabledFlags, kDefaultTermFrequency) {} 177 explicit Hit(Value value, Flags flags, TermFrequency term_frequency); 178 explicit Hit(SectionId section_id, DocumentId document_id, 179 TermFrequency term_frequency, bool is_in_prefix_section, 180 bool is_prefix_hit, bool is_stemmed_hit); 181 is_valid()182 bool is_valid() const { return value() != kInvalidValue; } 183 value()184 Value value() const { 185 Value value; 186 memcpy(&value, value_.data(), sizeof(value)); 187 return value; 188 } 189 190 DocumentId document_id() const; 191 SectionId section_id() const; 192 bool is_prefix_hit() const; 193 bool is_in_prefix_section() const; 194 // Whether or not the hit has any flags set to true. 195 bool has_flags() const; 196 flags()197 Flags flags() const { return flags_; } 198 // Whether or not the hit is a stemmed hit. 199 bool is_stemmed_hit() const; 200 // Whether or not the hit contains a valid term frequency. 201 bool has_term_frequency() const; 202 term_frequency()203 TermFrequency term_frequency() const { return term_frequency_; } 204 205 // Returns true if the flags values across the Hit's value_, term_frequency_ 206 // and flags_ fields are consistent. 207 bool CheckFlagsAreConsistent() const; 208 209 // Creates a new hit based on old_hit but with new_document_id set. 210 static Hit TranslateHit(Hit old_hit, DocumentId new_document_id); 211 212 bool operator<(const Hit& h2) const { 213 if (value() != h2.value()) { 214 return value() < h2.value(); 215 } 216 return flags() < h2.flags(); 217 } 218 219 struct EqualsDocumentIdAndSectionId { operatorEqualsDocumentIdAndSectionId220 bool operator()(const Hit& hit1, const Hit& hit2) const { 221 return (hit1.value() >> kNumFlagsInValueField) == 222 (hit2.value() >> kNumFlagsInValueField); 223 } 224 }; 225 226 struct EqualsValueAndFlags { operatorEqualsValueAndFlags227 bool operator()(const Hit& hit1, const Hit& hit2) const { 228 return hit1.value() == hit2.value() && hit1.flags() == hit2.flags(); 229 } 230 }; 231 232 private: 233 // Value, Flags and TermFrequency must be in this order. 234 // Value bits layout: 1 unused + 22 document_id + 6 section_id + 1 235 // is_prefix_hit + 1 is_in_prefix_section + 1 has_flags. 236 std::array<char, sizeof(Value)> value_; 237 // Flags bits layout: 1 reserved + 5 unused + 1 is_stemmed_hit + 1 238 // has_term_frequency. 239 // The left-most bit is reserved for chaining additional fields in case of 240 // future hit expansions. 241 Flags flags_; 242 TermFrequency term_frequency_; 243 }; 244 static_assert(sizeof(Hit) == 6, ""); 245 // TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions. 246 static_assert(icing_is_packed_pod<Hit>::value, "go/icing-ubsan"); 247 248 } // namespace lib 249 } // namespace icing 250 251 #endif // ICING_INDEX_HIT_HIT_H_ 252