xref: /aosp_15_r20/external/icing/icing/schema/section.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_SCHEMA_SECTION_H_
16 #define ICING_SCHEMA_SECTION_H_
17 
18 #include <cstdint>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23 
24 #include "icing/proto/document.pb.h"
25 #include "icing/proto/schema.pb.h"
26 #include "icing/proto/term.pb.h"
27 
28 namespace icing {
29 namespace lib {
30 
// Identifier of a section. Only the low kSectionIdBits bits are used, which
// yields 64 distinct values.
using SectionId = int8_t;
inline constexpr int kSectionIdBits = 6;

// Number of distinct section ids (1 << kSectionIdBits).
inline constexpr SectionId kTotalNumSections = (1 << kSectionIdBits);
// Prior versions of Icing only supported 16 indexed properties.
inline constexpr SectionId kOldTotalNumSections = 16;

// Inclusive bounds of the valid section id range.
inline constexpr SectionId kMinSectionId = 0;
inline constexpr SectionId kMaxSectionId = kTotalNumSections - 1;
// Sentinel that sits one past the largest valid section id.
inline constexpr SectionId kInvalidSectionId = kTotalNumSections;

// Returns true iff section_id lies within [kMinSectionId, kMaxSectionId].
constexpr bool IsSectionIdValid(SectionId section_id) {
  return !(section_id < kMinSectionId || section_id > kMaxSectionId);
}
43 
44 using SectionIdMask = int64_t;
45 inline constexpr SectionIdMask kSectionIdMaskAll = ~SectionIdMask{0};
46 inline constexpr SectionIdMask kSectionIdMaskNone = SectionIdMask{0};
47 
48 static_assert(kSectionIdBits < 8 * sizeof(SectionId),
49               "Cannot exhaust all bits of SectionId since it is a signed "
50               "integer and the most significant bit should be preserved.");
51 
52 static_assert(
53     kMaxSectionId < 8 * sizeof(SectionIdMask),
54     "SectionIdMask is not large enough to represent all section values!");
55 
56 struct SectionMetadata {
57   // Dot-joined property names, representing the location of section inside an
58   // document. E.g. "property1.property2"
59   std::string path;
60 
61   // A unique id of property within a type config
62   SectionId id;
63 
64   // Indexable data type of this section. E.g. STRING, INT64.
65   PropertyConfigProto::DataType::Code data_type;
66 
67   // How strings should be tokenized. It is invalid for a string section
68   // (data_type == 'STRING') to have tokenizer == 'NONE'.
69   StringIndexingConfig::TokenizerType::Code tokenizer;
70 
71   // How tokens in a string section should be matched.
72   //
73   // TermMatchType::UNKNOWN:
74   //   Terms will not match anything
75   //
76   // TermMatchType::PREFIX:
77   //   Terms will be stored as a prefix match, "fool" matches "foo" and "fool"
78   //
79   // TermMatchType::EXACT_ONLY:
80   //   Terms will be only stored as an exact match, "fool" only matches "fool"
81   TermMatchType::Code term_match_type = TermMatchType::UNKNOWN;
82 
83   // How tokens in a numeric section should be matched.
84   //
85   // NumericMatchType::UNKNOWN:
86   //   Contents will not match anything. It is invalid for a numeric section
87   //   (data_type == 'INT64') to have numeric_match_type == 'UNKNOWN'.
88   //
89   // NumericMatchType::RANGE:
90   //   Contents will be matched by a range query.
91   IntegerIndexingConfig::NumericMatchType::Code numeric_match_type;
92 
93   // How vectors in a vector section should be indexed.
94   //
95   // EmbeddingIndexingType::UNKNOWN:
96   //   Contents will not be indexed. It is invalid for a vector section
97   //   (data_type == 'VECTOR') to have embedding_indexing_type == 'UNKNOWN'.
98   //
99   // EmbeddingIndexingType::LINEAR_SEARCH:
100   //   Contents will be indexed for linear search.
101   EmbeddingIndexingConfig::EmbeddingIndexingType::Code embedding_indexing_type;
102 
103   // How vectors in a vector section should be quantized.
104   //
105   // QuantizationType::NONE:
106   //   Contents will not be quantized.
107   //
108   // QuantizationType::QUANTIZE_8_BIT:
109   //   Contents will be quantized to 8 bits.
110   EmbeddingIndexingConfig::QuantizationType::Code quantization_type;
111 
SectionMetadataSectionMetadata112   explicit SectionMetadata(
113       SectionId id_in, PropertyConfigProto::DataType::Code data_type_in,
114       StringIndexingConfig::TokenizerType::Code tokenizer,
115       TermMatchType::Code term_match_type_in,
116       IntegerIndexingConfig::NumericMatchType::Code numeric_match_type_in,
117       EmbeddingIndexingConfig::EmbeddingIndexingType::Code
118           embedding_indexing_type_in,
119       EmbeddingIndexingConfig::QuantizationType::Code quantization_type_in,
120       std::string&& path_in)
121       : path(std::move(path_in)),
122         id(id_in),
123         data_type(data_type_in),
124         tokenizer(tokenizer),
125         term_match_type(term_match_type_in),
126         numeric_match_type(numeric_match_type_in),
127         embedding_indexing_type(embedding_indexing_type_in),
128         quantization_type(quantization_type_in) {}
129 
130   SectionMetadata(const SectionMetadata& other) = default;
131   SectionMetadata& operator=(const SectionMetadata& other) = default;
132 
133   SectionMetadata(SectionMetadata&& other) = default;
134   SectionMetadata& operator=(SectionMetadata&& other) = default;
135 
136   bool operator==(const SectionMetadata& rhs) const {
137     return path == rhs.path && id == rhs.id && data_type == rhs.data_type &&
138            tokenizer == rhs.tokenizer &&
139            term_match_type == rhs.term_match_type &&
140            numeric_match_type == rhs.numeric_match_type &&
141            embedding_indexing_type == rhs.embedding_indexing_type &&
142            quantization_type == rhs.quantization_type;
143   }
144 };
145 
146 // Section is an icing internal concept similar to document property but with
147 // extra metadata. The content can be a value or the combination of repeated
148 // values of a property, and the type of content is specified by template.
149 //
150 // Current supported types:
151 // - std::string_view (PropertyConfigProto::DataType::STRING)
152 // - int64_t (PropertyConfigProto::DataType::INT64)
153 template <typename T>
154 struct Section {
155   SectionMetadata metadata;
156   std::vector<T> content;
157 
SectionSection158   explicit Section(SectionMetadata&& metadata_in, std::vector<T>&& content_in)
159       : metadata(std::move(metadata_in)), content(std::move(content_in)) {}
160 
data_typeSection161   PropertyConfigProto::DataType::Code data_type() const {
162     return metadata.data_type;
163   }
164 };
165 
166 // Groups of different type sections. Callers can access sections with types
167 // they want and avoid going through non-desired ones.
168 //
169 // REQUIRES: lifecycle of the property must be longer than this object, since we
170 //   use std::string_view for extracting its string_values.
171 struct SectionGroup {
172   std::vector<Section<std::string_view>> string_sections;
173   std::vector<Section<int64_t>> integer_sections;
174   std::vector<Section<PropertyProto::VectorProto>> vector_sections;
175 };
176 
177 }  // namespace lib
178 }  // namespace icing
179 
180 #endif  // ICING_SCHEMA_SECTION_H_
181