1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/schema/section-manager.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/legacy/core/icing-string-util.h"
28 #include "icing/proto/document.pb.h"
29 #include "icing/proto/schema.pb.h"
30 #include "icing/proto/term.pb.h"
31 #include "icing/schema/property-util.h"
32 #include "icing/schema/section.h"
33 #include "icing/store/document-filter-data.h"
34 #include "icing/store/key-mapper.h"
35 #include "icing/util/status-macros.h"
36
37 namespace icing {
38 namespace lib {
39
40 namespace {
41
42 // Helper function to append a new section metadata
AppendNewSectionMetadata(std::vector<SectionMetadata> * metadata_list,std::string && concatenated_path,const PropertyConfigProto & property_config)43 libtextclassifier3::Status AppendNewSectionMetadata(
44 std::vector<SectionMetadata>* metadata_list,
45 std::string&& concatenated_path,
46 const PropertyConfigProto& property_config) {
47 // Validates next section id, makes sure that section id is the same as the
48 // list index so that we could find any section metadata by id in O(1) later.
49 SectionId new_section_id = static_cast<SectionId>(metadata_list->size());
50 if (!IsSectionIdValid(new_section_id)) {
51 // Max number of sections reached
52 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
53 "Too many properties to be indexed, max number of properties "
54 "allowed: %d",
55 kMaxSectionId - kMinSectionId + 1));
56 }
57
58 // Creates section metadata
59 metadata_list->push_back(SectionMetadata(
60 new_section_id, property_config.data_type(),
61 property_config.string_indexing_config().tokenizer_type(),
62 property_config.string_indexing_config().term_match_type(),
63 property_config.integer_indexing_config().numeric_match_type(),
64 property_config.embedding_indexing_config().embedding_indexing_type(),
65 property_config.embedding_indexing_config().quantization_type(),
66 std::move(concatenated_path)));
67 return libtextclassifier3::Status::OK;
68 }
69
70 template <typename T>
AppendSection(SectionMetadata section_metadata,libtextclassifier3::StatusOr<std::vector<T>> && section_content_or,std::vector<Section<T>> & sections_out)71 void AppendSection(
72 SectionMetadata section_metadata,
73 libtextclassifier3::StatusOr<std::vector<T>>&& section_content_or,
74 std::vector<Section<T>>& sections_out) {
75 if (!section_content_or.ok()) {
76 return;
77 }
78
79 std::vector<T> section_content = std::move(section_content_or).ValueOrDie();
80 if (!section_content.empty()) {
81 // Adds to result vector if section is found in document
82 sections_out.emplace_back(std::move(section_metadata),
83 std::move(section_content));
84 }
85 }
86
87 } // namespace
88
89 libtextclassifier3::Status
ProcessSchemaTypePropertyConfig(SchemaTypeId schema_type_id,const PropertyConfigProto & property_config,std::string && property_path)90 SectionManager::Builder::ProcessSchemaTypePropertyConfig(
91 SchemaTypeId schema_type_id, const PropertyConfigProto& property_config,
92 std::string&& property_path) {
93 if (schema_type_id < 0 || schema_type_id >= section_metadata_cache_.size()) {
94 return absl_ports::InvalidArgumentError("Invalid schema type id");
95 }
96
97 // We don't need to check if the property is indexable. This method will
98 // only be called properties that should consume sectionIds, even if the
99 // property's indexing configuration itself is not indexable.
100 // This would be the case for unknown and non-indexable property paths that
101 // are defined in the indexable_nested_properties_list.
102 ICING_RETURN_IF_ERROR(
103 AppendNewSectionMetadata(§ion_metadata_cache_[schema_type_id],
104 std::move(property_path), property_config));
105 return libtextclassifier3::Status::OK;
106 }
107
108 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const109 SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
110 SectionId section_id) const {
111 if (schema_type_id < 0 || schema_type_id >= section_metadata_cache_.size()) {
112 return absl_ports::InvalidArgumentError("Invalid schema type id");
113 }
114 if (!IsSectionIdValid(section_id)) {
115 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
116 "Section id %d is greater than the max value %d", section_id,
117 kMaxSectionId));
118 }
119
120 const std::vector<SectionMetadata>& section_metadatas =
121 section_metadata_cache_[schema_type_id];
122 if (section_id >= section_metadatas.size()) {
123 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
124 "Section with id %d doesn't exist in type config with id %d",
125 section_id, schema_type_id));
126 }
127
128 // The index of metadata list is the same as the section id, so we can use
129 // section id as the index.
130 return §ion_metadatas[section_id];
131 }
132
ExtractSections(const DocumentProto & document) const133 libtextclassifier3::StatusOr<SectionGroup> SectionManager::ExtractSections(
134 const DocumentProto& document) const {
135 ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
136 GetMetadataList(document.schema()));
137 SectionGroup section_group;
138 for (const SectionMetadata& section_metadata : *metadata_list) {
139 switch (section_metadata.data_type) {
140 case PropertyConfigProto::DataType::STRING: {
141 if (section_metadata.term_match_type == TermMatchType::UNKNOWN ||
142 section_metadata.tokenizer ==
143 StringIndexingConfig::TokenizerType::NONE) {
144 // Skip if term-match type is UNKNOWN, or if the tokenizer-type is
145 // NONE.
146 break;
147 }
148 AppendSection(
149 section_metadata,
150 property_util::ExtractPropertyValuesFromDocument<std::string_view>(
151 document, section_metadata.path),
152 section_group.string_sections);
153 break;
154 }
155 case PropertyConfigProto::DataType::INT64: {
156 if (section_metadata.numeric_match_type ==
157 IntegerIndexingConfig::NumericMatchType::UNKNOWN) {
158 // Skip if numeric-match type is UNKNOWN.
159 break;
160 }
161 AppendSection(section_metadata,
162 property_util::ExtractPropertyValuesFromDocument<int64_t>(
163 document, section_metadata.path),
164 section_group.integer_sections);
165 break;
166 }
167 case PropertyConfigProto::DataType::VECTOR: {
168 if (section_metadata.embedding_indexing_type ==
169 EmbeddingIndexingConfig::EmbeddingIndexingType::UNKNOWN) {
170 // Skip if embedding indexing type is UNKNOWN.
171 break;
172 }
173 AppendSection(
174 section_metadata,
175 property_util::ExtractPropertyValuesFromDocument<
176 PropertyProto::VectorProto>(document, section_metadata.path),
177 section_group.vector_sections);
178 break;
179 }
180 default: {
181 // Skip other data types.
182 break;
183 }
184 }
185 }
186 return section_group;
187 }
188
189 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetMetadataList(const std::string & type_config_name) const190 SectionManager::GetMetadataList(const std::string& type_config_name) const {
191 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
192 schema_type_mapper_.Get(type_config_name));
193 return §ion_metadata_cache_.at(schema_type_id);
194 }
195
196 } // namespace lib
197 } // namespace icing
198