1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/join/qualified-id-join-indexing-handler.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <optional>
21 #include <string_view>
22 #include <utility>
23 #include <vector>
24
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/join/document-join-id-pair.h"
29 #include "icing/join/qualified-id-join-index.h"
30 #include "icing/join/qualified-id.h"
31 #include "icing/legacy/core/icing-string-util.h"
32 #include "icing/proto/logging.pb.h"
33 #include "icing/schema/joinable-property.h"
34 #include "icing/store/document-filter-data.h"
35 #include "icing/store/document-id.h"
36 #include "icing/store/document-store.h"
37 #include "icing/store/namespace-id-fingerprint.h"
38 #include "icing/store/namespace-id.h"
39 #include "icing/util/clock.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/status-macros.h"
42 #include "icing/util/tokenized-document.h"
43
44 namespace icing {
45 namespace lib {
46
47 /* static */ libtextclassifier3::StatusOr<
48 std::unique_ptr<QualifiedIdJoinIndexingHandler>>
Create(const Clock * clock,const DocumentStore * doc_store,QualifiedIdJoinIndex * qualified_id_join_index)49 QualifiedIdJoinIndexingHandler::Create(
50 const Clock* clock, const DocumentStore* doc_store,
51 QualifiedIdJoinIndex* qualified_id_join_index) {
52 ICING_RETURN_ERROR_IF_NULL(clock);
53 ICING_RETURN_ERROR_IF_NULL(doc_store);
54 ICING_RETURN_ERROR_IF_NULL(qualified_id_join_index);
55
56 return std::unique_ptr<QualifiedIdJoinIndexingHandler>(
57 new QualifiedIdJoinIndexingHandler(clock, doc_store,
58 qualified_id_join_index));
59 }
60
Handle(const TokenizedDocument & tokenized_document,DocumentId document_id,DocumentId old_document_id,bool recovery_mode,PutDocumentStatsProto * put_document_stats)61 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::Handle(
62 const TokenizedDocument& tokenized_document, DocumentId document_id,
63 DocumentId old_document_id, bool recovery_mode,
64 PutDocumentStatsProto* put_document_stats) {
65 std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
66
67 if (!IsDocumentIdValid(document_id)) {
68 return absl_ports::InvalidArgumentError(
69 IcingStringUtil::StringPrintf("Invalid DocumentId %d", document_id));
70 }
71
72 if (qualified_id_join_index_.last_added_document_id() != kInvalidDocumentId &&
73 document_id <= qualified_id_join_index_.last_added_document_id()) {
74 if (recovery_mode) {
75 // Skip the document if document_id <= last_added_document_id in recovery
76 // mode without returning an error.
77 return libtextclassifier3::Status::OK;
78 }
79 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
80 "DocumentId %d must be greater than last added document_id %d",
81 document_id, qualified_id_join_index_.last_added_document_id()));
82 }
83 qualified_id_join_index_.set_last_added_document_id(document_id);
84
85 switch (qualified_id_join_index_.version()) {
86 case QualifiedIdJoinIndex::Version::kV1:
87 ICING_RETURN_IF_ERROR(HandleV1(tokenized_document, document_id));
88 break;
89 case QualifiedIdJoinIndex::Version::kV2:
90 ICING_RETURN_IF_ERROR(HandleV2(tokenized_document, document_id));
91 break;
92 case QualifiedIdJoinIndex::Version::kV3:
93 ICING_RETURN_IF_ERROR(
94 HandleV3(tokenized_document, document_id, old_document_id));
95 break;
96 }
97
98 if (put_document_stats != nullptr) {
99 put_document_stats->set_qualified_id_join_index_latency_ms(
100 index_timer->GetElapsedMilliseconds());
101 }
102
103 return libtextclassifier3::Status::OK;
104 }
105
HandleV1(const TokenizedDocument & tokenized_document,DocumentId document_id)106 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::HandleV1(
107 const TokenizedDocument& tokenized_document, DocumentId document_id) {
108 for (const JoinableProperty<std::string_view>& qualified_id_property :
109 tokenized_document.qualified_id_join_properties()) {
110 if (qualified_id_property.values.empty()) {
111 continue;
112 }
113
114 DocumentJoinIdPair document_join_id_pair(document_id,
115 qualified_id_property.metadata.id);
116 // Currently we only support single (non-repeated) joinable value under a
117 // property.
118 std::string_view ref_qualified_id_str = qualified_id_property.values[0];
119
120 // Attempt to parse qualified id string to make sure the format is
121 // correct.
122 if (!QualifiedId::Parse(ref_qualified_id_str).ok()) {
123 // Skip incorrect format of qualified id string to save disk space.
124 continue;
125 }
126
127 libtextclassifier3::Status status = qualified_id_join_index_.Put(
128 document_join_id_pair, ref_qualified_id_str);
129 if (!status.ok()) {
130 ICING_LOG(WARNING)
131 << "Failed to add data into qualified id join index due to: "
132 << status.error_message();
133 return status;
134 }
135 }
136 return libtextclassifier3::Status::OK;
137 }
138
HandleV2(const TokenizedDocument & tokenized_document,DocumentId document_id)139 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::HandleV2(
140 const TokenizedDocument& tokenized_document, DocumentId document_id) {
141 std::optional<DocumentFilterData> filter_data =
142 doc_store_.GetAliveDocumentFilterData(
143 document_id,
144 /*current_time_ms=*/std::numeric_limits<int64_t>::min());
145 if (!filter_data) {
146 // This should not happen.
147 return absl_ports::InternalError(
148 "Failed to get alive document filter data when indexing");
149 }
150
151 for (const JoinableProperty<std::string_view>& qualified_id_property :
152 tokenized_document.qualified_id_join_properties()) {
153 // Parse all qualified id strings and convert them to
154 // NamespaceIdFingerprint.
155 std::vector<NamespaceIdFingerprint> ref_doc_nsid_uri_fingerprints;
156 for (std::string_view ref_qualified_id_str : qualified_id_property.values) {
157 // Attempt to parse qualified id string to make sure the format is
158 // correct.
159 auto ref_qualified_id_or = QualifiedId::Parse(ref_qualified_id_str);
160 if (!ref_qualified_id_or.ok()) {
161 // Skip incorrect format of qualified id string.
162 continue;
163 }
164
165 QualifiedId ref_qualified_id =
166 std::move(ref_qualified_id_or).ValueOrDie();
167 auto ref_namespace_id_or =
168 doc_store_.GetNamespaceId(ref_qualified_id.name_space());
169 if (!ref_namespace_id_or.ok()) {
170 // Skip invalid namespace id.
171 continue;
172 }
173 NamespaceId ref_namespace_id =
174 std::move(ref_namespace_id_or).ValueOrDie();
175
176 ref_doc_nsid_uri_fingerprints.push_back(
177 NamespaceIdFingerprint(ref_namespace_id, ref_qualified_id.uri()));
178 }
179
180 // Batch add all join data of this (schema_type_id, joinable_property_id)
181 // into to the index.
182 libtextclassifier3::Status status = qualified_id_join_index_.Put(
183 filter_data->schema_type_id(), qualified_id_property.metadata.id,
184 document_id, std::move(ref_doc_nsid_uri_fingerprints));
185 if (!status.ok()) {
186 ICING_LOG(WARNING)
187 << "Failed to add data into qualified id join index v2 due to: "
188 << status.error_message();
189 return status;
190 }
191 }
192 return libtextclassifier3::Status::OK;
193 }
194
HandleV3(const TokenizedDocument & tokenized_document,DocumentId document_id,DocumentId old_document_id)195 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::HandleV3(
196 const TokenizedDocument& tokenized_document, DocumentId document_id,
197 DocumentId old_document_id) {
198 // (Parent perspective)
199 // When replacement, if there were any existing child documents joining to it,
200 // then we need to migrate the old document id to the new document id.
201 if (IsDocumentIdValid(old_document_id)) {
202 ICING_RETURN_IF_ERROR(
203 qualified_id_join_index_.MigrateParent(old_document_id, document_id));
204 }
205
206 // (Child perspective)
207 // Add child join data.
208 for (const JoinableProperty<std::string_view>& qualified_id_property :
209 tokenized_document.qualified_id_join_properties()) {
210 if (qualified_id_property.values.empty()) {
211 continue;
212 }
213
214 DocumentJoinIdPair child_doc_join_id_pair(
215 document_id, qualified_id_property.metadata.id);
216
217 // Extract parent qualified ids and lookup their corresponding document ids.
218 std::vector<DocumentId> parent_doc_ids;
219 parent_doc_ids.reserve(qualified_id_property.values.size());
220 for (std::string_view parent_qualified_id_str :
221 qualified_id_property.values) {
222 libtextclassifier3::StatusOr<QualifiedId> parent_qualified_id_or =
223 QualifiedId::Parse(parent_qualified_id_str);
224 if (!parent_qualified_id_or.ok()) {
225 // Skip incorrect format of qualified id string.
226 continue;
227 }
228 QualifiedId parent_qualified_id =
229 std::move(parent_qualified_id_or).ValueOrDie();
230
231 // Lookup document store to get the parent document id.
232 libtextclassifier3::StatusOr<DocumentId> parent_doc_id_or =
233 doc_store_.GetDocumentId(parent_qualified_id.name_space(),
234 parent_qualified_id.uri());
235 if (!parent_doc_id_or.ok() ||
236 parent_doc_id_or.ValueOrDie() == kInvalidDocumentId) {
237 // Skip invalid parent document id or parent document does not exist.
238 continue;
239 }
240 parent_doc_ids.push_back(parent_doc_id_or.ValueOrDie());
241 }
242
243 // Add all parent document ids to the index.
244 libtextclassifier3::Status status = qualified_id_join_index_.Put(
245 child_doc_join_id_pair, std::move(parent_doc_ids));
246 if (!status.ok()) {
247 ICING_LOG(WARNING)
248 << "Failed to add data into qualified id join index due to: "
249 << status.error_message();
250 return status;
251 }
252 }
253 return libtextclassifier3::Status::OK;
254 }
255
256 } // namespace lib
257 } // namespace icing
258