xref: /aosp_15_r20/external/icing/icing/join/qualified-id-join-indexing-handler.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/join/qualified-id-join-indexing-handler.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <optional>
21 #include <string_view>
22 #include <utility>
23 #include <vector>
24 
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/join/document-join-id-pair.h"
29 #include "icing/join/qualified-id-join-index.h"
30 #include "icing/join/qualified-id.h"
31 #include "icing/legacy/core/icing-string-util.h"
32 #include "icing/proto/logging.pb.h"
33 #include "icing/schema/joinable-property.h"
34 #include "icing/store/document-filter-data.h"
35 #include "icing/store/document-id.h"
36 #include "icing/store/document-store.h"
37 #include "icing/store/namespace-id-fingerprint.h"
38 #include "icing/store/namespace-id.h"
39 #include "icing/util/clock.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/status-macros.h"
42 #include "icing/util/tokenized-document.h"
43 
44 namespace icing {
45 namespace lib {
46 
47 /* static */ libtextclassifier3::StatusOr<
48     std::unique_ptr<QualifiedIdJoinIndexingHandler>>
Create(const Clock * clock,const DocumentStore * doc_store,QualifiedIdJoinIndex * qualified_id_join_index)49 QualifiedIdJoinIndexingHandler::Create(
50     const Clock* clock, const DocumentStore* doc_store,
51     QualifiedIdJoinIndex* qualified_id_join_index) {
52   ICING_RETURN_ERROR_IF_NULL(clock);
53   ICING_RETURN_ERROR_IF_NULL(doc_store);
54   ICING_RETURN_ERROR_IF_NULL(qualified_id_join_index);
55 
56   return std::unique_ptr<QualifiedIdJoinIndexingHandler>(
57       new QualifiedIdJoinIndexingHandler(clock, doc_store,
58                                          qualified_id_join_index));
59 }
60 
Handle(const TokenizedDocument & tokenized_document,DocumentId document_id,DocumentId old_document_id,bool recovery_mode,PutDocumentStatsProto * put_document_stats)61 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::Handle(
62     const TokenizedDocument& tokenized_document, DocumentId document_id,
63     DocumentId old_document_id, bool recovery_mode,
64     PutDocumentStatsProto* put_document_stats) {
65   std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
66 
67   if (!IsDocumentIdValid(document_id)) {
68     return absl_ports::InvalidArgumentError(
69         IcingStringUtil::StringPrintf("Invalid DocumentId %d", document_id));
70   }
71 
72   if (qualified_id_join_index_.last_added_document_id() != kInvalidDocumentId &&
73       document_id <= qualified_id_join_index_.last_added_document_id()) {
74     if (recovery_mode) {
75       // Skip the document if document_id <= last_added_document_id in recovery
76       // mode without returning an error.
77       return libtextclassifier3::Status::OK;
78     }
79     return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
80         "DocumentId %d must be greater than last added document_id %d",
81         document_id, qualified_id_join_index_.last_added_document_id()));
82   }
83   qualified_id_join_index_.set_last_added_document_id(document_id);
84 
85   switch (qualified_id_join_index_.version()) {
86     case QualifiedIdJoinIndex::Version::kV1:
87       ICING_RETURN_IF_ERROR(HandleV1(tokenized_document, document_id));
88       break;
89     case QualifiedIdJoinIndex::Version::kV2:
90       ICING_RETURN_IF_ERROR(HandleV2(tokenized_document, document_id));
91       break;
92     case QualifiedIdJoinIndex::Version::kV3:
93       ICING_RETURN_IF_ERROR(
94           HandleV3(tokenized_document, document_id, old_document_id));
95       break;
96   }
97 
98   if (put_document_stats != nullptr) {
99     put_document_stats->set_qualified_id_join_index_latency_ms(
100         index_timer->GetElapsedMilliseconds());
101   }
102 
103   return libtextclassifier3::Status::OK;
104 }
105 
HandleV1(const TokenizedDocument & tokenized_document,DocumentId document_id)106 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::HandleV1(
107     const TokenizedDocument& tokenized_document, DocumentId document_id) {
108   for (const JoinableProperty<std::string_view>& qualified_id_property :
109        tokenized_document.qualified_id_join_properties()) {
110     if (qualified_id_property.values.empty()) {
111       continue;
112     }
113 
114     DocumentJoinIdPair document_join_id_pair(document_id,
115                                              qualified_id_property.metadata.id);
116     // Currently we only support single (non-repeated) joinable value under a
117     // property.
118     std::string_view ref_qualified_id_str = qualified_id_property.values[0];
119 
120     // Attempt to parse qualified id string to make sure the format is
121     // correct.
122     if (!QualifiedId::Parse(ref_qualified_id_str).ok()) {
123       // Skip incorrect format of qualified id string to save disk space.
124       continue;
125     }
126 
127     libtextclassifier3::Status status = qualified_id_join_index_.Put(
128         document_join_id_pair, ref_qualified_id_str);
129     if (!status.ok()) {
130       ICING_LOG(WARNING)
131           << "Failed to add data into qualified id join index due to: "
132           << status.error_message();
133       return status;
134     }
135   }
136   return libtextclassifier3::Status::OK;
137 }
138 
HandleV2(const TokenizedDocument & tokenized_document,DocumentId document_id)139 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::HandleV2(
140     const TokenizedDocument& tokenized_document, DocumentId document_id) {
141   std::optional<DocumentFilterData> filter_data =
142       doc_store_.GetAliveDocumentFilterData(
143           document_id,
144           /*current_time_ms=*/std::numeric_limits<int64_t>::min());
145   if (!filter_data) {
146     // This should not happen.
147     return absl_ports::InternalError(
148         "Failed to get alive document filter data when indexing");
149   }
150 
151   for (const JoinableProperty<std::string_view>& qualified_id_property :
152        tokenized_document.qualified_id_join_properties()) {
153     // Parse all qualified id strings and convert them to
154     // NamespaceIdFingerprint.
155     std::vector<NamespaceIdFingerprint> ref_doc_nsid_uri_fingerprints;
156     for (std::string_view ref_qualified_id_str : qualified_id_property.values) {
157       // Attempt to parse qualified id string to make sure the format is
158       // correct.
159       auto ref_qualified_id_or = QualifiedId::Parse(ref_qualified_id_str);
160       if (!ref_qualified_id_or.ok()) {
161         // Skip incorrect format of qualified id string.
162         continue;
163       }
164 
165       QualifiedId ref_qualified_id =
166           std::move(ref_qualified_id_or).ValueOrDie();
167       auto ref_namespace_id_or =
168           doc_store_.GetNamespaceId(ref_qualified_id.name_space());
169       if (!ref_namespace_id_or.ok()) {
170         // Skip invalid namespace id.
171         continue;
172       }
173       NamespaceId ref_namespace_id =
174           std::move(ref_namespace_id_or).ValueOrDie();
175 
176       ref_doc_nsid_uri_fingerprints.push_back(
177           NamespaceIdFingerprint(ref_namespace_id, ref_qualified_id.uri()));
178     }
179 
180     // Batch add all join data of this (schema_type_id, joinable_property_id)
181     // into to the index.
182     libtextclassifier3::Status status = qualified_id_join_index_.Put(
183         filter_data->schema_type_id(), qualified_id_property.metadata.id,
184         document_id, std::move(ref_doc_nsid_uri_fingerprints));
185     if (!status.ok()) {
186       ICING_LOG(WARNING)
187           << "Failed to add data into qualified id join index v2 due to: "
188           << status.error_message();
189       return status;
190     }
191   }
192   return libtextclassifier3::Status::OK;
193 }
194 
HandleV3(const TokenizedDocument & tokenized_document,DocumentId document_id,DocumentId old_document_id)195 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::HandleV3(
196     const TokenizedDocument& tokenized_document, DocumentId document_id,
197     DocumentId old_document_id) {
198   // (Parent perspective)
199   // When replacement, if there were any existing child documents joining to it,
200   // then we need to migrate the old document id to the new document id.
201   if (IsDocumentIdValid(old_document_id)) {
202     ICING_RETURN_IF_ERROR(
203         qualified_id_join_index_.MigrateParent(old_document_id, document_id));
204   }
205 
206   // (Child perspective)
207   // Add child join data.
208   for (const JoinableProperty<std::string_view>& qualified_id_property :
209        tokenized_document.qualified_id_join_properties()) {
210     if (qualified_id_property.values.empty()) {
211       continue;
212     }
213 
214     DocumentJoinIdPair child_doc_join_id_pair(
215         document_id, qualified_id_property.metadata.id);
216 
217     // Extract parent qualified ids and lookup their corresponding document ids.
218     std::vector<DocumentId> parent_doc_ids;
219     parent_doc_ids.reserve(qualified_id_property.values.size());
220     for (std::string_view parent_qualified_id_str :
221          qualified_id_property.values) {
222       libtextclassifier3::StatusOr<QualifiedId> parent_qualified_id_or =
223           QualifiedId::Parse(parent_qualified_id_str);
224       if (!parent_qualified_id_or.ok()) {
225         // Skip incorrect format of qualified id string.
226         continue;
227       }
228       QualifiedId parent_qualified_id =
229           std::move(parent_qualified_id_or).ValueOrDie();
230 
231       // Lookup document store to get the parent document id.
232       libtextclassifier3::StatusOr<DocumentId> parent_doc_id_or =
233           doc_store_.GetDocumentId(parent_qualified_id.name_space(),
234                                    parent_qualified_id.uri());
235       if (!parent_doc_id_or.ok() ||
236           parent_doc_id_or.ValueOrDie() == kInvalidDocumentId) {
237         // Skip invalid parent document id or parent document does not exist.
238         continue;
239       }
240       parent_doc_ids.push_back(parent_doc_id_or.ValueOrDie());
241     }
242 
243     // Add all parent document ids to the index.
244     libtextclassifier3::Status status = qualified_id_join_index_.Put(
245         child_doc_join_id_pair, std::move(parent_doc_ids));
246     if (!status.ok()) {
247       ICING_LOG(WARNING)
248           << "Failed to add data into qualified id join index due to: "
249           << status.error_message();
250       return status;
251     }
252   }
253   return libtextclassifier3::Status::OK;
254 }
255 
256 }  // namespace lib
257 }  // namespace icing
258