xref: /aosp_15_r20/external/icing/icing/join/posting-list-join-data-accessor_test.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/join/posting-list-join-data-accessor.h"
16 
17 #include <cstdint>
18 #include <memory>
19 #include <string>
20 #include <utility>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/status.h"
24 #include "gmock/gmock.h"
25 #include "gtest/gtest.h"
26 #include "icing/file/filesystem.h"
27 #include "icing/file/posting_list/flash-index-storage.h"
28 #include "icing/file/posting_list/posting-list-accessor.h"
29 #include "icing/file/posting_list/posting-list-common.h"
30 #include "icing/file/posting_list/posting-list-identifier.h"
31 #include "icing/join/document-id-to-join-info.h"
32 #include "icing/join/posting-list-join-data-serializer.h"
33 #include "icing/store/document-id.h"
34 #include "icing/store/namespace-id-fingerprint.h"
35 #include "icing/store/namespace-id.h"
36 #include "icing/testing/common-matchers.h"
37 #include "icing/testing/tmp-directory.h"
38 
39 namespace icing {
40 namespace lib {
41 
42 namespace {
43 
44 using ::testing::ElementsAre;
45 using ::testing::ElementsAreArray;
46 using ::testing::Eq;
47 using ::testing::Lt;
48 using ::testing::Ne;
49 using ::testing::SizeIs;
50 
51 using JoinDataType = DocumentIdToJoinInfo<NamespaceIdFingerprint>;
52 
53 static constexpr NamespaceId kDefaultNamespaceId = 1;
54 
55 class PostingListJoinDataAccessorTest : public ::testing::Test {
56  protected:
SetUp()57   void SetUp() override {
58     test_dir_ = GetTestTempDir() + "/test_dir";
59     file_name_ = test_dir_ + "/test_file.idx.index";
60 
61     ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
62     ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
63 
64     serializer_ =
65         std::make_unique<PostingListJoinDataSerializer<JoinDataType>>();
66 
67     ICING_ASSERT_OK_AND_ASSIGN(
68         FlashIndexStorage flash_index_storage,
69         FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
70     flash_index_storage_ =
71         std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
72   }
73 
TearDown()74   void TearDown() override {
75     flash_index_storage_.reset();
76     serializer_.reset();
77     ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
78   }
79 
80   Filesystem filesystem_;
81   std::string test_dir_;
82   std::string file_name_;
83   std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> serializer_;
84   std::unique_ptr<FlashIndexStorage> flash_index_storage_;
85 };
86 
CreateData(int num_data,DocumentId start_document_id,NamespaceId ref_namespace_id,uint64_t start_ref_hash_uri)87 std::vector<JoinDataType> CreateData(int num_data, DocumentId start_document_id,
88                                      NamespaceId ref_namespace_id,
89                                      uint64_t start_ref_hash_uri) {
90   std::vector<JoinDataType> data;
91   data.reserve(num_data);
92   for (int i = 0; i < num_data; ++i) {
93     data.push_back(JoinDataType(
94         start_document_id,
95         NamespaceIdFingerprint(ref_namespace_id,
96                                /*fingerprint=*/start_ref_hash_uri)));
97 
98     ++start_document_id;
99     ++start_ref_hash_uri;
100   }
101   return data;
102 }
103 
TEST_F(PostingListJoinDataAccessorTest,DataAddAndRetrieveProperly)104 TEST_F(PostingListJoinDataAccessorTest, DataAddAndRetrieveProperly) {
105   ICING_ASSERT_OK_AND_ASSIGN(
106       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
107       PostingListJoinDataAccessor<JoinDataType>::Create(
108           flash_index_storage_.get(), serializer_.get()));
109   // Add some join data
110   std::vector<JoinDataType> data_vec =
111       CreateData(/*num_data=*/5, /*start_document_id=*/0,
112                  /*ref_namespace_id=*/kDefaultNamespaceId,
113                  /*start_ref_hash_uri=*/819);
114   for (const JoinDataType& data : data_vec) {
115     EXPECT_THAT(pl_accessor->PrependData(data), IsOk());
116   }
117   PostingListAccessor::FinalizeResult result =
118       std::move(*pl_accessor).Finalize();
119   EXPECT_THAT(result.status, IsOk());
120   EXPECT_THAT(result.id.block_index(), Eq(1));
121   EXPECT_THAT(result.id.posting_list_index(), Eq(0));
122 
123   // Retrieve some data.
124   ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
125                              flash_index_storage_->GetPostingList(result.id));
126   EXPECT_THAT(
127       serializer_->GetData(&pl_holder.posting_list),
128       IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend())));
129   EXPECT_THAT(pl_holder.next_block_index, Eq(kInvalidBlockIndex));
130 }
131 
TEST_F(PostingListJoinDataAccessorTest,PreexistingPLKeepOnSameBlock)132 TEST_F(PostingListJoinDataAccessorTest, PreexistingPLKeepOnSameBlock) {
133   ICING_ASSERT_OK_AND_ASSIGN(
134       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
135       PostingListJoinDataAccessor<JoinDataType>::Create(
136           flash_index_storage_.get(), serializer_.get()));
137   // Add a single data. This will fit in a min-sized posting list.
138   JoinDataType data1(
139       /*document_id=*/1,
140       NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/123));
141   ICING_ASSERT_OK(pl_accessor->PrependData(data1));
142   PostingListAccessor::FinalizeResult result1 =
143       std::move(*pl_accessor).Finalize();
144   ICING_ASSERT_OK(result1.status);
145   // Should be allocated to the first block.
146   ASSERT_THAT(result1.id.block_index(), Eq(1));
147   ASSERT_THAT(result1.id.posting_list_index(), Eq(0));
148 
149   // Add one more data. The minimum size for a posting list must be able to fit
150   // two data, so this should NOT cause the previous pl to be reallocated.
151   ICING_ASSERT_OK_AND_ASSIGN(
152       pl_accessor,
153       PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
154           flash_index_storage_.get(), serializer_.get(), result1.id));
155   JoinDataType data2(
156       /*document_id=*/2,
157       NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/456));
158   ICING_ASSERT_OK(pl_accessor->PrependData(data2));
159   PostingListAccessor::FinalizeResult result2 =
160       std::move(*pl_accessor).Finalize();
161   ICING_ASSERT_OK(result2.status);
162   // Should be in the same posting list.
163   EXPECT_THAT(result2.id, Eq(result1.id));
164 
165   // The posting list at result2.id should hold all of the data that have been
166   // added.
167   ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
168                              flash_index_storage_->GetPostingList(result2.id));
169   EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
170               IsOkAndHolds(ElementsAre(data2, data1)));
171 }
172 
TEST_F(PostingListJoinDataAccessorTest,PreexistingPLReallocateToLargerPL)173 TEST_F(PostingListJoinDataAccessorTest, PreexistingPLReallocateToLargerPL) {
174   ICING_ASSERT_OK_AND_ASSIGN(
175       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
176       PostingListJoinDataAccessor<JoinDataType>::Create(
177           flash_index_storage_.get(), serializer_.get()));
178   // Adding 3 data should cause Finalize allocating a 56-byte posting list,
179   // which can store at most 4 data.
180   std::vector<JoinDataType> data_vec1 =
181       CreateData(/*num_data=*/3, /*start_document_id=*/0,
182                  /*ref_namespace_id=*/kDefaultNamespaceId,
183                  /*start_ref_hash_uri=*/819);
184   for (const JoinDataType& data : data_vec1) {
185     ICING_ASSERT_OK(pl_accessor->PrependData(data));
186   }
187   PostingListAccessor::FinalizeResult result1 =
188       std::move(*pl_accessor).Finalize();
189   ICING_ASSERT_OK(result1.status);
190   // Should be allocated to the first block.
191   ASSERT_THAT(result1.id.block_index(), Eq(1));
192   ASSERT_THAT(result1.id.posting_list_index(), Eq(0));
193 
194   // Now add more data.
195   ICING_ASSERT_OK_AND_ASSIGN(
196       pl_accessor,
197       PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
198           flash_index_storage_.get(), serializer_.get(), result1.id));
199   // The current posting list can fit 1 more data. Adding 12 more data should
200   // result in these data being moved to a larger posting list. Also the total
201   // size of these data won't exceed max size posting list, so there will be
202   // only one single posting list and no chain.
203   std::vector<JoinDataType> data_vec2 = CreateData(
204       /*num_data=*/12, /*start_document_id=*/data_vec1.back().document_id() + 1,
205       /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
206 
207   for (const JoinDataType& data : data_vec2) {
208     ICING_ASSERT_OK(pl_accessor->PrependData(data));
209   }
210   PostingListAccessor::FinalizeResult result2 =
211       std::move(*pl_accessor).Finalize();
212   ICING_ASSERT_OK(result2.status);
213   // Should be allocated to the second (new) block because the posting list
214   // should grow beyond the size that the first block maintains.
215   EXPECT_THAT(result2.id.block_index(), Eq(2));
216   EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
217 
218   // The posting list at result2.id should hold all of the data that have been
219   // added.
220   std::vector<JoinDataType> all_data_vec;
221   all_data_vec.reserve(data_vec1.size() + data_vec2.size());
222   all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end());
223   all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end());
224   ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
225                              flash_index_storage_->GetPostingList(result2.id));
226   EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
227               IsOkAndHolds(ElementsAreArray(all_data_vec.rbegin(),
228                                             all_data_vec.rend())));
229 }
230 
TEST_F(PostingListJoinDataAccessorTest,MultiBlockChainsBlocksProperly)231 TEST_F(PostingListJoinDataAccessorTest, MultiBlockChainsBlocksProperly) {
232   ICING_ASSERT_OK_AND_ASSIGN(
233       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
234       PostingListJoinDataAccessor<JoinDataType>::Create(
235           flash_index_storage_.get(), serializer_.get()));
236   // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(JoinDataType)
237   // is 14, so the max size posting list can store (4096 - 12) / 14 = 291 data.
238   // Adding 292 data should cause:
239   // - 2 max size posting lists being allocated to block 1 and block 2.
240   // - Chaining: block 2 -> block 1
241   std::vector<JoinDataType> data_vec = CreateData(
242       /*num_data=*/292, /*start_document_id=*/0,
243       /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
244   for (const JoinDataType& data : data_vec) {
245     ICING_ASSERT_OK(pl_accessor->PrependData(data));
246   }
247   PostingListAccessor::FinalizeResult result1 =
248       std::move(*pl_accessor).Finalize();
249   ICING_ASSERT_OK(result1.status);
250   PostingListIdentifier second_block_id = result1.id;
251   // Should be allocated to the second block.
252   EXPECT_THAT(second_block_id, Eq(PostingListIdentifier(
253                                    /*block_index=*/2, /*posting_list_index=*/0,
254                                    /*posting_list_index_bits=*/0)));
255 
256   // We should be able to retrieve all data.
257   ICING_ASSERT_OK_AND_ASSIGN(
258       PostingListHolder pl_holder,
259       flash_index_storage_->GetPostingList(second_block_id));
260   // This pl_holder will only hold a posting list with the data that didn't fit
261   // on the first block.
262   ICING_ASSERT_OK_AND_ASSIGN(std::vector<JoinDataType> second_block_data,
263                              serializer_->GetData(&pl_holder.posting_list));
264   ASSERT_THAT(second_block_data, SizeIs(Lt(data_vec.size())));
265   auto first_block_data_start = data_vec.rbegin() + second_block_data.size();
266   EXPECT_THAT(second_block_data,
267               ElementsAreArray(data_vec.rbegin(), first_block_data_start));
268 
269   // Now retrieve all of the data that were on the first block.
270   uint32_t first_block_id = pl_holder.next_block_index;
271   EXPECT_THAT(first_block_id, Eq(1));
272 
273   PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
274                               /*posting_list_index_bits=*/0);
275   ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
276                              flash_index_storage_->GetPostingList(pl_id));
277   EXPECT_THAT(
278       serializer_->GetData(&pl_holder.posting_list),
279       IsOkAndHolds(ElementsAreArray(first_block_data_start, data_vec.rend())));
280 }
281 
TEST_F(PostingListJoinDataAccessorTest,PreexistingMultiBlockReusesBlocksProperly)282 TEST_F(PostingListJoinDataAccessorTest,
283        PreexistingMultiBlockReusesBlocksProperly) {
284   ICING_ASSERT_OK_AND_ASSIGN(
285       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
286       PostingListJoinDataAccessor<JoinDataType>::Create(
287           flash_index_storage_.get(), serializer_.get()));
288   // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(JoinDataType)
289   // is 14, so the max size posting list can store (4096 - 12) / 14 = 291 data.
290   // Adding 292 data will cause:
291   // - 2 max size posting lists being allocated to block 1 and block 2.
292   // - Chaining: block 2 -> block 1
293   std::vector<JoinDataType> data_vec1 = CreateData(
294       /*num_data=*/292, /*start_document_id=*/0,
295       /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
296   for (const JoinDataType& data : data_vec1) {
297     ICING_ASSERT_OK(pl_accessor->PrependData(data));
298   }
299   PostingListAccessor::FinalizeResult result1 =
300       std::move(*pl_accessor).Finalize();
301   ICING_ASSERT_OK(result1.status);
302   PostingListIdentifier first_add_id = result1.id;
303   EXPECT_THAT(first_add_id, Eq(PostingListIdentifier(
304                                 /*block_index=*/2, /*posting_list_index=*/0,
305                                 /*posting_list_index_bits=*/0)));
306 
307   // Now add more data. These should fit on the existing second block and not
308   // fill it up.
309   ICING_ASSERT_OK_AND_ASSIGN(
310       pl_accessor,
311       PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
312           flash_index_storage_.get(), serializer_.get(), first_add_id));
313   std::vector<JoinDataType> data_vec2 = CreateData(
314       /*num_data=*/10, /*start_document_id=*/data_vec1.back().document_id() + 1,
315       /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
316   for (const JoinDataType& data : data_vec2) {
317     ICING_ASSERT_OK(pl_accessor->PrependData(data));
318   }
319   PostingListAccessor::FinalizeResult result2 =
320       std::move(*pl_accessor).Finalize();
321   ICING_ASSERT_OK(result2.status);
322   PostingListIdentifier second_add_id = result2.id;
323   EXPECT_THAT(second_add_id, Eq(first_add_id));
324 
325   // We should be able to retrieve all data.
326   std::vector<JoinDataType> all_data_vec;
327   all_data_vec.reserve(data_vec1.size() + data_vec2.size());
328   all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end());
329   all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end());
330   ICING_ASSERT_OK_AND_ASSIGN(
331       PostingListHolder pl_holder,
332       flash_index_storage_->GetPostingList(second_add_id));
333   // This pl_holder will only hold a posting list with the data that didn't fit
334   // on the first block.
335   ICING_ASSERT_OK_AND_ASSIGN(std::vector<JoinDataType> second_block_data,
336                              serializer_->GetData(&pl_holder.posting_list));
337   ASSERT_THAT(second_block_data, SizeIs(Lt(all_data_vec.size())));
338   auto first_block_data_start =
339       all_data_vec.rbegin() + second_block_data.size();
340   EXPECT_THAT(second_block_data,
341               ElementsAreArray(all_data_vec.rbegin(), first_block_data_start));
342 
343   // Now retrieve all of the data that were on the first block.
344   uint32_t first_block_id = pl_holder.next_block_index;
345   EXPECT_THAT(first_block_id, Eq(1));
346 
347   PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
348                               /*posting_list_index_bits=*/0);
349   ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
350                              flash_index_storage_->GetPostingList(pl_id));
351   EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
352               IsOkAndHolds(ElementsAreArray(first_block_data_start,
353                                             all_data_vec.rend())));
354 }
355 
TEST_F(PostingListJoinDataAccessorTest,InvalidDataShouldReturnInvalidArgument)356 TEST_F(PostingListJoinDataAccessorTest,
357        InvalidDataShouldReturnInvalidArgument) {
358   ICING_ASSERT_OK_AND_ASSIGN(
359       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
360       PostingListJoinDataAccessor<JoinDataType>::Create(
361           flash_index_storage_.get(), serializer_.get()));
362   JoinDataType invalid_data = JoinDataType::GetInvalid();
363   EXPECT_THAT(pl_accessor->PrependData(invalid_data),
364               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
365 }
366 
TEST_F(PostingListJoinDataAccessorTest,JoinDataNonIncreasingShouldReturnInvalidArgument)367 TEST_F(PostingListJoinDataAccessorTest,
368        JoinDataNonIncreasingShouldReturnInvalidArgument) {
369   ICING_ASSERT_OK_AND_ASSIGN(
370       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
371       PostingListJoinDataAccessor<JoinDataType>::Create(
372           flash_index_storage_.get(), serializer_.get()));
373   JoinDataType data1(
374       /*document_id=*/1,
375       NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/819));
376   ICING_ASSERT_OK(pl_accessor->PrependData(data1));
377 
378   JoinDataType data2(
379       /*document_id=*/1,
380       NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/818));
381   EXPECT_THAT(pl_accessor->PrependData(data2),
382               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
383 
384   JoinDataType data3(
385       /*document_id=*/1,
386       NamespaceIdFingerprint(kDefaultNamespaceId - 1, /*fingerprint=*/820));
387   EXPECT_THAT(pl_accessor->PrependData(data3),
388               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
389 
390   JoinDataType data4(
391       /*document_id=*/0,
392       NamespaceIdFingerprint(kDefaultNamespaceId + 1, /*fingerprint=*/820));
393   EXPECT_THAT(pl_accessor->PrependData(data4),
394               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
395 }
396 
TEST_F(PostingListJoinDataAccessorTest,NewPostingListNoDataAddedShouldReturnInvalidArgument)397 TEST_F(PostingListJoinDataAccessorTest,
398        NewPostingListNoDataAddedShouldReturnInvalidArgument) {
399   ICING_ASSERT_OK_AND_ASSIGN(
400       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
401       PostingListJoinDataAccessor<JoinDataType>::Create(
402           flash_index_storage_.get(), serializer_.get()));
403   PostingListAccessor::FinalizeResult result =
404       std::move(*pl_accessor).Finalize();
405   EXPECT_THAT(result.status,
406               StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
407 }
408 
TEST_F(PostingListJoinDataAccessorTest,PreexistingPostingListNoDataAddedShouldSucceed)409 TEST_F(PostingListJoinDataAccessorTest,
410        PreexistingPostingListNoDataAddedShouldSucceed) {
411   ICING_ASSERT_OK_AND_ASSIGN(
412       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor1,
413       PostingListJoinDataAccessor<JoinDataType>::Create(
414           flash_index_storage_.get(), serializer_.get()));
415   JoinDataType data1(
416       /*document_id=*/1,
417       NamespaceIdFingerprint(kDefaultNamespaceId, /*fingerprint=*/819));
418   ICING_ASSERT_OK(pl_accessor1->PrependData(data1));
419   PostingListAccessor::FinalizeResult result1 =
420       std::move(*pl_accessor1).Finalize();
421   ICING_ASSERT_OK(result1.status);
422 
423   ICING_ASSERT_OK_AND_ASSIGN(
424       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor2,
425       PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
426           flash_index_storage_.get(), serializer_.get(), result1.id));
427   PostingListAccessor::FinalizeResult result2 =
428       std::move(*pl_accessor2).Finalize();
429   EXPECT_THAT(result2.status, IsOk());
430 }
431 
432 }  // namespace
433 
434 }  // namespace lib
435 }  // namespace icing
436