1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/index-processor.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "gmock/gmock.h"
28 #include "gtest/gtest.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/absl_ports/str_join.h"
31 #include "icing/document-builder.h"
32 #include "icing/feature-flags.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/portable-file-backed-proto-log.h"
35 #include "icing/index/data-indexing-handler.h"
36 #include "icing/index/hit/doc-hit-info.h"
37 #include "icing/index/hit/hit.h"
38 #include "icing/index/index.h"
39 #include "icing/index/integer-section-indexing-handler.h"
40 #include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
41 #include "icing/index/iterator/doc-hit-info-iterator.h"
42 #include "icing/index/numeric/integer-index.h"
43 #include "icing/index/numeric/numeric-index.h"
44 #include "icing/index/term-indexing-handler.h"
45 #include "icing/index/term-property-id.h"
46 #include "icing/join/qualified-id-join-index-impl-v1.h"
47 #include "icing/join/qualified-id-join-index.h"
48 #include "icing/join/qualified-id-join-indexing-handler.h"
49 #include "icing/legacy/index/icing-filesystem.h"
50 #include "icing/legacy/index/icing-mock-filesystem.h"
51 #include "icing/portable/platform.h"
52 #include "icing/proto/document.pb.h"
53 #include "icing/proto/schema.pb.h"
54 #include "icing/proto/term.pb.h"
55 #include "icing/schema-builder.h"
56 #include "icing/schema/schema-store.h"
57 #include "icing/schema/section.h"
58 #include "icing/store/document-id.h"
59 #include "icing/store/document-store.h"
60 #include "icing/testing/common-matchers.h"
61 #include "icing/testing/fake-clock.h"
62 #include "icing/testing/random-string.h"
63 #include "icing/testing/test-data.h"
64 #include "icing/testing/test-feature-flags.h"
65 #include "icing/testing/tmp-directory.h"
66 #include "icing/tokenization/language-segmenter-factory.h"
67 #include "icing/tokenization/language-segmenter.h"
68 #include "icing/transform/normalizer-factory.h"
69 #include "icing/transform/normalizer.h"
70 #include "icing/util/crc32.h"
71 #include "icing/util/icu-data-file-helper.h"
72 #include "icing/util/tokenized-document.h"
73 #include "unicode/uloc.h"
74
75 namespace icing {
76 namespace lib {
77
78 namespace {
79
// Several sentences of "Lorem ipsum" filler text, stored as one contiguous
// string literal. Nothing in the visible part of this file reads it yet;
// presumably later tests use it as long document content.
constexpr std::string_view kIpsumText =
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla convallis "
    "scelerisque orci quis hendrerit. Sed augue turpis, sodales eu gravida "
    "nec, scelerisque nec leo. Maecenas accumsan interdum commodo. Aliquam "
    "mattis sapien est, sit amet interdum risus dapibus sed. Maecenas leo "
    "erat, fringilla in nisl a, venenatis gravida metus. Phasellus venenatis, "
    "orci in aliquet mattis, lectus sapien volutpat arcu, sed hendrerit ligula "
    "arcu nec mauris. Integer dolor mi, rhoncus eget gravida et, pulvinar et "
    "nunc. Aliquam ac sollicitudin nisi. Vivamus sit amet urna vestibulum, "
    "tincidunt eros sed, efficitur nisl. Fusce non neque accumsan, sagittis "
    "nisi eget, sagittis turpis. Ut pulvinar nibh eu purus feugiat faucibus. "
    "Donec tellus nulla, tincidunt vel lacus id, bibendum fermentum turpis. "
    "Nullam ultrices sed nibh vitae aliquet. Ut risus neque, consectetur "
    "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
    "placerat semper.";
95
96 // schema types
97 constexpr std::string_view kFakeType = "FakeType";
98 constexpr std::string_view kNestedType = "NestedType";
99
100 // Indexable properties and section Id. Section Id is determined by the
101 // lexicographical order of indexable property path.
102 constexpr std::string_view kExactProperty = "exact";
103 constexpr std::string_view kIndexableIntegerProperty = "indexableInteger";
104 constexpr std::string_view kPrefixedProperty = "prefixed";
105 constexpr std::string_view kRepeatedProperty = "repeated";
106 constexpr std::string_view kRfc822Property = "rfc822";
107 constexpr std::string_view kSubProperty = "submessage"; // submessage.nested
108 constexpr std::string_view kNestedProperty = "nested"; // submessage.nested
109 // TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
110 // to Android.
111 #ifdef ENABLE_URL_TOKENIZER
112 constexpr std::string_view kUrlExactProperty = "urlExact";
113 constexpr std::string_view kUrlPrefixedProperty = "urlPrefixed";
114 #endif // ENABLE_URL_TOKENIZER
115 constexpr std::string_view kVerbatimExactProperty = "verbatimExact";
116 constexpr std::string_view kVerbatimPrefixedProperty = "verbatimPrefixed";
117
118 constexpr SectionId kExactSectionId = 0;
119 constexpr SectionId kIndexableIntegerSectionId = 1;
120 constexpr SectionId kPrefixedSectionId = 2;
121 constexpr SectionId kRepeatedSectionId = 3;
122 constexpr SectionId kRfc822SectionId = 4;
123 constexpr SectionId kNestedSectionId = 5; // submessage.nested
124 #ifdef ENABLE_URL_TOKENIZER
125 constexpr SectionId kUrlExactSectionId = 6;
126 constexpr SectionId kUrlPrefixedSectionId = 7;
127 constexpr SectionId kVerbatimExactSectionId = 8;
128 constexpr SectionId kVerbatimPrefixedSectionId = 9;
129 #else // !ENABLE_URL_TOKENIZER
130 constexpr SectionId kVerbatimExactSectionId = 6;
131 constexpr SectionId kVerbatimPrefixedSectionId = 7;
132 #endif // ENABLE_URL_TOKENIZER
133
134 // Other non-indexable properties.
135 constexpr std::string_view kUnindexedProperty1 = "unindexed1";
136 constexpr std::string_view kUnindexedProperty2 = "unindexed2";
137
138 constexpr DocumentId kDocumentId0 = 0;
139 constexpr DocumentId kDocumentId1 = 1;
140
141 using Cardinality = PropertyConfigProto::Cardinality;
142 using DataType = PropertyConfigProto::DataType;
143 using ::testing::ElementsAre;
144 using ::testing::Eq;
145 using ::testing::IsEmpty;
146 using ::testing::IsTrue;
147 using ::testing::SizeIs;
148 using ::testing::Test;
149 using ::testing::UnorderedElementsAreArray;
150
151 #ifdef ENABLE_URL_TOKENIZER
152 constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
153 StringIndexingConfig::TokenizerType::URL;
154 #endif // ENABLE_URL_TOKENIZER
155
156 class IndexProcessorTest : public Test {
157 protected:
SetUp()158 void SetUp() override {
159 feature_flags_ = std::make_unique<FeatureFlags>(GetTestFeatureFlags());
160 if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
161 ICING_ASSERT_OK(
162 // File generated via icu_data_file rule in //icing/BUILD.
163 icu_data_file_helper::SetUpIcuDataFile(
164 GetTestFilePath("icing/icu.dat")));
165 }
166
167 base_dir_ = GetTestTempDir() + "/index_processor_test";
168 ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
169 IsTrue());
170
171 index_dir_ = base_dir_ + "/index";
172 integer_index_dir_ = base_dir_ + "/integer_index";
173 qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index";
174 schema_store_dir_ = base_dir_ + "/schema_store";
175 doc_store_dir_ = base_dir_ + "/doc_store";
176
177 Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
178 /*lite_index_sort_at_indexing=*/true,
179 /*lite_index_sort_size=*/1024 * 8);
180 ICING_ASSERT_OK_AND_ASSIGN(
181 index_, Index::Create(options, &filesystem_, &icing_filesystem_));
182
183 ICING_ASSERT_OK_AND_ASSIGN(
184 integer_index_,
185 IntegerIndex::Create(
186 filesystem_, integer_index_dir_,
187 IntegerIndex::kDefaultNumDataThresholdForBucketSplit,
188 /*pre_mapping_fbv=*/false));
189
190 ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_,
191 QualifiedIdJoinIndexImplV1::Create(
192 filesystem_, qualified_id_join_index_dir_,
193 /*pre_mapping_fbv=*/false,
194 /*use_persistent_hash_map=*/false));
195
196 language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
197 ICING_ASSERT_OK_AND_ASSIGN(
198 lang_segmenter_,
199 language_segmenter_factory::Create(std::move(segmenter_options)));
200
201 ICING_ASSERT_OK_AND_ASSIGN(
202 normalizer_,
203 normalizer_factory::Create(
204 /*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
205
206 ASSERT_TRUE(
207 filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()));
208 ICING_ASSERT_OK_AND_ASSIGN(
209 schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_,
210 &fake_clock_, feature_flags_.get()));
211 SchemaProto schema =
212 SchemaBuilder()
213 .AddType(
214 SchemaTypeConfigBuilder()
215 .SetType(kFakeType)
216 .AddProperty(PropertyConfigBuilder()
217 .SetName(kExactProperty)
218 .SetDataTypeString(TERM_MATCH_EXACT,
219 TOKENIZER_PLAIN)
220 .SetCardinality(CARDINALITY_OPTIONAL))
221 .AddProperty(PropertyConfigBuilder()
222 .SetName(kPrefixedProperty)
223 .SetDataTypeString(TERM_MATCH_PREFIX,
224 TOKENIZER_PLAIN)
225 .SetCardinality(CARDINALITY_OPTIONAL))
226 .AddProperty(PropertyConfigBuilder()
227 .SetName(kUnindexedProperty1)
228 .SetDataType(TYPE_STRING)
229 .SetCardinality(CARDINALITY_OPTIONAL))
230 .AddProperty(PropertyConfigBuilder()
231 .SetName(kUnindexedProperty2)
232 .SetDataType(TYPE_BYTES)
233 .SetCardinality(CARDINALITY_OPTIONAL))
234 .AddProperty(PropertyConfigBuilder()
235 .SetName(kRepeatedProperty)
236 .SetDataTypeString(TERM_MATCH_PREFIX,
237 TOKENIZER_PLAIN)
238 .SetCardinality(CARDINALITY_REPEATED))
239 .AddProperty(PropertyConfigBuilder()
240 .SetName(kVerbatimExactProperty)
241 .SetDataTypeString(TERM_MATCH_EXACT,
242 TOKENIZER_VERBATIM)
243 .SetCardinality(CARDINALITY_REPEATED))
244 .AddProperty(PropertyConfigBuilder()
245 .SetName(kVerbatimPrefixedProperty)
246 .SetDataTypeString(TERM_MATCH_PREFIX,
247 TOKENIZER_VERBATIM)
248 .SetCardinality(CARDINALITY_REPEATED))
249 .AddProperty(PropertyConfigBuilder()
250 .SetName(kRfc822Property)
251 .SetDataTypeString(TERM_MATCH_PREFIX,
252 TOKENIZER_RFC822)
253 .SetCardinality(CARDINALITY_REPEATED))
254 #ifdef ENABLE_URL_TOKENIZER
255 .AddProperty(
256 PropertyConfigBuilder()
257 .SetName(kUrlExactProperty)
258 .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_URL)
259 .SetCardinality(CARDINALITY_REPEATED))
260 .AddProperty(
261 PropertyConfigBuilder()
262 .SetName(kUrlPrefixedProperty)
263 .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_URL)
264 .SetCardinality(CARDINALITY_REPEATED))
265 #endif // ENABLE_URL_TOKENIZER
266 .AddProperty(PropertyConfigBuilder()
267 .SetName(kIndexableIntegerProperty)
268 .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
269 .SetCardinality(CARDINALITY_REPEATED))
270 .AddProperty(
271 PropertyConfigBuilder()
272 .SetName(kSubProperty)
273 .SetDataTypeDocument(
274 kNestedType, /*index_nested_properties=*/true)
275 .SetCardinality(CARDINALITY_OPTIONAL)))
276 .AddType(
277 SchemaTypeConfigBuilder()
278 .SetType(kNestedType)
279 .AddProperty(PropertyConfigBuilder()
280 .SetName(kNestedProperty)
281 .SetDataTypeString(TERM_MATCH_PREFIX,
282 TOKENIZER_PLAIN)
283 .SetCardinality(CARDINALITY_OPTIONAL)))
284 .Build();
285 ICING_ASSERT_OK(schema_store_->SetSchema(
286 schema, /*ignore_errors_and_delete_documents=*/false,
287 /*allow_circular_schema_definitions=*/false));
288
289 ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()));
290 ICING_ASSERT_OK_AND_ASSIGN(
291 DocumentStore::CreateResult create_result,
292 DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_,
293 schema_store_.get(), feature_flags_.get(),
294 /*force_recovery_and_revalidate_documents=*/false,
295 /*pre_mapping_fbv=*/false,
296 /*use_persistent_hash_map=*/true,
297 PortableFileBackedProtoLog<
298 DocumentWrapper>::kDefaultCompressionLevel,
299 /*initialize_stats=*/nullptr));
300 doc_store_ = std::move(create_result.document_store);
301
302 ICING_ASSERT_OK_AND_ASSIGN(
303 std::unique_ptr<TermIndexingHandler> term_indexing_handler,
304 TermIndexingHandler::Create(
305 &fake_clock_, normalizer_.get(), index_.get(),
306 /*build_property_existence_metadata_hits=*/true));
307 ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler>
308 integer_section_indexing_handler,
309 IntegerSectionIndexingHandler::Create(
310 &fake_clock_, integer_index_.get()));
311 ICING_ASSERT_OK_AND_ASSIGN(
312 std::unique_ptr<QualifiedIdJoinIndexingHandler>
313 qualified_id_join_indexing_handler,
314 QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
315 qualified_id_join_index_.get()));
316 std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
317 handlers.push_back(std::move(term_indexing_handler));
318 handlers.push_back(std::move(integer_section_indexing_handler));
319 handlers.push_back(std::move(qualified_id_join_indexing_handler));
320
321 index_processor_ =
322 std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
323
324 mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>();
325 }
326
TearDown()327 void TearDown() override {
328 index_processor_.reset();
329 doc_store_.reset();
330 schema_store_.reset();
331 normalizer_.reset();
332 lang_segmenter_.reset();
333 qualified_id_join_index_.reset();
334 integer_index_.reset();
335 index_.reset();
336
337 filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
338 }
339
340 std::unique_ptr<IcingMockFilesystem> mock_icing_filesystem_;
341
342 std::unique_ptr<FeatureFlags> feature_flags_;
343 Filesystem filesystem_;
344 IcingFilesystem icing_filesystem_;
345 FakeClock fake_clock_;
346 std::string base_dir_;
347 std::string index_dir_;
348 std::string integer_index_dir_;
349 std::string qualified_id_join_index_dir_;
350 std::string schema_store_dir_;
351 std::string doc_store_dir_;
352
353 std::unique_ptr<Index> index_;
354 std::unique_ptr<NumericIndex<int64_t>> integer_index_;
355 std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_;
356 std::unique_ptr<LanguageSegmenter> lang_segmenter_;
357 std::unique_ptr<Normalizer> normalizer_;
358 std::unique_ptr<SchemaStore> schema_store_;
359 std::unique_ptr<DocumentStore> doc_store_;
360
361 std::unique_ptr<IndexProcessor> index_processor_;
362 };
363
GetHits(std::unique_ptr<DocHitInfoIterator> iterator)364 std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
365 std::vector<DocHitInfo> infos;
366 while (iterator->Advance().ok()) {
367 infos.push_back(iterator->doc_hit_info());
368 }
369 return infos;
370 }
371
GetHitsWithTermFrequency(std::unique_ptr<DocHitInfoIterator> iterator)372 std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency(
373 std::unique_ptr<DocHitInfoIterator> iterator) {
374 std::vector<DocHitInfoTermFrequencyPair> infos;
375 while (iterator->Advance().ok()) {
376 std::vector<TermMatchInfo> matched_terms_stats;
377 iterator->PopulateMatchedTermsStats(&matched_terms_stats);
378 for (const TermMatchInfo& term_match_info : matched_terms_stats) {
379 infos.push_back(DocHitInfoTermFrequencyPair(
380 iterator->doc_hit_info(), term_match_info.term_frequencies));
381 }
382 }
383 return infos;
384 }
385
// A document that only populates the two unindexed properties is processed
// successfully, and the index still records its document id as last added.
TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
  const DocumentProto unindexed_only_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kUnindexedProperty1), "foo bar baz")
          .AddBytesProperty(std::string(kUnindexedProperty2),
                            "attachment bytes")
          .Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                unindexed_only_doc));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
405
// A document whose exact-match property contains only punctuation ("?...!")
// is processed successfully, and its document id is recorded as last added.
TEST_F(IndexProcessorTest, NoValidContent) {
  const DocumentProto punctuation_only_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "?...!")
          .Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                punctuation_only_doc));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
423
// Indexes one document with "hello world" in the exact property, then checks
// that "hello" is found in the exact section with term frequency 1 and is not
// found when the lookup is restricted to the prefixed section.
TEST_F(IndexProcessorTest, OneDoc) {
  const DocumentProto hello_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                hello_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Exact lookup over every section.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> all_sections_itr,
      index_->GetIterator("hello", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<DocHitInfoTermFrequencyPair> all_section_hits =
      GetHitsWithTermFrequency(std::move(all_sections_itr));
  const std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{
      {kExactSectionId, 1}};
  EXPECT_THAT(all_section_hits,
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId0, expected_frequencies)));

  // Same term, but only the prefixed section is allowed: nothing matches.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> prefixed_only_itr,
      index_->GetIterator(
          "hello", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
          1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY));
  EXPECT_THAT(GetHits(std::move(prefixed_only_itr)), IsEmpty());
}
459
// Indexes two documents and checks per-section term frequencies:
//  - "world" appears once in doc 0's exact section and twice in doc 1's
//    prefixed section;
//  - "coffee" is repeated more than Hit::kMaxTermFrequency times in doc 1, and
//    its reported frequency is expected to be exactly Hit::kMaxTermFrequency.
TEST_F(IndexProcessorTest, MultipleDocs) {
  // Document 0.
  const DocumentProto first_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument first_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                first_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(first_tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "coffee" followed by (kMaxTermFrequency + 1) more occurrences.
  std::string repeated_coffee = "coffee";
  for (int remaining = Hit::kMaxTermFrequency + 1; remaining > 0; --remaining) {
    repeated_coffee.append(" coffee");
  }

  // Document 1.
  const DocumentProto second_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/2")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), repeated_coffee)
          .AddStringProperty(std::string(kPrefixedProperty),
                             "mr. world world wide")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument second_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                second_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(second_tokenized, kDocumentId1,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));

  // "world" over all sections: doc 1 is returned before doc 0.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> world_all_itr,
      index_->GetIterator("world", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<DocHitInfoTermFrequencyPair> world_all_hits =
      GetHitsWithTermFrequency(std::move(world_all_itr));
  const std::unordered_map<SectionId, Hit::TermFrequency> doc1_world_prefixed{
      {kPrefixedSectionId, 2}};
  const std::unordered_map<SectionId, Hit::TermFrequency> doc0_world_exact{
      {kExactSectionId, 1}};
  EXPECT_THAT(
      world_all_hits,
      ElementsAre(
          EqualsDocHitInfoWithTermFrequency(kDocumentId1, doc1_world_prefixed),
          EqualsDocHitInfoWithTermFrequency(kDocumentId0, doc0_world_exact)));

  // "world" restricted to the prefixed section: only doc 1 remains.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> world_prefixed_itr,
      index_->GetIterator(
          "world", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
          1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY));
  const std::vector<DocHitInfoTermFrequencyPair> world_prefixed_hits =
      GetHitsWithTermFrequency(std::move(world_prefixed_itr));
  const std::unordered_map<SectionId, Hit::TermFrequency>
      expected_world_prefixed{{kPrefixedSectionId, 2}};
  EXPECT_THAT(world_prefixed_hits,
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId1, expected_world_prefixed)));

  // "coffee": the frequency is expected to equal kMaxTermFrequency.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> coffee_itr,
      index_->GetIterator("coffee", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<DocHitInfoTermFrequencyPair> coffee_hits =
      GetHitsWithTermFrequency(std::move(coffee_itr));
  const std::unordered_map<SectionId, Hit::TermFrequency> expected_coffee{
      {kExactSectionId, Hit::kMaxTermFrequency}};
  EXPECT_THAT(coffee_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                               kDocumentId1, expected_coffee)));
}
537
// A term that only occurs inside the nested document ("submessage.nested")
// is reported as a hit on the outer document in the nested section.
TEST_F(IndexProcessorTest, DocWithNestedProperty) {
  DocumentProto nested_doc =
      DocumentBuilder()
          .SetKey("icing", "nested_type/1")
          .SetSchema(std::string(kNestedType))
          .AddStringProperty(std::string(kNestedProperty), "rocky raccoon")
          .Build();
  const DocumentProto outer_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .AddDocumentProperty(std::string(kSubProperty),
                               std::move(nested_doc))
          .Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                outer_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> rocky_itr,
      index_->GetIterator("rocky", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<SectionId> expected_sections{kNestedSectionId};
  EXPECT_THAT(GetHits(std::move(rocky_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId0, expected_sections)));
}
572
// A term from the second value of a repeated property ("italian stallion")
// is reported as a hit in the repeated section.
TEST_F(IndexProcessorTest, DocWithRepeatedProperty) {
  const DocumentProto repeated_values_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .AddStringProperty(std::string(kRepeatedProperty), "rocky",
                             "italian stallion")
          .Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                repeated_values_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> italian_itr,
      index_->GetIterator("italian", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<SectionId> expected_sections{kRepeatedSectionId};
  EXPECT_THAT(GetHits(std::move(italian_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId0, expected_sections)));
}
601
602 // TODO(b/196771754) This test is disabled on Android because it takes too long
603 // to generate all of the unique terms and the test times out. Try storing these
604 // unique terms in a file that the test can read from.
605 #ifndef __ANDROID__
606
// Puts the same 200,000 unique terms into three indexed properties (about
// 600,000 hits) and expects indexing to fail with RESOURCE_EXHAUSTED and the
// "Hit buffer is full!" message. The document id is still recorded as last
// added even though indexing failed.
TEST_F(IndexProcessorTest, HitBufferExhaustedTest) {
  // Testing has shown that adding ~600,000 hits will fill up the hit buffer.
  const std::vector<std::string> unique_terms = GenerateUniqueTerms(200000);
  const std::string joined_terms = absl_ports::StrJoin(unique_terms, " ");

  const DocumentProto oversized_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), joined_terms)
          .AddStringProperty(std::string(kPrefixedProperty), joined_terms)
          .AddStringProperty(std::string(kRepeatedProperty), joined_terms)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                oversized_doc));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED,
               ::testing::HasSubstr("Hit buffer is full!")));
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
631
// Puts 300,000 unique terms into one exact property and expects indexing to
// fail with RESOURCE_EXHAUSTED. The document id is still recorded as last
// added even though indexing failed.
TEST_F(IndexProcessorTest, LexiconExhaustedTest) {
  // Testing has shown that adding ~300,000 terms generated this way will
  // fill up the lexicon.
  const std::vector<std::string> unique_terms = GenerateUniqueTerms(300000);
  const std::string joined_terms = absl_ports::StrJoin(unique_terms, " ");

  const DocumentProto oversized_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), joined_terms)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                oversized_doc));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
654
655 #endif // __ANDROID__
656
// Rebuilds the processor with a normalizer limited to 4-byte terms and checks
// that longer tokens are indexed in truncated form: "good" is found as-is,
// "night" is not found, and its truncation "nigh" is.
TEST_F(IndexProcessorTest, TooLongTokens) {
  // The processor created in SetUp() holds a raw pointer to normalizer_.
  // Destroy it before swapping the normalizer so that nothing is left pointing
  // at a freed object.
  index_processor_.reset();

  // Only allow the tokens of length four, truncating "hello", "world" and
  // "night". The normalizer is stored in the fixture (not in a local) because
  // the new processor keeps a raw pointer to it and is only destroyed in
  // TearDown(), after this function's locals are gone.
  ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
                                              /*max_term_byte_size=*/4));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<TermIndexingHandler> term_indexing_handler,
      TermIndexingHandler::Create(
          &fake_clock_, normalizer_.get(), index_.get(),
          /*build_property_existence_metadata_hits=*/true));
  std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
  handlers.push_back(std::move(term_indexing_handler));

  index_processor_ =
      std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);

  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "hello world")
          .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "good" should have been indexed normally.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("good", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  EXPECT_THAT(GetHits(std::move(itr)),
              ElementsAre(EqualsDocHitInfo(
                  kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));

  // "night" should not have been.
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("night", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
  EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());

  // "night" should have been truncated to "nigh".
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("nigh", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
  EXPECT_THAT(GetHits(std::move(itr)),
              ElementsAre(EqualsDocHitInfo(
                  kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
}
718
// Indexes an 8-token sentence in which five tokens start with "r", then runs
// the prefix query "r" twice: once before Index::Merge() and once after it.
// Both queries must return the same hits, with a combined term frequency of 5
// in the prefixed section.
TEST_F(IndexProcessorTest,
       PrefixedQueryReturnsCombinedTermFrequenciesForBothIndices) {
  const DocumentProto r_words_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(
              std::string(kPrefixedProperty),
              "rocket the raccoon retreated from the rodent resistance")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                r_words_doc));
  EXPECT_THAT(tokenized_doc.num_string_tokens(), Eq(8));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Before the merge the query is served by the lite index. "r" should have 5
  // matches.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> before_merge_itr,
      index_->GetIterator("r", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  const std::vector<DocHitInfoTermFrequencyPair> before_merge_hits =
      GetHitsWithTermFrequency(std::move(before_merge_itr));

  // After the merge the same query goes to the main index; the results must
  // not change.
  ASSERT_THAT(index_->Merge(), IsOk());
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> after_merge_itr,
      index_->GetIterator("r", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  const std::vector<DocHitInfoTermFrequencyPair> after_merge_hits =
      GetHitsWithTermFrequency(std::move(after_merge_itr));
  EXPECT_THAT(after_merge_hits, UnorderedElementsAreArray(before_merge_hits));

  const std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{
      {kPrefixedSectionId, 5}};
  EXPECT_THAT(before_merge_hits,
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId0, expected_frequencies)));
}
767
// Doc 0 has "rocky" in the exact-match property and doc 1 has it in the
// prefix-match property. A prefix query for "rock" must only return doc 1.
TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
  // Document 0: content indexed for exact matching only.
  const DocumentProto exact_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "best rocky movies")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument exact_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                exact_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(exact_tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Document 1: content indexed for prefix matching.
  const DocumentProto prefixed_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/2")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument prefixed_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                prefixed_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(prefixed_tokenized, kDocumentId1,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));

  // Only document_id 1 should surface in a prefix query for "rock".
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> rock_itr,
      index_->GetIterator("rock", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  const std::vector<SectionId> expected_sections{kPrefixedSectionId};
  EXPECT_THAT(GetHits(std::move(rock_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId1, expected_sections)));
}
811
// Indexes one document whose text is upper case and one whose text is lower
// case, then checks that an exact query for the lower-case term "case" hits
// both documents.
TEST_F(IndexProcessorTest, TokenNormalization) {
  // Document 0: upper-case content.
  DocumentProto upper_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_upper_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                upper_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_upper_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Document 1: lower-case content.
  DocumentProto lower_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/2")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "all lower case")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_lower_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                lower_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_lower_doc, kDocumentId1,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));

  // Both documents are expected back, document 1 first.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> exact_itr,
      index_->GetIterator("case", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<SectionId> exact_section{kExactSectionId};
  EXPECT_THAT(GetHits(std::move(exact_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId1, exact_section),
                          EqualsDocHitInfo(kDocumentId0, exact_section)));
}
857
// Checks that IndexDocument rejects a document id that is not greater than
// the last added id with INVALID_ARGUMENT, and that neither index_ nor
// integer_index_ changes when that happens.
TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
  // Seed the indices with a document stored under kDocumentId1.
  DocumentProto first_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
          .AddInt64Property(std::string(kIndexableIntegerProperty), 123)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_first_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                first_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_first_doc, kDocumentId1,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));

  // Snapshot the state that must survive the rejected calls below.
  ICING_ASSERT_OK_AND_ASSIGN(int64_t size_before, index_->GetElementsSize());
  ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc_before,
                             integer_index_->UpdateChecksums());

  // Asserts that both index_ and integer_index_ still match the snapshot.
  const auto expect_indices_unchanged = [&]() {
    EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
    EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(size_before));
    EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
    EXPECT_THAT(integer_index_->UpdateChecksums(), IsOkAndHolds(crc_before));
  };

  DocumentProto second_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/2")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "all lower case")
          .AddInt64Property(std::string(kIndexableIntegerProperty), 456)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_second_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                second_doc));

  // A document id below last_added_document_id must be rejected.
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_second_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
  expect_indices_unchanged();

  // A document id equal to last_added_document_id must be rejected as well.
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_second_doc, kDocumentId1,
                                      /*old_document_id=*/kInvalidDocumentId),
      StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
  expect_indices_unchanged();
}
917
// Same scenario as the out-of-order test, but with an IndexProcessor built
// with recovery_mode=true: stale document ids are accepted with an OK status
// while index_ and integer_index_ are expected to stay untouched.
TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) {
  // Assemble a dedicated processor with term, integer and join handlers.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<TermIndexingHandler> term_handler,
      TermIndexingHandler::Create(
          &fake_clock_, normalizer_.get(), index_.get(),
          /*build_property_existence_metadata_hits=*/true));
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<IntegerSectionIndexingHandler> integer_handler,
      IntegerSectionIndexingHandler::Create(&fake_clock_,
                                            integer_index_.get()));
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<QualifiedIdJoinIndexingHandler> join_handler,
      QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
                                             qualified_id_join_index_.get()));
  std::vector<std::unique_ptr<DataIndexingHandler>> recovery_handlers;
  recovery_handlers.push_back(std::move(term_handler));
  recovery_handlers.push_back(std::move(integer_handler));
  recovery_handlers.push_back(std::move(join_handler));

  IndexProcessor recovery_processor(std::move(recovery_handlers), &fake_clock_,
                                    /*recovery_mode=*/true);

  // Seed the indices with a document stored under kDocumentId1.
  DocumentProto first_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
          .AddInt64Property(std::string(kIndexableIntegerProperty), 123)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_first_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                first_doc));
  EXPECT_THAT(
      recovery_processor.IndexDocument(tokenized_first_doc, kDocumentId1,
                                       /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));

  // Snapshot the state that must survive the ignored calls below.
  ICING_ASSERT_OK_AND_ASSIGN(int64_t size_before, index_->GetElementsSize());
  ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc_before,
                             integer_index_->UpdateChecksums());

  // Asserts that both index_ and integer_index_ still match the snapshot.
  const auto expect_indices_unchanged = [&]() {
    EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
    EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(size_before));
    EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
    EXPECT_THAT(integer_index_->UpdateChecksums(), IsOkAndHolds(crc_before));
  };

  DocumentProto second_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/2")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "all lower case")
          .AddInt64Property(std::string(kIndexableIntegerProperty), 456)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_second_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                second_doc));

  // In recovery mode a document id below last_added_document_id reports OK,
  // yet the index data is expected to remain unchanged.
  EXPECT_THAT(
      recovery_processor.IndexDocument(tokenized_second_doc, kDocumentId0,
                                       /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  expect_indices_unchanged();

  // The same holds for a document id equal to last_added_document_id.
  EXPECT_THAT(
      recovery_processor.IndexDocument(tokenized_second_doc, kDocumentId1,
                                       /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  expect_indices_unchanged();
}
1000
// Replaces the fixture's segmenter with one configured for
// ULOC_SIMPLIFIED_CHINESE, indexes Chinese text containing punctuation, and
// checks that an exact query for "你好" finds the document.
TEST_F(IndexProcessorTest, NonAsciiIndexing) {
  language_segmenter_factory::SegmenterOptions chinese_options(
      ULOC_SIMPLIFIED_CHINESE);
  ICING_ASSERT_OK_AND_ASSIGN(
      lang_segmenter_,
      language_segmenter_factory::Create(std::move(chinese_options)));

  DocumentProto chinese_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty),
                             "你好,世界!你好:世界。“你好”世界?")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_chinese_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                chinese_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_chinese_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Exact lookup of a non-ASCII term.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> exact_itr,
      index_->GetIterator("你好", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<SectionId> expected_sections{kExactSectionId};
  EXPECT_THAT(GetHits(std::move(exact_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId0, expected_sections)));
}
1034
// Indexes a document containing one token longer than kMaxTokenLength next to
// ordinary tokens. IndexDocument is expected to report RESOURCE_EXHAUSTED
// while last_added_document_id still advances to the document's id.
TEST_F(IndexProcessorTest,
       LexiconFullIndexesSmallerTokensReturnsResourceExhausted) {
  // This is the maximum token length that an empty lexicon constructed for a
  // lite index with merge size of 1MiB can support.
  constexpr int kMaxTokenLength = 16777217;
  // One character past the limit: "ppppppp...".
  const std::string oversized_token(kMaxTokenLength + 1, 'p');

  DocumentProto oversized_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty),
                             absl_ports::StrCat(oversized_token, " foo"))
          .AddStringProperty(std::string(kPrefixedProperty), "bar baz")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_oversized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                oversized_doc));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_oversized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
1061
// Rebuilds index_ with a small index_merge_size and indexes the same document
// repeatedly, one document past the count at which the LiteIndex was found to
// fill up. Every IndexDocument call is expected to succeed.
TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
  DocumentProto ipsum_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), kIpsumText)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_ipsum_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                ipsum_doc));

  // Use a merge size of 100 serialized documents so the LiteIndex is small
  // and easy to fill. Per the original author's note, the LiteIndex itself is
  // larger than index_merge_size because it keeps extra buffer for whichever
  // document triggers the merge.
  Index::Options small_merge_options(
      index_dir_,
      /*index_merge_size=*/ipsum_doc.ByteSizeLong() * 100,
      /*lite_index_sort_at_indexing=*/true,
      /*lite_index_sort_size=*/64);
  ICING_ASSERT_OK_AND_ASSIGN(
      index_,
      Index::Create(small_merge_options, &filesystem_, &icing_filesystem_));

  // Point the processor at the recreated index.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<TermIndexingHandler> term_handler,
      TermIndexingHandler::Create(
          &fake_clock_, normalizer_.get(), index_.get(),
          /*build_property_existence_metadata_hits=*/true));
  std::vector<std::unique_ptr<DataIndexingHandler>> term_only_handlers;
  term_only_handlers.push_back(std::move(term_handler));
  index_processor_ = std::make_unique<IndexProcessor>(
      std::move(term_only_handlers), &fake_clock_);

  // Have determined experimentally that indexing 3373 documents with this text
  // will cause the LiteIndex to fill up. The inclusive upper bound indexes one
  // more document after that, which can only succeed if the index processor
  // merges the LiteIndex into the MainIndex and empties the LiteIndex.
  constexpr int kNumDocsLiteIndexExhaustion = 3373;
  for (DocumentId doc_id = 0; doc_id <= kNumDocsLiteIndexExhaustion;
       ++doc_id) {
    const DocumentId previous_id =
        (doc_id == 0) ? kInvalidDocumentId : doc_id - 1;
    EXPECT_THAT(
        index_processor_->IndexDocument(tokenized_ipsum_doc, doc_id,
                                        /*old_document_id=*/previous_id),
        IsOk());
    EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
  }
}
1117
// Forces a merge to fail by making the mock filesystem refuse to open one of
// the main lexicon property files, and checks that the failed IndexDocument
// call reports DATA_LOSS, leaves last_added_document_id at kInvalidDocumentId,
// and that indexing works again afterwards.
TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
  // 1. Make OpenForWrite fail (return -1) for any path ending in the main
  //    lexicon property file suffix; delegate everything else to filesystem_.
  auto failing_open_for_write = [this](const char* filename) {
    const std::string blocked_suffix =
        "/main-lexicon.prop." +
        std::to_string(GetHasHitsInPrefixSectionPropertyId());
    const std::string path(filename);
    const bool ends_with_blocked_suffix =
        path.length() >= blocked_suffix.length() &&
        path.compare(path.length() - blocked_suffix.length(),
                     blocked_suffix.length(), blocked_suffix) == 0;
    if (!ends_with_blocked_suffix) {
      return this->filesystem_.OpenForWrite(filename);
    }
    return -1;
  };
  ON_CALL(*mock_icing_filesystem_, OpenForWrite)
      .WillByDefault(failing_open_for_write);

  DocumentProto ipsum_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPrefixedProperty), kIpsumText)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_ipsum_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                ipsum_doc));

  // 2. Recreate the index on top of the mock filesystem, with a merge size
  //    that only lets one document in before a merge is required.
  Index::Options single_doc_options(
      index_dir_,
      /*index_merge_size=*/ipsum_doc.ByteSizeLong(),
      /*lite_index_sort_at_indexing=*/true,
      /*lite_index_sort_size=*/16);
  ICING_ASSERT_OK_AND_ASSIGN(
      index_, Index::Create(single_doc_options, &filesystem_,
                            mock_icing_filesystem_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<TermIndexingHandler> term_handler,
      TermIndexingHandler::Create(
          &fake_clock_, normalizer_.get(), index_.get(),
          /*build_property_existence_metadata_hits=*/true));
  std::vector<std::unique_ptr<DataIndexingHandler>> term_only_handlers;
  term_only_handlers.push_back(std::move(term_handler));
  index_processor_ = std::make_unique<IndexProcessor>(
      std::move(term_only_handlers), &fake_clock_);

  // 3. The first document fits in the LiteIndex without requiring a merge.
  const DocumentId first_id = 0;
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_ipsum_doc, first_id,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(first_id));

  // 4. The second document triggers a merge, which should fail and result in
  //    a Reset.
  const DocumentId second_id = first_id + 1;
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_ipsum_doc, second_id,
                                      /*old_document_id=*/first_id),
      StatusIs(libtextclassifier3::StatusCode::DATA_LOSS));
  EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));

  // 5. Indexing a new document should succeed.
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_ipsum_doc, second_id,
                                      /*old_document_id=*/first_id),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(second_id));
}
1191
// Indexes "Hello, world!" under kVerbatimExactProperty, expects it to be
// tokenized into a single token, and checks that an exact query for the whole
// string finds it with term frequency 1.
TEST_F(IndexProcessorTest, ExactVerbatimProperty) {
  DocumentProto verbatim_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kVerbatimExactProperty),
                             "Hello, world!")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_verbatim_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                verbatim_doc));
  // The whole property value should come out as one token.
  EXPECT_THAT(tokenized_verbatim_doc.num_string_tokens(), Eq(1));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_verbatim_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> exact_itr,
      index_->GetIterator("Hello, world!", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfoTermFrequencyPair> verbatim_hits =
      GetHitsWithTermFrequency(std::move(exact_itr));

  std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{
      {kVerbatimExactSectionId, 1}};
  EXPECT_THAT(verbatim_hits,
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId0, expected_frequencies)));
}
1225
// Indexes "Hello, world!" under kVerbatimPrefixedProperty, expects it to be
// tokenized into a single token, and checks that the prefix query "Hello, w"
// finds it with term frequency 1.
TEST_F(IndexProcessorTest, PrefixVerbatimProperty) {
  DocumentProto verbatim_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kVerbatimPrefixedProperty),
                             "Hello, world!")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_verbatim_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                verbatim_doc));
  // The whole property value should come out as one token.
  EXPECT_THAT(tokenized_verbatim_doc.num_string_tokens(), Eq(1));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_verbatim_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "Hello, w" is a prefix of "Hello, world!", so the indexed document is
  // expected to match.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> prefix_itr,
      index_->GetIterator("Hello, w", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> verbatim_hits =
      GetHitsWithTermFrequency(std::move(prefix_itr));

  std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{
      {kVerbatimPrefixedSectionId, 1}};
  EXPECT_THAT(verbatim_hits,
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId0, expected_frequencies)));
}
1261
// Indexes "Hello, world!" under kVerbatimPrefixedProperty and checks that a
// prefix query for the inner word "world" returns nothing.
TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) {
  DocumentProto verbatim_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kVerbatimPrefixedProperty),
                             "Hello, world!")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_verbatim_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                verbatim_doc));
  // The whole property value should come out as one token.
  EXPECT_THAT(tokenized_verbatim_doc.num_string_tokens(), Eq(1));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_verbatim_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> sub_token_itr,
      index_->GetIterator("world", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfo> sub_token_hits = GetHits(std::move(sub_token_itr));

  // No hits are expected for the term "world": the index processor should
  // have created the sole token "Hello, world!" for the document.
  EXPECT_THAT(sub_token_hits, IsEmpty());
}
1293
1294 // Some phrases that should match exactly to RFC822 tokens. We normalize the
1295 // tokens, so the case of the string property shouldn't matter.
TEST_F(IndexProcessorTest, Rfc822PropertyExact) {
  // Index a single bracketed e-mail address under kRfc822Property.
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "fake_type/1")
                               .SetSchema(std::string(kFakeType))
                               .AddStringProperty(std::string(kRfc822Property),
                                                  "<alexsav@google.com>")
                               .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  // The address is expected to expand into seven tokens.
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // The local part "alexsav" is expected with term frequency 2.
  std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
      {kRfc822SectionId, 2}};

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("alexsav", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfoTermFrequencyPair> hits =
      GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  // "com" and the full address are each expected with term frequency 1.
  expected_map = {{kRfc822SectionId, 1}};

  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("com", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("alexsav@google.com", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));
}
1346
// Checks that an EXACT_ONLY query for "alexsa", a strict prefix of the indexed
// local part "alexsav", returns no hits.
TEST_F(IndexProcessorTest, Rfc822PropertyExactShouldNotReturnPrefix) {
  // Index a single bracketed e-mail address under kRfc822Property.
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "fake_type/1")
                               .SetSchema(std::string(kFakeType))
                               .AddStringProperty(std::string(kRfc822Property),
                                                  "<alexsav@google.com>")
                               .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  // The address is expected to expand into seven tokens.
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("alexsa", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfo> hits = GetHits(std::move(itr));
  EXPECT_THAT(hits, IsEmpty());
}
1377
1378 // Some prefixes of generated RFC822 tokens.
TEST_F(IndexProcessorTest, Rfc822PropertyPrefix) {
  // Index a single bracketed e-mail address under kRfc822Property.
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "fake_type/1")
                               .SetSchema(std::string(kFakeType))
                               .AddStringProperty(std::string(kRfc822Property),
                                                  "<alexsav@google.com>")
                               .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  // The address is expected to expand into seven tokens.
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "alexsav@" only matches "alexsav@google.com"
  std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
      {kRfc822SectionId, 1}};
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("alexsav@", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> hits =
      GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  // "goog" matches tokens "google" and "google.com"
  expected_map = {{kRfc822SectionId, 2}};
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("goog", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  // "ale" matches tokens "alexsav" (twice) and "alexsav@google.com"
  expected_map = {{kRfc822SectionId, 3}};
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("ale", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));
}
1431
// Checks that a PREFIX query for a term unrelated to the indexed address
// ("abc.xyz") returns no hits.
TEST_F(IndexProcessorTest, Rfc822PropertyNoMatch) {
  // Index a single bracketed e-mail address under kRfc822Property.
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "fake_type/1")
                               .SetSchema(std::string(kFakeType))
                               .AddStringProperty(std::string(kRfc822Property),
                                                  "<alexsav@google.com>")
                               .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  // The address is expected to expand into seven tokens.
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("abc.xyz", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfo> hits = GetHits(std::move(itr));

  EXPECT_THAT(hits, IsEmpty());
}
1462
1463 #ifdef ENABLE_URL_TOKENIZER
// Indexes "http://www.google.com" under kUrlExactProperty and checks that
// several exact queries each return the document with term frequency 1 in
// kUrlExactSectionId.
TEST_F(IndexProcessorTest, ExactUrlProperty) {
  DocumentProto url_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kUrlExactProperty),
                             "http://www.google.com")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_url_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                url_doc));
  // The URL is expected to expand into seven tokens.
  EXPECT_THAT(tokenized_url_doc.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_url_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Every query below is expected to produce the same single hit.
  std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{
      {kUrlExactSectionId, 1}};
  for (const char* exact_term :
       {"google", "http", "www.google.com", "http://www.google.com"}) {
    ICING_ASSERT_OK_AND_ASSIGN(
        std::unique_ptr<DocHitInfoIterator> exact_itr,
        index_->GetIterator(exact_term, /*term_start_index=*/0,
                            /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                            TermMatchType::EXACT_ONLY));
    std::vector<DocHitInfoTermFrequencyPair> url_hits =
        GetHitsWithTermFrequency(std::move(exact_itr));
    EXPECT_THAT(url_hits,
                ElementsAre(EqualsDocHitInfoWithTermFrequency(
                    kDocumentId0, expected_frequencies)));
  }
}
1523
// Indexes "https://mail.google.com/calendar/render" under kUrlExactProperty
// and checks that exact queries for "co", "mail.go" and "mail.google.com"
// all return no hits.
TEST_F(IndexProcessorTest, ExactUrlPropertyDoesNotMatchPrefix) {
  DocumentProto url_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kUrlExactProperty),
                             "https://mail.google.com/calendar/render")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_url_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                url_doc));
  // The URL is expected to expand into eight tokens.
  EXPECT_THAT(tokenized_url_doc.num_string_tokens(), Eq(8));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_url_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // None of these terms is expected to be matched by an exact lookup.
  for (const char* unmatched_term : {"co", "mail.go", "mail.google.com"}) {
    ICING_ASSERT_OK_AND_ASSIGN(
        std::unique_ptr<DocHitInfoIterator> exact_itr,
        index_->GetIterator(unmatched_term, /*term_start_index=*/0,
                            /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                            TermMatchType::EXACT_ONLY));
    std::vector<DocHitInfoTermFrequencyPair> url_hits =
        GetHitsWithTermFrequency(std::move(exact_itr));
    EXPECT_THAT(url_hits, IsEmpty());
  }
}
1567
// Indexes a single URL under kUrlPrefixedProperty and checks that PREFIX
// lookups return the document with the expected per-section term frequency.
TEST_F(IndexProcessorTest, PrefixUrlProperty) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kUrlPrefixedProperty),
                             "http://www.google.com")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "goo" is a prefix of "google" and "google.com"
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("goo", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> hits =
      GetHitsWithTermFrequency(std::move(itr));
  std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
      {kUrlPrefixedSectionId, 2}};
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  // "http" is a prefix of "http" and "http://www.google.com"
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("http", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  expected_map = {{kUrlPrefixedSectionId, 2}};
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  // "www.go" is a prefix of "www.google.com"
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("www.go", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  expected_map = {{kUrlPrefixedSectionId, 1}};
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));
}
1621
// Indexes a single URL under kUrlPrefixedProperty and checks that PREFIX
// lookups for strings that no indexed token is expected to begin with return
// no hits.
TEST_F(IndexProcessorTest, PrefixUrlPropertyNoMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kUrlPrefixedProperty),
                             "https://mail.google.com/calendar/render")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(8));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // no token starts with "gle", so we should have no hits
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("gle", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> hits =
      GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, IsEmpty());

  // "w.goo" does not occur anywhere in the indexed URL, so no hits
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("w.goo", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, IsEmpty());

  // tokens have separators removed, so no hits here
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator(".com", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, IsEmpty());

  // The path on its own is expected to have no hits either.
  // NOTE(review): presumably no emitted token begins at the path -- confirm
  // against the URL tokenizer.
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("calendar/render", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, IsEmpty());
}
1674 #endif // ENABLE_URL_TOKENIZER
1675
// Indexes a document whose kIndexableIntegerProperty holds the values 1..5
// and checks that a range query from 1 to 5 on integer_index_ returns the
// document with a hit in kIndexableIntegerSectionId.
TEST_F(IndexProcessorTest, IndexableIntegerProperty) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4,
                            5)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  // Expected to have 1 integer section.
  EXPECT_THAT(tokenized_document.integer_sections(), SizeIs(1));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());

  // The queried key range covers the values that were just indexed.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/1,
                                  /*key_upper=*/5, *doc_store_, *schema_store_,
                                  fake_clock_.GetSystemTimeMilliseconds()));

  EXPECT_THAT(
      GetHits(std::move(itr)),
      ElementsAre(EqualsDocHitInfo(
          kDocumentId0, std::vector<SectionId>{kIndexableIntegerSectionId})));
}
1707
// Indexes a document whose kIndexableIntegerProperty holds the values 1..5
// and checks that a range query from -1 to 0 on integer_index_ returns no
// hits.
TEST_F(IndexProcessorTest, IndexableIntegerPropertyNoMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4,
                            5)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  // Expected to have 1 integer section.
  EXPECT_THAT(tokenized_document.integer_sections(), SizeIs(1));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());

  // None of the indexed values (1..5) lies between -1 and 0.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/-1,
                                  /*key_upper=*/0, *doc_store_, *schema_store_,
                                  fake_clock_.GetSystemTimeMilliseconds()));

  EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
}
1736
1737 } // namespace
1738
1739 } // namespace lib
1740 } // namespace icing
1741