xref: /aosp_15_r20/external/icing/icing/index/index-processor_test.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/index-processor.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "gmock/gmock.h"
28 #include "gtest/gtest.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/absl_ports/str_join.h"
31 #include "icing/document-builder.h"
32 #include "icing/feature-flags.h"
33 #include "icing/file/filesystem.h"
34 #include "icing/file/portable-file-backed-proto-log.h"
35 #include "icing/index/data-indexing-handler.h"
36 #include "icing/index/hit/doc-hit-info.h"
37 #include "icing/index/hit/hit.h"
38 #include "icing/index/index.h"
39 #include "icing/index/integer-section-indexing-handler.h"
40 #include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
41 #include "icing/index/iterator/doc-hit-info-iterator.h"
42 #include "icing/index/numeric/integer-index.h"
43 #include "icing/index/numeric/numeric-index.h"
44 #include "icing/index/term-indexing-handler.h"
45 #include "icing/index/term-property-id.h"
46 #include "icing/join/qualified-id-join-index-impl-v1.h"
47 #include "icing/join/qualified-id-join-index.h"
48 #include "icing/join/qualified-id-join-indexing-handler.h"
49 #include "icing/legacy/index/icing-filesystem.h"
50 #include "icing/legacy/index/icing-mock-filesystem.h"
51 #include "icing/portable/platform.h"
52 #include "icing/proto/document.pb.h"
53 #include "icing/proto/schema.pb.h"
54 #include "icing/proto/term.pb.h"
55 #include "icing/schema-builder.h"
56 #include "icing/schema/schema-store.h"
57 #include "icing/schema/section.h"
58 #include "icing/store/document-id.h"
59 #include "icing/store/document-store.h"
60 #include "icing/testing/common-matchers.h"
61 #include "icing/testing/fake-clock.h"
62 #include "icing/testing/random-string.h"
63 #include "icing/testing/test-data.h"
64 #include "icing/testing/test-feature-flags.h"
65 #include "icing/testing/tmp-directory.h"
66 #include "icing/tokenization/language-segmenter-factory.h"
67 #include "icing/tokenization/language-segmenter.h"
68 #include "icing/transform/normalizer-factory.h"
69 #include "icing/transform/normalizer.h"
70 #include "icing/util/crc32.h"
71 #include "icing/util/icu-data-file-helper.h"
72 #include "icing/util/tokenized-document.h"
73 #include "unicode/uloc.h"
74 
75 namespace icing {
76 namespace lib {
77 
78 namespace {
79 
// Multi-sentence "Lorem ipsum" filler text, assembled from adjacent string
// literals into a single string_view.
// NOTE(review): not referenced in the part of the file visible here;
// presumably used as bulk document content by later tests -- confirm.
80 constexpr std::string_view kIpsumText =
81     "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla convallis "
82     "scelerisque orci quis hendrerit. Sed augue turpis, sodales eu gravida "
83     "nec, scelerisque nec leo. Maecenas accumsan interdum commodo. Aliquam "
84     "mattis sapien est, sit amet interdum risus dapibus sed. Maecenas leo "
85     "erat, fringilla in nisl a, venenatis gravida metus. Phasellus venenatis, "
86     "orci in aliquet mattis, lectus sapien volutpat arcu, sed hendrerit ligula "
87     "arcu nec mauris. Integer dolor mi, rhoncus eget gravida et, pulvinar et "
88     "nunc. Aliquam ac sollicitudin nisi. Vivamus sit amet urna vestibulum, "
89     "tincidunt eros sed, efficitur nisl. Fusce non neque accumsan, sagittis "
90     "nisi eget, sagittis turpis. Ut pulvinar nibh eu purus feugiat faucibus. "
91     "Donec tellus nulla, tincidunt vel lacus id, bibendum fermentum turpis. "
92     "Nullam ultrices sed nibh vitae aliquet. Ut risus neque, consectetur "
93     "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
94     "placerat semper.";
95 
96 // schema types
// kFakeType is the top-level document type used by the tests; kNestedType is
// the type of the document stored in kFakeType's "submessage" property.
97 constexpr std::string_view kFakeType = "FakeType";
98 constexpr std::string_view kNestedType = "NestedType";
99 
100 // Indexable properties and section Id. Section Id is determined by the
101 // lexicographical order of indexable property path.
102 constexpr std::string_view kExactProperty = "exact";
103 constexpr std::string_view kIndexableIntegerProperty = "indexableInteger";
104 constexpr std::string_view kPrefixedProperty = "prefixed";
105 constexpr std::string_view kRepeatedProperty = "repeated";
106 constexpr std::string_view kRfc822Property = "rfc822";
107 constexpr std::string_view kSubProperty = "submessage";  // submessage.nested
108 constexpr std::string_view kNestedProperty = "nested";   // submessage.nested
109 // TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
110 // to Android.
111 #ifdef ENABLE_URL_TOKENIZER
112 constexpr std::string_view kUrlExactProperty = "urlExact";
113 constexpr std::string_view kUrlPrefixedProperty = "urlPrefixed";
114 #endif  // ENABLE_URL_TOKENIZER
115 constexpr std::string_view kVerbatimExactProperty = "verbatimExact";
116 constexpr std::string_view kVerbatimPrefixedProperty = "verbatimPrefixed";
117 
// Section ids the tests expect for the property paths above, listed in the
// same lexicographical order. The two url* ids exist only when
// ENABLE_URL_TOKENIZER is defined; in that build the verbatim* ids are 8 and 9
// instead of 6 and 7.
118 constexpr SectionId kExactSectionId = 0;
119 constexpr SectionId kIndexableIntegerSectionId = 1;
120 constexpr SectionId kPrefixedSectionId = 2;
121 constexpr SectionId kRepeatedSectionId = 3;
122 constexpr SectionId kRfc822SectionId = 4;
123 constexpr SectionId kNestedSectionId = 5;  // submessage.nested
124 #ifdef ENABLE_URL_TOKENIZER
125 constexpr SectionId kUrlExactSectionId = 6;
126 constexpr SectionId kUrlPrefixedSectionId = 7;
127 constexpr SectionId kVerbatimExactSectionId = 8;
128 constexpr SectionId kVerbatimPrefixedSectionId = 9;
129 #else   // !ENABLE_URL_TOKENIZER
130 constexpr SectionId kVerbatimExactSectionId = 6;
131 constexpr SectionId kVerbatimPrefixedSectionId = 7;
132 #endif  // ENABLE_URL_TOKENIZER
133 
134 // Other non-indexable properties.
135 constexpr std::string_view kUnindexedProperty1 = "unindexed1";
136 constexpr std::string_view kUnindexedProperty2 = "unindexed2";
137 
// Document ids that the tests pass to IndexProcessor::IndexDocument.
138 constexpr DocumentId kDocumentId0 = 0;
139 constexpr DocumentId kDocumentId1 = 1;
140 
// Short aliases for the schema proto enums and the gMock matchers used in this
// file.
141 using Cardinality = PropertyConfigProto::Cardinality;
142 using DataType = PropertyConfigProto::DataType;
143 using ::testing::ElementsAre;
144 using ::testing::Eq;
145 using ::testing::IsEmpty;
146 using ::testing::IsTrue;
147 using ::testing::SizeIs;
148 using ::testing::Test;
149 using ::testing::UnorderedElementsAreArray;
150 
// Shorthand for the URL tokenizer enum value, named like the other
// TOKENIZER_* constants used when the schema is built in SetUp(). Only
// available when the URL tokenizer is compiled in.
151 #ifdef ENABLE_URL_TOKENIZER
152 constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
153     StringIndexingConfig::TokenizerType::URL;
154 #endif  // ENABLE_URL_TOKENIZER
155 
// Test fixture for IndexProcessor.
//
// SetUp() creates, under a per-fixture temp directory, a term index, an
// integer index, a qualified-id join index, a schema store holding the
// kFakeType/kNestedType schema, and a document store. It then builds one
// indexing handler per index and hands them to index_processor_.
// TearDown() destroys those objects in reverse creation order and deletes the
// directory.
156 class IndexProcessorTest : public Test {
157  protected:
SetUp()158   void SetUp() override {
159     feature_flags_ = std::make_unique<FeatureFlags>(GetTestFeatureFlags());
        // The ICU data file is only set up when neither CF-string nor
        // reverse-JNI tokenization is in use.
160     if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
161       ICING_ASSERT_OK(
162           // File generated via icu_data_file rule in //icing/BUILD.
163           icu_data_file_helper::SetUpIcuDataFile(
164               GetTestFilePath("icing/icu.dat")));
165     }
166 
        // Every on-disk component of the fixture lives in a subdirectory of
        // base_dir_, which TearDown() removes.
167     base_dir_ = GetTestTempDir() + "/index_processor_test";
168     ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
169                 IsTrue());
170 
171     index_dir_ = base_dir_ + "/index";
172     integer_index_dir_ = base_dir_ + "/integer_index";
173     qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index";
174     schema_store_dir_ = base_dir_ + "/schema_store";
175     doc_store_dir_ = base_dir_ + "/doc_store";
176 
        // Term index.
177     Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
178                            /*lite_index_sort_at_indexing=*/true,
179                            /*lite_index_sort_size=*/1024 * 8);
180     ICING_ASSERT_OK_AND_ASSIGN(
181         index_, Index::Create(options, &filesystem_, &icing_filesystem_));
182 
        // Integer (numeric) index.
183     ICING_ASSERT_OK_AND_ASSIGN(
184         integer_index_,
185         IntegerIndex::Create(
186             filesystem_, integer_index_dir_,
187             IntegerIndex::kDefaultNumDataThresholdForBucketSplit,
188             /*pre_mapping_fbv=*/false));
189 
        // Qualified-id join index.
190     ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_,
191                                QualifiedIdJoinIndexImplV1::Create(
192                                    filesystem_, qualified_id_join_index_dir_,
193                                    /*pre_mapping_fbv=*/false,
194                                    /*use_persistent_hash_map=*/false));
195 
        // Segmenter (ULOC_US locale) used by the tests to tokenize documents.
196     language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
197     ICING_ASSERT_OK_AND_ASSIGN(
198         lang_segmenter_,
199         language_segmenter_factory::Create(std::move(segmenter_options)));
200 
        // Default normalizer with the largest possible term size limit
        // (int32 max).
201     ICING_ASSERT_OK_AND_ASSIGN(
202         normalizer_,
203         normalizer_factory::Create(
204             /*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
205 
206     ASSERT_TRUE(
207         filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()));
208     ICING_ASSERT_OK_AND_ASSIGN(
209         schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_,
210                                            &fake_clock_, feature_flags_.get()));
        // Schema under test: kFakeType carries exact/prefix/verbatim/rfc822
        // (and, when enabled, url) string properties, an indexable integer
        // property, two properties declared with a plain data type only
        // (unindexed1/unindexed2), and a kNestedType document property whose
        // nested properties are indexed. kNestedType has a single prefix-
        // matched string property.
211     SchemaProto schema =
212         SchemaBuilder()
213             .AddType(
214                 SchemaTypeConfigBuilder()
215                     .SetType(kFakeType)
216                     .AddProperty(PropertyConfigBuilder()
217                                      .SetName(kExactProperty)
218                                      .SetDataTypeString(TERM_MATCH_EXACT,
219                                                         TOKENIZER_PLAIN)
220                                      .SetCardinality(CARDINALITY_OPTIONAL))
221                     .AddProperty(PropertyConfigBuilder()
222                                      .SetName(kPrefixedProperty)
223                                      .SetDataTypeString(TERM_MATCH_PREFIX,
224                                                         TOKENIZER_PLAIN)
225                                      .SetCardinality(CARDINALITY_OPTIONAL))
226                     .AddProperty(PropertyConfigBuilder()
227                                      .SetName(kUnindexedProperty1)
228                                      .SetDataType(TYPE_STRING)
229                                      .SetCardinality(CARDINALITY_OPTIONAL))
230                     .AddProperty(PropertyConfigBuilder()
231                                      .SetName(kUnindexedProperty2)
232                                      .SetDataType(TYPE_BYTES)
233                                      .SetCardinality(CARDINALITY_OPTIONAL))
234                     .AddProperty(PropertyConfigBuilder()
235                                      .SetName(kRepeatedProperty)
236                                      .SetDataTypeString(TERM_MATCH_PREFIX,
237                                                         TOKENIZER_PLAIN)
238                                      .SetCardinality(CARDINALITY_REPEATED))
239                     .AddProperty(PropertyConfigBuilder()
240                                      .SetName(kVerbatimExactProperty)
241                                      .SetDataTypeString(TERM_MATCH_EXACT,
242                                                         TOKENIZER_VERBATIM)
243                                      .SetCardinality(CARDINALITY_REPEATED))
244                     .AddProperty(PropertyConfigBuilder()
245                                      .SetName(kVerbatimPrefixedProperty)
246                                      .SetDataTypeString(TERM_MATCH_PREFIX,
247                                                         TOKENIZER_VERBATIM)
248                                      .SetCardinality(CARDINALITY_REPEATED))
249                     .AddProperty(PropertyConfigBuilder()
250                                      .SetName(kRfc822Property)
251                                      .SetDataTypeString(TERM_MATCH_PREFIX,
252                                                         TOKENIZER_RFC822)
253                                      .SetCardinality(CARDINALITY_REPEATED))
254 #ifdef ENABLE_URL_TOKENIZER
255                     .AddProperty(
256                         PropertyConfigBuilder()
257                             .SetName(kUrlExactProperty)
258                             .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_URL)
259                             .SetCardinality(CARDINALITY_REPEATED))
260                     .AddProperty(
261                         PropertyConfigBuilder()
262                             .SetName(kUrlPrefixedProperty)
263                             .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_URL)
264                             .SetCardinality(CARDINALITY_REPEATED))
265 #endif  // ENABLE_URL_TOKENIZER
266                     .AddProperty(PropertyConfigBuilder()
267                                      .SetName(kIndexableIntegerProperty)
268                                      .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
269                                      .SetCardinality(CARDINALITY_REPEATED))
270                     .AddProperty(
271                         PropertyConfigBuilder()
272                             .SetName(kSubProperty)
273                             .SetDataTypeDocument(
274                                 kNestedType, /*index_nested_properties=*/true)
275                             .SetCardinality(CARDINALITY_OPTIONAL)))
276             .AddType(
277                 SchemaTypeConfigBuilder()
278                     .SetType(kNestedType)
279                     .AddProperty(PropertyConfigBuilder()
280                                      .SetName(kNestedProperty)
281                                      .SetDataTypeString(TERM_MATCH_PREFIX,
282                                                         TOKENIZER_PLAIN)
283                                      .SetCardinality(CARDINALITY_OPTIONAL)))
284             .Build();
285     ICING_ASSERT_OK(schema_store_->SetSchema(
286         schema, /*ignore_errors_and_delete_documents=*/false,
287         /*allow_circular_schema_definitions=*/false));
288 
        // The document store is created after the schema store because it is
        // given a pointer to it.
289     ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()));
290     ICING_ASSERT_OK_AND_ASSIGN(
291         DocumentStore::CreateResult create_result,
292         DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_,
293                               schema_store_.get(), feature_flags_.get(),
294                               /*force_recovery_and_revalidate_documents=*/false,
295                               /*pre_mapping_fbv=*/false,
296                               /*use_persistent_hash_map=*/true,
297                               PortableFileBackedProtoLog<
298                                   DocumentWrapper>::kDefaultCompressionLevel,
299                               /*initialize_stats=*/nullptr));
300     doc_store_ = std::move(create_result.document_store);
301 
        // One handler per index. The handlers receive raw pointers to the
        // fixture-owned indices; ownership of the handlers themselves moves
        // into the IndexProcessor below.
302     ICING_ASSERT_OK_AND_ASSIGN(
303         std::unique_ptr<TermIndexingHandler> term_indexing_handler,
304         TermIndexingHandler::Create(
305             &fake_clock_, normalizer_.get(), index_.get(),
306             /*build_property_existence_metadata_hits=*/true));
307     ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler>
308                                    integer_section_indexing_handler,
309                                IntegerSectionIndexingHandler::Create(
310                                    &fake_clock_, integer_index_.get()));
311     ICING_ASSERT_OK_AND_ASSIGN(
312         std::unique_ptr<QualifiedIdJoinIndexingHandler>
313             qualified_id_join_indexing_handler,
314         QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
315                                                qualified_id_join_index_.get()));
316     std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
317     handlers.push_back(std::move(term_indexing_handler));
318     handlers.push_back(std::move(integer_section_indexing_handler));
319     handlers.push_back(std::move(qualified_id_join_indexing_handler));
320 
321     index_processor_ =
322         std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
323 
        // NOTE(review): the mock filesystem is not used by SetUp() itself;
        // presumably later tests use it to inject filesystem failures --
        // confirm.
324     mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>();
325   }
326 
TearDown()327   void TearDown() override {
        // Release everything in reverse order of creation so that no object
        // outlives a component it was given a pointer to, then remove the
        // backing directory. The result of the directory deletion is not
        // checked.
328     index_processor_.reset();
329     doc_store_.reset();
330     schema_store_.reset();
331     normalizer_.reset();
332     lang_segmenter_.reset();
333     qualified_id_join_index_.reset();
334     integer_index_.reset();
335     index_.reset();
336 
337     filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
338   }
339 
340   std::unique_ptr<IcingMockFilesystem> mock_icing_filesystem_;
341 
      // Filesystem helpers, clock and directory paths shared by all components.
342   std::unique_ptr<FeatureFlags> feature_flags_;
343   Filesystem filesystem_;
344   IcingFilesystem icing_filesystem_;
345   FakeClock fake_clock_;
346   std::string base_dir_;
347   std::string index_dir_;
348   std::string integer_index_dir_;
349   std::string qualified_id_join_index_dir_;
350   std::string schema_store_dir_;
351   std::string doc_store_dir_;
352 
      // Components created in SetUp(), declared in creation order.
353   std::unique_ptr<Index> index_;
354   std::unique_ptr<NumericIndex<int64_t>> integer_index_;
355   std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_;
356   std::unique_ptr<LanguageSegmenter> lang_segmenter_;
357   std::unique_ptr<Normalizer> normalizer_;
358   std::unique_ptr<SchemaStore> schema_store_;
359   std::unique_ptr<DocumentStore> doc_store_;
360 
      // Object under test.
361   std::unique_ptr<IndexProcessor> index_processor_;
362 };
363 
GetHits(std::unique_ptr<DocHitInfoIterator> iterator)364 std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
365   std::vector<DocHitInfo> infos;
366   while (iterator->Advance().ok()) {
367     infos.push_back(iterator->doc_hit_info());
368   }
369   return infos;
370 }
371 
GetHitsWithTermFrequency(std::unique_ptr<DocHitInfoIterator> iterator)372 std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency(
373     std::unique_ptr<DocHitInfoIterator> iterator) {
374   std::vector<DocHitInfoTermFrequencyPair> infos;
375   while (iterator->Advance().ok()) {
376     std::vector<TermMatchInfo> matched_terms_stats;
377     iterator->PopulateMatchedTermsStats(&matched_terms_stats);
378     for (const TermMatchInfo& term_match_info : matched_terms_stats) {
379       infos.push_back(DocHitInfoTermFrequencyPair(
380           iterator->doc_hit_info(), term_match_info.term_frequencies));
381     }
382   }
383   return infos;
384 }
385 
// A document that only populates the two unindexed properties must still be
// accepted by the processor, and the term index must record its document id
// as the last one added.
TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
  DocumentBuilder builder;
  builder.SetKey("icing", "fake_type/1")
      .SetSchema(std::string(kFakeType))
      .AddStringProperty(std::string(kUnindexedProperty1), "foo bar baz")
      .AddBytesProperty(std::string(kUnindexedProperty2), "attachment bytes");
  DocumentProto unindexed_only_doc = builder.Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                unindexed_only_doc));

  const auto index_status = index_processor_->IndexDocument(
      tokenized, kDocumentId0, /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());

  const auto last_added_id = index_->last_added_document_id();
  EXPECT_THAT(last_added_id, Eq(kDocumentId0));
}
405 
// A document whose only indexable property holds nothing but punctuation
// ("?...!") must still be accepted, and the term index must record its
// document id as the last one added.
TEST_F(IndexProcessorTest, NoValidContent) {
  DocumentBuilder builder;
  builder.SetKey("icing", "fake_type/1")
      .SetSchema(std::string(kFakeType))
      .AddStringProperty(std::string(kExactProperty), "?...!");
  DocumentProto punctuation_only_doc = builder.Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                punctuation_only_doc));

  const auto index_status = index_processor_->IndexDocument(
      tokenized, kDocumentId0, /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());

  const auto last_added_id = index_->last_added_document_id();
  EXPECT_THAT(last_added_id, Eq(kDocumentId0));
}
423 
// Indexes a single document with "hello world" in the exact-match property and
// checks that "hello" is found once in the exact section, and not at all when
// the query is restricted to the prefixed section.
TEST_F(IndexProcessorTest, OneDoc) {
  DocumentBuilder builder;
  builder.SetKey("icing", "fake_type/1")
      .SetSchema(std::string(kFakeType))
      .AddStringProperty(std::string(kExactProperty), "hello world");
  DocumentProto hello_doc = builder.Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                hello_doc));

  const auto index_status = index_processor_->IndexDocument(
      tokenized, kDocumentId0, /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Unrestricted exact lookup: one hit in the exact section, frequency 1.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> all_sections_itr,
      index_->GetIterator("hello", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfoTermFrequencyPair> all_section_hits =
      GetHitsWithTermFrequency(std::move(all_sections_itr));
  std::unordered_map<SectionId, Hit::TermFrequency> hello_frequencies{
      {kExactSectionId, 1}};
  EXPECT_THAT(all_section_hits,
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId0, hello_frequencies)));

  // Same lookup limited to the prefixed section: nothing was indexed there.
  const auto prefixed_only_mask = 1U << kPrefixedSectionId;
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> prefixed_section_itr,
      index_->GetIterator("hello", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, prefixed_only_mask,
                          TermMatchType::EXACT_ONLY));
  EXPECT_THAT(GetHits(std::move(prefixed_section_itr)), IsEmpty());
}
459 
TEST_F(IndexProcessorTest,MultipleDocs)460 TEST_F(IndexProcessorTest, MultipleDocs) {
461   DocumentProto document =
462       DocumentBuilder()
463           .SetKey("icing", "fake_type/1")
464           .SetSchema(std::string(kFakeType))
465           .AddStringProperty(std::string(kExactProperty), "hello world")
466           .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
467           .Build();
468   ICING_ASSERT_OK_AND_ASSIGN(
469       TokenizedDocument tokenized_document,
470       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
471                                 document));
472   EXPECT_THAT(
473       index_processor_->IndexDocument(tokenized_document, kDocumentId0,
474                                       /*old_document_id=*/kInvalidDocumentId),
475       IsOk());
476   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
477 
478   std::string coffeeRepeatedString = "coffee";
479   for (int i = 0; i < Hit::kMaxTermFrequency + 1; i++) {
480     coffeeRepeatedString += " coffee";
481   }
482 
483   document =
484       DocumentBuilder()
485           .SetKey("icing", "fake_type/2")
486           .SetSchema(std::string(kFakeType))
487           .AddStringProperty(std::string(kExactProperty), coffeeRepeatedString)
488           .AddStringProperty(std::string(kPrefixedProperty),
489                              "mr. world world wide")
490           .Build();
491   ICING_ASSERT_OK_AND_ASSIGN(
492       tokenized_document,
493       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
494                                 document));
495   EXPECT_THAT(
496       index_processor_->IndexDocument(tokenized_document, kDocumentId1,
497                                       /*old_document_id=*/kInvalidDocumentId),
498       IsOk());
499   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
500 
501   ICING_ASSERT_OK_AND_ASSIGN(
502       std::unique_ptr<DocHitInfoIterator> itr,
503       index_->GetIterator("world", /*term_start_index=*/0,
504                           /*unnormalized_term_length=*/0, kSectionIdMaskAll,
505                           TermMatchType::EXACT_ONLY));
506   std::vector<DocHitInfoTermFrequencyPair> hits =
507       GetHitsWithTermFrequency(std::move(itr));
508   std::unordered_map<SectionId, Hit::TermFrequency> expected_map_1{
509       {kPrefixedSectionId, 2}};
510   std::unordered_map<SectionId, Hit::TermFrequency> expected_map_2{
511       {kExactSectionId, 1}};
512   EXPECT_THAT(
513       hits,
514       ElementsAre(
515           EqualsDocHitInfoWithTermFrequency(kDocumentId1, expected_map_1),
516           EqualsDocHitInfoWithTermFrequency(kDocumentId0, expected_map_2)));
517 
518   ICING_ASSERT_OK_AND_ASSIGN(
519       itr, index_->GetIterator(
520                "world", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
521                1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY));
522   hits = GetHitsWithTermFrequency(std::move(itr));
523   std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
524       {kPrefixedSectionId, 2}};
525   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
526                         kDocumentId1, expected_map)));
527 
528   ICING_ASSERT_OK_AND_ASSIGN(
529       itr, index_->GetIterator("coffee", /*term_start_index=*/0,
530                                /*unnormalized_term_length=*/0,
531                                kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
532   hits = GetHitsWithTermFrequency(std::move(itr));
533   expected_map = {{kExactSectionId, Hit::kMaxTermFrequency}};
534   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
535                         kDocumentId1, expected_map)));
536 }
537 
// Indexes a document that embeds a kNestedType document in its "submessage"
// property and checks that a term from the nested string property is reported
// under the nested section id.
TEST_F(IndexProcessorTest, DocWithNestedProperty) {
  // Inner document stored in the outer document's submessage property.
  DocumentBuilder nested_builder;
  nested_builder.SetKey("icing", "nested_type/1")
      .SetSchema(std::string(kNestedType))
      .AddStringProperty(std::string(kNestedProperty), "rocky raccoon");

  DocumentBuilder outer_builder;
  outer_builder.SetKey("icing", "fake_type/1")
      .SetSchema(std::string(kFakeType))
      .AddStringProperty(std::string(kExactProperty), "hello world")
      .AddDocumentProperty(std::string(kSubProperty), nested_builder.Build());
  DocumentProto outer_doc = outer_builder.Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                outer_doc));

  const auto index_status = index_processor_->IndexDocument(
      tokenized, kDocumentId0, /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> rocky_itr,
      index_->GetIterator("rocky", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<SectionId> expected_sections{kNestedSectionId};
  EXPECT_THAT(GetHits(std::move(rocky_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId0, expected_sections)));
}
572 
// Indexes a document whose repeated property holds two values and checks that
// a term from the second value is reported under the repeated section id.
TEST_F(IndexProcessorTest, DocWithRepeatedProperty) {
  DocumentBuilder builder;
  builder.SetKey("icing", "fake_type/1")
      .SetSchema(std::string(kFakeType))
      .AddStringProperty(std::string(kExactProperty), "hello world")
      .AddStringProperty(std::string(kRepeatedProperty), "rocky",
                         "italian stallion");
  DocumentProto repeated_doc = builder.Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                repeated_doc));

  const auto index_status = index_processor_->IndexDocument(
      tokenized, kDocumentId0, /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> italian_itr,
      index_->GetIterator("italian", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<SectionId> expected_sections{kRepeatedSectionId};
  EXPECT_THAT(GetHits(std::move(italian_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId0, expected_sections)));
}
601 
602 // TODO(b/196771754) This test is disabled on Android because it takes too long
603 // to generate all of the unique terms and the test times out. Try storing these
604 // unique terms in a file that the test can read from.
605 #ifndef __ANDROID__
606 
// Puts the same 200,000 unique terms into three indexed string properties of a
// single document and expects IndexDocument() to fail with RESOURCE_EXHAUSTED
// and a "Hit buffer is full!" message. The test also expects the index to
// report kDocumentId0 as its last added document id despite the failure.
TEST_F(IndexProcessorTest, HitBufferExhaustedTest) {
  // Testing has shown that adding ~600,000 hits will fill up the hit buffer.
  // 200,000 unique terms in each of three properties yields that many.
  const std::vector<std::string> unique_terms = GenerateUniqueTerms(200000);
  const std::string content = absl_ports::StrJoin(unique_terms, " ");

  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), content)
          .AddStringProperty(std::string(kPrefixedProperty), content)
          .AddStringProperty(std::string(kRepeatedProperty), content)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED,
               ::testing::HasSubstr("Hit buffer is full!")));
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
631 
// Puts 300,000 unique terms into the exact-match property of a single document
// and expects IndexDocument() to fail with RESOURCE_EXHAUSTED. The test also
// expects the index to report kDocumentId0 as its last added document id
// despite the failure.
TEST_F(IndexProcessorTest, LexiconExhaustedTest) {
  // Testing has shown that adding ~300,000 terms generated this way will
  // fill up the lexicon.
  const std::vector<std::string> unique_terms = GenerateUniqueTerms(300000);
  const std::string content = absl_ports::StrJoin(unique_terms, " ");

  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), content)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
654 
655 #endif  // __ANDROID__
656 
TEST_F(IndexProcessorTest,TooLongTokens)657 TEST_F(IndexProcessorTest, TooLongTokens) {
658   // Only allow the tokens of length four, truncating "hello", "world" and
659   // "night".
660   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer,
661                              normalizer_factory::Create(
662                                  /*max_term_byte_size=*/4));
663 
664   ICING_ASSERT_OK_AND_ASSIGN(
665       std::unique_ptr<TermIndexingHandler> term_indexing_handler,
666       TermIndexingHandler::Create(
667           &fake_clock_, normalizer.get(), index_.get(),
668           /*build_property_existence_metadata_hits=*/true));
669   std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
670   handlers.push_back(std::move(term_indexing_handler));
671 
672   index_processor_ =
673       std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
674 
675   DocumentProto document =
676       DocumentBuilder()
677           .SetKey("icing", "fake_type/1")
678           .SetSchema(std::string(kFakeType))
679           .AddStringProperty(std::string(kExactProperty), "hello world")
680           .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
681           .Build();
682   ICING_ASSERT_OK_AND_ASSIGN(
683       TokenizedDocument tokenized_document,
684       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
685                                 document));
686   EXPECT_THAT(
687       index_processor_->IndexDocument(tokenized_document, kDocumentId0,
688                                       /*old_document_id=*/kInvalidDocumentId),
689       IsOk());
690   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
691 
692   // "good" should have been indexed normally.
693   ICING_ASSERT_OK_AND_ASSIGN(
694       std::unique_ptr<DocHitInfoIterator> itr,
695       index_->GetIterator("good", /*term_start_index=*/0,
696                           /*unnormalized_term_length=*/0, kSectionIdMaskAll,
697                           TermMatchType::EXACT_ONLY));
698   EXPECT_THAT(GetHits(std::move(itr)),
699               ElementsAre(EqualsDocHitInfo(
700                   kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
701 
702   // "night" should not have been.
703   ICING_ASSERT_OK_AND_ASSIGN(
704       itr, index_->GetIterator("night", /*term_start_index=*/0,
705                                /*unnormalized_term_length=*/0,
706                                kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
707   EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
708 
709   // "night" should have been truncated to "nigh".
710   ICING_ASSERT_OK_AND_ASSIGN(
711       itr, index_->GetIterator("nigh", /*term_start_index=*/0,
712                                /*unnormalized_term_length=*/0,
713                                kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
714   EXPECT_THAT(GetHits(std::move(itr)),
715               ElementsAre(EqualsDocHitInfo(
716                   kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
717 }
718 
// Runs the same PREFIX query for "r" before and after Index::Merge() and
// expects both result sets to match, with a combined term frequency of 5 in
// the prefixed section.
TEST_F(IndexProcessorTest,
       PrefixedQueryReturnsCombinedTermFrequenciesForBothIndices) {
  DocumentProto doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(
              std::string(kPrefixedProperty),
              "rocket the raccoon retreated from the rodent resistance")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                doc));
  EXPECT_THAT(tokenized_doc.num_string_tokens(), Eq(8));

  const libtextclassifier3::Status index_status =
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // First pass: nothing has been merged yet, so this queries the lite index.
  // Five of the indexed words start with "r".
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> before_merge_itr,
      index_->GetIterator("r", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> hits_before_merge =
      GetHitsWithTermFrequency(std::move(before_merge_itr));

  // Second pass: after merging, the same query is served by the main index
  // and must yield the same hits.
  ASSERT_THAT(index_->Merge(), IsOk());
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> after_merge_itr,
      index_->GetIterator("r", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> hits_after_merge =
      GetHitsWithTermFrequency(std::move(after_merge_itr));
  EXPECT_THAT(hits_after_merge, UnorderedElementsAreArray(hits_before_merge));

  std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{
      {kPrefixedSectionId, 5}};
  EXPECT_THAT(hits_before_merge,
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId0, expected_frequencies)));
}
767 
// Indexes "rocky" under kExactProperty in document 0 and under
// kPrefixedProperty in document 1, then checks that a PREFIX query for "rock"
// only returns document 1's hit in kPrefixedSectionId.
TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
  // Document 0: "rocky" lives in kExactProperty.
  DocumentProto exact_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "best rocky movies")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument exact_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                exact_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(exact_tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Document 1: "rocky" lives in kPrefixedProperty.
  DocumentProto prefixed_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/2")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument prefixed_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                prefixed_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(prefixed_tokenized, kDocumentId1,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));

  // A prefix query for "rock" should surface only document 1.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> rock_itr,
      index_->GetIterator("rock", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  const std::vector<SectionId> prefixed_section_only{kPrefixedSectionId};
  EXPECT_THAT(
      GetHits(std::move(rock_itr)),
      ElementsAre(EqualsDocHitInfo(kDocumentId1, prefixed_section_only)));
}
811 
// Indexes an upper-case and a lower-case variant of the same words in two
// documents and expects an EXACT_ONLY query for "case" to return both, i.e.
// tokens are normalized before they are indexed.
TEST_F(IndexProcessorTest, TokenNormalization) {
  DocumentProto upper_case_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument upper_case_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                upper_case_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(upper_case_tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  DocumentProto lower_case_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/2")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty), "all lower case")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument lower_case_tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                lower_case_doc));
  EXPECT_THAT(
      index_processor_->IndexDocument(lower_case_tokenized, kDocumentId1,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));

  // Both documents are expected for "case", document 1 listed before
  // document 0.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> case_itr,
      index_->GetIterator("case", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<SectionId> exact_section_only{kExactSectionId};
  EXPECT_THAT(GetHits(std::move(case_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId1, exact_section_only),
                          EqualsDocHitInfo(kDocumentId0, exact_section_only)));
}
857 
TEST_F(IndexProcessorTest,OutOfOrderDocumentIds)858 TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
859   DocumentProto document =
860       DocumentBuilder()
861           .SetKey("icing", "fake_type/1")
862           .SetSchema(std::string(kFakeType))
863           .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
864           .AddInt64Property(std::string(kIndexableIntegerProperty), 123)
865           .Build();
866   ICING_ASSERT_OK_AND_ASSIGN(
867       TokenizedDocument tokenized_document,
868       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
869                                 document));
870   EXPECT_THAT(
871       index_processor_->IndexDocument(tokenized_document, kDocumentId1,
872                                       /*old_document_id=*/kInvalidDocumentId),
873       IsOk());
874   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
875 
876   ICING_ASSERT_OK_AND_ASSIGN(int64_t index_element_size,
877                              index_->GetElementsSize());
878   ICING_ASSERT_OK_AND_ASSIGN(Crc32 integer_index_crc,
879                              integer_index_->UpdateChecksums());
880 
881   // Indexing a document with document_id <= last_added_document_id should cause
882   // a failure.
883   document =
884       DocumentBuilder()
885           .SetKey("icing", "fake_type/2")
886           .SetSchema(std::string(kFakeType))
887           .AddStringProperty(std::string(kExactProperty), "all lower case")
888           .AddInt64Property(std::string(kIndexableIntegerProperty), 456)
889           .Build();
890   ICING_ASSERT_OK_AND_ASSIGN(
891       tokenized_document,
892       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
893                                 document));
894   EXPECT_THAT(
895       index_processor_->IndexDocument(tokenized_document, kDocumentId0,
896                                       /*old_document_id=*/kInvalidDocumentId),
897       StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
898   // Verify that both index_ and integer_index_ are unchanged.
899   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
900   EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size));
901   EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
902   EXPECT_THAT(integer_index_->UpdateChecksums(),
903               IsOkAndHolds(integer_index_crc));
904 
905   // As should indexing a document document_id == last_added_document_id.
906   EXPECT_THAT(
907       index_processor_->IndexDocument(tokenized_document, kDocumentId1,
908                                       /*old_document_id=*/kInvalidDocumentId),
909       StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
910   // Verify that both index_ and integer_index_ are unchanged.
911   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
912   EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size));
913   EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
914   EXPECT_THAT(integer_index_->UpdateChecksums(),
915               IsOkAndHolds(integer_index_crc));
916 }
917 
TEST_F(IndexProcessorTest,OutOfOrderDocumentIdsInRecoveryMode)918 TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) {
919   ICING_ASSERT_OK_AND_ASSIGN(
920       std::unique_ptr<TermIndexingHandler> term_indexing_handler,
921       TermIndexingHandler::Create(
922           &fake_clock_, normalizer_.get(), index_.get(),
923           /*build_property_existence_metadata_hits=*/true));
924   ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler>
925                                  integer_section_indexing_handler,
926                              IntegerSectionIndexingHandler::Create(
927                                  &fake_clock_, integer_index_.get()));
928   ICING_ASSERT_OK_AND_ASSIGN(
929       std::unique_ptr<QualifiedIdJoinIndexingHandler>
930           qualified_id_join_indexing_handler,
931       QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
932                                              qualified_id_join_index_.get()));
933   std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
934   handlers.push_back(std::move(term_indexing_handler));
935   handlers.push_back(std::move(integer_section_indexing_handler));
936   handlers.push_back(std::move(qualified_id_join_indexing_handler));
937 
938   IndexProcessor index_processor(std::move(handlers), &fake_clock_,
939                                  /*recovery_mode=*/true);
940 
941   DocumentProto document =
942       DocumentBuilder()
943           .SetKey("icing", "fake_type/1")
944           .SetSchema(std::string(kFakeType))
945           .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
946           .AddInt64Property(std::string(kIndexableIntegerProperty), 123)
947           .Build();
948   ICING_ASSERT_OK_AND_ASSIGN(
949       TokenizedDocument tokenized_document,
950       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
951                                 document));
952   EXPECT_THAT(
953       index_processor.IndexDocument(tokenized_document, kDocumentId1,
954                                     /*old_document_id=*/kInvalidDocumentId),
955       IsOk());
956   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
957 
958   ICING_ASSERT_OK_AND_ASSIGN(int64_t index_element_size,
959                              index_->GetElementsSize());
960   ICING_ASSERT_OK_AND_ASSIGN(Crc32 integer_index_crc,
961                              integer_index_->UpdateChecksums());
962 
963   // Indexing a document with document_id <= last_added_document_id in recovery
964   // mode should not get any error, but IndexProcessor should still ignore it
965   // and index data should remain unchanged.
966   document =
967       DocumentBuilder()
968           .SetKey("icing", "fake_type/2")
969           .SetSchema(std::string(kFakeType))
970           .AddStringProperty(std::string(kExactProperty), "all lower case")
971           .AddInt64Property(std::string(kIndexableIntegerProperty), 456)
972           .Build();
973   ICING_ASSERT_OK_AND_ASSIGN(
974       tokenized_document,
975       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
976                                 document));
977   EXPECT_THAT(
978       index_processor.IndexDocument(tokenized_document, kDocumentId0,
979                                     /*old_document_id=*/kInvalidDocumentId),
980       IsOk());
981   // Verify that both index_ and integer_index_ are unchanged.
982   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
983   EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size));
984   EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
985   EXPECT_THAT(integer_index_->UpdateChecksums(),
986               IsOkAndHolds(integer_index_crc));
987 
988   // As should indexing a document document_id == last_added_document_id.
989   EXPECT_THAT(
990       index_processor.IndexDocument(tokenized_document, kDocumentId1,
991                                     /*old_document_id=*/kInvalidDocumentId),
992       IsOk());
993   // Verify that both index_ and integer_index_ are unchanged.
994   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
995   EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size));
996   EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
997   EXPECT_THAT(integer_index_->UpdateChecksums(),
998               IsOkAndHolds(integer_index_crc));
999 }
1000 
// Replaces the fixture's language segmenter with one created for
// ULOC_SIMPLIFIED_CHINESE, indexes a Chinese sentence and expects an
// EXACT_ONLY query for "你好" to hit the document in kExactSectionId.
TEST_F(IndexProcessorTest, NonAsciiIndexing) {
  language_segmenter_factory::SegmenterOptions chinese_options(
      ULOC_SIMPLIFIED_CHINESE);
  ICING_ASSERT_OK_AND_ASSIGN(
      lang_segmenter_,
      language_segmenter_factory::Create(std::move(chinese_options)));

  DocumentProto doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty),
                             "你好,世界!你好:世界。“你好”世界?")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                doc));

  const libtextclassifier3::Status index_status =
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> hello_itr,
      index_->GetIterator("你好", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  const std::vector<SectionId> exact_section_only{kExactSectionId};
  EXPECT_THAT(GetHits(std::move(hello_itr)),
              ElementsAre(EqualsDocHitInfo(kDocumentId0, exact_section_only)));
}
1034 
// Indexes a document containing one token that is longer than the lexicon can
// hold, next to a few ordinary tokens, and expects RESOURCE_EXHAUSTED with
// kDocumentId0 still recorded as the last added document id.
TEST_F(IndexProcessorTest,
       LexiconFullIndexesSmallerTokensReturnsResourceExhausted) {
  // Maximum token length that an empty lexicon constructed for a lite index
  // with a merge size of 1MiB can support.
  constexpr int kMaxTokenLength = 16777217;
  // "ppppppp...": one character longer than the lexicon can fit.
  const std::string oversized_token(kMaxTokenLength + 1, 'p');

  DocumentProto doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kExactProperty),
                             absl_ports::StrCat(oversized_token, " foo"))
          .AddStringProperty(std::string(kPrefixedProperty), "bar baz")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                doc));

  const libtextclassifier3::Status index_status =
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status,
              StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
1061 
TEST_F(IndexProcessorTest,IndexingDocAutomaticMerge)1062 TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
1063   // Create the index with a smaller index_merge_size - merging every time we
1064   // add 101 documents. This will result in a small LiteIndex, which will be
1065   // easier to fill up. The LiteIndex itself will have a size larger than the
1066   // index_merge_size because it adds extra buffer to ensure that it always has
1067   // room to fit whatever document will trigger the merge.
1068   DocumentProto document =
1069       DocumentBuilder()
1070           .SetKey("icing", "fake_type/1")
1071           .SetSchema(std::string(kFakeType))
1072           .AddStringProperty(std::string(kExactProperty), kIpsumText)
1073           .Build();
1074   ICING_ASSERT_OK_AND_ASSIGN(
1075       TokenizedDocument tokenized_document,
1076       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
1077                                 document));
1078   Index::Options options(index_dir_,
1079                          /*index_merge_size=*/document.ByteSizeLong() * 100,
1080                          /*lite_index_sort_at_indexing=*/true,
1081                          /*lite_index_sort_size=*/64);
1082   ICING_ASSERT_OK_AND_ASSIGN(
1083       index_, Index::Create(options, &filesystem_, &icing_filesystem_));
1084 
1085   ICING_ASSERT_OK_AND_ASSIGN(
1086       std::unique_ptr<TermIndexingHandler> term_indexing_handler,
1087       TermIndexingHandler::Create(
1088           &fake_clock_, normalizer_.get(), index_.get(),
1089           /*build_property_existence_metadata_hits=*/true));
1090   std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
1091   handlers.push_back(std::move(term_indexing_handler));
1092 
1093   index_processor_ =
1094       std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
1095 
1096   DocumentId doc_id = 0;
1097   // Have determined experimentally that indexing 3373 documents with this text
1098   // will cause the LiteIndex to fill up. Further indexing will fail unless the
1099   // index processor properly merges the LiteIndex into the MainIndex and
1100   // empties the LiteIndex.
1101   constexpr int kNumDocsLiteIndexExhaustion = 3373;
1102   for (; doc_id < kNumDocsLiteIndexExhaustion; ++doc_id) {
1103     EXPECT_THAT(
1104         index_processor_->IndexDocument(
1105             tokenized_document, doc_id,
1106             /*old_document_id=*/doc_id == 0 ? kInvalidDocumentId : doc_id - 1),
1107         IsOk());
1108     EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
1109   }
1110   EXPECT_THAT(
1111       index_processor_->IndexDocument(
1112           tokenized_document, doc_id,
1113           /*old_document_id=*/doc_id == 0 ? kInvalidDocumentId : doc_id - 1),
1114       IsOk());
1115   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
1116 }
1117 
TEST_F(IndexProcessorTest,IndexingDocMergeFailureResets)1118 TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
1119   // 1. Setup a mock filesystem to fail to grow the main index.
1120   auto open_write_lambda = [this](const char* filename) {
1121     std::string main_lexicon_suffix =
1122         "/main-lexicon.prop." +
1123         std::to_string(GetHasHitsInPrefixSectionPropertyId());
1124     std::string filename_string(filename);
1125     if (filename_string.length() >= main_lexicon_suffix.length() &&
1126         filename_string.substr(
1127             filename_string.length() - main_lexicon_suffix.length(),
1128             main_lexicon_suffix.length()) == main_lexicon_suffix) {
1129       return -1;
1130     }
1131     return this->filesystem_.OpenForWrite(filename);
1132   };
1133   ON_CALL(*mock_icing_filesystem_, OpenForWrite)
1134       .WillByDefault(open_write_lambda);
1135 
1136   DocumentProto document =
1137       DocumentBuilder()
1138           .SetKey("icing", "fake_type/1")
1139           .SetSchema(std::string(kFakeType))
1140           .AddStringProperty(std::string(kPrefixedProperty), kIpsumText)
1141           .Build();
1142   ICING_ASSERT_OK_AND_ASSIGN(
1143       TokenizedDocument tokenized_document,
1144       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
1145                                 document));
1146 
1147   // 2. Recreate the index with the mock filesystem and a merge size that will
1148   // only allow one document to be added before requiring a merge.
1149   Index::Options options(index_dir_,
1150                          /*index_merge_size=*/document.ByteSizeLong(),
1151                          /*lite_index_sort_at_indexing=*/true,
1152                          /*lite_index_sort_size=*/16);
1153   ICING_ASSERT_OK_AND_ASSIGN(
1154       index_,
1155       Index::Create(options, &filesystem_, mock_icing_filesystem_.get()));
1156 
1157   ICING_ASSERT_OK_AND_ASSIGN(
1158       std::unique_ptr<TermIndexingHandler> term_indexing_handler,
1159       TermIndexingHandler::Create(
1160           &fake_clock_, normalizer_.get(), index_.get(),
1161           /*build_property_existence_metadata_hits=*/true));
1162   std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
1163   handlers.push_back(std::move(term_indexing_handler));
1164 
1165   index_processor_ =
1166       std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
1167 
1168   // 3. Index one document. This should fit in the LiteIndex without requiring a
1169   // merge.
1170   DocumentId doc_id = 0;
1171   EXPECT_THAT(
1172       index_processor_->IndexDocument(tokenized_document, doc_id,
1173                                       /*old_document_id=*/kInvalidDocumentId),
1174       IsOk());
1175   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
1176 
1177   // 4. Add one more document to trigger a merge, which should fail and result
1178   // in a Reset.
1179   ++doc_id;
1180   EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id,
1181                                               /*old_document_id=*/doc_id - 1),
1182               StatusIs(libtextclassifier3::StatusCode::DATA_LOSS));
1183   EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
1184 
1185   // 5. Indexing a new document should succeed.
1186   EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id,
1187                                               /*old_document_id=*/doc_id - 1),
1188               IsOk());
1189   EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
1190 }
1191 
// Indexes "Hello, world!" under kVerbatimExactProperty, expects it to be
// tokenized into a single token, and expects an EXACT_ONLY query for the whole
// string to return one hit with term frequency 1 in kVerbatimExactSectionId.
TEST_F(IndexProcessorTest, ExactVerbatimProperty) {
  DocumentProto doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kVerbatimExactProperty),
                             "Hello, world!")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                doc));
  // The whole property value counts as one token.
  EXPECT_THAT(tokenized_doc.num_string_tokens(), Eq(1));

  const libtextclassifier3::Status index_status =
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{
      {kVerbatimExactSectionId, 1}};
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> verbatim_itr,
      index_->GetIterator("Hello, world!", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  EXPECT_THAT(GetHitsWithTermFrequency(std::move(verbatim_itr)),
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId0, expected_frequencies)));
}
1225 
// Indexes "Hello, world!" under kVerbatimPrefixedProperty, expects it to be
// tokenized into a single token, and expects a PREFIX query for "Hello, w" to
// return one hit with term frequency 1 in kVerbatimPrefixedSectionId.
TEST_F(IndexProcessorTest, PrefixVerbatimProperty) {
  DocumentProto doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kVerbatimPrefixedProperty),
                             "Hello, world!")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                doc));
  // The whole property value counts as one token.
  EXPECT_THAT(tokenized_doc.num_string_tokens(), Eq(1));

  const libtextclassifier3::Status index_status =
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  std::unordered_map<SectionId, Hit::TermFrequency> expected_frequencies{
      {kVerbatimPrefixedSectionId, 1}};
  // "Hello, w" is a prefix of "Hello, world!", so the indexed document is
  // expected to match.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> prefix_itr,
      index_->GetIterator("Hello, w", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  EXPECT_THAT(GetHitsWithTermFrequency(std::move(prefix_itr)),
              ElementsAre(EqualsDocHitInfoWithTermFrequency(
                  kDocumentId0, expected_frequencies)));
}
1261 
// Indexes "Hello, world!" under kVerbatimPrefixedProperty and expects a PREFIX
// query for the inner word "world" to return no hits.
TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) {
  DocumentProto doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kVerbatimPrefixedProperty),
                             "Hello, world!")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                doc));
  EXPECT_THAT(tokenized_doc.num_string_tokens(), Eq(1));

  const libtextclassifier3::Status index_status =
      index_processor_->IndexDocument(tokenized_doc, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId);
  EXPECT_THAT(index_status, IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> world_itr,
      index_->GetIterator("world", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));

  // There should be no hits for the term "world": the index processor should
  // create a sole token "Hello, world!" for the document.
  EXPECT_THAT(GetHits(std::move(world_itr)), IsEmpty());
}
1293 
1294 // Some phrases that should match exactly to RFC822 tokens. We normalize the
1295 // tokens, so the case of the string property shouldn't matter.
TEST_F(IndexProcessorTest, Rfc822PropertyExact) {
  // Index a single RFC822 address. The test expects the tokenizer to yield 7
  // string tokens for it.
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "fake_type/1")
                               .SetSchema(std::string(kFakeType))
                               .AddStringProperty(std::string(kRfc822Property),
                                                  "<alexsav@google.com>")
                               .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // The exact term "alexsav" is expected with a term frequency of 2.
  std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
      {kRfc822SectionId, 2}};

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("alexsav", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfoTermFrequencyPair> hits =
      GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  // "com" and the full address are each expected with a term frequency of 1.
  expected_map = {{kRfc822SectionId, 1}};

  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("com", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("alexsav@google.com", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));
}
1346 
// An EXACT_ONLY query must not match a term that is merely a prefix of an
// indexed RFC822 token.
TEST_F(IndexProcessorTest, Rfc822PropertyExactShouldNotReturnPrefix) {
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "fake_type/1")
                               .SetSchema(std::string(kFakeType))
                               .AddStringProperty(std::string(kRfc822Property),
                                                  "<alexsav@google.com>")
                               .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "alexsa" is one character short of the token "alexsav"; an exact-only
  // lookup is expected to return nothing for it.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("alexsa", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfo> hits = GetHits(std::move(itr));
  EXPECT_THAT(hits, IsEmpty());
}
1377 
1378 // Some prefixes of generated RFC822 tokens.
TEST_F(IndexProcessorTest, Rfc822PropertyPrefix) {
  // Index a single RFC822 address. The test expects the tokenizer to yield 7
  // string tokens for it.
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "fake_type/1")
                               .SetSchema(std::string(kFakeType))
                               .AddStringProperty(std::string(kRfc822Property),
                                                  "<alexsav@google.com>")
                               .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "alexsav@" only matches "alexsav@google.com"
  std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
      {kRfc822SectionId, 1}};
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("alexsav@", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> hits =
      GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  // "goog" matches tokens "google" and "google.com"
  expected_map = {{kRfc822SectionId, 2}};
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("goog", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));

  // "ale" matches tokens "alexsav" (twice) and "alexsav@google.com"
  expected_map = {{kRfc822SectionId, 3}};
  ICING_ASSERT_OK_AND_ASSIGN(
      itr, index_->GetIterator("ale", /*term_start_index=*/0,
                               /*unnormalized_term_length=*/0,
                               kSectionIdMaskAll, TermMatchType::PREFIX));
  hits = GetHitsWithTermFrequency(std::move(itr));
  EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                        kDocumentId0, expected_map)));
}
1431 
// A prefix query for a term unrelated to the indexed RFC822 address must
// return no hits.
TEST_F(IndexProcessorTest, Rfc822PropertyNoMatch) {
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "fake_type/1")
                               .SetSchema(std::string(kFakeType))
                               .AddStringProperty(std::string(kRfc822Property),
                                                  "<alexsav@google.com>")
                               .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                document));
  EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized_document, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "abc.xyz" shares no prefix with anything in the indexed address.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> itr,
      index_->GetIterator("abc.xyz", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfo> hits = GetHits(std::move(itr));

  EXPECT_THAT(hits, IsEmpty());
}
1462 
1463 #ifdef ENABLE_URL_TOKENIZER
TEST_F(IndexProcessorTest,ExactUrlProperty)1464 TEST_F(IndexProcessorTest, ExactUrlProperty) {
1465   DocumentProto document =
1466       DocumentBuilder()
1467           .SetKey("icing", "fake_type/1")
1468           .SetSchema(std::string(kFakeType))
1469           .AddStringProperty(std::string(kUrlExactProperty),
1470                              "http://www.google.com")
1471           .Build();
1472   ICING_ASSERT_OK_AND_ASSIGN(
1473       TokenizedDocument tokenized_document,
1474       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
1475                                 document));
1476   EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));
1477 
1478   EXPECT_THAT(
1479       index_processor_->IndexDocument(tokenized_document, kDocumentId0,
1480                                       /*old_document_id=*/kInvalidDocumentId),
1481       IsOk());
1482   EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
1483 
1484   ICING_ASSERT_OK_AND_ASSIGN(
1485       std::unique_ptr<DocHitInfoIterator> itr,
1486       index_->GetIterator("google", /*term_start_index=*/0,
1487                           /*unnormalized_term_length=*/0, kSectionIdMaskAll,
1488                           TermMatchType::EXACT_ONLY));
1489   std::vector<DocHitInfoTermFrequencyPair> hits =
1490       GetHitsWithTermFrequency(std::move(itr));
1491   std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
1492       {kUrlExactSectionId, 1}};
1493   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
1494                         kDocumentId0, expected_map)));
1495 
1496   ICING_ASSERT_OK_AND_ASSIGN(
1497       itr, index_->GetIterator("http", /*term_start_index=*/0,
1498                                /*unnormalized_term_length=*/0,
1499                                kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
1500   hits = GetHitsWithTermFrequency(std::move(itr));
1501   expected_map = {{kUrlExactSectionId, 1}};
1502   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
1503                         kDocumentId0, expected_map)));
1504 
1505   ICING_ASSERT_OK_AND_ASSIGN(
1506       itr, index_->GetIterator("www.google.com", /*term_start_index=*/0,
1507                                /*unnormalized_term_length=*/0,
1508                                kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
1509   hits = GetHitsWithTermFrequency(std::move(itr));
1510   expected_map = {{kUrlExactSectionId, 1}};
1511   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
1512                         kDocumentId0, expected_map)));
1513 
1514   ICING_ASSERT_OK_AND_ASSIGN(
1515       itr, index_->GetIterator("http://www.google.com", /*term_start_index=*/0,
1516                                /*unnormalized_term_length=*/0,
1517                                kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
1518   hits = GetHitsWithTermFrequency(std::move(itr));
1519   expected_map = {{kUrlExactSectionId, 1}};
1520   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
1521                         kDocumentId0, expected_map)));
1522 }
1523 
// Exact-only lookups against an exact-URL property are expected to return
// nothing for each of the three terms queried below.
TEST_F(IndexProcessorTest, ExactUrlPropertyDoesNotMatchPrefix) {
  DocumentProto url_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kUrlExactProperty),
                             "https://mail.google.com/calendar/render")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                url_doc));
  EXPECT_THAT(tokenized.num_string_tokens(), Eq(8));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // First query: "co".
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> iterator,
      index_->GetIterator("co", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfoTermFrequencyPair> results =
      GetHitsWithTermFrequency(std::move(iterator));
  EXPECT_THAT(results, IsEmpty());

  // Second query: "mail.go".
  ICING_ASSERT_OK_AND_ASSIGN(
      iterator,
      index_->GetIterator("mail.go", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  results = GetHitsWithTermFrequency(std::move(iterator));
  EXPECT_THAT(results, IsEmpty());

  // Third query: "mail.google.com".
  ICING_ASSERT_OK_AND_ASSIGN(
      iterator,
      index_->GetIterator("mail.google.com", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::EXACT_ONLY));
  results = GetHitsWithTermFrequency(std::move(iterator));
  EXPECT_THAT(results, IsEmpty());
}
1567 
// Prefix queries against a prefix-indexed URL property must report how many
// URL tokens start with the queried term.
TEST_F(IndexProcessorTest, PrefixUrlProperty) {
  DocumentProto url_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kUrlPrefixedProperty),
                             "http://www.google.com")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                url_doc));
  EXPECT_THAT(tokenized.num_string_tokens(), Eq(7));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // Two tokens begin with "goo": "google" and "google.com".
  std::unordered_map<SectionId, Hit::TermFrequency> frequency_by_section = {
      {kUrlPrefixedSectionId, 2}};
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> iterator,
      index_->GetIterator("goo", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> results =
      GetHitsWithTermFrequency(std::move(iterator));
  EXPECT_THAT(results, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                           kDocumentId0, frequency_by_section)));

  // Two tokens begin with "http": "http" and "http://www.google.com".
  ICING_ASSERT_OK_AND_ASSIGN(
      iterator,
      index_->GetIterator("http", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  results = GetHitsWithTermFrequency(std::move(iterator));
  frequency_by_section = {{kUrlPrefixedSectionId, 2}};
  EXPECT_THAT(results, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                           kDocumentId0, frequency_by_section)));

  // Only "www.google.com" begins with "www.go".
  ICING_ASSERT_OK_AND_ASSIGN(
      iterator,
      index_->GetIterator("www.go", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  results = GetHitsWithTermFrequency(std::move(iterator));
  frequency_by_section = {{kUrlPrefixedSectionId, 1}};
  EXPECT_THAT(results, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                           kDocumentId0, frequency_by_section)));
}
1621 
// Prefix queries that do not line up with the start of any URL token must
// return no hits.
TEST_F(IndexProcessorTest, PrefixUrlPropertyNoMatch) {
  DocumentProto url_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kUrlPrefixedProperty),
                             "https://mail.google.com/calendar/render")
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                url_doc));
  EXPECT_THAT(tokenized.num_string_tokens(), Eq(8));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());
  EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));

  // "gle" is not the beginning of any token, so nothing should be found.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> iterator,
      index_->GetIterator("gle", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  std::vector<DocHitInfoTermFrequencyPair> results =
      GetHitsWithTermFrequency(std::move(iterator));
  EXPECT_THAT(results, IsEmpty());

  // "w.goo" is likewise expected to find nothing.
  ICING_ASSERT_OK_AND_ASSIGN(
      iterator,
      index_->GetIterator("w.goo", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  results = GetHitsWithTermFrequency(std::move(iterator));
  EXPECT_THAT(results, IsEmpty());

  // Separators are stripped from tokens, so a term with a leading '.' finds
  // nothing.
  ICING_ASSERT_OK_AND_ASSIGN(
      iterator,
      index_->GetIterator(".com", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  results = GetHitsWithTermFrequency(std::move(iterator));
  EXPECT_THAT(results, IsEmpty());

  // "calendar/render" is expected to find nothing either.
  ICING_ASSERT_OK_AND_ASSIGN(
      iterator,
      index_->GetIterator("calendar/render", /*term_start_index=*/0,
                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                          TermMatchType::PREFIX));
  results = GetHitsWithTermFrequency(std::move(iterator));
  EXPECT_THAT(results, IsEmpty());
}
1674 #endif  // ENABLE_URL_TOKENIZER
1675 
// An indexed integer property must be found by a range query over the
// integer index whose bounds cover the stored values.
TEST_F(IndexProcessorTest, IndexableIntegerProperty) {
  DocumentProto int_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4,
                            5)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                int_doc));
  // The single integer property should yield exactly one integer section.
  EXPECT_THAT(tokenized.integer_sections(), SizeIs(1));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());

  // Query the range [1, 5], which spans every value that was indexed.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> range_itr,
      integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/1,
                                  /*key_upper=*/5, *doc_store_, *schema_store_,
                                  fake_clock_.GetSystemTimeMilliseconds()));

  EXPECT_THAT(
      GetHits(std::move(range_itr)),
      ElementsAre(EqualsDocHitInfo(
          kDocumentId0, std::vector<SectionId>{kIndexableIntegerSectionId})));
}
1707 
// A range query over the integer index that excludes every stored value
// must return no hits.
TEST_F(IndexProcessorTest, IndexableIntegerPropertyNoMatch) {
  DocumentProto int_doc =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4,
                            5)
          .Build();
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                int_doc));
  // The single integer property should yield exactly one integer section.
  EXPECT_THAT(tokenized.integer_sections(), SizeIs(1));

  EXPECT_THAT(
      index_processor_->IndexDocument(tokenized, kDocumentId0,
                                      /*old_document_id=*/kInvalidDocumentId),
      IsOk());

  // The range [-1, 0] lies entirely below the indexed values 1 through 5.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> range_itr,
      integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/-1,
                                  /*key_upper=*/0, *doc_store_, *schema_store_,
                                  fake_clock_.GetSystemTimeMilliseconds()));

  EXPECT_THAT(GetHits(std::move(range_itr)), IsEmpty());
}
1736 
1737 }  // namespace
1738 
1739 }  // namespace lib
1740 }  // namespace icing
1741