xref: /aosp_15_r20/external/icing/icing/result/snippet-retriever_test.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/result/snippet-retriever.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 
21 #include "gmock/gmock.h"
22 #include "gtest/gtest.h"
23 #include "icing/document-builder.h"
24 #include "icing/feature-flags.h"
25 #include "icing/file/mock-filesystem.h"
26 #include "icing/portable/equals-proto.h"
27 #include "icing/portable/platform.h"
28 #include "icing/proto/document.pb.h"
29 #include "icing/proto/schema.pb.h"
30 #include "icing/proto/search.pb.h"
31 #include "icing/proto/term.pb.h"
32 #include "icing/query/query-terms.h"
33 #include "icing/schema-builder.h"
34 #include "icing/schema/schema-store.h"
35 #include "icing/schema/section-manager.h"
36 #include "icing/store/document-id.h"
37 #include "icing/store/key-mapper.h"
38 #include "icing/testing/common-matchers.h"
39 #include "icing/testing/fake-clock.h"
40 #include "icing/testing/jni-test-helpers.h"
41 #include "icing/testing/test-data.h"
42 #include "icing/testing/test-feature-flags.h"
43 #include "icing/testing/tmp-directory.h"
44 #include "icing/tokenization/language-segmenter-factory.h"
45 #include "icing/tokenization/language-segmenter.h"
46 #include "icing/transform/map/map-normalizer.h"
47 #include "icing/transform/normalizer-factory.h"
48 #include "icing/transform/normalizer.h"
49 #include "icing/util/icu-data-file-helper.h"
50 #include "icing/util/snippet-helpers.h"
51 #include "unicode/uloc.h"
52 
53 namespace icing {
54 namespace lib {
55 
56 namespace {
57 
58 using ::testing::ElementsAre;
59 using ::testing::Eq;
60 using ::testing::IsEmpty;
61 using ::testing::SizeIs;
62 
// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
// to Android. Also move it to schema-builder.h
#ifdef ENABLE_URL_TOKENIZER
// Short alias for the URL tokenizer type; only defined when the build enables
// ENABLE_URL_TOKENIZER.
constexpr auto TOKENIZER_URL = StringIndexingConfig::TokenizerType::URL;
#endif  // ENABLE_URL_TOKENIZER
69 
GetPropertyPaths(const SnippetProto & snippet)70 std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
71   std::vector<std::string_view> paths;
72   for (const SnippetProto::EntryProto& entry : snippet.entries()) {
73     paths.push_back(entry.property_name());
74   }
75   return paths;
76 }
77 
78 class SnippetRetrieverTest : public testing::Test {
79  protected:
SetUp()80   void SetUp() override {
81     feature_flags_ = std::make_unique<FeatureFlags>(GetTestFeatureFlags());
82     test_dir_ = GetTestTempDir() + "/icing";
83     filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
84 
85     if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
86       ICING_ASSERT_OK(
87           // File generated via icu_data_file rule in //icing/BUILD.
88           icu_data_file_helper::SetUpIcuDataFile(
89               GetTestFilePath("icing/icu.dat")));
90     }
91 
92     jni_cache_ = GetTestJniCache();
93     language_segmenter_factory::SegmenterOptions options(ULOC_US,
94                                                          jni_cache_.get());
95     ICING_ASSERT_OK_AND_ASSIGN(
96         language_segmenter_,
97         language_segmenter_factory::Create(std::move(options)));
98 
99     // Setup the schema
100     ICING_ASSERT_OK_AND_ASSIGN(
101         schema_store_, SchemaStore::Create(&filesystem_, test_dir_,
102                                            &fake_clock_, feature_flags_.get()));
103     SchemaProto schema =
104         SchemaBuilder()
105             .AddType(
106                 SchemaTypeConfigBuilder()
107                     .SetType("email")
108                     .AddProperty(PropertyConfigBuilder()
109                                      .SetName("subject")
110                                      .SetDataTypeString(TERM_MATCH_PREFIX,
111                                                         TOKENIZER_PLAIN)
112                                      .SetCardinality(CARDINALITY_OPTIONAL))
113                     .AddProperty(PropertyConfigBuilder()
114                                      .SetName("body")
115                                      .SetDataTypeString(TERM_MATCH_EXACT,
116                                                         TOKENIZER_PLAIN)
117                                      .SetCardinality(CARDINALITY_OPTIONAL)))
118             .Build();
119     ICING_ASSERT_OK(schema_store_->SetSchema(
120         schema, /*ignore_errors_and_delete_documents=*/false,
121         /*allow_circular_schema_definitions=*/false));
122 
123     ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
124                                                 /*max_term_byte_size=*/10000));
125     ICING_ASSERT_OK_AND_ASSIGN(
126         snippet_retriever_,
127         SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
128                                  normalizer_.get()));
129 
130     // Set limits to max - effectively no limit. Enable matching and request a
131     // window of 64 bytes.
132     snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
133     snippet_spec_.set_num_matches_per_property(
134         std::numeric_limits<int32_t>::max());
135     snippet_spec_.set_max_window_utf32_length(64);
136   }
137 
TearDown()138   void TearDown() override {
139     filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
140   }
141 
142   std::unique_ptr<FeatureFlags> feature_flags_;
143   Filesystem filesystem_;
144   FakeClock fake_clock_;
145   std::unique_ptr<SchemaStore> schema_store_;
146   std::unique_ptr<LanguageSegmenter> language_segmenter_;
147   std::unique_ptr<SnippetRetriever> snippet_retriever_;
148   std::unique_ptr<Normalizer> normalizer_;
149   std::unique_ptr<const JniCache> jni_cache_;
150   ResultSpecProto::SnippetSpecProto snippet_spec_;
151   std::string test_dir_;
152 };
153 
// SnippetRetriever::Create must report FAILED_PRECONDITION when any one of its
// three dependencies is null.
TEST_F(SnippetRetrieverTest, CreationWithNullPointerShouldFail) {
  constexpr auto kFailedPrecondition =
      libtextclassifier3::StatusCode::FAILED_PRECONDITION;
  auto* const schema_store = schema_store_.get();
  auto* const segmenter = language_segmenter_.get();
  auto* const normalizer = normalizer_.get();

  // Null schema store.
  EXPECT_THAT(SnippetRetriever::Create(/*schema_store=*/nullptr, segmenter,
                                       normalizer),
              StatusIs(kFailedPrecondition));
  // Null language segmenter.
  EXPECT_THAT(SnippetRetriever::Create(schema_store,
                                       /*language_segmenter=*/nullptr,
                                       normalizer),
              StatusIs(kFailedPrecondition));
  // Null normalizer.
  EXPECT_THAT(SnippetRetriever::Create(schema_store, segmenter,
                                       /*normalizer=*/nullptr),
              StatusIs(kFailedPrecondition));
}
168 
// A max window shorter than the matched token must produce an empty window
// rather than a truncated token.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window starts at the beginning of "three" and ends in the middle of
  // "three". len=4, orig_window= "thre"
  snippet_spec_.set_max_window_utf32_length(4);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
}
193 
// A max window exactly as long as an odd-length match ("three", 5 chars) must
// return the match itself as the window.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowSizeEqualToMatch_OddLengthMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window starts at the beginning of "three" and ends at the exact end of
  // "three". len=5, orig_window= "three"
  snippet_spec_.set_max_window_utf32_length(5);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("three"));
}
219 
// A max window exactly as long as an even-length match ("four", 4 chars) must
// return the match itself as the window.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowSizeEqualToMatch_EvenLengthMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};

  // Window starts at the beginning of "four" and ends at the exact end of
  // "four". len=4, orig_window= "four"
  snippet_spec_.set_max_window_utf32_length(4);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("four"));
}
245 
// When the untrimmed window starts in whitespace, the window is trimmed to a
// token boundary and the freed budget is used to extend the other side.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (2,17).
  //   2. trimmed, no-shifting window [4,13) "two three"
  //   3. trimmed, shifted window [4,18) "two three four"
  snippet_spec_.set_max_window_utf32_length(14);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("two three four"));
}
278 
// When the untrimmed window starts in the middle of a token, that partial
// token is dropped and the window is extended on the other side.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (1,18).
  //   2. trimmed, no-shifting window [4,18) "two three four"
  //   3. trimmed, shifted window [4,20) "two three four.."
  snippet_spec_.set_max_window_utf32_length(16);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("two three four.."));
}
311 
// A window that ends in the middle of a run of ASCII punctuation keeps the
// punctuation characters that fit inside the window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window ends in the middle of all the punctuation and window starts at 0.
  // len=20, orig_window="one two three four.."
  snippet_spec_.set_max_window_utf32_length(20);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.."));
}
337 
// A window that ends right after a multi-byte punctuation character must keep
// that character intact, while the partial leading token is trimmed away.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowEndsMultiBytePunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body",
                             "Is everything upside down in Australia¿ Crikey!")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};

  // Window starts in the middle of "upside" and ends right after the
  // multi-byte punctuation.
  // len=24, orig_window="pside down in Australia¿"
  snippet_spec_.set_max_window_utf32_length(24);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("down in Australia¿"));
}
365 
// A window that extends one character past a multi-byte punctuation character
// (into trailing whitespace) is trimmed back to end at the punctuation.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowBeyondMultiBytePunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body",
                             "Is everything upside down in Australia¿ Crikey!")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};

  // Window starts at the beginning of "upside" and ends in the whitespace
  // just past the multi-byte punctuation.
  // len=26, orig_window="upside down in Australia¿ "
  snippet_spec_.set_max_window_utf32_length(26);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("upside down in Australia¿"));
}
393 
// When the untrimmed window would start before the beginning of the value, it
// is clamped to the start and the unused budget extends the end.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-2,21).
  //   2. trimmed, no-shifting window [0,21) "one two three four..."
  //   3. trimmed, shifted window [0,22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(22);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four...."));
}
426 
// A window that ends in whitespace after the punctuation run is trimmed so
// the trailing whitespace is not part of the returned window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window ends before "five" but after all the punctuation
  // len=26, orig_window="one two three four.... "
  snippet_spec_.set_max_window_utf32_length(26);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four...."));
}
452 
// When the untrimmed window ends in the middle of the last token, budget left
// over from clamping the start lets the window grow to include that token.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-7,26).
  //   2. trimmed, no-shifting window [0,26) "one two three four...."
  //   3. trimmed, shifted window [0,27) "one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(32);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.... five"));
}
485 
// With a max window of 34, the returned window is the entire property value.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Max window size equals the size of the value.
  // len=34, orig_window="one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(34);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.... five"));
}
511 
// With a max window larger than the value, the returned window is the entire
// property value and nothing more.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Max window size exceeds the size of the value.
  // len=36, orig_window="one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(36);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.... five"));
}
537 
// For a match near the start of the text, the part of the window that would
// fall before the text is shifted to the end of the window instead.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five six")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};

  // String:      "one two three four.... five six"
  //               ^   ^   ^     ^        ^    ^  ^
  // UTF-8 idx:    0   4   8     14       23  28  31
  // UTF-32 idx:   0   4   8     14       23  28  31
  //
  // The untrimmed window will go past the start of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-10,19).
  //   2. trimmed, no-shifting window [0,19) "one two three four."
  //   3. trimmed, shifted window [0,27) "one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.... five"));
}
571 
// For a match near the end of the text, the part of the window that would
// fall past the text is shifted to the start of the window instead.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five six")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"five"}}};

  // String:      "one two three four.... five six"
  //               ^   ^   ^     ^        ^    ^  ^
  // UTF-8 idx:    0   4   8     14       23  28  31
  // UTF-32 idx:   0   4   8     14       23  28  31
  //
  // The untrimmed window will go past the end of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (10,39).
  //   2. trimmed, no-shifting window [14,31) "four.... five six"
  //   3. trimmed, shifted window [4,31) "two three four.... five six"
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("two three four.... five six"));
}
605 
// For a match near the start of a text shorter than the max window, shifting
// the window stops at the end of the text, so the whole value is returned.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four....")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};

  // String:      "one two three four...."
  //               ^   ^   ^     ^       ^
  // UTF-8 idx:    0   4   8     14      22
  // UTF-32 idx:   0   4   8     14      22
  //
  // The untrimmed window will go past the start of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-10,19).
  //   2. trimmed, no-shifting window [0, 19) "one two three four."
  //   3. trimmed, shifted window [0, 22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four...."));
}
639 
// For a match near the end of a text shorter than the max window, shifting
// the window stops at the start of the text, so the whole value is returned.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four....")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};

  // String:      "one two three four...."
  //               ^   ^   ^     ^       ^
  // UTF-8 idx:    0   4   8     14      22
  // UTF-32 idx:   0   4   8     14      22
  //
  // The untrimmed window will go past the end of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (1,30).
  //   2. trimmed, no-shifting window [4, 22) "two three four...."
  //   3. trimmed, shifted window [0, 22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four...."));
}
673 
// With prefix matching, the query term "f" must snippet the prefix-enabled
// "subject" property (matching "foo") and skip the exact-only "body".
TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets. 'f' should match prefix-enabled property 'subject', but
  // not exact-only property 'body'
  // Fatal check: entries(0) below must not be evaluated on an empty result.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("subject foo"));
  // The match is the whole token; the submatch is only the queried prefix.
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f"));
}
698 
// Verifies that exact-mode snippeting does not treat a query term as a prefix:
// "f" is not a whole token in either property ("foo" and "fool" merely start
// with it), so no snippet entry is expected.
TEST_F(SnippetRetrieverTest, ExactSnippeting) {
  const SectionRestrictQueryTermsMap unrestricted_terms{{"", {"f"}}};
  const SectionIdMask sections = 0b00000011;

  const DocumentProto email =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();

  const SnippetProto result = snippet_retriever_->RetrieveSnippet(
      unrestricted_terms, TERM_MATCH_EXACT, snippet_spec_, email, sections);

  // Nothing may be snippeted for an exact match on "f".
  EXPECT_THAT(result.entries(), IsEmpty());
}
716 
// Verifies that a maximum window length of zero yields an empty window while
// the match and submatch are still reported.
TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();

  // Disable windowing entirely.
  snippet_spec_.set_max_window_utf32_length(0);

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Check the snippets.
  // Abort on a size mismatch: the entries(0) accesses below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
}
742 
TEST_F(SnippetRetrieverTest,SnippetingMultipleMatches)743 TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
744   DocumentProto document =
745       DocumentBuilder()
746           .SetKey("icing", "email/1")
747           .SetSchema("email")
748           .AddStringProperty("subject", "subject foo")
749           .AddStringProperty("body",
750                              "Concerning the subject of foo, we need to begin "
751                              "considering our options regarding body bar.")
752           .Build();
753   // String:      "Concerning the subject of foo, we need to begin considering "
754   //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
755   // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
756   // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
757   //
758   // String ctd:  "our options regarding body bar."
759   //               ^   ^       ^         ^    ^   ^
760   // UTF-8 idx:    60  64      72        82   87  91
761   // UTF-32 idx:   60  64      72        82   87  91
762   SectionIdMask section_mask = 0b00000011;
763   SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
764   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
765       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
766 
767   // Check the snippets
768   EXPECT_THAT(snippet.entries(), SizeIs(2));
769   EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
770   std::string_view content =
771       GetString(&document, snippet.entries(0).property_name());
772   // The first window will be:
773   //   1. untrimmed, no-shifting window will be (-6,59).
774   //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
775   //   3. trimmed, shifted window [0, 63) "Concerning... our"
776   // The second window will be:
777   //   1. untrimmed, no-shifting window will be (54,91).
778   //   2. trimmed, no-shifting window [60, 91) "our... bar.".
779   //   3. trimmed, shifted window [31, 91) "we... bar."
780   EXPECT_THAT(
781       GetWindows(content, snippet.entries(0)),
782       ElementsAre(
783           "Concerning the subject of foo, we need to begin considering our",
784           "we need to begin considering our options regarding body bar."));
785   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
786               ElementsAre("foo", "bar"));
787   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
788               ElementsAre("foo", "bar"));
789 
790   EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
791   content = GetString(&document, snippet.entries(1).property_name());
792   EXPECT_THAT(GetWindows(content, snippet.entries(1)),
793               ElementsAre("subject foo"));
794   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
795   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
796 }
797 
// Verifies that sections left out of the section mask contribute no snippet
// entries, even when they contain a query term.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();
  // String:      "Concerning the subject of foo, we need to begin considering "
  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
  // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
  //
  // String ctd:  "our options regarding body bar."
  //               ^   ^       ^         ^    ^   ^
  // UTF-8 idx:    60  64      72        82   87  91
  // UTF-32 idx:   60  64      72        82   87  91
  //
  // Section 1 "subject" is not in the section_mask, so no snippet information
  // from that section should be returned by the SnippetRetriever.
  SectionIdMask section_mask = 0b00000001;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets.
  // Abort on a size mismatch: the entries(0) accesses below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  // The first window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  // The second window will be:
  //   1. untrimmed, no-shifting window will be (54,91).
  //   2. trimmed, no-shifting window [60, 91) "our... bar.".
  //   3. trimmed, shifted window [31, 91) "we... bar."
  EXPECT_THAT(
      GetWindows(content, snippet.entries(0)),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our",
          "we need to begin considering our options regarding body bar."));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("foo", "bar"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("foo", "bar"));
}
848 
TEST_F(SnippetRetrieverTest,SnippetingMultipleMatchesSectionRestrictedTerm)849 TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
850   DocumentProto document =
851       DocumentBuilder()
852           .SetKey("icing", "email/1")
853           .SetSchema("email")
854           .AddStringProperty("subject", "subject foo")
855           .AddStringProperty("body",
856                              "Concerning the subject of foo, we need to begin "
857                              "considering our options regarding body bar.")
858           .Build();
859   // String:      "Concerning the subject of foo, we need to begin considering "
860   //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
861   // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
862   // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
863   //
864   // String ctd:  "our options regarding body bar."
865   //               ^   ^       ^         ^    ^   ^
866   // UTF-8 idx:    60  64      72        82   87  91
867   // UTF-32 idx:   60  64      72        82   87  91
868   SectionIdMask section_mask = 0b00000011;
869   // "subject" should match in both sections, but "foo" is restricted to "body"
870   // so it should only match in the 'body' section and not the 'subject'
871   // section.
872   SectionRestrictQueryTermsMap query_terms{{"", {"subject"}},
873                                            {"body", {"foo"}}};
874   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
875       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
876 
877   // Check the snippets
878   EXPECT_THAT(snippet.entries(), SizeIs(2));
879   EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
880   std::string_view content =
881       GetString(&document, snippet.entries(0).property_name());
882   // The first window will be:
883   //   1. untrimmed, no-shifting window will be (-15,50).
884   //   2. trimmed, no-shifting window [0, 47) "Concerning... begin".
885   //   3. trimmed, shifted window [0, 63) "Concerning... our"
886   // The second window will be:
887   //   1. untrimmed, no-shifting window will be (-6,59).
888   //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
889   //   3. trimmed, shifted window [0, 63) "Concerning... our"
890   EXPECT_THAT(
891       GetWindows(content, snippet.entries(0)),
892       ElementsAre(
893           "Concerning the subject of foo, we need to begin considering our",
894           "Concerning the subject of foo, we need to begin considering our"));
895   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
896               ElementsAre("subject", "foo"));
897   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
898               ElementsAre("subject", "foo"));
899 
900   EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
901   content = GetString(&document, snippet.entries(1).property_name());
902   EXPECT_THAT(GetWindows(content, snippet.entries(1)),
903               ElementsAre("subject foo"));
904   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject"));
905   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
906               ElementsAre("subject"));
907 }
908 
// Verifies that num_matches_per_property caps the reported matches: with a
// limit of 1, only the first hit of each property is snippeted.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  // String:      "Concerning the subject of foo, we need to begin considering "
  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
  // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
  //
  // String ctd:  "our options regarding body bar."
  //               ^   ^       ^         ^    ^   ^
  // UTF-8 idx:    60  64      72        82   87  91
  // UTF-32 idx:   60  64      72        82   87  91
  snippet_spec_.set_num_matches_per_property(1);

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets.
  // Abort on a size mismatch: the entries(0)/entries(1) accesses below would
  // otherwise index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(2));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  EXPECT_THAT(
      GetWindows(content, snippet.entries(0)),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our"));
  // "bar" also occurs in 'body' but is dropped by the one-match limit.
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));

  EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
  content = GetString(&document, snippet.entries(1).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(1)),
              ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
959 
// Verifies that prefix matching works across a case difference: the lower-case
// query prefix "md" matches the upper-case token "MDI", and the reported
// match/submatch keep the document's original casing.
TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "MDI team")
          .AddStringProperty("body", "Some members are in Zürich.")
          .Build();
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"md"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: the entries(0) accesses below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
}
981 
// Verifies that exact matching works across case and diacritic differences:
// the query term "zurich" matches the document token "Zürich", and the
// reported match/submatch are the original document text.
TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "MDI team")
          .AddStringProperty("body", "Some members are in Zürich.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: the entries(0) accesses below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("Some members are in Zürich."));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich"));

  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("Zürich"));
}
1007 
TEST_F(SnippetRetrieverTest,SnippetingTestOneLevel)1008 TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
1009   SchemaProto schema =
1010       SchemaBuilder()
1011           .AddType(SchemaTypeConfigBuilder()
1012                        .SetType("SingleLevelType")
1013                        .AddProperty(PropertyConfigBuilder()
1014                                         .SetName("X")
1015                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1016                                                            TOKENIZER_PLAIN)
1017                                         .SetCardinality(CARDINALITY_REPEATED))
1018                        .AddProperty(PropertyConfigBuilder()
1019                                         .SetName("Y")
1020                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1021                                                            TOKENIZER_PLAIN)
1022                                         .SetCardinality(CARDINALITY_REPEATED))
1023                        .AddProperty(PropertyConfigBuilder()
1024                                         .SetName("Z")
1025                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1026                                                            TOKENIZER_PLAIN)
1027                                         .SetCardinality(CARDINALITY_REPEATED)))
1028           .Build();
1029   ICING_ASSERT_OK(schema_store_->SetSchema(
1030       schema, /*ignore_errors_and_delete_documents=*/true,
1031       /*allow_circular_schema_definitions=*/false));
1032   ICING_ASSERT_OK_AND_ASSIGN(
1033       snippet_retriever_,
1034       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1035                                normalizer_.get()));
1036 
1037   std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
1038   DocumentProto document;
1039   document.set_schema("SingleLevelType");
1040   PropertyProto* prop = document.add_properties();
1041   prop->set_name("X");
1042   for (const std::string& s : string_values) {
1043     prop->add_string_values(s);
1044   }
1045   prop = document.add_properties();
1046   prop->set_name("Y");
1047   for (const std::string& s : string_values) {
1048     prop->add_string_values(s);
1049   }
1050   prop = document.add_properties();
1051   prop->set_name("Z");
1052   for (const std::string& s : string_values) {
1053     prop->add_string_values(s);
1054   }
1055 
1056   SectionIdMask section_mask = 0b00000111;
1057   SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
1058   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1059       query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
1060 
1061   EXPECT_THAT(snippet.entries(), SizeIs(6));
1062   EXPECT_THAT(snippet.entries(0).property_name(), Eq("X[1]"));
1063   std::string_view content =
1064       GetString(&document, snippet.entries(0).property_name());
1065   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
1066   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
1067   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
1068 
1069   EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]"));
1070   content = GetString(&document, snippet.entries(1).property_name());
1071   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
1072   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
1073   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
1074 
1075   EXPECT_THAT(GetPropertyPaths(snippet),
1076               ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]"));
1077 }
1078 
TEST_F(SnippetRetrieverTest,SnippetingTestMultiLevel)1079 TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
1080   SchemaProto schema =
1081       SchemaBuilder()
1082           .AddType(SchemaTypeConfigBuilder()
1083                        .SetType("SingleLevelType")
1084                        .AddProperty(PropertyConfigBuilder()
1085                                         .SetName("X")
1086                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1087                                                            TOKENIZER_PLAIN)
1088                                         .SetCardinality(CARDINALITY_REPEATED))
1089                        .AddProperty(PropertyConfigBuilder()
1090                                         .SetName("Y")
1091                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1092                                                            TOKENIZER_PLAIN)
1093                                         .SetCardinality(CARDINALITY_REPEATED))
1094                        .AddProperty(PropertyConfigBuilder()
1095                                         .SetName("Z")
1096                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1097                                                            TOKENIZER_PLAIN)
1098                                         .SetCardinality(CARDINALITY_REPEATED)))
1099           .AddType(SchemaTypeConfigBuilder()
1100                        .SetType("MultiLevelType")
1101                        .AddProperty(PropertyConfigBuilder()
1102                                         .SetName("A")
1103                                         .SetDataTypeDocument(
1104                                             "SingleLevelType",
1105                                             /*index_nested_properties=*/true)
1106                                         .SetCardinality(CARDINALITY_OPTIONAL))
1107                        .AddProperty(PropertyConfigBuilder()
1108                                         .SetName("B")
1109                                         .SetDataTypeDocument(
1110                                             "SingleLevelType",
1111                                             /*index_nested_properties=*/true)
1112                                         .SetCardinality(CARDINALITY_OPTIONAL))
1113                        .AddProperty(PropertyConfigBuilder()
1114                                         .SetName("C")
1115                                         .SetDataTypeDocument(
1116                                             "SingleLevelType",
1117                                             /*index_nested_properties=*/true)
1118                                         .SetCardinality(CARDINALITY_OPTIONAL)))
1119           .Build();
1120   ICING_ASSERT_OK(schema_store_->SetSchema(
1121       schema, /*ignore_errors_and_delete_documents=*/true,
1122       /*allow_circular_schema_definitions=*/false));
1123   ICING_ASSERT_OK_AND_ASSIGN(
1124       snippet_retriever_,
1125       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1126                                normalizer_.get()));
1127 
1128   std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
1129   DocumentProto subdocument;
1130   PropertyProto* prop = subdocument.add_properties();
1131   prop->set_name("X");
1132   for (const std::string& s : string_values) {
1133     prop->add_string_values(s);
1134   }
1135   prop = subdocument.add_properties();
1136   prop->set_name("Y");
1137   for (const std::string& s : string_values) {
1138     prop->add_string_values(s);
1139   }
1140   prop = subdocument.add_properties();
1141   prop->set_name("Z");
1142   for (const std::string& s : string_values) {
1143     prop->add_string_values(s);
1144   }
1145 
1146   DocumentProto document;
1147   document.set_schema("MultiLevelType");
1148   prop = document.add_properties();
1149   prop->set_name("A");
1150   *prop->add_document_values() = subdocument;
1151 
1152   prop = document.add_properties();
1153   prop->set_name("B");
1154   *prop->add_document_values() = subdocument;
1155 
1156   prop = document.add_properties();
1157   prop->set_name("C");
1158   *prop->add_document_values() = subdocument;
1159 
1160   SectionIdMask section_mask = 0b111111111;
1161   SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
1162   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1163       query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
1164 
1165   EXPECT_THAT(snippet.entries(), SizeIs(18));
1166   EXPECT_THAT(snippet.entries(0).property_name(), Eq("A.X[1]"));
1167   std::string_view content =
1168       GetString(&document, snippet.entries(0).property_name());
1169   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
1170   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
1171   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
1172 
1173   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]"));
1174   content = GetString(&document, snippet.entries(1).property_name());
1175   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
1176   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
1177   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
1178 
1179   EXPECT_THAT(
1180       GetPropertyPaths(snippet),
1181       ElementsAre("A.X[1]", "A.X[3]", "A.Y[1]", "A.Y[3]", "A.Z[1]", "A.Z[3]",
1182                   "B.X[1]", "B.X[3]", "B.Y[1]", "B.Y[3]", "B.Z[1]", "B.Z[3]",
1183                   "C.X[1]", "C.X[3]", "C.Y[1]", "C.Y[3]", "C.Z[1]", "C.Z[3]"));
1184 }
1185 
TEST_F(SnippetRetrieverTest,SnippetingTestMultiLevelRepeated)1186 TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
1187   SchemaProto schema =
1188       SchemaBuilder()
1189           .AddType(SchemaTypeConfigBuilder()
1190                        .SetType("SingleLevelType")
1191                        .AddProperty(PropertyConfigBuilder()
1192                                         .SetName("X")
1193                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1194                                                            TOKENIZER_PLAIN)
1195                                         .SetCardinality(CARDINALITY_REPEATED))
1196                        .AddProperty(PropertyConfigBuilder()
1197                                         .SetName("Y")
1198                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1199                                                            TOKENIZER_PLAIN)
1200                                         .SetCardinality(CARDINALITY_REPEATED))
1201                        .AddProperty(PropertyConfigBuilder()
1202                                         .SetName("Z")
1203                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1204                                                            TOKENIZER_PLAIN)
1205                                         .SetCardinality(CARDINALITY_REPEATED)))
1206           .AddType(SchemaTypeConfigBuilder()
1207                        .SetType("MultiLevelType")
1208                        .AddProperty(PropertyConfigBuilder()
1209                                         .SetName("A")
1210                                         .SetDataTypeDocument(
1211                                             "SingleLevelType",
1212                                             /*index_nested_properties=*/true)
1213                                         .SetCardinality(CARDINALITY_REPEATED))
1214                        .AddProperty(PropertyConfigBuilder()
1215                                         .SetName("B")
1216                                         .SetDataTypeDocument(
1217                                             "SingleLevelType",
1218                                             /*index_nested_properties=*/true)
1219                                         .SetCardinality(CARDINALITY_REPEATED))
1220                        .AddProperty(PropertyConfigBuilder()
1221                                         .SetName("C")
1222                                         .SetDataTypeDocument(
1223                                             "SingleLevelType",
1224                                             /*index_nested_properties=*/true)
1225                                         .SetCardinality(CARDINALITY_REPEATED)))
1226           .Build();
1227   ICING_ASSERT_OK(schema_store_->SetSchema(
1228       schema, /*ignore_errors_and_delete_documents=*/true,
1229       /*allow_circular_schema_definitions=*/false));
1230   ICING_ASSERT_OK_AND_ASSIGN(
1231       snippet_retriever_,
1232       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1233                                normalizer_.get()));
1234 
1235   std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
1236   DocumentProto subdocument;
1237   PropertyProto* prop = subdocument.add_properties();
1238   prop->set_name("X");
1239   for (const std::string& s : string_values) {
1240     prop->add_string_values(s);
1241   }
1242   prop = subdocument.add_properties();
1243   prop->set_name("Y");
1244   for (const std::string& s : string_values) {
1245     prop->add_string_values(s);
1246   }
1247   prop = subdocument.add_properties();
1248   prop->set_name("Z");
1249   for (const std::string& s : string_values) {
1250     prop->add_string_values(s);
1251   }
1252 
1253   DocumentProto document;
1254   document.set_schema("MultiLevelType");
1255   prop = document.add_properties();
1256   prop->set_name("A");
1257   *prop->add_document_values() = subdocument;
1258   *prop->add_document_values() = subdocument;
1259 
1260   prop = document.add_properties();
1261   prop->set_name("B");
1262   *prop->add_document_values() = subdocument;
1263   *prop->add_document_values() = subdocument;
1264 
1265   prop = document.add_properties();
1266   prop->set_name("C");
1267   *prop->add_document_values() = subdocument;
1268   *prop->add_document_values() = subdocument;
1269 
1270   SectionIdMask section_mask = 0b111111111;
1271   SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
1272   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1273       query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
1274 
1275   EXPECT_THAT(snippet.entries(), SizeIs(36));
1276   EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X[1]"));
1277   std::string_view content =
1278       GetString(&document, snippet.entries(0).property_name());
1279   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
1280   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
1281   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
1282 
1283   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]"));
1284   content = GetString(&document, snippet.entries(1).property_name());
1285   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
1286   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
1287   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
1288 
1289   EXPECT_THAT(GetPropertyPaths(snippet),
1290               ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]",
1291                           "A[0].Y[1]", "A[0].Y[3]", "A[1].Y[1]", "A[1].Y[3]",
1292                           "A[0].Z[1]", "A[0].Z[3]", "A[1].Z[1]", "A[1].Z[3]",
1293                           "B[0].X[1]", "B[0].X[3]", "B[1].X[1]", "B[1].X[3]",
1294                           "B[0].Y[1]", "B[0].Y[3]", "B[1].Y[1]", "B[1].Y[3]",
1295                           "B[0].Z[1]", "B[0].Z[3]", "B[1].Z[1]", "B[1].Z[3]",
1296                           "C[0].X[1]", "C[0].X[3]", "C[1].X[1]", "C[1].X[3]",
1297                           "C[0].Y[1]", "C[0].Y[3]", "C[1].Y[1]", "C[1].Y[3]",
1298                           "C[0].Z[1]", "C[0].Z[3]", "C[1].Z[1]", "C[1].Z[3]"));
1299 }
1300 
TEST_F(SnippetRetrieverTest,SnippetingTestMultiLevelSingleValue)1301 TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
1302   SchemaProto schema =
1303       SchemaBuilder()
1304           .AddType(SchemaTypeConfigBuilder()
1305                        .SetType("SingleLevelType")
1306                        .AddProperty(PropertyConfigBuilder()
1307                                         .SetName("X")
1308                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1309                                                            TOKENIZER_PLAIN)
1310                                         .SetCardinality(CARDINALITY_OPTIONAL))
1311                        .AddProperty(PropertyConfigBuilder()
1312                                         .SetName("Y")
1313                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1314                                                            TOKENIZER_PLAIN)
1315                                         .SetCardinality(CARDINALITY_OPTIONAL))
1316                        .AddProperty(PropertyConfigBuilder()
1317                                         .SetName("Z")
1318                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1319                                                            TOKENIZER_PLAIN)
1320                                         .SetCardinality(CARDINALITY_OPTIONAL)))
1321           .AddType(SchemaTypeConfigBuilder()
1322                        .SetType("MultiLevelType")
1323                        .AddProperty(PropertyConfigBuilder()
1324                                         .SetName("A")
1325                                         .SetDataTypeDocument(
1326                                             "SingleLevelType",
1327                                             /*index_nested_properties=*/true)
1328                                         .SetCardinality(CARDINALITY_REPEATED))
1329                        .AddProperty(PropertyConfigBuilder()
1330                                         .SetName("B")
1331                                         .SetDataTypeDocument(
1332                                             "SingleLevelType",
1333                                             /*index_nested_properties=*/true)
1334                                         .SetCardinality(CARDINALITY_REPEATED))
1335                        .AddProperty(PropertyConfigBuilder()
1336                                         .SetName("C")
1337                                         .SetDataTypeDocument(
1338                                             "SingleLevelType",
1339                                             /*index_nested_properties=*/true)
1340                                         .SetCardinality(CARDINALITY_REPEATED)))
1341           .Build();
1342   ICING_ASSERT_OK(schema_store_->SetSchema(
1343       schema, /*ignore_errors_and_delete_documents=*/true,
1344       /*allow_circular_schema_definitions=*/false));
1345   ICING_ASSERT_OK_AND_ASSIGN(
1346       snippet_retriever_,
1347       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1348                                normalizer_.get()));
1349 
1350   DocumentProto subdocument;
1351   PropertyProto* prop = subdocument.add_properties();
1352   prop->set_name("X");
1353   prop->add_string_values("polo");
1354   prop = subdocument.add_properties();
1355   prop->set_name("Y");
1356   prop->add_string_values("marco");
1357   prop = subdocument.add_properties();
1358   prop->set_name("Z");
1359   prop->add_string_values("polo");
1360 
1361   DocumentProto document;
1362   document.set_schema("MultiLevelType");
1363   prop = document.add_properties();
1364   prop->set_name("A");
1365   *prop->add_document_values() = subdocument;
1366   *prop->add_document_values() = subdocument;
1367 
1368   prop = document.add_properties();
1369   prop->set_name("B");
1370   *prop->add_document_values() = subdocument;
1371   *prop->add_document_values() = subdocument;
1372 
1373   prop = document.add_properties();
1374   prop->set_name("C");
1375   *prop->add_document_values() = subdocument;
1376   *prop->add_document_values() = subdocument;
1377 
1378   SectionIdMask section_mask = 0b111111111;
1379   SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
1380   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1381       query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
1382 
1383   EXPECT_THAT(snippet.entries(), SizeIs(12));
1384   EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X"));
1385   std::string_view content =
1386       GetString(&document, snippet.entries(0).property_name());
1387   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
1388   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
1389   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
1390 
1391   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X"));
1392   content = GetString(&document, snippet.entries(1).property_name());
1393   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
1394   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
1395   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
1396 
1397   EXPECT_THAT(
1398       GetPropertyPaths(snippet),
1399       ElementsAre("A[0].X", "A[1].X", "A[0].Z", "A[1].Z", "B[0].X", "B[1].X",
1400                   "B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z"));
1401 }
1402 
// Verifies that a prefix query for a single CJK character reports the whole
// segment as the exact match, the query term as the submatch, and correct
// UTF-16 offsets for both.
TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
  // String:     "我每天走路去上班。"
  //              ^ ^  ^   ^^
  // UTF8 idx:    0 3  9  15 18
  // UTF16 idx:   0 1  3   5 6
  // Breaks into segments: "我", "每天", "走路", "去", "上班"
  constexpr std::string_view kChinese = "我每天走路去上班。";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kChinese)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Ensure that one and only one property was matched and it was "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());

  // Ensure that there is one and only one match within "subject".
  ASSERT_THAT(entry.snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry.snippet_matches(0);

  // The exact match is the full segment; the submatch is just the query term.
  EXPECT_THAT(GetMatches(content, entry), ElementsAre("走路"));
  EXPECT_THAT(GetSubMatches(content, entry), ElementsAre("走"));

  // Ensure that the UTF-16 values are also as expected.
  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1));
}
1446 
TEST_F(SnippetRetrieverTest,CJKSnippetWindowTest)1447 TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
1448   language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
1449                                                        jni_cache_.get());
1450   ICING_ASSERT_OK_AND_ASSIGN(
1451       language_segmenter_,
1452       language_segmenter_factory::Create(std::move(options)));
1453   ICING_ASSERT_OK_AND_ASSIGN(
1454       snippet_retriever_,
1455       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1456                                normalizer_.get()));
1457 
1458   // String:     "我每天走路去上班。"
1459   //              ^ ^  ^   ^^
1460   // UTF8 idx:    0 3  9  15 18
1461   // UTF16 idx:   0 1  3   5 6
1462   // UTF32 idx:   0 1  3   5 6
1463   // Breaks into segments: "我", "每天", "走路", "去", "上班"
1464   constexpr std::string_view kChinese = "我每天走路去上班。";
1465   DocumentProto document =
1466       DocumentBuilder()
1467           .SetKey("icing", "email/1")
1468           .SetSchema("email")
1469           .AddStringProperty("subject", kChinese)
1470           .AddStringProperty("body",
1471                              "Concerning the subject of foo, we need to begin "
1472                              "considering our options regarding body bar.")
1473           .Build();
1474 
1475   SectionIdMask section_mask = 0b00000011;
1476   SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
1477 
1478   // The window will be:
1479   //   1. untrimmed, no-shifting window will be (0,7).
1480   //   2. trimmed, no-shifting window [1, 6) "每天走路去".
1481   //   3. trimmed, shifted window [0, 6) "我每天走路去"
1482   snippet_spec_.set_max_window_utf32_length(6);
1483 
1484   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1485       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
1486 
1487   // Ensure that one and only one property was matched and it was "body"
1488   ASSERT_THAT(snippet.entries(), SizeIs(1));
1489   const SnippetProto::EntryProto* entry = &snippet.entries(0);
1490   EXPECT_THAT(entry->property_name(), Eq("subject"));
1491   std::string_view content =
1492       GetString(&document, snippet.entries(0).property_name());
1493 
1494   // Ensure that there is one and only one match within "subject"
1495   ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
1496   const SnippetMatchProto& match_proto = entry->snippet_matches(0);
1497 
1498   // Ensure that the match is correct.
1499   EXPECT_THAT(GetWindows(content, *entry), ElementsAre("我每天走路去"));
1500 
1501   // Ensure that the utf-16 values are also as expected
1502   EXPECT_THAT(match_proto.window_utf16_position(), Eq(0));
1503   EXPECT_THAT(match_proto.window_utf16_length(), Eq(6));
1504 }
1505 
// Verifies match offsets for text made of supplementary-plane characters,
// which take four bytes in UTF-8 and two code units (a surrogate pair) in
// UTF-16.
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
  // The literals below are spelled with \U escapes so that they survive any
  // re-encoding of this file.
  // NOTE(review): the code points U+10000..U+10004 were reconstructed from the
  // documented offsets; confirm against the upstream test.
  //
  // Text:       U+10000 U+10001 ' ' U+10002 U+10003 ' ' U+10004
  // Segments:   [U+10000 U+10001]   [U+10002 U+10003]   [U+10004]
  // UTF8 idx:    0                   9                   18
  // UTF16 idx:   0                   5                   10
  constexpr std::string_view kText =
      "\U00010000\U00010001 \U00010002\U00010003 \U00010004";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  // Prefix query for the first character of the second segment.
  SectionRestrictQueryTermsMap query_terms{{"", {"\U00010002"}}};

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Ensure that one and only one property was matched and it was "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());

  // Ensure that there is one and only one match within "subject".
  ASSERT_THAT(entry.snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry.snippet_matches(0);

  // The exact match is the full second segment; the submatch is the query term.
  EXPECT_THAT(GetMatches(content, entry),
              ElementsAre("\U00010002\U00010003"));
  EXPECT_THAT(GetSubMatches(content, entry), ElementsAre("\U00010002"));

  // Ensure that the UTF-16 values are also as expected: each character counts
  // as two code units.
  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
}
1551 
// Verifies window offsets for text made of supplementary-plane characters,
// which take four bytes in UTF-8, two code units in UTF-16 and one code point
// in UTF-32. The window limit is given in UTF-32 code points while the
// reported window offsets are in UTF-16 code units.
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
  // The literals below are spelled with \U escapes so that they survive any
  // re-encoding of this file.
  // NOTE(review): the code points U+10000..U+10004 were reconstructed from the
  // documented offsets; confirm against the upstream test.
  //
  // Text:       U+10000 U+10001 ' ' U+10002 U+10003 ' ' U+10004
  // Segments:   [U+10000 U+10001]   [U+10002 U+10003]   [U+10004]
  // UTF8 idx:    0                   9                   18
  // UTF16 idx:   0                   5                   10
  // UTF32 idx:   0                   3                   6
  constexpr std::string_view kText =
      "\U00010000\U00010001 \U00010002\U00010003 \U00010004";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  // Prefix query for the first character of the second segment.
  SectionRestrictQueryTermsMap query_terms{{"", {"\U00010002"}}};

  // Set a six character window. This will produce a window covering the
  // second and third segments:
  //   start: UTF8 idx 9,  UTF16 idx 5,  UTF32 idx 3
  //   end:   UTF8 idx 22, UTF16 idx 12, UTF32 idx 7
  snippet_spec_.set_max_window_utf32_length(6);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Ensure that one and only one property was matched and it was "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());

  // Ensure that there is one and only one match within "subject".
  ASSERT_THAT(entry.snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry.snippet_matches(0);

  // Ensure that the window is correct.
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("\U00010002\U00010003 \U00010004"));

  // Ensure that the UTF-16 values are also as expected.
  EXPECT_THAT(match_proto.window_utf16_position(), Eq(5));
  EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
}
1604 
// Verifies snippeting of an ASCII string property indexed with the verbatim
// tokenizer and queried with an exact match on the full property value.
TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("verbatimType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("verbatim")
                                        .SetDataTypeString(TERM_MATCH_EXACT,
                                                           TOKENIZER_VERBATIM)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  // Re-create the retriever after the schema has been replaced.
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "verbatim/1")
                               .SetSchema("verbatimType")
                               .AddStringProperty("verbatim", "Hello, world!")
                               .Build();

  SectionIdMask section_mask = 0b00000001;
  SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}};

  // 13 is the length of "Hello, world!".
  snippet_spec_.set_max_window_utf32_length(13);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // There should only be one snippet entry and match, the verbatim token in its
  // entirety.
  ASSERT_THAT(snippet.entries(), SizeIs(1));

  const SnippetProto::EntryProto& entry = snippet.entries(0);
  ASSERT_THAT(entry.snippet_matches(), SizeIs(1));
  ASSERT_THAT(entry.property_name(), Eq("verbatim"));

  const SnippetMatchProto& match_proto = entry.snippet_matches(0);
  // We expect the window to begin at position 0, and to span the entire token
  // which contains 13 characters.
  EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
  EXPECT_THAT(match_proto.window_utf16_length(), Eq(13));

  // We expect the submatch to begin at position 0 of the verbatim token and
  // span the length of our query term "Hello, world!", which has utf-16 length
  // of 13. The submatch length is equal to the window length because the
  // snippet is retrieved with an exact term match on the whole token.
  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13));
}
1658 
TEST_F(SnippetRetrieverTest,SnippettingVerbatimCJK)1659 TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
1660   SchemaProto schema =
1661       SchemaBuilder()
1662           .AddType(SchemaTypeConfigBuilder()
1663                        .SetType("verbatimType")
1664                        .AddProperty(PropertyConfigBuilder()
1665                                         .SetName("verbatim")
1666                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1667                                                            TOKENIZER_VERBATIM)
1668                                         .SetCardinality(CARDINALITY_REPEATED)))
1669           .Build();
1670   ICING_ASSERT_OK(schema_store_->SetSchema(
1671       schema, /*ignore_errors_and_delete_documents=*/true,
1672       /*allow_circular_schema_definitions=*/false));
1673   ICING_ASSERT_OK_AND_ASSIGN(
1674       snippet_retriever_,
1675       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1676                                normalizer_.get()));
1677 
1678   // String:     "我每天走路去上班。"
1679   //              ^ ^  ^   ^^
1680   // UTF8 idx:    0 3  9  15 18
1681   // UTF16 idx:   0 1  3   5 6
1682   // UTF32 idx:   0 1  3   5 6
1683   // Breaks into segments: "我", "每天", "走路", "去", "上班"
1684   std::string chinese_string = "我每天走路去上班。";
1685   DocumentProto document = DocumentBuilder()
1686                                .SetKey("icing", "verbatim/1")
1687                                .SetSchema("verbatimType")
1688                                .AddStringProperty("verbatim", chinese_string)
1689                                .Build();
1690 
1691   SectionIdMask section_mask = 0b00000001;
1692   SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}};
1693 
1694   snippet_spec_.set_max_window_utf32_length(9);
1695   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1696       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
1697 
1698   // There should only be one snippet entry and match, the verbatim token in its
1699   // entirety.
1700   ASSERT_THAT(snippet.entries(), SizeIs(1));
1701 
1702   const SnippetProto::EntryProto* entry = &snippet.entries(0);
1703   ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
1704   ASSERT_THAT(entry->property_name(), "verbatim");
1705 
1706   const SnippetMatchProto& match_proto = entry->snippet_matches(0);
1707   // We expect the match to begin at position 0, and to span the entire token
1708   // which has utf-16 length of 9.
1709   EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
1710   EXPECT_THAT(match_proto.window_utf16_length(), Eq(9));
1711 
1712   // We expect the submatch to begin at position 0 of the verbatim token and
1713   // span the length of our query term "我每", which has utf-16 length of 2.
1714   EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
1715   EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
1716 }
1717 
TEST_F(SnippetRetrieverTest,SnippettingRfc822Ascii)1718 TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) {
1719   SchemaProto schema =
1720       SchemaBuilder()
1721           .AddType(SchemaTypeConfigBuilder()
1722                        .SetType("rfc822Type")
1723                        .AddProperty(PropertyConfigBuilder()
1724                                         .SetName("rfc822")
1725                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1726                                                            TOKENIZER_RFC822)
1727                                         .SetCardinality(CARDINALITY_REPEATED)))
1728           .Build();
1729   ICING_ASSERT_OK(schema_store_->SetSchema(
1730       schema, /*ignore_errors_and_delete_documents=*/true,
1731       /*allow_circular_schema_definitions=*/false));
1732 
1733   ICING_ASSERT_OK_AND_ASSIGN(
1734       snippet_retriever_,
1735       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1736                                normalizer_.get()));
1737 
1738   DocumentProto document =
1739       DocumentBuilder()
1740           .SetKey("icing", "rfc822/1")
1741           .SetSchema("rfc822Type")
1742           .AddStringProperty("rfc822",
1743                              "Alexander Sav <[email protected]>, Very Long "
1744                              "Name Example <[email protected]>")
1745           .Build();
1746 
1747   SectionIdMask section_mask = 0b00000001;
1748 
1749   // This should match both the first name token as well as the entire RFC822.
1750   SectionRestrictQueryTermsMap query_terms{{"", {"alexand"}}};
1751 
1752   snippet_spec_.set_max_window_utf32_length(35);
1753 
1754   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1755       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
1756 
1757   ASSERT_THAT(snippet.entries(), SizeIs(1));
1758   EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
1759 
1760   std::string_view content =
1761       GetString(&document, snippet.entries(0).property_name());
1762 
1763   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1764               ElementsAre("Alexander Sav <[email protected]>,",
1765                           "Alexander Sav <[email protected]>,"));
1766   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1767               ElementsAre("Alexander Sav <[email protected]>", "Alexander"));
1768   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1769               ElementsAre("Alexand", "Alexand"));
1770 
1771   // "tom" should match the local component, local address, and address tokens.
1772   query_terms = SectionRestrictQueryTermsMap{{"", {"tom"}}};
1773   snippet_spec_.set_max_window_utf32_length(36);
1774 
1775   snippet = snippet_retriever_->RetrieveSnippet(
1776       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
1777 
1778   ASSERT_THAT(snippet.entries(), SizeIs(1));
1779   EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
1780 
1781   content = GetString(&document, snippet.entries(0).property_name());
1782 
1783   // TODO(b/248362902) Stop returning duplicate matches.
1784   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1785               ElementsAre("Alexander Sav <[email protected]>,",
1786                           "Alexander Sav <[email protected]>,",
1787                           "Alexander Sav <[email protected]>,"));
1788   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1789               ElementsAre("tom.bar", "[email protected]", "tom"));
1790   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1791               ElementsAre("tom", "tom", "tom"));
1792 }
1793 
// Verifies snippeting of a CJK string property indexed with the RFC822
// tokenizer when the query is a prefix of one address component.
TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("rfc822Type")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("rfc822")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_RFC822)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));

  // Re-create the retriever after the schema has been replaced.
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  std::string chinese_string = "我, 每天@走路, 去@上班";
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "rfc822/1")
                               .SetSchema("rfc822Type")
                               .AddStringProperty("rfc822", chinese_string)
                               .Build();

  SectionIdMask section_mask = 0b00000001;

  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};

  snippet_spec_.set_max_window_utf32_length(8);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // There should only be one snippet entry, for the "rfc822" property.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("rfc822"));

  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());

  // Two matches are expected, both on "走路" and both with the same window.
  // NOTE(review): presumably these come from two different RFC822 token types
  // that cover the same text -- confirm against the tokenizer.
  // TODO(b/248362902) Stop returning duplicate matches.
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("每天@走路,", "每天@走路,"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("走路", "走路"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("走", "走"));
}
1847 
1848 #ifdef ENABLE_URL_TOKENIZER
TEST_F(SnippetRetrieverTest,SnippettingUrlAscii)1849 TEST_F(SnippetRetrieverTest, SnippettingUrlAscii) {
1850   SchemaProto schema =
1851       SchemaBuilder()
1852           .AddType(SchemaTypeConfigBuilder().SetType("urlType").AddProperty(
1853               PropertyConfigBuilder()
1854                   .SetName("url")
1855                   .SetDataTypeString(MATCH_PREFIX, TOKENIZER_URL)
1856                   .SetCardinality(CARDINALITY_REPEATED)))
1857           .Build();
1858   ICING_ASSERT_OK(schema_store_->SetSchema(
1859       schema, /*ignore_errors_and_delete_documents=*/true));
1860 
1861   ICING_ASSERT_OK_AND_ASSIGN(
1862       snippet_retriever_,
1863       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1864                                normalizer_.get()));
1865 
1866   DocumentProto document =
1867       DocumentBuilder()
1868           .SetKey("icing", "url/1")
1869           .SetSchema("urlType")
1870           .AddStringProperty("url", "https://mail.google.com/calendar/google/")
1871           .Build();
1872 
1873   SectionIdMask section_mask = 0b00000001;
1874 
1875   // Query with single url split-token match
1876   SectionRestrictQueryTermsMap query_terms{{"", {"com"}}};
1877   // 40 is the length of the url.
1878   // Window that is the size of the url should return entire url.
1879   snippet_spec_.set_max_window_utf32_length(40);
1880 
1881   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1882       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1883 
1884   ASSERT_THAT(snippet.entries(), SizeIs(1));
1885   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1886 
1887   std::string_view content =
1888       GetString(&document, snippet.entries(0).property_name());
1889 
1890   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1891               ElementsAre("https://mail.google.com/calendar/google/"));
1892   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("com"));
1893   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("com"));
1894 
1895   // Query with single url suffix-token match
1896   query_terms = SectionRestrictQueryTermsMap{{"", {"mail.goo"}}};
1897   snippet_spec_.set_max_window_utf32_length(40);
1898 
1899   snippet = snippet_retriever_->RetrieveSnippet(
1900       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1901 
1902   ASSERT_THAT(snippet.entries(), SizeIs(1));
1903   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1904 
1905   content = GetString(&document, snippet.entries(0).property_name());
1906 
1907   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1908               ElementsAre("https://mail.google.com/calendar/google/"));
1909   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1910               ElementsAre("mail.google.com/calendar/google/"));
1911   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1912               ElementsAre("mail.goo"));
1913 
1914   // Query with multiple url split-token matches
1915   query_terms = SectionRestrictQueryTermsMap{{"", {"goog"}}};
1916   snippet_spec_.set_max_window_utf32_length(40);
1917 
1918   snippet = snippet_retriever_->RetrieveSnippet(
1919       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1920 
1921   ASSERT_THAT(snippet.entries(), SizeIs(1));
1922   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1923 
1924   content = GetString(&document, snippet.entries(0).property_name());
1925 
1926   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1927               ElementsAre("https://mail.google.com/calendar/google/",
1928                           "https://mail.google.com/calendar/google/"));
1929   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1930               ElementsAre("google", "google"));
1931   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1932               ElementsAre("goog", "goog"));
1933 
1934   // Query with both url split-token and suffix-token matches
1935   query_terms = SectionRestrictQueryTermsMap{{"", {"mail"}}};
1936   snippet_spec_.set_max_window_utf32_length(40);
1937 
1938   snippet = snippet_retriever_->RetrieveSnippet(
1939       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1940 
1941   ASSERT_THAT(snippet.entries(), SizeIs(1));
1942   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1943 
1944   content = GetString(&document, snippet.entries(0).property_name());
1945 
1946   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1947               ElementsAre("https://mail.google.com/calendar/google/",
1948                           "https://mail.google.com/calendar/google/"));
1949   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1950               ElementsAre("mail", "mail.google.com/calendar/google/"));
1951   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1952               ElementsAre("mail", "mail"));
1953 
1954   // Prefix query with both url split-token and suffix-token matches
1955   query_terms = SectionRestrictQueryTermsMap{{"", {"http"}}};
1956   snippet_spec_.set_max_window_utf32_length(40);
1957 
1958   snippet = snippet_retriever_->RetrieveSnippet(
1959       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1960 
1961   ASSERT_THAT(snippet.entries(), SizeIs(1));
1962   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1963 
1964   content = GetString(&document, snippet.entries(0).property_name());
1965 
1966   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1967               ElementsAre("https://mail.google.com/calendar/google/",
1968                           "https://mail.google.com/calendar/google/"));
1969   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1970               ElementsAre("https", "https://mail.google.com/calendar/google/"));
1971   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1972               ElementsAre("http", "http"));
1973 
1974   // Window that's smaller than the input size should not return any matches.
1975   query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
1976   snippet_spec_.set_max_window_utf32_length(10);
1977 
1978   snippet = snippet_retriever_->RetrieveSnippet(
1979       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1980 
1981   ASSERT_THAT(snippet.entries(), SizeIs(0));
1982 
1983   // Test case with more than two matches
1984   document =
1985       DocumentBuilder()
1986           .SetKey("icing", "url/1")
1987           .SetSchema("urlType")
1988           .AddStringProperty("url", "https://www.google.com/calendar/google/")
1989           .Build();
1990 
1991   // Prefix query with both url split-token and suffix-token matches
1992   query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
1993   snippet_spec_.set_max_window_utf32_length(39);
1994 
1995   snippet = snippet_retriever_->RetrieveSnippet(
1996       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1997 
1998   ASSERT_THAT(snippet.entries(), SizeIs(1));
1999   EXPECT_THAT(snippet.entries(0).property_name(), "url");
2000 
2001   content = GetString(&document, snippet.entries(0).property_name());
2002 
2003   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
2004               ElementsAre("https://www.google.com/calendar/google/",
2005                           "https://www.google.com/calendar/google/",
2006                           "https://www.google.com/calendar/google/"));
2007   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
2008               ElementsAre("google", "google", "google.com/calendar/google/"));
2009   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
2010               ElementsAre("google", "google", "google"));
2011 }
2012 #endif  // ENABLE_URL_TOKENIZER
2013 
2014 }  // namespace
2015 
2016 }  // namespace lib
2017 }  // namespace icing
2018