1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/result/snippet-retriever.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20
21 #include "gmock/gmock.h"
22 #include "gtest/gtest.h"
23 #include "icing/document-builder.h"
24 #include "icing/feature-flags.h"
25 #include "icing/file/mock-filesystem.h"
26 #include "icing/portable/equals-proto.h"
27 #include "icing/portable/platform.h"
28 #include "icing/proto/document.pb.h"
29 #include "icing/proto/schema.pb.h"
30 #include "icing/proto/search.pb.h"
31 #include "icing/proto/term.pb.h"
32 #include "icing/query/query-terms.h"
33 #include "icing/schema-builder.h"
34 #include "icing/schema/schema-store.h"
35 #include "icing/schema/section-manager.h"
36 #include "icing/store/document-id.h"
37 #include "icing/store/key-mapper.h"
38 #include "icing/testing/common-matchers.h"
39 #include "icing/testing/fake-clock.h"
40 #include "icing/testing/jni-test-helpers.h"
41 #include "icing/testing/test-data.h"
42 #include "icing/testing/test-feature-flags.h"
43 #include "icing/testing/tmp-directory.h"
44 #include "icing/tokenization/language-segmenter-factory.h"
45 #include "icing/tokenization/language-segmenter.h"
46 #include "icing/transform/map/map-normalizer.h"
47 #include "icing/transform/normalizer-factory.h"
48 #include "icing/transform/normalizer.h"
49 #include "icing/util/icu-data-file-helper.h"
50 #include "icing/util/snippet-helpers.h"
51 #include "unicode/uloc.h"
52
53 namespace icing {
54 namespace lib {
55
56 namespace {
57
58 using ::testing::ElementsAre;
59 using ::testing::Eq;
60 using ::testing::IsEmpty;
61 using ::testing::SizeIs;
62
// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
// to Android. Also move it to schema-builder.h
//
// Shorthand for the URL tokenizer enum value. It is only defined when the
// build sets ENABLE_URL_TOKENIZER.
#ifdef ENABLE_URL_TOKENIZER
constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
    StringIndexingConfig::TokenizerType::URL;
#endif  // ENABLE_URL_TOKENIZER
69
GetPropertyPaths(const SnippetProto & snippet)70 std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
71 std::vector<std::string_view> paths;
72 for (const SnippetProto::EntryProto& entry : snippet.entries()) {
73 paths.push_back(entry.property_name());
74 }
75 return paths;
76 }
77
78 class SnippetRetrieverTest : public testing::Test {
79 protected:
SetUp()80 void SetUp() override {
81 feature_flags_ = std::make_unique<FeatureFlags>(GetTestFeatureFlags());
82 test_dir_ = GetTestTempDir() + "/icing";
83 filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
84
85 if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
86 ICING_ASSERT_OK(
87 // File generated via icu_data_file rule in //icing/BUILD.
88 icu_data_file_helper::SetUpIcuDataFile(
89 GetTestFilePath("icing/icu.dat")));
90 }
91
92 jni_cache_ = GetTestJniCache();
93 language_segmenter_factory::SegmenterOptions options(ULOC_US,
94 jni_cache_.get());
95 ICING_ASSERT_OK_AND_ASSIGN(
96 language_segmenter_,
97 language_segmenter_factory::Create(std::move(options)));
98
99 // Setup the schema
100 ICING_ASSERT_OK_AND_ASSIGN(
101 schema_store_, SchemaStore::Create(&filesystem_, test_dir_,
102 &fake_clock_, feature_flags_.get()));
103 SchemaProto schema =
104 SchemaBuilder()
105 .AddType(
106 SchemaTypeConfigBuilder()
107 .SetType("email")
108 .AddProperty(PropertyConfigBuilder()
109 .SetName("subject")
110 .SetDataTypeString(TERM_MATCH_PREFIX,
111 TOKENIZER_PLAIN)
112 .SetCardinality(CARDINALITY_OPTIONAL))
113 .AddProperty(PropertyConfigBuilder()
114 .SetName("body")
115 .SetDataTypeString(TERM_MATCH_EXACT,
116 TOKENIZER_PLAIN)
117 .SetCardinality(CARDINALITY_OPTIONAL)))
118 .Build();
119 ICING_ASSERT_OK(schema_store_->SetSchema(
120 schema, /*ignore_errors_and_delete_documents=*/false,
121 /*allow_circular_schema_definitions=*/false));
122
123 ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
124 /*max_term_byte_size=*/10000));
125 ICING_ASSERT_OK_AND_ASSIGN(
126 snippet_retriever_,
127 SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
128 normalizer_.get()));
129
130 // Set limits to max - effectively no limit. Enable matching and request a
131 // window of 64 bytes.
132 snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
133 snippet_spec_.set_num_matches_per_property(
134 std::numeric_limits<int32_t>::max());
135 snippet_spec_.set_max_window_utf32_length(64);
136 }
137
TearDown()138 void TearDown() override {
139 filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
140 }
141
142 std::unique_ptr<FeatureFlags> feature_flags_;
143 Filesystem filesystem_;
144 FakeClock fake_clock_;
145 std::unique_ptr<SchemaStore> schema_store_;
146 std::unique_ptr<LanguageSegmenter> language_segmenter_;
147 std::unique_ptr<SnippetRetriever> snippet_retriever_;
148 std::unique_ptr<Normalizer> normalizer_;
149 std::unique_ptr<const JniCache> jni_cache_;
150 ResultSpecProto::SnippetSpecProto snippet_spec_;
151 std::string test_dir_;
152 };
153
// SnippetRetriever::Create must reject a null pointer in any of its three
// dependencies with FAILED_PRECONDITION.
TEST_F(SnippetRetrieverTest, CreationWithNullPointerShouldFail) {
  const auto expected_code =
      libtextclassifier3::StatusCode::FAILED_PRECONDITION;
  auto* schema_store = schema_store_.get();
  auto* segmenter = language_segmenter_.get();
  auto* normalizer = normalizer_.get();

  // Null schema store.
  EXPECT_THAT(SnippetRetriever::Create(/*schema_store=*/nullptr, segmenter,
                                       normalizer),
              StatusIs(expected_code));
  // Null language segmenter.
  EXPECT_THAT(SnippetRetriever::Create(schema_store,
                                       /*language_segmenter=*/nullptr,
                                       normalizer),
              StatusIs(expected_code));
  // Null normalizer.
  EXPECT_THAT(SnippetRetriever::Create(schema_store, segmenter,
                                       /*normalizer=*/nullptr),
              StatusIs(expected_code));
}
168
// A window budget shorter than the matched token yields an empty window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window starts at the beginning of "three" and ends in the middle of
  // "three". len=4, orig_window= "thre"
  snippet_spec_.set_max_window_utf32_length(4);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre(""));
}
193
// A window budget exactly as long as an odd-length match returns just the
// match.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowSizeEqualToMatch_OddLengthMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window starts at the beginning of "three" and at the exact end of
  // "three". len=5, orig_window= "three"
  snippet_spec_.set_max_window_utf32_length(5);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("three"));
}
219
// A window budget exactly as long as an even-length match returns just the
// match.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowSizeEqualToMatch_EvenLengthMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};

  // Window starts at the beginning of "four" and at the exact end of
  // "four". len=4, orig_window= "four"
  snippet_spec_.set_max_window_utf32_length(4);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("four"));
}
245
// The untrimmed window begins in the whitespace before "two"; the returned
// window is trimmed to token boundaries and then extended to use the budget.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (2,17).
  //   2. trimmed, no-shifting window [4,13) "two three"
  //   3. trimmed, shifted window [4,18) "two three four"
  snippet_spec_.set_max_window_utf32_length(14);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("two three four"));
}
278
// The untrimmed window begins in the middle of "one"; the partial token is
// dropped and the window is extended on the other side to use the budget.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (1,18).
  //   2. trimmed, no-shifting window [4,18) "two three four"
  //   3. trimmed, shifted window [4,20) "two three four.."
  snippet_spec_.set_max_window_utf32_length(16);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("two three four.."));
}
311
// A window that ends part-way through a run of punctuation keeps the
// punctuation characters that fit.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window ends in the middle of all the punctuation and window starts at 0.
  // len=20, orig_window="one two three four.."
  snippet_spec_.set_max_window_utf32_length(20);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("one two three four.."));
}
337
// A window that ends right after a multi-byte punctuation character keeps
// that character intact.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowEndsMultiBytePunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body",
                             "Is everything upside down in Australia¿ Crikey!")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};

  // Window starts in the middle of "upside" and ends right after the
  // multi-byte '¿'. The partial token "pside" is trimmed from the result.
  // len=24, orig_window="pside down in Australia¿"
  snippet_spec_.set_max_window_utf32_length(24);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("down in Australia¿"));
}
365
// A window that extends one character past a multi-byte punctuation character
// drops the trailing whitespace but keeps the punctuation.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowBeyondMultiBytePunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body",
                             "Is everything upside down in Australia¿ Crikey!")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};

  // Window starts at the beginning of "upside" and ends in the whitespace
  // just past the multi-byte '¿'.
  // len=26, orig_window="upside down in Australia¿ "
  snippet_spec_.set_max_window_utf32_length(26);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("upside down in Australia¿"));
}
393
// The untrimmed window would begin before the start of the value; it is
// clamped to the start and the unused budget is shifted to the end.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-2,21).
  //   2. trimmed, no-shifting window [0,21) "one two three four..."
  //   3. trimmed, shifted window [0,22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(22);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four...."));
}
426
// A window that ends in whitespace after the punctuation run is returned
// without the trailing whitespace.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window ends before "five" but after all the punctuation
  // len=26, orig_window="one two three four.... "
  snippet_spec_.set_max_window_utf32_length(26);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four...."));
}
452
// The untrimmed window ends in the middle of "five"; after clamping at the
// start and shifting the unused budget, the whole value fits.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-7,26).
  //   2. trimmed, no-shifting window [0,26) "one two three four...."
  //   3. trimmed, shifted window [0,27) "one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(32);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four.... five"));
}
485
// With a budget of 34 the window is expected to cover the entire value.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Max window size equals the size of the value.
  // len=34, orig_window="one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(34);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four.... five"));
}
511
// A budget larger than the value still returns exactly the whole value.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Max window size exceeds the size of the value.
  // len=36, orig_window="one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(36);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four.... five"));
}
537
// For a match near the start of the text, the budget that cannot be used
// before the match is shifted to the end of the window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five six")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};

  // String:      "one two three four.... five six"
  //               ^   ^   ^     ^        ^    ^  ^
  // UTF-8 idx:    0   4   8     14       23   28 31
  // UTF-32 idx:   0   4   8     14       23   28 31
  //
  // Window size will go past the start of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-10,19).
  //   2. trimmed, no-shifting window [0,19) "one two three four."
  //   3. trimmed, shifted window [0,27) "one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four.... five"));
}
571
// For a match near the end of the text, the budget that cannot be used after
// the match is shifted to the start of the window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five six")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"five"}}};

  // String:      "one two three four.... five six"
  //               ^   ^   ^     ^        ^    ^  ^
  // UTF-8 idx:    0   4   8     14       23   28 31
  // UTF-32 idx:   0   4   8     14       23   28 31
  //
  // Window size will go past the end of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (10,39).
  //   2. trimmed, no-shifting window [14,31) "four.... five six"
  //   3. trimmed, shifted window [4,31) "two three four.... five six"
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("two three four.... five six"));
}
605
// Match near the start of a text shorter than the budget: the shifted window
// is bounded by the end of the text.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four....")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};

  // String:      "one two three four...."
  //               ^   ^   ^     ^       ^
  // UTF-8 idx:    0   4   8     14      22
  // UTF-32 idx:   0   4   8     14      22
  //
  // Window size will go past the start of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-10,19).
  //   2. trimmed, no-shifting window [0, 19) "one two three four."
  //   3. trimmed, shifted window [0, 22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four...."));
}
639
// Match near the end of a text shorter than the budget: the shifted window is
// bounded by the start of the text.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four....")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};

  // String:      "one two three four...."
  //               ^   ^   ^     ^       ^
  // UTF-8 idx:    0   4   8     14      22
  // UTF-32 idx:   0   4   8     14      22
  //
  // Window size will go past the end of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (1,30).
  //   2. trimmed, no-shifting window [4, 22) "two three four...."
  //   3. trimmed, shifted window [0, 22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four...."));
}
673
// A prefix query only produces snippets for properties indexed with prefix
// matching; the submatch is the part of the token covered by the query term.
TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets. 'f' should match prefix-enabled property 'subject', but
  // not exact-only property 'body'.
  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(content, entry), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, entry), ElementsAre("f"));
}
698
// With exact matching, the query term "f" equals no token in either property,
// so no snippet entries are produced.
TEST_F(SnippetRetrieverTest, ExactSnippeting) {
  SectionRestrictQueryTermsMap terms_by_section{{"", {"f"}}};
  SectionIdMask matched_sections = 0b00000011;

  DocumentProto email =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();

  SnippetProto result = snippet_retriever_->RetrieveSnippet(
      terms_by_section, TERM_MATCH_EXACT, snippet_spec_, email,
      matched_sections);

  // Neither "foo" nor "fool" is an exact match for "f".
  EXPECT_THAT(result.entries(), IsEmpty());
}
716
// With a window length of 0 the match and submatch are still reported, but
// the window is empty.
TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();

  snippet_spec_.set_max_window_utf32_length(0);

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Check the snippets.
  // A failed size check must abort the test: entries(0) below would otherwise
  // index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre(""));
  EXPECT_THAT(GetMatches(content, entry), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, entry), ElementsAre("foo"));
}
742
// Two query terms matching in two properties: expects one entry per property
// ("body" first, then "subject"), with one window/match per matched term.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();
  // String:      "Concerning the subject of foo, we need to begin considering "
  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:    0          11  15      23 26   31 34   39 42    48
  // UTF-32 idx:   0          11  15      23 26   31 34   39 42    48
  //
  // String ctd:  "our options regarding body bar."
  //               ^   ^       ^         ^    ^   ^
  // UTF-8 idx:    60  64      72        82   87  91
  // UTF-32 idx:   60  64      72        82   87  91
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets.
  // A failed size check must abort the test: entries(0) and entries(1) below
  // would otherwise index past the end of the repeated field.
  ASSERT_THAT(snippet.entries(), SizeIs(2));

  const SnippetProto::EntryProto& body_entry = snippet.entries(0);
  EXPECT_THAT(body_entry.property_name(), Eq("body"));
  std::string_view body_content =
      GetString(&document, body_entry.property_name());
  // The first window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  // The second window will be:
  //   1. untrimmed, no-shifting window will be (54,91).
  //   2. trimmed, no-shifting window [60, 91) "our... bar.".
  //   3. trimmed, shifted window [31, 91) "we... bar."
  EXPECT_THAT(
      GetWindows(body_content, body_entry),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our",
          "we need to begin considering our options regarding body bar."));
  EXPECT_THAT(GetMatches(body_content, body_entry),
              ElementsAre("foo", "bar"));
  EXPECT_THAT(GetSubMatches(body_content, body_entry),
              ElementsAre("foo", "bar"));

  const SnippetProto::EntryProto& subject_entry = snippet.entries(1);
  EXPECT_THAT(subject_entry.property_name(), Eq("subject"));
  std::string_view subject_content =
      GetString(&document, subject_entry.property_name());
  EXPECT_THAT(GetWindows(subject_content, subject_entry),
              ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(subject_content, subject_entry), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(subject_content, subject_entry),
              ElementsAre("foo"));
}
797
// Same document and query as SnippetingMultipleMatches, but the section mask
// only enables the "body" section, so a single entry is expected.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();
  // String:     "Concerning the subject of foo, we need to begin considering "
  //              ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:   0          11  15      23 26   31 34   39 42    48
  // UTF-32 idx:  0          11  15      23 26   31 34   39 42    48
  //
  // String ctd: "our options regarding body bar."
  //              ^   ^       ^         ^    ^   ^
  // UTF-8 idx:   60  64      72        82   87  91
  // UTF-32 idx:  60  64      72        82   87  91
  //
  // Section 1 "subject" is not in the section_mask, so no snippet information
  // from that section should be returned by the SnippetRetriever.
  SectionIdMask section_mask = 0b00000001;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) is indexed below, so abort instead of reading out
  // of range when no entry was produced.
  ASSERT_THAT(snippet.entries(), SizeIs(1));

  const SnippetProto::EntryProto& body_entry = snippet.entries(0);
  EXPECT_THAT(body_entry.property_name(), Eq("body"));
  std::string_view body_content =
      GetString(&document, body_entry.property_name());
  // The first window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  // The second window will be:
  //   1. untrimmed, no-shifting window will be (54,91).
  //   2. trimmed, no-shifting window [60, 91) "our... bar.".
  //   3. trimmed, shifted window [31, 91) "we... bar."
  EXPECT_THAT(
      GetWindows(body_content, body_entry),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our",
          "we need to begin considering our options regarding body bar."));
  EXPECT_THAT(GetMatches(body_content, body_entry), ElementsAre("foo", "bar"));
  EXPECT_THAT(GetSubMatches(body_content, body_entry),
              ElementsAre("foo", "bar"));
}
848
// Checks that a term restricted to one property ("foo" -> "body") is only
// reported for that property, while an unrestricted term ("subject") is
// reported for every property it appears in.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();
  // String:     "Concerning the subject of foo, we need to begin considering "
  //              ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:   0          11  15      23 26   31 34   39 42    48
  // UTF-32 idx:  0          11  15      23 26   31 34   39 42    48
  //
  // String ctd: "our options regarding body bar."
  //              ^   ^       ^         ^    ^   ^
  // UTF-8 idx:   60  64      72        82   87  91
  // UTF-32 idx:  60  64      72        82   87  91
  SectionIdMask section_mask = 0b00000011;
  // "subject" should match in both sections, but "foo" is restricted to "body"
  // so it should only match in the 'body' section and not the 'subject'
  // section.
  SectionRestrictQueryTermsMap query_terms{{"", {"subject"}},
                                           {"body", {"foo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) and entries(1) are indexed below, so the test must
  // stop here rather than read out of range if the count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(2));

  const SnippetProto::EntryProto& body_entry = snippet.entries(0);
  EXPECT_THAT(body_entry.property_name(), Eq("body"));
  std::string_view body_content =
      GetString(&document, body_entry.property_name());
  // The first window will be:
  //   1. untrimmed, no-shifting window will be (-15,50).
  //   2. trimmed, no-shifting window [0, 47) "Concerning... begin".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  // The second window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  EXPECT_THAT(
      GetWindows(body_content, body_entry),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our",
          "Concerning the subject of foo, we need to begin considering our"));
  EXPECT_THAT(GetMatches(body_content, body_entry),
              ElementsAre("subject", "foo"));
  EXPECT_THAT(GetSubMatches(body_content, body_entry),
              ElementsAre("subject", "foo"));

  // In "subject" only the unrestricted term is expected to match.
  const SnippetProto::EntryProto& subject_entry = snippet.entries(1);
  EXPECT_THAT(subject_entry.property_name(), Eq("subject"));
  std::string_view subject_content =
      GetString(&document, subject_entry.property_name());
  EXPECT_THAT(GetWindows(subject_content, subject_entry),
              ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(subject_content, subject_entry),
              ElementsAre("subject"));
  EXPECT_THAT(GetSubMatches(subject_content, subject_entry),
              ElementsAre("subject"));
}
908
// With num_matches_per_property set to 1, each property is expected to report
// a single match even though "body" contains both query terms.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  // String:     "Concerning the subject of foo, we need to begin considering "
  //              ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:   0          11  15      23 26   31 34   39 42    48
  // UTF-32 idx:  0          11  15      23 26   31 34   39 42    48
  //
  // String ctd: "our options regarding body bar."
  //              ^   ^       ^         ^    ^   ^
  // UTF-8 idx:   60  64      72        82   87  91
  // UTF-32 idx:  60  64      72        82   87  91
  snippet_spec_.set_num_matches_per_property(1);

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) and entries(1) are indexed below, so the test must
  // stop here rather than read out of range if the count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(2));

  const SnippetProto::EntryProto& body_entry = snippet.entries(0);
  EXPECT_THAT(body_entry.property_name(), Eq("body"));
  std::string_view body_content =
      GetString(&document, body_entry.property_name());
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  EXPECT_THAT(
      GetWindows(body_content, body_entry),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our"));
  EXPECT_THAT(GetMatches(body_content, body_entry), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(body_content, body_entry), ElementsAre("foo"));

  const SnippetProto::EntryProto& subject_entry = snippet.entries(1);
  EXPECT_THAT(subject_entry.property_name(), Eq("subject"));
  std::string_view subject_content =
      GetString(&document, subject_entry.property_name());
  EXPECT_THAT(GetWindows(subject_content, subject_entry),
              ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(subject_content, subject_entry), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(subject_content, subject_entry),
              ElementsAre("foo"));
}
959
// Checks that the lowercase prefix query "md" matches the uppercase token
// "MDI" in "subject", reporting "MDI" as the match and "MD" as the submatch.
TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "MDI team")
          .AddStringProperty("body", "Some members are in Zürich.")
          .Build();
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"md"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) is indexed below, so abort instead of reading out
  // of range when no entry was produced.
  ASSERT_THAT(snippet.entries(), SizeIs(1));

  const SnippetProto::EntryProto& subject_entry = snippet.entries(0);
  EXPECT_THAT(subject_entry.property_name(), Eq("subject"));
  std::string_view subject_content =
      GetString(&document, subject_entry.property_name());
  EXPECT_THAT(GetWindows(subject_content, subject_entry),
              ElementsAre("MDI team"));
  EXPECT_THAT(GetMatches(subject_content, subject_entry), ElementsAre("MDI"));
  EXPECT_THAT(GetSubMatches(subject_content, subject_entry),
              ElementsAre("MD"));
}
981
// Checks that the unaccented, lowercase exact query "zurich" matches the
// token "Zürich" in "body", with the original text reported as the match.
TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "MDI team")
          .AddStringProperty("body", "Some members are in Zürich.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) is indexed below, so abort instead of reading out
  // of range when no entry was produced.
  ASSERT_THAT(snippet.entries(), SizeIs(1));

  const SnippetProto::EntryProto& body_entry = snippet.entries(0);
  EXPECT_THAT(body_entry.property_name(), Eq("body"));
  std::string_view body_content =
      GetString(&document, body_entry.property_name());
  EXPECT_THAT(GetWindows(body_content, body_entry),
              ElementsAre("Some members are in Zürich."));
  EXPECT_THAT(GetMatches(body_content, body_entry), ElementsAre("Zürich"));
  EXPECT_THAT(GetSubMatches(body_content, body_entry), ElementsAre("Zürich"));
}
1007
// Snippets a flat document with three repeated string properties (X, Y, Z),
// each holding {"marco", "polo", "marco", "polo"}, and checks that every
// "polo" value is reported under its indexed property path.
TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("SingleLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("X")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Y")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Z")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  // Recreate the retriever now that the schema store holds the new schema.
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // Populate X, Y and Z (in that order) with the same four values.
  const std::vector<std::string> string_values = {"marco", "polo", "marco",
                                                  "polo"};
  DocumentProto document;
  document.set_schema("SingleLevelType");
  for (const char* property_name : {"X", "Y", "Z"}) {
    PropertyProto* prop = document.add_properties();
    prop->set_name(property_name);
    for (const std::string& value : string_values) {
      prop->add_string_values(value);
    }
  }

  SectionIdMask section_mask = 0b00000111;
  SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) and entries(1) are indexed below, so the test must
  // stop here rather than read out of range if the count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(6));

  const SnippetProto::EntryProto& first_entry = snippet.entries(0);
  EXPECT_THAT(first_entry.property_name(), Eq("X[1]"));
  std::string_view content = GetString(&document, first_entry.property_name());
  EXPECT_THAT(GetWindows(content, first_entry), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, first_entry), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, first_entry), ElementsAre("polo"));

  const SnippetProto::EntryProto& second_entry = snippet.entries(1);
  EXPECT_THAT(second_entry.property_name(), Eq("X[3]"));
  content = GetString(&document, second_entry.property_name());
  EXPECT_THAT(GetWindows(content, second_entry), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, second_entry), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, second_entry), ElementsAre("polo"));

  EXPECT_THAT(GetPropertyPaths(snippet),
              ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]"));
}
1078
// Snippets a document whose optional properties A, B and C each hold one
// nested SingleLevelType document, and checks the nested property paths
// reported for every "polo" value.
TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("SingleLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("X")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Y")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Z")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("MultiLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("A")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_OPTIONAL))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("B")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_OPTIONAL))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("C")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_OPTIONAL)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  // Recreate the retriever now that the schema store holds the new schema.
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // Nested document: X, Y and Z (in that order) with the same four values.
  const std::vector<std::string> string_values = {"marco", "polo", "marco",
                                                  "polo"};
  DocumentProto subdocument;
  for (const char* property_name : {"X", "Y", "Z"}) {
    PropertyProto* prop = subdocument.add_properties();
    prop->set_name(property_name);
    for (const std::string& value : string_values) {
      prop->add_string_values(value);
    }
  }

  // Top-level document: A, B and C each hold one copy of the subdocument.
  DocumentProto document;
  document.set_schema("MultiLevelType");
  for (const char* property_name : {"A", "B", "C"}) {
    PropertyProto* prop = document.add_properties();
    prop->set_name(property_name);
    *prop->add_document_values() = subdocument;
  }

  SectionIdMask section_mask = 0b111111111;
  SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) and entries(1) are indexed below, so the test must
  // stop here rather than read out of range if the count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(18));

  const SnippetProto::EntryProto& first_entry = snippet.entries(0);
  EXPECT_THAT(first_entry.property_name(), Eq("A.X[1]"));
  std::string_view content = GetString(&document, first_entry.property_name());
  EXPECT_THAT(GetWindows(content, first_entry), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, first_entry), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, first_entry), ElementsAre("polo"));

  const SnippetProto::EntryProto& second_entry = snippet.entries(1);
  EXPECT_THAT(second_entry.property_name(), Eq("A.X[3]"));
  content = GetString(&document, second_entry.property_name());
  EXPECT_THAT(GetWindows(content, second_entry), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, second_entry), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, second_entry), ElementsAre("polo"));

  EXPECT_THAT(
      GetPropertyPaths(snippet),
      ElementsAre("A.X[1]", "A.X[3]", "A.Y[1]", "A.Y[3]", "A.Z[1]", "A.Z[3]",
                  "B.X[1]", "B.X[3]", "B.Y[1]", "B.Y[3]", "B.Z[1]", "B.Z[3]",
                  "C.X[1]", "C.X[3]", "C.Y[1]", "C.Y[3]", "C.Z[1]", "C.Z[3]"));
}
1185
// Snippets a document whose repeated properties A, B and C each hold two
// nested SingleLevelType documents, and checks that the reported property
// paths carry an index at both nesting levels.
TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("SingleLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("X")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Y")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Z")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("MultiLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("A")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("B")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("C")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  // Recreate the retriever now that the schema store holds the new schema.
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // Nested document: X, Y and Z (in that order) with the same four values.
  const std::vector<std::string> string_values = {"marco", "polo", "marco",
                                                  "polo"};
  DocumentProto subdocument;
  for (const char* property_name : {"X", "Y", "Z"}) {
    PropertyProto* prop = subdocument.add_properties();
    prop->set_name(property_name);
    for (const std::string& value : string_values) {
      prop->add_string_values(value);
    }
  }

  // Top-level document: A, B and C each hold two copies of the subdocument.
  DocumentProto document;
  document.set_schema("MultiLevelType");
  for (const char* property_name : {"A", "B", "C"}) {
    PropertyProto* prop = document.add_properties();
    prop->set_name(property_name);
    *prop->add_document_values() = subdocument;
    *prop->add_document_values() = subdocument;
  }

  SectionIdMask section_mask = 0b111111111;
  SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) and entries(1) are indexed below, so the test must
  // stop here rather than read out of range if the count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(36));

  const SnippetProto::EntryProto& first_entry = snippet.entries(0);
  EXPECT_THAT(first_entry.property_name(), Eq("A[0].X[1]"));
  std::string_view content = GetString(&document, first_entry.property_name());
  EXPECT_THAT(GetWindows(content, first_entry), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, first_entry), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, first_entry), ElementsAre("polo"));

  const SnippetProto::EntryProto& second_entry = snippet.entries(1);
  EXPECT_THAT(second_entry.property_name(), Eq("A[0].X[3]"));
  content = GetString(&document, second_entry.property_name());
  EXPECT_THAT(GetWindows(content, second_entry), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, second_entry), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, second_entry), ElementsAre("polo"));

  EXPECT_THAT(GetPropertyPaths(snippet),
              ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]",
                          "A[0].Y[1]", "A[0].Y[3]", "A[1].Y[1]", "A[1].Y[3]",
                          "A[0].Z[1]", "A[0].Z[3]", "A[1].Z[1]", "A[1].Z[3]",
                          "B[0].X[1]", "B[0].X[3]", "B[1].X[1]", "B[1].X[3]",
                          "B[0].Y[1]", "B[0].Y[3]", "B[1].Y[1]", "B[1].Y[3]",
                          "B[0].Z[1]", "B[0].Z[3]", "B[1].Z[1]", "B[1].Z[3]",
                          "C[0].X[1]", "C[0].X[3]", "C[1].X[1]", "C[1].X[3]",
                          "C[0].Y[1]", "C[0].Y[3]", "C[1].Y[1]", "C[1].Y[3]",
                          "C[0].Z[1]", "C[0].Z[3]", "C[1].Z[1]", "C[1].Z[3]"));
}
1300
// Snippets a document whose repeated properties A, B and C each hold two
// nested documents with optional (single-valued) string properties, and
// checks that the leaf properties are reported without an index suffix.
TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("SingleLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("X")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_OPTIONAL))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Y")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_OPTIONAL))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Z")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_OPTIONAL)))
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("MultiLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("A")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("B")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("C")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  // Recreate the retriever now that the schema store holds the new schema.
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // Nested document: only X and Z contain the query term; Y holds "marco".
  DocumentProto subdocument;
  PropertyProto* sub_prop = subdocument.add_properties();
  sub_prop->set_name("X");
  sub_prop->add_string_values("polo");
  sub_prop = subdocument.add_properties();
  sub_prop->set_name("Y");
  sub_prop->add_string_values("marco");
  sub_prop = subdocument.add_properties();
  sub_prop->set_name("Z");
  sub_prop->add_string_values("polo");

  // Top-level document: A, B and C each hold two copies of the subdocument.
  DocumentProto document;
  document.set_schema("MultiLevelType");
  for (const char* property_name : {"A", "B", "C"}) {
    PropertyProto* prop = document.add_properties();
    prop->set_name(property_name);
    *prop->add_document_values() = subdocument;
    *prop->add_document_values() = subdocument;
  }

  SectionIdMask section_mask = 0b111111111;
  SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal check: entries(0) and entries(1) are indexed below, so the test must
  // stop here rather than read out of range if the count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(12));

  const SnippetProto::EntryProto& first_entry = snippet.entries(0);
  EXPECT_THAT(first_entry.property_name(), Eq("A[0].X"));
  std::string_view content = GetString(&document, first_entry.property_name());
  EXPECT_THAT(GetWindows(content, first_entry), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, first_entry), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, first_entry), ElementsAre("polo"));

  const SnippetProto::EntryProto& second_entry = snippet.entries(1);
  EXPECT_THAT(second_entry.property_name(), Eq("A[1].X"));
  content = GetString(&document, second_entry.property_name());
  EXPECT_THAT(GetWindows(content, second_entry), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, second_entry), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, second_entry), ElementsAre("polo"));

  EXPECT_THAT(
      GetPropertyPaths(snippet),
      ElementsAre("A[0].X", "A[1].X", "A[0].Z", "A[1].Z", "B[0].X", "B[1].X",
                  "B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z"));
}
1402
// Prefix-matches a single CJK character against a Chinese "subject" and
// checks the reported match, submatch and their UTF-16 position/lengths.
TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
  // String:     "我每天走路去上班。"
  //              ^ ^  ^   ^^
  // UTF8 idx:    0 3  9  15 18
  // UTF16 idx:   0 1  3   5 6
  // Breaks into segments: "我", "每天", "走路", "去", "上班"
  constexpr std::string_view kChinese = "我每天走路去上班。";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kChinese)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Exactly one property must have matched, and it must be "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());

  // Exactly one match is expected within "subject".
  ASSERT_THAT(entry.snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry.snippet_matches(0);

  // The match is the whole segment; the submatch is just the queried prefix.
  EXPECT_THAT(GetMatches(content, entry), ElementsAre("走路"));
  EXPECT_THAT(GetSubMatches(content, entry), ElementsAre("走"));

  // The UTF-16 offsets must agree with the index table above.
  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1));
}
1446
// Uses a Simplified Chinese segmenter with a six-character window limit and
// checks the window text and its UTF-16 position/length for a CJK match.
TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
  // Swap in a segmenter configured for Simplified Chinese, then rebuild the
  // retriever so that it uses the new segmenter.
  language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
                                                       jni_cache_.get());
  ICING_ASSERT_OK_AND_ASSIGN(
      language_segmenter_,
      language_segmenter_factory::Create(std::move(options)));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // String:     "我每天走路去上班。"
  //              ^ ^  ^   ^^
  // UTF8 idx:    0 3  9  15 18
  // UTF16 idx:   0 1  3   5 6
  // UTF32 idx:   0 1  3   5 6
  // Breaks into segments: "我", "每天", "走路", "去", "上班"
  constexpr std::string_view kChinese = "我每天走路去上班。";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kChinese)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};

  // The window will be:
  //   1. untrimmed, no-shifting window will be (0,7).
  //   2. trimmed, no-shifting window [1, 6) "每天走路去".
  //   3. trimmed, shifted window [0, 6) "我每天走路去"
  snippet_spec_.set_max_window_utf32_length(6);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Exactly one property must have matched, and it must be "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());

  // Exactly one match is expected within "subject".
  ASSERT_THAT(entry.snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry.snippet_matches(0);

  // The window text must be the shifted window described above.
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("我每天走路去"));

  // The UTF-16 window offsets must agree with that window.
  EXPECT_THAT(match_proto.window_utf16_position(), Eq(0));
  EXPECT_THAT(match_proto.window_utf16_length(), Eq(6));
}
1505
// Prefix-matches a single supplementary-plane character and checks that the
// reported UTF-16 position/lengths count two code units per character.
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
  // The text is made of characters that take four bytes in UTF-8 and, most
  // importantly, two code units in UTF-16. It consists of three
  // space-separated segments:
  //
  //   Segment:    U+10000 U+10001 | U+10002 U+10003 | U+10004
  //   UTF8 idx:   0               | 9               | 18
  //   UTF16 idx:  0               | 5               | 10
  //
  // The literals are spelled with universal character names so that the
  // non-BMP characters cannot be silently dropped by tooling.
  // NOTE(review): the code points were reconstructed from the index table and
  // the UTF-16 expectations below - confirm against the upstream test.
  constexpr std::string_view kText =
      "\U00010000\U00010001 \U00010002\U00010003 \U00010004";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  // Query for the first character of the second segment.
  SectionRestrictQueryTermsMap query_terms{{"", {"\U00010002"}}};

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Exactly one property must have matched, and it must be "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto* entry = &snippet.entries(0);
  EXPECT_THAT(entry->property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry->property_name());

  // Exactly one match is expected within "subject".
  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry->snippet_matches(0);

  // The match is the whole second segment; the submatch is the queried prefix.
  EXPECT_THAT(GetMatches(content, *entry),
              ElementsAre("\U00010002\U00010003"));
  EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("\U00010002"));

  // UTF-16: the segment starts at unit 5 and spans two surrogate pairs (4
  // units); the one-character submatch spans one surrogate pair (2 units).
  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
}
1551
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
  // Checks the UTF-16 window offsets reported for text whose characters each
  // need two UTF-16 code units.
  //
  // In the diagrams below A..E stand for the code points U+10000..U+10004.
  // Each of them is four bytes in UTF-8. Most importantly, each is also two
  // code units in UTF-16.
  // String:     "AB CD E"
  //              ^  ^  ^
  // UTF8 idx:    0  9  18
  // UTF16 idx:   0  5  10
  // UTF32 idx:   0  3  6
  // Breaks into segments: "AB", "CD", "E"
  //
  // The literals are spelled with universal-character-names so that they stay
  // intact even if the file is processed by tools that drop such characters.
  // NOTE(review): U+10000..U+10004 were chosen to fit the layout above -
  // confirm they are the characters this test is meant to use.
  constexpr std::string_view kText =
      "\U00010000\U00010001 \U00010002\U00010003 \U00010004";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  // Prefix query for "C", the first character of the segment "CD".
  SectionRestrictQueryTermsMap query_terms{{"", {"\U00010002"}}};

  // Set a six character window. This will produce a window like this:
  // String:     "AB CD E"
  //                 ^   ^
  // UTF8 idx:       9   22
  // UTF16 idx:      5   12
  // UTF32 idx:      3   7
  snippet_spec_.set_max_window_utf32_length(6);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Ensure that one and only one property was matched and it was "subject"
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto* entry = &snippet.entries(0);
  EXPECT_THAT(entry->property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());

  // Ensure that there is one and only one match within "subject"
  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry->snippet_matches(0);

  // Ensure that the window is correct: it covers "CD E".
  EXPECT_THAT(GetWindows(content, *entry),
              ElementsAre("\U00010002\U00010003 \U00010004"));

  // Ensure that the utf-16 values are also as expected: the window starts at
  // UTF-16 index 5 and covers 2 + 2 + 1 + 2 = 7 code units.
  EXPECT_THAT(match_proto.window_utf16_position(), Eq(5));
  EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
}
1604
TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
  // Install a schema with a single verbatim-tokenized, exact-match string
  // property and recreate the retriever against it.
  SchemaProto verbatim_schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("verbatimType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("verbatim")
                                        .SetDataTypeString(TERM_MATCH_EXACT,
                                                           TOKENIZER_VERBATIM)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      verbatim_schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  DocumentProto doc = DocumentBuilder()
                          .SetKey("icing", "verbatim/1")
                          .SetSchema("verbatimType")
                          .AddStringProperty("verbatim", "Hello, world!")
                          .Build();

  // Query for the complete property value as one exact term, using a window
  // of 13 characters, which is the length of "Hello, world!".
  SectionRestrictQueryTermsMap terms{{"", {"Hello, world!"}}};
  const SectionIdMask sections = 0b00000001;
  snippet_spec_.set_max_window_utf32_length(13);
  const SnippetProto result = snippet_retriever_->RetrieveSnippet(
      terms, TERM_MATCH_EXACT, snippet_spec_, doc, sections);

  // Exactly one entry with exactly one match is expected: the verbatim token
  // in its entirety.
  ASSERT_THAT(result.entries(), SizeIs(1));
  const SnippetProto::EntryProto& verbatim_entry = result.entries(0);
  ASSERT_THAT(verbatim_entry.snippet_matches(), SizeIs(1));
  ASSERT_THAT(verbatim_entry.property_name(), Eq("verbatim"));

  const SnippetMatchProto& match = verbatim_entry.snippet_matches(0);

  // The window starts at the beginning of the value and spans the whole
  // 13-character token.
  EXPECT_THAT(match.window_byte_position(), Eq(0));
  EXPECT_THAT(match.window_utf16_length(), Eq(13));

  // The match also starts at position 0. Because the term is the whole value,
  // the submatch has the same utf-16 length as the window, i.e. 13.
  EXPECT_THAT(match.exact_match_utf16_position(), Eq(0));
  EXPECT_THAT(match.submatch_utf16_length(), Eq(13));
}
1658
TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
  // Install a schema with a single verbatim-tokenized, prefix-match string
  // property and recreate the retriever against it.
  SchemaProto verbatim_schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("verbatimType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("verbatim")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_VERBATIM)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      verbatim_schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // String:     "我每天走路去上班。"
  //              ^ ^  ^   ^^
  // UTF8 idx:    0 3  9  15 18
  // UTF16 idx:   0 1  3   5 6
  // UTF32 idx:   0 1  3   5 6
  // Breaks into segments: "我", "每天", "走路", "去", "上班"
  // The expectations below nevertheless treat the whole nine-character value
  // as a single token.
  std::string cjk_text = "我每天走路去上班。";
  DocumentProto doc = DocumentBuilder()
                          .SetKey("icing", "verbatim/1")
                          .SetSchema("verbatimType")
                          .AddStringProperty("verbatim", cjk_text)
                          .Build();

  // Prefix query for the first two characters, with a window of 9 characters.
  SectionRestrictQueryTermsMap terms{{"", {"我每"}}};
  const SectionIdMask sections = 0b00000001;
  snippet_spec_.set_max_window_utf32_length(9);
  const SnippetProto result = snippet_retriever_->RetrieveSnippet(
      terms, TERM_MATCH_PREFIX, snippet_spec_, doc, sections);

  // Exactly one entry with exactly one match is expected: the verbatim token
  // in its entirety.
  ASSERT_THAT(result.entries(), SizeIs(1));
  const SnippetProto::EntryProto& verbatim_entry = result.entries(0);
  ASSERT_THAT(verbatim_entry.snippet_matches(), SizeIs(1));
  ASSERT_THAT(verbatim_entry.property_name(), Eq("verbatim"));

  const SnippetMatchProto& match = verbatim_entry.snippet_matches(0);

  // The window starts at the beginning of the value and spans the whole
  // token, whose utf-16 length is 9.
  EXPECT_THAT(match.window_byte_position(), Eq(0));
  EXPECT_THAT(match.window_utf16_length(), Eq(9));

  // The match also starts at position 0, and the submatch covers the query
  // term "我每", whose utf-16 length is 2.
  EXPECT_THAT(match.exact_match_utf16_position(), Eq(0));
  EXPECT_THAT(match.submatch_utf16_length(), Eq(2));
}
1717
TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) {
  // Install a schema with a single RFC822-tokenized, prefix-match string
  // property and recreate the retriever against it.
  SchemaProto rfc822_schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("rfc822Type")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("rfc822")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_RFC822)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      rfc822_schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));

  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // NOTE(review): the "[email protected]" literals in this test look like an
  // e-mail obfuscation placeholder rather than real addresses (the
  // expectations mention "tom.bar", which does not occur in the property
  // value) - confirm the intended addresses.
  DocumentProto doc =
      DocumentBuilder()
          .SetKey("icing", "rfc822/1")
          .SetSchema("rfc822Type")
          .AddStringProperty("rfc822",
                             "Alexander Sav <[email protected]>, Very Long "
                             "Name Example <[email protected]>")
          .Build();

  const SectionIdMask sections = 0b00000001;

  {
    // "alexand" should match both the first name token as well as the entire
    // RFC822.
    SectionRestrictQueryTermsMap terms{{"", {"alexand"}}};
    snippet_spec_.set_max_window_utf32_length(35);

    const SnippetProto result = snippet_retriever_->RetrieveSnippet(
        terms, TERM_MATCH_PREFIX, snippet_spec_, doc, sections);

    ASSERT_THAT(result.entries(), SizeIs(1));
    const SnippetProto::EntryProto& rfc822_entry = result.entries(0);
    EXPECT_THAT(rfc822_entry.property_name(), Eq("rfc822"));

    std::string_view text = GetString(&doc, rfc822_entry.property_name());

    EXPECT_THAT(GetWindows(text, rfc822_entry),
                ElementsAre("Alexander Sav <[email protected]>,",
                            "Alexander Sav <[email protected]>,"));
    EXPECT_THAT(GetMatches(text, rfc822_entry),
                ElementsAre("Alexander Sav <[email protected]>", "Alexander"));
    EXPECT_THAT(GetSubMatches(text, rfc822_entry),
                ElementsAre("Alexand", "Alexand"));
  }

  {
    // "tom" should match the local component, local address, and address
    // tokens.
    SectionRestrictQueryTermsMap terms{{"", {"tom"}}};
    snippet_spec_.set_max_window_utf32_length(36);

    const SnippetProto result = snippet_retriever_->RetrieveSnippet(
        terms, TERM_MATCH_PREFIX, snippet_spec_, doc, sections);

    ASSERT_THAT(result.entries(), SizeIs(1));
    const SnippetProto::EntryProto& rfc822_entry = result.entries(0);
    EXPECT_THAT(rfc822_entry.property_name(), Eq("rfc822"));

    std::string_view text = GetString(&doc, rfc822_entry.property_name());

    // TODO(b/248362902) Stop returning duplicate matches.
    EXPECT_THAT(GetWindows(text, rfc822_entry),
                ElementsAre("Alexander Sav <[email protected]>,",
                            "Alexander Sav <[email protected]>,",
                            "Alexander Sav <[email protected]>,"));
    EXPECT_THAT(GetMatches(text, rfc822_entry),
                ElementsAre("tom.bar", "[email protected]", "tom"));
    EXPECT_THAT(GetSubMatches(text, rfc822_entry),
                ElementsAre("tom", "tom", "tom"));
  }
}
1793
TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) {
  // Install a schema with a single RFC822-tokenized, prefix-match string
  // property and recreate the retriever against it.
  SchemaProto rfc822_schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("rfc822Type")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("rfc822")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_RFC822)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      rfc822_schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));

  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  std::string cjk_text = "我, 每天@走路, 去@上班";
  DocumentProto doc = DocumentBuilder()
                          .SetKey("icing", "rfc822/1")
                          .SetSchema("rfc822Type")
                          .AddStringProperty("rfc822", cjk_text)
                          .Build();

  // Prefix query for "走" with a window of 8 characters.
  SectionRestrictQueryTermsMap terms{{"", {"走"}}};
  const SectionIdMask sections = 0b00000001;
  snippet_spec_.set_max_window_utf32_length(8);

  const SnippetProto result = snippet_retriever_->RetrieveSnippet(
      terms, TERM_MATCH_PREFIX, snippet_spec_, doc, sections);

  // Exactly one snippet entry is expected, for the "rfc822" property.
  ASSERT_THAT(result.entries(), SizeIs(1));
  const SnippetProto::EntryProto& rfc822_entry = result.entries(0);
  EXPECT_THAT(rfc822_entry.property_name(), Eq("rfc822"));

  std::string_view text = GetString(&doc, rfc822_entry.property_name());

  // Two matches are expected, both on "走路" and both with the same window.
  // NOTE(review): presumably further RFC822 tokens containing "走路" (e.g. the
  // full address) are dropped because the window is too small - confirm.
  // TODO(b/248362902) Stop returning duplicate matches.
  EXPECT_THAT(GetWindows(text, rfc822_entry),
              ElementsAre("每天@走路,", "每天@走路,"));
  EXPECT_THAT(GetMatches(text, rfc822_entry), ElementsAre("走路", "走路"));
  EXPECT_THAT(GetSubMatches(text, rfc822_entry), ElementsAre("走", "走"));
}
1847
1848 #ifdef ENABLE_URL_TOKENIZER
TEST_F(SnippetRetrieverTest, SnippettingUrlAscii) {
  // Install a schema with a single URL-tokenized, prefix-match string property
  // and recreate the retriever against it.
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder().SetType("urlType").AddProperty(
              PropertyConfigBuilder()
                  .SetName("url")
                  .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_URL)
                  .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));

  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "url/1")
          .SetSchema("urlType")
          .AddStringProperty("url", "https://mail.google.com/calendar/google/")
          .Build();

  SectionIdMask section_mask = 0b00000001;

  // Query with single url split-token match
  SectionRestrictQueryTermsMap query_terms{{"", {"com"}}};
  // 40 is the length of the url.
  // Window that is the size of the url should return entire url.
  snippet_spec_.set_max_window_utf32_length(40);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("url"));

  // `content` views the property value stored in `document`; it is re-fetched
  // after every retrieval below.
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("com"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("com"));

  // Query with single url suffix-token match
  query_terms = SectionRestrictQueryTermsMap{{"", {"mail.goo"}}};
  snippet_spec_.set_max_window_utf32_length(40);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("url"));

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("mail.google.com/calendar/google/"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("mail.goo"));

  // Query with multiple url split-token matches
  query_terms = SectionRestrictQueryTermsMap{{"", {"goog"}}};
  snippet_spec_.set_max_window_utf32_length(40);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("url"));

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/",
                          "https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("google", "google"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("goog", "goog"));

  // Query with both url split-token and suffix-token matches
  query_terms = SectionRestrictQueryTermsMap{{"", {"mail"}}};
  snippet_spec_.set_max_window_utf32_length(40);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("url"));

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/",
                          "https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("mail", "mail.google.com/calendar/google/"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("mail", "mail"));

  // Prefix query with both url split-token and suffix-token matches
  query_terms = SectionRestrictQueryTermsMap{{"", {"http"}}};
  snippet_spec_.set_max_window_utf32_length(40);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("url"));

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/",
                          "https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("https", "https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("http", "http"));

  // Window that's smaller than the input size should not return any matches.
  query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
  snippet_spec_.set_max_window_utf32_length(10);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(0));

  // Test case with more than two matches. This url is 39 characters long.
  document =
      DocumentBuilder()
          .SetKey("icing", "url/1")
          .SetSchema("urlType")
          .AddStringProperty("url", "https://www.google.com/calendar/google/")
          .Build();

  // Query matching two split-tokens and one suffix-token
  query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
  snippet_spec_.set_max_window_utf32_length(39);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("url"));

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://www.google.com/calendar/google/",
                          "https://www.google.com/calendar/google/",
                          "https://www.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("google", "google", "google.com/calendar/google/"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("google", "google", "google"));
}
2012 #endif // ENABLE_URL_TOKENIZER
2013
2014 } // namespace
2015
2016 } // namespace lib
2017 } // namespace icing
2018