// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/result/snippet-retriever.h"

#include <algorithm>
#include <iterator>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-terms.h"
#include "icing/schema/property-util.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

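// Returns property_path with the value index appended when the property holds
// more than one value. Illustrative example (assuming
// ConvertToPropertyExprIndexStr produces a bracketed index): with
// values_size=3, index=1, property_path="recipients", the result would be
// "recipients[1]". With values_size=1, the path is returned unchanged.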
inline std::string AddIndexToPath(int values_size, int index,
                                  const std::string& property_path) {
  if (values_size == 1) {
    return property_path;
  }
  return absl_ports::StrCat(
      property_path, property_util::ConvertToPropertyExprIndexStr(index));
}

// Returns a string of the normalized text of the input Token. Normalization
// is applied based on the Token's type.
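// For example (illustrative, with a lowercasing normalizer), a REGULAR token
// with text "Foo" would normalize to "foo", while a VERBATIM token's text is
// returned unchanged.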
Normalizer::NormalizedTerm NormalizeToken(const Normalizer& normalizer,
                                          const Token& token) {
  switch (token.type) {
    case Token::Type::RFC822_NAME:
      [[fallthrough]];
    case Token::Type::RFC822_COMMENT:
      [[fallthrough]];
    case Token::Type::RFC822_LOCAL_ADDRESS:
      [[fallthrough]];
    case Token::Type::RFC822_HOST_ADDRESS:
      [[fallthrough]];
    case Token::Type::RFC822_ADDRESS:
      [[fallthrough]];
    case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
      [[fallthrough]];
    case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
      [[fallthrough]];
    case Token::Type::RFC822_TOKEN:
      [[fallthrough]];
    case Token::Type::URL_SCHEME:
      [[fallthrough]];
    case Token::Type::URL_USERNAME:
      [[fallthrough]];
    case Token::Type::URL_PASSWORD:
      [[fallthrough]];
    case Token::Type::URL_HOST_COMMON_PART:
      [[fallthrough]];
    case Token::Type::URL_HOST_SIGNIFICANT_PART:
      [[fallthrough]];
    case Token::Type::URL_PORT:
      [[fallthrough]];
    case Token::Type::URL_PATH_PART:
      [[fallthrough]];
    case Token::Type::URL_QUERY:
      [[fallthrough]];
    case Token::Type::URL_REF:
      [[fallthrough]];
    case Token::Type::URL_SUFFIX:
      [[fallthrough]];
    case Token::Type::URL_SUFFIX_INNERMOST:
      [[fallthrough]];
    case Token::Type::TRIGRAM:
      [[fallthrough]];
    case Token::Type::REGULAR:
      return normalizer.NormalizeTerm(token.text);
    case Token::Type::VERBATIM:
      return {std::string(token.text)};
    case Token::Type::QUERY_EXCLUSION:
      [[fallthrough]];
    case Token::Type::QUERY_LEFT_PARENTHESES:
      [[fallthrough]];
    case Token::Type::QUERY_RIGHT_PARENTHESES:
      [[fallthrough]];
    case Token::Type::QUERY_OR:
      [[fallthrough]];
    case Token::Type::QUERY_PROPERTY:
      [[fallthrough]];
    case Token::Type::INVALID:
      ICING_LOG(WARNING) << "Unable to normalize token of type: "
                         << static_cast<int>(token.type);
      return {std::string(token.text)};
  }
}

// Returns a CharacterIterator for token's text, advancing one past the last
// matching character from the query term.
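// For example (illustrative), if token.text is "foobar" and match_query_term
// is "foo", the returned iterator would point at index 3, one past the last
// matched character.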
CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
                               const std::string& match_query_term) {
  switch (token.type) {
    case Token::Type::VERBATIM: {
      // VERBATIM tokens are not normalized. This means the non-normalized
      // matched query term must be either equal to or a prefix of the token's
      // text. Therefore, the match must end at the end of the matched query
      // term.
      CharacterIterator verbatim_match_end =
          CharacterIterator(token.text, 0, 0, 0);
      verbatim_match_end.AdvanceToUtf8(match_query_term.length());
      return verbatim_match_end;
    }
    case Token::Type::QUERY_EXCLUSION:
      [[fallthrough]];
    case Token::Type::QUERY_LEFT_PARENTHESES:
      [[fallthrough]];
    case Token::Type::QUERY_RIGHT_PARENTHESES:
      [[fallthrough]];
    case Token::Type::QUERY_OR:
      [[fallthrough]];
    case Token::Type::QUERY_PROPERTY:
      [[fallthrough]];
    case Token::Type::INVALID:
      ICING_LOG(WARNING)
          << "Unexpected Token type " << static_cast<int>(token.type)
          << " found when finding match end of query term and token.";
      [[fallthrough]];
    case Token::Type::RFC822_NAME:
      [[fallthrough]];
    case Token::Type::RFC822_COMMENT:
      [[fallthrough]];
    case Token::Type::RFC822_LOCAL_ADDRESS:
      [[fallthrough]];
    case Token::Type::RFC822_HOST_ADDRESS:
      [[fallthrough]];
    case Token::Type::RFC822_ADDRESS:
      [[fallthrough]];
    case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
      [[fallthrough]];
    case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
      [[fallthrough]];
    case Token::Type::RFC822_TOKEN:
      [[fallthrough]];
    case Token::Type::URL_SCHEME:
      [[fallthrough]];
    case Token::Type::URL_USERNAME:
      [[fallthrough]];
    case Token::Type::URL_PASSWORD:
      [[fallthrough]];
    case Token::Type::URL_HOST_COMMON_PART:
      [[fallthrough]];
    case Token::Type::URL_HOST_SIGNIFICANT_PART:
      [[fallthrough]];
    case Token::Type::URL_PORT:
      [[fallthrough]];
    case Token::Type::URL_QUERY:
      [[fallthrough]];
    case Token::Type::URL_PATH_PART:
      [[fallthrough]];
    case Token::Type::URL_REF:
      [[fallthrough]];
    case Token::Type::URL_SUFFIX:
      [[fallthrough]];
    case Token::Type::URL_SUFFIX_INNERMOST:
      [[fallthrough]];
    case Token::Type::TRIGRAM:
      [[fallthrough]];
    case Token::Type::REGULAR:
      return normalizer.FindNormalizedMatchEndPosition(token.text,
                                                       match_query_term);
  }
}

class TokenMatcher {
 public:
  virtual ~TokenMatcher() = default;

  // Returns a CharacterIterator pointing just past the end of the substring in
  // token.text that matches a query term. Note that the utf* indices will be
  // in relation to token.text's start.
  //
  // If there is no match, then it will construct a CharacterIterator with all
  // of its indices set to -1.
  //
  // Ex. With an exact matcher, query terms=["foo","bar"] and token.text="bar",
  // Matches will return a CharacterIterator(u8:3, u16:3, u32:3).
  virtual CharacterIterator Matches(Token token) const = 0;
};

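// Matches tokens whose normalized text is exactly equal to a query term, from
// either the unrestricted or the restricted query term set.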
class TokenMatcherExact : public TokenMatcher {
 public:
  explicit TokenMatcherExact(
      const std::unordered_set<std::string>& unrestricted_query_terms,
      const std::unordered_set<std::string>& restricted_query_terms,
      const Normalizer& normalizer)
      : unrestricted_query_terms_(unrestricted_query_terms),
        restricted_query_terms_(restricted_query_terms),
        normalizer_(normalizer) {}

  CharacterIterator Matches(Token token) const override {
    Normalizer::NormalizedTerm normalized_term =
        NormalizeToken(normalizer_, token);
    // Look up the normalized text in the unrestricted terms first, then in
    // the restricted terms. Each iterator is compared against its own
    // container's end() to avoid comparing iterators from different
    // containers.
    auto itr = unrestricted_query_terms_.find(normalized_term.text);
    if (itr != unrestricted_query_terms_.end()) {
      return FindMatchEnd(normalizer_, token, *itr);
    }
    itr = restricted_query_terms_.find(normalized_term.text);
    if (itr != restricted_query_terms_.end()) {
      return FindMatchEnd(normalizer_, token, *itr);
    }
    return CharacterIterator(token.text, -1, -1, -1);
  }

 private:
  const std::unordered_set<std::string>& unrestricted_query_terms_;
  const std::unordered_set<std::string>& restricted_query_terms_;
  const Normalizer& normalizer_;
};

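// Matches tokens for which a query term is a prefix of the token's normalized
// text, e.g. query term "foo" would match a token with text "football".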
class TokenMatcherPrefix : public TokenMatcher {
 public:
  explicit TokenMatcherPrefix(
      const std::unordered_set<std::string>& unrestricted_query_terms,
      const std::unordered_set<std::string>& restricted_query_terms,
      const Normalizer& normalizer)
      : unrestricted_query_terms_(unrestricted_query_terms),
        restricted_query_terms_(restricted_query_terms),
        normalizer_(normalizer) {}

  CharacterIterator Matches(Token token) const override {
    Normalizer::NormalizedTerm normalized_term =
        NormalizeToken(normalizer_, token);
    for (const std::string& query_term : unrestricted_query_terms_) {
      if (query_term.length() <= normalized_term.text.length() &&
          normalized_term.text.compare(0, query_term.length(), query_term) ==
              0) {
        return FindMatchEnd(normalizer_, token, query_term);
      }
    }
    for (const std::string& query_term : restricted_query_terms_) {
      if (query_term.length() <= normalized_term.text.length() &&
          normalized_term.text.compare(0, query_term.length(), query_term) ==
              0) {
        return FindMatchEnd(normalizer_, token, query_term);
      }
    }
    return CharacterIterator(token.text, -1, -1, -1);
  }

 private:
  const std::unordered_set<std::string>& unrestricted_query_terms_;
  const std::unordered_set<std::string>& restricted_query_terms_;
  const Normalizer& normalizer_;
};

libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher(
    TermMatchType::Code match_type,
    const std::unordered_set<std::string>& unrestricted_query_terms,
    const std::unordered_set<std::string>& restricted_query_terms,
    const Normalizer& normalizer) {
  switch (match_type) {
    case TermMatchType::EXACT_ONLY:
      return std::make_unique<TokenMatcherExact>(
          unrestricted_query_terms, restricted_query_terms, normalizer);
    case TermMatchType::PREFIX:
      return std::make_unique<TokenMatcherPrefix>(
          unrestricted_query_terms, restricted_query_terms, normalizer);
    case TermMatchType::UNKNOWN:
      [[fallthrough]];
    default:
      return absl_ports::InvalidArgumentError("Invalid match type provided.");
  }
}

// Finds the start position of a valid token that is after
// window_start_min_exclusive_utf32.
//
// Returns:
//   the position of the window start if successful
//   INTERNAL_ERROR - if a tokenizer error is encountered
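//
// For example (illustrative): tokenizing "foo bar baz" with
// window_start_min_exclusive_utf32=2 would reset to the token "bar", whose
// start position, 4, becomes the window start.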
libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    std::string_view value, int window_start_min_exclusive_utf32,
    Tokenizer::Iterator* iterator) {
  if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) {
    return absl_ports::InternalError(
        "Couldn't reset tokenizer to determine snippet window!");
  }
  return iterator->CalculateTokenStart();
}

// Advances window_end_exclusive past trailing punctuation, so long as
// window_end_exclusive does not exceed window_end_max_exclusive_utf32.
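// For example (illustrative), if the window would otherwise end just after
// "baz" in "foo bar baz!!", the two exclamation marks are pulled into the
// window as long as the maximum window size permits.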
CharacterIterator IncludeTrailingPunctuation(
    std::string_view value, CharacterIterator window_end_exclusive,
    int window_end_max_exclusive_utf32) {
  size_t max_search_index = value.length() - 1;
  while (window_end_exclusive.utf8_index() <= max_search_index &&
         window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
    int char_len = 0;
    if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(),
                                     &char_len)) {
      break;
    }
    // Expand the window by one character and check the next one.
    window_end_exclusive.AdvanceToUtf32(window_end_exclusive.utf32_index() + 1);
  }
  return window_end_exclusive;
}

// Finds the end position of a valid token that is before the
// window_end_max_exclusive_utf32.
//
// Returns:
//   the position of the window end if successful
//   INTERNAL_ERROR - if a tokenizer error is encountered
libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    std::string_view value, int window_end_max_exclusive_utf32,
    Tokenizer::Iterator* iterator) {
  if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) {
    return absl_ports::InternalError(
        "Couldn't reset tokenizer to determine snippet window!");
  }
  ICING_ASSIGN_OR_RETURN(CharacterIterator end_exclusive,
                         iterator->CalculateTokenEndExclusive());
  return IncludeTrailingPunctuation(value, end_exclusive,
                                    window_end_max_exclusive_utf32);
}

struct SectionData {
  std::string_view section_name;
  std::string_view section_subcontent;
};

// Creates a snippet match proto for the match pointed to by the iterator,
// between start_itr and end_itr.
//
// Returns:
//   a SnippetMatchProto for the match if successful
//   INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left
//     in an invalid state
//   ABORTED_ERROR - if an invalid utf-8 sequence is encountered
libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    const SectionData& value, Tokenizer::Iterator* iterator,
    const CharacterIterator& start_itr, const CharacterIterator& end_itr) {
  SnippetMatchProto snippet_match;
  // When finding boundaries, we have a few cases:
  //
  // Case 1:
  // If we have an odd length match with an odd length window, the window
  // surrounds the match perfectly.
  //   match  = "bar" in "foo bar baz"
  //   window =              |---|
  //
  // Case 2:
  // If we have an even length match with an even length window, the window
  // surrounds the match perfectly.
  //   match  = "baar" in "foo baar baz"
  //   window =               |----|
  //
  // Case 3:
  // If we have an odd length match with an even length window, we allocate
  // that extra window character to the beginning.
  //   match  = "bar" in "foo bar baz"
  //   window =             |----|
  //
  // Case 4:
  // If we have an even length match with an odd length window, we allocate
  // that extra window character to the end.
  //   match  = "baar" in "foo baar baz"
  //   window =               |-----|
  //
  // We have to do +1/-1 below to get the math to match up.
  int match_pos_utf32 = start_itr.utf32_index();
  int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
  int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
  int window_start_min_exclusive_utf32 =
      (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1;
  int window_end_max_exclusive_utf32 =
      match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2;
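  // Worked example (illustrative): for match "bar" in "foo bar baz"
  // (match_pos_utf32=4, match_len_utf32=3) and max_window_utf32_length()=8:
  //   match_mid_utf32                  = 4 + 3 / 2       = 5
  //   window_start_min_exclusive_utf32 = (5 - 8 / 2) - 1 = 0
  //   window_end_max_exclusive_utf32   = 5 + (8 + 1) / 2 = 9
  // so the window would span positions 1 through 8, i.e. "oo bar b", before
  // being snapped to token boundaries and trailing punctuation below.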

  snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
  snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
  snippet_match.set_exact_match_byte_length(end_itr.utf8_index() -
                                            start_itr.utf8_index());
  snippet_match.set_exact_match_utf16_length(end_itr.utf16_index() -
                                             start_itr.utf16_index());

  // Only include a window if it will at least include the matched text.
  // Otherwise, it would just be an empty string anyway.
  if (snippet_spec.max_window_utf32_length() >= match_len_utf32) {
    // Find the beginning of the window.
    ICING_ASSIGN_OR_RETURN(
        CharacterIterator window_start,
        DetermineWindowStart(snippet_spec, value.section_subcontent,
                             window_start_min_exclusive_utf32, iterator));

    // Check: did we get fewer characters at the start than we requested? If
    // so, then add the extra space on to the window end.
    int extra_window_space =
        window_start.utf32_index() - 1 - window_start_min_exclusive_utf32;
    window_end_max_exclusive_utf32 += extra_window_space;

    // Find the end of the window.
    ICING_ASSIGN_OR_RETURN(
        CharacterIterator window_end,
        DetermineWindowEnd(snippet_spec, value.section_subcontent,
                           window_end_max_exclusive_utf32, iterator));

    // Check one more time: did we get fewer characters at the end than we
    // requested? If so, then see if we can push the start back again.
    extra_window_space =
        window_end_max_exclusive_utf32 - window_end.utf32_index();
    if (extra_window_space > 0) {
      window_start_min_exclusive_utf32 =
          window_start.utf32_index() - 1 - extra_window_space;
      ICING_ASSIGN_OR_RETURN(
          window_start,
          DetermineWindowStart(snippet_spec, value.section_subcontent,
                               window_start_min_exclusive_utf32, iterator));
    }

    snippet_match.set_window_byte_position(window_start.utf8_index());
    snippet_match.set_window_utf16_position(window_start.utf16_index());
    snippet_match.set_window_byte_length(window_end.utf8_index() -
                                         window_start.utf8_index());
    snippet_match.set_window_utf16_length(window_end.utf16_index() -
                                          window_start.utf16_index());

    // DetermineWindowStart/End may change the position of the iterator, but it
    // will be reset once the entire batch of tokens is checked.
  }

  return snippet_match;
}

struct MatchOptions {
  const ResultSpecProto::SnippetSpecProto& snippet_spec;
  int max_matches_remaining;
};

// Retrieves snippets in the string values of current_property.
// Tokenizer is provided to tokenize string content and matcher is provided to
// indicate when a token matches content in the query.
//
// current_property is the property with the string values to snippet.
// property_path is the path in the document to current_property.
//
// MatchOptions holds the snippet spec and the number of desired matches
// remaining. Each call to GetEntriesFromProperty will decrement
// max_matches_remaining by the number of matches that it adds to
// snippet_proto.
//
// The SnippetEntries found for matched content will be added to snippet_proto.
void GetEntriesFromProperty(const PropertyProto* current_property,
                            const std::string& property_path,
                            const TokenMatcher* matcher,
                            const Tokenizer* tokenizer,
                            MatchOptions* match_options,
                            SnippetProto* snippet_proto) {
  // Check every string value in this property for matches.
  for (int i = 0; i < current_property->string_values_size(); ++i) {
    SnippetProto::EntryProto snippet_entry;
    snippet_entry.set_property_name(AddIndexToPath(
        current_property->string_values_size(), /*index=*/i, property_path));
    std::string_view value = current_property->string_values(i);
    std::unique_ptr<Tokenizer::Iterator> iterator =
        tokenizer->Tokenize(value).ValueOrDie();
    // All iterators are moved through positions sequentially. Constructing
    // them each time resets them to the beginning of the string. This means
    // that, for t tokens in a string of n chars, each MoveToUtf8 call from the
    // beginning of the string is on average O(n/2), whereas calling MoveToUtf8
    // from the token immediately prior to the desired one is O(n/t).
    // Constructing each iterator outside of the while-loop ensures that
    // performance will be O(t * (n/t)) = O(n) rather than O(t * n / 2).
    CharacterIterator start_itr(value);
    CharacterIterator end_itr(value);
    CharacterIterator reset_itr(value);
    bool encountered_error = false;
    while (iterator->Advance()) {
      std::vector<Token> batch_tokens = iterator->GetTokens();
      if (batch_tokens.empty()) {
        continue;
      }

      bool needs_reset = false;
      reset_itr.MoveToUtf8(batch_tokens.at(0).text.begin() - value.begin());
      start_itr = reset_itr;
      end_itr = start_itr;
      for (size_t j = 0; j < batch_tokens.size(); ++j) {
        const Token& token = batch_tokens.at(j);
        CharacterIterator submatch_end = matcher->Matches(token);
        // If the token matched a query term, then submatch_end will point to
        // an actual position within token.text.
        if (submatch_end.utf8_index() == -1) {
          continue;
        }
        // As snippet matching may move the iterator around, we saved a reset
        // iterator above so that we can restore the initial iterator state
        // and continue Advancing in order in the next round.
        if (!start_itr.MoveToUtf8(token.text.begin() - value.begin())) {
          encountered_error = true;
          break;
        }
        if (!end_itr.MoveToUtf8(token.text.end() - value.begin())) {
          encountered_error = true;
          break;
        }
        SectionData data = {property_path, value};
        auto match_or = RetrieveMatch(match_options->snippet_spec, data,
                                      iterator.get(), start_itr, end_itr);
        if (!match_or.ok()) {
          if (absl_ports::IsAborted(match_or.status())) {
            // Only an ABORTED error. We can't retrieve this match, but we
            // might still be able to retrieve others. Just continue.
            continue;
          } else {
            encountered_error = true;
            break;
          }
        }
        SnippetMatchProto match = std::move(match_or).ValueOrDie();
        if (match.window_byte_length() > 0) {
          needs_reset = true;
        }
        // submatch_end refers to a position *within* token.text. This,
        // conveniently enough, means that the index that submatch_end points
        // to is the length of the submatch (because the submatch starts at
        // position 0 in token.text).
        match.set_submatch_byte_length(submatch_end.utf8_index());
        match.set_submatch_utf16_length(submatch_end.utf16_index());
        // Add the values for the submatch.
        snippet_entry.mutable_snippet_matches()->Add(std::move(match));

        if (--match_options->max_matches_remaining <= 0) {
          *snippet_proto->add_entries() = std::move(snippet_entry);
          return;
        }
      }

      if (encountered_error) {
        break;
      }

      // RetrieveMatch may call DetermineWindowStart/End if windowing is
      // requested, which may change the position of the iterator. So, reset
      // the iterator back to the original position. The first token of the
      // token batch will be the token to reset to.
      if (needs_reset) {
        if (reset_itr.utf8_index() > 0) {
          encountered_error =
              !iterator->ResetToTokenStartingAfter(reset_itr.utf32_index() - 1);
        } else {
          encountered_error = !iterator->ResetToStart();
        }
      }
      if (encountered_error) {
        break;
      }
    }
    if (!snippet_entry.snippet_matches().empty()) {
      *snippet_proto->add_entries() = std::move(snippet_entry);
    }
  }
}

// Retrieves snippets in document from content at section_path.
// Tokenizer is provided to tokenize string content and matcher is provided to
// indicate when a token matches content in the query.
//
// section_path_index refers to the current property that is held by document.
// current_path is equivalent to the first section_path_index values in
// section_path, but with value indices present.
//
// For example, suppose that a hit appeared somewhere in the "bcc.emailAddress"
// property. The arguments for RetrieveSnippetForSection might be
// {section_path=["bcc", "emailAddress"], section_path_index=0, current_path=""}
// on the first call and
// {section_path=["bcc", "emailAddress"], section_path_index=1,
//  current_path="bcc[1]"} on the second, recursive call.
//
// MatchOptions holds the snippet spec and the number of desired matches
// remaining. Each call to RetrieveSnippetForSection will decrement
// max_matches_remaining by the number of matches that it adds to
// snippet_proto.
//
// The SnippetEntries found for matched content will be added to snippet_proto.
void RetrieveSnippetForSection(
    const DocumentProto& document, const TokenMatcher* matcher,
    const Tokenizer* tokenizer,
    const std::vector<std::string_view>& section_path, int section_path_index,
    const std::string& current_path, MatchOptions* match_options,
    SnippetProto* snippet_proto) {
  std::string_view next_property_name = section_path.at(section_path_index);
  const PropertyProto* current_property =
      property_util::GetPropertyProto(document, next_property_name);
  if (current_property == nullptr) {
    ICING_VLOG(1) << "No property " << next_property_name << " found at path "
                  << current_path;
    return;
  }
  std::string property_path = property_util::ConcatenatePropertyPathExpr(
      current_path, next_property_name);
  if (section_path_index == section_path.size() - 1) {
    // We're at the end of the section path. Let's check our string values.
    GetEntriesFromProperty(current_property, property_path, matcher, tokenizer,
                           match_options, snippet_proto);
  } else {
    // Still got more to go. Let's look through our subdocuments.
    for (int i = 0; i < current_property->document_values_size(); ++i) {
      std::string new_path = AddIndexToPath(
          current_property->document_values_size(), /*index=*/i, property_path);
      RetrieveSnippetForSection(current_property->document_values(i), matcher,
                                tokenizer, section_path, section_path_index + 1,
                                new_path, match_options, snippet_proto);
      if (match_options->max_matches_remaining <= 0) {
        break;
      }
    }
  }
}

}  // namespace

libtextclassifier3::StatusOr<std::unique_ptr<SnippetRetriever>>
SnippetRetriever::Create(const SchemaStore* schema_store,
                         const LanguageSegmenter* language_segmenter,
                         const Normalizer* normalizer) {
  ICING_RETURN_ERROR_IF_NULL(schema_store);
  ICING_RETURN_ERROR_IF_NULL(language_segmenter);
  ICING_RETURN_ERROR_IF_NULL(normalizer);

  return std::unique_ptr<SnippetRetriever>(
      new SnippetRetriever(schema_store, language_segmenter, normalizer));
}

SnippetProto SnippetRetriever::RetrieveSnippet(
    const SectionRestrictQueryTermsMap& query_terms,
    TermMatchType::Code match_type,
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    const DocumentProto& document, SectionIdMask section_id_mask) const {
  SnippetProto snippet_proto;
  ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
                         schema_store_.GetSchemaTypeId(document.schema()),
                         snippet_proto);
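  // Query terms restricted to a specific section are keyed by that section's
  // property path; unrestricted terms are stored under the empty string key.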
  const std::unordered_set<std::string> empty_set;
  auto itr = query_terms.find("");
  const std::unordered_set<std::string>& unrestricted_set =
      (itr != query_terms.end()) ? itr->second : empty_set;
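  // Visit each section id set in the mask, lowest bit first. For example
  // (illustrative), a mask of 0b1010 would visit section ids 1 and 3, in that
  // order.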
  while (section_id_mask != kSectionIdMaskNone) {
    SectionId section_id = __builtin_ctzll(section_id_mask);
    // Remove this section from the mask.
    section_id_mask &= ~(UINT64_C(1) << section_id);

    MatchOptions match_options = {snippet_spec};
    match_options.max_matches_remaining =
        snippet_spec.num_matches_per_property();

    // Determine the section name and match type.
    auto section_metadata_or =
        schema_store_.GetSectionMetadata(type_id, section_id);
    if (!section_metadata_or.ok()) {
      continue;
    }
    const SectionMetadata* metadata = section_metadata_or.ValueOrDie();
    std::vector<std::string_view> section_path =
        property_util::SplitPropertyPathExpr(metadata->path);

    // The match type must be as restrictive as possible. Prefix matches for a
    // snippet should only be included if both the query uses PREFIX matching
    // and the section has prefix matching enabled.
    TermMatchType::Code section_match_type = TermMatchType::EXACT_ONLY;
    if (match_type == TermMatchType::PREFIX &&
        metadata->term_match_type == TermMatchType::PREFIX) {
      section_match_type = TermMatchType::PREFIX;
    }

    itr = query_terms.find(metadata->path);
    const std::unordered_set<std::string>& restricted_set =
        (itr != query_terms.end()) ? itr->second : empty_set;
    libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> matcher_or =
        CreateTokenMatcher(section_match_type, unrestricted_set, restricted_set,
                           normalizer_);
    if (!matcher_or.ok()) {
      continue;
    }
    std::unique_ptr<TokenMatcher> matcher = std::move(matcher_or).ValueOrDie();

    auto tokenizer_or = tokenizer_factory::CreateIndexingTokenizer(
        metadata->tokenizer, &language_segmenter_);
    if (!tokenizer_or.ok()) {
      // If we couldn't create the tokenizer properly, just skip this section.
      continue;
    }
    std::unique_ptr<Tokenizer> tokenizer = std::move(tokenizer_or).ValueOrDie();
    RetrieveSnippetForSection(
        document, matcher.get(), tokenizer.get(), section_path,
        /*section_path_index=*/0, "", &match_options, &snippet_proto);
  }
  return snippet_proto;
}

}  // namespace lib
}  // namespace icing