xref: /aosp_15_r20/external/icing/icing/result/snippet-retriever.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/result/snippet-retriever.h"
16 
17 #include <algorithm>
18 #include <iterator>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26 
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "icing/absl_ports/canonical_errors.h"
29 #include "icing/absl_ports/str_cat.h"
30 #include "icing/proto/document.pb.h"
31 #include "icing/proto/search.pb.h"
32 #include "icing/proto/term.pb.h"
33 #include "icing/query/query-terms.h"
34 #include "icing/schema/property-util.h"
35 #include "icing/schema/schema-store.h"
36 #include "icing/schema/section.h"
37 #include "icing/store/document-filter-data.h"
38 #include "icing/tokenization/language-segmenter.h"
39 #include "icing/tokenization/token.h"
40 #include "icing/tokenization/tokenizer-factory.h"
41 #include "icing/tokenization/tokenizer.h"
42 #include "icing/transform/normalizer.h"
43 #include "icing/util/character-iterator.h"
44 #include "icing/util/i18n-utils.h"
45 #include "icing/util/logging.h"
46 #include "icing/util/status-macros.h"
47 
48 namespace icing {
49 namespace lib {
50 
51 namespace {
52 
AddIndexToPath(int values_size,int index,const std::string & property_path)53 inline std::string AddIndexToPath(int values_size, int index,
54                                   const std::string& property_path) {
55   if (values_size == 1) {
56     return property_path;
57   }
58   return absl_ports::StrCat(
59       property_path, property_util::ConvertToPropertyExprIndexStr(index));
60 }
61 
62 // Returns a string of the normalized text of the input Token. Normalization
63 // is applied based on the Token's type.
NormalizeToken(const Normalizer & normalizer,const Token & token)64 Normalizer::NormalizedTerm NormalizeToken(const Normalizer& normalizer,
65                                           const Token& token) {
66   switch (token.type) {
67     case Token::Type::RFC822_NAME:
68       [[fallthrough]];
69     case Token::Type::RFC822_COMMENT:
70       [[fallthrough]];
71     case Token::Type::RFC822_LOCAL_ADDRESS:
72       [[fallthrough]];
73     case Token::Type::RFC822_HOST_ADDRESS:
74       [[fallthrough]];
75     case Token::Type::RFC822_ADDRESS:
76       [[fallthrough]];
77     case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
78       [[fallthrough]];
79     case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
80       [[fallthrough]];
81     case Token::Type::RFC822_TOKEN:
82       [[fallthrough]];
83     case Token::Type::URL_SCHEME:
84       [[fallthrough]];
85     case Token::Type::URL_USERNAME:
86       [[fallthrough]];
87     case Token::Type::URL_PASSWORD:
88       [[fallthrough]];
89     case Token::Type::URL_HOST_COMMON_PART:
90       [[fallthrough]];
91     case Token::Type::URL_HOST_SIGNIFICANT_PART:
92       [[fallthrough]];
93     case Token::Type::URL_PORT:
94       [[fallthrough]];
95     case Token::Type::URL_PATH_PART:
96       [[fallthrough]];
97     case Token::Type::URL_QUERY:
98       [[fallthrough]];
99     case Token::Type::URL_REF:
100       [[fallthrough]];
101     case Token::Type::URL_SUFFIX:
102       [[fallthrough]];
103     case Token::Type::URL_SUFFIX_INNERMOST:
104       [[fallthrough]];
105     case Token::Type::TRIGRAM:
106       [[fallthrough]];
107     case Token::Type::REGULAR:
108       return normalizer.NormalizeTerm(token.text);
109     case Token::Type::VERBATIM:
110       return {std::string(token.text)};
111     case Token::Type::QUERY_EXCLUSION:
112       [[fallthrough]];
113     case Token::Type::QUERY_LEFT_PARENTHESES:
114       [[fallthrough]];
115     case Token::Type::QUERY_RIGHT_PARENTHESES:
116       [[fallthrough]];
117     case Token::Type::QUERY_OR:
118       [[fallthrough]];
119     case Token::Type::QUERY_PROPERTY:
120       [[fallthrough]];
121     case Token::Type::INVALID:
122       ICING_LOG(WARNING) << "Unable to normalize token of type: "
123                          << static_cast<int>(token.type);
124       return {std::string(token.text)};
125   }
126 }
127 
128 // Returns a CharacterIterator for token's text, advancing one past the last
129 // matching character from the query term.
FindMatchEnd(const Normalizer & normalizer,const Token & token,const std::string & match_query_term)130 CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
131                                const std::string& match_query_term) {
132   switch (token.type) {
133     case Token::Type::VERBATIM: {
134       // VERBATIM tokens are not normalized. This means the non-normalized
135       // matched query term must be either equal to or a prefix of the token's
136       // text. Therefore, the match must end at the end of the matched query
137       // term.
138       CharacterIterator verbatim_match_end =
139           CharacterIterator(token.text, 0, 0, 0);
140       verbatim_match_end.AdvanceToUtf8(match_query_term.length());
141       return verbatim_match_end;
142     }
143     case Token::Type::QUERY_EXCLUSION:
144       [[fallthrough]];
145     case Token::Type::QUERY_LEFT_PARENTHESES:
146       [[fallthrough]];
147     case Token::Type::QUERY_RIGHT_PARENTHESES:
148       [[fallthrough]];
149     case Token::Type::QUERY_OR:
150       [[fallthrough]];
151     case Token::Type::QUERY_PROPERTY:
152       [[fallthrough]];
153     case Token::Type::INVALID:
154       ICING_LOG(WARNING)
155           << "Unexpected Token type " << static_cast<int>(token.type)
156           << " found when finding match end of query term and token.";
157       [[fallthrough]];
158     case Token::Type::RFC822_NAME:
159       [[fallthrough]];
160     case Token::Type::RFC822_COMMENT:
161       [[fallthrough]];
162     case Token::Type::RFC822_LOCAL_ADDRESS:
163       [[fallthrough]];
164     case Token::Type::RFC822_HOST_ADDRESS:
165       [[fallthrough]];
166     case Token::Type::RFC822_ADDRESS:
167       [[fallthrough]];
168     case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
169       [[fallthrough]];
170     case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
171       [[fallthrough]];
172     case Token::Type::RFC822_TOKEN:
173       [[fallthrough]];
174     case Token::Type::URL_SCHEME:
175       [[fallthrough]];
176     case Token::Type::URL_USERNAME:
177       [[fallthrough]];
178     case Token::Type::URL_PASSWORD:
179       [[fallthrough]];
180     case Token::Type::URL_HOST_COMMON_PART:
181       [[fallthrough]];
182     case Token::Type::URL_HOST_SIGNIFICANT_PART:
183       [[fallthrough]];
184     case Token::Type::URL_PORT:
185       [[fallthrough]];
186     case Token::Type::URL_QUERY:
187       [[fallthrough]];
188     case Token::Type::URL_PATH_PART:
189       [[fallthrough]];
190     case Token::Type::URL_REF:
191       [[fallthrough]];
192     case Token::Type::URL_SUFFIX:
193       [[fallthrough]];
194     case Token::Type::URL_SUFFIX_INNERMOST:
195       [[fallthrough]];
196     case Token::Type::TRIGRAM:
197       [[fallthrough]];
198     case Token::Type::REGULAR:
199       return normalizer.FindNormalizedMatchEndPosition(token.text,
200                                                        match_query_term);
201   }
202 }
203 
// Interface for deciding whether a token's text matches one of the query
// terms that produced the search result being snippeted.
class TokenMatcher {
 public:
  virtual ~TokenMatcher() = default;

  // Returns a CharacterIterator pointing just past the end of the substring in
  // token.text that matches a query term. Note that the utf* indices will be
  // in relation to token.text's start.
  //
  // If there is no match, then it will construct a CharacterIterator with all
  // of its indices set to -1.
  //
  // Ex. With an exact matcher, query terms=["foo","bar"] and token.text="bar",
  // Matches will return a CharacterIterator(u8:3, u16:3, u32:3).
  virtual CharacterIterator Matches(Token token) const = 0;
};
219 
220 class TokenMatcherExact : public TokenMatcher {
221  public:
TokenMatcherExact(const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)222   explicit TokenMatcherExact(
223       const std::unordered_set<std::string>& unrestricted_query_terms,
224       const std::unordered_set<std::string>& restricted_query_terms,
225       const Normalizer& normalizer)
226       : unrestricted_query_terms_(unrestricted_query_terms),
227         restricted_query_terms_(restricted_query_terms),
228         normalizer_(normalizer) {}
229 
Matches(Token token) const230   CharacterIterator Matches(Token token) const override {
231     Normalizer::NormalizedTerm normalized_term =
232         NormalizeToken(normalizer_, token);
233     auto itr = unrestricted_query_terms_.find(normalized_term.text);
234     if (itr == unrestricted_query_terms_.end()) {
235       itr = restricted_query_terms_.find(normalized_term.text);
236     }
237     if (itr != unrestricted_query_terms_.end() &&
238         itr != restricted_query_terms_.end()) {
239       return FindMatchEnd(normalizer_, token, *itr);
240     }
241 
242     return CharacterIterator(token.text, -1, -1, -1);
243   }
244 
245  private:
246   const std::unordered_set<std::string>& unrestricted_query_terms_;
247   const std::unordered_set<std::string>& restricted_query_terms_;
248   const Normalizer& normalizer_;
249 };
250 
251 class TokenMatcherPrefix : public TokenMatcher {
252  public:
TokenMatcherPrefix(const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)253   explicit TokenMatcherPrefix(
254       const std::unordered_set<std::string>& unrestricted_query_terms,
255       const std::unordered_set<std::string>& restricted_query_terms,
256       const Normalizer& normalizer)
257       : unrestricted_query_terms_(unrestricted_query_terms),
258         restricted_query_terms_(restricted_query_terms),
259         normalizer_(normalizer) {}
260 
Matches(Token token) const261   CharacterIterator Matches(Token token) const override {
262     Normalizer::NormalizedTerm normalized_term =
263         NormalizeToken(normalizer_, token);
264     for (const std::string& query_term : unrestricted_query_terms_) {
265       if (query_term.length() <= normalized_term.text.length() &&
266           normalized_term.text.compare(0, query_term.length(), query_term) ==
267               0) {
268         return FindMatchEnd(normalizer_, token, query_term);
269       }
270     }
271     for (const std::string& query_term : restricted_query_terms_) {
272       if (query_term.length() <= normalized_term.text.length() &&
273           normalized_term.text.compare(0, query_term.length(), query_term) ==
274               0) {
275         return FindMatchEnd(normalizer_, token, query_term);
276       }
277     }
278     return CharacterIterator(token.text, -1, -1, -1);
279   }
280 
281  private:
282   const std::unordered_set<std::string>& unrestricted_query_terms_;
283   const std::unordered_set<std::string>& restricted_query_terms_;
284   const Normalizer& normalizer_;
285 };
286 
CreateTokenMatcher(TermMatchType::Code match_type,const std::unordered_set<std::string> & unrestricted_query_terms,const std::unordered_set<std::string> & restricted_query_terms,const Normalizer & normalizer)287 libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher(
288     TermMatchType::Code match_type,
289     const std::unordered_set<std::string>& unrestricted_query_terms,
290     const std::unordered_set<std::string>& restricted_query_terms,
291     const Normalizer& normalizer) {
292   switch (match_type) {
293     case TermMatchType::EXACT_ONLY:
294       return std::make_unique<TokenMatcherExact>(
295           unrestricted_query_terms, restricted_query_terms, normalizer);
296     case TermMatchType::PREFIX:
297       return std::make_unique<TokenMatcherPrefix>(
298           unrestricted_query_terms, restricted_query_terms, normalizer);
299     case TermMatchType::UNKNOWN:
300       [[fallthrough]];
301     default:
302       return absl_ports::InvalidArgumentError("Invalid match type provided.");
303   }
304 }
305 
306 // Finds the start position of a valid token that is after
307 // window_start_min_exclusive_utf32
308 //
309 // Returns:
310 //   the position of the window start if successful
311 //   INTERNAL_ERROR - if a tokenizer error is encountered
DetermineWindowStart(const ResultSpecProto::SnippetSpecProto & snippet_spec,std::string_view value,int window_start_min_exclusive_utf32,Tokenizer::Iterator * iterator)312 libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
313     const ResultSpecProto::SnippetSpecProto& snippet_spec,
314     std::string_view value, int window_start_min_exclusive_utf32,
315     Tokenizer::Iterator* iterator) {
316   if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) {
317     return absl_ports::InternalError(
318         "Couldn't reset tokenizer to determine snippet window!");
319   }
320   return iterator->CalculateTokenStart();
321 }
322 
323 // Increments window_end_exclusive so long as the character at the position
324 // of window_end_exclusive is punctuation and does not exceed
325 // window_end_max_exclusive_utf32.
IncludeTrailingPunctuation(std::string_view value,CharacterIterator window_end_exclusive,int window_end_max_exclusive_utf32)326 CharacterIterator IncludeTrailingPunctuation(
327     std::string_view value, CharacterIterator window_end_exclusive,
328     int window_end_max_exclusive_utf32) {
329   size_t max_search_index = value.length() - 1;
330   while (window_end_exclusive.utf8_index() <= max_search_index &&
331          window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
332     int char_len = 0;
333     if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(),
334                                      &char_len)) {
335       break;
336     }
337     // Expand window by char_len and check the next character.
338     window_end_exclusive.AdvanceToUtf32(window_end_exclusive.utf32_index() + 1);
339   }
340   return window_end_exclusive;
341 }
342 
343 // Finds the end position of a valid token that is before the
344 // window_end_max_exclusive_utf32.
345 //
346 // Returns:
347 //   the position of the window end if successful
348 //   INTERNAL_ERROR - if a tokenizer error is encountered
DetermineWindowEnd(const ResultSpecProto::SnippetSpecProto & snippet_spec,std::string_view value,int window_end_max_exclusive_utf32,Tokenizer::Iterator * iterator)349 libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
350     const ResultSpecProto::SnippetSpecProto& snippet_spec,
351     std::string_view value, int window_end_max_exclusive_utf32,
352     Tokenizer::Iterator* iterator) {
353   if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) {
354     return absl_ports::InternalError(
355         "Couldn't reset tokenizer to determine snippet window!");
356   }
357   ICING_ASSIGN_OR_RETURN(CharacterIterator end_exclusive,
358                          iterator->CalculateTokenEndExclusive());
359   return IncludeTrailingPunctuation(value, end_exclusive,
360                                     window_end_max_exclusive_utf32);
361 }
362 
// A section's property path paired with the string content being snippeted
// for it. Both views are non-owning; the underlying strings must outlive
// this struct.
struct SectionData {
  std::string_view section_name;
  std::string_view section_subcontent;
};
367 
// Creates a snippet match proto for the match pointed to by the iterator,
// between start_itr and end_itr
// Returns:
//   the position of the window start if successful
//   INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left
//     in an invalid state
//   ABORTED_ERROR - if an invalid utf-8 sequence is encountered
libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    const SectionData& value, Tokenizer::Iterator* iterator,
    const CharacterIterator& start_itr, const CharacterIterator& end_itr) {
  SnippetMatchProto snippet_match;
  // When finding boundaries,  we have a few cases:
  //
  // Case 1:
  //   If we have an odd length match an odd length window, the window surrounds
  //   the match perfectly.
  //     match  = "bar" in "foo bar baz"
  //     window =              |---|
  //
  // Case 2:
  //   If we have an even length match with an even length window, the window
  //   surrounds the match perfectly.
  //     match  = "baar" in "foo baar baz"
  //     window =               |----|
  //
  // Case 3:
  //   If we have an odd length match with an even length window, we allocate
  //   that extra window byte to the beginning.
  //     match  = "bar" in "foo bar baz"
  //     window =             |----|
  //
  // Case 4:
  //   If we have an even length match with an odd length window, we allocate
  //   that extra window byte to the end.
  //     match  = "baar" in "foo baar baz"
  //     window =               |-----|
  //
  // We have do +1/-1 below to get the math to match up.
  int match_pos_utf32 = start_itr.utf32_index();
  int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
  int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
  // Both window bounds are exclusive: the window is centered on the middle of
  // the match and sized by max_window_utf32_length.
  int window_start_min_exclusive_utf32 =
      (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1;
  int window_end_max_exclusive_utf32 =
      match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2;

  snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
  snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
  snippet_match.set_exact_match_byte_length(end_itr.utf8_index() -
                                            start_itr.utf8_index());
  snippet_match.set_exact_match_utf16_length(end_itr.utf16_index() -
                                             start_itr.utf16_index());

  // Only include windows if it'll at least include the matched text. Otherwise,
  // it'll just be an empty string anyways.
  if (snippet_spec.max_window_utf32_length() >= match_len_utf32) {
    // Find the beginning of the window.
    ICING_ASSIGN_OR_RETURN(
        CharacterIterator window_start,
        DetermineWindowStart(snippet_spec, value.section_subcontent,
                             window_start_min_exclusive_utf32, iterator));

    // Check. Did we get fewer characters than we requested? If so, then add it
    // on to the window_end.
    int extra_window_space =
        window_start.utf32_index() - 1 - window_start_min_exclusive_utf32;
    window_end_max_exclusive_utf32 += extra_window_space;

    // Find the end of the window.
    ICING_ASSIGN_OR_RETURN(
        CharacterIterator window_end,
        DetermineWindowEnd(snippet_spec, value.section_subcontent,
                           window_end_max_exclusive_utf32, iterator));

    // Check one more time. Did we get fewer characters than we requested? If
    // so, then see if we can push the start back again.
    extra_window_space =
        window_end_max_exclusive_utf32 - window_end.utf32_index();
    if (extra_window_space > 0) {
      window_start_min_exclusive_utf32 =
          window_start.utf32_index() - 1 - extra_window_space;
      ICING_ASSIGN_OR_RETURN(
          window_start,
          DetermineWindowStart(snippet_spec, value.section_subcontent,
                               window_start_min_exclusive_utf32, iterator));
    }

    snippet_match.set_window_byte_position(window_start.utf8_index());
    snippet_match.set_window_utf16_position(window_start.utf16_index());
    snippet_match.set_window_byte_length(window_end.utf8_index() -
                                         window_start.utf8_index());
    snippet_match.set_window_utf16_length(window_end.utf16_index() -
                                          window_start.utf16_index());

    // DetermineWindowStart/End may change the position of the iterator, but it
    // will be reset once the entire batch of tokens is checked.
  }

  return snippet_match;
}
469 
// Snippeting parameters: the spec to apply plus a running budget of how many
// more matches may still be added across the current property.
struct MatchOptions {
  const ResultSpecProto::SnippetSpecProto& snippet_spec;
  int max_matches_remaining;
};
474 
475 // Retrieves snippets in the string values of current_property.
476 // Tokenizer is provided to tokenize string content and matcher is provided to
477 // indicate when a token matches content in the query.
478 //
479 // current_property is the property with the string values to snippet.
480 // property_path is the path in the document to current_property.
481 //
482 // MatchOptions holds the snippet spec and number of desired matches remaining.
483 // Each call to GetEntriesFromProperty will decrement max_matches_remaining
484 // by the number of entries that it adds to snippet_proto.
485 //
486 // The SnippetEntries found for matched content will be added to snippet_proto.
GetEntriesFromProperty(const PropertyProto * current_property,const std::string & property_path,const TokenMatcher * matcher,const Tokenizer * tokenizer,MatchOptions * match_options,SnippetProto * snippet_proto)487 void GetEntriesFromProperty(const PropertyProto* current_property,
488                             const std::string& property_path,
489                             const TokenMatcher* matcher,
490                             const Tokenizer* tokenizer,
491                             MatchOptions* match_options,
492                             SnippetProto* snippet_proto) {
493   // We're at the end. Let's check our values.
494   for (int i = 0; i < current_property->string_values_size(); ++i) {
495     SnippetProto::EntryProto snippet_entry;
496     snippet_entry.set_property_name(AddIndexToPath(
497         current_property->string_values_size(), /*index=*/i, property_path));
498     std::string_view value = current_property->string_values(i);
499     std::unique_ptr<Tokenizer::Iterator> iterator =
500         tokenizer->Tokenize(value).ValueOrDie();
501     // All iterators are moved through positions sequentially. Constructing them
502     // each time resets them to the beginning of the string. This means that,
503     // for t tokens and in a string of n chars, each MoveToUtf8 call from the
504     // beginning of the string is on average O(n/2), whereas calling MoveToUtf8
505     // from the token immediately prior to the desired one is O(n/t).
506     // Constructing each outside of the while-loop ensures that performance will
507     // be O(t * (n/t)) = O(n) rather than O(t * n / 2).
508     CharacterIterator start_itr(value);
509     CharacterIterator end_itr(value);
510     CharacterIterator reset_itr(value);
511     bool encountered_error = false;
512     while (iterator->Advance()) {
513       std::vector<Token> batch_tokens = iterator->GetTokens();
514       if (batch_tokens.empty()) {
515         continue;
516       }
517 
518       bool needs_reset = false;
519       reset_itr.MoveToUtf8(batch_tokens.at(0).text.begin() - value.begin());
520       start_itr = reset_itr;
521       end_itr = start_itr;
522       for (int i = 0; i < batch_tokens.size(); ++i) {
523         const Token& token = batch_tokens.at(i);
524         CharacterIterator submatch_end = matcher->Matches(token);
525         // If the token matched a query term, then submatch_end will point to an
526         // actual position within token.text.
527         if (submatch_end.utf8_index() == -1) {
528           continue;
529         }
530         // As snippet matching may move iterator around, we save a reset
531         // iterator so that we can reset to the initial iterator state, and
532         // continue Advancing in order in the next round.
533         if (!start_itr.MoveToUtf8(token.text.begin() - value.begin())) {
534           encountered_error = true;
535           break;
536         }
537         if (!end_itr.MoveToUtf8(token.text.end() - value.begin())) {
538           encountered_error = true;
539           break;
540         }
541         SectionData data = {property_path, value};
542         auto match_or = RetrieveMatch(match_options->snippet_spec, data,
543                                       iterator.get(), start_itr, end_itr);
544         if (!match_or.ok()) {
545           if (absl_ports::IsAborted(match_or.status())) {
546             // Only an aborted. We can't get this match, but we might be able
547             // to retrieve others. Just continue.
548             continue;
549           } else {
550             encountered_error = true;
551             break;
552           }
553         }
554         SnippetMatchProto match = std::move(match_or).ValueOrDie();
555         if (match.window_byte_length() > 0) {
556           needs_reset = true;
557         }
558         // submatch_end refers to a position *within* token.text.
559         // This, conveniently enough, means that index that submatch_end
560         // points to is the length of the submatch (because the submatch
561         // starts at 0 in token.text).
562         match.set_submatch_byte_length(submatch_end.utf8_index());
563         match.set_submatch_utf16_length(submatch_end.utf16_index());
564         // Add the values for the submatch.
565         snippet_entry.mutable_snippet_matches()->Add(std::move(match));
566 
567         if (--match_options->max_matches_remaining <= 0) {
568           *snippet_proto->add_entries() = std::move(snippet_entry);
569           return;
570         }
571       }
572 
573       if (encountered_error) {
574         break;
575       }
576 
577       // RetrieveMatch may call DetermineWindowStart/End if windowing is
578       // requested, which may change the position of the iterator. So, reset the
579       // iterator back to the original position. The first token of the token
580       // batch will be the token to reset to.
581       if (needs_reset) {
582         if (reset_itr.utf8_index() > 0) {
583           encountered_error =
584               !iterator->ResetToTokenStartingAfter(reset_itr.utf32_index() - 1);
585         } else {
586           encountered_error = !iterator->ResetToStart();
587         }
588       }
589       if (encountered_error) {
590         break;
591       }
592     }
593     if (!snippet_entry.snippet_matches().empty()) {
594       *snippet_proto->add_entries() = std::move(snippet_entry);
595     }
596   }
597 }
598 
599 // Retrieves snippets in document from content at section_path.
600 // Tokenizer is provided to tokenize string content and matcher is provided to
601 // indicate when a token matches content in the query.
602 //
603 // section_path_index refers to the current property that is held by document.
604 // current_path is equivalent to the first section_path_index values in
605 // section_path, but with value indices present.
606 //
607 // For example, suppose that a hit appeared somewhere in the "bcc.emailAddress".
608 // The arguments for RetrieveSnippetForSection might be
609 // {section_path=["bcc", "emailAddress"], section_path_index=0, current_path=""}
610 // on the first call and
611 // {section_path=["bcc", "emailAddress"], section_path_index=1,
612 // current_path="bcc[1]"} on the second recursive call.
613 //
614 // MatchOptions holds the snippet spec and number of desired matches remaining.
615 // Each call to RetrieveSnippetForSection will decrement max_matches_remaining
616 // by the number of entries that it adds to snippet_proto.
617 //
618 // The SnippetEntries found for matched content will be added to snippet_proto.
RetrieveSnippetForSection(const DocumentProto & document,const TokenMatcher * matcher,const Tokenizer * tokenizer,const std::vector<std::string_view> & section_path,int section_path_index,const std::string & current_path,MatchOptions * match_options,SnippetProto * snippet_proto)619 void RetrieveSnippetForSection(
620     const DocumentProto& document, const TokenMatcher* matcher,
621     const Tokenizer* tokenizer,
622     const std::vector<std::string_view>& section_path, int section_path_index,
623     const std::string& current_path, MatchOptions* match_options,
624     SnippetProto* snippet_proto) {
625   std::string_view next_property_name = section_path.at(section_path_index);
626   const PropertyProto* current_property =
627       property_util::GetPropertyProto(document, next_property_name);
628   if (current_property == nullptr) {
629     ICING_VLOG(1) << "No property " << next_property_name << " found at path "
630                   << current_path;
631     return;
632   }
633   std::string property_path = property_util::ConcatenatePropertyPathExpr(
634       current_path, next_property_name);
635   if (section_path_index == section_path.size() - 1) {
636     // We're at the end. Let's check our values.
637     GetEntriesFromProperty(current_property, property_path, matcher, tokenizer,
638                            match_options, snippet_proto);
639   } else {
640     // Still got more to go. Let's look through our subdocuments.
641     std::vector<SnippetProto::EntryProto> entries;
642     for (int i = 0; i < current_property->document_values_size(); ++i) {
643       std::string new_path = AddIndexToPath(
644           current_property->document_values_size(), /*index=*/i, property_path);
645       RetrieveSnippetForSection(current_property->document_values(i), matcher,
646                                 tokenizer, section_path, section_path_index + 1,
647                                 new_path, match_options, snippet_proto);
648       if (match_options->max_matches_remaining <= 0) {
649         break;
650       }
651     }
652   }
653 }
654 
655 }  // namespace
656 
// Factory for SnippetRetriever. All three dependencies are borrowed, not
// owned, and must outlive the returned instance.
//
// Returns:
//   a SnippetRetriever on success
//   FAILED_PRECONDITION - if any argument is null
libtextclassifier3::StatusOr<std::unique_ptr<SnippetRetriever>>
SnippetRetriever::Create(const SchemaStore* schema_store,
                         const LanguageSegmenter* language_segmenter,
                         const Normalizer* normalizer) {
  ICING_RETURN_ERROR_IF_NULL(schema_store);
  ICING_RETURN_ERROR_IF_NULL(language_segmenter);
  ICING_RETURN_ERROR_IF_NULL(normalizer);

  // The constructor is not public, so std::make_unique cannot be used here.
  return std::unique_ptr<SnippetRetriever>(
      new SnippetRetriever(schema_store, language_segmenter, normalizer));
}
668 
// Builds a SnippetProto for `document` covering every section whose bit is
// set in section_id_mask, matching against `query_terms` (both the
// unrestricted terms keyed by "" and any section-restricted terms).
SnippetProto SnippetRetriever::RetrieveSnippet(
    const SectionRestrictQueryTermsMap& query_terms,
    TermMatchType::Code match_type,
    const ResultSpecProto::SnippetSpecProto& snippet_spec,
    const DocumentProto& document, SectionIdMask section_id_mask) const {
  SnippetProto snippet_proto;
  // Third macro argument: on error, return the (empty) snippet_proto.
  ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
                         schema_store_.GetSchemaTypeId(document.schema()),
                         snippet_proto);
  const std::unordered_set<std::string> empty_set;
  // Terms keyed by the empty string apply to every section.
  auto itr = query_terms.find("");
  const std::unordered_set<std::string>& unrestricted_set =
      (itr != query_terms.end()) ? itr->second : empty_set;
  while (section_id_mask != kSectionIdMaskNone) {
    // Count-trailing-zeros yields the lowest set bit, i.e. the smallest
    // remaining section id.
    SectionId section_id = __builtin_ctzll(section_id_mask);
    // Remove this section from the mask.
    section_id_mask &= ~(UINT64_C(1) << section_id);

    // Each section gets a fresh match budget.
    MatchOptions match_options = {snippet_spec};
    match_options.max_matches_remaining =
        snippet_spec.num_matches_per_property();

    // Determine the section name and match type.
    auto section_metadata_or =
        schema_store_.GetSectionMetadata(type_id, section_id);
    if (!section_metadata_or.ok()) {
      continue;
    }
    const SectionMetadata* metadata = section_metadata_or.ValueOrDie();
    std::vector<std::string_view> section_path =
        property_util::SplitPropertyPathExpr(metadata->path);

    // Match type must be as restrictive as possible. Prefix matches for a
    // snippet should only be included if both the query is Prefix and the
    // section has prefixes enabled.
    TermMatchType::Code section_match_type = TermMatchType::EXACT_ONLY;
    if (match_type == TermMatchType::PREFIX &&
        metadata->term_match_type == TermMatchType::PREFIX) {
      section_match_type = TermMatchType::PREFIX;
    }

    itr = query_terms.find(metadata->path);
    const std::unordered_set<std::string>& restricted_set =
        (itr != query_terms.end()) ? itr->second : empty_set;
    libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> matcher_or =
        CreateTokenMatcher(section_match_type, unrestricted_set, restricted_set,
                           normalizer_);
    if (!matcher_or.ok()) {
      continue;
    }
    std::unique_ptr<TokenMatcher> matcher = std::move(matcher_or).ValueOrDie();

    auto tokenizer_or = tokenizer_factory::CreateIndexingTokenizer(
        metadata->tokenizer, &language_segmenter_);
    if (!tokenizer_or.ok()) {
      // If we couldn't create the tokenizer properly, just skip this section.
      continue;
    }
    std::unique_ptr<Tokenizer> tokenizer = std::move(tokenizer_or).ValueOrDie();
    RetrieveSnippetForSection(
        document, matcher.get(), tokenizer.get(), section_path,
        /*section_path_index=*/0, "", &match_options, &snippet_proto);
  }
  return snippet_proto;
}
734 
735 }  // namespace lib
736 }  // namespace icing
737