xref: /aosp_15_r20/external/libtextclassifier/native/utils/container/sorted-strings-table.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/container/sorted-strings-table.h"
18 
19 #include <algorithm>
20 
21 #include "utils/base/endian.h"
22 #include "utils/base/logging.h"
23 
24 namespace libtextclassifier3 {
25 
GatherPrefixMatches(StringPiece input,const std::function<void (Match)> & update_fn) const26 void SortedStringsTable::GatherPrefixMatches(
27     StringPiece input, const std::function<void(Match)>& update_fn) const {
28   int left = 0;
29   int right = num_pieces_;
30   int span_size = right - left;
31   int match_length = 0;
32 
33   // Loop invariant:
34   // at the ith iteration, all strings from `left` ... `right` match the input
35   // on the first `match_length` characters.
36   while (span_size > use_linear_scan_threshold_) {
37     if (match_length >= input.length()) {
38       return;
39     }
40 
41     // We find the possible range of pieces in `left` ... `right` matching the
42     // `match_length` + 1 character with two binary searches:
43     //     `lower_bound` to find the start of the range of matching pieces.
44     //     `upper_bound` to find the non-inclusive end of the range.
45     left = (std::lower_bound(
46                 offsets_ + left, offsets_ + right,
47                 static_cast<unsigned char>(input[match_length]),
48                 [this, match_length](uint32 piece_offset, uint32 c) -> bool {
49                   return static_cast<unsigned char>(
50                              pieces_[piece_offset + match_length]) <
51                          LittleEndian::ToHost32(c);
52                 }) -
53             offsets_);
54     right = (std::upper_bound(
55                  offsets_ + left, offsets_ + right,
56                  static_cast<unsigned char>(input[match_length]),
57                  [this, match_length](uint32 c, uint32 piece_offset) -> bool {
58                    return LittleEndian::ToHost32(c) <
59                           static_cast<unsigned char>(
60                               pieces_[piece_offset + match_length]);
61                  }) -
62              offsets_);
63     span_size = right - left;
64     if (span_size <= 0) {
65       return;
66     }
67     ++match_length;
68 
69     // Due to the loop invariant and the fact that the strings are sorted, there
70     // can only be one piece matching completely now, namely at left.
71     if (pieces_[LittleEndian::ToHost32(offsets_[left]) + match_length] == 0) {
72       update_fn(Match(/*id=*/left,
73                       /*match_length=*/match_length));
74       left++;
75     }
76   }
77 
78   // Use linear scan for small problem instances.
79   // By the loop invariant characters 0...`match_length` of all pieces in
80   // in `left`...`right` match the input on 0...`match_length`.
81   for (int i = left; i < right; i++) {
82     bool matches = true;
83     int piece_match_length = match_length;
84     for (int k = LittleEndian::ToHost32(offsets_[i]) + piece_match_length;
85          pieces_[k] != 0; k++) {
86       if (piece_match_length >= input.size() ||
87           input[piece_match_length] != pieces_[k]) {
88         matches = false;
89         break;
90       }
91       piece_match_length++;
92     }
93     if (matches) {
94       update_fn(Match(/*id=*/i,
95                       /*match_length=*/piece_match_length));
96     }
97   }
98 }
99 
FindAllPrefixMatches(StringPiece input,std::vector<Match> * matches) const100 bool SortedStringsTable::FindAllPrefixMatches(
101     StringPiece input, std::vector<Match>* matches) const {
102   GatherPrefixMatches(
103       input, [matches](const Match match) { matches->push_back(match); });
104   return true;
105 }
106 
LongestPrefixMatch(StringPiece input,Match * longest_match) const107 bool SortedStringsTable::LongestPrefixMatch(StringPiece input,
108                                             Match* longest_match) const {
109   *longest_match = Match();
110   GatherPrefixMatches(
111       input, [longest_match](const Match match) { *longest_match = match; });
112   return true;
113 }
114 
115 }  // namespace libtextclassifier3
116