1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/container/sorted-strings-table.h"
18
19 #include <algorithm>
20
21 #include "utils/base/endian.h"
22 #include "utils/base/logging.h"
23
24 namespace libtextclassifier3 {
25
GatherPrefixMatches(StringPiece input,const std::function<void (Match)> & update_fn) const26 void SortedStringsTable::GatherPrefixMatches(
27 StringPiece input, const std::function<void(Match)>& update_fn) const {
28 int left = 0;
29 int right = num_pieces_;
30 int span_size = right - left;
31 int match_length = 0;
32
33 // Loop invariant:
34 // at the ith iteration, all strings from `left` ... `right` match the input
35 // on the first `match_length` characters.
36 while (span_size > use_linear_scan_threshold_) {
37 if (match_length >= input.length()) {
38 return;
39 }
40
41 // We find the possible range of pieces in `left` ... `right` matching the
42 // `match_length` + 1 character with two binary searches:
43 // `lower_bound` to find the start of the range of matching pieces.
44 // `upper_bound` to find the non-inclusive end of the range.
45 left = (std::lower_bound(
46 offsets_ + left, offsets_ + right,
47 static_cast<unsigned char>(input[match_length]),
48 [this, match_length](uint32 piece_offset, uint32 c) -> bool {
49 return static_cast<unsigned char>(
50 pieces_[piece_offset + match_length]) <
51 LittleEndian::ToHost32(c);
52 }) -
53 offsets_);
54 right = (std::upper_bound(
55 offsets_ + left, offsets_ + right,
56 static_cast<unsigned char>(input[match_length]),
57 [this, match_length](uint32 c, uint32 piece_offset) -> bool {
58 return LittleEndian::ToHost32(c) <
59 static_cast<unsigned char>(
60 pieces_[piece_offset + match_length]);
61 }) -
62 offsets_);
63 span_size = right - left;
64 if (span_size <= 0) {
65 return;
66 }
67 ++match_length;
68
69 // Due to the loop invariant and the fact that the strings are sorted, there
70 // can only be one piece matching completely now, namely at left.
71 if (pieces_[LittleEndian::ToHost32(offsets_[left]) + match_length] == 0) {
72 update_fn(Match(/*id=*/left,
73 /*match_length=*/match_length));
74 left++;
75 }
76 }
77
78 // Use linear scan for small problem instances.
79 // By the loop invariant characters 0...`match_length` of all pieces in
80 // in `left`...`right` match the input on 0...`match_length`.
81 for (int i = left; i < right; i++) {
82 bool matches = true;
83 int piece_match_length = match_length;
84 for (int k = LittleEndian::ToHost32(offsets_[i]) + piece_match_length;
85 pieces_[k] != 0; k++) {
86 if (piece_match_length >= input.size() ||
87 input[piece_match_length] != pieces_[k]) {
88 matches = false;
89 break;
90 }
91 piece_match_length++;
92 }
93 if (matches) {
94 update_fn(Match(/*id=*/i,
95 /*match_length=*/piece_match_length));
96 }
97 }
98 }
99
FindAllPrefixMatches(StringPiece input,std::vector<Match> * matches) const100 bool SortedStringsTable::FindAllPrefixMatches(
101 StringPiece input, std::vector<Match>* matches) const {
102 GatherPrefixMatches(
103 input, [matches](const Match match) { matches->push_back(match); });
104 return true;
105 }
106
LongestPrefixMatch(StringPiece input,Match * longest_match) const107 bool SortedStringsTable::LongestPrefixMatch(StringPiece input,
108 Match* longest_match) const {
109 *longest_match = Match();
110 GatherPrefixMatches(
111 input, [longest_match](const Match match) { *longest_match = match; });
112 return true;
113 }
114
115 } // namespace libtextclassifier3
116