1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/zucchini/suffix_array.h"
6
7 #include <stddef.h>
8 #include <stdint.h>
9
10 #include <algorithm>
11 #include <initializer_list>
12 #include <string>
13 #include <vector>
14
15 #include "testing/gtest/include/gtest/gtest.h"
16
17 namespace zucchini {
18
19 namespace {
20
21 using SLType = InducedSuffixSort::SLType;
22
23 } // namespace
24
25 using ustring = std::basic_string<unsigned char>;
26
27 constexpr uint16_t kNumChar = 256;
28
MakeUnsignedString(const std::string & str)29 ustring MakeUnsignedString(const std::string& str) {
30 return {str.begin(), str.end()};
31 }
32
33 template <class T>
MakeVector(const std::initializer_list<T> & ilist)34 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) {
35 return {ilist.begin(), ilist.end()};
36 }
37
TestSlPartition(std::initializer_list<SLType> expected_sl_partition,std::initializer_list<size_t> expected_lms_indices,std::string str)38 void TestSlPartition(std::initializer_list<SLType> expected_sl_partition,
39 std::initializer_list<size_t> expected_lms_indices,
40 std::string str) {
41 using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
42
43 std::vector<SLType> sl_partition(str.size());
44 EXPECT_EQ(expected_lms_indices.size(),
45 SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar,
46 sl_partition.rbegin()));
47 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition);
48
49 std::vector<size_t> lms_indices(expected_lms_indices.size());
50 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin());
51 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices);
52 }
53
TEST(InducedSuffixSortTest,BuildSLPartition)54 TEST(InducedSuffixSortTest, BuildSLPartition) {
55 TestSlPartition({}, {}, "");
56 TestSlPartition(
57 {
58 SLType::LType,
59 },
60 {}, "a");
61 TestSlPartition(
62 {
63 SLType::LType,
64 SLType::LType,
65 },
66 {}, "ba");
67 TestSlPartition(
68 {
69 SLType::SType,
70 SLType::LType,
71 },
72 {}, "ab");
73 TestSlPartition(
74 {
75 SLType::SType,
76 SLType::SType,
77 SLType::LType,
78 },
79 {}, "aab");
80 TestSlPartition(
81 {
82 SLType::LType,
83 SLType::LType,
84 SLType::LType,
85 },
86 {}, "bba");
87 TestSlPartition(
88 {
89 SLType::LType,
90 SLType::SType,
91 SLType::LType,
92 },
93 {1}, "bab");
94 TestSlPartition(
95 {
96 SLType::LType,
97 SLType::SType,
98 SLType::SType,
99 SLType::LType,
100 },
101 {1}, "baab");
102
103 TestSlPartition(
104 {
105 SLType::LType, // zucchini
106 SLType::LType, // ucchini
107 SLType::SType, // cchini
108 SLType::SType, // chini
109 SLType::SType, // hini
110 SLType::SType, // ini
111 SLType::LType, // ni
112 SLType::LType, // i
113 },
114 {2}, "zucchini");
115 }
116
BucketCount(const std::initializer_list<unsigned char> str,uint16_t max_key)117 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str,
118 uint16_t max_key) {
119 using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
120 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key);
121 }
122
TEST(InducedSuffixSortTest,BucketCount)123 TEST(InducedSuffixSortTest, BucketCount) {
124 using vec = std::vector<size_t>;
125
126 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4));
127 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4));
128 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4));
129 }
130
InducedSortSubstring(ustring str)131 std::vector<size_t> InducedSortSubstring(ustring str) {
132 using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
133 std::vector<SLType> sl_partition(str.size());
134 size_t lms_count = SaisImpl::BuildSLPartition(
135 str.begin(), str.size(), kNumChar, sl_partition.rbegin());
136 std::vector<size_t> lms_indices(lms_count);
137 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin());
138 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar);
139
140 std::vector<size_t> suffix_array(str.size());
141 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets,
142 suffix_array.begin());
143
144 return suffix_array;
145 }
146
TEST(InducedSuffixSortTest,InducedSortSubstring)147 TEST(InducedSuffixSortTest, InducedSortSubstring) {
148 using vec = std::vector<size_t>;
149
150 auto us = MakeUnsignedString;
151
152 // L; a$
153 EXPECT_EQ(vec({0}), InducedSortSubstring(us("a")));
154
155 // SL; ab$, b$
156 EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab")));
157
158 // LL; a$, ba$
159 EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba")));
160
161 // SLL; a$, aba$, ba$
162 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba")));
163
164 // LSL; ab$, b$, ba
165 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab")));
166
167 // SSL; aab$, ab$, b$
168 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab")));
169
170 // LSSL; aab$, ab$, b$, ba
171 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab")));
172 }
173
174 template <class Algorithm>
TestSuffixSort(ustring test_str)175 void TestSuffixSort(ustring test_str) {
176 std::vector<size_t> suffix_array =
177 MakeSuffixArray<Algorithm>(test_str, kNumChar);
178 EXPECT_EQ(test_str.size(), suffix_array.size());
179
180 // Expect that I[] is a permutation of [0, len].
181 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end());
182 std::sort(sorted_suffix.begin(), sorted_suffix.end());
183 for (size_t i = 0; i < test_str.size(); ++i)
184 EXPECT_EQ(i, sorted_suffix[i]);
185
186 // Expect that all suffixes are strictly ordered.
187 auto end = test_str.end();
188 for (size_t i = 1; i < test_str.size(); ++i) {
189 auto suf1 = test_str.begin() + suffix_array[i - 1];
190 auto suf2 = test_str.begin() + suffix_array[i];
191 bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
192 EXPECT_TRUE(is_less);
193 }
194 }
195
196 constexpr const char* test_strs[] = {
197 "",
198 "a",
199 "aa",
200 "za",
201 "CACAO",
202 "aaaaa",
203 "banana",
204 "tobeornottobe",
205 "The quick brown fox jumps over the lazy dog.",
206 "elephantelephantelephantelephantelephant",
207 "walawalawashington",
208 "-------------------------",
209 "011010011001011010010110011010010",
210 "3141592653589793238462643383279502884197169399375105",
211 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
212 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba",
213 "0123456789876543210",
214 "9876543210123456789",
215 "aababcabcdabcdeabcdefabcdefg",
216 "asdhklgalksdjghalksdjghalksdjgh",
217 };
218
TEST(SuffixSortTest,NaiveSuffixSort)219 TEST(SuffixSortTest, NaiveSuffixSort) {
220 for (const std::string& test_str : test_strs) {
221 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str));
222 }
223 }
224
TEST(SuffixSortTest,InducedSuffixSortSort)225 TEST(SuffixSortTest, InducedSuffixSortSort) {
226 for (const std::string& test_str : test_strs) {
227 TestSuffixSort<InducedSuffixSort>(MakeUnsignedString(test_str));
228 }
229 }
230
231 // Test with sequence that has every character.
TEST(SuffixSortTest,AllChar)232 TEST(SuffixSortTest, AllChar) {
233 std::vector<unsigned char> all_char(kNumChar);
234 std::iota(all_char.begin(), all_char.end(), 0);
235
236 {
237 std::vector<size_t> suffix_array =
238 MakeSuffixArray<InducedSuffixSort>(all_char, kNumChar);
239 for (size_t i = 0; i < kNumChar; ++i)
240 EXPECT_EQ(i, suffix_array[i]);
241 }
242
243 std::vector<unsigned char> all_char_reverse(all_char.rbegin(),
244 all_char.rend());
245 {
246 std::vector<size_t> suffix_array =
247 MakeSuffixArray<InducedSuffixSort>(all_char_reverse, kNumChar);
248 for (size_t i = 0; i < kNumChar; ++i)
249 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]);
250 }
251 }
252
TestSuffixLowerBound(ustring base_str,ustring search_str)253 void TestSuffixLowerBound(ustring base_str, ustring search_str) {
254 std::vector<size_t> suffix_array =
255 MakeSuffixArray<NaiveSuffixSort>(base_str, kNumChar);
256
257 auto pos = SuffixLowerBound(suffix_array, base_str.begin(),
258 search_str.begin(), search_str.end());
259
260 auto end = base_str.end();
261 if (pos != suffix_array.begin()) {
262 // Previous suffix is less than |search_str|.
263 auto suf = base_str.begin() + pos[-1];
264 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
265 search_str.end());
266 EXPECT_TRUE(is_less);
267 }
268 if (pos != suffix_array.end()) {
269 // Current suffix is greater of equal to |search_str|.
270 auto suf = base_str.begin() + *pos;
271 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
272 search_str.end());
273 EXPECT_FALSE(is_less);
274 }
275 }
276
TEST(SuffixArrayTest,LowerBound)277 TEST(SuffixArrayTest, LowerBound) {
278 auto us = MakeUnsignedString;
279
280 TestSuffixLowerBound(us(""), us(""));
281 TestSuffixLowerBound(us(""), us("a"));
282 TestSuffixLowerBound(us("b"), us(""));
283 TestSuffixLowerBound(us("b"), us("a"));
284 TestSuffixLowerBound(us("b"), us("c"));
285 TestSuffixLowerBound(us("b"), us("bc"));
286 TestSuffixLowerBound(us("aa"), us("a"));
287 TestSuffixLowerBound(us("aa"), us("aa"));
288
289 ustring sentence = us("the quick brown fox jumps over the lazy dog.");
290 // Entire string: exact and unique.
291 TestSuffixLowerBound(sentence, sentence);
292 // Empty string: exact and non-unique.
293 TestSuffixLowerBound(sentence, us(""));
294 // Exact and unique suffix matches.
295 TestSuffixLowerBound(sentence, us("."));
296 TestSuffixLowerBound(sentence, us("the lazy dog."));
297 // Exact and unique non-suffix matches.
298 TestSuffixLowerBound(sentence, us("quick"));
299 TestSuffixLowerBound(sentence, us("the quick"));
300 // Partial and unique matches.
301 TestSuffixLowerBound(sentence, us("fox jumps with the hosps"));
302 TestSuffixLowerBound(sentence, us("xyz"));
303 // Exact and non-unique match: take lexicographical first.
304 TestSuffixLowerBound(sentence, us("the"));
305 TestSuffixLowerBound(sentence, us(" "));
306 // Partial and non-unique match.
307 // query < "the l"... < "the q"...
308 TestSuffixLowerBound(sentence, us("the apple"));
309 // "the l"... < query < "the q"...
310 TestSuffixLowerBound(sentence, us("the opera"));
311 // "the l"... < "the q"... < query
312 TestSuffixLowerBound(sentence, us("the zebra"));
313 // Prefix match dominates suffix match (unique).
314 TestSuffixLowerBound(sentence, us("over quick brown fox"));
315 // Empty matchs.
316 TestSuffixLowerBound(sentence, us(","));
317 TestSuffixLowerBound(sentence, us("1234"));
318 TestSuffixLowerBound(sentence, us("THE QUICK BROWN FOX"));
319 TestSuffixLowerBound(sentence, us("(the"));
320 }
321
TEST(SuffixArrayTest,LowerBoundExact)322 TEST(SuffixArrayTest, LowerBoundExact) {
323 for (const std::string& test_str : test_strs) {
324 ustring test_ustr = MakeUnsignedString(test_str);
325
326 std::vector<size_t> suffix_array =
327 MakeSuffixArray<InducedSuffixSort>(test_ustr, kNumChar);
328
329 for (size_t lo = 0; lo < test_str.size(); ++lo) {
330 for (size_t hi = lo + 1; hi <= test_str.size(); ++hi) {
331 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi);
332 ASSERT_EQ(query.size(), hi - lo);
333 auto pos = SuffixLowerBound(suffix_array, test_ustr.begin(),
334 query.begin(), query.end());
335 EXPECT_TRUE(
336 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos));
337 }
338 }
339 }
340 }
341
342 } // namespace zucchini
343