xref: /aosp_15_r20/external/zucchini/suffix_array_unittest.cc (revision a03ca8b91e029cd15055c20c78c2e087c84792e4)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/zucchini/suffix_array.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 
10 #include <algorithm>
11 #include <initializer_list>
12 #include <string>
13 #include <vector>
14 
15 #include "testing/gtest/include/gtest/gtest.h"
16 
17 namespace zucchini {
18 
19 namespace {
20 
21 using SLType = InducedSuffixSort::SLType;
22 
23 }  // namespace
24 
25 using ustring = std::basic_string<unsigned char>;
26 
27 constexpr uint16_t kNumChar = 256;
28 
MakeUnsignedString(const std::string & str)29 ustring MakeUnsignedString(const std::string& str) {
30   return {str.begin(), str.end()};
31 }
32 
33 template <class T>
MakeVector(const std::initializer_list<T> & ilist)34 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) {
35   return {ilist.begin(), ilist.end()};
36 }
37 
TestSlPartition(std::initializer_list<SLType> expected_sl_partition,std::initializer_list<size_t> expected_lms_indices,std::string str)38 void TestSlPartition(std::initializer_list<SLType> expected_sl_partition,
39                      std::initializer_list<size_t> expected_lms_indices,
40                      std::string str) {
41   using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
42 
43   std::vector<SLType> sl_partition(str.size());
44   EXPECT_EQ(expected_lms_indices.size(),
45             SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar,
46                                        sl_partition.rbegin()));
47   EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition);
48 
49   std::vector<size_t> lms_indices(expected_lms_indices.size());
50   SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin());
51   EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices);
52 }
53 
TEST(InducedSuffixSortTest,BuildSLPartition)54 TEST(InducedSuffixSortTest, BuildSLPartition) {
55   TestSlPartition({}, {}, "");
56   TestSlPartition(
57       {
58           SLType::LType,
59       },
60       {}, "a");
61   TestSlPartition(
62       {
63           SLType::LType,
64           SLType::LType,
65       },
66       {}, "ba");
67   TestSlPartition(
68       {
69           SLType::SType,
70           SLType::LType,
71       },
72       {}, "ab");
73   TestSlPartition(
74       {
75           SLType::SType,
76           SLType::SType,
77           SLType::LType,
78       },
79       {}, "aab");
80   TestSlPartition(
81       {
82           SLType::LType,
83           SLType::LType,
84           SLType::LType,
85       },
86       {}, "bba");
87   TestSlPartition(
88       {
89           SLType::LType,
90           SLType::SType,
91           SLType::LType,
92       },
93       {1}, "bab");
94   TestSlPartition(
95       {
96           SLType::LType,
97           SLType::SType,
98           SLType::SType,
99           SLType::LType,
100       },
101       {1}, "baab");
102 
103   TestSlPartition(
104       {
105           SLType::LType,  // zucchini
106           SLType::LType,  // ucchini
107           SLType::SType,  // cchini
108           SLType::SType,  // chini
109           SLType::SType,  // hini
110           SLType::SType,  // ini
111           SLType::LType,  // ni
112           SLType::LType,  // i
113       },
114       {2}, "zucchini");
115 }
116 
BucketCount(const std::initializer_list<unsigned char> str,uint16_t max_key)117 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str,
118                                 uint16_t max_key) {
119   using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
120   return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key);
121 }
122 
TEST(InducedSuffixSortTest,BucketCount)123 TEST(InducedSuffixSortTest, BucketCount) {
124   using vec = std::vector<size_t>;
125 
126   EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4));
127   EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4));
128   EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4));
129 }
130 
InducedSortSubstring(ustring str)131 std::vector<size_t> InducedSortSubstring(ustring str) {
132   using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>;
133   std::vector<SLType> sl_partition(str.size());
134   size_t lms_count = SaisImpl::BuildSLPartition(
135       str.begin(), str.size(), kNumChar, sl_partition.rbegin());
136   std::vector<size_t> lms_indices(lms_count);
137   SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin());
138   auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar);
139 
140   std::vector<size_t> suffix_array(str.size());
141   SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets,
142                         suffix_array.begin());
143 
144   return suffix_array;
145 }
146 
TEST(InducedSuffixSortTest,InducedSortSubstring)147 TEST(InducedSuffixSortTest, InducedSortSubstring) {
148   using vec = std::vector<size_t>;
149 
150   auto us = MakeUnsignedString;
151 
152   // L; a$
153   EXPECT_EQ(vec({0}), InducedSortSubstring(us("a")));
154 
155   // SL; ab$, b$
156   EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab")));
157 
158   // LL; a$, ba$
159   EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba")));
160 
161   // SLL; a$, aba$, ba$
162   EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba")));
163 
164   // LSL; ab$, b$, ba
165   EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab")));
166 
167   // SSL; aab$, ab$, b$
168   EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab")));
169 
170   // LSSL; aab$, ab$, b$, ba
171   EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab")));
172 }
173 
174 template <class Algorithm>
TestSuffixSort(ustring test_str)175 void TestSuffixSort(ustring test_str) {
176   std::vector<size_t> suffix_array =
177       MakeSuffixArray<Algorithm>(test_str, kNumChar);
178   EXPECT_EQ(test_str.size(), suffix_array.size());
179 
180   // Expect that I[] is a permutation of [0, len].
181   std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end());
182   std::sort(sorted_suffix.begin(), sorted_suffix.end());
183   for (size_t i = 0; i < test_str.size(); ++i)
184     EXPECT_EQ(i, sorted_suffix[i]);
185 
186   // Expect that all suffixes are strictly ordered.
187   auto end = test_str.end();
188   for (size_t i = 1; i < test_str.size(); ++i) {
189     auto suf1 = test_str.begin() + suffix_array[i - 1];
190     auto suf2 = test_str.begin() + suffix_array[i];
191     bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
192     EXPECT_TRUE(is_less);
193   }
194 }
195 
196 constexpr const char* test_strs[] = {
197     "",
198     "a",
199     "aa",
200     "za",
201     "CACAO",
202     "aaaaa",
203     "banana",
204     "tobeornottobe",
205     "The quick brown fox jumps over the lazy dog.",
206     "elephantelephantelephantelephantelephant",
207     "walawalawashington",
208     "-------------------------",
209     "011010011001011010010110011010010",
210     "3141592653589793238462643383279502884197169399375105",
211     "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
212     "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba",
213     "0123456789876543210",
214     "9876543210123456789",
215     "aababcabcdabcdeabcdefabcdefg",
216     "asdhklgalksdjghalksdjghalksdjgh",
217 };
218 
TEST(SuffixSortTest,NaiveSuffixSort)219 TEST(SuffixSortTest, NaiveSuffixSort) {
220   for (const std::string& test_str : test_strs) {
221     TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str));
222   }
223 }
224 
TEST(SuffixSortTest,InducedSuffixSortSort)225 TEST(SuffixSortTest, InducedSuffixSortSort) {
226   for (const std::string& test_str : test_strs) {
227     TestSuffixSort<InducedSuffixSort>(MakeUnsignedString(test_str));
228   }
229 }
230 
231 // Test with sequence that has every character.
TEST(SuffixSortTest,AllChar)232 TEST(SuffixSortTest, AllChar) {
233   std::vector<unsigned char> all_char(kNumChar);
234   std::iota(all_char.begin(), all_char.end(), 0);
235 
236   {
237     std::vector<size_t> suffix_array =
238         MakeSuffixArray<InducedSuffixSort>(all_char, kNumChar);
239     for (size_t i = 0; i < kNumChar; ++i)
240       EXPECT_EQ(i, suffix_array[i]);
241   }
242 
243   std::vector<unsigned char> all_char_reverse(all_char.rbegin(),
244                                               all_char.rend());
245   {
246     std::vector<size_t> suffix_array =
247         MakeSuffixArray<InducedSuffixSort>(all_char_reverse, kNumChar);
248     for (size_t i = 0; i < kNumChar; ++i)
249       EXPECT_EQ(kNumChar - i - 1, suffix_array[i]);
250   }
251 }
252 
TestSuffixLowerBound(ustring base_str,ustring search_str)253 void TestSuffixLowerBound(ustring base_str, ustring search_str) {
254   std::vector<size_t> suffix_array =
255       MakeSuffixArray<NaiveSuffixSort>(base_str, kNumChar);
256 
257   auto pos = SuffixLowerBound(suffix_array, base_str.begin(),
258                               search_str.begin(), search_str.end());
259 
260   auto end = base_str.end();
261   if (pos != suffix_array.begin()) {
262     // Previous suffix is less than |search_str|.
263     auto suf = base_str.begin() + pos[-1];
264     bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
265                                                 search_str.end());
266     EXPECT_TRUE(is_less);
267   }
268   if (pos != suffix_array.end()) {
269     // Current suffix is greater of equal to |search_str|.
270     auto suf = base_str.begin() + *pos;
271     bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
272                                                 search_str.end());
273     EXPECT_FALSE(is_less);
274   }
275 }
276 
TEST(SuffixArrayTest,LowerBound)277 TEST(SuffixArrayTest, LowerBound) {
278   auto us = MakeUnsignedString;
279 
280   TestSuffixLowerBound(us(""), us(""));
281   TestSuffixLowerBound(us(""), us("a"));
282   TestSuffixLowerBound(us("b"), us(""));
283   TestSuffixLowerBound(us("b"), us("a"));
284   TestSuffixLowerBound(us("b"), us("c"));
285   TestSuffixLowerBound(us("b"), us("bc"));
286   TestSuffixLowerBound(us("aa"), us("a"));
287   TestSuffixLowerBound(us("aa"), us("aa"));
288 
289   ustring sentence = us("the quick brown fox jumps over the lazy dog.");
290   // Entire string: exact and unique.
291   TestSuffixLowerBound(sentence, sentence);
292   // Empty string: exact and non-unique.
293   TestSuffixLowerBound(sentence, us(""));
294   // Exact and unique suffix matches.
295   TestSuffixLowerBound(sentence, us("."));
296   TestSuffixLowerBound(sentence, us("the lazy dog."));
297   // Exact and unique non-suffix matches.
298   TestSuffixLowerBound(sentence, us("quick"));
299   TestSuffixLowerBound(sentence, us("the quick"));
300   // Partial and unique matches.
301   TestSuffixLowerBound(sentence, us("fox jumps with the hosps"));
302   TestSuffixLowerBound(sentence, us("xyz"));
303   // Exact and non-unique match: take lexicographical first.
304   TestSuffixLowerBound(sentence, us("the"));
305   TestSuffixLowerBound(sentence, us(" "));
306   // Partial and non-unique match.
307   // query      < "the l"... < "the q"...
308   TestSuffixLowerBound(sentence, us("the apple"));
309   // "the l"... < query      < "the q"...
310   TestSuffixLowerBound(sentence, us("the opera"));
311   // "the l"... < "the q"... < query
312   TestSuffixLowerBound(sentence, us("the zebra"));
313   // Prefix match dominates suffix match (unique).
314   TestSuffixLowerBound(sentence, us("over quick brown fox"));
315   // Empty matchs.
316   TestSuffixLowerBound(sentence, us(","));
317   TestSuffixLowerBound(sentence, us("1234"));
318   TestSuffixLowerBound(sentence, us("THE QUICK BROWN FOX"));
319   TestSuffixLowerBound(sentence, us("(the"));
320 }
321 
TEST(SuffixArrayTest,LowerBoundExact)322 TEST(SuffixArrayTest, LowerBoundExact) {
323   for (const std::string& test_str : test_strs) {
324     ustring test_ustr = MakeUnsignedString(test_str);
325 
326     std::vector<size_t> suffix_array =
327         MakeSuffixArray<InducedSuffixSort>(test_ustr, kNumChar);
328 
329     for (size_t lo = 0; lo < test_str.size(); ++lo) {
330       for (size_t hi = lo + 1; hi <= test_str.size(); ++hi) {
331         ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi);
332         ASSERT_EQ(query.size(), hi - lo);
333         auto pos = SuffixLowerBound(suffix_array, test_ustr.begin(),
334                                     query.begin(), query.end());
335         EXPECT_TRUE(
336             std::equal(query.begin(), query.end(), test_ustr.begin() + *pos));
337       }
338     }
339   }
340 }
341 
342 }  // namespace zucchini
343