1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
6 #define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
7 
8 #include <algorithm>
9 #include <iterator>
10 #include <numeric>
11 #include <vector>
12 
13 #include "base/check.h"
14 
15 namespace zucchini {
16 
17 // A functor class that implements the naive suffix sorting algorithm that uses
18 // std::sort with lexicographical compare. This is only meant as reference of
19 // the interface.
20 class NaiveSuffixSort {
21  public:
22   // Type requirements:
23   // |InputRng| is an input random access range.
24   // |KeyType| is an unsigned integer type.
25   // |SAIt| is a random access iterator with mutable references.
26   template <class InputRng, class KeyType, class SAIt>
27   // |str| is the input string on which suffix sort is applied.
28   // Characters found in |str| must be in the range [0, |key_bound|)
29   // |suffix_array| is the beginning of the destination range, which is at least
30   // as large as |str|.
operator()31   void operator()(const InputRng& str,
32                   KeyType key_bound,
33                   SAIt suffix_array) const {
34     using size_type = typename SAIt::value_type;
35 
36     size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
37 
38     // |suffix_array| is first filled with ordered indices of |str|.
39     // Those indices are then sorted with lexicographical comparisons in |str|.
40     std::iota(suffix_array, suffix_array + n, 0);
41     std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
42       return std::lexicographical_compare(std::begin(str) + i, std::end(str),
43                                           std::begin(str) + j, std::end(str));
44     });
45   }
46 };
47 
48 // A functor class that implements suffix array induced sorting (SA-IS)
49 // algorithm with linear time and memory complexity,
50 // see http://ieeexplore.ieee.org/abstract/document/5582081/
51 class InducedSuffixSort {
52  public:
53   // Type requirements:
54   // |InputRng| is an input random access range.
55   // |KeyType| is an unsigned integer type.
56   // |SAIt| is a random access iterator with mutable values.
57   template <class InputRng, class KeyType, class SAIt>
58   // |str| is the input string on which suffix sort is applied.
59   // Characters found in |str| must be in the range [0, |key_bound|)
60   // |suffix_array| is the beginning of the destination range, which is at least
61   // as large as |str|.
operator()62   void operator()(const InputRng& str,
63                   KeyType key_bound,
64                   SAIt suffix_array) const {
65     using value_type = typename InputRng::value_type;
66     using size_type = typename SAIt::value_type;
67 
68     static_assert(std::is_unsigned<value_type>::value,
69                   "SA-IS only supports input string with unsigned values");
70     static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
71 
72     size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
73 
74     Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
75                                                    key_bound, suffix_array);
76   }
77 
78   // Given string S of length n. We assume S is terminated by a unique sentinel
79   // $, which is considered as the smallest character. This sentinel does not
80   // exist in memory and is only treated implicitly, hence |n| does not count
81   // the sentinel in this implementation. We denote suf(S,i) the suffix formed
82   // by S[i..n).
83 
84   // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
85   // or suf(S,i) > suf(S,i+1), respectively.
86   enum SLType : bool { SType, LType };
87 
88   // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
89   // S-type or L-type, respectively.
90 
91   // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
92   // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
93   // character.
94 
95   // A substring S[i..j) is an LMS-substring if
96   // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
97   //     LMS characters, or
98   // (2) S[i..j) is the sentinel $.
99 
100   template <class SizeType, class KeyType>
101   struct Implementation {
102     static_assert(std::is_unsigned<SizeType>::value,
103                   "SizeType must be unsigned");
104     static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
105     using size_type = SizeType;
106     using key_type = KeyType;
107 
108     using iterator = typename std::vector<size_type>::iterator;
109     using const_iterator = typename std::vector<size_type>::const_iterator;
110 
111     // Partition every suffix based on SL-type. Returns the number of LMS
112     // suffixes.
113     template <class StrIt>
BuildSLPartitionImplementation114     static size_type BuildSLPartition(
115         StrIt str,
116         size_type length,
117         key_type key_bound,
118         std::vector<SLType>::reverse_iterator sl_partition_it) {
119       // We will count LMS suffixes (S to L-type or last S-type).
120       size_type lms_count = 0;
121 
122       // |previous_type| is initialized to L-type to avoid counting an extra
123       // LMS suffix at the end
124       SLType previous_type = LType;
125 
126       // Initialized to dummy, impossible key.
127       key_type previous_key = key_bound;
128 
129       // We're travelling backward to determine the partition,
130       // as if we prepend one character at a time to the string, ex:
131       // b$ is L-type because b > $.
132       // ab$ is S-type because a < b, implying ab$ < b$.
133       // bab$ is L-type because b > a, implying bab$ > ab$.
134       // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
135       for (auto str_it = std::reverse_iterator<StrIt>(str + length);
136            str_it != std::reverse_iterator<StrIt>(str);
137            ++str_it, ++sl_partition_it) {
138         key_type current_key = *str_it;
139 
140         if (current_key > previous_key || previous_key == key_bound) {
141           // S[i] > S[i + 1] or S[i] is last character.
142           if (previous_type == SType)
143             // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
144             // suf(S,i+1) was a LMS suffix.
145             ++lms_count;
146 
147           previous_type = LType;  // For next round.
148         } else if (current_key < previous_key) {
149           // S[i] < S[i + 1]
150           previous_type = SType;  // For next round.
151         }
152         // Else, S[i] == S[i + 1]:
153         // The next character that differs determines the SL-type,
154         // so we reuse the last seen type.
155 
156         *sl_partition_it = previous_type;
157         previous_key = current_key;  // For next round.
158       }
159 
160       return lms_count;
161     }
162 
163     // Find indices of LMS suffixes and write result to |lms_indices|.
FindLmsSuffixesImplementation164     static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
165                                 iterator lms_indices) {
166       // |previous_type| is initialized to S-type to avoid counting an extra
167       // LMS suffix at the beginning
168       SLType previous_type = SType;
169       for (size_type i = 0; i < sl_partition.size(); ++i) {
170         if (sl_partition[i] == SType && previous_type == LType)
171           *lms_indices++ = i;
172         previous_type = sl_partition[i];
173       }
174     }
175 
176     template <class StrIt>
MakeBucketCountImplementation177     static std::vector<size_type> MakeBucketCount(StrIt str,
178                                                   size_type length,
179                                                   key_type key_bound) {
180       // Occurrence of every unique character is counted in |buckets|
181       std::vector<size_type> buckets(static_cast<size_type>(key_bound));
182 
183       for (auto it = str; it != str + length; ++it)
184         ++buckets[*it];
185       return buckets;
186     }
187 
188     // Apply induced sort from |lms_indices| to |suffix_array| associated with
189     // the string |str|.
190     template <class StrIt, class SAIt>
InducedSortImplementation191     static void InducedSort(StrIt str,
192                             size_type length,
193                             const std::vector<SLType>& sl_partition,
194                             const std::vector<size_type>& lms_indices,
195                             const std::vector<size_type>& buckets,
196                             SAIt suffix_array) {
197       // All indices are first marked as unset with the illegal value |length|.
198       std::fill(suffix_array, suffix_array + length, length);
199 
200       // Used to mark bucket boundaries (head or end) as indices in str.
201       DCHECK(!buckets.empty());
202       std::vector<size_type> bucket_bounds(buckets.size());
203 
204       // Step 1: Assign indices for LMS suffixes, populating the end of
205       // respective buckets but keeping relative order.
206 
207       // Find the end of each bucket and write it to |bucket_bounds|.
208       std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
209 
210       // Process each |lms_indices| backward, and assign them to the end of
211       // their respective buckets, so relative order is preserved.
212       for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
213         key_type key = str[*it];
214         suffix_array[--bucket_bounds[key]] = *it;
215       }
216 
217       // Step 2
218       // Scan forward |suffix_array|; for each modified suf(S,i) for which
219       // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
220       // head of the corresponding bucket and forward the bucket head to the
221       // right.
222 
223       // Find the head of each bucket and write it to |bucket_bounds|. Since
224       // only LMS suffixes where inserted in |suffix_array| during Step 1,
225       // |bucket_bounds| does not contains the head of each bucket and needs to
226       // be updated.
227       bucket_bounds[0] = 0;
228       std::partial_sum(buckets.begin(), buckets.end() - 1,
229                        bucket_bounds.begin() + 1);
230 
231       // From Step 1, the sentinel $, which we treat implicitly, would have
232       // been placed at the beginning of |suffix_array|, since $ is always
233       // considered as the smallest character. We then have to deal with the
234       // previous (last) suffix.
235       if (sl_partition[length - 1] == LType) {
236         key_type key = str[length - 1];
237         suffix_array[bucket_bounds[key]++] = length - 1;
238       }
239       for (auto it = suffix_array; it != suffix_array + length; ++it) {
240         size_type suffix_index = *it;
241 
242         // While the original algorithm marks unset suffixes with -1,
243         // we found that marking them with |length| is also possible and more
244         // convenient because we are working with unsigned integers.
245         if (suffix_index != length && suffix_index > 0 &&
246             sl_partition[--suffix_index] == LType) {
247           key_type key = str[suffix_index];
248           suffix_array[bucket_bounds[key]++] = suffix_index;
249         }
250       }
251 
252       // Step 3
253       // Scan backward |suffix_array|; for each modified suf(S, i) for which
254       // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
255       // end of the corresponding bucket and forward the bucket head to the
256       // left.
257 
258       // Find the end of each bucket and write it to |bucket_bounds|. Since
259       // only L-type suffixes where inserted in |suffix_array| during Step 2,
260       // |bucket_bounds| does not contain the end of each bucket and needs to
261       // be updated.
262       std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
263 
264       for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
265            it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
266         size_type suffix_index = *it;
267         if (suffix_index != length && suffix_index > 0 &&
268             sl_partition[--suffix_index] == SType) {
269           key_type key = str[suffix_index];
270           suffix_array[--bucket_bounds[key]] = suffix_index;
271         }
272       }
273       // Deals with the last suffix, because of the sentinel.
274       if (sl_partition[length - 1] == SType) {
275         key_type key = str[length - 1];
276         suffix_array[--bucket_bounds[key]] = length - 1;
277       }
278     }
279 
280     // Given a string S starting at |str| with length |length|, an array
281     // starting at |substring_array| containing lexicographically ordered LMS
282     // terminated substring indices of S and an SL-Type partition |sl_partition|
283     // of S, assigns a unique label to every unique LMS substring. The sorted
284     // labels for all LMS substrings are written to |lms_str|, while the indices
285     // of LMS suffixes are written to |lms_indices|. In addition, returns the
286     // total number of unique labels.
287     template <class StrIt, class SAIt>
LabelLmsSubstringsImplementation288     static size_type LabelLmsSubstrings(StrIt str,
289                                         size_type length,
290                                         const std::vector<SLType>& sl_partition,
291                                         SAIt suffix_array,
292                                         iterator lms_indices,
293                                         iterator lms_str) {
294       // Labelling starts at 0.
295       size_type label = 0;
296 
297       // |previous_lms| is initialized to 0 to indicate it is unset.
298       // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
299       // lexicographical order.
300       size_type previous_lms = 0;
301       for (auto it = suffix_array; it != suffix_array + length; ++it) {
302         if (*it > 0 && sl_partition[*it] == SType &&
303             sl_partition[*it - 1] == LType) {
304           // suf(S, *it) is a LMS suffix.
305 
306           size_type current_lms = *it;
307           if (previous_lms != 0) {
308             // There was a previous LMS suffix. Check if the current LMS
309             // substring is equal to the previous one.
310             SLType current_lms_type = SType;
311             SLType previous_lms_type = SType;
312             for (size_type k = 0;; ++k) {
313               // |current_lms_end| and |previous_lms_end| denote whether we have
314               // reached the end of the current and previous LMS substring,
315               // respectively
316               bool current_lms_end = false;
317               bool previous_lms_end = false;
318 
319               // Check for both previous and current substring ends.
320               // Note that it is more convenient to check if
321               // suf(S,current_lms + k) is an LMS suffix than to retrieve it
322               // from lms_indices.
323               if (current_lms + k >= length ||
324                   (current_lms_type == LType &&
325                    sl_partition[current_lms + k] == SType)) {
326                 current_lms_end = true;
327               }
328               if (previous_lms + k >= length ||
329                   (previous_lms_type == LType &&
330                    sl_partition[previous_lms + k] == SType)) {
331                 previous_lms_end = true;
332               }
333 
334               if (current_lms_end && previous_lms_end) {
335                 break;  // Previous and current substrings are identical.
336               } else if (current_lms_end != previous_lms_end ||
337                          str[current_lms + k] != str[previous_lms + k]) {
338                 // Previous and current substrings differ, a new label is used.
339                 ++label;
340                 break;
341               }
342 
343               current_lms_type = sl_partition[current_lms + k];
344               previous_lms_type = sl_partition[previous_lms + k];
345             }
346           }
347           *lms_indices++ = *it;
348           *lms_str++ = label;
349           previous_lms = current_lms;
350         }
351       }
352 
353       return label + 1;
354     }
355 
356     // Implementation of the SA-IS algorithm. |str| must be a random access
357     // iterator pointing at the beginning of S with length |length|. The result
358     // is writtend in |suffix_array|, a random access iterator.
359     template <class StrIt, class SAIt>
SuffixSortImplementation360     static void SuffixSort(StrIt str,
361                            size_type length,
362                            key_type key_bound,
363                            SAIt suffix_array) {
364       if (length == 1)
365         *suffix_array = 0;
366       if (length < 2)
367         return;
368 
369       std::vector<SLType> sl_partition(length);
370       size_type lms_count =
371           BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
372       std::vector<size_type> lms_indices(lms_count);
373       FindLmsSuffixes(sl_partition, lms_indices.begin());
374       std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);
375 
376       if (lms_indices.size() > 1) {
377         // Given |lms_indices| in the same order they appear in |str|, induce
378         // LMS substrings relative order and write result to |suffix_array|.
379         InducedSort(str, length, sl_partition, lms_indices, buckets,
380                     suffix_array);
381         std::vector<size_type> lms_str(lms_indices.size());
382 
383         // Given LMS substrings in relative order found in |suffix_array|,
384         // map LMS substrings to unique labels to form a new string, |lms_str|.
385         size_type label_count =
386             LabelLmsSubstrings(str, length, sl_partition, suffix_array,
387                                lms_indices.begin(), lms_str.begin());
388 
389         if (label_count < lms_str.size()) {
390           // Reorder |lms_str| to have LMS suffixes in the same order they
391           // appear in |str|.
392           for (size_type i = 0; i < lms_indices.size(); ++i)
393             suffix_array[lms_indices[i]] = lms_str[i];
394 
395           SLType previous_type = SType;
396           for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
397             if (sl_partition[i] == SType && previous_type == LType) {
398               lms_str[j] = suffix_array[i];
399               lms_indices[j++] = i;
400             }
401             previous_type = sl_partition[i];
402           }
403 
404           // Recursively apply SuffixSort on |lms_str|, which is formed from
405           // labeled LMS suffixes in the same order they appear in |str|.
406           // Note that |KeyType| will be size_type because |lms_str| contains
407           // indices. |lms_str| is at most half the length of |str|.
408           Implementation<size_type, size_type>::SuffixSort(
409               lms_str.begin(), static_cast<size_type>(lms_str.size()),
410               label_count, suffix_array);
411 
412           // Map LMS labels back to indices in |str| and write result to
413           // |lms_indices|. We're using |suffix_array| as a temporary buffer.
414           for (size_type i = 0; i < lms_indices.size(); ++i)
415             suffix_array[i] = lms_indices[suffix_array[i]];
416           std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());
417 
418           // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
419         }
420       }
421       // Given |lms_indices| where LMS suffixes are sorted, induce the full
422       // order of suffixes in |str|.
423       InducedSort(str, length, sl_partition, lms_indices, buckets,
424                   suffix_array);
425     }
426 
427     Implementation() = delete;
428     Implementation(const Implementation&) = delete;
429     const Implementation& operator=(const Implementation&) = delete;
430   };
431 };
432 
433 // Generates a sorted suffix array for the input string |str| using the functor
434 // |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
435 /// Characters found in |str| are assumed to be in range [0, |key_bound|).
436 // Returns the suffix array as a vector.
437 // |StrRng| is an input random access range.
438 // |KeyType| is an unsigned integer type.
439 template <class Algorithm, class StrRng, class KeyType>
MakeSuffixArray(const StrRng & str,KeyType key_bound)440 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
441                                                         KeyType key_bound) {
442   Algorithm sort;
443   std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
444   sort(str, key_bound, suffix_array.begin());
445   return suffix_array;
446 }
447 
448 // Type requirements:
449 // |SARng| is an input random access range.
450 // |StrIt1| is a random access iterator.
451 // |StrIt2| is a forward iterator.
452 template <class SARng, class StrIt1, class StrIt2>
453 // Lexicographical lower bound using binary search for
454 // [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string
455 // starting at |str1_first|. This does not necessarily return the index of
456 // the longest matching substring.
457 auto SuffixLowerBound(const SARng& suffix_array,
458                       StrIt1 str1_first,
459                       StrIt2 str2_first,
460                       StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
461   using size_type = typename SARng::value_type;
462 
463   size_t n = std::end(suffix_array) - std::begin(suffix_array);
464   auto it = std::lower_bound(
465       std::begin(suffix_array), std::end(suffix_array), str2_first,
466       [str1_first, str2_last, n](size_type a, StrIt2 b) {
467         return std::lexicographical_compare(str1_first + a, str1_first + n, b,
468                                             str2_last);
469       });
470   return it;
471 }
472 
473 }  // namespace zucchini
474 
475 #endif  // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
476