1*a03ca8b9SKrzysztof Kosiński // Copyright 2017 The Chromium Authors. All rights reserved.
2*a03ca8b9SKrzysztof Kosiński // Use of this source code is governed by a BSD-style license that can be
3*a03ca8b9SKrzysztof Kosiński // found in the LICENSE file.
4*a03ca8b9SKrzysztof Kosiński
5*a03ca8b9SKrzysztof Kosiński #ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
6*a03ca8b9SKrzysztof Kosiński #define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
7*a03ca8b9SKrzysztof Kosiński
8*a03ca8b9SKrzysztof Kosiński #include <algorithm>
9*a03ca8b9SKrzysztof Kosiński #include <iterator>
10*a03ca8b9SKrzysztof Kosiński #include <numeric>
11*a03ca8b9SKrzysztof Kosiński #include <vector>
12*a03ca8b9SKrzysztof Kosiński
13*a03ca8b9SKrzysztof Kosiński #include "base/check.h"
14*a03ca8b9SKrzysztof Kosiński
15*a03ca8b9SKrzysztof Kosiński namespace zucchini {
16*a03ca8b9SKrzysztof Kosiński
17*a03ca8b9SKrzysztof Kosiński // A functor class that implements the naive suffix sorting algorithm that uses
18*a03ca8b9SKrzysztof Kosiński // std::sort with lexicographical compare. This is only meant as reference of
19*a03ca8b9SKrzysztof Kosiński // the interface.
20*a03ca8b9SKrzysztof Kosiński class NaiveSuffixSort {
21*a03ca8b9SKrzysztof Kosiński public:
22*a03ca8b9SKrzysztof Kosiński // Type requirements:
23*a03ca8b9SKrzysztof Kosiński // |InputRng| is an input random access range.
24*a03ca8b9SKrzysztof Kosiński // |KeyType| is an unsigned integer type.
25*a03ca8b9SKrzysztof Kosiński // |SAIt| is a random access iterator with mutable references.
26*a03ca8b9SKrzysztof Kosiński template <class InputRng, class KeyType, class SAIt>
27*a03ca8b9SKrzysztof Kosiński // |str| is the input string on which suffix sort is applied.
28*a03ca8b9SKrzysztof Kosiński // Characters found in |str| must be in the range [0, |key_bound|)
29*a03ca8b9SKrzysztof Kosiński // |suffix_array| is the beginning of the destination range, which is at least
30*a03ca8b9SKrzysztof Kosiński // as large as |str|.
operator()31*a03ca8b9SKrzysztof Kosiński void operator()(const InputRng& str,
32*a03ca8b9SKrzysztof Kosiński KeyType key_bound,
33*a03ca8b9SKrzysztof Kosiński SAIt suffix_array) const {
34*a03ca8b9SKrzysztof Kosiński using size_type = typename SAIt::value_type;
35*a03ca8b9SKrzysztof Kosiński
36*a03ca8b9SKrzysztof Kosiński size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
37*a03ca8b9SKrzysztof Kosiński
38*a03ca8b9SKrzysztof Kosiński // |suffix_array| is first filled with ordered indices of |str|.
39*a03ca8b9SKrzysztof Kosiński // Those indices are then sorted with lexicographical comparisons in |str|.
40*a03ca8b9SKrzysztof Kosiński std::iota(suffix_array, suffix_array + n, 0);
41*a03ca8b9SKrzysztof Kosiński std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
42*a03ca8b9SKrzysztof Kosiński return std::lexicographical_compare(std::begin(str) + i, std::end(str),
43*a03ca8b9SKrzysztof Kosiński std::begin(str) + j, std::end(str));
44*a03ca8b9SKrzysztof Kosiński });
45*a03ca8b9SKrzysztof Kosiński }
46*a03ca8b9SKrzysztof Kosiński };
47*a03ca8b9SKrzysztof Kosiński
48*a03ca8b9SKrzysztof Kosiński // A functor class that implements suffix array induced sorting (SA-IS)
49*a03ca8b9SKrzysztof Kosiński // algorithm with linear time and memory complexity,
50*a03ca8b9SKrzysztof Kosiński // see http://ieeexplore.ieee.org/abstract/document/5582081/
51*a03ca8b9SKrzysztof Kosiński class InducedSuffixSort {
52*a03ca8b9SKrzysztof Kosiński public:
53*a03ca8b9SKrzysztof Kosiński // Type requirements:
54*a03ca8b9SKrzysztof Kosiński // |InputRng| is an input random access range.
55*a03ca8b9SKrzysztof Kosiński // |KeyType| is an unsigned integer type.
56*a03ca8b9SKrzysztof Kosiński // |SAIt| is a random access iterator with mutable values.
57*a03ca8b9SKrzysztof Kosiński template <class InputRng, class KeyType, class SAIt>
58*a03ca8b9SKrzysztof Kosiński // |str| is the input string on which suffix sort is applied.
59*a03ca8b9SKrzysztof Kosiński // Characters found in |str| must be in the range [0, |key_bound|)
60*a03ca8b9SKrzysztof Kosiński // |suffix_array| is the beginning of the destination range, which is at least
61*a03ca8b9SKrzysztof Kosiński // as large as |str|.
operator()62*a03ca8b9SKrzysztof Kosiński void operator()(const InputRng& str,
63*a03ca8b9SKrzysztof Kosiński KeyType key_bound,
64*a03ca8b9SKrzysztof Kosiński SAIt suffix_array) const {
65*a03ca8b9SKrzysztof Kosiński using value_type = typename InputRng::value_type;
66*a03ca8b9SKrzysztof Kosiński using size_type = typename SAIt::value_type;
67*a03ca8b9SKrzysztof Kosiński
68*a03ca8b9SKrzysztof Kosiński static_assert(std::is_unsigned<value_type>::value,
69*a03ca8b9SKrzysztof Kosiński "SA-IS only supports input string with unsigned values");
70*a03ca8b9SKrzysztof Kosiński static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
71*a03ca8b9SKrzysztof Kosiński
72*a03ca8b9SKrzysztof Kosiński size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
73*a03ca8b9SKrzysztof Kosiński
74*a03ca8b9SKrzysztof Kosiński Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
75*a03ca8b9SKrzysztof Kosiński key_bound, suffix_array);
76*a03ca8b9SKrzysztof Kosiński }
77*a03ca8b9SKrzysztof Kosiński
78*a03ca8b9SKrzysztof Kosiński // Given string S of length n. We assume S is terminated by a unique sentinel
79*a03ca8b9SKrzysztof Kosiński // $, which is considered as the smallest character. This sentinel does not
80*a03ca8b9SKrzysztof Kosiński // exist in memory and is only treated implicitly, hence |n| does not count
81*a03ca8b9SKrzysztof Kosiński // the sentinel in this implementation. We denote suf(S,i) the suffix formed
82*a03ca8b9SKrzysztof Kosiński // by S[i..n).
83*a03ca8b9SKrzysztof Kosiński
84*a03ca8b9SKrzysztof Kosiński // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
85*a03ca8b9SKrzysztof Kosiński // or suf(S,i) > suf(S,i+1), respectively.
86*a03ca8b9SKrzysztof Kosiński enum SLType : bool { SType, LType };
87*a03ca8b9SKrzysztof Kosiński
88*a03ca8b9SKrzysztof Kosiński // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
89*a03ca8b9SKrzysztof Kosiński // S-type or L-type, respectively.
90*a03ca8b9SKrzysztof Kosiński
91*a03ca8b9SKrzysztof Kosiński // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
92*a03ca8b9SKrzysztof Kosiński // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
93*a03ca8b9SKrzysztof Kosiński // character.
94*a03ca8b9SKrzysztof Kosiński
95*a03ca8b9SKrzysztof Kosiński // A substring S[i..j) is an LMS-substring if
96*a03ca8b9SKrzysztof Kosiński // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
97*a03ca8b9SKrzysztof Kosiński // LMS characters, or
98*a03ca8b9SKrzysztof Kosiński // (2) S[i..j) is the sentinel $.
99*a03ca8b9SKrzysztof Kosiński
100*a03ca8b9SKrzysztof Kosiński template <class SizeType, class KeyType>
101*a03ca8b9SKrzysztof Kosiński struct Implementation {
102*a03ca8b9SKrzysztof Kosiński static_assert(std::is_unsigned<SizeType>::value,
103*a03ca8b9SKrzysztof Kosiński "SizeType must be unsigned");
104*a03ca8b9SKrzysztof Kosiński static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
105*a03ca8b9SKrzysztof Kosiński using size_type = SizeType;
106*a03ca8b9SKrzysztof Kosiński using key_type = KeyType;
107*a03ca8b9SKrzysztof Kosiński
108*a03ca8b9SKrzysztof Kosiński using iterator = typename std::vector<size_type>::iterator;
109*a03ca8b9SKrzysztof Kosiński using const_iterator = typename std::vector<size_type>::const_iterator;
110*a03ca8b9SKrzysztof Kosiński
111*a03ca8b9SKrzysztof Kosiński // Partition every suffix based on SL-type. Returns the number of LMS
112*a03ca8b9SKrzysztof Kosiński // suffixes.
113*a03ca8b9SKrzysztof Kosiński template <class StrIt>
BuildSLPartitionImplementation114*a03ca8b9SKrzysztof Kosiński static size_type BuildSLPartition(
115*a03ca8b9SKrzysztof Kosiński StrIt str,
116*a03ca8b9SKrzysztof Kosiński size_type length,
117*a03ca8b9SKrzysztof Kosiński key_type key_bound,
118*a03ca8b9SKrzysztof Kosiński std::vector<SLType>::reverse_iterator sl_partition_it) {
119*a03ca8b9SKrzysztof Kosiński // We will count LMS suffixes (S to L-type or last S-type).
120*a03ca8b9SKrzysztof Kosiński size_type lms_count = 0;
121*a03ca8b9SKrzysztof Kosiński
122*a03ca8b9SKrzysztof Kosiński // |previous_type| is initialized to L-type to avoid counting an extra
123*a03ca8b9SKrzysztof Kosiński // LMS suffix at the end
124*a03ca8b9SKrzysztof Kosiński SLType previous_type = LType;
125*a03ca8b9SKrzysztof Kosiński
126*a03ca8b9SKrzysztof Kosiński // Initialized to dummy, impossible key.
127*a03ca8b9SKrzysztof Kosiński key_type previous_key = key_bound;
128*a03ca8b9SKrzysztof Kosiński
129*a03ca8b9SKrzysztof Kosiński // We're travelling backward to determine the partition,
130*a03ca8b9SKrzysztof Kosiński // as if we prepend one character at a time to the string, ex:
131*a03ca8b9SKrzysztof Kosiński // b$ is L-type because b > $.
132*a03ca8b9SKrzysztof Kosiński // ab$ is S-type because a < b, implying ab$ < b$.
133*a03ca8b9SKrzysztof Kosiński // bab$ is L-type because b > a, implying bab$ > ab$.
134*a03ca8b9SKrzysztof Kosiński // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
135*a03ca8b9SKrzysztof Kosiński for (auto str_it = std::reverse_iterator<StrIt>(str + length);
136*a03ca8b9SKrzysztof Kosiński str_it != std::reverse_iterator<StrIt>(str);
137*a03ca8b9SKrzysztof Kosiński ++str_it, ++sl_partition_it) {
138*a03ca8b9SKrzysztof Kosiński key_type current_key = *str_it;
139*a03ca8b9SKrzysztof Kosiński
140*a03ca8b9SKrzysztof Kosiński if (current_key > previous_key || previous_key == key_bound) {
141*a03ca8b9SKrzysztof Kosiński // S[i] > S[i + 1] or S[i] is last character.
142*a03ca8b9SKrzysztof Kosiński if (previous_type == SType)
143*a03ca8b9SKrzysztof Kosiński // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
144*a03ca8b9SKrzysztof Kosiński // suf(S,i+1) was a LMS suffix.
145*a03ca8b9SKrzysztof Kosiński ++lms_count;
146*a03ca8b9SKrzysztof Kosiński
147*a03ca8b9SKrzysztof Kosiński previous_type = LType; // For next round.
148*a03ca8b9SKrzysztof Kosiński } else if (current_key < previous_key) {
149*a03ca8b9SKrzysztof Kosiński // S[i] < S[i + 1]
150*a03ca8b9SKrzysztof Kosiński previous_type = SType; // For next round.
151*a03ca8b9SKrzysztof Kosiński }
152*a03ca8b9SKrzysztof Kosiński // Else, S[i] == S[i + 1]:
153*a03ca8b9SKrzysztof Kosiński // The next character that differs determines the SL-type,
154*a03ca8b9SKrzysztof Kosiński // so we reuse the last seen type.
155*a03ca8b9SKrzysztof Kosiński
156*a03ca8b9SKrzysztof Kosiński *sl_partition_it = previous_type;
157*a03ca8b9SKrzysztof Kosiński previous_key = current_key; // For next round.
158*a03ca8b9SKrzysztof Kosiński }
159*a03ca8b9SKrzysztof Kosiński
160*a03ca8b9SKrzysztof Kosiński return lms_count;
161*a03ca8b9SKrzysztof Kosiński }
162*a03ca8b9SKrzysztof Kosiński
163*a03ca8b9SKrzysztof Kosiński // Find indices of LMS suffixes and write result to |lms_indices|.
FindLmsSuffixesImplementation164*a03ca8b9SKrzysztof Kosiński static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
165*a03ca8b9SKrzysztof Kosiński iterator lms_indices) {
166*a03ca8b9SKrzysztof Kosiński // |previous_type| is initialized to S-type to avoid counting an extra
167*a03ca8b9SKrzysztof Kosiński // LMS suffix at the beginning
168*a03ca8b9SKrzysztof Kosiński SLType previous_type = SType;
169*a03ca8b9SKrzysztof Kosiński for (size_type i = 0; i < sl_partition.size(); ++i) {
170*a03ca8b9SKrzysztof Kosiński if (sl_partition[i] == SType && previous_type == LType)
171*a03ca8b9SKrzysztof Kosiński *lms_indices++ = i;
172*a03ca8b9SKrzysztof Kosiński previous_type = sl_partition[i];
173*a03ca8b9SKrzysztof Kosiński }
174*a03ca8b9SKrzysztof Kosiński }
175*a03ca8b9SKrzysztof Kosiński
176*a03ca8b9SKrzysztof Kosiński template <class StrIt>
MakeBucketCountImplementation177*a03ca8b9SKrzysztof Kosiński static std::vector<size_type> MakeBucketCount(StrIt str,
178*a03ca8b9SKrzysztof Kosiński size_type length,
179*a03ca8b9SKrzysztof Kosiński key_type key_bound) {
180*a03ca8b9SKrzysztof Kosiński // Occurrence of every unique character is counted in |buckets|
181*a03ca8b9SKrzysztof Kosiński std::vector<size_type> buckets(static_cast<size_type>(key_bound));
182*a03ca8b9SKrzysztof Kosiński
183*a03ca8b9SKrzysztof Kosiński for (auto it = str; it != str + length; ++it)
184*a03ca8b9SKrzysztof Kosiński ++buckets[*it];
185*a03ca8b9SKrzysztof Kosiński return buckets;
186*a03ca8b9SKrzysztof Kosiński }
187*a03ca8b9SKrzysztof Kosiński
188*a03ca8b9SKrzysztof Kosiński // Apply induced sort from |lms_indices| to |suffix_array| associated with
189*a03ca8b9SKrzysztof Kosiński // the string |str|.
190*a03ca8b9SKrzysztof Kosiński template <class StrIt, class SAIt>
InducedSortImplementation191*a03ca8b9SKrzysztof Kosiński static void InducedSort(StrIt str,
192*a03ca8b9SKrzysztof Kosiński size_type length,
193*a03ca8b9SKrzysztof Kosiński const std::vector<SLType>& sl_partition,
194*a03ca8b9SKrzysztof Kosiński const std::vector<size_type>& lms_indices,
195*a03ca8b9SKrzysztof Kosiński const std::vector<size_type>& buckets,
196*a03ca8b9SKrzysztof Kosiński SAIt suffix_array) {
197*a03ca8b9SKrzysztof Kosiński // All indices are first marked as unset with the illegal value |length|.
198*a03ca8b9SKrzysztof Kosiński std::fill(suffix_array, suffix_array + length, length);
199*a03ca8b9SKrzysztof Kosiński
200*a03ca8b9SKrzysztof Kosiński // Used to mark bucket boundaries (head or end) as indices in str.
201*a03ca8b9SKrzysztof Kosiński DCHECK(!buckets.empty());
202*a03ca8b9SKrzysztof Kosiński std::vector<size_type> bucket_bounds(buckets.size());
203*a03ca8b9SKrzysztof Kosiński
204*a03ca8b9SKrzysztof Kosiński // Step 1: Assign indices for LMS suffixes, populating the end of
205*a03ca8b9SKrzysztof Kosiński // respective buckets but keeping relative order.
206*a03ca8b9SKrzysztof Kosiński
207*a03ca8b9SKrzysztof Kosiński // Find the end of each bucket and write it to |bucket_bounds|.
208*a03ca8b9SKrzysztof Kosiński std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
209*a03ca8b9SKrzysztof Kosiński
210*a03ca8b9SKrzysztof Kosiński // Process each |lms_indices| backward, and assign them to the end of
211*a03ca8b9SKrzysztof Kosiński // their respective buckets, so relative order is preserved.
212*a03ca8b9SKrzysztof Kosiński for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
213*a03ca8b9SKrzysztof Kosiński key_type key = str[*it];
214*a03ca8b9SKrzysztof Kosiński suffix_array[--bucket_bounds[key]] = *it;
215*a03ca8b9SKrzysztof Kosiński }
216*a03ca8b9SKrzysztof Kosiński
217*a03ca8b9SKrzysztof Kosiński // Step 2
218*a03ca8b9SKrzysztof Kosiński // Scan forward |suffix_array|; for each modified suf(S,i) for which
219*a03ca8b9SKrzysztof Kosiński // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
220*a03ca8b9SKrzysztof Kosiński // head of the corresponding bucket and forward the bucket head to the
221*a03ca8b9SKrzysztof Kosiński // right.
222*a03ca8b9SKrzysztof Kosiński
223*a03ca8b9SKrzysztof Kosiński // Find the head of each bucket and write it to |bucket_bounds|. Since
224*a03ca8b9SKrzysztof Kosiński // only LMS suffixes where inserted in |suffix_array| during Step 1,
225*a03ca8b9SKrzysztof Kosiński // |bucket_bounds| does not contains the head of each bucket and needs to
226*a03ca8b9SKrzysztof Kosiński // be updated.
227*a03ca8b9SKrzysztof Kosiński bucket_bounds[0] = 0;
228*a03ca8b9SKrzysztof Kosiński std::partial_sum(buckets.begin(), buckets.end() - 1,
229*a03ca8b9SKrzysztof Kosiński bucket_bounds.begin() + 1);
230*a03ca8b9SKrzysztof Kosiński
231*a03ca8b9SKrzysztof Kosiński // From Step 1, the sentinel $, which we treat implicitly, would have
232*a03ca8b9SKrzysztof Kosiński // been placed at the beginning of |suffix_array|, since $ is always
233*a03ca8b9SKrzysztof Kosiński // considered as the smallest character. We then have to deal with the
234*a03ca8b9SKrzysztof Kosiński // previous (last) suffix.
235*a03ca8b9SKrzysztof Kosiński if (sl_partition[length - 1] == LType) {
236*a03ca8b9SKrzysztof Kosiński key_type key = str[length - 1];
237*a03ca8b9SKrzysztof Kosiński suffix_array[bucket_bounds[key]++] = length - 1;
238*a03ca8b9SKrzysztof Kosiński }
239*a03ca8b9SKrzysztof Kosiński for (auto it = suffix_array; it != suffix_array + length; ++it) {
240*a03ca8b9SKrzysztof Kosiński size_type suffix_index = *it;
241*a03ca8b9SKrzysztof Kosiński
242*a03ca8b9SKrzysztof Kosiński // While the original algorithm marks unset suffixes with -1,
243*a03ca8b9SKrzysztof Kosiński // we found that marking them with |length| is also possible and more
244*a03ca8b9SKrzysztof Kosiński // convenient because we are working with unsigned integers.
245*a03ca8b9SKrzysztof Kosiński if (suffix_index != length && suffix_index > 0 &&
246*a03ca8b9SKrzysztof Kosiński sl_partition[--suffix_index] == LType) {
247*a03ca8b9SKrzysztof Kosiński key_type key = str[suffix_index];
248*a03ca8b9SKrzysztof Kosiński suffix_array[bucket_bounds[key]++] = suffix_index;
249*a03ca8b9SKrzysztof Kosiński }
250*a03ca8b9SKrzysztof Kosiński }
251*a03ca8b9SKrzysztof Kosiński
252*a03ca8b9SKrzysztof Kosiński // Step 3
253*a03ca8b9SKrzysztof Kosiński // Scan backward |suffix_array|; for each modified suf(S, i) for which
254*a03ca8b9SKrzysztof Kosiński // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
255*a03ca8b9SKrzysztof Kosiński // end of the corresponding bucket and forward the bucket head to the
256*a03ca8b9SKrzysztof Kosiński // left.
257*a03ca8b9SKrzysztof Kosiński
258*a03ca8b9SKrzysztof Kosiński // Find the end of each bucket and write it to |bucket_bounds|. Since
259*a03ca8b9SKrzysztof Kosiński // only L-type suffixes where inserted in |suffix_array| during Step 2,
260*a03ca8b9SKrzysztof Kosiński // |bucket_bounds| does not contain the end of each bucket and needs to
261*a03ca8b9SKrzysztof Kosiński // be updated.
262*a03ca8b9SKrzysztof Kosiński std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
263*a03ca8b9SKrzysztof Kosiński
264*a03ca8b9SKrzysztof Kosiński for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
265*a03ca8b9SKrzysztof Kosiński it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
266*a03ca8b9SKrzysztof Kosiński size_type suffix_index = *it;
267*a03ca8b9SKrzysztof Kosiński if (suffix_index != length && suffix_index > 0 &&
268*a03ca8b9SKrzysztof Kosiński sl_partition[--suffix_index] == SType) {
269*a03ca8b9SKrzysztof Kosiński key_type key = str[suffix_index];
270*a03ca8b9SKrzysztof Kosiński suffix_array[--bucket_bounds[key]] = suffix_index;
271*a03ca8b9SKrzysztof Kosiński }
272*a03ca8b9SKrzysztof Kosiński }
273*a03ca8b9SKrzysztof Kosiński // Deals with the last suffix, because of the sentinel.
274*a03ca8b9SKrzysztof Kosiński if (sl_partition[length - 1] == SType) {
275*a03ca8b9SKrzysztof Kosiński key_type key = str[length - 1];
276*a03ca8b9SKrzysztof Kosiński suffix_array[--bucket_bounds[key]] = length - 1;
277*a03ca8b9SKrzysztof Kosiński }
278*a03ca8b9SKrzysztof Kosiński }
279*a03ca8b9SKrzysztof Kosiński
280*a03ca8b9SKrzysztof Kosiński // Given a string S starting at |str| with length |length|, an array
281*a03ca8b9SKrzysztof Kosiński // starting at |substring_array| containing lexicographically ordered LMS
282*a03ca8b9SKrzysztof Kosiński // terminated substring indices of S and an SL-Type partition |sl_partition|
283*a03ca8b9SKrzysztof Kosiński // of S, assigns a unique label to every unique LMS substring. The sorted
284*a03ca8b9SKrzysztof Kosiński // labels for all LMS substrings are written to |lms_str|, while the indices
285*a03ca8b9SKrzysztof Kosiński // of LMS suffixes are written to |lms_indices|. In addition, returns the
286*a03ca8b9SKrzysztof Kosiński // total number of unique labels.
287*a03ca8b9SKrzysztof Kosiński template <class StrIt, class SAIt>
LabelLmsSubstringsImplementation288*a03ca8b9SKrzysztof Kosiński static size_type LabelLmsSubstrings(StrIt str,
289*a03ca8b9SKrzysztof Kosiński size_type length,
290*a03ca8b9SKrzysztof Kosiński const std::vector<SLType>& sl_partition,
291*a03ca8b9SKrzysztof Kosiński SAIt suffix_array,
292*a03ca8b9SKrzysztof Kosiński iterator lms_indices,
293*a03ca8b9SKrzysztof Kosiński iterator lms_str) {
294*a03ca8b9SKrzysztof Kosiński // Labelling starts at 0.
295*a03ca8b9SKrzysztof Kosiński size_type label = 0;
296*a03ca8b9SKrzysztof Kosiński
297*a03ca8b9SKrzysztof Kosiński // |previous_lms| is initialized to 0 to indicate it is unset.
298*a03ca8b9SKrzysztof Kosiński // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
299*a03ca8b9SKrzysztof Kosiński // lexicographical order.
300*a03ca8b9SKrzysztof Kosiński size_type previous_lms = 0;
301*a03ca8b9SKrzysztof Kosiński for (auto it = suffix_array; it != suffix_array + length; ++it) {
302*a03ca8b9SKrzysztof Kosiński if (*it > 0 && sl_partition[*it] == SType &&
303*a03ca8b9SKrzysztof Kosiński sl_partition[*it - 1] == LType) {
304*a03ca8b9SKrzysztof Kosiński // suf(S, *it) is a LMS suffix.
305*a03ca8b9SKrzysztof Kosiński
306*a03ca8b9SKrzysztof Kosiński size_type current_lms = *it;
307*a03ca8b9SKrzysztof Kosiński if (previous_lms != 0) {
308*a03ca8b9SKrzysztof Kosiński // There was a previous LMS suffix. Check if the current LMS
309*a03ca8b9SKrzysztof Kosiński // substring is equal to the previous one.
310*a03ca8b9SKrzysztof Kosiński SLType current_lms_type = SType;
311*a03ca8b9SKrzysztof Kosiński SLType previous_lms_type = SType;
312*a03ca8b9SKrzysztof Kosiński for (size_type k = 0;; ++k) {
313*a03ca8b9SKrzysztof Kosiński // |current_lms_end| and |previous_lms_end| denote whether we have
314*a03ca8b9SKrzysztof Kosiński // reached the end of the current and previous LMS substring,
315*a03ca8b9SKrzysztof Kosiński // respectively
316*a03ca8b9SKrzysztof Kosiński bool current_lms_end = false;
317*a03ca8b9SKrzysztof Kosiński bool previous_lms_end = false;
318*a03ca8b9SKrzysztof Kosiński
319*a03ca8b9SKrzysztof Kosiński // Check for both previous and current substring ends.
320*a03ca8b9SKrzysztof Kosiński // Note that it is more convenient to check if
321*a03ca8b9SKrzysztof Kosiński // suf(S,current_lms + k) is an LMS suffix than to retrieve it
322*a03ca8b9SKrzysztof Kosiński // from lms_indices.
323*a03ca8b9SKrzysztof Kosiński if (current_lms + k >= length ||
324*a03ca8b9SKrzysztof Kosiński (current_lms_type == LType &&
325*a03ca8b9SKrzysztof Kosiński sl_partition[current_lms + k] == SType)) {
326*a03ca8b9SKrzysztof Kosiński current_lms_end = true;
327*a03ca8b9SKrzysztof Kosiński }
328*a03ca8b9SKrzysztof Kosiński if (previous_lms + k >= length ||
329*a03ca8b9SKrzysztof Kosiński (previous_lms_type == LType &&
330*a03ca8b9SKrzysztof Kosiński sl_partition[previous_lms + k] == SType)) {
331*a03ca8b9SKrzysztof Kosiński previous_lms_end = true;
332*a03ca8b9SKrzysztof Kosiński }
333*a03ca8b9SKrzysztof Kosiński
334*a03ca8b9SKrzysztof Kosiński if (current_lms_end && previous_lms_end) {
335*a03ca8b9SKrzysztof Kosiński break; // Previous and current substrings are identical.
336*a03ca8b9SKrzysztof Kosiński } else if (current_lms_end != previous_lms_end ||
337*a03ca8b9SKrzysztof Kosiński str[current_lms + k] != str[previous_lms + k]) {
338*a03ca8b9SKrzysztof Kosiński // Previous and current substrings differ, a new label is used.
339*a03ca8b9SKrzysztof Kosiński ++label;
340*a03ca8b9SKrzysztof Kosiński break;
341*a03ca8b9SKrzysztof Kosiński }
342*a03ca8b9SKrzysztof Kosiński
343*a03ca8b9SKrzysztof Kosiński current_lms_type = sl_partition[current_lms + k];
344*a03ca8b9SKrzysztof Kosiński previous_lms_type = sl_partition[previous_lms + k];
345*a03ca8b9SKrzysztof Kosiński }
346*a03ca8b9SKrzysztof Kosiński }
347*a03ca8b9SKrzysztof Kosiński *lms_indices++ = *it;
348*a03ca8b9SKrzysztof Kosiński *lms_str++ = label;
349*a03ca8b9SKrzysztof Kosiński previous_lms = current_lms;
350*a03ca8b9SKrzysztof Kosiński }
351*a03ca8b9SKrzysztof Kosiński }
352*a03ca8b9SKrzysztof Kosiński
353*a03ca8b9SKrzysztof Kosiński return label + 1;
354*a03ca8b9SKrzysztof Kosiński }
355*a03ca8b9SKrzysztof Kosiński
356*a03ca8b9SKrzysztof Kosiński // Implementation of the SA-IS algorithm. |str| must be a random access
357*a03ca8b9SKrzysztof Kosiński // iterator pointing at the beginning of S with length |length|. The result
358*a03ca8b9SKrzysztof Kosiński // is writtend in |suffix_array|, a random access iterator.
359*a03ca8b9SKrzysztof Kosiński template <class StrIt, class SAIt>
SuffixSortImplementation360*a03ca8b9SKrzysztof Kosiński static void SuffixSort(StrIt str,
361*a03ca8b9SKrzysztof Kosiński size_type length,
362*a03ca8b9SKrzysztof Kosiński key_type key_bound,
363*a03ca8b9SKrzysztof Kosiński SAIt suffix_array) {
364*a03ca8b9SKrzysztof Kosiński if (length == 1)
365*a03ca8b9SKrzysztof Kosiński *suffix_array = 0;
366*a03ca8b9SKrzysztof Kosiński if (length < 2)
367*a03ca8b9SKrzysztof Kosiński return;
368*a03ca8b9SKrzysztof Kosiński
369*a03ca8b9SKrzysztof Kosiński std::vector<SLType> sl_partition(length);
370*a03ca8b9SKrzysztof Kosiński size_type lms_count =
371*a03ca8b9SKrzysztof Kosiński BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
372*a03ca8b9SKrzysztof Kosiński std::vector<size_type> lms_indices(lms_count);
373*a03ca8b9SKrzysztof Kosiński FindLmsSuffixes(sl_partition, lms_indices.begin());
374*a03ca8b9SKrzysztof Kosiński std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);
375*a03ca8b9SKrzysztof Kosiński
376*a03ca8b9SKrzysztof Kosiński if (lms_indices.size() > 1) {
377*a03ca8b9SKrzysztof Kosiński // Given |lms_indices| in the same order they appear in |str|, induce
378*a03ca8b9SKrzysztof Kosiński // LMS substrings relative order and write result to |suffix_array|.
379*a03ca8b9SKrzysztof Kosiński InducedSort(str, length, sl_partition, lms_indices, buckets,
380*a03ca8b9SKrzysztof Kosiński suffix_array);
381*a03ca8b9SKrzysztof Kosiński std::vector<size_type> lms_str(lms_indices.size());
382*a03ca8b9SKrzysztof Kosiński
383*a03ca8b9SKrzysztof Kosiński // Given LMS substrings in relative order found in |suffix_array|,
384*a03ca8b9SKrzysztof Kosiński // map LMS substrings to unique labels to form a new string, |lms_str|.
385*a03ca8b9SKrzysztof Kosiński size_type label_count =
386*a03ca8b9SKrzysztof Kosiński LabelLmsSubstrings(str, length, sl_partition, suffix_array,
387*a03ca8b9SKrzysztof Kosiński lms_indices.begin(), lms_str.begin());
388*a03ca8b9SKrzysztof Kosiński
389*a03ca8b9SKrzysztof Kosiński if (label_count < lms_str.size()) {
390*a03ca8b9SKrzysztof Kosiński // Reorder |lms_str| to have LMS suffixes in the same order they
391*a03ca8b9SKrzysztof Kosiński // appear in |str|.
392*a03ca8b9SKrzysztof Kosiński for (size_type i = 0; i < lms_indices.size(); ++i)
393*a03ca8b9SKrzysztof Kosiński suffix_array[lms_indices[i]] = lms_str[i];
394*a03ca8b9SKrzysztof Kosiński
395*a03ca8b9SKrzysztof Kosiński SLType previous_type = SType;
396*a03ca8b9SKrzysztof Kosiński for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
397*a03ca8b9SKrzysztof Kosiński if (sl_partition[i] == SType && previous_type == LType) {
398*a03ca8b9SKrzysztof Kosiński lms_str[j] = suffix_array[i];
399*a03ca8b9SKrzysztof Kosiński lms_indices[j++] = i;
400*a03ca8b9SKrzysztof Kosiński }
401*a03ca8b9SKrzysztof Kosiński previous_type = sl_partition[i];
402*a03ca8b9SKrzysztof Kosiński }
403*a03ca8b9SKrzysztof Kosiński
404*a03ca8b9SKrzysztof Kosiński // Recursively apply SuffixSort on |lms_str|, which is formed from
405*a03ca8b9SKrzysztof Kosiński // labeled LMS suffixes in the same order they appear in |str|.
406*a03ca8b9SKrzysztof Kosiński // Note that |KeyType| will be size_type because |lms_str| contains
407*a03ca8b9SKrzysztof Kosiński // indices. |lms_str| is at most half the length of |str|.
408*a03ca8b9SKrzysztof Kosiński Implementation<size_type, size_type>::SuffixSort(
409*a03ca8b9SKrzysztof Kosiński lms_str.begin(), static_cast<size_type>(lms_str.size()),
410*a03ca8b9SKrzysztof Kosiński label_count, suffix_array);
411*a03ca8b9SKrzysztof Kosiński
412*a03ca8b9SKrzysztof Kosiński // Map LMS labels back to indices in |str| and write result to
413*a03ca8b9SKrzysztof Kosiński // |lms_indices|. We're using |suffix_array| as a temporary buffer.
414*a03ca8b9SKrzysztof Kosiński for (size_type i = 0; i < lms_indices.size(); ++i)
415*a03ca8b9SKrzysztof Kosiński suffix_array[i] = lms_indices[suffix_array[i]];
416*a03ca8b9SKrzysztof Kosiński std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());
417*a03ca8b9SKrzysztof Kosiński
418*a03ca8b9SKrzysztof Kosiński // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
419*a03ca8b9SKrzysztof Kosiński }
420*a03ca8b9SKrzysztof Kosiński }
421*a03ca8b9SKrzysztof Kosiński // Given |lms_indices| where LMS suffixes are sorted, induce the full
422*a03ca8b9SKrzysztof Kosiński // order of suffixes in |str|.
423*a03ca8b9SKrzysztof Kosiński InducedSort(str, length, sl_partition, lms_indices, buckets,
424*a03ca8b9SKrzysztof Kosiński suffix_array);
425*a03ca8b9SKrzysztof Kosiński }
426*a03ca8b9SKrzysztof Kosiński
427*a03ca8b9SKrzysztof Kosiński Implementation() = delete;
428*a03ca8b9SKrzysztof Kosiński Implementation(const Implementation&) = delete;
429*a03ca8b9SKrzysztof Kosiński const Implementation& operator=(const Implementation&) = delete;
430*a03ca8b9SKrzysztof Kosiński };
431*a03ca8b9SKrzysztof Kosiński };
432*a03ca8b9SKrzysztof Kosiński
433*a03ca8b9SKrzysztof Kosiński // Generates a sorted suffix array for the input string |str| using the functor
434*a03ca8b9SKrzysztof Kosiński // |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
435*a03ca8b9SKrzysztof Kosiński /// Characters found in |str| are assumed to be in range [0, |key_bound|).
436*a03ca8b9SKrzysztof Kosiński // Returns the suffix array as a vector.
437*a03ca8b9SKrzysztof Kosiński // |StrRng| is an input random access range.
438*a03ca8b9SKrzysztof Kosiński // |KeyType| is an unsigned integer type.
439*a03ca8b9SKrzysztof Kosiński template <class Algorithm, class StrRng, class KeyType>
MakeSuffixArray(const StrRng & str,KeyType key_bound)440*a03ca8b9SKrzysztof Kosiński std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
441*a03ca8b9SKrzysztof Kosiński KeyType key_bound) {
442*a03ca8b9SKrzysztof Kosiński Algorithm sort;
443*a03ca8b9SKrzysztof Kosiński std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
444*a03ca8b9SKrzysztof Kosiński sort(str, key_bound, suffix_array.begin());
445*a03ca8b9SKrzysztof Kosiński return suffix_array;
446*a03ca8b9SKrzysztof Kosiński }
447*a03ca8b9SKrzysztof Kosiński
448*a03ca8b9SKrzysztof Kosiński // Type requirements:
449*a03ca8b9SKrzysztof Kosiński // |SARng| is an input random access range.
450*a03ca8b9SKrzysztof Kosiński // |StrIt1| is a random access iterator.
451*a03ca8b9SKrzysztof Kosiński // |StrIt2| is a forward iterator.
452*a03ca8b9SKrzysztof Kosiński template <class SARng, class StrIt1, class StrIt2>
453*a03ca8b9SKrzysztof Kosiński // Lexicographical lower bound using binary search for
454*a03ca8b9SKrzysztof Kosiński // [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string
455*a03ca8b9SKrzysztof Kosiński // starting at |str1_first|. This does not necessarily return the index of
456*a03ca8b9SKrzysztof Kosiński // the longest matching substring.
457*a03ca8b9SKrzysztof Kosiński auto SuffixLowerBound(const SARng& suffix_array,
458*a03ca8b9SKrzysztof Kosiński StrIt1 str1_first,
459*a03ca8b9SKrzysztof Kosiński StrIt2 str2_first,
460*a03ca8b9SKrzysztof Kosiński StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
461*a03ca8b9SKrzysztof Kosiński using size_type = typename SARng::value_type;
462*a03ca8b9SKrzysztof Kosiński
463*a03ca8b9SKrzysztof Kosiński size_t n = std::end(suffix_array) - std::begin(suffix_array);
464*a03ca8b9SKrzysztof Kosiński auto it = std::lower_bound(
465*a03ca8b9SKrzysztof Kosiński std::begin(suffix_array), std::end(suffix_array), str2_first,
466*a03ca8b9SKrzysztof Kosiński [str1_first, str2_last, n](size_type a, StrIt2 b) {
467*a03ca8b9SKrzysztof Kosiński return std::lexicographical_compare(str1_first + a, str1_first + n, b,
468*a03ca8b9SKrzysztof Kosiński str2_last);
469*a03ca8b9SKrzysztof Kosiński });
470*a03ca8b9SKrzysztof Kosiński return it;
471*a03ca8b9SKrzysztof Kosiński }
472*a03ca8b9SKrzysztof Kosiński
473*a03ca8b9SKrzysztof Kosiński } // namespace zucchini
474*a03ca8b9SKrzysztof Kosiński
475*a03ca8b9SKrzysztof Kosiński #endif // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
476