1*a3a45f30SXin Li // Copyright 2017 The Chromium OS Authors. All rights reserved.
2*a3a45f30SXin Li // Use of this source code is governed by a BSD-style license that can be
3*a3a45f30SXin Li // found in the LICENSE file.
4*a3a45f30SXin Li
5*a3a45f30SXin Li #include "bsdiff/suffix_array_index.h"
6*a3a45f30SXin Li
7*a3a45f30SXin Li #include <algorithm>
8*a3a45f30SXin Li #include <limits>
9*a3a45f30SXin Li #include <vector>
10*a3a45f30SXin Li
11*a3a45f30SXin Li #include <divsufsort.h>
12*a3a45f30SXin Li #include <divsufsort64.h>
13*a3a45f30SXin Li
14*a3a45f30SXin Li #include "bsdiff/logging.h"
15*a3a45f30SXin Li
16*a3a45f30SXin Li namespace {
17*a3a45f30SXin Li
18*a3a45f30SXin Li // libdivsufsort C++ overloaded functions used to allow calling the right
19*a3a45f30SXin Li // implementation based on the pointer size.
CallDivSufSort(const uint8_t * text,saidx_t * sa,size_t n)20*a3a45f30SXin Li int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
21*a3a45f30SXin Li return divsufsort(text, sa, n);
22*a3a45f30SXin Li }
CallDivSufSort(const uint8_t * text,saidx64_t * sa,size_t n)23*a3a45f30SXin Li int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
24*a3a45f30SXin Li return divsufsort64(text, sa, n);
25*a3a45f30SXin Li }
26*a3a45f30SXin Li
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx_t * sa,size_t sa_size,saidx_t * left)27*a3a45f30SXin Li saidx_t CallSaSearch(const uint8_t* text,
28*a3a45f30SXin Li size_t text_size,
29*a3a45f30SXin Li const uint8_t* pattern,
30*a3a45f30SXin Li size_t pattern_size,
31*a3a45f30SXin Li const saidx_t* sa,
32*a3a45f30SXin Li size_t sa_size,
33*a3a45f30SXin Li saidx_t* left) {
34*a3a45f30SXin Li return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
35*a3a45f30SXin Li }
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx64_t * sa,size_t sa_size,saidx64_t * left)36*a3a45f30SXin Li saidx64_t CallSaSearch(const uint8_t* text,
37*a3a45f30SXin Li size_t text_size,
38*a3a45f30SXin Li const uint8_t* pattern,
39*a3a45f30SXin Li size_t pattern_size,
40*a3a45f30SXin Li const saidx64_t* sa,
41*a3a45f30SXin Li size_t sa_size,
42*a3a45f30SXin Li saidx64_t* left) {
43*a3a45f30SXin Li return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
44*a3a45f30SXin Li }
45*a3a45f30SXin Li
46*a3a45f30SXin Li } // namespace
47*a3a45f30SXin Li
48*a3a45f30SXin Li namespace bsdiff {
49*a3a45f30SXin Li
50*a3a45f30SXin Li // The SAIDX template type must be either saidx_t or saidx64_t, which will
51*a3a45f30SXin Li // depend on the maximum size of the input text needed.
52*a3a45f30SXin Li template <typename SAIDX>
53*a3a45f30SXin Li class SuffixArrayIndex : public SuffixArrayIndexInterface {
54*a3a45f30SXin Li public:
55*a3a45f30SXin Li SuffixArrayIndex() = default;
56*a3a45f30SXin Li
57*a3a45f30SXin Li // Initialize and construct the suffix array of the |text| string of length
58*a3a45f30SXin Li // |n|. The memory pointed by |text| must be kept alive. Returns whether the
59*a3a45f30SXin Li // construction succeeded.
60*a3a45f30SXin Li bool Init(const uint8_t* text, size_t n);
61*a3a45f30SXin Li
62*a3a45f30SXin Li // SuffixArrayIndexInterface overrides.
63*a3a45f30SXin Li void SearchPrefix(const uint8_t* target,
64*a3a45f30SXin Li size_t length,
65*a3a45f30SXin Li size_t* out_length,
66*a3a45f30SXin Li uint64_t* out_pos) const override;
67*a3a45f30SXin Li
68*a3a45f30SXin Li private:
69*a3a45f30SXin Li const uint8_t* text_{nullptr}; // Owned by the caller.
70*a3a45f30SXin Li size_t n_{0};
71*a3a45f30SXin Li
72*a3a45f30SXin Li std::vector<SAIDX> sa_;
73*a3a45f30SXin Li };
74*a3a45f30SXin Li
75*a3a45f30SXin Li template <typename SAIDX>
Init(const uint8_t * text,size_t n)76*a3a45f30SXin Li bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
77*a3a45f30SXin Li if (!sa_.empty()) {
78*a3a45f30SXin Li // Already initialized.
79*a3a45f30SXin Li LOG(ERROR) << "SuffixArray already initialized";
80*a3a45f30SXin Li return false;
81*a3a45f30SXin Li }
82*a3a45f30SXin Li if (static_cast<uint64_t>(n) >
83*a3a45f30SXin Li static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
84*a3a45f30SXin Li LOG(ERROR) << "Input too big (" << n << ") for this implementation";
85*a3a45f30SXin Li return false;
86*a3a45f30SXin Li }
87*a3a45f30SXin Li text_ = text;
88*a3a45f30SXin Li n_ = n;
89*a3a45f30SXin Li sa_.resize(n + 1);
90*a3a45f30SXin Li
91*a3a45f30SXin Li if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
92*a3a45f30SXin Li LOG(ERROR) << "divsufsrot() failed";
93*a3a45f30SXin Li return false;
94*a3a45f30SXin Li }
95*a3a45f30SXin Li
96*a3a45f30SXin Li return true;
97*a3a45f30SXin Li }
98*a3a45f30SXin Li
99*a3a45f30SXin Li template <typename SAIDX>
SearchPrefix(const uint8_t * target,size_t length,size_t * out_length,uint64_t * out_pos) const100*a3a45f30SXin Li void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
101*a3a45f30SXin Li size_t length,
102*a3a45f30SXin Li size_t* out_length,
103*a3a45f30SXin Li uint64_t* out_pos) const {
104*a3a45f30SXin Li SAIDX suf_left;
105*a3a45f30SXin Li SAIDX count =
106*a3a45f30SXin Li CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
107*a3a45f30SXin Li if (count > 0) {
108*a3a45f30SXin Li // This is the simple case where we found the whole |target| string was
109*a3a45f30SXin Li // found.
110*a3a45f30SXin Li *out_pos = sa_[suf_left];
111*a3a45f30SXin Li *out_length = length;
112*a3a45f30SXin Li return;
113*a3a45f30SXin Li }
114*a3a45f30SXin Li // In this case, |suf_left| points to the first suffix array position such
115*a3a45f30SXin Li // that the suffix at that position is lexicographically larger than |target|.
116*a3a45f30SXin Li // We only need to check whether the previous entry or the current entry is a
117*a3a45f30SXin Li // longer match.
118*a3a45f30SXin Li size_t prev_suffix_len = 0;
119*a3a45f30SXin Li if (suf_left > 0) {
120*a3a45f30SXin Li const size_t prev_max_len =
121*a3a45f30SXin Li std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
122*a3a45f30SXin Li const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
123*a3a45f30SXin Li prev_suffix_len =
124*a3a45f30SXin Li std::mismatch(target, target + prev_max_len, prev_suffix).first -
125*a3a45f30SXin Li target;
126*a3a45f30SXin Li }
127*a3a45f30SXin Li
128*a3a45f30SXin Li size_t next_suffix_len = 0;
129*a3a45f30SXin Li if (static_cast<size_t>(suf_left) < n_) {
130*a3a45f30SXin Li const uint8_t* next_suffix = text_ + sa_[suf_left];
131*a3a45f30SXin Li const size_t next_max_len =
132*a3a45f30SXin Li std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
133*a3a45f30SXin Li next_suffix_len =
134*a3a45f30SXin Li std::mismatch(target, target + next_max_len, next_suffix).first -
135*a3a45f30SXin Li target;
136*a3a45f30SXin Li }
137*a3a45f30SXin Li
138*a3a45f30SXin Li *out_length = std::max(next_suffix_len, prev_suffix_len);
139*a3a45f30SXin Li if (!*out_length) {
140*a3a45f30SXin Li *out_pos = 0;
141*a3a45f30SXin Li } else if (next_suffix_len >= prev_suffix_len) {
142*a3a45f30SXin Li *out_pos = sa_[suf_left];
143*a3a45f30SXin Li } else {
144*a3a45f30SXin Li *out_pos = sa_[suf_left - 1];
145*a3a45f30SXin Li }
146*a3a45f30SXin Li }
147*a3a45f30SXin Li
CreateSuffixArrayIndex(const uint8_t * text,size_t n)148*a3a45f30SXin Li std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
149*a3a45f30SXin Li const uint8_t* text,
150*a3a45f30SXin Li size_t n) {
151*a3a45f30SXin Li // The maximum supported size when using the suffix array based on the 32-bit
152*a3a45f30SXin Li // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
153*a3a45f30SXin Li // than the maximum representable number so references like "n + 1" are don't
154*a3a45f30SXin Li // overflow.
155*a3a45f30SXin Li const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
156*a3a45f30SXin Li std::unique_ptr<SuffixArrayIndexInterface> ret;
157*a3a45f30SXin Li
158*a3a45f30SXin Li if (n > kMaxSaidxSize) {
159*a3a45f30SXin Li SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
160*a3a45f30SXin Li ret.reset(sa_ptr);
161*a3a45f30SXin Li if (!sa_ptr->Init(text, n))
162*a3a45f30SXin Li return nullptr;
163*a3a45f30SXin Li } else {
164*a3a45f30SXin Li SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
165*a3a45f30SXin Li ret.reset(sa_ptr);
166*a3a45f30SXin Li if (!sa_ptr->Init(text, n))
167*a3a45f30SXin Li return nullptr;
168*a3a45f30SXin Li }
169*a3a45f30SXin Li return ret;
170*a3a45f30SXin Li }
171*a3a45f30SXin Li
172*a3a45f30SXin Li
173*a3a45f30SXin Li } // namespace bsdiff
174