xref: /aosp_15_r20/external/bsdiff/suffix_array_index.cc (revision a3a45f308bd90ef1a6e6a5e8fb92fe449b895909)
1*a3a45f30SXin Li // Copyright 2017 The Chromium OS Authors. All rights reserved.
2*a3a45f30SXin Li // Use of this source code is governed by a BSD-style license that can be
3*a3a45f30SXin Li // found in the LICENSE file.
4*a3a45f30SXin Li 
5*a3a45f30SXin Li #include "bsdiff/suffix_array_index.h"
6*a3a45f30SXin Li 
7*a3a45f30SXin Li #include <algorithm>
8*a3a45f30SXin Li #include <limits>
9*a3a45f30SXin Li #include <vector>
10*a3a45f30SXin Li 
11*a3a45f30SXin Li #include <divsufsort.h>
12*a3a45f30SXin Li #include <divsufsort64.h>
13*a3a45f30SXin Li 
14*a3a45f30SXin Li #include "bsdiff/logging.h"
15*a3a45f30SXin Li 
16*a3a45f30SXin Li namespace {
17*a3a45f30SXin Li 
18*a3a45f30SXin Li // libdivsufsort C++ overloaded functions used to allow calling the right
19*a3a45f30SXin Li // implementation based on the pointer size.
CallDivSufSort(const uint8_t * text,saidx_t * sa,size_t n)20*a3a45f30SXin Li int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
21*a3a45f30SXin Li   return divsufsort(text, sa, n);
22*a3a45f30SXin Li }
CallDivSufSort(const uint8_t * text,saidx64_t * sa,size_t n)23*a3a45f30SXin Li int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
24*a3a45f30SXin Li   return divsufsort64(text, sa, n);
25*a3a45f30SXin Li }
26*a3a45f30SXin Li 
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx_t * sa,size_t sa_size,saidx_t * left)27*a3a45f30SXin Li saidx_t CallSaSearch(const uint8_t* text,
28*a3a45f30SXin Li                      size_t text_size,
29*a3a45f30SXin Li                      const uint8_t* pattern,
30*a3a45f30SXin Li                      size_t pattern_size,
31*a3a45f30SXin Li                      const saidx_t* sa,
32*a3a45f30SXin Li                      size_t sa_size,
33*a3a45f30SXin Li                      saidx_t* left) {
34*a3a45f30SXin Li   return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
35*a3a45f30SXin Li }
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx64_t * sa,size_t sa_size,saidx64_t * left)36*a3a45f30SXin Li saidx64_t CallSaSearch(const uint8_t* text,
37*a3a45f30SXin Li                        size_t text_size,
38*a3a45f30SXin Li                        const uint8_t* pattern,
39*a3a45f30SXin Li                        size_t pattern_size,
40*a3a45f30SXin Li                        const saidx64_t* sa,
41*a3a45f30SXin Li                        size_t sa_size,
42*a3a45f30SXin Li                        saidx64_t* left) {
43*a3a45f30SXin Li   return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
44*a3a45f30SXin Li }
45*a3a45f30SXin Li 
46*a3a45f30SXin Li }  // namespace
47*a3a45f30SXin Li 
48*a3a45f30SXin Li namespace bsdiff {
49*a3a45f30SXin Li 
50*a3a45f30SXin Li // The SAIDX template type must be either saidx_t or saidx64_t, which will
51*a3a45f30SXin Li // depend on the maximum size of the input text needed.
52*a3a45f30SXin Li template <typename SAIDX>
53*a3a45f30SXin Li class SuffixArrayIndex : public SuffixArrayIndexInterface {
54*a3a45f30SXin Li  public:
55*a3a45f30SXin Li   SuffixArrayIndex() = default;
56*a3a45f30SXin Li 
57*a3a45f30SXin Li   // Initialize and construct the suffix array of the |text| string of length
58*a3a45f30SXin Li   // |n|. The memory pointed by |text| must be kept alive. Returns whether the
59*a3a45f30SXin Li   // construction succeeded.
60*a3a45f30SXin Li   bool Init(const uint8_t* text, size_t n);
61*a3a45f30SXin Li 
62*a3a45f30SXin Li   // SuffixArrayIndexInterface overrides.
63*a3a45f30SXin Li   void SearchPrefix(const uint8_t* target,
64*a3a45f30SXin Li                     size_t length,
65*a3a45f30SXin Li                     size_t* out_length,
66*a3a45f30SXin Li                     uint64_t* out_pos) const override;
67*a3a45f30SXin Li 
68*a3a45f30SXin Li  private:
69*a3a45f30SXin Li   const uint8_t* text_{nullptr};  // Owned by the caller.
70*a3a45f30SXin Li   size_t n_{0};
71*a3a45f30SXin Li 
72*a3a45f30SXin Li   std::vector<SAIDX> sa_;
73*a3a45f30SXin Li };
74*a3a45f30SXin Li 
75*a3a45f30SXin Li template <typename SAIDX>
Init(const uint8_t * text,size_t n)76*a3a45f30SXin Li bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
77*a3a45f30SXin Li   if (!sa_.empty()) {
78*a3a45f30SXin Li     // Already initialized.
79*a3a45f30SXin Li     LOG(ERROR) << "SuffixArray already initialized";
80*a3a45f30SXin Li     return false;
81*a3a45f30SXin Li   }
82*a3a45f30SXin Li   if (static_cast<uint64_t>(n) >
83*a3a45f30SXin Li       static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
84*a3a45f30SXin Li     LOG(ERROR) << "Input too big (" << n << ") for this implementation";
85*a3a45f30SXin Li     return false;
86*a3a45f30SXin Li   }
87*a3a45f30SXin Li   text_ = text;
88*a3a45f30SXin Li   n_ = n;
89*a3a45f30SXin Li   sa_.resize(n + 1);
90*a3a45f30SXin Li 
91*a3a45f30SXin Li   if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
92*a3a45f30SXin Li     LOG(ERROR) << "divsufsrot() failed";
93*a3a45f30SXin Li     return false;
94*a3a45f30SXin Li   }
95*a3a45f30SXin Li 
96*a3a45f30SXin Li   return true;
97*a3a45f30SXin Li }
98*a3a45f30SXin Li 
99*a3a45f30SXin Li template <typename SAIDX>
SearchPrefix(const uint8_t * target,size_t length,size_t * out_length,uint64_t * out_pos) const100*a3a45f30SXin Li void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
101*a3a45f30SXin Li                                            size_t length,
102*a3a45f30SXin Li                                            size_t* out_length,
103*a3a45f30SXin Li                                            uint64_t* out_pos) const {
104*a3a45f30SXin Li   SAIDX suf_left;
105*a3a45f30SXin Li   SAIDX count =
106*a3a45f30SXin Li       CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
107*a3a45f30SXin Li   if (count > 0) {
108*a3a45f30SXin Li     // This is the simple case where we found the whole |target| string was
109*a3a45f30SXin Li     // found.
110*a3a45f30SXin Li     *out_pos = sa_[suf_left];
111*a3a45f30SXin Li     *out_length = length;
112*a3a45f30SXin Li     return;
113*a3a45f30SXin Li   }
114*a3a45f30SXin Li   // In this case, |suf_left| points to the first suffix array position such
115*a3a45f30SXin Li   // that the suffix at that position is lexicographically larger than |target|.
116*a3a45f30SXin Li   // We only need to check whether the previous entry or the current entry is a
117*a3a45f30SXin Li   // longer match.
118*a3a45f30SXin Li   size_t prev_suffix_len = 0;
119*a3a45f30SXin Li   if (suf_left > 0) {
120*a3a45f30SXin Li     const size_t prev_max_len =
121*a3a45f30SXin Li         std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
122*a3a45f30SXin Li     const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
123*a3a45f30SXin Li     prev_suffix_len =
124*a3a45f30SXin Li         std::mismatch(target, target + prev_max_len, prev_suffix).first -
125*a3a45f30SXin Li         target;
126*a3a45f30SXin Li   }
127*a3a45f30SXin Li 
128*a3a45f30SXin Li   size_t next_suffix_len = 0;
129*a3a45f30SXin Li   if (static_cast<size_t>(suf_left) < n_) {
130*a3a45f30SXin Li     const uint8_t* next_suffix = text_ + sa_[suf_left];
131*a3a45f30SXin Li     const size_t next_max_len =
132*a3a45f30SXin Li         std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
133*a3a45f30SXin Li     next_suffix_len =
134*a3a45f30SXin Li         std::mismatch(target, target + next_max_len, next_suffix).first -
135*a3a45f30SXin Li         target;
136*a3a45f30SXin Li   }
137*a3a45f30SXin Li 
138*a3a45f30SXin Li   *out_length = std::max(next_suffix_len, prev_suffix_len);
139*a3a45f30SXin Li   if (!*out_length) {
140*a3a45f30SXin Li     *out_pos = 0;
141*a3a45f30SXin Li   } else if (next_suffix_len >= prev_suffix_len) {
142*a3a45f30SXin Li     *out_pos = sa_[suf_left];
143*a3a45f30SXin Li   } else {
144*a3a45f30SXin Li     *out_pos = sa_[suf_left - 1];
145*a3a45f30SXin Li   }
146*a3a45f30SXin Li }
147*a3a45f30SXin Li 
CreateSuffixArrayIndex(const uint8_t * text,size_t n)148*a3a45f30SXin Li std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
149*a3a45f30SXin Li     const uint8_t* text,
150*a3a45f30SXin Li     size_t n) {
151*a3a45f30SXin Li   // The maximum supported size when using the suffix array based on the 32-bit
152*a3a45f30SXin Li   // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
153*a3a45f30SXin Li   // than the maximum representable number so references like "n + 1" are don't
154*a3a45f30SXin Li   // overflow.
155*a3a45f30SXin Li   const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
156*a3a45f30SXin Li   std::unique_ptr<SuffixArrayIndexInterface> ret;
157*a3a45f30SXin Li 
158*a3a45f30SXin Li   if (n > kMaxSaidxSize) {
159*a3a45f30SXin Li     SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
160*a3a45f30SXin Li     ret.reset(sa_ptr);
161*a3a45f30SXin Li     if (!sa_ptr->Init(text, n))
162*a3a45f30SXin Li       return nullptr;
163*a3a45f30SXin Li   } else {
164*a3a45f30SXin Li     SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
165*a3a45f30SXin Li     ret.reset(sa_ptr);
166*a3a45f30SXin Li     if (!sa_ptr->Init(text, n))
167*a3a45f30SXin Li       return nullptr;
168*a3a45f30SXin Li   }
169*a3a45f30SXin Li   return ret;
170*a3a45f30SXin Li }
171*a3a45f30SXin Li 
172*a3a45f30SXin Li 
173*a3a45f30SXin Li }  // namespace bsdiff
174