1 /*!
2 An implementation of the [Two-Way substring search algorithm][two-way].
3 
4 [`Finder`] can be built for forward searches, while [`FinderRev`] can be built
5 for reverse searches.
6 
7 Two-Way makes for a nice general purpose substring search algorithm because of
8 its time and space complexity properties. It also performs well in practice.
9 Namely, with `m = len(needle)` and `n = len(haystack)`, Two-Way takes `O(m)`
10 time to create a finder, `O(1)` space and `O(n)` search time. In other words,
11 the preprocessing step is quick, doesn't require any heap memory and the worst
12 case search time is guaranteed to be linear in the haystack regardless of the
13 size of the needle.
14 
15 While vector algorithms will usually beat Two-Way handedly, vector algorithms
16 also usually have pathological or edge cases that are better handled by Two-Way.
17 Moreover, not all targets support vector algorithms or implementations for them
18 simply may not exist yet.
19 
20 Two-Way can be found in the `memmem` implementations in at least [GNU libc] and
21 [musl].
22 
23 [two-way]: https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm
24 [GNU libc]: https://www.gnu.org/software/libc/
25 [musl]: https://www.musl-libc.org/
26 */
27 
28 use core::cmp;
29 
30 use crate::{
31     arch::all::{is_prefix, is_suffix},
32     memmem::Pre,
33 };
34 
35 /// A forward substring searcher that uses the Two-Way algorithm.
36 #[derive(Clone, Copy, Debug)]
37 pub struct Finder(TwoWay);
38 
39 /// A reverse substring searcher that uses the Two-Way algorithm.
40 #[derive(Clone, Copy, Debug)]
41 pub struct FinderRev(TwoWay);
42 
43 /// An implementation of the TwoWay substring search algorithm.
44 ///
45 /// This searcher supports forward and reverse search, although not
46 /// simultaneously. It runs in `O(n + m)` time and `O(1)` space, where
47 /// `n ~ len(needle)` and `m ~ len(haystack)`.
48 ///
49 /// The implementation here roughly matches that which was developed by
50 /// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The
51 /// changes in this implementation are 1) the use of zero-based indices, 2) a
52 /// heuristic skip table based on the last byte (borrowed from Rust's standard
53 /// library) and 3) the addition of heuristics for a fast skip loop. For (3),
54 /// callers can pass any kind of prefilter they want, but usually it's one
55 /// based on a heuristic that uses an approximate background frequency of bytes
56 /// to choose rare bytes to quickly look for candidate match positions. Note
57 /// though that currently, this prefilter functionality is not exposed directly
58 /// in the public API. (File an issue if you want it and provide a use case
59 /// please.)
60 ///
61 /// The heuristic for fast skipping is automatically shut off if it's
62 /// detected to be ineffective at search time. Generally, this only occurs in
63 /// pathological cases. But this is generally necessary in order to preserve
64 /// a `O(n + m)` time bound.
65 ///
66 /// The code below is fairly complex and not obviously correct at all. It's
67 /// likely necessary to read the Two-Way paper cited above in order to fully
68 /// grok this code. The essence of it is:
69 ///
70 /// 1. Do something to detect a "critical" position in the needle.
71 /// 2. For the current position in the haystack, look if `needle[critical..]`
72 /// matches at that position.
73 /// 3. If so, look if `needle[..critical]` matches.
74 /// 4. If a mismatch occurs, shift the search by some amount based on the
75 /// critical position and a pre-computed shift.
76 ///
77 /// This type is wrapped in the forward and reverse finders that expose
78 /// consistent forward or reverse APIs.
79 #[derive(Clone, Copy, Debug)]
80 struct TwoWay {
81     /// A small bitset used as a quick prefilter (in addition to any prefilter
82     /// given by the caller). Namely, a bit `i` is set if and only if `b%64==i`
83     /// for any `b == needle[i]`.
84     ///
85     /// When used as a prefilter, if the last byte at the current candidate
86     /// position is NOT in this set, then we can skip that entire candidate
87     /// position (the length of the needle). This is essentially the shift
88     /// trick found in Boyer-Moore, but only applied to bytes that don't appear
89     /// in the needle.
90     ///
91     /// N.B. This trick was inspired by something similar in std's
92     /// implementation of Two-Way.
93     byteset: ApproximateByteSet,
94     /// A critical position in needle. Specifically, this position corresponds
95     /// to beginning of either the minimal or maximal suffix in needle. (N.B.
96     /// See SuffixType below for why "minimal" isn't quite the correct word
97     /// here.)
98     ///
99     /// This is the position at which every search begins. Namely, search
100     /// starts by scanning text to the right of this position, and only if
101     /// there's a match does the text to the left of this position get scanned.
102     critical_pos: usize,
103     /// The amount we shift by in the Two-Way search algorithm. This
104     /// corresponds to the "small period" and "large period" cases.
105     shift: Shift,
106 }
107 
108 impl Finder {
109     /// Create a searcher that finds occurrences of the given `needle`.
110     ///
111     /// An empty `needle` results in a match at every position in a haystack,
112     /// including at `haystack.len()`.
113     #[inline]
new(needle: &[u8]) -> Finder114     pub fn new(needle: &[u8]) -> Finder {
115         let byteset = ApproximateByteSet::new(needle);
116         let min_suffix = Suffix::forward(needle, SuffixKind::Minimal);
117         let max_suffix = Suffix::forward(needle, SuffixKind::Maximal);
118         let (period_lower_bound, critical_pos) =
119             if min_suffix.pos > max_suffix.pos {
120                 (min_suffix.period, min_suffix.pos)
121             } else {
122                 (max_suffix.period, max_suffix.pos)
123             };
124         let shift = Shift::forward(needle, period_lower_bound, critical_pos);
125         Finder(TwoWay { byteset, critical_pos, shift })
126     }
127 
128     /// Returns the first occurrence of `needle` in the given `haystack`, or
129     /// `None` if no such occurrence could be found.
130     ///
131     /// The `needle` given must be the same as the `needle` provided to
132     /// [`Finder::new`].
133     ///
134     /// An empty `needle` results in a match at every position in a haystack,
135     /// including at `haystack.len()`.
136     #[inline]
find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize>137     pub fn find(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
138         self.find_with_prefilter(None, haystack, needle)
139     }
140 
141     /// This is like [`Finder::find`], but it accepts a prefilter for
142     /// accelerating searches.
143     ///
144     /// Currently this is not exposed in the public API because, at the time
145     /// of writing, I didn't want to spend time thinking about how to expose
146     /// the prefilter infrastructure (if at all). If you have a compelling use
147     /// case for exposing this routine, please create an issue. Do *not* open
148     /// a PR that just exposes `Pre` and friends. Exporting this routine will
149     /// require API design.
150     #[inline(always)]
find_with_prefilter( &self, pre: Option<Pre<'_>>, haystack: &[u8], needle: &[u8], ) -> Option<usize>151     pub(crate) fn find_with_prefilter(
152         &self,
153         pre: Option<Pre<'_>>,
154         haystack: &[u8],
155         needle: &[u8],
156     ) -> Option<usize> {
157         match self.0.shift {
158             Shift::Small { period } => {
159                 self.find_small_imp(pre, haystack, needle, period)
160             }
161             Shift::Large { shift } => {
162                 self.find_large_imp(pre, haystack, needle, shift)
163             }
164         }
165     }
166 
167     // Each of the two search implementations below can be accelerated by a
168     // prefilter, but it is not always enabled. To avoid its overhead when
169     // its disabled, we explicitly inline each search implementation based on
170     // whether a prefilter will be used or not. The decision on which to use
171     // is made in the parent meta searcher.
172 
173     #[inline(always)]
find_small_imp( &self, mut pre: Option<Pre<'_>>, haystack: &[u8], needle: &[u8], period: usize, ) -> Option<usize>174     fn find_small_imp(
175         &self,
176         mut pre: Option<Pre<'_>>,
177         haystack: &[u8],
178         needle: &[u8],
179         period: usize,
180     ) -> Option<usize> {
181         let mut pos = 0;
182         let mut shift = 0;
183         let last_byte_pos = match needle.len().checked_sub(1) {
184             None => return Some(pos),
185             Some(last_byte) => last_byte,
186         };
187         while pos + needle.len() <= haystack.len() {
188             let mut i = cmp::max(self.0.critical_pos, shift);
189             if let Some(pre) = pre.as_mut() {
190                 if pre.is_effective() {
191                     pos += pre.find(&haystack[pos..])?;
192                     shift = 0;
193                     i = self.0.critical_pos;
194                     if pos + needle.len() > haystack.len() {
195                         return None;
196                     }
197                 }
198             }
199             if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
200                 pos += needle.len();
201                 shift = 0;
202                 continue;
203             }
204             while i < needle.len() && needle[i] == haystack[pos + i] {
205                 i += 1;
206             }
207             if i < needle.len() {
208                 pos += i - self.0.critical_pos + 1;
209                 shift = 0;
210             } else {
211                 let mut j = self.0.critical_pos;
212                 while j > shift && needle[j] == haystack[pos + j] {
213                     j -= 1;
214                 }
215                 if j <= shift && needle[shift] == haystack[pos + shift] {
216                     return Some(pos);
217                 }
218                 pos += period;
219                 shift = needle.len() - period;
220             }
221         }
222         None
223     }
224 
225     #[inline(always)]
find_large_imp( &self, mut pre: Option<Pre<'_>>, haystack: &[u8], needle: &[u8], shift: usize, ) -> Option<usize>226     fn find_large_imp(
227         &self,
228         mut pre: Option<Pre<'_>>,
229         haystack: &[u8],
230         needle: &[u8],
231         shift: usize,
232     ) -> Option<usize> {
233         let mut pos = 0;
234         let last_byte_pos = match needle.len().checked_sub(1) {
235             None => return Some(pos),
236             Some(last_byte) => last_byte,
237         };
238         'outer: while pos + needle.len() <= haystack.len() {
239             if let Some(pre) = pre.as_mut() {
240                 if pre.is_effective() {
241                     pos += pre.find(&haystack[pos..])?;
242                     if pos + needle.len() > haystack.len() {
243                         return None;
244                     }
245                 }
246             }
247 
248             if !self.0.byteset.contains(haystack[pos + last_byte_pos]) {
249                 pos += needle.len();
250                 continue;
251             }
252             let mut i = self.0.critical_pos;
253             while i < needle.len() && needle[i] == haystack[pos + i] {
254                 i += 1;
255             }
256             if i < needle.len() {
257                 pos += i - self.0.critical_pos + 1;
258             } else {
259                 for j in (0..self.0.critical_pos).rev() {
260                     if needle[j] != haystack[pos + j] {
261                         pos += shift;
262                         continue 'outer;
263                     }
264                 }
265                 return Some(pos);
266             }
267         }
268         None
269     }
270 }
271 
272 impl FinderRev {
273     /// Create a searcher that finds occurrences of the given `needle`.
274     ///
275     /// An empty `needle` results in a match at every position in a haystack,
276     /// including at `haystack.len()`.
277     #[inline]
new(needle: &[u8]) -> FinderRev278     pub fn new(needle: &[u8]) -> FinderRev {
279         let byteset = ApproximateByteSet::new(needle);
280         let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal);
281         let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal);
282         let (period_lower_bound, critical_pos) =
283             if min_suffix.pos < max_suffix.pos {
284                 (min_suffix.period, min_suffix.pos)
285             } else {
286                 (max_suffix.period, max_suffix.pos)
287             };
288         let shift = Shift::reverse(needle, period_lower_bound, critical_pos);
289         FinderRev(TwoWay { byteset, critical_pos, shift })
290     }
291 
292     /// Returns the last occurrence of `needle` in the given `haystack`, or
293     /// `None` if no such occurrence could be found.
294     ///
295     /// The `needle` given must be the same as the `needle` provided to
296     /// [`FinderRev::new`].
297     ///
298     /// An empty `needle` results in a match at every position in a haystack,
299     /// including at `haystack.len()`.
300     #[inline]
rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize>301     pub fn rfind(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
302         // For the reverse case, we don't use a prefilter. It's plausible that
303         // perhaps we should, but it's a lot of additional code to do it, and
304         // it's not clear that it's actually worth it. If you have a really
305         // compelling use case for this, please file an issue.
306         match self.0.shift {
307             Shift::Small { period } => {
308                 self.rfind_small_imp(haystack, needle, period)
309             }
310             Shift::Large { shift } => {
311                 self.rfind_large_imp(haystack, needle, shift)
312             }
313         }
314     }
315 
316     #[inline(always)]
rfind_small_imp( &self, haystack: &[u8], needle: &[u8], period: usize, ) -> Option<usize>317     fn rfind_small_imp(
318         &self,
319         haystack: &[u8],
320         needle: &[u8],
321         period: usize,
322     ) -> Option<usize> {
323         let nlen = needle.len();
324         let mut pos = haystack.len();
325         let mut shift = nlen;
326         let first_byte = match needle.get(0) {
327             None => return Some(pos),
328             Some(&first_byte) => first_byte,
329         };
330         while pos >= nlen {
331             if !self.0.byteset.contains(haystack[pos - nlen]) {
332                 pos -= nlen;
333                 shift = nlen;
334                 continue;
335             }
336             let mut i = cmp::min(self.0.critical_pos, shift);
337             while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
338                 i -= 1;
339             }
340             if i > 0 || first_byte != haystack[pos - nlen] {
341                 pos -= self.0.critical_pos - i + 1;
342                 shift = nlen;
343             } else {
344                 let mut j = self.0.critical_pos;
345                 while j < shift && needle[j] == haystack[pos - nlen + j] {
346                     j += 1;
347                 }
348                 if j >= shift {
349                     return Some(pos - nlen);
350                 }
351                 pos -= period;
352                 shift = period;
353             }
354         }
355         None
356     }
357 
358     #[inline(always)]
rfind_large_imp( &self, haystack: &[u8], needle: &[u8], shift: usize, ) -> Option<usize>359     fn rfind_large_imp(
360         &self,
361         haystack: &[u8],
362         needle: &[u8],
363         shift: usize,
364     ) -> Option<usize> {
365         let nlen = needle.len();
366         let mut pos = haystack.len();
367         let first_byte = match needle.get(0) {
368             None => return Some(pos),
369             Some(&first_byte) => first_byte,
370         };
371         while pos >= nlen {
372             if !self.0.byteset.contains(haystack[pos - nlen]) {
373                 pos -= nlen;
374                 continue;
375             }
376             let mut i = self.0.critical_pos;
377             while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] {
378                 i -= 1;
379             }
380             if i > 0 || first_byte != haystack[pos - nlen] {
381                 pos -= self.0.critical_pos - i + 1;
382             } else {
383                 let mut j = self.0.critical_pos;
384                 while j < nlen && needle[j] == haystack[pos - nlen + j] {
385                     j += 1;
386                 }
387                 if j == nlen {
388                     return Some(pos - nlen);
389                 }
390                 pos -= shift;
391             }
392         }
393         None
394     }
395 }
396 
397 /// A representation of the amount we're allowed to shift by during Two-Way
398 /// search.
399 ///
400 /// When computing a critical factorization of the needle, we find the position
401 /// of the critical factorization by finding the needle's maximal (or minimal)
402 /// suffix, along with the period of that suffix. It turns out that the period
403 /// of that suffix is a lower bound on the period of the needle itself.
404 ///
405 /// This lower bound is equivalent to the actual period of the needle in
406 /// some cases. To describe that case, we denote the needle as `x` where
407 /// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower
408 /// bound given here is always the period of `v`, which is `<= period(x)`. The
409 /// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and
410 /// where `u` is a suffix of `v[0..period(v)]`.
411 ///
412 /// This case is important because the search algorithm for when the
413 /// periods are equivalent is slightly different than the search algorithm
414 /// for when the periods are not equivalent. In particular, when they aren't
415 /// equivalent, we know that the period of the needle is no less than half its
416 /// length. In this case, we shift by an amount less than or equal to the
417 /// period of the needle (determined by the maximum length of the components
418 /// of the critical factorization of `x`, i.e., `max(len(u), len(v))`)..
419 ///
420 /// The above two cases are represented by the variants below. Each entails
421 /// a different instantiation of the Two-Way search algorithm.
422 ///
423 /// N.B. If we could find a way to compute the exact period in all cases,
424 /// then we could collapse this case analysis and simplify the algorithm. The
425 /// Two-Way paper suggests this is possible, but more reading is required to
426 /// grok why the authors didn't pursue that path.
427 #[derive(Clone, Copy, Debug)]
428 enum Shift {
429     Small { period: usize },
430     Large { shift: usize },
431 }
432 
433 impl Shift {
434     /// Compute the shift for a given needle in the forward direction.
435     ///
436     /// This requires a lower bound on the period and a critical position.
437     /// These can be computed by extracting both the minimal and maximal
438     /// lexicographic suffixes, and choosing the right-most starting position.
439     /// The lower bound on the period is then the period of the chosen suffix.
forward( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift440     fn forward(
441         needle: &[u8],
442         period_lower_bound: usize,
443         critical_pos: usize,
444     ) -> Shift {
445         let large = cmp::max(critical_pos, needle.len() - critical_pos);
446         if critical_pos * 2 >= needle.len() {
447             return Shift::Large { shift: large };
448         }
449 
450         let (u, v) = needle.split_at(critical_pos);
451         if !is_suffix(&v[..period_lower_bound], u) {
452             return Shift::Large { shift: large };
453         }
454         Shift::Small { period: period_lower_bound }
455     }
456 
457     /// Compute the shift for a given needle in the reverse direction.
458     ///
459     /// This requires a lower bound on the period and a critical position.
460     /// These can be computed by extracting both the minimal and maximal
461     /// lexicographic suffixes, and choosing the left-most starting position.
462     /// The lower bound on the period is then the period of the chosen suffix.
reverse( needle: &[u8], period_lower_bound: usize, critical_pos: usize, ) -> Shift463     fn reverse(
464         needle: &[u8],
465         period_lower_bound: usize,
466         critical_pos: usize,
467     ) -> Shift {
468         let large = cmp::max(critical_pos, needle.len() - critical_pos);
469         if (needle.len() - critical_pos) * 2 >= needle.len() {
470             return Shift::Large { shift: large };
471         }
472 
473         let (v, u) = needle.split_at(critical_pos);
474         if !is_prefix(&v[v.len() - period_lower_bound..], u) {
475             return Shift::Large { shift: large };
476         }
477         Shift::Small { period: period_lower_bound }
478     }
479 }
480 
481 /// A suffix extracted from a needle along with its period.
482 #[derive(Debug)]
483 struct Suffix {
484     /// The starting position of this suffix.
485     ///
486     /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this
487     /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for
488     /// forward suffixes, this is an inclusive starting position, where as for
489     /// reverse suffixes, this is an exclusive ending position.
490     pos: usize,
491     /// The period of this suffix.
492     ///
493     /// Note that this is NOT necessarily the period of the string from which
494     /// this suffix comes from. (It is always less than or equal to the period
495     /// of the original string.)
496     period: usize,
497 }
498 
499 impl Suffix {
forward(needle: &[u8], kind: SuffixKind) -> Suffix500     fn forward(needle: &[u8], kind: SuffixKind) -> Suffix {
501         // suffix represents our maximal (or minimal) suffix, along with
502         // its period.
503         let mut suffix = Suffix { pos: 0, period: 1 };
504         // The start of a suffix in `needle` that we are considering as a
505         // more maximal (or minimal) suffix than what's in `suffix`.
506         let mut candidate_start = 1;
507         // The current offset of our suffixes that we're comparing.
508         //
509         // When the characters at this offset are the same, then we mush on
510         // to the next position since no decision is possible. When the
511         // candidate's character is greater (or lesser) than the corresponding
512         // character than our current maximal (or minimal) suffix, then the
513         // current suffix is changed over to the candidate and we restart our
514         // search. Otherwise, the candidate suffix is no good and we restart
515         // our search on the next candidate.
516         //
517         // The three cases above correspond to the three cases in the loop
518         // below.
519         let mut offset = 0;
520 
521         while candidate_start + offset < needle.len() {
522             let current = needle[suffix.pos + offset];
523             let candidate = needle[candidate_start + offset];
524             match kind.cmp(current, candidate) {
525                 SuffixOrdering::Accept => {
526                     suffix = Suffix { pos: candidate_start, period: 1 };
527                     candidate_start += 1;
528                     offset = 0;
529                 }
530                 SuffixOrdering::Skip => {
531                     candidate_start += offset + 1;
532                     offset = 0;
533                     suffix.period = candidate_start - suffix.pos;
534                 }
535                 SuffixOrdering::Push => {
536                     if offset + 1 == suffix.period {
537                         candidate_start += suffix.period;
538                         offset = 0;
539                     } else {
540                         offset += 1;
541                     }
542                 }
543             }
544         }
545         suffix
546     }
547 
reverse(needle: &[u8], kind: SuffixKind) -> Suffix548     fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix {
549         // See the comments in `forward` for how this works.
550         let mut suffix = Suffix { pos: needle.len(), period: 1 };
551         if needle.len() == 1 {
552             return suffix;
553         }
554         let mut candidate_start = match needle.len().checked_sub(1) {
555             None => return suffix,
556             Some(candidate_start) => candidate_start,
557         };
558         let mut offset = 0;
559 
560         while offset < candidate_start {
561             let current = needle[suffix.pos - offset - 1];
562             let candidate = needle[candidate_start - offset - 1];
563             match kind.cmp(current, candidate) {
564                 SuffixOrdering::Accept => {
565                     suffix = Suffix { pos: candidate_start, period: 1 };
566                     candidate_start -= 1;
567                     offset = 0;
568                 }
569                 SuffixOrdering::Skip => {
570                     candidate_start -= offset + 1;
571                     offset = 0;
572                     suffix.period = suffix.pos - candidate_start;
573                 }
574                 SuffixOrdering::Push => {
575                     if offset + 1 == suffix.period {
576                         candidate_start -= suffix.period;
577                         offset = 0;
578                     } else {
579                         offset += 1;
580                     }
581                 }
582             }
583         }
584         suffix
585     }
586 }
587 
588 /// The kind of suffix to extract.
589 #[derive(Clone, Copy, Debug)]
590 enum SuffixKind {
591     /// Extract the smallest lexicographic suffix from a string.
592     ///
593     /// Technically, this doesn't actually pick the smallest lexicographic
594     /// suffix. e.g., Given the choice between `a` and `aa`, this will choose
595     /// the latter over the former, even though `a < aa`. The reasoning for
596     /// this isn't clear from the paper, but it still smells like a minimal
597     /// suffix.
598     Minimal,
599     /// Extract the largest lexicographic suffix from a string.
600     ///
601     /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given
602     /// the choice between `z` and `zz`, this will choose the latter over the
603     /// former.
604     Maximal,
605 }
606 
607 /// The result of comparing corresponding bytes between two suffixes.
608 #[derive(Clone, Copy, Debug)]
609 enum SuffixOrdering {
610     /// This occurs when the given candidate byte indicates that the candidate
611     /// suffix is better than the current maximal (or minimal) suffix. That is,
612     /// the current candidate suffix should supplant the current maximal (or
613     /// minimal) suffix.
614     Accept,
615     /// This occurs when the given candidate byte excludes the candidate suffix
616     /// from being better than the current maximal (or minimal) suffix. That
617     /// is, the current candidate suffix should be dropped and the next one
618     /// should be considered.
619     Skip,
620     /// This occurs when no decision to accept or skip the candidate suffix
621     /// can be made, e.g., when corresponding bytes are equivalent. In this
622     /// case, the next corresponding bytes should be compared.
623     Push,
624 }
625 
626 impl SuffixKind {
627     /// Returns true if and only if the given candidate byte indicates that
628     /// it should replace the current suffix as the maximal (or minimal)
629     /// suffix.
cmp(self, current: u8, candidate: u8) -> SuffixOrdering630     fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering {
631         use self::SuffixOrdering::*;
632 
633         match self {
634             SuffixKind::Minimal if candidate < current => Accept,
635             SuffixKind::Minimal if candidate > current => Skip,
636             SuffixKind::Minimal => Push,
637             SuffixKind::Maximal if candidate > current => Accept,
638             SuffixKind::Maximal if candidate < current => Skip,
639             SuffixKind::Maximal => Push,
640         }
641     }
642 }
643 
644 /// A bitset used to track whether a particular byte exists in a needle or not.
645 ///
646 /// Namely, bit 'i' is set if and only if byte%64==i for any byte in the
647 /// needle. If a particular byte in the haystack is NOT in this set, then one
648 /// can conclude that it is also not in the needle, and thus, one can advance
649 /// in the haystack by needle.len() bytes.
650 #[derive(Clone, Copy, Debug)]
651 struct ApproximateByteSet(u64);
652 
653 impl ApproximateByteSet {
654     /// Create a new set from the given needle.
new(needle: &[u8]) -> ApproximateByteSet655     fn new(needle: &[u8]) -> ApproximateByteSet {
656         let mut bits = 0;
657         for &b in needle {
658             bits |= 1 << (b % 64);
659         }
660         ApproximateByteSet(bits)
661     }
662 
663     /// Return true if and only if the given byte might be in this set. This
664     /// may return a false positive, but will never return a false negative.
665     #[inline(always)]
contains(&self, byte: u8) -> bool666     fn contains(&self, byte: u8) -> bool {
667         self.0 & (1 << (byte % 64)) != 0
668     }
669 }
670 
671 #[cfg(test)]
672 mod tests {
673     use alloc::vec::Vec;
674 
675     use super::*;
676 
677     /// Convenience wrapper for computing the suffix as a byte string.
get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)678     fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
679         let s = Suffix::forward(needle, kind);
680         (&needle[s.pos..], s.period)
681     }
682 
683     /// Convenience wrapper for computing the reverse suffix as a byte string.
get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize)684     fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) {
685         let s = Suffix::reverse(needle, kind);
686         (&needle[..s.pos], s.period)
687     }
688 
689     /// Return all of the non-empty suffixes in the given byte string.
suffixes(bytes: &[u8]) -> Vec<&[u8]>690     fn suffixes(bytes: &[u8]) -> Vec<&[u8]> {
691         (0..bytes.len()).map(|i| &bytes[i..]).collect()
692     }
693 
694     /// Return the lexicographically maximal suffix of the given byte string.
naive_maximal_suffix_forward(needle: &[u8]) -> &[u8]695     fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] {
696         let mut sufs = suffixes(needle);
697         sufs.sort();
698         sufs.pop().unwrap()
699     }
700 
701     /// Return the lexicographically maximal suffix of the reverse of the given
702     /// byte string.
naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8>703     fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec<u8> {
704         let mut reversed = needle.to_vec();
705         reversed.reverse();
706         let mut got = naive_maximal_suffix_forward(&reversed).to_vec();
707         got.reverse();
708         got
709     }
710 
711     define_substring_forward_quickcheck!(|h, n| Some(
712         Finder::new(n).find(h, n)
713     ));
714     define_substring_reverse_quickcheck!(|h, n| Some(
715         FinderRev::new(n).rfind(h, n)
716     ));
717 
718     #[test]
forward()719     fn forward() {
720         crate::tests::substring::Runner::new()
721             .fwd(|h, n| Some(Finder::new(n).find(h, n)))
722             .run();
723     }
724 
725     #[test]
reverse()726     fn reverse() {
727         crate::tests::substring::Runner::new()
728             .rev(|h, n| Some(FinderRev::new(n).rfind(h, n)))
729             .run();
730     }
731 
732     #[test]
suffix_forward()733     fn suffix_forward() {
734         macro_rules! assert_suffix_min {
735             ($given:expr, $expected:expr, $period:expr) => {
736                 let (got_suffix, got_period) =
737                     get_suffix_forward($given.as_bytes(), SuffixKind::Minimal);
738                 let got_suffix = core::str::from_utf8(got_suffix).unwrap();
739                 assert_eq!(($expected, $period), (got_suffix, got_period));
740             };
741         }
742 
743         macro_rules! assert_suffix_max {
744             ($given:expr, $expected:expr, $period:expr) => {
745                 let (got_suffix, got_period) =
746                     get_suffix_forward($given.as_bytes(), SuffixKind::Maximal);
747                 let got_suffix = core::str::from_utf8(got_suffix).unwrap();
748                 assert_eq!(($expected, $period), (got_suffix, got_period));
749             };
750         }
751 
752         assert_suffix_min!("a", "a", 1);
753         assert_suffix_max!("a", "a", 1);
754 
755         assert_suffix_min!("ab", "ab", 2);
756         assert_suffix_max!("ab", "b", 1);
757 
758         assert_suffix_min!("ba", "a", 1);
759         assert_suffix_max!("ba", "ba", 2);
760 
761         assert_suffix_min!("abc", "abc", 3);
762         assert_suffix_max!("abc", "c", 1);
763 
764         assert_suffix_min!("acb", "acb", 3);
765         assert_suffix_max!("acb", "cb", 2);
766 
767         assert_suffix_min!("cba", "a", 1);
768         assert_suffix_max!("cba", "cba", 3);
769 
770         assert_suffix_min!("abcabc", "abcabc", 3);
771         assert_suffix_max!("abcabc", "cabc", 3);
772 
773         assert_suffix_min!("abcabcabc", "abcabcabc", 3);
774         assert_suffix_max!("abcabcabc", "cabcabc", 3);
775 
776         assert_suffix_min!("abczz", "abczz", 5);
777         assert_suffix_max!("abczz", "zz", 1);
778 
779         assert_suffix_min!("zzabc", "abc", 3);
780         assert_suffix_max!("zzabc", "zzabc", 5);
781 
782         assert_suffix_min!("aaa", "aaa", 1);
783         assert_suffix_max!("aaa", "aaa", 1);
784 
785         assert_suffix_min!("foobar", "ar", 2);
786         assert_suffix_max!("foobar", "r", 1);
787     }
788 
789     #[test]
suffix_reverse()790     fn suffix_reverse() {
791         macro_rules! assert_suffix_min {
792             ($given:expr, $expected:expr, $period:expr) => {
793                 let (got_suffix, got_period) =
794                     get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal);
795                 let got_suffix = core::str::from_utf8(got_suffix).unwrap();
796                 assert_eq!(($expected, $period), (got_suffix, got_period));
797             };
798         }
799 
800         macro_rules! assert_suffix_max {
801             ($given:expr, $expected:expr, $period:expr) => {
802                 let (got_suffix, got_period) =
803                     get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal);
804                 let got_suffix = core::str::from_utf8(got_suffix).unwrap();
805                 assert_eq!(($expected, $period), (got_suffix, got_period));
806             };
807         }
808 
809         assert_suffix_min!("a", "a", 1);
810         assert_suffix_max!("a", "a", 1);
811 
812         assert_suffix_min!("ab", "a", 1);
813         assert_suffix_max!("ab", "ab", 2);
814 
815         assert_suffix_min!("ba", "ba", 2);
816         assert_suffix_max!("ba", "b", 1);
817 
818         assert_suffix_min!("abc", "a", 1);
819         assert_suffix_max!("abc", "abc", 3);
820 
821         assert_suffix_min!("acb", "a", 1);
822         assert_suffix_max!("acb", "ac", 2);
823 
824         assert_suffix_min!("cba", "cba", 3);
825         assert_suffix_max!("cba", "c", 1);
826 
827         assert_suffix_min!("abcabc", "abca", 3);
828         assert_suffix_max!("abcabc", "abcabc", 3);
829 
830         assert_suffix_min!("abcabcabc", "abcabca", 3);
831         assert_suffix_max!("abcabcabc", "abcabcabc", 3);
832 
833         assert_suffix_min!("abczz", "a", 1);
834         assert_suffix_max!("abczz", "abczz", 5);
835 
836         assert_suffix_min!("zzabc", "zza", 3);
837         assert_suffix_max!("zzabc", "zz", 1);
838 
839         assert_suffix_min!("aaa", "aaa", 1);
840         assert_suffix_max!("aaa", "aaa", 1);
841     }
842 
843     #[cfg(not(miri))]
844     quickcheck::quickcheck! {
845         fn qc_suffix_forward_maximal(bytes: Vec<u8>) -> bool {
846             if bytes.is_empty() {
847                 return true;
848             }
849 
850             let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal);
851             let expected = naive_maximal_suffix_forward(&bytes);
852             got == expected
853         }
854 
855         fn qc_suffix_reverse_maximal(bytes: Vec<u8>) -> bool {
856             if bytes.is_empty() {
857                 return true;
858             }
859 
860             let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal);
861             let expected = naive_maximal_suffix_reverse(&bytes);
862             expected == got
863         }
864     }
865 
866     // This is a regression test caught by quickcheck that exercised a bug in
867     // the reverse small period handling. The bug was that we were using 'if j
868     // == shift' to determine if a match occurred, but the correct guard is 'if
869     // j >= shift', which matches the corresponding guard in the forward impl.
870     #[test]
regression_rev_small_period()871     fn regression_rev_small_period() {
872         let rfind = |h, n| FinderRev::new(n).rfind(h, n);
873         let haystack = "ababaz";
874         let needle = "abab";
875         assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes()));
876     }
877 }
878