1 /*!
2 Provides a noncontiguous NFA implementation of Aho-Corasick.
3 
4 This is a low-level API that generally only needs to be used in niche
5 circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6 instead of a noncontiguous NFA directly. Using an `NFA` directly is typically
7 only necessary when one needs access to the [`Automaton`] trait implementation.
8 */
9 
10 use alloc::{
11     collections::{BTreeSet, VecDeque},
12     vec,
13     vec::Vec,
14 };
15 
16 use crate::{
17     automaton::Automaton,
18     util::{
19         alphabet::{ByteClassSet, ByteClasses},
20         error::{BuildError, MatchError},
21         prefilter::{self, opposite_ascii_case, Prefilter},
22         primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
23         remapper::Remapper,
24         search::{Anchored, MatchKind},
25         special::Special,
26     },
27 };
28 
29 /// A noncontiguous NFA implementation of Aho-Corasick.
30 ///
31 /// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
32 /// this type directly. Using an `NFA` directly is typically only necessary
33 /// when one needs access to the [`Automaton`] trait implementation.
34 ///
35 /// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
37 /// filling in the failure transitions between states, similar to what is
38 /// described in any standard textbook description of Aho-Corasick.
39 ///
40 /// In order to minimize heap usage and to avoid additional construction costs,
41 /// this implementation represents the transitions of all states as distinct
42 /// sparse memory allocations. This is where it gets its name from. That is,
43 /// this NFA has no contiguous memory allocation for its transition table. Each
44 /// state gets its own allocation.
45 ///
46 /// While the sparse representation keeps memory usage to somewhat reasonable
47 /// levels, it is still quite large and also results in somewhat mediocre
48 /// search performance. For this reason, it is almost always a good idea to
49 /// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
50 /// marginally slower to build, but has higher throughput and can sometimes use
51 /// an order of magnitude less memory. The main reason to use a noncontiguous
52 /// NFA is when you need the fastest possible construction time, or when a
/// contiguous NFA does not have the desired capacity. (The total number of NFA
/// states it can have is smaller than what a noncontiguous NFA can have.)
55 ///
56 /// # Example
57 ///
58 /// This example shows how to build an `NFA` directly and use it to execute
59 /// [`Automaton::try_find`]:
60 ///
61 /// ```
62 /// use aho_corasick::{
63 ///     automaton::Automaton,
64 ///     nfa::noncontiguous::NFA,
65 ///     Input, Match,
66 /// };
67 ///
68 /// let patterns = &["b", "abc", "abcd"];
69 /// let haystack = "abcd";
70 ///
71 /// let nfa = NFA::new(patterns).unwrap();
72 /// assert_eq!(
73 ///     Some(Match::must(0, 1..2)),
74 ///     nfa.try_find(&Input::new(haystack))?,
75 /// );
76 /// # Ok::<(), Box<dyn std::error::Error>>(())
77 /// ```
78 ///
79 /// It is also possible to implement your own version of `try_find`. See the
80 /// [`Automaton`] documentation for an example.
81 #[derive(Clone)]
82 pub struct NFA {
83     /// The match semantics built into this NFA.
84     match_kind: MatchKind,
85     /// A set of states. Each state defines its own transitions, a fail
86     /// transition and a set of indices corresponding to matches.
87     ///
88     /// The first state is always the fail state, which is used only as a
89     /// sentinel. Namely, in the final NFA, no transition into the fail state
90     /// exists. (Well, they do, but they aren't followed. Instead, the state's
91     /// failure transition is followed.)
92     ///
93     /// The second state (index 1) is always the dead state. Dead states are
94     /// in every automaton, but only used when leftmost-{first,longest} match
95     /// semantics are enabled. Specifically, they instruct search to stop
96     /// at specific points in order to report the correct match location. In
97     /// the standard Aho-Corasick construction, there are no transitions to
98     /// the dead state.
99     ///
100     /// The third state (index 2) is generally intended to be the starting or
101     /// "root" state.
102     states: Vec<State>,
103     /// Transitions stored in a sparse representation via a linked list.
104     ///
105     /// Each transition contains three pieces of information: the byte it
106     /// is defined for, the state it transitions to and a link to the next
107     /// transition in the same state (or `StateID::ZERO` if it is the last
108     /// transition).
109     ///
110     /// The first transition for each state is determined by `State::sparse`.
111     ///
112     /// Note that this contains a complete set of all transitions in this NFA,
113     /// including states that have a dense representation for transitions.
114     /// (Adding dense transitions for a state doesn't remove its sparse
115     /// transitions, since deleting transitions from this particular sparse
116     /// representation would be fairly expensive.)
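    ///
    /// As a sketch with illustrative indices, a state whose transitions are
    /// `b'a' -> S5` and `b'c' -> S9` might be laid out as:
    ///
    ///    sparse[3] = Transition { byte: b'a', next: S5, link: 4 }
    ///    sparse[4] = Transition { byte: b'c', next: S9, link: StateID::ZERO }
    ///
    /// where `State::sparse == 3` points at the head of the chain and a link
    /// of `StateID::ZERO` terminates it.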
117     sparse: Vec<Transition>,
118     /// Transitions stored in a dense representation.
119     ///
120     /// A state has a row in this table if and only if `State::dense` is
121     /// not equal to `StateID::ZERO`. When not zero, there are precisely
122     /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense`
123     /// in this table.
124     ///
125     /// Generally a very small minority of states have a dense representation
126     /// since it uses so much memory.
127     dense: Vec<StateID>,
128     /// Matches stored in linked list for each state.
129     ///
130     /// Like sparse transitions, each match has a link to the next match in the
131     /// state.
132     ///
133     /// The first match for each state is determined by `State::matches`.
134     matches: Vec<Match>,
135     /// The length, in bytes, of each pattern in this NFA. This slice is
136     /// indexed by `PatternID`.
137     ///
138     /// The number of entries in this vector corresponds to the total number of
139     /// patterns in this automaton.
140     pattern_lens: Vec<SmallIndex>,
141     /// A prefilter for quickly skipping to candidate matches, if pertinent.
142     prefilter: Option<Prefilter>,
143     /// A set of equivalence classes in terms of bytes. We compute this while
144     /// building the NFA, but don't use it in the NFA's states. Instead, we
145     /// use this for building the DFA. We store it on the NFA since it's easy
146     /// to compute while visiting the patterns.
147     byte_classes: ByteClasses,
148     /// The length, in bytes, of the shortest pattern in this automaton. This
149     /// information is useful for detecting whether an automaton matches the
150     /// empty string or not.
151     min_pattern_len: usize,
152     /// The length, in bytes, of the longest pattern in this automaton. This
153     /// information is useful for keeping correct buffer sizes when searching
154     /// on streams.
155     max_pattern_len: usize,
156     /// The information required to deduce which states are "special" in this
157     /// NFA.
158     ///
159     /// Since the DEAD and FAIL states are always the first two states and
160     /// there are only ever two start states (which follow all of the match
161     /// states), it follows that we can determine whether a state is a fail,
162     /// dead, match or start with just a few comparisons on the ID itself:
163     ///
164     ///    is_dead(sid): sid == NFA::DEAD
165     ///    is_fail(sid): sid == NFA::FAIL
166     ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
167     ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
168     ///
169     /// Note that this only applies to the NFA after it has been constructed.
170     /// During construction, the start states are the first ones added and the
171     /// match states are inter-leaved with non-match states. Once all of the
172     /// states have been added, the states are shuffled such that the above
173     /// predicates hold.
174     special: Special,
175 }
176 
177 impl NFA {
178     /// Create a new Aho-Corasick noncontiguous NFA using the default
179     /// configuration.
180     ///
181     /// Use a [`Builder`] if you want to change the configuration.
    pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
183     where
184         I: IntoIterator<Item = P>,
185         P: AsRef<[u8]>,
186     {
187         NFA::builder().build(patterns)
188     }
189 
190     /// A convenience method for returning a new Aho-Corasick noncontiguous NFA
191     /// builder.
192     ///
193     /// This usually permits one to just import the `NFA` type.
    pub fn builder() -> Builder {
195         Builder::new()
196     }
197 }
198 
199 impl NFA {
200     /// The DEAD state is a sentinel state like the FAIL state. The DEAD state
201     /// instructs any search to stop and return any currently recorded match,
202     /// or no match otherwise. Generally speaking, it is impossible for an
203     /// unanchored standard search to enter a DEAD state. But an anchored
    /// search can, and so too can a leftmost search.
205     ///
206     /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
208     /// states there are always 0 too. It's not that we need all of the
209     /// implementations to agree, but rather, the contiguous NFA and the DFA
210     /// use a sort of "premultiplied" state identifier where the only state
211     /// whose ID is always known and constant is the first state. Subsequent
212     /// state IDs depend on how much space has already been used in the
213     /// transition table.
214     pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
215     /// The FAIL state mostly just corresponds to the ID of any transition on a
216     /// state that isn't explicitly defined. When one transitions into the FAIL
217     /// state, one must follow the previous state's failure transition before
218     /// doing the next state lookup. In this way, FAIL is more of a sentinel
219     /// than a state that one actually transitions into. In particular, it is
220     /// never exposed in the `Automaton` interface.
221     pub(crate) const FAIL: StateID = StateID::new_unchecked(1);
222 
223     /// Returns the equivalence classes of bytes found while constructing
224     /// this NFA.
225     ///
226     /// Note that the NFA doesn't actually make use of these equivalence
227     /// classes. Instead, these are useful for building the DFA when desired.
    pub(crate) fn byte_classes(&self) -> &ByteClasses {
229         &self.byte_classes
230     }
231 
232     /// Returns a slice containing the length of each pattern in this searcher.
233     /// It is indexed by `PatternID` and has length `NFA::patterns_len`.
234     ///
235     /// This is exposed for convenience when building a contiguous NFA. But it
236     /// can be reconstructed from the `Automaton` API if necessary.
    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
238         &self.pattern_lens
239     }
240 
241     /// Returns a slice of all states in this non-contiguous NFA.
    pub(crate) fn states(&self) -> &[State] {
243         &self.states
244     }
245 
246     /// Returns the underlying "special" state information for this NFA.
    pub(crate) fn special(&self) -> &Special {
248         &self.special
249     }
250 
251     /// Swaps the states at `id1` and `id2`.
252     ///
253     /// This does not update the transitions of any state to account for the
254     /// state swap.
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
256         self.states.swap(id1.as_usize(), id2.as_usize());
257     }
258 
259     /// Re-maps all state IDs in this NFA according to the `map` function
260     /// given.
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
262         let alphabet_len = self.byte_classes.alphabet_len();
263         for state in self.states.iter_mut() {
264             state.fail = map(state.fail);
265             let mut link = state.sparse;
266             while link != StateID::ZERO {
267                 let t = &mut self.sparse[link];
268                 t.next = map(t.next);
269                 link = t.link;
270             }
271             if state.dense != StateID::ZERO {
272                 let start = state.dense.as_usize();
273                 for next in self.dense[start..][..alphabet_len].iter_mut() {
274                     *next = map(*next);
275                 }
276             }
277         }
278     }
279 
280     /// Iterate over all of the transitions for the given state ID.
    pub(crate) fn iter_trans(
282         &self,
283         sid: StateID,
284     ) -> impl Iterator<Item = Transition> + '_ {
285         let mut link = self.states[sid].sparse;
286         core::iter::from_fn(move || {
287             if link == StateID::ZERO {
288                 return None;
289             }
290             let t = self.sparse[link];
291             link = t.link;
292             Some(t)
293         })
294     }
295 
296     /// Iterate over all of the matches for the given state ID.
    pub(crate) fn iter_matches(
298         &self,
299         sid: StateID,
300     ) -> impl Iterator<Item = PatternID> + '_ {
301         let mut link = self.states[sid].matches;
302         core::iter::from_fn(move || {
303             if link == StateID::ZERO {
304                 return None;
305             }
306             let m = self.matches[link];
307             link = m.link;
308             Some(m.pid)
309         })
310     }
311 
312     /// Return the link following the one given. If the one given is the last
313     /// link for the given state, then return `None`.
314     ///
315     /// If no previous link is given, then this returns the first link in the
316     /// state, if one exists.
317     ///
318     /// This is useful for manually iterating over the transitions in a single
319     /// state without borrowing the NFA. This permits mutating other parts of
320     /// the NFA during iteration. Namely, one can access the transition pointed
321     /// to by the link via `self.sparse[link]`.
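    ///
    /// A rough sketch of the intended usage (variable names are illustrative):
    ///
    /// ```ignore
    /// let mut prev = None;
    /// while let Some(link) = self.next_link(sid, prev) {
    ///     let t = self.sparse[link];
    ///     // ... inspect `t`, possibly mutating other parts of `self` ...
    ///     prev = Some(link);
    /// }
    /// ```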
    fn next_link(
323         &self,
324         sid: StateID,
325         prev: Option<StateID>,
326     ) -> Option<StateID> {
327         let link =
328             prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
329         if link == StateID::ZERO {
330             None
331         } else {
332             Some(link)
333         }
334     }
335 
336     /// Follow the transition for the given byte in the given state. If no such
337     /// transition exists, then the FAIL state ID is returned.
338     #[inline(always)]
    fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
340         let s = &self.states[sid];
341         // This is a special case that targets starting states and states
342         // near a start state. Namely, after the initial trie is constructed,
343         // we look for states close to the start state to convert to a dense
344         // representation for their transitions. This winds up using a lot more
345         // memory per state in exchange for faster transition lookups. But
346         // since we only do this for a small number of states (by default), the
347         // memory usage is usually minimal.
348         //
349         // This has *massive* benefit when executing searches because the
350         // unanchored starting state is by far the hottest state and is
351         // frequently visited. Moreover, the 'for' loop below that works
352         // decently on an actually sparse state is disastrous on a state that
353         // is nearly or completely dense.
354         if s.dense == StateID::ZERO {
355             self.follow_transition_sparse(sid, byte)
356         } else {
357             let class = usize::from(self.byte_classes.get(byte));
358             self.dense[s.dense.as_usize() + class]
359         }
360     }
361 
362     /// Like `follow_transition`, but always uses the sparse representation.
363     #[inline(always)]
    fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
365         for t in self.iter_trans(sid) {
366             if byte <= t.byte {
367                 if byte == t.byte {
368                     return t.next;
369                 }
370                 break;
371             }
372         }
373         NFA::FAIL
374     }
375 
376     /// Set the transition for the given byte to the state ID given.
377     ///
378     /// Note that one should not set transitions to the FAIL state. It is not
379     /// technically incorrect, but it wastes space. If a transition is not
380     /// defined, then it is automatically assumed to lead to the FAIL state.
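    ///
    /// Transitions within a state are kept in a chain sorted by byte (the
    /// sparse lookup relies on this ordering). As a sketch with illustrative
    /// indices, adding a transition on `b'm'` to a state whose chain is
    /// currently `b'a' -> b'z'` splices a new link in between:
    ///
    ///    before: sparse[3] = (b'a', .., link=7) -> sparse[7] = (b'z', .., link=0)
    ///    after:  sparse[3] = (b'a', .., link=9) -> sparse[9] = (b'm', .., link=7)
    ///            -> sparse[7] = (b'z', .., link=0)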
    fn add_transition(
382         &mut self,
383         prev: StateID,
384         byte: u8,
385         next: StateID,
386     ) -> Result<(), BuildError> {
387         if self.states[prev].dense != StateID::ZERO {
388             let dense = self.states[prev].dense;
389             let class = usize::from(self.byte_classes.get(byte));
390             self.dense[dense.as_usize() + class] = next;
391         }
392 
393         let head = self.states[prev].sparse;
394         if head == StateID::ZERO || byte < self.sparse[head].byte {
395             let new_link = self.alloc_transition()?;
396             self.sparse[new_link] = Transition { byte, next, link: head };
397             self.states[prev].sparse = new_link;
398             return Ok(());
399         } else if byte == self.sparse[head].byte {
400             self.sparse[head].next = next;
401             return Ok(());
402         }
403 
404         // We handled the only cases where the beginning of the transition
405         // chain needs to change. At this point, we now know that there is
406         // at least one entry in the transition chain and the byte for that
407         // transition is less than the byte for the transition we're adding.
408         let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
409         while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
410         {
411             link_prev = link_next;
412             link_next = self.sparse[link_next].link;
413         }
414         if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
415             let link = self.alloc_transition()?;
416             self.sparse[link] = Transition { byte, next, link: link_next };
417             self.sparse[link_prev].link = link;
418         } else {
419             assert_eq!(byte, self.sparse[link_next].byte);
420             self.sparse[link_next].next = next;
421         }
422         Ok(())
423     }
424 
    /// This sets every possible transition (all 256 of them) for the given
    /// state to the given `next` value.
427     ///
428     /// This is useful for efficiently initializing start/dead states.
429     ///
430     /// # Panics
431     ///
432     /// This requires that the state has no transitions added to it already.
433     /// If it has any transitions, then this panics. It will also panic if
434     /// the state has been densified prior to calling this.
    fn init_full_state(
436         &mut self,
437         prev: StateID,
438         next: StateID,
439     ) -> Result<(), BuildError> {
440         assert_eq!(
441             StateID::ZERO,
442             self.states[prev].dense,
443             "state must not be dense yet"
444         );
445         assert_eq!(
446             StateID::ZERO,
447             self.states[prev].sparse,
448             "state must have zero transitions"
449         );
450         let mut prev_link = StateID::ZERO;
451         for byte in 0..=255 {
452             let new_link = self.alloc_transition()?;
453             self.sparse[new_link] =
454                 Transition { byte, next, link: StateID::ZERO };
455             if prev_link == StateID::ZERO {
456                 self.states[prev].sparse = new_link;
457             } else {
458                 self.sparse[prev_link].link = new_link;
459             }
460             prev_link = new_link;
461         }
462         Ok(())
463     }
464 
465     /// Add a match for the given pattern ID to the state for the given ID.
    fn add_match(
467         &mut self,
468         sid: StateID,
469         pid: PatternID,
470     ) -> Result<(), BuildError> {
471         let head = self.states[sid].matches;
472         let mut link = head;
473         while self.matches[link].link != StateID::ZERO {
474             link = self.matches[link].link;
475         }
476         let new_match_link = self.alloc_match()?;
477         self.matches[new_match_link].pid = pid;
478         if link == StateID::ZERO {
479             self.states[sid].matches = new_match_link;
480         } else {
481             self.matches[link].link = new_match_link;
482         }
483         Ok(())
484     }
485 
486     /// Copy matches from the `src` state to the `dst` state. This is useful
487     /// when a match state can be reached via a failure transition. In which
488     /// case, you'll want to copy the matches (if any) from the state reached
489     /// by the failure transition to the original state you were at.
    fn copy_matches(
491         &mut self,
492         src: StateID,
493         dst: StateID,
494     ) -> Result<(), BuildError> {
495         let head_dst = self.states[dst].matches;
496         let mut link_dst = head_dst;
497         while self.matches[link_dst].link != StateID::ZERO {
498             link_dst = self.matches[link_dst].link;
499         }
500         let mut link_src = self.states[src].matches;
501         while link_src != StateID::ZERO {
502             let new_match_link =
503                 StateID::new(self.matches.len()).map_err(|e| {
504                     BuildError::state_id_overflow(
505                         StateID::MAX.as_u64(),
506                         e.attempted(),
507                     )
508                 })?;
509             self.matches.push(Match {
510                 pid: self.matches[link_src].pid,
511                 link: StateID::ZERO,
512             });
513             if link_dst == StateID::ZERO {
514                 self.states[dst].matches = new_match_link;
515             } else {
516                 self.matches[link_dst].link = new_match_link;
517             }
518 
519             link_dst = new_match_link;
520             link_src = self.matches[link_src].link;
521         }
522         Ok(())
523     }
524 
    /// Create a new entry in `NFA::sparse`, if there's room, and return that
526     /// entry's ID. If there's no room, then an error is returned.
    fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
528         let id = StateID::new(self.sparse.len()).map_err(|e| {
529             BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
530         })?;
531         self.sparse.push(Transition::default());
532         Ok(id)
533     }
534 
535     /// Create a new entry in `NFA::matches`, if there's room, and return that
536     /// entry's ID. If there's no room, then an error is returned.
    fn alloc_match(&mut self) -> Result<StateID, BuildError> {
538         let id = StateID::new(self.matches.len()).map_err(|e| {
539             BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
540         })?;
541         self.matches.push(Match::default());
542         Ok(id)
543     }
544 
545     /// Create a new set of `N` transitions in this NFA's dense transition
    /// table. The ID returned corresponds to the index at which the `N`
547     /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is
548     /// the last.
549     ///
550     /// `N` is determined via `NFA::byte_classes::alphabet_len`.
    fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
552         let id = StateID::new(self.dense.len()).map_err(|e| {
553             BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
554         })?;
555         // We use FAIL because it's the correct default. If a state doesn't
556         // have a transition defined for every possible byte value, then the
557         // transition function should return NFA::FAIL.
558         self.dense.extend(
559             core::iter::repeat(NFA::FAIL)
560                 .take(self.byte_classes.alphabet_len()),
561         );
562         Ok(id)
563     }
564 
565     /// Allocate and add a fresh state to the underlying NFA and return its
566     /// ID (guaranteed to be one more than the ID of the previously allocated
567     /// state). If the ID would overflow `StateID`, then this returns an error.
    fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
569         // This is OK because we error when building the trie if we see a
570         // pattern whose length cannot fit into a 'SmallIndex', and the longest
571         // possible depth corresponds to the length of the longest pattern.
572         let depth = SmallIndex::new(depth)
573             .expect("patterns longer than SmallIndex::MAX are not allowed");
574         let id = StateID::new(self.states.len()).map_err(|e| {
575             BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
576         })?;
577         self.states.push(State {
578             sparse: StateID::ZERO,
579             dense: StateID::ZERO,
580             matches: StateID::ZERO,
581             fail: self.special.start_unanchored_id,
582             depth,
583         });
584         Ok(id)
585     }
586 }
587 
588 // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
589 // returns a valid state ID given a valid state ID. We otherwise claim that
590 // all other methods are correct as well.
591 unsafe impl Automaton for NFA {
592     #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
594         match anchored {
595             Anchored::No => Ok(self.special.start_unanchored_id),
596             Anchored::Yes => Ok(self.special.start_anchored_id),
597         }
598     }
599 
600     #[inline(always)]
    fn next_state(
602         &self,
603         anchored: Anchored,
604         mut sid: StateID,
605         byte: u8,
606     ) -> StateID {
607         // This terminates since:
608         //
609         // 1. state.fail never points to the FAIL state.
610         // 2. All state.fail values point to a state closer to the start state.
611         // 3. The start state has no transitions to the FAIL state.
612         loop {
613             let next = self.follow_transition(sid, byte);
614             if next != NFA::FAIL {
615                 return next;
616             }
617             // For an anchored search, we never follow failure transitions
618             // because failure transitions lead us down a path to matching
619             // a *proper* suffix of the path we were on. Thus, it can only
620             // produce matches that appear after the beginning of the search.
621             if anchored.is_anchored() {
622                 return NFA::DEAD;
623             }
624             sid = self.states[sid].fail();
625         }
626     }
627 
628     #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
630         sid <= self.special.max_special_id
631     }
632 
633     #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
635         sid == NFA::DEAD
636     }
637 
638     #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
640         // N.B. This returns true when sid==NFA::FAIL but that's okay because
641         // NFA::FAIL is not actually a valid state ID from the perspective of
642         // the Automaton trait. Namely, it is never returned by 'start_state'
643         // or by 'next_state'. So we don't need to care about it here.
644         !self.is_dead(sid) && sid <= self.special.max_match_id
645     }
646 
647     #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
649         sid == self.special.start_unanchored_id
650             || sid == self.special.start_anchored_id
651     }
652 
653     #[inline(always)]
    fn match_kind(&self) -> MatchKind {
655         self.match_kind
656     }
657 
658     #[inline(always)]
    fn patterns_len(&self) -> usize {
660         self.pattern_lens.len()
661     }
662 
663     #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
665         self.pattern_lens[pid].as_usize()
666     }
667 
668     #[inline(always)]
    fn min_pattern_len(&self) -> usize {
670         self.min_pattern_len
671     }
672 
673     #[inline(always)]
    fn max_pattern_len(&self) -> usize {
675         self.max_pattern_len
676     }
677 
678     #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
680         self.iter_matches(sid).count()
681     }
682 
683     #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
685         self.iter_matches(sid).nth(index).unwrap()
686     }
687 
688     #[inline(always)]
    fn memory_usage(&self) -> usize {
690         self.states.len() * core::mem::size_of::<State>()
691             + self.sparse.len() * core::mem::size_of::<Transition>()
692             + self.matches.len() * core::mem::size_of::<Match>()
693             + self.dense.len() * StateID::SIZE
694             + self.pattern_lens.len() * SmallIndex::SIZE
695             + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
696     }
697 
698     #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
700         self.prefilter.as_ref()
701     }
702 }
703 
704 /// A representation of a sparse NFA state for an Aho-Corasick automaton.
705 ///
706 /// It contains the transitions to the next state, a failure transition for
707 /// cases where there exists no other transition for the current input byte
708 /// and the matches implied by visiting this state (if any).
709 #[derive(Clone, Debug)]
710 pub(crate) struct State {
    /// A pointer to `NFA::sparse` corresponding to the head of a linked list
712     /// containing all of the transitions for this state.
713     ///
714     /// This is `StateID::ZERO` if and only if this state has zero transitions.
715     sparse: StateID,
716     /// A pointer to a row of `N` transitions in `NFA::dense`. These
717     /// transitions correspond precisely to what is obtained by traversing
718     /// `sparse`, but permits constant time lookup.
719     ///
720     /// When this is zero (which is true for most states in the default
721     /// configuration), then this state has no dense representation.
722     ///
723     /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is
724     /// typically much less than 256 (the maximum value).
725     dense: StateID,
726     /// A pointer to `NFA::matches` corresponding to the head of a linked list
727     /// containing all of the matches for this state.
728     ///
729     /// This is `StateID::ZERO` if and only if this state is not a match state.
730     matches: StateID,
731     /// The state that should be transitioned to if the current byte in the
732     /// haystack does not have a corresponding transition defined in this
733     /// state.
734     fail: StateID,
735     /// The depth of this state. Specifically, this is the distance from this
736     /// state to the starting state. (For the special sentinel states DEAD and
737     /// FAIL, their depth is always 0.) The depth of a starting state is 0.
738     ///
    /// Note that depth is used to decide which states get a dense
    /// representation for their transitions: states near the starting state
    /// are converted to the dense representation (see `dense` above), while
    /// all other states keep the sparse representation. The contiguous NFA
    /// uses depth for the same purpose. In any case, this is really the only
    /// convenient place to compute and store this information, which we need
    /// when building the contiguous NFA.
747     depth: SmallIndex,
748 }
749 
750 impl State {
751     /// Return true if and only if this state is a match state.
    pub(crate) fn is_match(&self) -> bool {
753         self.matches != StateID::ZERO
754     }
755 
756     /// Returns the failure transition for this state.
    pub(crate) fn fail(&self) -> StateID {
758         self.fail
759     }
760 
761     /// Returns the depth of this state. That is, the number of transitions
762     /// this state is from the start state of the NFA.
    pub(crate) fn depth(&self) -> SmallIndex {
764         self.depth
765     }
766 }
767 
768 /// A single transition in a non-contiguous NFA.
769 #[derive(Clone, Copy, Default)]
770 #[repr(packed)]
771 pub(crate) struct Transition {
772     byte: u8,
773     next: StateID,
774     link: StateID,
775 }
776 
777 impl Transition {
778     /// Return the byte for which this transition is defined.
    pub(crate) fn byte(&self) -> u8 {
780         self.byte
781     }
782 
783     /// Return the ID of the state that this transition points to.
    pub(crate) fn next(&self) -> StateID {
785         self.next
786     }
787 
788     /// Return the ID of the next transition.
    fn link(&self) -> StateID {
790         self.link
791     }
792 }
793 
794 impl core::fmt::Debug for Transition {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
796         write!(
797             f,
798             "Transition(byte: {:X?}, next: {:?}, link: {:?})",
799             self.byte,
800             self.next().as_usize(),
801             self.link().as_usize()
802         )
803     }
804 }
805 
806 /// A single match in a non-contiguous NFA.
807 #[derive(Clone, Copy, Default)]
808 struct Match {
809     pid: PatternID,
810     link: StateID,
811 }
812 
813 impl Match {
814     /// Return the pattern ID for this match.
    pub(crate) fn pattern(&self) -> PatternID {
816         self.pid
817     }
818 
819     /// Return the ID of the next match.
    fn link(&self) -> StateID {
821         self.link
822     }
823 }
824 
825 impl core::fmt::Debug for Match {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
827         write!(
828             f,
829             "Match(pid: {:?}, link: {:?})",
830             self.pattern().as_usize(),
831             self.link().as_usize()
832         )
833     }
834 }
835 
836 /// A builder for configuring an Aho-Corasick noncontiguous NFA.
837 ///
838 /// This builder has a subset of the options available to a
839 /// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
840 /// their behavior is identical.
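///
/// # Example
///
/// This is a sketch of one possible configuration: leftmost-longest match
/// semantics with ASCII case insensitivity. (The patterns and haystack here
/// are purely illustrative.)
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match, MatchKind,
/// };
///
/// let nfa = NFA::builder()
///     .match_kind(MatchKind::LeftmostLongest)
///     .ascii_case_insensitive(true)
///     .build(&["samwise", "sam"])
///     .unwrap();
/// assert_eq!(
///     Some(Match::must(0, 0..7)),
///     nfa.try_find(&Input::new("SAMWISE"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```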
841 #[derive(Clone, Debug)]
842 pub struct Builder {
843     match_kind: MatchKind,
844     prefilter: bool,
845     ascii_case_insensitive: bool,
846     dense_depth: usize,
847 }
848 
849 impl Default for Builder {
    fn default() -> Builder {
851         Builder {
852             match_kind: MatchKind::default(),
853             prefilter: true,
854             ascii_case_insensitive: false,
855             dense_depth: 3,
856         }
857     }
858 }
859 
860 impl Builder {
861     /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA.
    pub fn new() -> Builder {
863         Builder::default()
864     }
865 
866     /// Build an Aho-Corasick noncontiguous NFA from the given iterator of
867     /// patterns.
868     ///
869     /// A builder may be reused to create more NFAs.
    pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
871     where
872         I: IntoIterator<Item = P>,
873         P: AsRef<[u8]>,
874     {
875         debug!("building non-contiguous NFA");
876         let nfa = Compiler::new(self)?.compile(patterns)?;
877         debug!(
878             "non-contiguous NFA built, <states: {:?}, size: {:?}>",
879             nfa.states.len(),
880             nfa.memory_usage()
881         );
882         Ok(nfa)
883     }
884 
885     /// Set the desired match semantics.
886     ///
887     /// See
888     /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
889     /// for more documentation and examples.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
891         self.match_kind = kind;
892         self
893     }
894 
895     /// Enable ASCII-aware case insensitive matching.
896     ///
897     /// See
898     /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
899     /// for more documentation and examples.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
901         self.ascii_case_insensitive = yes;
902         self
903     }
904 
905     /// Set the limit on how many states use a dense representation for their
906     /// transitions. Other states will generally use a sparse representation.
907     ///
908     /// See
909     /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
910     /// for more documentation and examples.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
912         self.dense_depth = depth;
913         self
914     }
915 
916     /// Enable heuristic prefilter optimizations.
917     ///
918     /// See
919     /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
920     /// for more documentation and examples.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
922         self.prefilter = yes;
923         self
924     }
925 }
926 
927 /// A compiler uses a builder configuration and builds up the NFA formulation
928 /// of an Aho-Corasick automaton. This roughly corresponds to the standard
929 /// formulation described in textbooks, with some tweaks to support leftmost
930 /// searching.
931 #[derive(Debug)]
932 struct Compiler<'a> {
933     builder: &'a Builder,
934     prefilter: prefilter::Builder,
935     nfa: NFA,
936     byteset: ByteClassSet,
937 }
938 
939 impl<'a> Compiler<'a> {
    fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
941         let prefilter = prefilter::Builder::new(builder.match_kind)
942             .ascii_case_insensitive(builder.ascii_case_insensitive);
943         Ok(Compiler {
944             builder,
945             prefilter,
946             nfa: NFA {
947                 match_kind: builder.match_kind,
948                 states: vec![],
949                 sparse: vec![],
950                 dense: vec![],
951                 matches: vec![],
952                 pattern_lens: vec![],
953                 prefilter: None,
954                 byte_classes: ByteClasses::singletons(),
955                 min_pattern_len: usize::MAX,
956                 max_pattern_len: 0,
957                 special: Special::zero(),
958             },
959             byteset: ByteClassSet::empty(),
960         })
961     }
962 
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
964     where
965         I: IntoIterator<Item = P>,
966         P: AsRef<[u8]>,
967     {
968         // Add dummy transition/match links, so that no valid link will point
969         // to another link at index 0.
970         self.nfa.sparse.push(Transition::default());
971         self.nfa.matches.push(Match::default());
972         // Add a dummy dense transition so that no states can have dense==0
973         // represent a valid pointer to dense transitions. This permits
974         // dense==0 to be a sentinel indicating "no dense transitions."
975         self.nfa.dense.push(NFA::DEAD);
976         // the dead state, only used for leftmost and fixed to id==0
977         self.nfa.alloc_state(0)?;
978         // the fail state, which is never entered and fixed to id==1
979         self.nfa.alloc_state(0)?;
980         // unanchored start state, initially fixed to id==2 but later shuffled
981         // to appear after all non-start match states.
982         self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
983         // anchored start state, initially fixed to id==3 but later shuffled
984         // to appear after unanchored start state.
985         self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
986         // Initialize the unanchored starting state in order to make it dense,
987         // and thus make transition lookups on this state faster.
988         self.init_unanchored_start_state()?;
989         // Set all transitions on the DEAD state to point to itself. This way,
990         // the DEAD state can never be escaped. It MUST be used as a sentinel
991         // in any correct search.
992         self.add_dead_state_loop()?;
993         // Build the base trie from the given patterns.
994         self.build_trie(patterns)?;
995         self.nfa.states.shrink_to_fit();
996         // Turn our set of bytes into equivalent classes. This NFA
997         // implementation uses byte classes only for states that use a dense
998         // representation of transitions. (And that's why this comes before
999         // `self.densify()`, as the byte classes need to be set first.)
1000         self.nfa.byte_classes = self.byteset.byte_classes();
1001         // Add transitions (and maybe matches) to the anchored starting state.
1002         // The anchored starting state is used for anchored searches. The only
1003         // mechanical difference between it and the unanchored start state is
1004         // that missing transitions map to the DEAD state instead of the FAIL
1005         // state.
1006         self.set_anchored_start_state()?;
1007         // Rewrite transitions to the FAIL state on the unanchored start state
1008         // as self-transitions. This keeps the start state active at all times.
1009         self.add_unanchored_start_state_loop();
1010         // Make some (possibly zero) states use a dense representation for
1011         // transitions. It's important to do this right after the states
1012         // and non-failure transitions are solidified. That way, subsequent
1013         // accesses (particularly `fill_failure_transitions`) will benefit from
1014         // the faster transition lookup in densified states.
1015         self.densify()?;
1016         // The meat of the Aho-Corasick algorithm: compute and write failure
1017         // transitions. i.e., the state to move to when a transition isn't
1018         // defined in the current state. These are epsilon transitions and thus
1019         // make this formulation an NFA.
1020         self.fill_failure_transitions()?;
1021         // Handle a special case under leftmost semantics when at least one
1022         // of the patterns is the empty string.
1023         self.close_start_state_loop_for_leftmost();
1024         // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
1025         // NON-MATCH, ... This permits us to very quickly query the type of
1026         // the state we're currently in during a search.
1027         self.shuffle();
1028         self.nfa.prefilter = self.prefilter.build();
1029         // Store the maximum ID of all *relevant* special states. Start states
1030         // are only relevant when we have a prefilter, otherwise, there is zero
1031         // reason to care about whether a state is a start state or not during
1032         // a search. Indeed, without a prefilter, we are careful to explicitly
1033         // NOT care about start states, otherwise the search can ping pong
1034         // between the unrolled loop and the handling of special-status states
1035         // and destroy perf.
1036         self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
1037             // Why the anchored starting state? Because we always put it
1038             // after the unanchored starting state and it is therefore the
1039             // maximum. Why put unanchored followed by anchored? No particular
1040             // reason, but that's how the states are logically organized in the
1041             // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
1042             self.nfa.special.start_anchored_id
1043         } else {
1044             self.nfa.special.max_match_id
1045         };
1046         self.nfa.sparse.shrink_to_fit();
1047         self.nfa.dense.shrink_to_fit();
1048         self.nfa.matches.shrink_to_fit();
1049         self.nfa.pattern_lens.shrink_to_fit();
1050         Ok(self.nfa)
1051     }
1052 
1053     /// This sets up the initial prefix trie that makes up the Aho-Corasick
1054     /// automaton. Effectively, it creates the basic structure of the
1055     /// automaton, where every pattern given has a path from the start state to
1056     /// the end of the pattern.
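    ///
    /// As a sketch (state names are illustrative and `S0` stands in for the
    /// unanchored start state), the patterns `ab` and `ac` produce a trie
    /// shaped like this, with `*` marking match states:
    ///
    /// ```ignore
    ///     S0 - a - S1 - b - S2*
    ///               \
    ///                c - S3*
    /// ```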
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
1058     where
1059         I: IntoIterator<Item = P>,
1060         P: AsRef<[u8]>,
1061     {
1062         'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
1063             let pid = PatternID::new(i).map_err(|e| {
1064                 BuildError::pattern_id_overflow(
1065                     PatternID::MAX.as_u64(),
1066                     e.attempted(),
1067                 )
1068             })?;
1069             let pat = pat.as_ref();
1070             let patlen = SmallIndex::new(pat.len())
1071                 .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
1072             self.nfa.min_pattern_len =
1073                 core::cmp::min(self.nfa.min_pattern_len, pat.len());
1074             self.nfa.max_pattern_len =
1075                 core::cmp::max(self.nfa.max_pattern_len, pat.len());
1076             assert_eq!(
1077                 i,
1078                 self.nfa.pattern_lens.len(),
1079                 "expected number of patterns to match pattern ID"
1080             );
1081             self.nfa.pattern_lens.push(patlen);
1082             // We add the pattern to the prefilter here because the pattern
1083             // ID in the prefilter is determined with respect to the patterns
1084             // added to the prefilter. That is, it isn't the ID we have here,
1085             // but the one determined by its own accounting of patterns.
1086             // To ensure they line up, we add every pattern we see to the
1087             // prefilter, even if some patterns ultimately are impossible to
1088             // match (in leftmost-first semantics specifically).
1089             //
1090             // Another way of doing this would be to expose an API in the
1091             // prefilter to permit setting your own pattern IDs. Or to just use
1092             // our own map and go between them. But this case is sufficiently
1093             // rare that we don't bother and just make sure they're in sync.
1094             if self.builder.prefilter {
1095                 self.prefilter.add(pat);
1096             }
1097 
1098             let mut prev = self.nfa.special.start_unanchored_id;
1099             let mut saw_match = false;
1100             for (depth, &b) in pat.iter().enumerate() {
1101                 // When leftmost-first match semantics are requested, we
1102                 // specifically stop adding patterns when a previously added
1103                 // pattern is a prefix of it. We avoid adding it because
1104                 // leftmost-first semantics imply that the pattern can never
1105                 // match. This is not just an optimization to save space! It
1106                 // is necessary for correctness. In fact, this is the only
1107                 // difference in the automaton between the implementations for
1108                 // leftmost-first and leftmost-longest.
1109                 saw_match = saw_match || self.nfa.states[prev].is_match();
1110                 if self.builder.match_kind.is_leftmost_first() && saw_match {
1111                     // Skip to the next pattern immediately. This avoids
1112                     // incorrectly adding a match after this loop terminates.
1113                     continue 'PATTERNS;
1114                 }
1115 
1116                 // Add this byte to our equivalence classes. These don't
1117                 // get used while building the trie, but other Aho-Corasick
1118                 // implementations may use them.
1119                 self.byteset.set_range(b, b);
1120                 if self.builder.ascii_case_insensitive {
1121                     let b = opposite_ascii_case(b);
1122                     self.byteset.set_range(b, b);
1123                 }
1124 
1125                 // If the transition from prev using the current byte already
1126                 // exists, then just move through it. Otherwise, add a new
1127                 // state. We track the depth here so that we can determine
1128                 // how to represent transitions. States near the start state
1129                 // use a dense representation that uses more memory but is
1130                 // faster. Other states use a sparse representation that uses
1131                 // less memory but is slower.
1132                 let next = self.nfa.follow_transition(prev, b);
1133                 if next != NFA::FAIL {
1134                     prev = next;
1135                 } else {
1136                     let next = self.nfa.alloc_state(depth)?;
1137                     self.nfa.add_transition(prev, b, next)?;
1138                     if self.builder.ascii_case_insensitive {
1139                         let b = opposite_ascii_case(b);
1140                         self.nfa.add_transition(prev, b, next)?;
1141                     }
1142                     prev = next;
1143                 }
1144             }
1145             // Once the pattern has been added, log the match in the final
1146             // state that it reached.
1147             self.nfa.add_match(prev, pid)?;
1148         }
1149         Ok(())
1150     }
1151 
1152     /// This routine creates failure transitions according to the standard
1153     /// textbook formulation of the Aho-Corasick algorithm, with a couple small
1154     /// tweaks to support "leftmost" semantics.
1155     ///
1156     /// Building failure transitions is the most interesting part of building
1157     /// the Aho-Corasick automaton, because they are what allow searches to
1158     /// be performed in linear time. Specifically, a failure transition is
1159     /// a single transition associated with each state that points back to
1160     /// the longest proper suffix of the pattern being searched. The failure
1161     /// transition is followed whenever there exists no transition on the
1162     /// current state for the current input byte. If there is no other proper
1163     /// suffix, then the failure transition points back to the starting state.
1164     ///
1165     /// For example, let's say we built an Aho-Corasick automaton with the
1166     /// following patterns: 'abcd' and 'cef'. The trie looks like this:
1167     ///
1168     /// ```ignore
1169     ///          a - S1 - b - S2 - c - S3 - d - S4*
1170     ///         /
1171     ///     S0 - c - S5 - e - S6 - f - S7*
1172     /// ```
1173     ///
1174     /// At this point, it should be fairly straight-forward to see how this
1175     /// trie can be used in a simplistic way. At any given position in the
1176     /// text we're searching (called the "subject" string), all we need to do
1177     /// is follow the transitions in the trie by consuming one transition for
1178     /// each byte in the subject string. If we reach a match state, then we can
1179     /// report that location as a match.
1180     ///
1181     /// The trick comes when searching a subject string like 'abcef'. We'll
1182     /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
1184     /// S3 has no transition for 'e', so the search fails. We then would need
1185     /// to restart the search at the next position in 'abcef', which
1186     /// corresponds to 'b'. The match would fail, but the next search starting
1187     /// at 'c' would finally succeed. The problem with this approach is that
1188     /// we wind up searching the subject string potentially many times. In
1189     /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
1190     /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
1191     /// like to achieve a `O(n + m)` worst case complexity.
1192     ///
1193     /// This is where failure transitions come in. Instead of dying at S3 in
1194     /// the first search, the automaton can instruct the search to move to
1195     /// another part of the automaton that corresponds to a suffix of what
1196     /// we've seen so far. Recall that we've seen 'abc' in the subject string,
1197     /// and the automaton does indeed have a non-empty suffix, 'c', that could
1198     /// potentially lead to another match. Thus, the actual Aho-Corasick
1199     /// automaton for our patterns in this case looks like this:
1200     ///
1201     /// ```ignore
1202     ///          a - S1 - b - S2 - c - S3 - d - S4*
1203     ///         /                      /
1204     ///        /       ----------------
1205     ///       /       /
1206     ///     S0 - c - S5 - e - S6 - f - S7*
1207     /// ```
1208     ///
1209     /// That is, we have a failure transition from S3 to S5, which is followed
1210     /// exactly in cases when we are in state S3 but see any byte other than
1211     /// 'd' (that is, we've "failed" to find a match in this portion of our
1212     /// trie). We know we can transition back to S5 because we've already seen
1213     /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
1214     /// with the search starting at S5 and complete our match.
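    ///
    /// At the public API level, this is (roughly) what the failure transition
    /// buys us; a short sketch using the crate's `AhoCorasick` type:
    ///
    /// ```ignore
    /// use aho_corasick::AhoCorasick;
    ///
    /// let ac = AhoCorasick::new(["abcd", "cef"]).unwrap();
    /// // The scan of 'abcef' never re-reads a byte: after failing on 'e' in
    /// // S3, the automaton falls back to S5 and completes the match of
    /// // 'cef' (pattern 1) at offsets 2..5.
    /// let m = ac.find("abcef").unwrap();
    /// assert_eq!(1, m.pattern().as_usize());
    /// assert_eq!(2..5, m.range());
    /// ```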
1215     ///
1216     /// Adding failure transitions to a trie is fairly simple, but subtle. The
1217     /// key issue is that you might have multiple failure transitions that you
1218     /// need to follow. For example, look at the trie for the patterns
1219     /// 'abcd', 'b', 'bcd' and 'cd':
1220     ///
1221     /// ```ignore
1222     ///          - a - S1 - b - S2* - c - S3 - d - S4*
1223     ///         /               /         /
1224     ///        /         -------   -------
1225     ///       /         /         /
1226     ///     S0 --- b - S5* - c - S6 - d - S7*
1227     ///       \                  /
1228     ///        \         --------
1229     ///         \       /
1230     ///          - c - S8 - d - S9*
1231     /// ```
1232     ///
1233     /// The failure transitions for this trie are defined from S2 to S5,
1234     /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
1235     /// corresponds to a match, since its failure transition to S5 is itself
1236     /// a match state.
1237     ///
1238     /// Perhaps the simplest way to think about adding these failure transitions
1239     /// is recursively. That is, if you know the failure transitions for every
1240     /// possible previous state that could be visited (e.g., when computing the
1241     /// failure transition for S3, you already know the failure transitions
1242     /// for S0, S1 and S2), then you can simply follow the failure transition
1243     /// of the previous state and check whether the incoming transition is
1244     /// defined after following the failure transition.
1245     ///
1246     /// For example, when determining the failure state for S3, by our
1247     /// assumptions, we already know that there is a failure transition from
1248     /// S2 (the previous state) to S5. So we follow that transition and check
1249     /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
1250     /// as there is a transition from S5 to S6 for the byte 'c'. If no such
1251     /// transition existed, we could keep following the failure transitions
1252     /// until we reach the start state, which is the failure transition for
1253     /// every state that has no corresponding proper suffix.
1254     ///
1255     /// We don't actually use recursion to implement this, but instead, use a
1256     /// breadth first search of the automaton. Our base case is the start
1257     /// state, whose failure transition is just a transition to itself.
1258     ///
1259     /// When building a leftmost automaton, we proceed as above, but only
1260     /// include a subset of failure transitions. Namely, we omit any failure
1261     /// transitions that appear after a match state in the trie. This is
1262     /// because failure transitions always point back to a proper suffix of
1263     /// what has been seen so far. Thus, following a failure transition after
1264     /// a match implies looking for a match that starts after the one that has
1265     /// already been seen, which therefore cannot be the leftmost match.
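    ///
    /// A short illustration of the difference at the public API level (a
    /// sketch using the crate's `AhoCorasick` builder):
    ///
    /// ```ignore
    /// use aho_corasick::{AhoCorasick, MatchKind};
    ///
    /// // Standard semantics report the earliest match to *end*: 'b' at 1..2.
    /// let ac = AhoCorasick::new(["b", "abc"]).unwrap();
    /// assert_eq!(1..2, ac.find("abc").unwrap().range());
    ///
    /// // Leftmost-first semantics report a match starting at the leftmost
    /// // position, preferring the pattern given first: 'abc' at 0..3.
    /// let ac = AhoCorasick::builder()
    ///     .match_kind(MatchKind::LeftmostFirst)
    ///     .build(["b", "abc"])
    ///     .unwrap();
    /// assert_eq!(0..3, ac.find("abc").unwrap().range());
    /// ```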
1266     ///
1267     /// N.B. I came up with this algorithm on my own, and after scouring all of
1268     /// the other AC implementations I know of (Perl, Snort, many on GitHub).
1269     /// I couldn't find any that implement leftmost semantics like this.
1270     /// Perl of course needs leftmost-first semantics, but they implement it
1271     /// with a seeming hack at *search* time instead of encoding it into the
1272     /// automaton. There are also a couple Java libraries that support leftmost
1273     /// longest semantics, but they do it by building a queue of matches at
1274     /// search time, which is even worse than what Perl is doing. ---AG
1275     fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
1276         let is_leftmost = self.builder.match_kind.is_leftmost();
1277         let start_uid = self.nfa.special.start_unanchored_id;
1278         // Initialize the queue for breadth first search with all transitions
1279         // out of the start state. We handle the start state specially because
1280         // we only want to follow non-self transitions. If we followed self
1281         // transitions, then this would never terminate.
1282         let mut queue = VecDeque::new();
1283         let mut seen = self.queued_set();
1284         let mut prev_link = None;
1285         while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
1286             prev_link = Some(link);
1287             let t = self.nfa.sparse[link];
1288 
1289             // Skip anything we've seen before and any self-transitions on the
1290             // start state.
1291             if start_uid == t.next() || seen.contains(t.next) {
1292                 continue;
1293             }
1294             queue.push_back(t.next);
1295             seen.insert(t.next);
1296             // Under leftmost semantics, if a state immediately following
1297             // the start state is a match state, then we never want to
1298             // follow its failure transition since the failure transition
1299             // necessarily leads back to the start state, which we never
1300             // want to do for leftmost matching after a match has been
1301             // found.
1302             //
1303             // We apply the same logic to non-start states below as well.
1304             if is_leftmost && self.nfa.states[t.next].is_match() {
1305                 self.nfa.states[t.next].fail = NFA::DEAD;
1306             }
1307         }
1308         while let Some(id) = queue.pop_front() {
1309             let mut prev_link = None;
1310             while let Some(link) = self.nfa.next_link(id, prev_link) {
1311                 prev_link = Some(link);
1312                 let t = self.nfa.sparse[link];
1313 
1314                 if seen.contains(t.next) {
1315                     // The only way to visit a duplicate state in a transition
1316                     // list is when ASCII case insensitivity is enabled. In
1317                     // this case, we want to skip it since it's redundant work.
1318                     // But it would also end up duplicating matches, which
1319                     // results in reporting duplicate matches in some cases.
1320                     // See the 'acasei010' regression test.
1321                     continue;
1322                 }
1323                 queue.push_back(t.next);
1324                 seen.insert(t.next);
1325 
1326                 // As above for start states, under leftmost semantics, once
1327                 // we see a match all subsequent states should have no failure
1328                 // transitions because failure transitions always imply looking
1329                 // for a match that is a suffix of what has been seen so far
1330                 // (where "seen so far" corresponds to the string formed by
1331                 // following the transitions from the start state to the
1332                 // current state). Under leftmost semantics, we specifically do
1333                 // not want to allow this to happen because we always want to
1334                 // report the match found at the leftmost position.
1335                 //
1336                 // The difference between leftmost-first and leftmost-longest
1337                 // is handled earlier, while we build the trie. For
1338                 // leftmost-first, we simply omit any entries that would
1339                 // otherwise require passing through a match state.
1340                 //
1341                 // Note that for correctness, the failure transition has to be
1342                 // set to the dead state for ALL states following a match, not
1343                 // just the match state itself. However, by setting the failure
1344                 // transition to the dead state on all match states, the dead
1345                 // state will automatically propagate to all subsequent states
1346                 // via the failure state computation below.
1347                 if is_leftmost && self.nfa.states[t.next].is_match() {
1348                     self.nfa.states[t.next].fail = NFA::DEAD;
1349                     continue;
1350                 }
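                // Compute the failure transition for 't.next' by walking up
                // its parent's ('id') chain of failure transitions until we
                // reach a state that has a transition defined for 't.byte'.
                // This walk always terminates because the unanchored start
                // state (and the dead state, which can appear on the chain
                // under leftmost semantics) has a transition defined for
                // every byte. We also copy the failure state's matches, since
                // anything it matches is a suffix of what has been seen upon
                // reaching 't.next'.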
1351                 let mut fail = self.nfa.states[id].fail;
1352                 while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
1353                     fail = self.nfa.states[fail].fail;
1354                 }
1355                 fail = self.nfa.follow_transition(fail, t.byte);
1356                 self.nfa.states[t.next].fail = fail;
1357                 self.nfa.copy_matches(fail, t.next)?;
1358             }
1359             // If the start state is a match state, then this automaton can
1360             // match the empty string. This implies all states are match states
1361             // since every position matches the empty string, so copy the
1362             // matches from the start state to every state. Strictly speaking,
1363             // this is only necessary for overlapping matches since each
1364             // non-empty non-start match state needs to report empty matches
1365             // in addition to its own. For the non-overlapping case, such
1366             // states only report the first match, which is never empty since
1367             // it isn't a start state.
1368             if !is_leftmost {
1369                 self.nfa
1370                     .copy_matches(self.nfa.special.start_unanchored_id, id)?;
1371             }
1372         }
1373         Ok(())
1374     }
1375 
1376     /// Shuffle the states so that they appear in this sequence:
1377     ///
1378     ///   DEAD, FAIL, MATCH..., START, START, NON-MATCH...
1379     ///
1380     /// The idea here is that if we know how special states are laid out in our
1381     /// transition table, then we can determine what "kind" of state we're in
1382     /// just by comparing our current state ID with a particular value. In this
1383     /// way, we avoid doing extra memory lookups.
1384     ///
1385     /// Before shuffling begins, our states look something like this:
1386     ///
1387     ///   DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
1388     ///
1389     /// So all we need to do is move all of the MATCH states so that they
1390     /// all appear before any NON-MATCH state, like so:
1391     ///
1392     ///   DEAD, FAIL, START, START, MATCH... NON-MATCH...
1393     ///
1394     /// Then it's just a simple matter of swapping the two START states with
1395     /// the last two MATCH states.
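    ///
    /// As a purely illustrative example with made-up state indices, a layout
    /// of
    ///
    ///   0=DEAD, 1=FAIL, 2=START, 3=START, 4=NON-MATCH, 5=MATCH, 6=MATCH
    ///
    /// first becomes
    ///
    ///   0=DEAD, 1=FAIL, 2=START, 3=START, 4=MATCH, 5=MATCH, 6=NON-MATCH
    ///
    /// and then, after swapping the START states with the last two MATCH
    /// states,
    ///
    ///   0=DEAD, 1=FAIL, 2=MATCH, 3=MATCH, 4=START, 5=START, 6=NON-MATCH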
1396     ///
1397     /// (This is the same technique used for fully compiled DFAs in
1398     /// regex-automata.)
1399     fn shuffle(&mut self) {
1400         let old_start_uid = self.nfa.special.start_unanchored_id;
1401         let old_start_aid = self.nfa.special.start_anchored_id;
1402         assert!(old_start_uid < old_start_aid);
1403         assert_eq!(
1404             3,
1405             old_start_aid.as_usize(),
1406             "anchored start state should be at index 3"
1407         );
1408         // We implement shuffling by a sequence of pairwise swaps of states.
1409         // Since we have a number of things referencing states via their
1410         // IDs and swapping them changes their IDs, we need to record every
1411         // swap we make so that we can remap IDs. The remapper handles this
1412         // book-keeping for us.
1413         let mut remapper = Remapper::new(&self.nfa, 0);
1414         // The way we proceed here is by moving all match states so that
1415         // they directly follow the start states. So it will go: DEAD, FAIL,
1416         // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
1417         //
1418         // To do that, we proceed forward through all states after
1419         // START-ANCHORED and swap match states so that they appear before all
1420         // non-match states.
1421         let mut next_avail = StateID::from(4u8);
1422         for i in next_avail.as_usize()..self.nfa.states.len() {
1423             let sid = StateID::new(i).unwrap();
1424             if !self.nfa.states[sid].is_match() {
1425                 continue;
1426             }
1427             remapper.swap(&mut self.nfa, sid, next_avail);
1428             // The key invariant here is that only non-match states exist
1429             // between 'next_avail' and 'sid' (with them being potentially
1430             // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
1431             // to land on the leftmost non-match state. (Unless 'next_avail'
1432             // and 'sid' are equivalent, in which case, a swap will occur but
1433             // it is a no-op.)
1434             next_avail = StateID::new(next_avail.one_more()).unwrap();
1435         }
1436         // Now we'd like to move the start states to immediately following the
1437         // match states. (The start states may themselves be match states, but
1438         // we'll handle that later.) We arrange the states this way so that we
1439         // don't necessarily need to check whether a state is a start state or
1440         // not before checking whether a state is a match state. For example,
1441         // we'd like to be able to write this as our state machine loop:
1442         //
1443         //   sid = start()
1444         //   for byte in haystack:
1445         //     sid = next(sid, byte)
1446         //     if sid <= nfa.max_start_id:
1447         //       if sid <= nfa.max_dead_id:
1448         //         # search complete
1449         //       elif sid <= nfa.max_match_id:
1450         //         # found match
1451         //
1452         // The important context here is that we might not want to look for
1453         // start states at all. Namely, if a searcher doesn't have a prefilter,
1454         // then there is no reason to care about whether we're in a start state
1455         // or not. And indeed, if we did check for it, this very hot loop would
1456         // ping pong between the special state handling and the main state
1457         // transition logic. This in turn stalls the CPU by killing branch
1458         // prediction.
1459         //
1460         // So essentially, we really want to be able to "forget" that start
1461         // states even exist and this is why we put them at the end.
1462         let new_start_aid =
1463             StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
1464                 .unwrap();
1465         remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
1466         let new_start_uid =
1467             StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
1468                 .unwrap();
1469         remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
1470         let new_max_match_id =
1471             StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
1472                 .unwrap();
1473         self.nfa.special.max_match_id = new_max_match_id;
1474         self.nfa.special.start_unanchored_id = new_start_uid;
1475         self.nfa.special.start_anchored_id = new_start_aid;
1476         // If one start state is a match state, then they both are.
1477         if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
1478             self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
1479         }
1480         remapper.remap(&mut self.nfa);
1481     }
1482 
1483     /// Attempts to convert the transition representation of a subset of states
1484     /// in this NFA from sparse to dense. This can greatly improve search
1485     /// performance since states with a higher number of transitions tend to
1486     /// correlate with very active states.
1487     ///
1488     /// We generally only densify states that are close to the start state.
1489     /// These tend to be the most active states and thus benefit from a dense
1490     /// representation more than other states.
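    ///
    /// Concretely, a dense lookup is just an index into a flat table keyed by
    /// byte class. A sketch of the idea (mirroring how the table is filled in
    /// below, not any particular accessor in this crate):
    ///
    /// ```ignore
    /// let class = usize::from(nfa.byte_classes.get(byte));
    /// let next = nfa.dense[state.dense.as_usize() + class];
    /// ```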
1491     ///
1492     /// This tends to best balance between memory usage and performance. In
1493     /// particular, the *vast majority* of all states in a typical Aho-Corasick
1494     /// automaton have only 1 transition and are usually farther from the start
1495     /// state and thus don't get densified.
1496     ///
1497     /// Note that this doesn't remove the sparse representation of transitions
1498     /// for states that are densified. It could be done, but actually removing
1499     /// entries from `NFA::sparse` is likely more expensive than it's worth.
1500     fn densify(&mut self) -> Result<(), BuildError> {
1501         for i in 0..self.nfa.states.len() {
1502             let sid = StateID::new(i).unwrap();
1503             // Don't bother densifying states that are only used as sentinels.
1504             if sid == NFA::DEAD || sid == NFA::FAIL {
1505                 continue;
1506             }
1507             // Only densify states that are "close enough" to the start state.
1508             if self.nfa.states[sid].depth.as_usize()
1509                 >= self.builder.dense_depth
1510             {
1511                 continue;
1512             }
1513             let dense = self.nfa.alloc_dense_state()?;
1514             let mut prev_link = None;
1515             while let Some(link) = self.nfa.next_link(sid, prev_link) {
1516                 prev_link = Some(link);
1517                 let t = self.nfa.sparse[link];
1518 
1519                 let class = usize::from(self.nfa.byte_classes.get(t.byte));
1520                 let index = dense.as_usize() + class;
1521                 self.nfa.dense[index] = t.next;
1522             }
1523             self.nfa.states[sid].dense = dense;
1524         }
1525         Ok(())
1526     }
1527 
1528     /// Returns a set that tracks queued states.
1529     ///
1530     /// This is only necessary when ASCII case insensitivity is enabled, since
1531     /// it is the only way to visit the same state twice. Otherwise, this
1532     /// returns an inert set that never adds anything and always reports
1533     /// `false` for every member test.
1534     fn queued_set(&self) -> QueuedSet {
1535         if self.builder.ascii_case_insensitive {
1536             QueuedSet::active()
1537         } else {
1538             QueuedSet::inert()
1539         }
1540     }
1541 
1542     /// Initializes the unanchored and anchored start states by making them
1543     /// dense. This is achieved by explicitly setting every transition to the
1544     /// FAIL state. This isn't necessary for correctness, since any missing
1545     /// transition is automatically assumed to be mapped to the FAIL state. We
1546     /// do this to make the starting states dense, and thus in turn make
1547     /// transition lookups on them faster. (Which is worth doing because the
1548     /// unanchored start state is the most active state.)
1549     fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> {
1550         let start_uid = self.nfa.special.start_unanchored_id;
1551         let start_aid = self.nfa.special.start_anchored_id;
1552         self.nfa.init_full_state(start_uid, NFA::FAIL)?;
1553         self.nfa.init_full_state(start_aid, NFA::FAIL)?;
1554         Ok(())
1555     }
1556 
1557     /// Sets up the anchored start state by copying all of the transitions and
1558     /// matches from the unanchored starting state with one change: the failure
1559     /// transition is changed to the DEAD state, so that for any undefined
1560     /// transitions, the search will stop.
1561     fn set_anchored_start_state(&mut self) -> Result<(), BuildError> {
1562         let start_uid = self.nfa.special.start_unanchored_id;
1563         let start_aid = self.nfa.special.start_anchored_id;
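        // Both starting states were initialized as "full" states, with a
        // transition (initially to FAIL) explicitly set for the entire
        // alphabet (see init_unanchored_start_state). So their transition
        // lists have the same length and byte order, which is why we can
        // walk them in lockstep below and why any combination other than
        // Some/Some or None/None is unreachable.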
1564         let (mut uprev_link, mut aprev_link) = (None, None);
1565         loop {
1566             let unext = self.nfa.next_link(start_uid, uprev_link);
1567             let anext = self.nfa.next_link(start_aid, aprev_link);
1568             let (ulink, alink) = match (unext, anext) {
1569                 (Some(ulink), Some(alink)) => (ulink, alink),
1570                 (None, None) => break,
1571                 _ => unreachable!(),
1572             };
1573             uprev_link = Some(ulink);
1574             aprev_link = Some(alink);
1575             self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next;
1576         }
1577         self.nfa.copy_matches(start_uid, start_aid)?;
1578         // This is the main difference between the unanchored and anchored
1579         // starting states. If a lookup on an anchored starting state fails,
1580         // then the search should stop.
1581         //
1582         // N.B. This assumes that the loop on the unanchored starting state
1583         // hasn't been created yet.
1584         self.nfa.states[start_aid].fail = NFA::DEAD;
1585         Ok(())
1586     }
1587 
1588     /// Rewrite the unanchored start state's transitions that would otherwise
1589     /// fail so that they loop back to the start state. This effectively
1590     /// permits the Aho-Corasick automaton to match at any position. It is also
1591     /// required for the "find next state" routine to terminate, namely, finding
1592     /// the next state should never return a fail_id.
1593     ///
1594     /// This must be done after building the initial trie, since trie
1595     /// construction depends on transitions to `fail_id` to determine whether a
1596     /// state already exists or not.
1597     fn add_unanchored_start_state_loop(&mut self) {
1598         let start_uid = self.nfa.special.start_unanchored_id;
1599         let mut prev_link = None;
1600         while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
1601             prev_link = Some(link);
1602             if self.nfa.sparse[link].next() == NFA::FAIL {
1603                 self.nfa.sparse[link].next = start_uid;
1604             }
1605         }
1606     }
1607 
1608     /// Remove the start state loop by rewriting any transitions on the start
1609     /// state back to the start state with transitions to the dead state.
1610     ///
1611     /// The loop is only closed when two conditions are met: the start state
1612     /// is a match state and the match kind is leftmost-first or
1613     /// leftmost-longest.
1614     ///
1615     /// The reason for this is that under leftmost semantics, a start state
1616     /// that is also a match implies that we should never restart the search
1617     /// process. We allow normal transitions out of the start state, but if
1618     /// none exist, we transition to the dead state, which signals that
1619     /// searching should stop.
1620     fn close_start_state_loop_for_leftmost(&mut self) {
1621         let start_uid = self.nfa.special.start_unanchored_id;
1622         let start = &mut self.nfa.states[start_uid];
1623         let dense = start.dense;
1624         if self.builder.match_kind.is_leftmost() && start.is_match() {
1625             let mut prev_link = None;
1626             while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
1627                 prev_link = Some(link);
1628                 if self.nfa.sparse[link].next() == start_uid {
1629                     self.nfa.sparse[link].next = NFA::DEAD;
1630                     if dense != StateID::ZERO {
1631                         let b = self.nfa.sparse[link].byte;
1632                         let class = usize::from(self.nfa.byte_classes.get(b));
1633                         self.nfa.dense[dense.as_usize() + class] = NFA::DEAD;
1634                     }
1635                 }
1636             }
1637         }
1638     }
1639 
1640     /// Sets all transitions on the dead state to point back to the dead state.
1641     /// Normally, missing transitions map back to the failure state, but the
1642     /// point of the dead state is to act as a sink that can never be escaped.
1643     fn add_dead_state_loop(&mut self) -> Result<(), BuildError> {
1644         self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?;
1645         Ok(())
1646     }
1647 }
1648 
1649 /// A set of state identifiers used to avoid revisiting the same state multiple
1650 /// times when filling in failure transitions.
1651 ///
1652 /// This set has an "inert" and an "active" mode. When inert, the set never
1653 /// stores anything and always returns `false` for every member test. This is
1654 /// useful to avoid the performance and memory overhead of maintaining this
1655 /// set when it is not needed.
1656 #[derive(Debug)]
1657 struct QueuedSet {
1658     set: Option<BTreeSet<StateID>>,
1659 }
1660 
1661 impl QueuedSet {
1662     /// Return an inert set that returns `false` for every state ID membership
1663     /// test.
1664     fn inert() -> QueuedSet {
1665         QueuedSet { set: None }
1666     }
1667 
1668     /// Return an active set that tracks state ID membership.
1669     fn active() -> QueuedSet {
1670         QueuedSet { set: Some(BTreeSet::new()) }
1671     }
1672 
1673     /// Inserts the given state ID into this set. (If the set is inert, then
1674     /// this is a no-op.)
1675     fn insert(&mut self, state_id: StateID) {
1676         if let Some(ref mut set) = self.set {
1677             set.insert(state_id);
1678         }
1679     }
1680 
1681     /// Returns true if and only if the given state ID is in this set. If the
1682     /// set is inert, this always returns false.
1683     fn contains(&self, state_id: StateID) -> bool {
1684         match self.set {
1685             None => false,
1686             Some(ref set) => set.contains(&state_id),
1687         }
1688     }
1689 }
1690 
1691 impl core::fmt::Debug for NFA {
1692     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1693         use crate::{
1694             automaton::{fmt_state_indicator, sparse_transitions},
1695             util::debug::DebugByte,
1696         };
1697 
1698         writeln!(f, "noncontiguous::NFA(")?;
1699         for (sid, state) in self.states.iter().with_state_ids() {
1700             // The FAIL state doesn't actually have space for a state allocated
1701             // for it, so we have to treat it as a special case.
1702             if sid == NFA::FAIL {
1703                 writeln!(f, "F {:06}:", sid.as_usize())?;
1704                 continue;
1705             }
1706             fmt_state_indicator(f, self, sid)?;
1707             write!(
1708                 f,
1709                 "{:06}({:06}): ",
1710                 sid.as_usize(),
1711                 state.fail.as_usize()
1712             )?;
1713 
1714             let it = sparse_transitions(
1715                 self.iter_trans(sid).map(|t| (t.byte, t.next)),
1716             )
1717             .enumerate();
1718             for (i, (start, end, sid)) in it {
1719                 if i > 0 {
1720                     write!(f, ", ")?;
1721                 }
1722                 if start == end {
1723                     write!(
1724                         f,
1725                         "{:?} => {:?}",
1726                         DebugByte(start),
1727                         sid.as_usize()
1728                     )?;
1729                 } else {
1730                     write!(
1731                         f,
1732                         "{:?}-{:?} => {:?}",
1733                         DebugByte(start),
1734                         DebugByte(end),
1735                         sid.as_usize()
1736                     )?;
1737                 }
1738             }
1739 
1740             write!(f, "\n")?;
1741             if self.is_match(sid) {
1742                 write!(f, "         matches: ")?;
1743                 for (i, pid) in self.iter_matches(sid).enumerate() {
1744                     if i > 0 {
1745                         write!(f, ", ")?;
1746                     }
1747                     write!(f, "{}", pid.as_usize())?;
1748                 }
1749                 write!(f, "\n")?;
1750             }
1751         }
1752         writeln!(f, "match kind: {:?}", self.match_kind)?;
1753         writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
1754         writeln!(f, "state length: {:?}", self.states.len())?;
1755         writeln!(f, "pattern length: {:?}", self.patterns_len())?;
1756         writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
1757         writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
1758         writeln!(f, "memory usage: {:?}", self.memory_usage())?;
1759         writeln!(f, ")")?;
1760         Ok(())
1761     }
1762 }
1763