1 /*! 2 Provides a noncontiguous NFA implementation of Aho-Corasick. 3 4 This is a low-level API that generally only needs to be used in niche 5 circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) 6 instead of a noncontiguous NFA directly. Using an `NFA` directly is typically 7 only necessary when one needs access to the [`Automaton`] trait implementation. 8 */ 9 10 use alloc::{ 11 collections::{BTreeSet, VecDeque}, 12 vec, 13 vec::Vec, 14 }; 15 16 use crate::{ 17 automaton::Automaton, 18 util::{ 19 alphabet::{ByteClassSet, ByteClasses}, 20 error::{BuildError, MatchError}, 21 prefilter::{self, opposite_ascii_case, Prefilter}, 22 primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, 23 remapper::Remapper, 24 search::{Anchored, MatchKind}, 25 special::Special, 26 }, 27 }; 28 29 /// A noncontiguous NFA implementation of Aho-Corasick. 30 /// 31 /// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of 32 /// this type directly. Using an `NFA` directly is typically only necessary 33 /// when one needs access to the [`Automaton`] trait implementation. 34 /// 35 /// This NFA represents the "core" implementation of Aho-Corasick in this 36 /// crate. Namely, constructing this NFA involving building a trie and then 37 /// filling in the failure transitions between states, similar to what is 38 /// described in any standard textbook description of Aho-Corasick. 39 /// 40 /// In order to minimize heap usage and to avoid additional construction costs, 41 /// this implementation represents the transitions of all states as distinct 42 /// sparse memory allocations. This is where it gets its name from. That is, 43 /// this NFA has no contiguous memory allocation for its transition table. Each 44 /// state gets its own allocation. 45 /// 46 /// While the sparse representation keeps memory usage to somewhat reasonable 47 /// levels, it is still quite large and also results in somewhat mediocre 48 /// search performance. 
For this reason, it is almost always a good idea to 49 /// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is 50 /// marginally slower to build, but has higher throughput and can sometimes use 51 /// an order of magnitude less memory. The main reason to use a noncontiguous 52 /// NFA is when you need the fastest possible construction time, or when a 53 /// contiguous NFA does not have the desired capacity. (The total number of NFA 54 /// states it can have is fewer than a noncontiguous NFA.) 55 /// 56 /// # Example 57 /// 58 /// This example shows how to build an `NFA` directly and use it to execute 59 /// [`Automaton::try_find`]: 60 /// 61 /// ``` 62 /// use aho_corasick::{ 63 /// automaton::Automaton, 64 /// nfa::noncontiguous::NFA, 65 /// Input, Match, 66 /// }; 67 /// 68 /// let patterns = &["b", "abc", "abcd"]; 69 /// let haystack = "abcd"; 70 /// 71 /// let nfa = NFA::new(patterns).unwrap(); 72 /// assert_eq!( 73 /// Some(Match::must(0, 1..2)), 74 /// nfa.try_find(&Input::new(haystack))?, 75 /// ); 76 /// # Ok::<(), Box<dyn std::error::Error>>(()) 77 /// ``` 78 /// 79 /// It is also possible to implement your own version of `try_find`. See the 80 /// [`Automaton`] documentation for an example. 81 #[derive(Clone)] 82 pub struct NFA { 83 /// The match semantics built into this NFA. 84 match_kind: MatchKind, 85 /// A set of states. Each state defines its own transitions, a fail 86 /// transition and a set of indices corresponding to matches. 87 /// 88 /// The first state is always the fail state, which is used only as a 89 /// sentinel. Namely, in the final NFA, no transition into the fail state 90 /// exists. (Well, they do, but they aren't followed. Instead, the state's 91 /// failure transition is followed.) 92 /// 93 /// The second state (index 1) is always the dead state. Dead states are 94 /// in every automaton, but only used when leftmost-{first,longest} match 95 /// semantics are enabled. 
Specifically, they instruct search to stop 96 /// at specific points in order to report the correct match location. In 97 /// the standard Aho-Corasick construction, there are no transitions to 98 /// the dead state. 99 /// 100 /// The third state (index 2) is generally intended to be the starting or 101 /// "root" state. 102 states: Vec<State>, 103 /// Transitions stored in a sparse representation via a linked list. 104 /// 105 /// Each transition contains three pieces of information: the byte it 106 /// is defined for, the state it transitions to and a link to the next 107 /// transition in the same state (or `StateID::ZERO` if it is the last 108 /// transition). 109 /// 110 /// The first transition for each state is determined by `State::sparse`. 111 /// 112 /// Note that this contains a complete set of all transitions in this NFA, 113 /// including states that have a dense representation for transitions. 114 /// (Adding dense transitions for a state doesn't remove its sparse 115 /// transitions, since deleting transitions from this particular sparse 116 /// representation would be fairly expensive.) 117 sparse: Vec<Transition>, 118 /// Transitions stored in a dense representation. 119 /// 120 /// A state has a row in this table if and only if `State::dense` is 121 /// not equal to `StateID::ZERO`. When not zero, there are precisely 122 /// `NFA::byte_classes::alphabet_len()` entries beginning at `State::dense` 123 /// in this table. 124 /// 125 /// Generally a very small minority of states have a dense representation 126 /// since it uses so much memory. 127 dense: Vec<StateID>, 128 /// Matches stored in linked list for each state. 129 /// 130 /// Like sparse transitions, each match has a link to the next match in the 131 /// state. 132 /// 133 /// The first match for each state is determined by `State::matches`. 134 matches: Vec<Match>, 135 /// The length, in bytes, of each pattern in this NFA. This slice is 136 /// indexed by `PatternID`. 
137 /// 138 /// The number of entries in this vector corresponds to the total number of 139 /// patterns in this automaton. 140 pattern_lens: Vec<SmallIndex>, 141 /// A prefilter for quickly skipping to candidate matches, if pertinent. 142 prefilter: Option<Prefilter>, 143 /// A set of equivalence classes in terms of bytes. We compute this while 144 /// building the NFA, but don't use it in the NFA's states. Instead, we 145 /// use this for building the DFA. We store it on the NFA since it's easy 146 /// to compute while visiting the patterns. 147 byte_classes: ByteClasses, 148 /// The length, in bytes, of the shortest pattern in this automaton. This 149 /// information is useful for detecting whether an automaton matches the 150 /// empty string or not. 151 min_pattern_len: usize, 152 /// The length, in bytes, of the longest pattern in this automaton. This 153 /// information is useful for keeping correct buffer sizes when searching 154 /// on streams. 155 max_pattern_len: usize, 156 /// The information required to deduce which states are "special" in this 157 /// NFA. 158 /// 159 /// Since the DEAD and FAIL states are always the first two states and 160 /// there are only ever two start states (which follow all of the match 161 /// states), it follows that we can determine whether a state is a fail, 162 /// dead, match or start with just a few comparisons on the ID itself: 163 /// 164 /// is_dead(sid): sid == NFA::DEAD 165 /// is_fail(sid): sid == NFA::FAIL 166 /// is_match(sid): NFA::FAIL < sid && sid <= max_match_id 167 /// is_start(sid): sid == start_unanchored_id || sid == start_anchored_id 168 /// 169 /// Note that this only applies to the NFA after it has been constructed. 170 /// During construction, the start states are the first ones added and the 171 /// match states are inter-leaved with non-match states. Once all of the 172 /// states have been added, the states are shuffled such that the above 173 /// predicates hold. 
174 special: Special, 175 } 176 177 impl NFA { 178 /// Create a new Aho-Corasick noncontiguous NFA using the default 179 /// configuration. 180 /// 181 /// Use a [`Builder`] if you want to change the configuration. new<I, P>(patterns: I) -> Result<NFA, BuildError> where I: IntoIterator<Item = P>, P: AsRef<[u8]>,182 pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> 183 where 184 I: IntoIterator<Item = P>, 185 P: AsRef<[u8]>, 186 { 187 NFA::builder().build(patterns) 188 } 189 190 /// A convenience method for returning a new Aho-Corasick noncontiguous NFA 191 /// builder. 192 /// 193 /// This usually permits one to just import the `NFA` type. builder() -> Builder194 pub fn builder() -> Builder { 195 Builder::new() 196 } 197 } 198 199 impl NFA { 200 /// The DEAD state is a sentinel state like the FAIL state. The DEAD state 201 /// instructs any search to stop and return any currently recorded match, 202 /// or no match otherwise. Generally speaking, it is impossible for an 203 /// unanchored standard search to enter a DEAD state. But an anchored 204 /// search can, and so to can a leftmost search. 205 /// 206 /// We put DEAD before FAIL so that DEAD is always 0. We repeat this 207 /// decision across the other Aho-Corasicm automata, so that DEAD 208 /// states there are always 0 too. It's not that we need all of the 209 /// implementations to agree, but rather, the contiguous NFA and the DFA 210 /// use a sort of "premultiplied" state identifier where the only state 211 /// whose ID is always known and constant is the first state. Subsequent 212 /// state IDs depend on how much space has already been used in the 213 /// transition table. 214 pub(crate) const DEAD: StateID = StateID::new_unchecked(0); 215 /// The FAIL state mostly just corresponds to the ID of any transition on a 216 /// state that isn't explicitly defined. 
When one transitions into the FAIL 217 /// state, one must follow the previous state's failure transition before 218 /// doing the next state lookup. In this way, FAIL is more of a sentinel 219 /// than a state that one actually transitions into. In particular, it is 220 /// never exposed in the `Automaton` interface. 221 pub(crate) const FAIL: StateID = StateID::new_unchecked(1); 222 223 /// Returns the equivalence classes of bytes found while constructing 224 /// this NFA. 225 /// 226 /// Note that the NFA doesn't actually make use of these equivalence 227 /// classes. Instead, these are useful for building the DFA when desired. byte_classes(&self) -> &ByteClasses228 pub(crate) fn byte_classes(&self) -> &ByteClasses { 229 &self.byte_classes 230 } 231 232 /// Returns a slice containing the length of each pattern in this searcher. 233 /// It is indexed by `PatternID` and has length `NFA::patterns_len`. 234 /// 235 /// This is exposed for convenience when building a contiguous NFA. But it 236 /// can be reconstructed from the `Automaton` API if necessary. pattern_lens_raw(&self) -> &[SmallIndex]237 pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] { 238 &self.pattern_lens 239 } 240 241 /// Returns a slice of all states in this non-contiguous NFA. states(&self) -> &[State]242 pub(crate) fn states(&self) -> &[State] { 243 &self.states 244 } 245 246 /// Returns the underlying "special" state information for this NFA. special(&self) -> &Special247 pub(crate) fn special(&self) -> &Special { 248 &self.special 249 } 250 251 /// Swaps the states at `id1` and `id2`. 252 /// 253 /// This does not update the transitions of any state to account for the 254 /// state swap. swap_states(&mut self, id1: StateID, id2: StateID)255 pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) { 256 self.states.swap(id1.as_usize(), id2.as_usize()); 257 } 258 259 /// Re-maps all state IDs in this NFA according to the `map` function 260 /// given. 
remap(&mut self, map: impl Fn(StateID) -> StateID)261 pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { 262 let alphabet_len = self.byte_classes.alphabet_len(); 263 for state in self.states.iter_mut() { 264 state.fail = map(state.fail); 265 let mut link = state.sparse; 266 while link != StateID::ZERO { 267 let t = &mut self.sparse[link]; 268 t.next = map(t.next); 269 link = t.link; 270 } 271 if state.dense != StateID::ZERO { 272 let start = state.dense.as_usize(); 273 for next in self.dense[start..][..alphabet_len].iter_mut() { 274 *next = map(*next); 275 } 276 } 277 } 278 } 279 280 /// Iterate over all of the transitions for the given state ID. iter_trans( &self, sid: StateID, ) -> impl Iterator<Item = Transition> + '_281 pub(crate) fn iter_trans( 282 &self, 283 sid: StateID, 284 ) -> impl Iterator<Item = Transition> + '_ { 285 let mut link = self.states[sid].sparse; 286 core::iter::from_fn(move || { 287 if link == StateID::ZERO { 288 return None; 289 } 290 let t = self.sparse[link]; 291 link = t.link; 292 Some(t) 293 }) 294 } 295 296 /// Iterate over all of the matches for the given state ID. iter_matches( &self, sid: StateID, ) -> impl Iterator<Item = PatternID> + '_297 pub(crate) fn iter_matches( 298 &self, 299 sid: StateID, 300 ) -> impl Iterator<Item = PatternID> + '_ { 301 let mut link = self.states[sid].matches; 302 core::iter::from_fn(move || { 303 if link == StateID::ZERO { 304 return None; 305 } 306 let m = self.matches[link]; 307 link = m.link; 308 Some(m.pid) 309 }) 310 } 311 312 /// Return the link following the one given. If the one given is the last 313 /// link for the given state, then return `None`. 314 /// 315 /// If no previous link is given, then this returns the first link in the 316 /// state, if one exists. 317 /// 318 /// This is useful for manually iterating over the transitions in a single 319 /// state without borrowing the NFA. This permits mutating other parts of 320 /// the NFA during iteration. 
Namely, one can access the transition pointed 321 /// to by the link via `self.sparse[link]`. next_link( &self, sid: StateID, prev: Option<StateID>, ) -> Option<StateID>322 fn next_link( 323 &self, 324 sid: StateID, 325 prev: Option<StateID>, 326 ) -> Option<StateID> { 327 let link = 328 prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link); 329 if link == StateID::ZERO { 330 None 331 } else { 332 Some(link) 333 } 334 } 335 336 /// Follow the transition for the given byte in the given state. If no such 337 /// transition exists, then the FAIL state ID is returned. 338 #[inline(always)] follow_transition(&self, sid: StateID, byte: u8) -> StateID339 fn follow_transition(&self, sid: StateID, byte: u8) -> StateID { 340 let s = &self.states[sid]; 341 // This is a special case that targets starting states and states 342 // near a start state. Namely, after the initial trie is constructed, 343 // we look for states close to the start state to convert to a dense 344 // representation for their transitions. This winds up using a lot more 345 // memory per state in exchange for faster transition lookups. But 346 // since we only do this for a small number of states (by default), the 347 // memory usage is usually minimal. 348 // 349 // This has *massive* benefit when executing searches because the 350 // unanchored starting state is by far the hottest state and is 351 // frequently visited. Moreover, the 'for' loop below that works 352 // decently on an actually sparse state is disastrous on a state that 353 // is nearly or completely dense. 354 if s.dense == StateID::ZERO { 355 self.follow_transition_sparse(sid, byte) 356 } else { 357 let class = usize::from(self.byte_classes.get(byte)); 358 self.dense[s.dense.as_usize() + class] 359 } 360 } 361 362 /// Like `follow_transition`, but always uses the sparse representation. 
363 #[inline(always)] follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID364 fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID { 365 for t in self.iter_trans(sid) { 366 if byte <= t.byte { 367 if byte == t.byte { 368 return t.next; 369 } 370 break; 371 } 372 } 373 NFA::FAIL 374 } 375 376 /// Set the transition for the given byte to the state ID given. 377 /// 378 /// Note that one should not set transitions to the FAIL state. It is not 379 /// technically incorrect, but it wastes space. If a transition is not 380 /// defined, then it is automatically assumed to lead to the FAIL state. add_transition( &mut self, prev: StateID, byte: u8, next: StateID, ) -> Result<(), BuildError>381 fn add_transition( 382 &mut self, 383 prev: StateID, 384 byte: u8, 385 next: StateID, 386 ) -> Result<(), BuildError> { 387 if self.states[prev].dense != StateID::ZERO { 388 let dense = self.states[prev].dense; 389 let class = usize::from(self.byte_classes.get(byte)); 390 self.dense[dense.as_usize() + class] = next; 391 } 392 393 let head = self.states[prev].sparse; 394 if head == StateID::ZERO || byte < self.sparse[head].byte { 395 let new_link = self.alloc_transition()?; 396 self.sparse[new_link] = Transition { byte, next, link: head }; 397 self.states[prev].sparse = new_link; 398 return Ok(()); 399 } else if byte == self.sparse[head].byte { 400 self.sparse[head].next = next; 401 return Ok(()); 402 } 403 404 // We handled the only cases where the beginning of the transition 405 // chain needs to change. At this point, we now know that there is 406 // at least one entry in the transition chain and the byte for that 407 // transition is less than the byte for the transition we're adding. 
408 let (mut link_prev, mut link_next) = (head, self.sparse[head].link); 409 while link_next != StateID::ZERO && byte > self.sparse[link_next].byte 410 { 411 link_prev = link_next; 412 link_next = self.sparse[link_next].link; 413 } 414 if link_next == StateID::ZERO || byte < self.sparse[link_next].byte { 415 let link = self.alloc_transition()?; 416 self.sparse[link] = Transition { byte, next, link: link_next }; 417 self.sparse[link_prev].link = link; 418 } else { 419 assert_eq!(byte, self.sparse[link_next].byte); 420 self.sparse[link_next].next = next; 421 } 422 Ok(()) 423 } 424 425 /// This sets every possible transition (all 255 of them) for the given 426 /// state to the name `next` value. 427 /// 428 /// This is useful for efficiently initializing start/dead states. 429 /// 430 /// # Panics 431 /// 432 /// This requires that the state has no transitions added to it already. 433 /// If it has any transitions, then this panics. It will also panic if 434 /// the state has been densified prior to calling this. init_full_state( &mut self, prev: StateID, next: StateID, ) -> Result<(), BuildError>435 fn init_full_state( 436 &mut self, 437 prev: StateID, 438 next: StateID, 439 ) -> Result<(), BuildError> { 440 assert_eq!( 441 StateID::ZERO, 442 self.states[prev].dense, 443 "state must not be dense yet" 444 ); 445 assert_eq!( 446 StateID::ZERO, 447 self.states[prev].sparse, 448 "state must have zero transitions" 449 ); 450 let mut prev_link = StateID::ZERO; 451 for byte in 0..=255 { 452 let new_link = self.alloc_transition()?; 453 self.sparse[new_link] = 454 Transition { byte, next, link: StateID::ZERO }; 455 if prev_link == StateID::ZERO { 456 self.states[prev].sparse = new_link; 457 } else { 458 self.sparse[prev_link].link = new_link; 459 } 460 prev_link = new_link; 461 } 462 Ok(()) 463 } 464 465 /// Add a match for the given pattern ID to the state for the given ID. 
add_match( &mut self, sid: StateID, pid: PatternID, ) -> Result<(), BuildError>466 fn add_match( 467 &mut self, 468 sid: StateID, 469 pid: PatternID, 470 ) -> Result<(), BuildError> { 471 let head = self.states[sid].matches; 472 let mut link = head; 473 while self.matches[link].link != StateID::ZERO { 474 link = self.matches[link].link; 475 } 476 let new_match_link = self.alloc_match()?; 477 self.matches[new_match_link].pid = pid; 478 if link == StateID::ZERO { 479 self.states[sid].matches = new_match_link; 480 } else { 481 self.matches[link].link = new_match_link; 482 } 483 Ok(()) 484 } 485 486 /// Copy matches from the `src` state to the `dst` state. This is useful 487 /// when a match state can be reached via a failure transition. In which 488 /// case, you'll want to copy the matches (if any) from the state reached 489 /// by the failure transition to the original state you were at. copy_matches( &mut self, src: StateID, dst: StateID, ) -> Result<(), BuildError>490 fn copy_matches( 491 &mut self, 492 src: StateID, 493 dst: StateID, 494 ) -> Result<(), BuildError> { 495 let head_dst = self.states[dst].matches; 496 let mut link_dst = head_dst; 497 while self.matches[link_dst].link != StateID::ZERO { 498 link_dst = self.matches[link_dst].link; 499 } 500 let mut link_src = self.states[src].matches; 501 while link_src != StateID::ZERO { 502 let new_match_link = 503 StateID::new(self.matches.len()).map_err(|e| { 504 BuildError::state_id_overflow( 505 StateID::MAX.as_u64(), 506 e.attempted(), 507 ) 508 })?; 509 self.matches.push(Match { 510 pid: self.matches[link_src].pid, 511 link: StateID::ZERO, 512 }); 513 if link_dst == StateID::ZERO { 514 self.states[dst].matches = new_match_link; 515 } else { 516 self.matches[link_dst].link = new_match_link; 517 } 518 519 link_dst = new_match_link; 520 link_src = self.matches[link_src].link; 521 } 522 Ok(()) 523 } 524 525 /// Create a new entry in `NFA::trans`, if there's room, and return that 526 /// entry's ID. 
If there's no room, then an error is returned. alloc_transition(&mut self) -> Result<StateID, BuildError>527 fn alloc_transition(&mut self) -> Result<StateID, BuildError> { 528 let id = StateID::new(self.sparse.len()).map_err(|e| { 529 BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) 530 })?; 531 self.sparse.push(Transition::default()); 532 Ok(id) 533 } 534 535 /// Create a new entry in `NFA::matches`, if there's room, and return that 536 /// entry's ID. If there's no room, then an error is returned. alloc_match(&mut self) -> Result<StateID, BuildError>537 fn alloc_match(&mut self) -> Result<StateID, BuildError> { 538 let id = StateID::new(self.matches.len()).map_err(|e| { 539 BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) 540 })?; 541 self.matches.push(Match::default()); 542 Ok(id) 543 } 544 545 /// Create a new set of `N` transitions in this NFA's dense transition 546 /// table. The ID return corresponds to the index at which the `N` 547 /// transitions begin. So `id+0` is the first transition and `id+(N-1)` is 548 /// the last. 549 /// 550 /// `N` is determined via `NFA::byte_classes::alphabet_len`. alloc_dense_state(&mut self) -> Result<StateID, BuildError>551 fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> { 552 let id = StateID::new(self.dense.len()).map_err(|e| { 553 BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) 554 })?; 555 // We use FAIL because it's the correct default. If a state doesn't 556 // have a transition defined for every possible byte value, then the 557 // transition function should return NFA::FAIL. 558 self.dense.extend( 559 core::iter::repeat(NFA::FAIL) 560 .take(self.byte_classes.alphabet_len()), 561 ); 562 Ok(id) 563 } 564 565 /// Allocate and add a fresh state to the underlying NFA and return its 566 /// ID (guaranteed to be one more than the ID of the previously allocated 567 /// state). If the ID would overflow `StateID`, then this returns an error. 
alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError>568 fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> { 569 // This is OK because we error when building the trie if we see a 570 // pattern whose length cannot fit into a 'SmallIndex', and the longest 571 // possible depth corresponds to the length of the longest pattern. 572 let depth = SmallIndex::new(depth) 573 .expect("patterns longer than SmallIndex::MAX are not allowed"); 574 let id = StateID::new(self.states.len()).map_err(|e| { 575 BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted()) 576 })?; 577 self.states.push(State { 578 sparse: StateID::ZERO, 579 dense: StateID::ZERO, 580 matches: StateID::ZERO, 581 fail: self.special.start_unanchored_id, 582 depth, 583 }); 584 Ok(id) 585 } 586 } 587 588 // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always 589 // returns a valid state ID given a valid state ID. We otherwise claim that 590 // all other methods are correct as well. 591 unsafe impl Automaton for NFA { 592 #[inline(always)] start_state(&self, anchored: Anchored) -> Result<StateID, MatchError>593 fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { 594 match anchored { 595 Anchored::No => Ok(self.special.start_unanchored_id), 596 Anchored::Yes => Ok(self.special.start_anchored_id), 597 } 598 } 599 600 #[inline(always)] next_state( &self, anchored: Anchored, mut sid: StateID, byte: u8, ) -> StateID601 fn next_state( 602 &self, 603 anchored: Anchored, 604 mut sid: StateID, 605 byte: u8, 606 ) -> StateID { 607 // This terminates since: 608 // 609 // 1. state.fail never points to the FAIL state. 610 // 2. All state.fail values point to a state closer to the start state. 611 // 3. The start state has no transitions to the FAIL state. 
612 loop { 613 let next = self.follow_transition(sid, byte); 614 if next != NFA::FAIL { 615 return next; 616 } 617 // For an anchored search, we never follow failure transitions 618 // because failure transitions lead us down a path to matching 619 // a *proper* suffix of the path we were on. Thus, it can only 620 // produce matches that appear after the beginning of the search. 621 if anchored.is_anchored() { 622 return NFA::DEAD; 623 } 624 sid = self.states[sid].fail(); 625 } 626 } 627 628 #[inline(always)] is_special(&self, sid: StateID) -> bool629 fn is_special(&self, sid: StateID) -> bool { 630 sid <= self.special.max_special_id 631 } 632 633 #[inline(always)] is_dead(&self, sid: StateID) -> bool634 fn is_dead(&self, sid: StateID) -> bool { 635 sid == NFA::DEAD 636 } 637 638 #[inline(always)] is_match(&self, sid: StateID) -> bool639 fn is_match(&self, sid: StateID) -> bool { 640 // N.B. This returns true when sid==NFA::FAIL but that's okay because 641 // NFA::FAIL is not actually a valid state ID from the perspective of 642 // the Automaton trait. Namely, it is never returned by 'start_state' 643 // or by 'next_state'. So we don't need to care about it here. 
644 !self.is_dead(sid) && sid <= self.special.max_match_id 645 } 646 647 #[inline(always)] is_start(&self, sid: StateID) -> bool648 fn is_start(&self, sid: StateID) -> bool { 649 sid == self.special.start_unanchored_id 650 || sid == self.special.start_anchored_id 651 } 652 653 #[inline(always)] match_kind(&self) -> MatchKind654 fn match_kind(&self) -> MatchKind { 655 self.match_kind 656 } 657 658 #[inline(always)] patterns_len(&self) -> usize659 fn patterns_len(&self) -> usize { 660 self.pattern_lens.len() 661 } 662 663 #[inline(always)] pattern_len(&self, pid: PatternID) -> usize664 fn pattern_len(&self, pid: PatternID) -> usize { 665 self.pattern_lens[pid].as_usize() 666 } 667 668 #[inline(always)] min_pattern_len(&self) -> usize669 fn min_pattern_len(&self) -> usize { 670 self.min_pattern_len 671 } 672 673 #[inline(always)] max_pattern_len(&self) -> usize674 fn max_pattern_len(&self) -> usize { 675 self.max_pattern_len 676 } 677 678 #[inline(always)] match_len(&self, sid: StateID) -> usize679 fn match_len(&self, sid: StateID) -> usize { 680 self.iter_matches(sid).count() 681 } 682 683 #[inline(always)] match_pattern(&self, sid: StateID, index: usize) -> PatternID684 fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { 685 self.iter_matches(sid).nth(index).unwrap() 686 } 687 688 #[inline(always)] memory_usage(&self) -> usize689 fn memory_usage(&self) -> usize { 690 self.states.len() * core::mem::size_of::<State>() 691 + self.sparse.len() * core::mem::size_of::<Transition>() 692 + self.matches.len() * core::mem::size_of::<Match>() 693 + self.dense.len() * StateID::SIZE 694 + self.pattern_lens.len() * SmallIndex::SIZE 695 + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) 696 } 697 698 #[inline(always)] prefilter(&self) -> Option<&Prefilter>699 fn prefilter(&self) -> Option<&Prefilter> { 700 self.prefilter.as_ref() 701 } 702 } 703 704 /// A representation of a sparse NFA state for an Aho-Corasick automaton. 
705 /// 706 /// It contains the transitions to the next state, a failure transition for 707 /// cases where there exists no other transition for the current input byte 708 /// and the matches implied by visiting this state (if any). 709 #[derive(Clone, Debug)] 710 pub(crate) struct State { 711 /// A pointer to `NFA::trans` corresponding to the head of a linked list 712 /// containing all of the transitions for this state. 713 /// 714 /// This is `StateID::ZERO` if and only if this state has zero transitions. 715 sparse: StateID, 716 /// A pointer to a row of `N` transitions in `NFA::dense`. These 717 /// transitions correspond precisely to what is obtained by traversing 718 /// `sparse`, but permits constant time lookup. 719 /// 720 /// When this is zero (which is true for most states in the default 721 /// configuration), then this state has no dense representation. 722 /// 723 /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This is 724 /// typically much less than 256 (the maximum value). 725 dense: StateID, 726 /// A pointer to `NFA::matches` corresponding to the head of a linked list 727 /// containing all of the matches for this state. 728 /// 729 /// This is `StateID::ZERO` if and only if this state is not a match state. 730 matches: StateID, 731 /// The state that should be transitioned to if the current byte in the 732 /// haystack does not have a corresponding transition defined in this 733 /// state. 734 fail: StateID, 735 /// The depth of this state. Specifically, this is the distance from this 736 /// state to the starting state. (For the special sentinel states DEAD and 737 /// FAIL, their depth is always 0.) The depth of a starting state is 0. 738 /// 739 /// Note that depth is currently not used in this non-contiguous NFA. It 740 /// may in the future, but it is used in the contiguous NFA. 
Namely, it 741 /// permits an optimization where states near the starting state have their 742 /// transitions stored in a dense fashion, but all other states have their 743 /// transitions stored in a sparse fashion. (This non-contiguous NFA uses 744 /// a sparse representation for all states unconditionally.) In any case, 745 /// this is really the only convenient place to compute and store this 746 /// information, which we need when building the contiguous NFA. 747 depth: SmallIndex, 748 } 749 750 impl State { 751 /// Return true if and only if this state is a match state. is_match(&self) -> bool752 pub(crate) fn is_match(&self) -> bool { 753 self.matches != StateID::ZERO 754 } 755 756 /// Returns the failure transition for this state. fail(&self) -> StateID757 pub(crate) fn fail(&self) -> StateID { 758 self.fail 759 } 760 761 /// Returns the depth of this state. That is, the number of transitions 762 /// this state is from the start state of the NFA. depth(&self) -> SmallIndex763 pub(crate) fn depth(&self) -> SmallIndex { 764 self.depth 765 } 766 } 767 768 /// A single transition in a non-contiguous NFA. 769 #[derive(Clone, Copy, Default)] 770 #[repr(packed)] 771 pub(crate) struct Transition { 772 byte: u8, 773 next: StateID, 774 link: StateID, 775 } 776 777 impl Transition { 778 /// Return the byte for which this transition is defined. byte(&self) -> u8779 pub(crate) fn byte(&self) -> u8 { 780 self.byte 781 } 782 783 /// Return the ID of the state that this transition points to. next(&self) -> StateID784 pub(crate) fn next(&self) -> StateID { 785 self.next 786 } 787 788 /// Return the ID of the next transition. 
link(&self) -> StateID789 fn link(&self) -> StateID { 790 self.link 791 } 792 } 793 794 impl core::fmt::Debug for Transition { fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result795 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 796 write!( 797 f, 798 "Transition(byte: {:X?}, next: {:?}, link: {:?})", 799 self.byte, 800 self.next().as_usize(), 801 self.link().as_usize() 802 ) 803 } 804 } 805 806 /// A single match in a non-contiguous NFA. 807 #[derive(Clone, Copy, Default)] 808 struct Match { 809 pid: PatternID, 810 link: StateID, 811 } 812 813 impl Match { 814 /// Return the pattern ID for this match. pattern(&self) -> PatternID815 pub(crate) fn pattern(&self) -> PatternID { 816 self.pid 817 } 818 819 /// Return the ID of the next match. link(&self) -> StateID820 fn link(&self) -> StateID { 821 self.link 822 } 823 } 824 825 impl core::fmt::Debug for Match { fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result826 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 827 write!( 828 f, 829 "Match(pid: {:?}, link: {:?})", 830 self.pattern().as_usize(), 831 self.link().as_usize() 832 ) 833 } 834 } 835 836 /// A builder for configuring an Aho-Corasick noncontiguous NFA. 837 /// 838 /// This builder has a subset of the options available to a 839 /// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options, 840 /// their behavior is identical. 841 #[derive(Clone, Debug)] 842 pub struct Builder { 843 match_kind: MatchKind, 844 prefilter: bool, 845 ascii_case_insensitive: bool, 846 dense_depth: usize, 847 } 848 849 impl Default for Builder { default() -> Builder850 fn default() -> Builder { 851 Builder { 852 match_kind: MatchKind::default(), 853 prefilter: true, 854 ascii_case_insensitive: false, 855 dense_depth: 3, 856 } 857 } 858 } 859 860 impl Builder { 861 /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA. 
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an Aho-Corasick noncontiguous NFA from the given iterator of
    /// patterns.
    ///
    /// A builder may be reused to create more NFAs.
    pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        debug!("building non-contiguous NFA");
        let nfa = Compiler::new(self)?.compile(patterns)?;
        debug!(
            "non-contiguous NFA built, <states: {:?}, size: {:?}>",
            nfa.states.len(),
            nfa.memory_usage()
        );
        Ok(nfa)
    }

    /// Set the desired match semantics.
    ///
    /// See
    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
    /// for more documentation and examples.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.match_kind = kind;
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// See
    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
    /// for more documentation and examples.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.ascii_case_insensitive = yes;
        self
    }

    /// Set the limit on how many states use a dense representation for their
    /// transitions. Other states will generally use a sparse representation.
    ///
    /// See
    /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
    /// for more documentation and examples.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// See
    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
    /// for more documentation and examples.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.prefilter = yes;
        self
    }
}

/// A compiler uses a builder configuration and builds up the NFA formulation
/// of an Aho-Corasick automaton. This roughly corresponds to the standard
/// formulation described in textbooks, with some tweaks to support leftmost
/// searching.
#[derive(Debug)]
struct Compiler<'a> {
    builder: &'a Builder,
    prefilter: prefilter::Builder,
    nfa: NFA,
    byteset: ByteClassSet,
}

impl<'a> Compiler<'a> {
    /// Create a fresh compiler for the given configuration, with an empty NFA
    /// to be filled in by `compile`.
    fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> {
        let prefilter = prefilter::Builder::new(builder.match_kind)
            .ascii_case_insensitive(builder.ascii_case_insensitive);
        Ok(Compiler {
            builder,
            prefilter,
            nfa: NFA {
                match_kind: builder.match_kind,
                states: vec![],
                sparse: vec![],
                dense: vec![],
                matches: vec![],
                pattern_lens: vec![],
                prefilter: None,
                byte_classes: ByteClasses::singletons(),
                // min starts at MAX (and max at 0) so that the first pattern
                // added in 'build_trie' always updates both.
                min_pattern_len: usize::MAX,
                max_pattern_len: 0,
                special: Special::zero(),
            },
            byteset: ByteClassSet::empty(),
        })
    }

    /// Run every phase of construction, in order, and return the finished
    /// NFA. Consumes the compiler.
    fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        // Add dummy transition/match links, so that no valid link will point
        // to another link at index 0.
        self.nfa.sparse.push(Transition::default());
        self.nfa.matches.push(Match::default());
        // Add a dummy dense transition so that no states can have dense==0
        // represent a valid pointer to dense transitions. This permits
        // dense==0 to be a sentinel indicating "no dense transitions."
        self.nfa.dense.push(NFA::DEAD);
        // the dead state, only used for leftmost and fixed to id==0
        self.nfa.alloc_state(0)?;
        // the fail state, which is never entered and fixed to id==1
        self.nfa.alloc_state(0)?;
        // unanchored start state, initially fixed to id==2 but later shuffled
        // to appear after all non-start match states.
        self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?;
        // anchored start state, initially fixed to id==3 but later shuffled
        // to appear after unanchored start state.
        self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?;
        // Initialize the unanchored starting state in order to make it dense,
        // and thus make transition lookups on this state faster.
        self.init_unanchored_start_state()?;
        // Set all transitions on the DEAD state to point to itself. This way,
        // the DEAD state can never be escaped. It MUST be used as a sentinel
        // in any correct search.
        self.add_dead_state_loop()?;
        // Build the base trie from the given patterns.
        self.build_trie(patterns)?;
        self.nfa.states.shrink_to_fit();
        // Turn our set of bytes into equivalent classes. This NFA
        // implementation uses byte classes only for states that use a dense
        // representation of transitions. (And that's why this comes before
        // `self.densify()`, as the byte classes need to be set first.)
        self.nfa.byte_classes = self.byteset.byte_classes();
        // Add transitions (and maybe matches) to the anchored starting state.
        // The anchored starting state is used for anchored searches. The only
        // mechanical difference between it and the unanchored start state is
        // that missing transitions map to the DEAD state instead of the FAIL
        // state.
        self.set_anchored_start_state()?;
        // Rewrite transitions to the FAIL state on the unanchored start state
        // as self-transitions. This keeps the start state active at all times.
        self.add_unanchored_start_state_loop();
        // Make some (possibly zero) states use a dense representation for
        // transitions. It's important to do this right after the states
        // and non-failure transitions are solidified. That way, subsequent
        // accesses (particularly `fill_failure_transitions`) will benefit from
        // the faster transition lookup in densified states.
        self.densify()?;
        // The meat of the Aho-Corasick algorithm: compute and write failure
        // transitions. i.e., the state to move to when a transition isn't
        // defined in the current state. These are epsilon transitions and thus
        // make this formulation an NFA.
        self.fill_failure_transitions()?;
        // Handle a special case under leftmost semantics when at least one
        // of the patterns is the empty string.
        self.close_start_state_loop_for_leftmost();
        // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START,
        // NON-MATCH, ... This permits us to very quickly query the type of
        // the state we're currently in during a search.
        self.shuffle();
        self.nfa.prefilter = self.prefilter.build();
        // Store the maximum ID of all *relevant* special states. Start states
        // are only relevant when we have a prefilter, otherwise, there is zero
        // reason to care about whether a state is a start state or not during
        // a search. Indeed, without a prefilter, we are careful to explicitly
        // NOT care about start states, otherwise the search can ping pong
        // between the unrolled loop and the handling of special-status states
        // and destroy perf.
        self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() {
            // Why the anchored starting state? Because we always put it
            // after the unanchored starting state and it is therefore the
            // maximum. Why put unanchored followed by anchored? No particular
            // reason, but that's how the states are logically organized in the
            // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯
            self.nfa.special.start_anchored_id
        } else {
            self.nfa.special.max_match_id
        };
        self.nfa.sparse.shrink_to_fit();
        self.nfa.dense.shrink_to_fit();
        self.nfa.matches.shrink_to_fit();
        self.nfa.pattern_lens.shrink_to_fit();
        Ok(self.nfa)
    }

    /// This sets up the initial prefix trie that makes up the Aho-Corasick
    /// automaton. Effectively, it creates the basic structure of the
    /// automaton, where every pattern given has a path from the start state to
    /// the end of the pattern.
    fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() {
            let pid = PatternID::new(i).map_err(|e| {
                BuildError::pattern_id_overflow(
                    PatternID::MAX.as_u64(),
                    e.attempted(),
                )
            })?;
            let pat = pat.as_ref();
            let patlen = SmallIndex::new(pat.len())
                .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?;
            self.nfa.min_pattern_len =
                core::cmp::min(self.nfa.min_pattern_len, pat.len());
            self.nfa.max_pattern_len =
                core::cmp::max(self.nfa.max_pattern_len, pat.len());
            assert_eq!(
                i,
                self.nfa.pattern_lens.len(),
                "expected number of patterns to match pattern ID"
            );
            self.nfa.pattern_lens.push(patlen);
            // We add the pattern to the prefilter here because the pattern
            // ID in the prefilter is determined with respect to the patterns
            // added to the prefilter. That is, it isn't the ID we have here,
            // but the one determined by its own accounting of patterns.
            // To ensure they line up, we add every pattern we see to the
            // prefilter, even if some patterns ultimately are impossible to
            // match (in leftmost-first semantics specifically).
            //
            // Another way of doing this would be to expose an API in the
            // prefilter to permit setting your own pattern IDs. Or to just use
            // our own map and go between them. But this case is sufficiently
            // rare that we don't bother and just make sure they're in sync.
            if self.builder.prefilter {
                self.prefilter.add(pat);
            }

            let mut prev = self.nfa.special.start_unanchored_id;
            let mut saw_match = false;
            for (depth, &b) in pat.iter().enumerate() {
                // When leftmost-first match semantics are requested, we
                // specifically stop adding patterns when a previously added
                // pattern is a prefix of it. We avoid adding it because
                // leftmost-first semantics imply that the pattern can never
                // match. This is not just an optimization to save space! It
                // is necessary for correctness. In fact, this is the only
                // difference in the automaton between the implementations for
                // leftmost-first and leftmost-longest.
                saw_match = saw_match || self.nfa.states[prev].is_match();
                if self.builder.match_kind.is_leftmost_first() && saw_match {
                    // Skip to the next pattern immediately. This avoids
                    // incorrectly adding a match after this loop terminates.
                    continue 'PATTERNS;
                }

                // Add this byte to our equivalence classes. These don't
                // get used while building the trie, but other Aho-Corasick
                // implementations may use them.
                self.byteset.set_range(b, b);
                if self.builder.ascii_case_insensitive {
                    let b = opposite_ascii_case(b);
                    self.byteset.set_range(b, b);
                }

                // If the transition from prev using the current byte already
                // exists, then just move through it. Otherwise, add a new
                // state. We track the depth here so that we can determine
                // how to represent transitions. States near the start state
                // use a dense representation that uses more memory but is
                // faster. Other states use a sparse representation that uses
                // less memory but is slower.
                let next = self.nfa.follow_transition(prev, b);
                if next != NFA::FAIL {
                    prev = next;
                } else {
                    let next = self.nfa.alloc_state(depth)?;
                    self.nfa.add_transition(prev, b, next)?;
                    if self.builder.ascii_case_insensitive {
                        let b = opposite_ascii_case(b);
                        self.nfa.add_transition(prev, b, next)?;
                    }
                    prev = next;
                }
            }
            // Once the pattern has been added, log the match in the final
            // state that it reached.
            self.nfa.add_match(prev, pid)?;
        }
        Ok(())
    }

    /// This routine creates failure transitions according to the standard
    /// textbook formulation of the Aho-Corasick algorithm, with a couple small
    /// tweaks to support "leftmost" semantics.
    ///
    /// Building failure transitions is the most interesting part of building
    /// the Aho-Corasick automaton, because they are what allow searches to
    /// be performed in linear time. Specifically, a failure transition is
    /// a single transition associated with each state that points back to
    /// the longest proper suffix of the pattern being searched. The failure
    /// transition is followed whenever there exists no transition on the
    /// current state for the current input byte. If there is no other proper
    /// suffix, then the failure transition points back to the starting state.
    ///
    /// For example, let's say we built an Aho-Corasick automaton with the
    /// following patterns: 'abcd' and 'cef'. The trie looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// At this point, it should be fairly straight-forward to see how this
    /// trie can be used in a simplistic way.
    /// At any given position in the
    /// text we're searching (called the "subject" string), all we need to do
    /// is follow the transitions in the trie by consuming one transition for
    /// each byte in the subject string. If we reach a match state, then we can
    /// report that location as a match.
    ///
    /// The trick comes when searching a subject string like 'abcef'. We'll
    /// initially follow the transition from S0 to S1 and wind up in S3 after
    /// observing the 'c' byte. At this point, the next byte is 'e' but state
    /// S3 has no transition for 'e', so the search fails. We then would need
    /// to restart the search at the next position in 'abcef', which
    /// corresponds to 'b'. The match would fail, but the next search starting
    /// at 'c' would finally succeed. The problem with this approach is that
    /// we wind up searching the subject string potentially many times. In
    /// effect, this makes the algorithm have worst case `O(n * m)` complexity,
    /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead
    /// like to achieve a `O(n + m)` worst case complexity.
    ///
    /// This is where failure transitions come in. Instead of dying at S3 in
    /// the first search, the automaton can instruct the search to move to
    /// another part of the automaton that corresponds to a suffix of what
    /// we've seen so far. Recall that we've seen 'abc' in the subject string,
    /// and the automaton does indeed have a non-empty suffix, 'c', that could
    /// potentially lead to another match. Thus, the actual Aho-Corasick
    /// automaton for our patterns in this case looks like this:
    ///
    /// ```ignore
    ///          a - S1 - b - S2 - c - S3 - d - S4*
    ///         /                 /
    ///        /  ----------------
    ///       /  /
    ///     S0 - c - S5 - e - S6 - f - S7*
    /// ```
    ///
    /// That is, we have a failure transition from S3 to S5, which is followed
    /// exactly in cases when we are in state S3 but see any byte other than
    /// 'd' (that is, we've "failed" to find a match in this portion of our
    /// trie). We know we can transition back to S5 because we've already seen
    /// a 'c' byte, so we don't need to re-scan it. We can then pick back up
    /// with the search starting at S5 and complete our match.
    ///
    /// Adding failure transitions to a trie is fairly simple, but subtle. The
    /// key issue is that you might have multiple failure transitions that you
    /// need to follow. For example, look at the trie for the patterns
    /// 'abcd', 'b', 'bcd' and 'cd':
    ///
    /// ```ignore
    ///           - a - S1 - b - S2* - c - S3 - d - S4*
    ///          /          /         /
    ///         /    -------   -------
    ///        /    /         /
    ///     S0 --- b - S5* - c - S6 - d - S7*
    ///        \         /
    ///         \  --------
    ///          \ /
    ///           - c - S8 - d - S9*
    /// ```
    ///
    /// The failure transitions for this trie are defined from S2 to S5,
    /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it
    /// corresponds to a match, since its failure transition to S5 is itself
    /// a match state.
    ///
    /// Perhaps the simplest way to think about adding these failure
    /// transitions is recursively. That is, if you know the failure
    /// transitions for every possible previous state that could be visited
    /// (e.g., when computing the failure transition for S3, you already know
    /// the failure transitions for S0, S1 and S2), then you can simply follow
    /// the failure transition of the previous state and check whether the
    /// incoming transition is defined after following the failure transition.
    ///
    /// For example, when determining the failure state for S3, by our
    /// assumptions, we already know that there is a failure transition from
    /// S2 (the previous state) to S5. So we follow that transition and check
    /// whether the transition connecting S2 to S3 is defined. Indeed, it is,
    /// as there is a transition from S5 to S6 for the byte 'c'. If no such
    /// transition existed, we could keep following the failure transitions
    /// until we reach the start state, which is the failure transition for
    /// every state that has no corresponding proper suffix.
    ///
    /// We don't actually use recursion to implement this, but instead, use a
    /// breadth first search of the automaton. Our base case is the start
    /// state, whose failure transition is just a transition to itself.
    ///
    /// When building a leftmost automaton, we proceed as above, but only
    /// include a subset of failure transitions. Namely, we omit any failure
    /// transitions that appear after a match state in the trie. This is
    /// because failure transitions always point back to a proper suffix of
    /// what has been seen so far. Thus, following a failure transition after
    /// a match implies looking for a match that starts after the one that has
    /// already been seen, which is of course therefore not the leftmost match.
    ///
    /// N.B. I came up with this algorithm on my own, and after scouring all of
    /// the other AC implementations I know of (Perl, Snort, many on GitHub),
    /// I couldn't find any that implement leftmost semantics like this.
    /// Perl of course needs leftmost-first semantics, but they implement it
    /// with a seeming hack at *search* time instead of encoding it into the
    /// automaton. There are also a couple Java libraries that support leftmost
    /// longest semantics, but they do it by building a queue of matches at
    /// search time, which is even worse than what Perl is doing. ---AG
    fn fill_failure_transitions(&mut self) -> Result<(), BuildError> {
        let is_leftmost = self.builder.match_kind.is_leftmost();
        let start_uid = self.nfa.special.start_unanchored_id;
        // Initialize the queue for breadth first search with all transitions
        // out of the start state. We handle the start state specially because
        // we only want to follow non-self transitions. If we followed self
        // transitions, then this would never terminate.
        let mut queue = VecDeque::new();
        let mut seen = self.queued_set();
        let mut prev_link = None;
        while let Some(link) = self.nfa.next_link(start_uid, prev_link) {
            prev_link = Some(link);
            let t = self.nfa.sparse[link];

            // Skip anything we've seen before and any self-transitions on the
            // start state.
            if start_uid == t.next() || seen.contains(t.next) {
                continue;
            }
            queue.push_back(t.next);
            seen.insert(t.next);
            // Under leftmost semantics, if a state immediately following
            // the start state is a match state, then we never want to
            // follow its failure transition since the failure transition
            // necessarily leads back to the start state, which we never
            // want to do for leftmost matching after a match has been
            // found.
            //
            // We apply the same logic to non-start states below as well.
            if is_leftmost && self.nfa.states[t.next].is_match() {
                self.nfa.states[t.next].fail = NFA::DEAD;
            }
        }
        while let Some(id) = queue.pop_front() {
            let mut prev_link = None;
            while let Some(link) = self.nfa.next_link(id, prev_link) {
                prev_link = Some(link);
                let t = self.nfa.sparse[link];

                if seen.contains(t.next) {
                    // The only way to visit a duplicate state in a transition
                    // list is when ASCII case insensitivity is enabled. In
                    // this case, we want to skip it since it's redundant work.
                    // But it would also end up duplicating matches, which
                    // results in reporting duplicate matches in some cases.
                    // See the 'acasei010' regression test.
                    continue;
                }
                queue.push_back(t.next);
                seen.insert(t.next);

                // As above for start states, under leftmost semantics, once
                // we see a match all subsequent states should have no failure
                // transitions because failure transitions always imply looking
                // for a match that is a suffix of what has been seen so far
                // (where "seen so far" corresponds to the string formed by
                // following the transitions from the start state to the
                // current state). Under leftmost semantics, we specifically do
                // not want to allow this to happen because we always want to
                // report the match found at the leftmost position.
                //
                // The difference between leftmost-first and leftmost-longest
                // occurs previously while we build the trie. For
                // leftmost-first, we simply omit any entries that would
                // otherwise require passing through a match state.
                //
                // Note that for correctness, the failure transition has to be
                // set to the dead state for ALL states following a match, not
                // just the match state itself. However, by setting the failure
                // transition to the dead state on all match states, the dead
                // state will automatically propagate to all subsequent states
                // via the failure state computation below.
                if is_leftmost && self.nfa.states[t.next].is_match() {
                    self.nfa.states[t.next].fail = NFA::DEAD;
                    continue;
                }
                let mut fail = self.nfa.states[id].fail;
                while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL {
                    fail = self.nfa.states[fail].fail;
                }
                fail = self.nfa.follow_transition(fail, t.byte);
                self.nfa.states[t.next].fail = fail;
                self.nfa.copy_matches(fail, t.next)?;
            }
            // If the start state is a match state, then this automaton can
            // match the empty string. This implies all states are match states
            // since every position matches the empty string, so copy the
            // matches from the start state to every state. Strictly speaking,
            // this is only necessary for overlapping matches since each
            // non-empty non-start match state needs to report empty matches
            // in addition to its own. For the non-overlapping case, such
            // states only report the first match, which is never empty since
            // it isn't a start state.
            if !is_leftmost {
                self.nfa
                    .copy_matches(self.nfa.special.start_unanchored_id, id)?;
            }
        }
        Ok(())
    }

    /// Shuffle the states so that they appear in this sequence:
    ///
    /// DEAD, FAIL, MATCH..., START, START, NON-MATCH...
    ///
    /// The idea here is that if we know how special states are laid out in our
    /// transition table, then we can determine what "kind" of state we're in
    /// just by comparing our current state ID with a particular value. In this
    /// way, we avoid doing extra memory lookups.
    ///
    /// Before shuffling begins, our states look something like this:
    ///
    /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)...
    ///
    /// So all we need to do is move all of the MATCH states so that they
    /// all appear before any NON-MATCH state, like so:
    ///
    /// DEAD, FAIL, START, START, MATCH... NON-MATCH...
    ///
    /// Then it's just a simple matter of swapping the two START states with
    /// the last two MATCH states.
    ///
    /// (This is the same technique used for fully compiled DFAs in
    /// regex-automata.)
    fn shuffle(&mut self) {
        let old_start_uid = self.nfa.special.start_unanchored_id;
        let old_start_aid = self.nfa.special.start_anchored_id;
        assert!(old_start_uid < old_start_aid);
        assert_eq!(
            3,
            old_start_aid.as_usize(),
            "anchored start state should be at index 3"
        );
        // We implement shuffling by a sequence of pairwise swaps of states.
        // Since we have a number of things referencing states via their
        // IDs and swapping them changes their IDs, we need to record every
        // swap we make so that we can remap IDs. The remapper handles this
        // book-keeping for us.
        let mut remapper = Remapper::new(&self.nfa, 0);
        // The way we proceed here is by moving all match states so that
        // they directly follow the start states. So it will go: DEAD, FAIL,
        // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ...
        //
        // To do that, we proceed forward through all states after
        // START-ANCHORED and swap match states so that they appear before all
        // non-match states.
        let mut next_avail = StateID::from(4u8);
        for i in next_avail.as_usize()..self.nfa.states.len() {
            let sid = StateID::new(i).unwrap();
            if !self.nfa.states[sid].is_match() {
                continue;
            }
            remapper.swap(&mut self.nfa, sid, next_avail);
            // The key invariant here is that only non-match states exist
            // between 'next_avail' and 'sid' (with them being potentially
            // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed
            // to land on the leftmost non-match state. (Unless 'next_avail'
            // and 'sid' are equivalent, in which case, a swap will occur but
            // it is a no-op.)
            next_avail = StateID::new(next_avail.one_more()).unwrap();
        }
        // Now we'd like to move the start states to immediately following the
        // match states. (The start states may themselves be match states, but
        // we'll handle that later.) We arrange the states this way so that we
        // don't necessarily need to check whether a state is a start state or
        // not before checking whether a state is a match state. For example,
        // we'd like to be able to write this as our state machine loop:
        //
        //   sid = start()
        //   for byte in haystack:
        //       sid = next(sid, byte)
        //       if sid <= nfa.max_start_id:
        //           if sid <= nfa.max_dead_id:
        //               # search complete
        //           elif sid <= nfa.max_match_id:
        //               # found match
        //
        // The important context here is that we might not want to look for
        // start states at all. Namely, if a searcher doesn't have a prefilter,
        // then there is no reason to care about whether we're in a start state
        // or not. And indeed, if we did check for it, this very hot loop would
        // ping pong between the special state handling and the main state
        // transition logic. This in turn stalls the CPU by killing branch
        // prediction.
        //
        // So essentially, we really want to be able to "forget" that start
        // states even exist and this is why we put them at the end.
        let new_start_aid =
            StateID::new(next_avail.as_usize().checked_sub(1).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_aid, new_start_aid);
        let new_start_uid =
            StateID::new(next_avail.as_usize().checked_sub(2).unwrap())
                .unwrap();
        remapper.swap(&mut self.nfa, old_start_uid, new_start_uid);
        let new_max_match_id =
            StateID::new(next_avail.as_usize().checked_sub(3).unwrap())
                .unwrap();
        self.nfa.special.max_match_id = new_max_match_id;
        self.nfa.special.start_unanchored_id = new_start_uid;
        self.nfa.special.start_anchored_id = new_start_aid;
        // If one start state is a match state, then they both are.
        if self.nfa.states[self.nfa.special.start_anchored_id].is_match() {
            self.nfa.special.max_match_id = self.nfa.special.start_anchored_id;
        }
        remapper.remap(&mut self.nfa);
    }

    /// Attempts to convert the transition representation of a subset of states
    /// in this NFA from sparse to dense. This can greatly improve search
    /// performance since states with a higher number of transitions tend to
    /// correlate with very active states.
    ///
    /// We generally only densify states that are close to the start state.
    /// These tend to be the most active states and thus benefit from a dense
    /// representation more than other states.
    ///
    /// This tends to best balance between memory usage and performance. In
    /// particular, the *vast majority* of all states in a typical Aho-Corasick
    /// automaton have only 1 transition and are usually farther from the start
    /// state and thus don't get densified.
    ///
    /// Note that this doesn't remove the sparse representation of transitions
    /// for states that are densified. It could be done, but actually removing
    /// entries from `NFA::sparse` is likely more expensive than it's worth.
densify(&mut self) -> Result<(), BuildError>1500 fn densify(&mut self) -> Result<(), BuildError> { 1501 for i in 0..self.nfa.states.len() { 1502 let sid = StateID::new(i).unwrap(); 1503 // Don't bother densifying states that are only used as sentinels. 1504 if sid == NFA::DEAD || sid == NFA::FAIL { 1505 continue; 1506 } 1507 // Only densify states that are "close enough" to the start state. 1508 if self.nfa.states[sid].depth.as_usize() 1509 >= self.builder.dense_depth 1510 { 1511 continue; 1512 } 1513 let dense = self.nfa.alloc_dense_state()?; 1514 let mut prev_link = None; 1515 while let Some(link) = self.nfa.next_link(sid, prev_link) { 1516 prev_link = Some(link); 1517 let t = self.nfa.sparse[link]; 1518 1519 let class = usize::from(self.nfa.byte_classes.get(t.byte)); 1520 let index = dense.as_usize() + class; 1521 self.nfa.dense[index] = t.next; 1522 } 1523 self.nfa.states[sid].dense = dense; 1524 } 1525 Ok(()) 1526 } 1527 1528 /// Returns a set that tracked queued states. 1529 /// 1530 /// This is only necessary when ASCII case insensitivity is enabled, since 1531 /// it is the only way to visit the same state twice. Otherwise, this 1532 /// returns an inert set that nevers adds anything and always reports 1533 /// `false` for every member test. queued_set(&self) -> QueuedSet1534 fn queued_set(&self) -> QueuedSet { 1535 if self.builder.ascii_case_insensitive { 1536 QueuedSet::active() 1537 } else { 1538 QueuedSet::inert() 1539 } 1540 } 1541 1542 /// Initializes the unanchored start state by making it dense. This is 1543 /// achieved by explicitly setting every transition to the FAIL state. 1544 /// This isn't necessary for correctness, since any missing transition is 1545 /// automatically assumed to be mapped to the FAIL state. We do this to 1546 /// make the unanchored starting state dense, and thus in turn make 1547 /// transition lookups on it faster. (Which is worth doing because it's 1548 /// the most active state.) 
init_unanchored_start_state(&mut self) -> Result<(), BuildError>1549 fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> { 1550 let start_uid = self.nfa.special.start_unanchored_id; 1551 let start_aid = self.nfa.special.start_anchored_id; 1552 self.nfa.init_full_state(start_uid, NFA::FAIL)?; 1553 self.nfa.init_full_state(start_aid, NFA::FAIL)?; 1554 Ok(()) 1555 } 1556 1557 /// Setup the anchored start state by copying all of the transitions and 1558 /// matches from the unanchored starting state with one change: the failure 1559 /// transition is changed to the DEAD state, so that for any undefined 1560 /// transitions, the search will stop. set_anchored_start_state(&mut self) -> Result<(), BuildError>1561 fn set_anchored_start_state(&mut self) -> Result<(), BuildError> { 1562 let start_uid = self.nfa.special.start_unanchored_id; 1563 let start_aid = self.nfa.special.start_anchored_id; 1564 let (mut uprev_link, mut aprev_link) = (None, None); 1565 loop { 1566 let unext = self.nfa.next_link(start_uid, uprev_link); 1567 let anext = self.nfa.next_link(start_aid, aprev_link); 1568 let (ulink, alink) = match (unext, anext) { 1569 (Some(ulink), Some(alink)) => (ulink, alink), 1570 (None, None) => break, 1571 _ => unreachable!(), 1572 }; 1573 uprev_link = Some(ulink); 1574 aprev_link = Some(alink); 1575 self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next; 1576 } 1577 self.nfa.copy_matches(start_uid, start_aid)?; 1578 // This is the main difference between the unanchored and anchored 1579 // starting states. If a lookup on an anchored starting state fails, 1580 // then the search should stop. 1581 // 1582 // N.B. This assumes that the loop on the unanchored starting state 1583 // hasn't been created yet. 1584 self.nfa.states[start_aid].fail = NFA::DEAD; 1585 Ok(()) 1586 } 1587 1588 /// Set the failure transitions on the start state to loop back to the 1589 /// start state. 
This effectively permits the Aho-Corasick automaton to 1590 /// match at any position. This is also required for finding the next 1591 /// state to terminate, namely, finding the next state should never return 1592 /// a fail_id. 1593 /// 1594 /// This must be done after building the initial trie, since trie 1595 /// construction depends on transitions to `fail_id` to determine whether a 1596 /// state already exists or not. add_unanchored_start_state_loop(&mut self)1597 fn add_unanchored_start_state_loop(&mut self) { 1598 let start_uid = self.nfa.special.start_unanchored_id; 1599 let mut prev_link = None; 1600 while let Some(link) = self.nfa.next_link(start_uid, prev_link) { 1601 prev_link = Some(link); 1602 if self.nfa.sparse[link].next() == NFA::FAIL { 1603 self.nfa.sparse[link].next = start_uid; 1604 } 1605 } 1606 } 1607 1608 /// Remove the start state loop by rewriting any transitions on the start 1609 /// state back to the start state with transitions to the dead state. 1610 /// 1611 /// The loop is only closed when two conditions are met: the start state 1612 /// is a match state and the match kind is leftmost-first or 1613 /// leftmost-longest. 1614 /// 1615 /// The reason for this is that under leftmost semantics, a start state 1616 /// that is also a match implies that we should never restart the search 1617 /// process. We allow normal transitions out of the start state, but if 1618 /// none exist, we transition to the dead state, which signals that 1619 /// searching should stop. 
close_start_state_loop_for_leftmost(&mut self)1620 fn close_start_state_loop_for_leftmost(&mut self) { 1621 let start_uid = self.nfa.special.start_unanchored_id; 1622 let start = &mut self.nfa.states[start_uid]; 1623 let dense = start.dense; 1624 if self.builder.match_kind.is_leftmost() && start.is_match() { 1625 let mut prev_link = None; 1626 while let Some(link) = self.nfa.next_link(start_uid, prev_link) { 1627 prev_link = Some(link); 1628 if self.nfa.sparse[link].next() == start_uid { 1629 self.nfa.sparse[link].next = NFA::DEAD; 1630 if dense != StateID::ZERO { 1631 let b = self.nfa.sparse[link].byte; 1632 let class = usize::from(self.nfa.byte_classes.get(b)); 1633 self.nfa.dense[dense.as_usize() + class] = NFA::DEAD; 1634 } 1635 } 1636 } 1637 } 1638 } 1639 1640 /// Sets all transitions on the dead state to point back to the dead state. 1641 /// Normally, missing transitions map back to the failure state, but the 1642 /// point of the dead state is to act as a sink that can never be escaped. add_dead_state_loop(&mut self) -> Result<(), BuildError>1643 fn add_dead_state_loop(&mut self) -> Result<(), BuildError> { 1644 self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?; 1645 Ok(()) 1646 } 1647 } 1648 1649 /// A set of state identifiers used to avoid revisiting the same state multiple 1650 /// times when filling in failure transitions. 1651 /// 1652 /// This set has an "inert" and an "active" mode. When inert, the set never 1653 /// stores anything and always returns `false` for every member test. This is 1654 /// useful to avoid the performance and memory overhead of maintaining this 1655 /// set when it is not needed. 1656 #[derive(Debug)] 1657 struct QueuedSet { 1658 set: Option<BTreeSet<StateID>>, 1659 } 1660 1661 impl QueuedSet { 1662 /// Return an inert set that returns `false` for every state ID membership 1663 /// test. 
    fn inert() -> QueuedSet {
        QueuedSet { set: None }
    }

    /// Return an active set that tracks state ID membership.
    fn active() -> QueuedSet {
        QueuedSet { set: Some(BTreeSet::new()) }
    }

    /// Inserts the given state ID into this set. (If the set is inert, then
    /// this is a no-op.)
    fn insert(&mut self, state_id: StateID) {
        if let Some(ref mut set) = self.set {
            set.insert(state_id);
        }
    }

    /// Returns true if and only if the given state ID is in this set. If the
    /// set is inert, this always returns false.
    fn contains(&self, state_id: StateID) -> bool {
        match self.set {
            None => false,
            Some(ref set) => set.contains(&state_id),
        }
    }
}

impl core::fmt::Debug for NFA {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use crate::{
            automaton::{fmt_state_indicator, sparse_transitions},
            util::debug::DebugByte,
        };

        writeln!(f, "noncontiguous::NFA(")?;
        for (sid, state) in self.states.iter().with_state_ids() {
            // The FAIL state doesn't actually have space for a state allocated
            // for it, so we have to treat it as a special case.
            if sid == NFA::FAIL {
                writeln!(f, "F {:06}:", sid.as_usize())?;
                continue;
            }
            // Every other state gets a marker, its ID and its failure
            // transition's ID.
            fmt_state_indicator(f, self, sid)?;
            write!(
                f,
                "{:06}({:06}): ",
                sid.as_usize(),
                state.fail.as_usize()
            )?;

            // Collapse runs of consecutive bytes mapping to the same next
            // state into `start-end => next` ranges for readability.
            let it = sparse_transitions(
                self.iter_trans(sid).map(|t| (t.byte, t.next)),
            )
            .enumerate();
            for (i, (start, end, sid)) in it {
                if i > 0 {
                    write!(f, ", ")?;
                }
                if start == end {
                    // Degenerate range: print a single byte.
                    write!(
                        f,
                        "{:?} => {:?}",
                        DebugByte(start),
                        sid.as_usize()
                    )?;
                } else {
                    write!(
                        f,
                        "{:?}-{:?} => {:?}",
                        DebugByte(start),
                        DebugByte(end),
                        sid.as_usize()
                    )?;
                }
            }

            write!(f, "\n")?;
            // Match states additionally list the pattern IDs they match.
            if self.is_match(sid) {
                write!(f, " matches: ")?;
                for (i, pid) in self.iter_matches(sid).enumerate() {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    write!(f, "{}", pid.as_usize())?;
                }
                write!(f, "\n")?;
            }
        }
        // Trailer: automaton-wide configuration and statistics.
        writeln!(f, "match kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}", self.states.len())?;
        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
        writeln!(f, ")")?;
        Ok(())
    }
}