1 /*! 2 An NFA backed Pike VM for executing regex searches with capturing groups. 3 4 This module provides a [`PikeVM`] that works by simulating an NFA and 5 resolving all spans of capturing groups that participate in a match. 6 */ 7 8 #[cfg(feature = "internal-instrument-pikevm")] 9 use core::cell::RefCell; 10 11 use alloc::{vec, vec::Vec}; 12 13 use crate::{ 14 nfa::thompson::{self, BuildError, State, NFA}, 15 util::{ 16 captures::Captures, 17 empty, iter, 18 prefilter::Prefilter, 19 primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, 20 search::{ 21 Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, 22 }, 23 sparse_set::SparseSet, 24 }, 25 }; 26 27 /// A simple macro for conditionally executing instrumentation logic when 28 /// the 'trace' log level is enabled. This is a compile-time no-op when the 29 /// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that 30 /// this makes it easier to avoid doing extra work when instrumentation isn't 31 /// enabled. 32 /// 33 /// This macro accepts a closure of type `|&mut Counters|`. The closure can 34 /// then increment counters (or whatever) in accordance with what one wants 35 /// to track. 36 macro_rules! instrument { 37 ($fun:expr) => { 38 #[cfg(feature = "internal-instrument-pikevm")] 39 { 40 let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; 41 COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut())); 42 } 43 }; 44 } 45 46 #[cfg(feature = "internal-instrument-pikevm")] 47 std::thread_local! { 48 /// Effectively global state used to keep track of instrumentation 49 /// counters. The "proper" way to do this is to thread it through the 50 /// PikeVM, but it makes the code quite icky. Since this is just a 51 /// debugging feature, we're content to relegate it to thread local 52 /// state. When instrumentation is enabled, the counters are reset at the 53 /// beginning of every search and printed (with the 'trace' log level) at 54 /// the end of every search. 55 static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty()); 56 } 57 58 /// The configuration used for building a [`PikeVM`]. 59 /// 60 /// A PikeVM configuration is a simple data object that is typically used with 61 /// [`Builder::configure`]. It can be cheaply cloned. 62 /// 63 /// A default configuration can be created either with `Config::new`, or 64 /// perhaps more conveniently, with [`PikeVM::config`]. 65 #[derive(Clone, Debug, Default)] 66 pub struct Config { 67 match_kind: Option<MatchKind>, 68 pre: Option<Option<Prefilter>>, 69 } 70 71 impl Config { 72 /// Return a new default PikeVM configuration. new() -> Config73 pub fn new() -> Config { 74 Config::default() 75 } 76 77 /// Set the desired match semantics. 78 /// 79 /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the 80 /// match semantics of Perl-like regex engines. That is, when multiple 81 /// patterns would match at the same leftmost position, the pattern that 82 /// appears first in the concrete syntax is chosen. 83 /// 84 /// Currently, the only other kind of match semantics supported is 85 /// [`MatchKind::All`]. This corresponds to "classical DFA" construction 86 /// where all possible matches are visited in the NFA by the `PikeVM`. 87 /// 88 /// Typically, `All` is used when one wants to execute an overlapping 89 /// search and `LeftmostFirst` otherwise. In particular, it rarely makes 90 /// sense to use `All` with the various "leftmost" find routines, since the 91 /// leftmost routines depend on the `LeftmostFirst` automata construction 92 /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM` 93 /// simulating dead states as a way to terminate the search and report a 94 /// match. `LeftmostFirst` also supports non-greedy matches using this 95 /// strategy where as `All` does not. match_kind(mut self, kind: MatchKind) -> Config96 pub fn match_kind(mut self, kind: MatchKind) -> Config { 97 self.match_kind = Some(kind); 98 self 99 } 100 101 /// Set a prefilter to be used whenever a start state is entered. 102 /// 103 /// A [`Prefilter`] in this context is meant to accelerate searches by 104 /// looking for literal prefixes that every match for the corresponding 105 /// pattern (or patterns) must start with. Once a prefilter produces a 106 /// match, the underlying search routine continues on to try and confirm 107 /// the match. 108 /// 109 /// Be warned that setting a prefilter does not guarantee that the search 110 /// will be faster. While it's usually a good bet, if the prefilter 111 /// produces a lot of false positive candidates (i.e., positions matched 112 /// by the prefilter but not by the regex), then the overall result can 113 /// be slower than if you had just executed the regex engine without any 114 /// prefilters. 115 /// 116 /// By default no prefilter is set. 117 /// 118 /// # Example 119 /// 120 /// ``` 121 /// use regex_automata::{ 122 /// nfa::thompson::pikevm::PikeVM, 123 /// util::prefilter::Prefilter, 124 /// Input, Match, MatchKind, 125 /// }; 126 /// 127 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); 128 /// let re = PikeVM::builder() 129 /// .configure(PikeVM::config().prefilter(pre)) 130 /// .build(r"(foo|bar)[a-z]+")?; 131 /// let mut cache = re.create_cache(); 132 /// let input = Input::new("foo1 barfox bar"); 133 /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); 134 /// 135 /// # Ok::<(), Box<dyn std::error::Error>>(()) 136 /// ``` 137 /// 138 /// Be warned though that an incorrect prefilter can lead to incorrect 139 /// results! 140 /// 141 /// ``` 142 /// use regex_automata::{ 143 /// nfa::thompson::pikevm::PikeVM, 144 /// util::prefilter::Prefilter, 145 /// Input, HalfMatch, MatchKind, 146 /// }; 147 /// 148 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); 149 /// let re = PikeVM::builder() 150 /// .configure(PikeVM::config().prefilter(pre)) 151 /// .build(r"(foo|bar)[a-z]+")?; 152 /// let mut cache = re.create_cache(); 153 /// let input = Input::new("foo1 barfox bar"); 154 /// // No match reported even though there clearly is one! 155 /// assert_eq!(None, re.find(&mut cache, input)); 156 /// 157 /// # Ok::<(), Box<dyn std::error::Error>>(()) 158 /// ``` prefilter(mut self, pre: Option<Prefilter>) -> Config159 pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { 160 self.pre = Some(pre); 161 self 162 } 163 164 /// Returns the match semantics set in this configuration. get_match_kind(&self) -> MatchKind165 pub fn get_match_kind(&self) -> MatchKind { 166 self.match_kind.unwrap_or(MatchKind::LeftmostFirst) 167 } 168 169 /// Returns the prefilter set in this configuration, if one at all. get_prefilter(&self) -> Option<&Prefilter>170 pub fn get_prefilter(&self) -> Option<&Prefilter> { 171 self.pre.as_ref().unwrap_or(&None).as_ref() 172 } 173 174 /// Overwrite the default configuration such that the options in `o` are 175 /// always used. If an option in `o` is not set, then the corresponding 176 /// option in `self` is used. If it's not set in `self` either, then it 177 /// remains not set. overwrite(&self, o: Config) -> Config178 pub(crate) fn overwrite(&self, o: Config) -> Config { 179 Config { 180 match_kind: o.match_kind.or(self.match_kind), 181 pre: o.pre.or_else(|| self.pre.clone()), 182 } 183 } 184 } 185 186 /// A builder for a `PikeVM`. 187 /// 188 /// This builder permits configuring options for the syntax of a pattern, 189 /// the NFA construction and the `PikeVM` construction. This builder is 190 /// different from a general purpose regex builder in that it permits fine 191 /// grain configuration of the construction process. The trade off for this is 192 /// complexity, and the possibility of setting a configuration that might not 193 /// make sense. For example, there are two different UTF-8 modes: 194 /// 195 /// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) 196 /// controls whether the pattern itself can contain sub-expressions that match 197 /// invalid UTF-8. 198 /// * [`thompson::Config::utf8`] controls whether empty matches that split a 199 /// Unicode codepoint are reported or not. 200 /// 201 /// Generally speaking, callers will want to either enable all of these or 202 /// disable all of these. 203 /// 204 /// # Example 205 /// 206 /// This example shows how to disable UTF-8 mode in the syntax and the regex 207 /// itself. This is generally what you want for matching on arbitrary bytes. 208 /// 209 /// ``` 210 /// use regex_automata::{ 211 /// nfa::thompson::{self, pikevm::PikeVM}, 212 /// util::syntax, 213 /// Match, 214 /// }; 215 /// 216 /// let re = PikeVM::builder() 217 /// .syntax(syntax::Config::new().utf8(false)) 218 /// .thompson(thompson::Config::new().utf8(false)) 219 /// .build(r"foo(?-u:[^b])ar.*")?; 220 /// let mut cache = re.create_cache(); 221 /// 222 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; 223 /// let expected = Some(Match::must(0, 1..9)); 224 /// let got = re.find_iter(&mut cache, haystack).next(); 225 /// assert_eq!(expected, got); 226 /// // Notice that `(?-u:[^b])` matches invalid UTF-8, 227 /// // but the subsequent `.*` does not! Disabling UTF-8 228 /// // on the syntax permits this. 229 /// // 230 /// // N.B. This example does not show the impact of 231 /// // disabling UTF-8 mode on a PikeVM Config, since that 232 /// // only impacts regexes that can produce matches of 233 /// // length 0. 234 /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); 235 /// 236 /// # Ok::<(), Box<dyn std::error::Error>>(()) 237 /// ``` 238 #[derive(Clone, Debug)] 239 pub struct Builder { 240 config: Config, 241 #[cfg(feature = "syntax")] 242 thompson: thompson::Compiler, 243 } 244 245 impl Builder { 246 /// Create a new PikeVM builder with its default configuration. new() -> Builder247 pub fn new() -> Builder { 248 Builder { 249 config: Config::default(), 250 #[cfg(feature = "syntax")] 251 thompson: thompson::Compiler::new(), 252 } 253 } 254 255 /// Build a `PikeVM` from the given pattern. 256 /// 257 /// If there was a problem parsing or compiling the pattern, then an error 258 /// is returned. 259 #[cfg(feature = "syntax")] build(&self, pattern: &str) -> Result<PikeVM, BuildError>260 pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> { 261 self.build_many(&[pattern]) 262 } 263 264 /// Build a `PikeVM` from the given patterns. 265 #[cfg(feature = "syntax")] build_many<P: AsRef<str>>( &self, patterns: &[P], ) -> Result<PikeVM, BuildError>266 pub fn build_many<P: AsRef<str>>( 267 &self, 268 patterns: &[P], 269 ) -> Result<PikeVM, BuildError> { 270 let nfa = self.thompson.build_many(patterns)?; 271 self.build_from_nfa(nfa) 272 } 273 274 /// Build a `PikeVM` directly from its NFA. 275 /// 276 /// Note that when using this method, any configuration that applies to the 277 /// construction of the NFA itself will of course be ignored, since the NFA 278 /// given here is already built. build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError>279 pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { 280 nfa.look_set_any().available().map_err(BuildError::word)?; 281 Ok(PikeVM { config: self.config.clone(), nfa }) 282 } 283 284 /// Apply the given `PikeVM` configuration options to this builder. configure(&mut self, config: Config) -> &mut Builder285 pub fn configure(&mut self, config: Config) -> &mut Builder { 286 self.config = self.config.overwrite(config); 287 self 288 } 289 290 /// Set the syntax configuration for this builder using 291 /// [`syntax::Config`](crate::util::syntax::Config). 292 /// 293 /// This permits setting things like case insensitivity, Unicode and multi 294 /// line mode. 295 /// 296 /// These settings only apply when constructing a PikeVM directly from a 297 /// pattern. 298 #[cfg(feature = "syntax")] syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder299 pub fn syntax( 300 &mut self, 301 config: crate::util::syntax::Config, 302 ) -> &mut Builder { 303 self.thompson.syntax(config); 304 self 305 } 306 307 /// Set the Thompson NFA configuration for this builder using 308 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). 309 /// 310 /// This permits setting things like if additional time should be spent 311 /// shrinking the size of the NFA. 312 /// 313 /// These settings only apply when constructing a PikeVM directly from a 314 /// pattern. 315 #[cfg(feature = "syntax")] thompson(&mut self, config: thompson::Config) -> &mut Builder316 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { 317 self.thompson.configure(config); 318 self 319 } 320 } 321 322 /// A virtual machine for executing regex searches with capturing groups. 323 /// 324 /// # Infallible APIs 325 /// 326 /// Unlike most other regex engines in this crate, a `PikeVM` never returns an 327 /// error at search time. It supports all [`Anchored`] configurations, never 328 /// quits and works on haystacks of arbitrary length. 329 /// 330 /// There are two caveats to mention though: 331 /// 332 /// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], 333 /// then the PikeVM will report "no match." This is consistent with all other 334 /// regex engines in this crate. 335 /// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`] 336 /// that has insufficient capacity to store all valid pattern IDs, then if a 337 /// match occurs for a `PatternID` that cannot be inserted, it is silently 338 /// dropped as if it did not match. 339 /// 340 /// # Advice 341 /// 342 /// The `PikeVM` is generally the most "powerful" regex engine in this crate. 343 /// "Powerful" in this context means that it can handle any regular expression 344 /// that is parseable by `regex-syntax` and any size haystack. Regretably, 345 /// the `PikeVM` is also simultaneously often the _slowest_ regex engine in 346 /// practice. This results in an annoying situation where one generally tries 347 /// to pick any other regex engine (or perhaps none at all) before being 348 /// forced to fall back to a `PikeVM`. 349 /// 350 /// For example, a common strategy for dealing with capturing groups is to 351 /// actually look for the overall match of the regex using a faster regex 352 /// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall 353 /// match is found, one can then run the `PikeVM` on just the match span to 354 /// find the spans of the capturing groups. In this way, the faster regex 355 /// engine does the majority of the work, while the `PikeVM` only lends its 356 /// power in a more limited role. 357 /// 358 /// Unfortunately, this isn't always possible because the faster regex engines 359 /// don't support all of the regex features in `regex-syntax`. This notably 360 /// includes (and is currently limited to) Unicode word boundaries. So if 361 /// your pattern has Unicode word boundaries, you typically can't use a 362 /// DFA-based regex engine at all (unless you [enable heuristic support for 363 /// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass 364 /// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for 365 /// anchored searches only, but in a cruel sort of joke, many Unicode features 366 /// tend to result in making the regex _not_ one-pass.) 367 /// 368 /// # Example 369 /// 370 /// This example shows that the `PikeVM` implements Unicode word boundaries 371 /// correctly by default. 372 /// 373 /// ``` 374 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 375 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 376 /// 377 /// let re = PikeVM::new(r"\b\w+\b")?; 378 /// let mut cache = re.create_cache(); 379 /// 380 /// let mut it = re.find_iter(&mut cache, "Шерлок Холмс"); 381 /// assert_eq!(Some(Match::must(0, 0..12)), it.next()); 382 /// assert_eq!(Some(Match::must(0, 13..23)), it.next()); 383 /// assert_eq!(None, it.next()); 384 /// # Ok::<(), Box<dyn std::error::Error>>(()) 385 /// ``` 386 #[derive(Clone, Debug)] 387 pub struct PikeVM { 388 config: Config, 389 nfa: NFA, 390 } 391 392 impl PikeVM { 393 /// Parse the given regular expression using the default configuration and 394 /// return the corresponding `PikeVM`. 395 /// 396 /// If you want a non-default configuration, then use the [`Builder`] to 397 /// set your own configuration. 398 /// 399 /// # Example 400 /// 401 /// ``` 402 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 403 /// 404 /// let re = PikeVM::new("foo[0-9]+bar")?; 405 /// let mut cache = re.create_cache(); 406 /// assert_eq!( 407 /// Some(Match::must(0, 3..14)), 408 /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(), 409 /// ); 410 /// # Ok::<(), Box<dyn std::error::Error>>(()) 411 /// ``` 412 #[cfg(feature = "syntax")] new(pattern: &str) -> Result<PikeVM, BuildError>413 pub fn new(pattern: &str) -> Result<PikeVM, BuildError> { 414 PikeVM::builder().build(pattern) 415 } 416 417 /// Like `new`, but parses multiple patterns into a single "multi regex." 418 /// This similarly uses the default regex configuration. 419 /// 420 /// # Example 421 /// 422 /// ``` 423 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 424 /// 425 /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?; 426 /// let mut cache = re.create_cache(); 427 /// 428 /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); 429 /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); 430 /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); 431 /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); 432 /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); 433 /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); 434 /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); 435 /// assert_eq!(None, it.next()); 436 /// # Ok::<(), Box<dyn std::error::Error>>(()) 437 /// ``` 438 #[cfg(feature = "syntax")] new_many<P: AsRef<str>>( patterns: &[P], ) -> Result<PikeVM, BuildError>439 pub fn new_many<P: AsRef<str>>( 440 patterns: &[P], 441 ) -> Result<PikeVM, BuildError> { 442 PikeVM::builder().build_many(patterns) 443 } 444 445 /// Like `new`, but builds a PikeVM directly from an NFA. This is useful 446 /// if you already have an NFA, or even if you hand-assembled the NFA. 447 /// 448 /// # Example 449 /// 450 /// This shows how to hand assemble a regular expression via its HIR, 451 /// compile an NFA from it and build a PikeVM from the NFA. 452 /// 453 /// ``` 454 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; 455 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; 456 /// 457 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ 458 /// ClassBytesRange::new(b'0', b'9'), 459 /// ClassBytesRange::new(b'A', b'Z'), 460 /// ClassBytesRange::new(b'_', b'_'), 461 /// ClassBytesRange::new(b'a', b'z'), 462 /// ]))); 463 /// 464 /// let config = NFA::config().nfa_size_limit(Some(1_000)); 465 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; 466 /// 467 /// let re = PikeVM::new_from_nfa(nfa)?; 468 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 469 /// let expected = Some(Match::must(0, 3..4)); 470 /// re.captures(&mut cache, "!@#A#@!", &mut caps); 471 /// assert_eq!(expected, caps.get_match()); 472 /// 473 /// # Ok::<(), Box<dyn std::error::Error>>(()) 474 /// ``` new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError>475 pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> { 476 PikeVM::builder().build_from_nfa(nfa) 477 } 478 479 /// Create a new `PikeVM` that matches every input. 480 /// 481 /// # Example 482 /// 483 /// ``` 484 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 485 /// 486 /// let re = PikeVM::always_match()?; 487 /// let mut cache = re.create_cache(); 488 /// 489 /// let expected = Match::must(0, 0..0); 490 /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next()); 491 /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next()); 492 /// # Ok::<(), Box<dyn std::error::Error>>(()) 493 /// ``` always_match() -> Result<PikeVM, BuildError>494 pub fn always_match() -> Result<PikeVM, BuildError> { 495 let nfa = thompson::NFA::always_match(); 496 PikeVM::new_from_nfa(nfa) 497 } 498 499 /// Create a new `PikeVM` that never matches any input. 500 /// 501 /// # Example 502 /// 503 /// ``` 504 /// use regex_automata::nfa::thompson::pikevm::PikeVM; 505 /// 506 /// let re = PikeVM::never_match()?; 507 /// let mut cache = re.create_cache(); 508 /// 509 /// assert_eq!(None, re.find_iter(&mut cache, "").next()); 510 /// assert_eq!(None, re.find_iter(&mut cache, "foo").next()); 511 /// # Ok::<(), Box<dyn std::error::Error>>(()) 512 /// ``` never_match() -> Result<PikeVM, BuildError>513 pub fn never_match() -> Result<PikeVM, BuildError> { 514 let nfa = thompson::NFA::never_match(); 515 PikeVM::new_from_nfa(nfa) 516 } 517 518 /// Return a default configuration for a `PikeVM`. 519 /// 520 /// This is a convenience routine to avoid needing to import the `Config` 521 /// type when customizing the construction of a `PikeVM`. 522 /// 523 /// # Example 524 /// 525 /// This example shows how to disable UTF-8 mode. When UTF-8 mode is 526 /// disabled, zero-width matches that split a codepoint are allowed. 527 /// Otherwise they are never reported. 528 /// 529 /// In the code below, notice that `""` is permitted to match positions 530 /// that split the encoding of a codepoint. 531 /// 532 /// ``` 533 /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match}; 534 /// 535 /// let re = PikeVM::builder() 536 /// .thompson(thompson::Config::new().utf8(false)) 537 /// .build(r"")?; 538 /// let mut cache = re.create_cache(); 539 /// 540 /// let haystack = "a☃z"; 541 /// let mut it = re.find_iter(&mut cache, haystack); 542 /// assert_eq!(Some(Match::must(0, 0..0)), it.next()); 543 /// assert_eq!(Some(Match::must(0, 1..1)), it.next()); 544 /// assert_eq!(Some(Match::must(0, 2..2)), it.next()); 545 /// assert_eq!(Some(Match::must(0, 3..3)), it.next()); 546 /// assert_eq!(Some(Match::must(0, 4..4)), it.next()); 547 /// assert_eq!(Some(Match::must(0, 5..5)), it.next()); 548 /// assert_eq!(None, it.next()); 549 /// 550 /// # Ok::<(), Box<dyn std::error::Error>>(()) 551 /// ``` config() -> Config552 pub fn config() -> Config { 553 Config::new() 554 } 555 556 /// Return a builder for configuring the construction of a `PikeVM`. 557 /// 558 /// This is a convenience routine to avoid needing to import the 559 /// [`Builder`] type in common cases. 560 /// 561 /// # Example 562 /// 563 /// This example shows how to use the builder to disable UTF-8 mode 564 /// everywhere. 565 /// 566 /// ``` 567 /// use regex_automata::{ 568 /// nfa::thompson::{self, pikevm::PikeVM}, 569 /// util::syntax, 570 /// Match, 571 /// }; 572 /// 573 /// let re = PikeVM::builder() 574 /// .syntax(syntax::Config::new().utf8(false)) 575 /// .thompson(thompson::Config::new().utf8(false)) 576 /// .build(r"foo(?-u:[^b])ar.*")?; 577 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 578 /// 579 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; 580 /// let expected = Some(Match::must(0, 1..9)); 581 /// re.captures(&mut cache, haystack, &mut caps); 582 /// assert_eq!(expected, caps.get_match()); 583 /// 584 /// # Ok::<(), Box<dyn std::error::Error>>(()) 585 /// ``` builder() -> Builder586 pub fn builder() -> Builder { 587 Builder::new() 588 } 589 590 /// Create a new empty set of capturing groups that is guaranteed to be 591 /// valid for the search APIs on this `PikeVM`. 592 /// 593 /// A `Captures` value created for a specific `PikeVM` cannot be used with 594 /// any other `PikeVM`. 595 /// 596 /// This is a convenience function for [`Captures::all`]. See the 597 /// [`Captures`] documentation for an explanation of its alternative 598 /// constructors that permit the `PikeVM` to do less work during a search, 599 /// and thus might make it faster. create_captures(&self) -> Captures600 pub fn create_captures(&self) -> Captures { 601 Captures::all(self.get_nfa().group_info().clone()) 602 } 603 604 /// Create a new cache for this `PikeVM`. 605 /// 606 /// The cache returned should only be used for searches for this 607 /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then 608 /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently, 609 /// [`PikeVM::reset_cache`]). create_cache(&self) -> Cache610 pub fn create_cache(&self) -> Cache { 611 Cache::new(self) 612 } 613 614 /// Reset the given cache such that it can be used for searching with the 615 /// this `PikeVM` (and only this `PikeVM`). 616 /// 617 /// A cache reset permits reusing memory already allocated in this cache 618 /// with a different `PikeVM`. 619 /// 620 /// # Example 621 /// 622 /// This shows how to re-purpose a cache for use with a different `PikeVM`. 623 /// 624 /// ``` 625 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 626 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 627 /// 628 /// let re1 = PikeVM::new(r"\w")?; 629 /// let re2 = PikeVM::new(r"\W")?; 630 /// 631 /// let mut cache = re1.create_cache(); 632 /// assert_eq!( 633 /// Some(Match::must(0, 0..2)), 634 /// re1.find_iter(&mut cache, "Δ").next(), 635 /// ); 636 /// 637 /// // Using 'cache' with re2 is not allowed. It may result in panics or 638 /// // incorrect results. In order to re-purpose the cache, we must reset 639 /// // it with the PikeVM we'd like to use it with. 640 /// // 641 /// // Similarly, after this reset, using the cache with 're1' is also not 642 /// // allowed. 643 /// re2.reset_cache(&mut cache); 644 /// assert_eq!( 645 /// Some(Match::must(0, 0..3)), 646 /// re2.find_iter(&mut cache, "☃").next(), 647 /// ); 648 /// 649 /// # Ok::<(), Box<dyn std::error::Error>>(()) 650 /// ``` reset_cache(&self, cache: &mut Cache)651 pub fn reset_cache(&self, cache: &mut Cache) { 652 cache.reset(self); 653 } 654 655 /// Returns the total number of patterns compiled into this `PikeVM`. 656 /// 657 /// In the case of a `PikeVM` that contains no patterns, this returns `0`. 658 /// 659 /// # Example 660 /// 661 /// This example shows the pattern length for a `PikeVM` that never 662 /// matches: 663 /// 664 /// ``` 665 /// use regex_automata::nfa::thompson::pikevm::PikeVM; 666 /// 667 /// let re = PikeVM::never_match()?; 668 /// assert_eq!(re.pattern_len(), 0); 669 /// # Ok::<(), Box<dyn std::error::Error>>(()) 670 /// ``` 671 /// 672 /// And another example for a `PikeVM` that matches at every position: 673 /// 674 /// ``` 675 /// use regex_automata::nfa::thompson::pikevm::PikeVM; 676 /// 677 /// let re = PikeVM::always_match()?; 678 /// assert_eq!(re.pattern_len(), 1); 679 /// # Ok::<(), Box<dyn std::error::Error>>(()) 680 /// ``` 681 /// 682 /// And finally, a `PikeVM` that was constructed from multiple patterns: 683 /// 684 /// ``` 685 /// use regex_automata::nfa::thompson::pikevm::PikeVM; 686 /// 687 /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; 688 /// assert_eq!(re.pattern_len(), 3); 689 /// # Ok::<(), Box<dyn std::error::Error>>(()) 690 /// ``` pattern_len(&self) -> usize691 pub fn pattern_len(&self) -> usize { 692 self.nfa.pattern_len() 693 } 694 695 /// Return the config for this `PikeVM`. 696 #[inline] get_config(&self) -> &Config697 pub fn get_config(&self) -> &Config { 698 &self.config 699 } 700 701 /// Returns a reference to the underlying NFA. 702 #[inline] get_nfa(&self) -> &NFA703 pub fn get_nfa(&self) -> &NFA { 704 &self.nfa 705 } 706 } 707 708 impl PikeVM { 709 /// Returns true if and only if this `PikeVM` matches the given haystack. 710 /// 711 /// This routine may short circuit if it knows that scanning future 712 /// input will never lead to a different result. In particular, if the 713 /// underlying NFA enters a match state, then this routine will return 714 /// `true` immediately without inspecting any future input. (Consider how 715 /// this might make a difference given the regex `a+` on the haystack 716 /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, 717 /// but routines like `find` need to continue searching because `+` is 718 /// greedy by default.) 719 /// 720 /// # Example 721 /// 722 /// This shows basic usage: 723 /// 724 /// ``` 725 /// use regex_automata::nfa::thompson::pikevm::PikeVM; 726 /// 727 /// let re = PikeVM::new("foo[0-9]+bar")?; 728 /// let mut cache = re.create_cache(); 729 /// 730 /// assert!(re.is_match(&mut cache, "foo12345bar")); 731 /// assert!(!re.is_match(&mut cache, "foobar")); 732 /// # Ok::<(), Box<dyn std::error::Error>>(()) 733 /// ``` 734 /// 735 /// # Example: consistency with search APIs 736 /// 737 /// `is_match` is guaranteed to return `true` whenever `find` returns a 738 /// match. This includes searches that are executed entirely within a 739 /// codepoint: 740 /// 741 /// ``` 742 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; 743 /// 744 /// let re = PikeVM::new("a*")?; 745 /// let mut cache = re.create_cache(); 746 /// 747 /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); 748 /// # Ok::<(), Box<dyn std::error::Error>>(()) 749 /// ``` 750 /// 751 /// Notice that when UTF-8 mode is disabled, then the above reports a 752 /// match because the restriction against zero-width matches that split a 753 /// codepoint has been lifted: 754 /// 755 /// ``` 756 /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; 757 /// 758 /// let re = PikeVM::builder() 759 /// .thompson(NFA::config().utf8(false)) 760 /// .build("a*")?; 761 /// let mut cache = re.create_cache(); 762 /// 763 /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); 764 /// # Ok::<(), Box<dyn std::error::Error>>(()) 765 /// ``` 766 #[inline] is_match<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, ) -> bool767 pub fn is_match<'h, I: Into<Input<'h>>>( 768 &self, 769 cache: &mut Cache, 770 input: I, 771 ) -> bool { 772 let input = input.into().earliest(true); 773 self.search_slots(cache, &input, &mut []).is_some() 774 } 775 776 /// Executes a leftmost forward search and returns a `Match` if one exists. 777 /// 778 /// This routine only includes the overall match span. To get access to the 779 /// individual spans of each capturing group, use [`PikeVM::captures`]. 780 /// 781 /// # Example 782 /// 783 /// Leftmost first match semantics corresponds to the match with the 784 /// smallest starting offset, but where the end offset is determined by 785 /// preferring earlier branches in the original regular expression. For 786 /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` 787 /// will match `Samwise` in `Samwise`. 788 /// 789 /// Generally speaking, the "leftmost first" match is how most backtracking 790 /// regular expressions tend to work. This is in contrast to POSIX-style 791 /// regular expressions that yield "leftmost longest" matches. Namely, 792 /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using 793 /// leftmost longest semantics. (This crate does not currently support 794 /// leftmost longest semantics.) 795 /// 796 /// ``` 797 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 798 /// 799 /// let re = PikeVM::new("foo[0-9]+")?; 800 /// let mut cache = re.create_cache(); 801 /// let expected = Match::must(0, 0..8); 802 /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); 803 /// 804 /// // Even though a match is found after reading the first byte (`a`), 805 /// // the leftmost first match semantics demand that we find the earliest 806 /// // match that prefers earlier parts of the pattern over later parts. 807 /// let re = PikeVM::new("abc|a")?; 808 /// let mut cache = re.create_cache(); 809 /// let expected = Match::must(0, 0..3); 810 /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); 811 /// 812 /// # Ok::<(), Box<dyn std::error::Error>>(()) 813 /// ``` 814 #[inline] find<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, ) -> Option<Match>815 pub fn find<'h, I: Into<Input<'h>>>( 816 &self, 817 cache: &mut Cache, 818 input: I, 819 ) -> Option<Match> { 820 let input = input.into(); 821 if self.get_nfa().pattern_len() == 1 { 822 let mut slots = [None, None]; 823 let pid = self.search_slots(cache, &input, &mut slots)?; 824 let start = slots[0]?.get(); 825 let end = slots[1]?.get(); 826 return Some(Match::new(pid, Span { start, end })); 827 } 828 let ginfo = self.get_nfa().group_info(); 829 let slots_len = ginfo.implicit_slot_len(); 830 let mut slots = vec![None; slots_len]; 831 let pid = self.search_slots(cache, &input, &mut slots)?; 832 let start = slots[pid.as_usize() * 2]?.get(); 833 let end = slots[pid.as_usize() * 2 + 1]?.get(); 834 Some(Match::new(pid, Span { start, end })) 835 } 836 837 /// Executes a leftmost forward search and writes the spans of capturing 838 /// groups that participated in a match into the provided [`Captures`] 839 /// value. If no match was found, then [`Captures::is_match`] is guaranteed 840 /// to return `false`. 841 /// 842 /// # Example 843 /// 844 /// ``` 845 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; 846 /// 847 /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; 848 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 849 /// 850 /// re.captures(&mut cache, "2010-03-14", &mut caps); 851 /// assert!(caps.is_match()); 852 /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); 853 /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); 854 /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); 855 /// 856 /// # Ok::<(), Box<dyn std::error::Error>>(()) 857 /// ``` 858 #[inline] captures<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, caps: &mut Captures, )859 pub fn captures<'h, I: Into<Input<'h>>>( 860 &self, 861 cache: &mut Cache, 862 input: I, 863 caps: &mut Captures, 864 ) { 865 self.search(cache, &input.into(), caps) 866 } 867 868 /// Returns an iterator over all non-overlapping leftmost matches in the 869 /// given bytes. If no match exists, then the iterator yields no elements. 870 /// 871 /// # Example 872 /// 873 /// ``` 874 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 875 /// 876 /// let re = PikeVM::new("foo[0-9]+")?; 877 /// let mut cache = re.create_cache(); 878 /// 879 /// let text = "foo1 foo12 foo123"; 880 /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); 881 /// assert_eq!(matches, vec![ 882 /// Match::must(0, 0..4), 883 /// Match::must(0, 5..10), 884 /// Match::must(0, 11..17), 885 /// ]); 886 /// # Ok::<(), Box<dyn std::error::Error>>(()) 887 /// ``` 888 #[inline] find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h>889 pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( 890 &'r self, 891 cache: &'c mut Cache, 892 input: I, 893 ) -> FindMatches<'r, 'c, 'h> { 894 let caps = Captures::matches(self.get_nfa().group_info().clone()); 895 let it = iter::Searcher::new(input.into()); 896 FindMatches { re: self, cache, caps, it } 897 } 898 899 /// Returns an iterator over all non-overlapping `Captures` values. If no 900 /// match exists, then the iterator yields no elements. 901 /// 902 /// This yields the same matches as [`PikeVM::find_iter`], but it includes 903 /// the spans of all capturing groups that participate in each match. 904 /// 905 /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for 906 /// how to correctly iterate over all matches in a haystack while avoiding 907 /// the creation of a new `Captures` value for every match. (Which you are 908 /// forced to do with an `Iterator`.) 909 /// 910 /// # Example 911 /// 912 /// ``` 913 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; 914 /// 915 /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; 916 /// let mut cache = re.create_cache(); 917 /// 918 /// let text = "foo1 foo12 foo123"; 919 /// let matches: Vec<Span> = re 920 /// .captures_iter(&mut cache, text) 921 /// // The unwrap is OK since 'numbers' matches if the pattern matches. 922 /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) 923 /// .collect(); 924 /// assert_eq!(matches, vec![ 925 /// Span::from(3..4), 926 /// Span::from(8..10), 927 /// Span::from(14..17), 928 /// ]); 929 /// # Ok::<(), Box<dyn std::error::Error>>(()) 930 /// ``` 931 #[inline] captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, input: I, ) -> CapturesMatches<'r, 'c, 'h>932 pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( 933 &'r self, 934 cache: &'c mut Cache, 935 input: I, 936 ) -> CapturesMatches<'r, 'c, 'h> { 937 let caps = self.create_captures(); 938 let it = iter::Searcher::new(input.into()); 939 CapturesMatches { re: self, cache, caps, it } 940 } 941 } 942 943 impl PikeVM { 944 /// Executes a leftmost forward search and writes the spans of capturing 945 /// groups that participated in a match into the provided [`Captures`] 946 /// value. If no match was found, then [`Captures::is_match`] is guaranteed 947 /// to return `false`. 948 /// 949 /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` 950 /// instead of an `Into<Input>`. 951 /// 952 /// # Example: specific pattern search 953 /// 954 /// This example shows how to build a multi-PikeVM that permits searching 955 /// for specific patterns. 956 /// 957 /// ``` 958 /// use regex_automata::{ 959 /// nfa::thompson::pikevm::PikeVM, 960 /// Anchored, Match, PatternID, Input, 961 /// }; 962 /// 963 /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; 964 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 965 /// let haystack = "foo123"; 966 /// 967 /// // Since we are using the default leftmost-first match and both 968 /// // patterns match at the same starting position, only the first pattern 969 /// // will be returned in this case when doing a search for any of the 970 /// // patterns. 971 /// let expected = Some(Match::must(0, 0..6)); 972 /// re.search(&mut cache, &Input::new(haystack), &mut caps); 973 /// assert_eq!(expected, caps.get_match()); 974 /// 975 /// // But if we want to check whether some other pattern matches, then we 976 /// // can provide its pattern ID. 977 /// let expected = Some(Match::must(1, 0..6)); 978 /// let input = Input::new(haystack) 979 /// .anchored(Anchored::Pattern(PatternID::must(1))); 980 /// re.search(&mut cache, &input, &mut caps); 981 /// assert_eq!(expected, caps.get_match()); 982 /// 983 /// # Ok::<(), Box<dyn std::error::Error>>(()) 984 /// ``` 985 /// 986 /// # Example: specifying the bounds of a search 987 /// 988 /// This example shows how providing the bounds of a search can produce 989 /// different results than simply sub-slicing the haystack. 990 /// 991 /// ``` 992 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 993 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; 994 /// 995 /// let re = PikeVM::new(r"\b[0-9]{3}\b")?; 996 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 997 /// let haystack = "foo123bar"; 998 /// 999 /// // Since we sub-slice the haystack, the search doesn't know about 1000 /// // the larger context and assumes that `123` is surrounded by word 1001 /// // boundaries. And of course, the match position is reported relative 1002 /// // to the sub-slice as well, which means we get `0..3` instead of 1003 /// // `3..6`. 1004 /// let expected = Some(Match::must(0, 0..3)); 1005 /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); 1006 /// assert_eq!(expected, caps.get_match()); 1007 /// 1008 /// // But if we provide the bounds of the search within the context of the 1009 /// // entire haystack, then the search can take the surrounding context 1010 /// // into account. (And if we did find a match, it would be reported 1011 /// // as a valid offset into `haystack` instead of its sub-slice.) 1012 /// let expected = None; 1013 /// let input = Input::new(haystack).range(3..6); 1014 /// re.search(&mut cache, &input, &mut caps); 1015 /// assert_eq!(expected, caps.get_match()); 1016 /// 1017 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1018 /// ``` 1019 #[inline] search( &self, cache: &mut Cache, input: &Input<'_>, caps: &mut Captures, )1020 pub fn search( 1021 &self, 1022 cache: &mut Cache, 1023 input: &Input<'_>, 1024 caps: &mut Captures, 1025 ) { 1026 caps.set_pattern(None); 1027 let pid = self.search_slots(cache, input, caps.slots_mut()); 1028 caps.set_pattern(pid); 1029 } 1030 1031 /// Executes a leftmost forward search and writes the spans of capturing 1032 /// groups that participated in a match into the provided `slots`, and 1033 /// returns the matching pattern ID. The contents of the slots for patterns 1034 /// other than the matching pattern are unspecified. If no match was found, 1035 /// then `None` is returned and the contents of `slots` is unspecified. 1036 /// 1037 /// This is like [`PikeVM::search`], but it accepts a raw slots slice 1038 /// instead of a `Captures` value. This is useful in contexts where you 1039 /// don't want or need to allocate a `Captures`. 1040 /// 1041 /// It is legal to pass _any_ number of slots to this routine. If the regex 1042 /// engine would otherwise write a slot offset that doesn't fit in the 1043 /// provided slice, then it is simply skipped. In general though, there are 1044 /// usually three slice lengths you might want to use: 1045 /// 1046 /// * An empty slice, if you only care about which pattern matched. 1047 /// * A slice with 1048 /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) 1049 /// slots, if you only care about the overall match spans for each matching 1050 /// pattern. 1051 /// * A slice with 1052 /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which 1053 /// permits recording match offsets for every capturing group in every 1054 /// pattern. 1055 /// 1056 /// # Example 1057 /// 1058 /// This example shows how to find the overall match offsets in a 1059 /// multi-pattern search without allocating a `Captures` value. Indeed, we 1060 /// can put our slots right on the stack. 1061 /// 1062 /// ``` 1063 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1064 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; 1065 /// 1066 /// let re = PikeVM::new_many(&[ 1067 /// r"\pL+", 1068 /// r"\d+", 1069 /// ])?; 1070 /// let mut cache = re.create_cache(); 1071 /// let input = Input::new("!@#123"); 1072 /// 1073 /// // We only care about the overall match offsets here, so we just 1074 /// // allocate two slots for each pattern. Each slot records the start 1075 /// // and end of the match. 1076 /// let mut slots = [None; 4]; 1077 /// let pid = re.search_slots(&mut cache, &input, &mut slots); 1078 /// assert_eq!(Some(PatternID::must(1)), pid); 1079 /// 1080 /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. 1081 /// // See 'GroupInfo' for more details on the mapping between groups and 1082 /// // slot indices. 1083 /// let slot_start = pid.unwrap().as_usize() * 2; 1084 /// let slot_end = slot_start + 1; 1085 /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); 1086 /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); 1087 /// 1088 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1089 /// ``` 1090 #[inline] search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<PatternID>1091 pub fn search_slots( 1092 &self, 1093 cache: &mut Cache, 1094 input: &Input<'_>, 1095 slots: &mut [Option<NonMaxUsize>], 1096 ) -> Option<PatternID> { 1097 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); 1098 if !utf8empty { 1099 let hm = self.search_slots_imp(cache, input, slots)?; 1100 return Some(hm.pattern()); 1101 } 1102 // There is an unfortunate special case where if the regex can 1103 // match the empty string and UTF-8 mode is enabled, the search 1104 // implementation requires that the slots have at least as much space 1105 // to report the bounds of any match. This is so zero-width matches 1106 // that split a codepoint can be filtered out. 1107 // 1108 // Note that if utf8empty is true, we specialize the case for when 1109 // the number of patterns is 1. In that case, we can just use a stack 1110 // allocation. Otherwise we resort to a heap allocation, which we 1111 // convince ourselves we're fine with due to the pathological nature of 1112 // this case. 1113 let min = self.get_nfa().group_info().implicit_slot_len(); 1114 if slots.len() >= min { 1115 let hm = self.search_slots_imp(cache, input, slots)?; 1116 return Some(hm.pattern()); 1117 } 1118 if self.get_nfa().pattern_len() == 1 { 1119 let mut enough = [None, None]; 1120 let got = self.search_slots_imp(cache, input, &mut enough); 1121 // This is OK because we know `enough` is strictly bigger than 1122 // `slots`, otherwise this special case isn't reached. 1123 slots.copy_from_slice(&enough[..slots.len()]); 1124 return got.map(|hm| hm.pattern()); 1125 } 1126 let mut enough = vec![None; min]; 1127 let got = self.search_slots_imp(cache, input, &mut enough); 1128 // This is OK because we know `enough` is strictly bigger than `slots`, 1129 // otherwise this special case isn't reached. 1130 slots.copy_from_slice(&enough[..slots.len()]); 1131 got.map(|hm| hm.pattern()) 1132 } 1133 1134 /// This is the actual implementation of `search_slots_imp` that 1135 /// doesn't account for the special case when 1) the NFA has UTF-8 mode 1136 /// enabled, 2) the NFA can match the empty string and 3) the caller has 1137 /// provided an insufficient number of slots to record match offsets. 1138 #[inline(never)] search_slots_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch>1139 fn search_slots_imp( 1140 &self, 1141 cache: &mut Cache, 1142 input: &Input<'_>, 1143 slots: &mut [Option<NonMaxUsize>], 1144 ) -> Option<HalfMatch> { 1145 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); 1146 let hm = match self.search_imp(cache, input, slots) { 1147 None => return None, 1148 Some(hm) if !utf8empty => return Some(hm), 1149 Some(hm) => hm, 1150 }; 1151 empty::skip_splits_fwd(input, hm, hm.offset(), |input| { 1152 Ok(self 1153 .search_imp(cache, input, slots) 1154 .map(|hm| (hm, hm.offset()))) 1155 }) 1156 // OK because the PikeVM never errors. 1157 .unwrap() 1158 } 1159 1160 /// Writes the set of patterns that match anywhere in the given search 1161 /// configuration to `patset`. If multiple patterns match at the same 1162 /// position and this `PikeVM` was configured with [`MatchKind::All`] 1163 /// semantics, then all matching patterns are written to the given set. 1164 /// 1165 /// Unless all of the patterns in this `PikeVM` are anchored, then 1166 /// generally speaking, this will visit every byte in the haystack. 1167 /// 1168 /// This search routine *does not* clear the pattern set. This gives some 1169 /// flexibility to the caller (e.g., running multiple searches with the 1170 /// same pattern set), but does make the API bug-prone if you're reusing 1171 /// the same pattern set for multiple searches but intended them to be 1172 /// independent. 1173 /// 1174 /// If a pattern ID matched but the given `PatternSet` does not have 1175 /// sufficient capacity to store it, then it is not inserted and silently 1176 /// dropped. 1177 /// 1178 /// # Example 1179 /// 1180 /// This example shows how to find all matching patterns in a haystack, 1181 /// even when some patterns match at the same position as other patterns. 1182 /// 1183 /// ``` 1184 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1185 /// use regex_automata::{ 1186 /// nfa::thompson::pikevm::PikeVM, 1187 /// Input, MatchKind, PatternSet, 1188 /// }; 1189 /// 1190 /// let patterns = &[ 1191 /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", 1192 /// ]; 1193 /// let re = PikeVM::builder() 1194 /// .configure(PikeVM::config().match_kind(MatchKind::All)) 1195 /// .build_many(patterns)?; 1196 /// let mut cache = re.create_cache(); 1197 /// 1198 /// let input = Input::new("foobar"); 1199 /// let mut patset = PatternSet::new(re.pattern_len()); 1200 /// re.which_overlapping_matches(&mut cache, &input, &mut patset); 1201 /// let expected = vec![0, 2, 3, 4, 6]; 1202 /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); 1203 /// assert_eq!(expected, got); 1204 /// 1205 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1206 /// ``` 1207 #[inline] which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, )1208 pub fn which_overlapping_matches( 1209 &self, 1210 cache: &mut Cache, 1211 input: &Input<'_>, 1212 patset: &mut PatternSet, 1213 ) { 1214 self.which_overlapping_imp(cache, input, patset) 1215 } 1216 } 1217 1218 impl PikeVM { 1219 /// The implementation of standard leftmost search. 1220 /// 1221 /// Capturing group spans are written to `slots`, but only if requested. 1222 /// `slots` can be any length. Any slot in the NFA that is activated but 1223 /// which is out of bounds for the given `slots` is ignored. search_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch>1224 fn search_imp( 1225 &self, 1226 cache: &mut Cache, 1227 input: &Input<'_>, 1228 slots: &mut [Option<NonMaxUsize>], 1229 ) -> Option<HalfMatch> { 1230 cache.setup_search(slots.len()); 1231 if input.is_done() { 1232 return None; 1233 } 1234 // Why do we even care about this? Well, in our 'Captures' 1235 // representation, we use usize::MAX as a sentinel to indicate "no 1236 // match." This isn't problematic so long as our haystack doesn't have 1237 // a maximal length. Byte slices are guaranteed by Rust to have a 1238 // length that fits into isize, and so this assert should always pass. 1239 // But we put it here to make our assumption explicit. 1240 assert!( 1241 input.haystack().len() < core::usize::MAX, 1242 "byte slice lengths must be less than usize MAX", 1243 ); 1244 instrument!(|c| c.reset(&self.nfa)); 1245 1246 // Whether we want to visit all match states instead of emulating the 1247 // 'leftmost' semantics of typical backtracking regex engines. 1248 let allmatches = 1249 self.config.get_match_kind().continue_past_first_match(); 1250 let (anchored, start_id) = match self.start_config(input) { 1251 None => return None, 1252 Some(config) => config, 1253 }; 1254 1255 let pre = 1256 if anchored { None } else { self.get_config().get_prefilter() }; 1257 let Cache { ref mut stack, ref mut curr, ref mut next } = cache; 1258 let mut hm = None; 1259 // Yes, our search doesn't end at input.end(), but includes it. This 1260 // is necessary because matches are delayed by one byte, just like 1261 // how the DFA engines work. The delay is used to handle look-behind 1262 // assertions. In the case of the PikeVM, the delay is implemented 1263 // by not considering a match to exist until it is visited in 1264 // 'steps'. Technically, we know a match exists in the previous 1265 // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA 1266 // determinization. We don't mark a DFA state as a match state if it 1267 // contains an NFA match state, but rather, whether the DFA state was 1268 // generated by a transition from a DFA state that contains an NFA 1269 // match state.) 1270 let mut at = input.start(); 1271 while at <= input.end() { 1272 // If we have no states left to visit, then there are some cases 1273 // where we know we can quit early or even skip ahead. 1274 if curr.set.is_empty() { 1275 // We have a match and we haven't been instructed to continue 1276 // on even after finding a match, so we can quit. 1277 if hm.is_some() && !allmatches { 1278 break; 1279 } 1280 // If we're running an anchored search and we've advanced 1281 // beyond the start position with no other states to try, then 1282 // we will never observe a match and thus can stop. 1283 if anchored && at > input.start() { 1284 break; 1285 } 1286 // If there no states left to explore at this position and we 1287 // know we can't terminate early, then we are effectively at 1288 // the starting state of the NFA. If we fell through here, 1289 // we'd end up adding our '(?s-u:.)*?' prefix and it would be 1290 // the only thing in 'curr'. So we might as well just skip 1291 // ahead until we find something that we know might advance us 1292 // forward. 1293 if let Some(ref pre) = pre { 1294 let span = Span::from(at..input.end()); 1295 match pre.find(input.haystack(), span) { 1296 None => break, 1297 Some(ref span) => at = span.start, 1298 } 1299 } 1300 } 1301 // Instead of using the NFA's unanchored start state, we actually 1302 // always use its anchored starting state. As a result, when doing 1303 // an unanchored search, we need to simulate our own '(?s-u:.)*?' 1304 // prefix, to permit a match to appear anywhere. 1305 // 1306 // Now, we don't *have* to do things this way. We could use the 1307 // NFA's unanchored starting state and do one 'epsilon_closure' 1308 // call from that starting state before the main loop here. And 1309 // that is just as correct. However, it turns out to be slower 1310 // than our approach here because it slightly increases the cost 1311 // of processing each byte by requiring us to visit more NFA 1312 // states to deal with the additional NFA states in the unanchored 1313 // prefix. By simulating it explicitly here, we lower those costs 1314 // substantially. The cost is itself small, but it adds up for 1315 // large haystacks. 1316 // 1317 // In order to simulate the '(?s-u:.)*?' prefix---which is not 1318 // greedy---we are careful not to perform an epsilon closure on 1319 // the start state if we already have a match. Namely, if we 1320 // did otherwise, we would never reach a terminating condition 1321 // because there would always be additional states to process. 1322 // In effect, the exclusion of running 'epsilon_closure' when 1323 // we have a match corresponds to the "dead" states we have in 1324 // our DFA regex engines. Namely, in a DFA, match states merely 1325 // instruct the search execution to record the current offset as 1326 // the most recently seen match. It is the dead state that actually 1327 // indicates when to stop the search (other than EOF or quit 1328 // states). 1329 // 1330 // However, when 'allmatches' is true, the caller has asked us to 1331 // leave in every possible match state. This tends not to make a 1332 // whole lot of sense in unanchored searches, because it means the 1333 // search really cannot terminate until EOF. And often, in that 1334 // case, you wind up skipping over a bunch of matches and are left 1335 // with the "last" match. Arguably, it just doesn't make a lot of 1336 // sense to run a 'leftmost' search (which is what this routine is) 1337 // with 'allmatches' set to true. But the DFAs support it and this 1338 // matches their behavior. (Generally, 'allmatches' is useful for 1339 // overlapping searches or leftmost anchored searches to find the 1340 // longest possible match by ignoring match priority.) 1341 // 1342 // Additionally, when we're running an anchored search, this 1343 // epsilon closure should only be computed at the beginning of the 1344 // search. If we re-computed it at every position, we would be 1345 // simulating an unanchored search when we were tasked to perform 1346 // an anchored search. 1347 if (!hm.is_some() || allmatches) 1348 && (!anchored || at == input.start()) 1349 { 1350 // Since we are adding to the 'curr' active states and since 1351 // this is for the start ID, we use a slots slice that is 1352 // guaranteed to have the right length but where every element 1353 // is absent. This is exactly what we want, because this 1354 // epsilon closure is responsible for simulating an unanchored 1355 // '(?s:.)*?' prefix. It is specifically outside of any 1356 // capturing groups, and thus, using slots that are always 1357 // absent is correct. 1358 // 1359 // Note though that we can't just use '&mut []' here, since 1360 // this epsilon closure may traverse through 'Captures' epsilon 1361 // transitions, and thus must be able to write offsets to the 1362 // slots given which are later copied to slot values in 'curr'. 1363 let slots = next.slot_table.all_absent(); 1364 self.epsilon_closure(stack, slots, curr, input, at, start_id); 1365 } 1366 if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) 1367 { 1368 hm = Some(HalfMatch::new(pid, at)); 1369 } 1370 // Unless the caller asked us to return early, we need to mush on 1371 // to see if we can extend our match. (But note that 'nexts' will 1372 // quit right after seeing a match when match_kind==LeftmostFirst, 1373 // as is consistent with leftmost-first match priority.) 1374 if input.get_earliest() && hm.is_some() { 1375 break; 1376 } 1377 core::mem::swap(curr, next); 1378 next.set.clear(); 1379 at += 1; 1380 } 1381 instrument!(|c| c.eprint(&self.nfa)); 1382 hm 1383 } 1384 1385 /// The implementation for the 'which_overlapping_matches' API. Basically, 1386 /// we do a single scan through the entire haystack (unless our regex 1387 /// or search is anchored) and record every pattern that matched. In 1388 /// particular, when MatchKind::All is used, this supports overlapping 1389 /// matches. So if we have the regexes 'sam' and 'samwise', they will 1390 /// *both* be reported in the pattern set when searching the haystack 1391 /// 'samwise'. which_overlapping_imp( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, )1392 fn which_overlapping_imp( 1393 &self, 1394 cache: &mut Cache, 1395 input: &Input<'_>, 1396 patset: &mut PatternSet, 1397 ) { 1398 // NOTE: This is effectively a copy of 'search_imp' above, but with no 1399 // captures support and instead writes patterns that matched directly 1400 // to 'patset'. See that routine for better commentary about what's 1401 // going on in this routine. We probably could unify the routines using 1402 // generics or more helper routines, but I'm not sure it's worth it. 1403 // 1404 // NOTE: We somewhat go out of our way here to support things like 1405 // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither 1406 // of those seem particularly relevant to this routine, but they are 1407 // both supported by the DFA analogs of this routine by construction 1408 // and composition, so it seems like good sense to have the PikeVM 1409 // match that behavior. 1410 1411 cache.setup_search(0); 1412 if input.is_done() { 1413 return; 1414 } 1415 assert!( 1416 input.haystack().len() < core::usize::MAX, 1417 "byte slice lengths must be less than usize MAX", 1418 ); 1419 instrument!(|c| c.reset(&self.nfa)); 1420 1421 let allmatches = 1422 self.config.get_match_kind().continue_past_first_match(); 1423 let (anchored, start_id) = match self.start_config(input) { 1424 None => return, 1425 Some(config) => config, 1426 }; 1427 1428 let Cache { ref mut stack, ref mut curr, ref mut next } = cache; 1429 for at in input.start()..=input.end() { 1430 let any_matches = !patset.is_empty(); 1431 if curr.set.is_empty() { 1432 if any_matches && !allmatches { 1433 break; 1434 } 1435 if anchored && at > input.start() { 1436 break; 1437 } 1438 } 1439 if !any_matches || allmatches { 1440 let slots = &mut []; 1441 self.epsilon_closure(stack, slots, curr, input, at, start_id); 1442 } 1443 self.nexts_overlapping(stack, curr, next, input, at, patset); 1444 // If we found a match and filled our set, then there is no more 1445 // additional info that we can provide. Thus, we can quit. We also 1446 // quit if the caller asked us to stop at the earliest point that 1447 // we know a match exists. 1448 if patset.is_full() || input.get_earliest() { 1449 break; 1450 } 1451 core::mem::swap(curr, next); 1452 next.set.clear(); 1453 } 1454 instrument!(|c| c.eprint(&self.nfa)); 1455 } 1456 1457 /// Process the active states in 'curr' to find the states (written to 1458 /// 'next') we should process for the next byte in the haystack. 1459 /// 1460 /// 'stack' is used to perform a depth first traversal of the NFA when 1461 /// computing an epsilon closure. 1462 /// 1463 /// When a match is found, the slots for that match state (in 'curr') are 1464 /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' 1465 /// stops (unless the PikeVM was configured with MatchKind::All semantics). 1466 #[cfg_attr(feature = "perf-inline", inline(always))] nexts( &self, stack: &mut Vec<FollowEpsilon>, curr: &mut ActiveStates, next: &mut ActiveStates, input: &Input<'_>, at: usize, slots: &mut [Option<NonMaxUsize>], ) -> Option<PatternID>1467 fn nexts( 1468 &self, 1469 stack: &mut Vec<FollowEpsilon>, 1470 curr: &mut ActiveStates, 1471 next: &mut ActiveStates, 1472 input: &Input<'_>, 1473 at: usize, 1474 slots: &mut [Option<NonMaxUsize>], 1475 ) -> Option<PatternID> { 1476 instrument!(|c| c.record_state_set(&curr.set)); 1477 let mut pid = None; 1478 let ActiveStates { ref set, ref mut slot_table } = *curr; 1479 for sid in set.iter() { 1480 pid = match self.next(stack, slot_table, next, input, at, sid) { 1481 None => continue, 1482 Some(pid) => Some(pid), 1483 }; 1484 slots.copy_from_slice(slot_table.for_state(sid)); 1485 if !self.config.get_match_kind().continue_past_first_match() { 1486 break; 1487 } 1488 } 1489 pid 1490 } 1491 1492 /// Like 'nexts', but for the overlapping case. This doesn't write any 1493 /// slots, and instead just writes which pattern matched in 'patset'. 1494 #[cfg_attr(feature = "perf-inline", inline(always))] nexts_overlapping( &self, stack: &mut Vec<FollowEpsilon>, curr: &mut ActiveStates, next: &mut ActiveStates, input: &Input<'_>, at: usize, patset: &mut PatternSet, )1495 fn nexts_overlapping( 1496 &self, 1497 stack: &mut Vec<FollowEpsilon>, 1498 curr: &mut ActiveStates, 1499 next: &mut ActiveStates, 1500 input: &Input<'_>, 1501 at: usize, 1502 patset: &mut PatternSet, 1503 ) { 1504 instrument!(|c| c.record_state_set(&curr.set)); 1505 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); 1506 let ActiveStates { ref set, ref mut slot_table } = *curr; 1507 for sid in set.iter() { 1508 let pid = match self.next(stack, slot_table, next, input, at, sid) 1509 { 1510 None => continue, 1511 Some(pid) => pid, 1512 }; 1513 // This handles the case of finding a zero-width match that splits 1514 // a codepoint. Namely, if we're in UTF-8 mode AND we know we can 1515 // match the empty string, then the only valid way of getting to 1516 // this point with an offset that splits a codepoint is when we 1517 // have an empty match. Such matches, in UTF-8 mode, must not be 1518 // reported. So we just skip them here and pretend as if we did 1519 // not see a match. 1520 if utf8empty && !input.is_char_boundary(at) { 1521 continue; 1522 } 1523 let _ = patset.try_insert(pid); 1524 if !self.config.get_match_kind().continue_past_first_match() { 1525 break; 1526 } 1527 } 1528 } 1529 1530 /// Starting from 'sid', if the position 'at' in the 'input' haystack has a 1531 /// transition defined out of 'sid', then add the state transitioned to and 1532 /// its epsilon closure to the 'next' set of states to explore. 1533 /// 1534 /// 'stack' is used by the epsilon closure computation to perform a depth 1535 /// first traversal of the NFA. 1536 /// 1537 /// 'curr_slot_table' should be the table of slots for the current set of 1538 /// states being explored. If there is a transition out of 'sid', then 1539 /// sid's row in the slot table is used to perform the epsilon closure. 1540 #[cfg_attr(feature = "perf-inline", inline(always))] next( &self, stack: &mut Vec<FollowEpsilon>, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, input: &Input<'_>, at: usize, sid: StateID, ) -> Option<PatternID>1541 fn next( 1542 &self, 1543 stack: &mut Vec<FollowEpsilon>, 1544 curr_slot_table: &mut SlotTable, 1545 next: &mut ActiveStates, 1546 input: &Input<'_>, 1547 at: usize, 1548 sid: StateID, 1549 ) -> Option<PatternID> { 1550 instrument!(|c| c.record_step(sid)); 1551 match *self.nfa.state(sid) { 1552 State::Fail 1553 | State::Look { .. } 1554 | State::Union { .. } 1555 | State::BinaryUnion { .. } 1556 | State::Capture { .. } => None, 1557 State::ByteRange { ref trans } => { 1558 if trans.matches(input.haystack(), at) { 1559 let slots = curr_slot_table.for_state(sid); 1560 // OK because 'at <= haystack.len() < usize::MAX', so 1561 // adding 1 will never wrap. 1562 let at = at.wrapping_add(1); 1563 self.epsilon_closure( 1564 stack, slots, next, input, at, trans.next, 1565 ); 1566 } 1567 None 1568 } 1569 State::Sparse(ref sparse) => { 1570 if let Some(next_sid) = sparse.matches(input.haystack(), at) { 1571 let slots = curr_slot_table.for_state(sid); 1572 // OK because 'at <= haystack.len() < usize::MAX', so 1573 // adding 1 will never wrap. 1574 let at = at.wrapping_add(1); 1575 self.epsilon_closure( 1576 stack, slots, next, input, at, next_sid, 1577 ); 1578 } 1579 None 1580 } 1581 State::Dense(ref dense) => { 1582 if let Some(next_sid) = dense.matches(input.haystack(), at) { 1583 let slots = curr_slot_table.for_state(sid); 1584 // OK because 'at <= haystack.len() < usize::MAX', so 1585 // adding 1 will never wrap. 1586 let at = at.wrapping_add(1); 1587 self.epsilon_closure( 1588 stack, slots, next, input, at, next_sid, 1589 ); 1590 } 1591 None 1592 } 1593 State::Match { pattern_id } => Some(pattern_id), 1594 } 1595 } 1596 1597 /// Compute the epsilon closure of 'sid', writing the closure into 'next' 1598 /// while copying slot values from 'curr_slots' into corresponding states 1599 /// in 'next'. 'curr_slots' should be the slot values corresponding to 1600 /// 'sid'. 1601 /// 1602 /// The given 'stack' is used to perform a depth first traversal of the 1603 /// NFA by recursively following all epsilon transitions out of 'sid'. 1604 /// Conditional epsilon transitions are followed if and only if they are 1605 /// satisfied for the position 'at' in the 'input' haystack. 1606 /// 1607 /// While this routine may write to 'curr_slots', once it returns, any 1608 /// writes are undone and the original values (even if absent) are 1609 /// restored. 1610 #[cfg_attr(feature = "perf-inline", inline(always))] epsilon_closure( &self, stack: &mut Vec<FollowEpsilon>, curr_slots: &mut [Option<NonMaxUsize>], next: &mut ActiveStates, input: &Input<'_>, at: usize, sid: StateID, )1611 fn epsilon_closure( 1612 &self, 1613 stack: &mut Vec<FollowEpsilon>, 1614 curr_slots: &mut [Option<NonMaxUsize>], 1615 next: &mut ActiveStates, 1616 input: &Input<'_>, 1617 at: usize, 1618 sid: StateID, 1619 ) { 1620 instrument!(|c| { 1621 c.record_closure(sid); 1622 c.record_stack_push(sid); 1623 }); 1624 stack.push(FollowEpsilon::Explore(sid)); 1625 while let Some(frame) = stack.pop() { 1626 match frame { 1627 FollowEpsilon::RestoreCapture { slot, offset: pos } => { 1628 curr_slots[slot] = pos; 1629 } 1630 FollowEpsilon::Explore(sid) => { 1631 self.epsilon_closure_explore( 1632 stack, curr_slots, next, input, at, sid, 1633 ); 1634 } 1635 } 1636 } 1637 } 1638 1639 /// Explore all of the epsilon transitions out of 'sid'. This is mostly 1640 /// split out from 'epsilon_closure' in order to clearly delineate 1641 /// the actual work of computing an epsilon closure from the stack 1642 /// book-keeping. 1643 /// 1644 /// This will push any additional explorations needed on to 'stack'. 1645 /// 1646 /// 'curr_slots' should refer to the slots for the currently active NFA 1647 /// state. That is, the current state we are stepping through. These 1648 /// slots are mutated in place as new 'Captures' states are traversed 1649 /// during epsilon closure, but the slots are restored to their original 1650 /// values once the full epsilon closure is completed. The ultimate use of 1651 /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that 1652 /// the capturing group spans are forwarded from the currently active state 1653 /// to the next. 1654 /// 1655 /// 'next' refers to the next set of active states. Computing an epsilon 1656 /// closure may increase the next set of active states. 1657 /// 1658 /// 'input' refers to the caller's input configuration and 'at' refers to 1659 /// the current position in the haystack. These are used to check whether 1660 /// conditional epsilon transitions (like look-around) are satisfied at 1661 /// the current position. If they aren't, then the epsilon closure won't 1662 /// include them. 1663 #[cfg_attr(feature = "perf-inline", inline(always))] epsilon_closure_explore( &self, stack: &mut Vec<FollowEpsilon>, curr_slots: &mut [Option<NonMaxUsize>], next: &mut ActiveStates, input: &Input<'_>, at: usize, mut sid: StateID, )1664 fn epsilon_closure_explore( 1665 &self, 1666 stack: &mut Vec<FollowEpsilon>, 1667 curr_slots: &mut [Option<NonMaxUsize>], 1668 next: &mut ActiveStates, 1669 input: &Input<'_>, 1670 at: usize, 1671 mut sid: StateID, 1672 ) { 1673 // We can avoid pushing some state IDs on to our stack in precisely 1674 // the cases where a 'push(x)' would be immediately followed by a 'x 1675 // = pop()'. This is achieved by this outer-loop. We simply set 'sid' 1676 // to be the next state ID we want to explore once we're done with 1677 // our initial exploration. In practice, this avoids a lot of stack 1678 // thrashing. 1679 loop { 1680 instrument!(|c| c.record_set_insert(sid)); 1681 // Record this state as part of our next set of active states. If 1682 // we've already explored it, then no need to do it again. 1683 if !next.set.insert(sid) { 1684 return; 1685 } 1686 match *self.nfa.state(sid) { 1687 State::Fail 1688 | State::Match { .. } 1689 | State::ByteRange { .. } 1690 | State::Sparse { .. } 1691 | State::Dense { .. } => { 1692 next.slot_table.for_state(sid).copy_from_slice(curr_slots); 1693 return; 1694 } 1695 State::Look { look, next } => { 1696 // OK because we don't permit building a searcher with a 1697 // Unicode word boundary if the requisite Unicode data is 1698 // unavailable. 1699 if !self.nfa.look_matcher().matches_inline( 1700 look, 1701 input.haystack(), 1702 at, 1703 ) { 1704 return; 1705 } 1706 sid = next; 1707 } 1708 State::Union { ref alternates } => { 1709 sid = match alternates.get(0) { 1710 None => return, 1711 Some(&sid) => sid, 1712 }; 1713 instrument!(|c| { 1714 for &alt in &alternates[1..] { 1715 c.record_stack_push(alt); 1716 } 1717 }); 1718 stack.extend( 1719 alternates[1..] 1720 .iter() 1721 .copied() 1722 .rev() 1723 .map(FollowEpsilon::Explore), 1724 ); 1725 } 1726 State::BinaryUnion { alt1, alt2 } => { 1727 sid = alt1; 1728 instrument!(|c| c.record_stack_push(sid)); 1729 stack.push(FollowEpsilon::Explore(alt2)); 1730 } 1731 State::Capture { next, slot, .. } => { 1732 // There's no need to do anything with slots that 1733 // ultimately won't be copied into the caller-provided 1734 // 'Captures' value. So we just skip dealing with them at 1735 // all. 1736 if slot.as_usize() < curr_slots.len() { 1737 instrument!(|c| c.record_stack_push(sid)); 1738 stack.push(FollowEpsilon::RestoreCapture { 1739 slot, 1740 offset: curr_slots[slot], 1741 }); 1742 // OK because length of a slice must fit into an isize. 1743 curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); 1744 } 1745 sid = next; 1746 } 1747 } 1748 } 1749 } 1750 1751 /// Return the starting configuration of a PikeVM search. 1752 /// 1753 /// The "start config" is basically whether the search should be anchored 1754 /// or not and the NFA state ID at which to begin the search. The state ID 1755 /// returned always corresponds to an anchored starting state even when the 1756 /// search is unanchored. This is because the PikeVM search loop deals with 1757 /// unanchored searches with an explicit epsilon closure out of the start 1758 /// state. 1759 /// 1760 /// This routine accounts for both the caller's `Input` configuration 1761 /// and the pattern itself. For example, even if the caller asks for an 1762 /// unanchored search, if the pattern itself is anchored, then this will 1763 /// always return 'true' because implementing an unanchored search in that 1764 /// case would be incorrect. 1765 /// 1766 /// Similarly, if the caller requests an anchored search for a particular 1767 /// pattern, then the starting state ID returned will reflect that. 1768 /// 1769 /// If a pattern ID is given in the input configuration that is not in 1770 /// this regex, then `None` is returned. start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)>1771 fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { 1772 match input.get_anchored() { 1773 // Only way we're unanchored is if both the caller asked for an 1774 // unanchored search *and* the pattern is itself not anchored. 1775 Anchored::No => Some(( 1776 self.nfa.is_always_start_anchored(), 1777 self.nfa.start_anchored(), 1778 )), 1779 Anchored::Yes => Some((true, self.nfa.start_anchored())), 1780 Anchored::Pattern(pid) => { 1781 Some((true, self.nfa.start_pattern(pid)?)) 1782 } 1783 } 1784 } 1785 } 1786 1787 /// An iterator over all non-overlapping matches for a particular search. 1788 /// 1789 /// The iterator yields a [`Match`] value until no more matches could be found. 1790 /// 1791 /// The lifetime parameters are as follows: 1792 /// 1793 /// * `'r` represents the lifetime of the PikeVM. 1794 /// * `'c` represents the lifetime of the PikeVM's cache. 1795 /// * `'h` represents the lifetime of the haystack being searched. 1796 /// 1797 /// This iterator can be created with the [`PikeVM::find_iter`] method. 1798 #[derive(Debug)] 1799 pub struct FindMatches<'r, 'c, 'h> { 1800 re: &'r PikeVM, 1801 cache: &'c mut Cache, 1802 caps: Captures, 1803 it: iter::Searcher<'h>, 1804 } 1805 1806 impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { 1807 type Item = Match; 1808 1809 #[inline] next(&mut self) -> Option<Match>1810 fn next(&mut self) -> Option<Match> { 1811 // Splitting 'self' apart seems necessary to appease borrowck. 1812 let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = 1813 *self; 1814 // 'advance' converts errors into panics, which is OK here because 1815 // the PikeVM can never return an error. 1816 it.advance(|input| { 1817 re.search(cache, input, caps); 1818 Ok(caps.get_match()) 1819 }) 1820 } 1821 } 1822 1823 /// An iterator over all non-overlapping leftmost matches, with their capturing 1824 /// groups, for a particular search. 1825 /// 1826 /// The iterator yields a [`Captures`] value until no more matches could be 1827 /// found. 1828 /// 1829 /// The lifetime parameters are as follows: 1830 /// 1831 /// * `'r` represents the lifetime of the PikeVM. 1832 /// * `'c` represents the lifetime of the PikeVM's cache. 1833 /// * `'h` represents the lifetime of the haystack being searched. 1834 /// 1835 /// This iterator can be created with the [`PikeVM::captures_iter`] method. 1836 #[derive(Debug)] 1837 pub struct CapturesMatches<'r, 'c, 'h> { 1838 re: &'r PikeVM, 1839 cache: &'c mut Cache, 1840 caps: Captures, 1841 it: iter::Searcher<'h>, 1842 } 1843 1844 impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { 1845 type Item = Captures; 1846 1847 #[inline] next(&mut self) -> Option<Captures>1848 fn next(&mut self) -> Option<Captures> { 1849 // Splitting 'self' apart seems necessary to appease borrowck. 1850 let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = 1851 *self; 1852 // 'advance' converts errors into panics, which is OK here because 1853 // the PikeVM can never return an error. 1854 it.advance(|input| { 1855 re.search(cache, input, caps); 1856 Ok(caps.get_match()) 1857 }); 1858 if caps.is_match() { 1859 Some(caps.clone()) 1860 } else { 1861 None 1862 } 1863 } 1864 } 1865 1866 /// A cache represents mutable state that a [`PikeVM`] requires during a 1867 /// search. 1868 /// 1869 /// For a given [`PikeVM`], its corresponding cache may be created either via 1870 /// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in 1871 /// every way, except the former does not require explicitly importing `Cache`. 1872 /// 1873 /// A particular `Cache` is coupled with the [`PikeVM`] from which it 1874 /// was created. It may only be used with that `PikeVM`. A cache and its 1875 /// allocations may be re-purposed via [`Cache::reset`], in which case, it can 1876 /// only be used with the new `PikeVM` (and not the old one). 1877 #[derive(Clone, Debug)] 1878 pub struct Cache { 1879 /// Stack used while computing epsilon closure. This effectively lets us 1880 /// move what is more naturally expressed through recursion to a stack 1881 /// on the heap. 1882 stack: Vec<FollowEpsilon>, 1883 /// The current active states being explored for the current byte in the 1884 /// haystack. 1885 curr: ActiveStates, 1886 /// The next set of states we're building that will be explored for the 1887 /// next byte in the haystack. 1888 next: ActiveStates, 1889 } 1890 1891 impl Cache { 1892 /// Create a new [`PikeVM`] cache. 1893 /// 1894 /// A potentially more convenient routine to create a cache is 1895 /// [`PikeVM::create_cache`], as it does not require also importing the 1896 /// `Cache` type. 1897 /// 1898 /// If you want to reuse the returned `Cache` with some other `PikeVM`, 1899 /// then you must call [`Cache::reset`] with the desired `PikeVM`. new(re: &PikeVM) -> Cache1900 pub fn new(re: &PikeVM) -> Cache { 1901 Cache { 1902 stack: vec![], 1903 curr: ActiveStates::new(re), 1904 next: ActiveStates::new(re), 1905 } 1906 } 1907 1908 /// Reset this cache such that it can be used for searching with a 1909 /// different [`PikeVM`]. 1910 /// 1911 /// A cache reset permits reusing memory already allocated in this cache 1912 /// with a different `PikeVM`. 1913 /// 1914 /// # Example 1915 /// 1916 /// This shows how to re-purpose a cache for use with a different `PikeVM`. 1917 /// 1918 /// ``` 1919 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1920 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 1921 /// 1922 /// let re1 = PikeVM::new(r"\w")?; 1923 /// let re2 = PikeVM::new(r"\W")?; 1924 /// 1925 /// let mut cache = re1.create_cache(); 1926 /// assert_eq!( 1927 /// Some(Match::must(0, 0..2)), 1928 /// re1.find_iter(&mut cache, "Δ").next(), 1929 /// ); 1930 /// 1931 /// // Using 'cache' with re2 is not allowed. It may result in panics or 1932 /// // incorrect results. In order to re-purpose the cache, we must reset 1933 /// // it with the PikeVM we'd like to use it with. 1934 /// // 1935 /// // Similarly, after this reset, using the cache with 're1' is also not 1936 /// // allowed. 1937 /// cache.reset(&re2); 1938 /// assert_eq!( 1939 /// Some(Match::must(0, 0..3)), 1940 /// re2.find_iter(&mut cache, "☃").next(), 1941 /// ); 1942 /// 1943 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1944 /// ``` reset(&mut self, re: &PikeVM)1945 pub fn reset(&mut self, re: &PikeVM) { 1946 self.curr.reset(re); 1947 self.next.reset(re); 1948 } 1949 1950 /// Returns the heap memory usage, in bytes, of this cache. 1951 /// 1952 /// This does **not** include the stack size used up by this cache. To 1953 /// compute that, use `std::mem::size_of::<Cache>()`. memory_usage(&self) -> usize1954 pub fn memory_usage(&self) -> usize { 1955 use core::mem::size_of; 1956 (self.stack.len() * size_of::<FollowEpsilon>()) 1957 + self.curr.memory_usage() 1958 + self.next.memory_usage() 1959 } 1960 1961 /// Clears this cache. This should be called at the start of every search 1962 /// to ensure we start with a clean slate. 1963 /// 1964 /// This also sets the length of the capturing groups used in the current 1965 /// search. This permits an optimization where by 'SlotTable::for_state' 1966 /// only returns the number of slots equivalent to the number of slots 1967 /// given in the 'Captures' value. This may be less than the total number 1968 /// of possible slots, e.g., when one only wants to track overall match 1969 /// offsets. This in turn permits less copying of capturing group spans 1970 /// in the PikeVM. setup_search(&mut self, captures_slot_len: usize)1971 fn setup_search(&mut self, captures_slot_len: usize) { 1972 self.stack.clear(); 1973 self.curr.setup_search(captures_slot_len); 1974 self.next.setup_search(captures_slot_len); 1975 } 1976 } 1977 1978 /// A set of active states used to "simulate" the execution of an NFA via the 1979 /// PikeVM. 1980 /// 1981 /// There are two sets of these used during NFA simulation. One set corresponds 1982 /// to the "current" set of states being traversed for the current position 1983 /// in a haystack. The other set corresponds to the "next" set of states being 1984 /// built, which will become the new "current" set for the next position in the 1985 /// haystack. These two sets correspond to CLIST and NLIST in Thompson's 1986 /// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387 1987 /// 1988 /// In addition to representing a set of NFA states, this also maintains slot 1989 /// values for each state. These slot values are what turn the NFA simulation 1990 /// into the "Pike VM." Namely, they track capturing group values for each 1991 /// state. During the computation of epsilon closure, we copy slot values from 1992 /// states in the "current" set to the "next" set. Eventually, once a match 1993 /// is found, the slot values for that match state are what we write to the 1994 /// caller provided 'Captures' value. 1995 #[derive(Clone, Debug)] 1996 struct ActiveStates { 1997 /// The set of active NFA states. This set preserves insertion order, which 1998 /// is critical for simulating the match semantics of backtracking regex 1999 /// engines. 2000 set: SparseSet, 2001 /// The slots for every NFA state, where each slot stores a (possibly 2002 /// absent) offset. Every capturing group has two slots. One for a start 2003 /// offset and one for an end offset. 2004 slot_table: SlotTable, 2005 } 2006 2007 impl ActiveStates { 2008 /// Create a new set of active states for the given PikeVM. The active 2009 /// states returned may only be used with the given PikeVM. (Use 'reset' 2010 /// to re-purpose the allocation for a different PikeVM.) new(re: &PikeVM) -> ActiveStates2011 fn new(re: &PikeVM) -> ActiveStates { 2012 let mut active = ActiveStates { 2013 set: SparseSet::new(0), 2014 slot_table: SlotTable::new(), 2015 }; 2016 active.reset(re); 2017 active 2018 } 2019 2020 /// Reset this set of active states such that it can be used with the given 2021 /// PikeVM (and only that PikeVM). reset(&mut self, re: &PikeVM)2022 fn reset(&mut self, re: &PikeVM) { 2023 self.set.resize(re.get_nfa().states().len()); 2024 self.slot_table.reset(re); 2025 } 2026 2027 /// Return the heap memory usage, in bytes, used by this set of active 2028 /// states. 2029 /// 2030 /// This does not include the stack size of this value. memory_usage(&self) -> usize2031 fn memory_usage(&self) -> usize { 2032 self.set.memory_usage() + self.slot_table.memory_usage() 2033 } 2034 2035 /// Setup this set of active states for a new search. The given slot 2036 /// length should be the number of slots in a caller provided 'Captures' 2037 /// (and may be zero). setup_search(&mut self, captures_slot_len: usize)2038 fn setup_search(&mut self, captures_slot_len: usize) { 2039 self.set.clear(); 2040 self.slot_table.setup_search(captures_slot_len); 2041 } 2042 } 2043 2044 /// A table of slots, where each row represent a state in an NFA. Thus, the 2045 /// table has room for storing slots for every single state in an NFA. 2046 /// 2047 /// This table is represented with a single contiguous allocation. In general, 2048 /// the notion of "capturing group" doesn't really exist at this level of 2049 /// abstraction, hence the name "slot" instead. (Indeed, every capturing group 2050 /// maps to a pair of slots, one for the start offset and one for the end 2051 /// offset.) Slots are indexed by the 'Captures' NFA state. 2052 /// 2053 /// N.B. Not every state actually needs a row of slots. Namely, states that 2054 /// only have epsilon transitions currently never have anything written to 2055 /// their rows in this table. Thus, the table is somewhat wasteful in its heap 2056 /// usage. However, it is important to maintain fast random access by state 2057 /// ID, which means one giant table tends to work well. RE2 takes a different 2058 /// approach here and allocates each row as its own reference counted thing. 2059 /// I explored such a strategy at one point here, but couldn't get it to work 2060 /// well using entirely safe code. (To the ambitious reader: I encourage you to 2061 /// re-litigate that experiment.) I very much wanted to stick to safe code, but 2062 /// could be convinced otherwise if there was a solid argument and the safety 2063 /// was encapsulated well. 2064 #[derive(Clone, Debug)] 2065 struct SlotTable { 2066 /// The actual table of offsets. 2067 table: Vec<Option<NonMaxUsize>>, 2068 /// The number of slots per state, i.e., the table's stride or the length 2069 /// of each row. 2070 slots_per_state: usize, 2071 /// The number of slots in the caller-provided 'Captures' value for the 2072 /// current search. Setting this to 'slots_per_state' is always correct, 2073 /// but may be wasteful. 2074 slots_for_captures: usize, 2075 } 2076 2077 impl SlotTable { 2078 /// Create a new slot table. 2079 /// 2080 /// One should call 'reset' with the corresponding PikeVM before use. new() -> SlotTable2081 fn new() -> SlotTable { 2082 SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } 2083 } 2084 2085 /// Reset this slot table such that it can be used with the given PikeVM 2086 /// (and only that PikeVM). reset(&mut self, re: &PikeVM)2087 fn reset(&mut self, re: &PikeVM) { 2088 let nfa = re.get_nfa(); 2089 self.slots_per_state = nfa.group_info().slot_len(); 2090 // This is always correct, but may be reduced for a particular search 2091 // if a 'Captures' has fewer slots, e.g., none at all or only slots 2092 // for tracking the overall match instead of all slots for every 2093 // group. 2094 self.slots_for_captures = core::cmp::max( 2095 self.slots_per_state, 2096 nfa.pattern_len().checked_mul(2).unwrap(), 2097 ); 2098 let len = nfa 2099 .states() 2100 .len() 2101 .checked_mul(self.slots_per_state) 2102 // Add space to account for scratch space used during a search. 2103 .and_then(|x| x.checked_add(self.slots_for_captures)) 2104 // It seems like this could actually panic on legitimate inputs on 2105 // 32-bit targets, and very likely to panic on 16-bit. Should we 2106 // somehow convert this to an error? What about something similar 2107 // for the lazy DFA cache? If you're tripping this assert, please 2108 // file a bug. 2109 .expect("slot table length doesn't overflow"); 2110 // This happens about as often as a regex is compiled, so it probably 2111 // should be at debug level, but I found it quite distracting and not 2112 // particularly useful. 2113 trace!( 2114 "resizing PikeVM active states table to {} entries \ 2115 (slots_per_state={})", 2116 len, 2117 self.slots_per_state, 2118 ); 2119 self.table.resize(len, None); 2120 } 2121 2122 /// Return the heap memory usage, in bytes, used by this slot table. 2123 /// 2124 /// This does not include the stack size of this value. memory_usage(&self) -> usize2125 fn memory_usage(&self) -> usize { 2126 self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>() 2127 } 2128 2129 /// Perform any per-search setup for this slot table. 2130 /// 2131 /// In particular, this sets the length of the number of slots used in the 2132 /// 'Captures' given by the caller (if any at all). This number may be 2133 /// smaller than the total number of slots available, e.g., when the caller 2134 /// is only interested in tracking the overall match and not the spans of 2135 /// every matching capturing group. Only tracking the overall match can 2136 /// save a substantial amount of time copying capturing spans during a 2137 /// search. setup_search(&mut self, captures_slot_len: usize)2138 fn setup_search(&mut self, captures_slot_len: usize) { 2139 self.slots_for_captures = captures_slot_len; 2140 } 2141 2142 /// Return a mutable slice of the slots for the given state. 2143 /// 2144 /// Note that the length of the slice returned may be less than the total 2145 /// number of slots available for this state. In particular, the length 2146 /// always matches the number of slots indicated via 'setup_search'. for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>]2147 fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { 2148 let i = sid.as_usize() * self.slots_per_state; 2149 &mut self.table[i..i + self.slots_for_captures] 2150 } 2151 2152 /// Return a slice of slots of appropriate length where every slot offset 2153 /// is guaranteed to be absent. This is useful in cases where you need to 2154 /// compute an epsilon closure outside of the user supplied regex, and thus 2155 /// never want it to have any capturing slots set. all_absent(&mut self) -> &mut [Option<NonMaxUsize>]2156 fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { 2157 let i = self.table.len() - self.slots_for_captures; 2158 &mut self.table[i..i + self.slots_for_captures] 2159 } 2160 } 2161 2162 /// Represents a stack frame for use while computing an epsilon closure. 2163 /// 2164 /// (An "epsilon closure" refers to the set of reachable NFA states from a 2165 /// single state without consuming any input. That is, the set of all epsilon 2166 /// transitions not only from that single state, but from every other state 2167 /// reachable by an epsilon transition as well. This is why it's called a 2168 /// "closure." Computing an epsilon closure is also done during DFA 2169 /// determinization! Compare and contrast the epsilon closure here in this 2170 /// PikeVM and the one used for determinization in crate::util::determinize.) 2171 /// 2172 /// Computing the epsilon closure in a Thompson NFA proceeds via a depth 2173 /// first traversal over all epsilon transitions from a particular state. 2174 /// (A depth first traversal is important because it emulates the same priority 2175 /// of matches that is typically found in backtracking regex engines.) This 2176 /// depth first traversal is naturally expressed using recursion, but to avoid 2177 /// a call stack size proportional to the size of a regex, we put our stack on 2178 /// the heap instead. 2179 /// 2180 /// This stack thus consists of call frames. The typical call frame is 2181 /// `Explore`, which instructs epsilon closure to explore the epsilon 2182 /// transitions from that state. (Subsequent epsilon transitions are then 2183 /// pushed on to the stack as more `Explore` frames.) If the state ID being 2184 /// explored has no epsilon transitions, then the capturing group slots are 2185 /// copied from the original state that sparked the epsilon closure (from the 2186 /// 'step' routine) to the state ID being explored. This way, capturing group 2187 /// slots are forwarded from the previous state to the next. 2188 /// 2189 /// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to 2190 /// set the position for a particular slot back to some particular offset. This 2191 /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will 2192 /// set the offset of the slot indicated in `Capture` to the current offset, 2193 /// and then push the old offset on to the stack as a `RestoreCapture` frame. 2194 /// Thus, the new offset is only used until the epsilon closure reverts back to 2195 /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon 2196 /// transition its "scope" to only states that come "after" it during depth 2197 /// first traversal. 2198 #[derive(Clone, Debug)] 2199 enum FollowEpsilon { 2200 /// Explore the epsilon transitions from a state ID. 2201 Explore(StateID), 2202 /// Reset the given `slot` to the given `offset` (which might be `None`). 2203 RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, 2204 } 2205 2206 /// A set of counters that "instruments" a PikeVM search. To enable this, you 2207 /// must enable the 'internal-instrument-pikevm' feature. Then run your Rust 2208 /// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in 2209 /// the environment. The metrics collected will be dumped automatically for 2210 /// every search executed by the PikeVM. 2211 /// 2212 /// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an 2213 /// absolute decrease in wall-clock performance, even if the 'trace' log level 2214 /// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't 2215 /// enabled.) The main point of instrumentation is to get counts of various 2216 /// events that occur during the PikeVM's execution. 2217 /// 2218 /// This is a somewhat hacked together collection of metrics that are useful 2219 /// to gather from a PikeVM search. In particular, it lets us scrutinize the 2220 /// performance profile of a search beyond what general purpose profiling tools 2221 /// give us. Namely, we orient the profiling data around the specific states of 2222 /// the NFA. 2223 /// 2224 /// In other words, this lets us see which parts of the NFA graph are most 2225 /// frequently activated. This then provides direction for optimization 2226 /// opportunities. 2227 /// 2228 /// The really sad part about this is that it absolutely clutters up the PikeVM 2229 /// implementation. :'( Another approach would be to just manually add this 2230 /// code in whenever I want this kind of profiling data, but it's complicated 2231 /// and tedious enough that I went with this approach... for now. 2232 /// 2233 /// When instrumentation is enabled (which also turns on 'logging'), then a 2234 /// `Counters` is initialized for every search and `trace`'d just before the 2235 /// search returns to the caller. 2236 /// 2237 /// Tip: When debugging performance problems with the PikeVM, it's best to try 2238 /// to work with an NFA that is as small as possible. Otherwise the state graph 2239 /// is likely to be too big to digest. 2240 #[cfg(feature = "internal-instrument-pikevm")] 2241 #[derive(Clone, Debug)] 2242 struct Counters { 2243 /// The number of times the NFA is in a particular permutation of states. 2244 state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>, 2245 /// The number of times 'step' is called for a particular state ID (which 2246 /// indexes this array). 2247 steps: Vec<u64>, 2248 /// The number of times an epsilon closure was computed for a state. 2249 closures: Vec<u64>, 2250 /// The number of times a particular state ID is pushed on to a stack while 2251 /// computing an epsilon closure. 2252 stack_pushes: Vec<u64>, 2253 /// The number of times a particular state ID is inserted into a sparse set 2254 /// while computing an epsilon closure. 2255 set_inserts: Vec<u64>, 2256 } 2257 2258 #[cfg(feature = "internal-instrument-pikevm")] 2259 impl Counters { empty() -> Counters2260 fn empty() -> Counters { 2261 Counters { 2262 state_sets: alloc::collections::BTreeMap::new(), 2263 steps: vec![], 2264 closures: vec![], 2265 stack_pushes: vec![], 2266 set_inserts: vec![], 2267 } 2268 } 2269 reset(&mut self, nfa: &NFA)2270 fn reset(&mut self, nfa: &NFA) { 2271 let len = nfa.states().len(); 2272 2273 self.state_sets.clear(); 2274 2275 self.steps.clear(); 2276 self.steps.resize(len, 0); 2277 2278 self.closures.clear(); 2279 self.closures.resize(len, 0); 2280 2281 self.stack_pushes.clear(); 2282 self.stack_pushes.resize(len, 0); 2283 2284 self.set_inserts.clear(); 2285 self.set_inserts.resize(len, 0); 2286 } 2287 eprint(&self, nfa: &NFA)2288 fn eprint(&self, nfa: &NFA) { 2289 trace!("===== START PikeVM Instrumentation Output ====="); 2290 // We take the top-K most occurring state sets. Otherwise the output 2291 // is likely to be overwhelming. And we probably only care about the 2292 // most frequently occurring ones anyway. 2293 const LIMIT: usize = 20; 2294 let mut set_counts = 2295 self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>(); 2296 set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); 2297 trace!("## PikeVM frequency of state sets (top {})", LIMIT); 2298 for (set, count) in set_counts.iter().take(LIMIT) { 2299 trace!("{:?}: {}", set, count); 2300 } 2301 if set_counts.len() > LIMIT { 2302 trace!( 2303 "... {} sets omitted (out of {} total)", 2304 set_counts.len() - LIMIT, 2305 set_counts.len(), 2306 ); 2307 } 2308 2309 trace!(""); 2310 trace!("## PikeVM total frequency of events"); 2311 trace!( 2312 "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}", 2313 self.steps.iter().copied().sum::<u64>(), 2314 self.closures.iter().copied().sum::<u64>(), 2315 self.stack_pushes.iter().copied().sum::<u64>(), 2316 self.set_inserts.iter().copied().sum::<u64>(), 2317 ); 2318 2319 trace!(""); 2320 trace!("## PikeVM frequency of events broken down by state"); 2321 for sid in 0..self.steps.len() { 2322 trace!( 2323 "{:06}: steps: {}, closures: {}, \ 2324 stack-pushes: {}, set-inserts: {}", 2325 sid, 2326 self.steps[sid], 2327 self.closures[sid], 2328 self.stack_pushes[sid], 2329 self.set_inserts[sid], 2330 ); 2331 } 2332 2333 trace!(""); 2334 trace!("## NFA debug display"); 2335 trace!("{:?}", nfa); 2336 trace!("===== END PikeVM Instrumentation Output ====="); 2337 } 2338 record_state_set(&mut self, set: &SparseSet)2339 fn record_state_set(&mut self, set: &SparseSet) { 2340 let set = set.iter().collect::<Vec<StateID>>(); 2341 *self.state_sets.entry(set).or_insert(0) += 1; 2342 } 2343 record_step(&mut self, sid: StateID)2344 fn record_step(&mut self, sid: StateID) { 2345 self.steps[sid] += 1; 2346 } 2347 record_closure(&mut self, sid: StateID)2348 fn record_closure(&mut self, sid: StateID) { 2349 self.closures[sid] += 1; 2350 } 2351 record_stack_push(&mut self, sid: StateID)2352 fn record_stack_push(&mut self, sid: StateID) { 2353 self.stack_pushes[sid] += 1; 2354 } 2355 record_set_insert(&mut self, sid: StateID)2356 fn record_set_insert(&mut self, sid: StateID) { 2357 self.set_inserts[sid] += 1; 2358 } 2359 } 2360