1 /*!
2 An NFA backed Pike VM for executing regex searches with capturing groups.
3 
4 This module provides a [`PikeVM`] that works by simulating an NFA and
5 resolving all spans of capturing groups that participate in a match.
6 */
7 
8 #[cfg(feature = "internal-instrument-pikevm")]
9 use core::cell::RefCell;
10 
11 use alloc::{vec, vec::Vec};
12 
13 use crate::{
14     nfa::thompson::{self, BuildError, State, NFA},
15     util::{
16         captures::Captures,
17         empty, iter,
18         prefilter::Prefilter,
19         primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
20         search::{
21             Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
22         },
23         sparse_set::SparseSet,
24     },
25 };
26 
27 /// A simple macro for conditionally executing instrumentation logic when
28 /// the 'trace' log level is enabled. This is a compile-time no-op when the
29 /// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that
30 /// this makes it easier to avoid doing extra work when instrumentation isn't
31 /// enabled.
32 ///
33 /// This macro accepts a closure of type `|&mut Counters|`. The closure can
34 /// then increment counters (or whatever) in accordance with what one wants
35 /// to track.
36 macro_rules! instrument {
37     ($fun:expr) => {
38         #[cfg(feature = "internal-instrument-pikevm")]
39         {
40             let fun: &mut dyn FnMut(&mut Counters) = &mut $fun;
41             COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut()));
42         }
43     };
44 }
45 
46 #[cfg(feature = "internal-instrument-pikevm")]
47 std::thread_local! {
48     /// Effectively global state used to keep track of instrumentation
49     /// counters. The "proper" way to do this is to thread it through the
50     /// PikeVM, but it makes the code quite icky. Since this is just a
51     /// debugging feature, we're content to relegate it to thread local
52     /// state. When instrumentation is enabled, the counters are reset at the
53     /// beginning of every search and printed (with the 'trace' log level) at
54     /// the end of every search.
55     static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
56 }
57 
58 /// The configuration used for building a [`PikeVM`].
59 ///
60 /// A PikeVM configuration is a simple data object that is typically used with
61 /// [`Builder::configure`]. It can be cheaply cloned.
62 ///
63 /// A default configuration can be created either with `Config::new`, or
64 /// perhaps more conveniently, with [`PikeVM::config`].
65 #[derive(Clone, Debug, Default)]
66 pub struct Config {
67     match_kind: Option<MatchKind>,
68     pre: Option<Option<Prefilter>>,
69 }
70 
71 impl Config {
72     /// Return a new default PikeVM configuration.
new() -> Config73     pub fn new() -> Config {
74         Config::default()
75     }
76 
77     /// Set the desired match semantics.
78     ///
79     /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
80     /// match semantics of Perl-like regex engines. That is, when multiple
81     /// patterns would match at the same leftmost position, the pattern that
82     /// appears first in the concrete syntax is chosen.
83     ///
84     /// Currently, the only other kind of match semantics supported is
85     /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
86     /// where all possible matches are visited in the NFA by the `PikeVM`.
87     ///
88     /// Typically, `All` is used when one wants to execute an overlapping
89     /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
90     /// sense to use `All` with the various "leftmost" find routines, since the
91     /// leftmost routines depend on the `LeftmostFirst` automata construction
92     /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM`
93     /// simulating dead states as a way to terminate the search and report a
94     /// match. `LeftmostFirst` also supports non-greedy matches using this
95     /// strategy where as `All` does not.
match_kind(mut self, kind: MatchKind) -> Config96     pub fn match_kind(mut self, kind: MatchKind) -> Config {
97         self.match_kind = Some(kind);
98         self
99     }
100 
101     /// Set a prefilter to be used whenever a start state is entered.
102     ///
103     /// A [`Prefilter`] in this context is meant to accelerate searches by
104     /// looking for literal prefixes that every match for the corresponding
105     /// pattern (or patterns) must start with. Once a prefilter produces a
106     /// match, the underlying search routine continues on to try and confirm
107     /// the match.
108     ///
109     /// Be warned that setting a prefilter does not guarantee that the search
110     /// will be faster. While it's usually a good bet, if the prefilter
111     /// produces a lot of false positive candidates (i.e., positions matched
112     /// by the prefilter but not by the regex), then the overall result can
113     /// be slower than if you had just executed the regex engine without any
114     /// prefilters.
115     ///
116     /// By default no prefilter is set.
117     ///
118     /// # Example
119     ///
120     /// ```
121     /// use regex_automata::{
122     ///     nfa::thompson::pikevm::PikeVM,
123     ///     util::prefilter::Prefilter,
124     ///     Input, Match, MatchKind,
125     /// };
126     ///
127     /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
128     /// let re = PikeVM::builder()
129     ///     .configure(PikeVM::config().prefilter(pre))
130     ///     .build(r"(foo|bar)[a-z]+")?;
131     /// let mut cache = re.create_cache();
132     /// let input = Input::new("foo1 barfox bar");
133     /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input));
134     ///
135     /// # Ok::<(), Box<dyn std::error::Error>>(())
136     /// ```
137     ///
138     /// Be warned though that an incorrect prefilter can lead to incorrect
139     /// results!
140     ///
141     /// ```
142     /// use regex_automata::{
143     ///     nfa::thompson::pikevm::PikeVM,
144     ///     util::prefilter::Prefilter,
145     ///     Input, HalfMatch, MatchKind,
146     /// };
147     ///
148     /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
149     /// let re = PikeVM::builder()
150     ///     .configure(PikeVM::config().prefilter(pre))
151     ///     .build(r"(foo|bar)[a-z]+")?;
152     /// let mut cache = re.create_cache();
153     /// let input = Input::new("foo1 barfox bar");
154     /// // No match reported even though there clearly is one!
155     /// assert_eq!(None, re.find(&mut cache, input));
156     ///
157     /// # Ok::<(), Box<dyn std::error::Error>>(())
158     /// ```
prefilter(mut self, pre: Option<Prefilter>) -> Config159     pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
160         self.pre = Some(pre);
161         self
162     }
163 
164     /// Returns the match semantics set in this configuration.
get_match_kind(&self) -> MatchKind165     pub fn get_match_kind(&self) -> MatchKind {
166         self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
167     }
168 
169     /// Returns the prefilter set in this configuration, if one at all.
get_prefilter(&self) -> Option<&Prefilter>170     pub fn get_prefilter(&self) -> Option<&Prefilter> {
171         self.pre.as_ref().unwrap_or(&None).as_ref()
172     }
173 
174     /// Overwrite the default configuration such that the options in `o` are
175     /// always used. If an option in `o` is not set, then the corresponding
176     /// option in `self` is used. If it's not set in `self` either, then it
177     /// remains not set.
overwrite(&self, o: Config) -> Config178     pub(crate) fn overwrite(&self, o: Config) -> Config {
179         Config {
180             match_kind: o.match_kind.or(self.match_kind),
181             pre: o.pre.or_else(|| self.pre.clone()),
182         }
183     }
184 }
185 
186 /// A builder for a `PikeVM`.
187 ///
188 /// This builder permits configuring options for the syntax of a pattern,
189 /// the NFA construction and the `PikeVM` construction. This builder is
190 /// different from a general purpose regex builder in that it permits fine
191 /// grain configuration of the construction process. The trade off for this is
192 /// complexity, and the possibility of setting a configuration that might not
193 /// make sense. For example, there are two different UTF-8 modes:
194 ///
195 /// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8)
196 /// controls whether the pattern itself can contain sub-expressions that match
197 /// invalid UTF-8.
198 /// * [`thompson::Config::utf8`] controls whether empty matches that split a
199 /// Unicode codepoint are reported or not.
200 ///
201 /// Generally speaking, callers will want to either enable all of these or
202 /// disable all of these.
203 ///
204 /// # Example
205 ///
206 /// This example shows how to disable UTF-8 mode in the syntax and the regex
207 /// itself. This is generally what you want for matching on arbitrary bytes.
208 ///
209 /// ```
210 /// use regex_automata::{
211 ///     nfa::thompson::{self, pikevm::PikeVM},
212 ///     util::syntax,
213 ///     Match,
214 /// };
215 ///
216 /// let re = PikeVM::builder()
217 ///     .syntax(syntax::Config::new().utf8(false))
218 ///     .thompson(thompson::Config::new().utf8(false))
219 ///     .build(r"foo(?-u:[^b])ar.*")?;
220 /// let mut cache = re.create_cache();
221 ///
222 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
223 /// let expected = Some(Match::must(0, 1..9));
224 /// let got = re.find_iter(&mut cache, haystack).next();
225 /// assert_eq!(expected, got);
226 /// // Notice that `(?-u:[^b])` matches invalid UTF-8,
227 /// // but the subsequent `.*` does not! Disabling UTF-8
228 /// // on the syntax permits this.
229 /// //
230 /// // N.B. This example does not show the impact of
231 /// // disabling UTF-8 mode on a PikeVM Config, since that
232 /// // only impacts regexes that can produce matches of
233 /// // length 0.
234 /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
235 ///
236 /// # Ok::<(), Box<dyn std::error::Error>>(())
237 /// ```
238 #[derive(Clone, Debug)]
239 pub struct Builder {
240     config: Config,
241     #[cfg(feature = "syntax")]
242     thompson: thompson::Compiler,
243 }
244 
245 impl Builder {
246     /// Create a new PikeVM builder with its default configuration.
new() -> Builder247     pub fn new() -> Builder {
248         Builder {
249             config: Config::default(),
250             #[cfg(feature = "syntax")]
251             thompson: thompson::Compiler::new(),
252         }
253     }
254 
255     /// Build a `PikeVM` from the given pattern.
256     ///
257     /// If there was a problem parsing or compiling the pattern, then an error
258     /// is returned.
259     #[cfg(feature = "syntax")]
build(&self, pattern: &str) -> Result<PikeVM, BuildError>260     pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> {
261         self.build_many(&[pattern])
262     }
263 
264     /// Build a `PikeVM` from the given patterns.
265     #[cfg(feature = "syntax")]
build_many<P: AsRef<str>>( &self, patterns: &[P], ) -> Result<PikeVM, BuildError>266     pub fn build_many<P: AsRef<str>>(
267         &self,
268         patterns: &[P],
269     ) -> Result<PikeVM, BuildError> {
270         let nfa = self.thompson.build_many(patterns)?;
271         self.build_from_nfa(nfa)
272     }
273 
274     /// Build a `PikeVM` directly from its NFA.
275     ///
276     /// Note that when using this method, any configuration that applies to the
277     /// construction of the NFA itself will of course be ignored, since the NFA
278     /// given here is already built.
build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError>279     pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> {
280         nfa.look_set_any().available().map_err(BuildError::word)?;
281         Ok(PikeVM { config: self.config.clone(), nfa })
282     }
283 
284     /// Apply the given `PikeVM` configuration options to this builder.
configure(&mut self, config: Config) -> &mut Builder285     pub fn configure(&mut self, config: Config) -> &mut Builder {
286         self.config = self.config.overwrite(config);
287         self
288     }
289 
290     /// Set the syntax configuration for this builder using
291     /// [`syntax::Config`](crate::util::syntax::Config).
292     ///
293     /// This permits setting things like case insensitivity, Unicode and multi
294     /// line mode.
295     ///
296     /// These settings only apply when constructing a PikeVM directly from a
297     /// pattern.
298     #[cfg(feature = "syntax")]
syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder299     pub fn syntax(
300         &mut self,
301         config: crate::util::syntax::Config,
302     ) -> &mut Builder {
303         self.thompson.syntax(config);
304         self
305     }
306 
307     /// Set the Thompson NFA configuration for this builder using
308     /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
309     ///
310     /// This permits setting things like if additional time should be spent
311     /// shrinking the size of the NFA.
312     ///
313     /// These settings only apply when constructing a PikeVM directly from a
314     /// pattern.
315     #[cfg(feature = "syntax")]
thompson(&mut self, config: thompson::Config) -> &mut Builder316     pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
317         self.thompson.configure(config);
318         self
319     }
320 }
321 
322 /// A virtual machine for executing regex searches with capturing groups.
323 ///
324 /// # Infallible APIs
325 ///
326 /// Unlike most other regex engines in this crate, a `PikeVM` never returns an
327 /// error at search time. It supports all [`Anchored`] configurations, never
328 /// quits and works on haystacks of arbitrary length.
329 ///
330 /// There are two caveats to mention though:
331 ///
332 /// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`],
333 /// then the PikeVM will report "no match." This is consistent with all other
334 /// regex engines in this crate.
335 /// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`]
336 /// that has insufficient capacity to store all valid pattern IDs, then if a
337 /// match occurs for a `PatternID` that cannot be inserted, it is silently
338 /// dropped as if it did not match.
339 ///
340 /// # Advice
341 ///
342 /// The `PikeVM` is generally the most "powerful" regex engine in this crate.
343 /// "Powerful" in this context means that it can handle any regular expression
344 /// that is parseable by `regex-syntax` and any size haystack. Regretably,
345 /// the `PikeVM` is also simultaneously often the _slowest_ regex engine in
346 /// practice. This results in an annoying situation where one generally tries
347 /// to pick any other regex engine (or perhaps none at all) before being
348 /// forced to fall back to a `PikeVM`.
349 ///
350 /// For example, a common strategy for dealing with capturing groups is to
351 /// actually look for the overall match of the regex using a faster regex
352 /// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall
353 /// match is found, one can then run the `PikeVM` on just the match span to
354 /// find the spans of the capturing groups. In this way, the faster regex
355 /// engine does the majority of the work, while the `PikeVM` only lends its
356 /// power in a more limited role.
357 ///
358 /// Unfortunately, this isn't always possible because the faster regex engines
359 /// don't support all of the regex features in `regex-syntax`. This notably
360 /// includes (and is currently limited to) Unicode word boundaries. So if
361 /// your pattern has Unicode word boundaries, you typically can't use a
362 /// DFA-based regex engine at all (unless you [enable heuristic support for
363 /// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass
364 /// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for
365 /// anchored searches only, but in a cruel sort of joke, many Unicode features
366 /// tend to result in making the regex _not_ one-pass.)
367 ///
368 /// # Example
369 ///
370 /// This example shows that the `PikeVM` implements Unicode word boundaries
371 /// correctly by default.
372 ///
373 /// ```
374 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
375 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
376 ///
377 /// let re = PikeVM::new(r"\b\w+\b")?;
378 /// let mut cache = re.create_cache();
379 ///
380 /// let mut it = re.find_iter(&mut cache, "Шерлок Холмс");
381 /// assert_eq!(Some(Match::must(0, 0..12)), it.next());
382 /// assert_eq!(Some(Match::must(0, 13..23)), it.next());
383 /// assert_eq!(None, it.next());
384 /// # Ok::<(), Box<dyn std::error::Error>>(())
385 /// ```
386 #[derive(Clone, Debug)]
387 pub struct PikeVM {
388     config: Config,
389     nfa: NFA,
390 }
391 
392 impl PikeVM {
393     /// Parse the given regular expression using the default configuration and
394     /// return the corresponding `PikeVM`.
395     ///
396     /// If you want a non-default configuration, then use the [`Builder`] to
397     /// set your own configuration.
398     ///
399     /// # Example
400     ///
401     /// ```
402     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
403     ///
404     /// let re = PikeVM::new("foo[0-9]+bar")?;
405     /// let mut cache = re.create_cache();
406     /// assert_eq!(
407     ///     Some(Match::must(0, 3..14)),
408     ///     re.find_iter(&mut cache, "zzzfoo12345barzzz").next(),
409     /// );
410     /// # Ok::<(), Box<dyn std::error::Error>>(())
411     /// ```
412     #[cfg(feature = "syntax")]
new(pattern: &str) -> Result<PikeVM, BuildError>413     pub fn new(pattern: &str) -> Result<PikeVM, BuildError> {
414         PikeVM::builder().build(pattern)
415     }
416 
417     /// Like `new`, but parses multiple patterns into a single "multi regex."
418     /// This similarly uses the default regex configuration.
419     ///
420     /// # Example
421     ///
422     /// ```
423     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
424     ///
425     /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?;
426     /// let mut cache = re.create_cache();
427     ///
428     /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
429     /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
430     /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
431     /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
432     /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
433     /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
434     /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
435     /// assert_eq!(None, it.next());
436     /// # Ok::<(), Box<dyn std::error::Error>>(())
437     /// ```
438     #[cfg(feature = "syntax")]
new_many<P: AsRef<str>>( patterns: &[P], ) -> Result<PikeVM, BuildError>439     pub fn new_many<P: AsRef<str>>(
440         patterns: &[P],
441     ) -> Result<PikeVM, BuildError> {
442         PikeVM::builder().build_many(patterns)
443     }
444 
445     /// Like `new`, but builds a PikeVM directly from an NFA. This is useful
446     /// if you already have an NFA, or even if you hand-assembled the NFA.
447     ///
448     /// # Example
449     ///
450     /// This shows how to hand assemble a regular expression via its HIR,
451     /// compile an NFA from it and build a PikeVM from the NFA.
452     ///
453     /// ```
454     /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
455     /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
456     ///
457     /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
458     ///     ClassBytesRange::new(b'0', b'9'),
459     ///     ClassBytesRange::new(b'A', b'Z'),
460     ///     ClassBytesRange::new(b'_', b'_'),
461     ///     ClassBytesRange::new(b'a', b'z'),
462     /// ])));
463     ///
464     /// let config = NFA::config().nfa_size_limit(Some(1_000));
465     /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
466     ///
467     /// let re = PikeVM::new_from_nfa(nfa)?;
468     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
469     /// let expected = Some(Match::must(0, 3..4));
470     /// re.captures(&mut cache, "!@#A#@!", &mut caps);
471     /// assert_eq!(expected, caps.get_match());
472     ///
473     /// # Ok::<(), Box<dyn std::error::Error>>(())
474     /// ```
new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError>475     pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> {
476         PikeVM::builder().build_from_nfa(nfa)
477     }
478 
479     /// Create a new `PikeVM` that matches every input.
480     ///
481     /// # Example
482     ///
483     /// ```
484     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
485     ///
486     /// let re = PikeVM::always_match()?;
487     /// let mut cache = re.create_cache();
488     ///
489     /// let expected = Match::must(0, 0..0);
490     /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next());
491     /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next());
492     /// # Ok::<(), Box<dyn std::error::Error>>(())
493     /// ```
always_match() -> Result<PikeVM, BuildError>494     pub fn always_match() -> Result<PikeVM, BuildError> {
495         let nfa = thompson::NFA::always_match();
496         PikeVM::new_from_nfa(nfa)
497     }
498 
499     /// Create a new `PikeVM` that never matches any input.
500     ///
501     /// # Example
502     ///
503     /// ```
504     /// use regex_automata::nfa::thompson::pikevm::PikeVM;
505     ///
506     /// let re = PikeVM::never_match()?;
507     /// let mut cache = re.create_cache();
508     ///
509     /// assert_eq!(None, re.find_iter(&mut cache, "").next());
510     /// assert_eq!(None, re.find_iter(&mut cache, "foo").next());
511     /// # Ok::<(), Box<dyn std::error::Error>>(())
512     /// ```
never_match() -> Result<PikeVM, BuildError>513     pub fn never_match() -> Result<PikeVM, BuildError> {
514         let nfa = thompson::NFA::never_match();
515         PikeVM::new_from_nfa(nfa)
516     }
517 
518     /// Return a default configuration for a `PikeVM`.
519     ///
520     /// This is a convenience routine to avoid needing to import the `Config`
521     /// type when customizing the construction of a `PikeVM`.
522     ///
523     /// # Example
524     ///
525     /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
526     /// disabled, zero-width matches that split a codepoint are allowed.
527     /// Otherwise they are never reported.
528     ///
529     /// In the code below, notice that `""` is permitted to match positions
530     /// that split the encoding of a codepoint.
531     ///
532     /// ```
533     /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match};
534     ///
535     /// let re = PikeVM::builder()
536     ///     .thompson(thompson::Config::new().utf8(false))
537     ///     .build(r"")?;
538     /// let mut cache = re.create_cache();
539     ///
540     /// let haystack = "a☃z";
541     /// let mut it = re.find_iter(&mut cache, haystack);
542     /// assert_eq!(Some(Match::must(0, 0..0)), it.next());
543     /// assert_eq!(Some(Match::must(0, 1..1)), it.next());
544     /// assert_eq!(Some(Match::must(0, 2..2)), it.next());
545     /// assert_eq!(Some(Match::must(0, 3..3)), it.next());
546     /// assert_eq!(Some(Match::must(0, 4..4)), it.next());
547     /// assert_eq!(Some(Match::must(0, 5..5)), it.next());
548     /// assert_eq!(None, it.next());
549     ///
550     /// # Ok::<(), Box<dyn std::error::Error>>(())
551     /// ```
config() -> Config552     pub fn config() -> Config {
553         Config::new()
554     }
555 
556     /// Return a builder for configuring the construction of a `PikeVM`.
557     ///
558     /// This is a convenience routine to avoid needing to import the
559     /// [`Builder`] type in common cases.
560     ///
561     /// # Example
562     ///
563     /// This example shows how to use the builder to disable UTF-8 mode
564     /// everywhere.
565     ///
566     /// ```
567     /// use regex_automata::{
568     ///     nfa::thompson::{self, pikevm::PikeVM},
569     ///     util::syntax,
570     ///     Match,
571     /// };
572     ///
573     /// let re = PikeVM::builder()
574     ///     .syntax(syntax::Config::new().utf8(false))
575     ///     .thompson(thompson::Config::new().utf8(false))
576     ///     .build(r"foo(?-u:[^b])ar.*")?;
577     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
578     ///
579     /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
580     /// let expected = Some(Match::must(0, 1..9));
581     /// re.captures(&mut cache, haystack, &mut caps);
582     /// assert_eq!(expected, caps.get_match());
583     ///
584     /// # Ok::<(), Box<dyn std::error::Error>>(())
585     /// ```
builder() -> Builder586     pub fn builder() -> Builder {
587         Builder::new()
588     }
589 
590     /// Create a new empty set of capturing groups that is guaranteed to be
591     /// valid for the search APIs on this `PikeVM`.
592     ///
593     /// A `Captures` value created for a specific `PikeVM` cannot be used with
594     /// any other `PikeVM`.
595     ///
596     /// This is a convenience function for [`Captures::all`]. See the
597     /// [`Captures`] documentation for an explanation of its alternative
598     /// constructors that permit the `PikeVM` to do less work during a search,
599     /// and thus might make it faster.
create_captures(&self) -> Captures600     pub fn create_captures(&self) -> Captures {
601         Captures::all(self.get_nfa().group_info().clone())
602     }
603 
604     /// Create a new cache for this `PikeVM`.
605     ///
606     /// The cache returned should only be used for searches for this
607     /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then
608     /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently,
609     /// [`PikeVM::reset_cache`]).
create_cache(&self) -> Cache610     pub fn create_cache(&self) -> Cache {
611         Cache::new(self)
612     }
613 
614     /// Reset the given cache such that it can be used for searching with the
615     /// this `PikeVM` (and only this `PikeVM`).
616     ///
617     /// A cache reset permits reusing memory already allocated in this cache
618     /// with a different `PikeVM`.
619     ///
620     /// # Example
621     ///
622     /// This shows how to re-purpose a cache for use with a different `PikeVM`.
623     ///
624     /// ```
625     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
626     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
627     ///
628     /// let re1 = PikeVM::new(r"\w")?;
629     /// let re2 = PikeVM::new(r"\W")?;
630     ///
631     /// let mut cache = re1.create_cache();
632     /// assert_eq!(
633     ///     Some(Match::must(0, 0..2)),
634     ///     re1.find_iter(&mut cache, "Δ").next(),
635     /// );
636     ///
637     /// // Using 'cache' with re2 is not allowed. It may result in panics or
638     /// // incorrect results. In order to re-purpose the cache, we must reset
639     /// // it with the PikeVM we'd like to use it with.
640     /// //
641     /// // Similarly, after this reset, using the cache with 're1' is also not
642     /// // allowed.
643     /// re2.reset_cache(&mut cache);
644     /// assert_eq!(
645     ///     Some(Match::must(0, 0..3)),
646     ///     re2.find_iter(&mut cache, "☃").next(),
647     /// );
648     ///
649     /// # Ok::<(), Box<dyn std::error::Error>>(())
650     /// ```
reset_cache(&self, cache: &mut Cache)651     pub fn reset_cache(&self, cache: &mut Cache) {
652         cache.reset(self);
653     }
654 
655     /// Returns the total number of patterns compiled into this `PikeVM`.
656     ///
657     /// In the case of a `PikeVM` that contains no patterns, this returns `0`.
658     ///
659     /// # Example
660     ///
661     /// This example shows the pattern length for a `PikeVM` that never
662     /// matches:
663     ///
664     /// ```
665     /// use regex_automata::nfa::thompson::pikevm::PikeVM;
666     ///
667     /// let re = PikeVM::never_match()?;
668     /// assert_eq!(re.pattern_len(), 0);
669     /// # Ok::<(), Box<dyn std::error::Error>>(())
670     /// ```
671     ///
672     /// And another example for a `PikeVM` that matches at every position:
673     ///
674     /// ```
675     /// use regex_automata::nfa::thompson::pikevm::PikeVM;
676     ///
677     /// let re = PikeVM::always_match()?;
678     /// assert_eq!(re.pattern_len(), 1);
679     /// # Ok::<(), Box<dyn std::error::Error>>(())
680     /// ```
681     ///
682     /// And finally, a `PikeVM` that was constructed from multiple patterns:
683     ///
684     /// ```
685     /// use regex_automata::nfa::thompson::pikevm::PikeVM;
686     ///
687     /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
688     /// assert_eq!(re.pattern_len(), 3);
689     /// # Ok::<(), Box<dyn std::error::Error>>(())
690     /// ```
pattern_len(&self) -> usize691     pub fn pattern_len(&self) -> usize {
692         self.nfa.pattern_len()
693     }
694 
695     /// Return the config for this `PikeVM`.
696     #[inline]
get_config(&self) -> &Config697     pub fn get_config(&self) -> &Config {
698         &self.config
699     }
700 
701     /// Returns a reference to the underlying NFA.
702     #[inline]
get_nfa(&self) -> &NFA703     pub fn get_nfa(&self) -> &NFA {
704         &self.nfa
705     }
706 }
707 
708 impl PikeVM {
709     /// Returns true if and only if this `PikeVM` matches the given haystack.
710     ///
711     /// This routine may short circuit if it knows that scanning future
712     /// input will never lead to a different result. In particular, if the
713     /// underlying NFA enters a match state, then this routine will return
714     /// `true` immediately without inspecting any future input. (Consider how
715     /// this might make a difference given the regex `a+` on the haystack
716     /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
717     /// but routines like `find` need to continue searching because `+` is
718     /// greedy by default.)
719     ///
720     /// # Example
721     ///
722     /// This shows basic usage:
723     ///
724     /// ```
725     /// use regex_automata::nfa::thompson::pikevm::PikeVM;
726     ///
727     /// let re = PikeVM::new("foo[0-9]+bar")?;
728     /// let mut cache = re.create_cache();
729     ///
730     /// assert!(re.is_match(&mut cache, "foo12345bar"));
731     /// assert!(!re.is_match(&mut cache, "foobar"));
732     /// # Ok::<(), Box<dyn std::error::Error>>(())
733     /// ```
734     ///
735     /// # Example: consistency with search APIs
736     ///
737     /// `is_match` is guaranteed to return `true` whenever `find` returns a
738     /// match. This includes searches that are executed entirely within a
739     /// codepoint:
740     ///
741     /// ```
742     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input};
743     ///
744     /// let re = PikeVM::new("a*")?;
745     /// let mut cache = re.create_cache();
746     ///
747     /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
748     /// # Ok::<(), Box<dyn std::error::Error>>(())
749     /// ```
750     ///
751     /// Notice that when UTF-8 mode is disabled, then the above reports a
752     /// match because the restriction against zero-width matches that split a
753     /// codepoint has been lifted:
754     ///
755     /// ```
756     /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input};
757     ///
758     /// let re = PikeVM::builder()
759     ///     .thompson(NFA::config().utf8(false))
760     ///     .build("a*")?;
761     /// let mut cache = re.create_cache();
762     ///
763     /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
764     /// # Ok::<(), Box<dyn std::error::Error>>(())
765     /// ```
766     #[inline]
is_match<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, ) -> bool767     pub fn is_match<'h, I: Into<Input<'h>>>(
768         &self,
769         cache: &mut Cache,
770         input: I,
771     ) -> bool {
772         let input = input.into().earliest(true);
773         self.search_slots(cache, &input, &mut []).is_some()
774     }
775 
776     /// Executes a leftmost forward search and returns a `Match` if one exists.
777     ///
778     /// This routine only includes the overall match span. To get access to the
779     /// individual spans of each capturing group, use [`PikeVM::captures`].
780     ///
781     /// # Example
782     ///
783     /// Leftmost first match semantics corresponds to the match with the
784     /// smallest starting offset, but where the end offset is determined by
785     /// preferring earlier branches in the original regular expression. For
786     /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
787     /// will match `Samwise` in `Samwise`.
788     ///
789     /// Generally speaking, the "leftmost first" match is how most backtracking
790     /// regular expressions tend to work. This is in contrast to POSIX-style
791     /// regular expressions that yield "leftmost longest" matches. Namely,
792     /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
793     /// leftmost longest semantics. (This crate does not currently support
794     /// leftmost longest semantics.)
795     ///
796     /// ```
797     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
798     ///
799     /// let re = PikeVM::new("foo[0-9]+")?;
800     /// let mut cache = re.create_cache();
801     /// let expected = Match::must(0, 0..8);
802     /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
803     ///
804     /// // Even though a match is found after reading the first byte (`a`),
805     /// // the leftmost first match semantics demand that we find the earliest
806     /// // match that prefers earlier parts of the pattern over later parts.
807     /// let re = PikeVM::new("abc|a")?;
808     /// let mut cache = re.create_cache();
809     /// let expected = Match::must(0, 0..3);
810     /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
811     ///
812     /// # Ok::<(), Box<dyn std::error::Error>>(())
813     /// ```
814     #[inline]
find<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, ) -> Option<Match>815     pub fn find<'h, I: Into<Input<'h>>>(
816         &self,
817         cache: &mut Cache,
818         input: I,
819     ) -> Option<Match> {
820         let input = input.into();
821         if self.get_nfa().pattern_len() == 1 {
822             let mut slots = [None, None];
823             let pid = self.search_slots(cache, &input, &mut slots)?;
824             let start = slots[0]?.get();
825             let end = slots[1]?.get();
826             return Some(Match::new(pid, Span { start, end }));
827         }
828         let ginfo = self.get_nfa().group_info();
829         let slots_len = ginfo.implicit_slot_len();
830         let mut slots = vec![None; slots_len];
831         let pid = self.search_slots(cache, &input, &mut slots)?;
832         let start = slots[pid.as_usize() * 2]?.get();
833         let end = slots[pid.as_usize() * 2 + 1]?.get();
834         Some(Match::new(pid, Span { start, end }))
835     }
836 
837     /// Executes a leftmost forward search and writes the spans of capturing
838     /// groups that participated in a match into the provided [`Captures`]
839     /// value. If no match was found, then [`Captures::is_match`] is guaranteed
840     /// to return `false`.
841     ///
842     /// # Example
843     ///
844     /// ```
845     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
846     ///
847     /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
848     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
849     ///
850     /// re.captures(&mut cache, "2010-03-14", &mut caps);
851     /// assert!(caps.is_match());
852     /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
853     /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
854     /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
855     ///
856     /// # Ok::<(), Box<dyn std::error::Error>>(())
857     /// ```
858     #[inline]
captures<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, caps: &mut Captures, )859     pub fn captures<'h, I: Into<Input<'h>>>(
860         &self,
861         cache: &mut Cache,
862         input: I,
863         caps: &mut Captures,
864     ) {
865         self.search(cache, &input.into(), caps)
866     }
867 
868     /// Returns an iterator over all non-overlapping leftmost matches in the
869     /// given bytes. If no match exists, then the iterator yields no elements.
870     ///
871     /// # Example
872     ///
873     /// ```
874     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
875     ///
876     /// let re = PikeVM::new("foo[0-9]+")?;
877     /// let mut cache = re.create_cache();
878     ///
879     /// let text = "foo1 foo12 foo123";
880     /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
881     /// assert_eq!(matches, vec![
882     ///     Match::must(0, 0..4),
883     ///     Match::must(0, 5..10),
884     ///     Match::must(0, 11..17),
885     /// ]);
886     /// # Ok::<(), Box<dyn std::error::Error>>(())
887     /// ```
888     #[inline]
find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h>889     pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
890         &'r self,
891         cache: &'c mut Cache,
892         input: I,
893     ) -> FindMatches<'r, 'c, 'h> {
894         let caps = Captures::matches(self.get_nfa().group_info().clone());
895         let it = iter::Searcher::new(input.into());
896         FindMatches { re: self, cache, caps, it }
897     }
898 
899     /// Returns an iterator over all non-overlapping `Captures` values. If no
900     /// match exists, then the iterator yields no elements.
901     ///
902     /// This yields the same matches as [`PikeVM::find_iter`], but it includes
903     /// the spans of all capturing groups that participate in each match.
904     ///
905     /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
906     /// how to correctly iterate over all matches in a haystack while avoiding
907     /// the creation of a new `Captures` value for every match. (Which you are
908     /// forced to do with an `Iterator`.)
909     ///
910     /// # Example
911     ///
912     /// ```
913     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
914     ///
915     /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
916     /// let mut cache = re.create_cache();
917     ///
918     /// let text = "foo1 foo12 foo123";
919     /// let matches: Vec<Span> = re
920     ///     .captures_iter(&mut cache, text)
921     ///     // The unwrap is OK since 'numbers' matches if the pattern matches.
922     ///     .map(|caps| caps.get_group_by_name("numbers").unwrap())
923     ///     .collect();
924     /// assert_eq!(matches, vec![
925     ///     Span::from(3..4),
926     ///     Span::from(8..10),
927     ///     Span::from(14..17),
928     /// ]);
929     /// # Ok::<(), Box<dyn std::error::Error>>(())
930     /// ```
931     #[inline]
captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, input: I, ) -> CapturesMatches<'r, 'c, 'h>932     pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
933         &'r self,
934         cache: &'c mut Cache,
935         input: I,
936     ) -> CapturesMatches<'r, 'c, 'h> {
937         let caps = self.create_captures();
938         let it = iter::Searcher::new(input.into());
939         CapturesMatches { re: self, cache, caps, it }
940     }
941 }
942 
943 impl PikeVM {
944     /// Executes a leftmost forward search and writes the spans of capturing
945     /// groups that participated in a match into the provided [`Captures`]
946     /// value. If no match was found, then [`Captures::is_match`] is guaranteed
947     /// to return `false`.
948     ///
949     /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input`
950     /// instead of an `Into<Input>`.
951     ///
952     /// # Example: specific pattern search
953     ///
954     /// This example shows how to build a multi-PikeVM that permits searching
955     /// for specific patterns.
956     ///
957     /// ```
958     /// use regex_automata::{
959     ///     nfa::thompson::pikevm::PikeVM,
960     ///     Anchored, Match, PatternID, Input,
961     /// };
962     ///
963     /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
964     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
965     /// let haystack = "foo123";
966     ///
967     /// // Since we are using the default leftmost-first match and both
968     /// // patterns match at the same starting position, only the first pattern
969     /// // will be returned in this case when doing a search for any of the
970     /// // patterns.
971     /// let expected = Some(Match::must(0, 0..6));
972     /// re.search(&mut cache, &Input::new(haystack), &mut caps);
973     /// assert_eq!(expected, caps.get_match());
974     ///
975     /// // But if we want to check whether some other pattern matches, then we
976     /// // can provide its pattern ID.
977     /// let expected = Some(Match::must(1, 0..6));
978     /// let input = Input::new(haystack)
979     ///     .anchored(Anchored::Pattern(PatternID::must(1)));
980     /// re.search(&mut cache, &input, &mut caps);
981     /// assert_eq!(expected, caps.get_match());
982     ///
983     /// # Ok::<(), Box<dyn std::error::Error>>(())
984     /// ```
985     ///
986     /// # Example: specifying the bounds of a search
987     ///
988     /// This example shows how providing the bounds of a search can produce
989     /// different results than simply sub-slicing the haystack.
990     ///
991     /// ```
992     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
993     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
994     ///
995     /// let re = PikeVM::new(r"\b[0-9]{3}\b")?;
996     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
997     /// let haystack = "foo123bar";
998     ///
999     /// // Since we sub-slice the haystack, the search doesn't know about
1000     /// // the larger context and assumes that `123` is surrounded by word
1001     /// // boundaries. And of course, the match position is reported relative
1002     /// // to the sub-slice as well, which means we get `0..3` instead of
1003     /// // `3..6`.
1004     /// let expected = Some(Match::must(0, 0..3));
1005     /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps);
1006     /// assert_eq!(expected, caps.get_match());
1007     ///
1008     /// // But if we provide the bounds of the search within the context of the
1009     /// // entire haystack, then the search can take the surrounding context
1010     /// // into account. (And if we did find a match, it would be reported
1011     /// // as a valid offset into `haystack` instead of its sub-slice.)
1012     /// let expected = None;
1013     /// let input = Input::new(haystack).range(3..6);
1014     /// re.search(&mut cache, &input, &mut caps);
1015     /// assert_eq!(expected, caps.get_match());
1016     ///
1017     /// # Ok::<(), Box<dyn std::error::Error>>(())
1018     /// ```
1019     #[inline]
search( &self, cache: &mut Cache, input: &Input<'_>, caps: &mut Captures, )1020     pub fn search(
1021         &self,
1022         cache: &mut Cache,
1023         input: &Input<'_>,
1024         caps: &mut Captures,
1025     ) {
1026         caps.set_pattern(None);
1027         let pid = self.search_slots(cache, input, caps.slots_mut());
1028         caps.set_pattern(pid);
1029     }
1030 
1031     /// Executes a leftmost forward search and writes the spans of capturing
1032     /// groups that participated in a match into the provided `slots`, and
1033     /// returns the matching pattern ID. The contents of the slots for patterns
1034     /// other than the matching pattern are unspecified. If no match was found,
1035     /// then `None` is returned and the contents of `slots` is unspecified.
1036     ///
1037     /// This is like [`PikeVM::search`], but it accepts a raw slots slice
1038     /// instead of a `Captures` value. This is useful in contexts where you
1039     /// don't want or need to allocate a `Captures`.
1040     ///
1041     /// It is legal to pass _any_ number of slots to this routine. If the regex
1042     /// engine would otherwise write a slot offset that doesn't fit in the
1043     /// provided slice, then it is simply skipped. In general though, there are
1044     /// usually three slice lengths you might want to use:
1045     ///
1046     /// * An empty slice, if you only care about which pattern matched.
1047     /// * A slice with
1048     /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
1049     /// slots, if you only care about the overall match spans for each matching
1050     /// pattern.
1051     /// * A slice with
1052     /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1053     /// permits recording match offsets for every capturing group in every
1054     /// pattern.
1055     ///
1056     /// # Example
1057     ///
1058     /// This example shows how to find the overall match offsets in a
1059     /// multi-pattern search without allocating a `Captures` value. Indeed, we
1060     /// can put our slots right on the stack.
1061     ///
1062     /// ```
1063     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1064     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input};
1065     ///
1066     /// let re = PikeVM::new_many(&[
1067     ///     r"\pL+",
1068     ///     r"\d+",
1069     /// ])?;
1070     /// let mut cache = re.create_cache();
1071     /// let input = Input::new("!@#123");
1072     ///
1073     /// // We only care about the overall match offsets here, so we just
1074     /// // allocate two slots for each pattern. Each slot records the start
1075     /// // and end of the match.
1076     /// let mut slots = [None; 4];
1077     /// let pid = re.search_slots(&mut cache, &input, &mut slots);
1078     /// assert_eq!(Some(PatternID::must(1)), pid);
1079     ///
1080     /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1081     /// // See 'GroupInfo' for more details on the mapping between groups and
1082     /// // slot indices.
1083     /// let slot_start = pid.unwrap().as_usize() * 2;
1084     /// let slot_end = slot_start + 1;
1085     /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
1086     /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
1087     ///
1088     /// # Ok::<(), Box<dyn std::error::Error>>(())
1089     /// ```
1090     #[inline]
search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<PatternID>1091     pub fn search_slots(
1092         &self,
1093         cache: &mut Cache,
1094         input: &Input<'_>,
1095         slots: &mut [Option<NonMaxUsize>],
1096     ) -> Option<PatternID> {
1097         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1098         if !utf8empty {
1099             let hm = self.search_slots_imp(cache, input, slots)?;
1100             return Some(hm.pattern());
1101         }
1102         // There is an unfortunate special case where if the regex can
1103         // match the empty string and UTF-8 mode is enabled, the search
1104         // implementation requires that the slots have at least as much space
1105         // to report the bounds of any match. This is so zero-width matches
1106         // that split a codepoint can be filtered out.
1107         //
1108         // Note that if utf8empty is true, we specialize the case for when
1109         // the number of patterns is 1. In that case, we can just use a stack
1110         // allocation. Otherwise we resort to a heap allocation, which we
1111         // convince ourselves we're fine with due to the pathological nature of
1112         // this case.
1113         let min = self.get_nfa().group_info().implicit_slot_len();
1114         if slots.len() >= min {
1115             let hm = self.search_slots_imp(cache, input, slots)?;
1116             return Some(hm.pattern());
1117         }
1118         if self.get_nfa().pattern_len() == 1 {
1119             let mut enough = [None, None];
1120             let got = self.search_slots_imp(cache, input, &mut enough);
1121             // This is OK because we know `enough` is strictly bigger than
1122             // `slots`, otherwise this special case isn't reached.
1123             slots.copy_from_slice(&enough[..slots.len()]);
1124             return got.map(|hm| hm.pattern());
1125         }
1126         let mut enough = vec![None; min];
1127         let got = self.search_slots_imp(cache, input, &mut enough);
1128         // This is OK because we know `enough` is strictly bigger than `slots`,
1129         // otherwise this special case isn't reached.
1130         slots.copy_from_slice(&enough[..slots.len()]);
1131         got.map(|hm| hm.pattern())
1132     }
1133 
1134     /// This is the actual implementation of `search_slots_imp` that
1135     /// doesn't account for the special case when 1) the NFA has UTF-8 mode
1136     /// enabled, 2) the NFA can match the empty string and 3) the caller has
1137     /// provided an insufficient number of slots to record match offsets.
1138     #[inline(never)]
search_slots_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch>1139     fn search_slots_imp(
1140         &self,
1141         cache: &mut Cache,
1142         input: &Input<'_>,
1143         slots: &mut [Option<NonMaxUsize>],
1144     ) -> Option<HalfMatch> {
1145         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1146         let hm = match self.search_imp(cache, input, slots) {
1147             None => return None,
1148             Some(hm) if !utf8empty => return Some(hm),
1149             Some(hm) => hm,
1150         };
1151         empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1152             Ok(self
1153                 .search_imp(cache, input, slots)
1154                 .map(|hm| (hm, hm.offset())))
1155         })
1156         // OK because the PikeVM never errors.
1157         .unwrap()
1158     }
1159 
1160     /// Writes the set of patterns that match anywhere in the given search
1161     /// configuration to `patset`. If multiple patterns match at the same
1162     /// position and this `PikeVM` was configured with [`MatchKind::All`]
1163     /// semantics, then all matching patterns are written to the given set.
1164     ///
1165     /// Unless all of the patterns in this `PikeVM` are anchored, then
1166     /// generally speaking, this will visit every byte in the haystack.
1167     ///
1168     /// This search routine *does not* clear the pattern set. This gives some
1169     /// flexibility to the caller (e.g., running multiple searches with the
1170     /// same pattern set), but does make the API bug-prone if you're reusing
1171     /// the same pattern set for multiple searches but intended them to be
1172     /// independent.
1173     ///
1174     /// If a pattern ID matched but the given `PatternSet` does not have
1175     /// sufficient capacity to store it, then it is not inserted and silently
1176     /// dropped.
1177     ///
1178     /// # Example
1179     ///
1180     /// This example shows how to find all matching patterns in a haystack,
1181     /// even when some patterns match at the same position as other patterns.
1182     ///
1183     /// ```
1184     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1185     /// use regex_automata::{
1186     ///     nfa::thompson::pikevm::PikeVM,
1187     ///     Input, MatchKind, PatternSet,
1188     /// };
1189     ///
1190     /// let patterns = &[
1191     ///     r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
1192     /// ];
1193     /// let re = PikeVM::builder()
1194     ///     .configure(PikeVM::config().match_kind(MatchKind::All))
1195     ///     .build_many(patterns)?;
1196     /// let mut cache = re.create_cache();
1197     ///
1198     /// let input = Input::new("foobar");
1199     /// let mut patset = PatternSet::new(re.pattern_len());
1200     /// re.which_overlapping_matches(&mut cache, &input, &mut patset);
1201     /// let expected = vec![0, 2, 3, 4, 6];
1202     /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
1203     /// assert_eq!(expected, got);
1204     ///
1205     /// # Ok::<(), Box<dyn std::error::Error>>(())
1206     /// ```
1207     #[inline]
which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, )1208     pub fn which_overlapping_matches(
1209         &self,
1210         cache: &mut Cache,
1211         input: &Input<'_>,
1212         patset: &mut PatternSet,
1213     ) {
1214         self.which_overlapping_imp(cache, input, patset)
1215     }
1216 }
1217 
1218 impl PikeVM {
1219     /// The implementation of standard leftmost search.
1220     ///
1221     /// Capturing group spans are written to `slots`, but only if requested.
1222     /// `slots` can be any length. Any slot in the NFA that is activated but
1223     /// which is out of bounds for the given `slots` is ignored.
search_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch>1224     fn search_imp(
1225         &self,
1226         cache: &mut Cache,
1227         input: &Input<'_>,
1228         slots: &mut [Option<NonMaxUsize>],
1229     ) -> Option<HalfMatch> {
1230         cache.setup_search(slots.len());
1231         if input.is_done() {
1232             return None;
1233         }
1234         // Why do we even care about this? Well, in our 'Captures'
1235         // representation, we use usize::MAX as a sentinel to indicate "no
1236         // match." This isn't problematic so long as our haystack doesn't have
1237         // a maximal length. Byte slices are guaranteed by Rust to have a
1238         // length that fits into isize, and so this assert should always pass.
1239         // But we put it here to make our assumption explicit.
1240         assert!(
1241             input.haystack().len() < core::usize::MAX,
1242             "byte slice lengths must be less than usize MAX",
1243         );
1244         instrument!(|c| c.reset(&self.nfa));
1245 
1246         // Whether we want to visit all match states instead of emulating the
1247         // 'leftmost' semantics of typical backtracking regex engines.
1248         let allmatches =
1249             self.config.get_match_kind().continue_past_first_match();
1250         let (anchored, start_id) = match self.start_config(input) {
1251             None => return None,
1252             Some(config) => config,
1253         };
1254 
1255         let pre =
1256             if anchored { None } else { self.get_config().get_prefilter() };
1257         let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
1258         let mut hm = None;
1259         // Yes, our search doesn't end at input.end(), but includes it. This
1260         // is necessary because matches are delayed by one byte, just like
1261         // how the DFA engines work. The delay is used to handle look-behind
1262         // assertions. In the case of the PikeVM, the delay is implemented
1263         // by not considering a match to exist until it is visited in
1264         // 'steps'. Technically, we know a match exists in the previous
1265         // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA
1266         // determinization. We don't mark a DFA state as a match state if it
1267         // contains an NFA match state, but rather, whether the DFA state was
1268         // generated by a transition from a DFA state that contains an NFA
1269         // match state.)
1270         let mut at = input.start();
1271         while at <= input.end() {
1272             // If we have no states left to visit, then there are some cases
1273             // where we know we can quit early or even skip ahead.
1274             if curr.set.is_empty() {
1275                 // We have a match and we haven't been instructed to continue
1276                 // on even after finding a match, so we can quit.
1277                 if hm.is_some() && !allmatches {
1278                     break;
1279                 }
1280                 // If we're running an anchored search and we've advanced
1281                 // beyond the start position with no other states to try, then
1282                 // we will never observe a match and thus can stop.
1283                 if anchored && at > input.start() {
1284                     break;
1285                 }
1286                 // If there no states left to explore at this position and we
1287                 // know we can't terminate early, then we are effectively at
1288                 // the starting state of the NFA. If we fell through here,
1289                 // we'd end up adding our '(?s-u:.)*?' prefix and it would be
1290                 // the only thing in 'curr'. So we might as well just skip
1291                 // ahead until we find something that we know might advance us
1292                 // forward.
1293                 if let Some(ref pre) = pre {
1294                     let span = Span::from(at..input.end());
1295                     match pre.find(input.haystack(), span) {
1296                         None => break,
1297                         Some(ref span) => at = span.start,
1298                     }
1299                 }
1300             }
1301             // Instead of using the NFA's unanchored start state, we actually
1302             // always use its anchored starting state. As a result, when doing
1303             // an unanchored search, we need to simulate our own '(?s-u:.)*?'
1304             // prefix, to permit a match to appear anywhere.
1305             //
1306             // Now, we don't *have* to do things this way. We could use the
1307             // NFA's unanchored starting state and do one 'epsilon_closure'
1308             // call from that starting state before the main loop here. And
1309             // that is just as correct. However, it turns out to be slower
1310             // than our approach here because it slightly increases the cost
1311             // of processing each byte by requiring us to visit more NFA
1312             // states to deal with the additional NFA states in the unanchored
1313             // prefix. By simulating it explicitly here, we lower those costs
1314             // substantially. The cost is itself small, but it adds up for
1315             // large haystacks.
1316             //
1317             // In order to simulate the '(?s-u:.)*?' prefix---which is not
1318             // greedy---we are careful not to perform an epsilon closure on
1319             // the start state if we already have a match. Namely, if we
1320             // did otherwise, we would never reach a terminating condition
1321             // because there would always be additional states to process.
1322             // In effect, the exclusion of running 'epsilon_closure' when
1323             // we have a match corresponds to the "dead" states we have in
1324             // our DFA regex engines. Namely, in a DFA, match states merely
1325             // instruct the search execution to record the current offset as
1326             // the most recently seen match. It is the dead state that actually
1327             // indicates when to stop the search (other than EOF or quit
1328             // states).
1329             //
1330             // However, when 'allmatches' is true, the caller has asked us to
1331             // leave in every possible match state. This tends not to make a
1332             // whole lot of sense in unanchored searches, because it means the
1333             // search really cannot terminate until EOF. And often, in that
1334             // case, you wind up skipping over a bunch of matches and are left
1335             // with the "last" match. Arguably, it just doesn't make a lot of
1336             // sense to run a 'leftmost' search (which is what this routine is)
1337             // with 'allmatches' set to true. But the DFAs support it and this
1338             // matches their behavior. (Generally, 'allmatches' is useful for
1339             // overlapping searches or leftmost anchored searches to find the
1340             // longest possible match by ignoring match priority.)
1341             //
1342             // Additionally, when we're running an anchored search, this
1343             // epsilon closure should only be computed at the beginning of the
1344             // search. If we re-computed it at every position, we would be
1345             // simulating an unanchored search when we were tasked to perform
1346             // an anchored search.
1347             if (!hm.is_some() || allmatches)
1348                 && (!anchored || at == input.start())
1349             {
1350                 // Since we are adding to the 'curr' active states and since
1351                 // this is for the start ID, we use a slots slice that is
1352                 // guaranteed to have the right length but where every element
1353                 // is absent. This is exactly what we want, because this
1354                 // epsilon closure is responsible for simulating an unanchored
1355                 // '(?s:.)*?' prefix. It is specifically outside of any
1356                 // capturing groups, and thus, using slots that are always
1357                 // absent is correct.
1358                 //
1359                 // Note though that we can't just use '&mut []' here, since
1360                 // this epsilon closure may traverse through 'Captures' epsilon
1361                 // transitions, and thus must be able to write offsets to the
1362                 // slots given which are later copied to slot values in 'curr'.
1363                 let slots = next.slot_table.all_absent();
1364                 self.epsilon_closure(stack, slots, curr, input, at, start_id);
1365             }
1366             if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
1367             {
1368                 hm = Some(HalfMatch::new(pid, at));
1369             }
1370             // Unless the caller asked us to return early, we need to mush on
1371             // to see if we can extend our match. (But note that 'nexts' will
1372             // quit right after seeing a match when match_kind==LeftmostFirst,
1373             // as is consistent with leftmost-first match priority.)
1374             if input.get_earliest() && hm.is_some() {
1375                 break;
1376             }
1377             core::mem::swap(curr, next);
1378             next.set.clear();
1379             at += 1;
1380         }
1381         instrument!(|c| c.eprint(&self.nfa));
1382         hm
1383     }
1384 
1385     /// The implementation for the 'which_overlapping_matches' API. Basically,
1386     /// we do a single scan through the entire haystack (unless our regex
1387     /// or search is anchored) and record every pattern that matched. In
1388     /// particular, when MatchKind::All is used, this supports overlapping
1389     /// matches. So if we have the regexes 'sam' and 'samwise', they will
1390     /// *both* be reported in the pattern set when searching the haystack
1391     /// 'samwise'.
which_overlapping_imp( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, )1392     fn which_overlapping_imp(
1393         &self,
1394         cache: &mut Cache,
1395         input: &Input<'_>,
1396         patset: &mut PatternSet,
1397     ) {
1398         // NOTE: This is effectively a copy of 'search_imp' above, but with no
1399         // captures support and instead writes patterns that matched directly
1400         // to 'patset'. See that routine for better commentary about what's
1401         // going on in this routine. We probably could unify the routines using
1402         // generics or more helper routines, but I'm not sure it's worth it.
1403         //
1404         // NOTE: We somewhat go out of our way here to support things like
1405         // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither
1406         // of those seem particularly relevant to this routine, but they are
1407         // both supported by the DFA analogs of this routine by construction
1408         // and composition, so it seems like good sense to have the PikeVM
1409         // match that behavior.
1410 
1411         cache.setup_search(0);
1412         if input.is_done() {
1413             return;
1414         }
1415         assert!(
1416             input.haystack().len() < core::usize::MAX,
1417             "byte slice lengths must be less than usize MAX",
1418         );
1419         instrument!(|c| c.reset(&self.nfa));
1420 
1421         let allmatches =
1422             self.config.get_match_kind().continue_past_first_match();
1423         let (anchored, start_id) = match self.start_config(input) {
1424             None => return,
1425             Some(config) => config,
1426         };
1427 
1428         let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
1429         for at in input.start()..=input.end() {
1430             let any_matches = !patset.is_empty();
1431             if curr.set.is_empty() {
1432                 if any_matches && !allmatches {
1433                     break;
1434                 }
1435                 if anchored && at > input.start() {
1436                     break;
1437                 }
1438             }
1439             if !any_matches || allmatches {
1440                 let slots = &mut [];
1441                 self.epsilon_closure(stack, slots, curr, input, at, start_id);
1442             }
1443             self.nexts_overlapping(stack, curr, next, input, at, patset);
1444             // If we found a match and filled our set, then there is no more
1445             // additional info that we can provide. Thus, we can quit. We also
1446             // quit if the caller asked us to stop at the earliest point that
1447             // we know a match exists.
1448             if patset.is_full() || input.get_earliest() {
1449                 break;
1450             }
1451             core::mem::swap(curr, next);
1452             next.set.clear();
1453         }
1454         instrument!(|c| c.eprint(&self.nfa));
1455     }
1456 
1457     /// Process the active states in 'curr' to find the states (written to
1458     /// 'next') we should process for the next byte in the haystack.
1459     ///
1460     /// 'stack' is used to perform a depth first traversal of the NFA when
1461     /// computing an epsilon closure.
1462     ///
1463     /// When a match is found, the slots for that match state (in 'curr') are
1464     /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
1465     /// stops (unless the PikeVM was configured with MatchKind::All semantics).
1466     #[cfg_attr(feature = "perf-inline", inline(always))]
nexts( &self, stack: &mut Vec<FollowEpsilon>, curr: &mut ActiveStates, next: &mut ActiveStates, input: &Input<'_>, at: usize, slots: &mut [Option<NonMaxUsize>], ) -> Option<PatternID>1467     fn nexts(
1468         &self,
1469         stack: &mut Vec<FollowEpsilon>,
1470         curr: &mut ActiveStates,
1471         next: &mut ActiveStates,
1472         input: &Input<'_>,
1473         at: usize,
1474         slots: &mut [Option<NonMaxUsize>],
1475     ) -> Option<PatternID> {
1476         instrument!(|c| c.record_state_set(&curr.set));
1477         let mut pid = None;
1478         let ActiveStates { ref set, ref mut slot_table } = *curr;
1479         for sid in set.iter() {
1480             pid = match self.next(stack, slot_table, next, input, at, sid) {
1481                 None => continue,
1482                 Some(pid) => Some(pid),
1483             };
1484             slots.copy_from_slice(slot_table.for_state(sid));
1485             if !self.config.get_match_kind().continue_past_first_match() {
1486                 break;
1487             }
1488         }
1489         pid
1490     }
1491 
1492     /// Like 'nexts', but for the overlapping case. This doesn't write any
1493     /// slots, and instead just writes which pattern matched in 'patset'.
1494     #[cfg_attr(feature = "perf-inline", inline(always))]
nexts_overlapping( &self, stack: &mut Vec<FollowEpsilon>, curr: &mut ActiveStates, next: &mut ActiveStates, input: &Input<'_>, at: usize, patset: &mut PatternSet, )1495     fn nexts_overlapping(
1496         &self,
1497         stack: &mut Vec<FollowEpsilon>,
1498         curr: &mut ActiveStates,
1499         next: &mut ActiveStates,
1500         input: &Input<'_>,
1501         at: usize,
1502         patset: &mut PatternSet,
1503     ) {
1504         instrument!(|c| c.record_state_set(&curr.set));
1505         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1506         let ActiveStates { ref set, ref mut slot_table } = *curr;
1507         for sid in set.iter() {
1508             let pid = match self.next(stack, slot_table, next, input, at, sid)
1509             {
1510                 None => continue,
1511                 Some(pid) => pid,
1512             };
1513             // This handles the case of finding a zero-width match that splits
1514             // a codepoint. Namely, if we're in UTF-8 mode AND we know we can
1515             // match the empty string, then the only valid way of getting to
1516             // this point with an offset that splits a codepoint is when we
1517             // have an empty match. Such matches, in UTF-8 mode, must not be
1518             // reported. So we just skip them here and pretend as if we did
1519             // not see a match.
1520             if utf8empty && !input.is_char_boundary(at) {
1521                 continue;
1522             }
1523             let _ = patset.try_insert(pid);
1524             if !self.config.get_match_kind().continue_past_first_match() {
1525                 break;
1526             }
1527         }
1528     }
1529 
1530     /// Starting from 'sid', if the position 'at' in the 'input' haystack has a
1531     /// transition defined out of 'sid', then add the state transitioned to and
1532     /// its epsilon closure to the 'next' set of states to explore.
1533     ///
1534     /// 'stack' is used by the epsilon closure computation to perform a depth
1535     /// first traversal of the NFA.
1536     ///
1537     /// 'curr_slot_table' should be the table of slots for the current set of
1538     /// states being explored. If there is a transition out of 'sid', then
1539     /// sid's row in the slot table is used to perform the epsilon closure.
1540     #[cfg_attr(feature = "perf-inline", inline(always))]
next( &self, stack: &mut Vec<FollowEpsilon>, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, input: &Input<'_>, at: usize, sid: StateID, ) -> Option<PatternID>1541     fn next(
1542         &self,
1543         stack: &mut Vec<FollowEpsilon>,
1544         curr_slot_table: &mut SlotTable,
1545         next: &mut ActiveStates,
1546         input: &Input<'_>,
1547         at: usize,
1548         sid: StateID,
1549     ) -> Option<PatternID> {
1550         instrument!(|c| c.record_step(sid));
1551         match *self.nfa.state(sid) {
1552             State::Fail
1553             | State::Look { .. }
1554             | State::Union { .. }
1555             | State::BinaryUnion { .. }
1556             | State::Capture { .. } => None,
1557             State::ByteRange { ref trans } => {
1558                 if trans.matches(input.haystack(), at) {
1559                     let slots = curr_slot_table.for_state(sid);
1560                     // OK because 'at <= haystack.len() < usize::MAX', so
1561                     // adding 1 will never wrap.
1562                     let at = at.wrapping_add(1);
1563                     self.epsilon_closure(
1564                         stack, slots, next, input, at, trans.next,
1565                     );
1566                 }
1567                 None
1568             }
1569             State::Sparse(ref sparse) => {
1570                 if let Some(next_sid) = sparse.matches(input.haystack(), at) {
1571                     let slots = curr_slot_table.for_state(sid);
1572                     // OK because 'at <= haystack.len() < usize::MAX', so
1573                     // adding 1 will never wrap.
1574                     let at = at.wrapping_add(1);
1575                     self.epsilon_closure(
1576                         stack, slots, next, input, at, next_sid,
1577                     );
1578                 }
1579                 None
1580             }
1581             State::Dense(ref dense) => {
1582                 if let Some(next_sid) = dense.matches(input.haystack(), at) {
1583                     let slots = curr_slot_table.for_state(sid);
1584                     // OK because 'at <= haystack.len() < usize::MAX', so
1585                     // adding 1 will never wrap.
1586                     let at = at.wrapping_add(1);
1587                     self.epsilon_closure(
1588                         stack, slots, next, input, at, next_sid,
1589                     );
1590                 }
1591                 None
1592             }
1593             State::Match { pattern_id } => Some(pattern_id),
1594         }
1595     }
1596 
1597     /// Compute the epsilon closure of 'sid', writing the closure into 'next'
1598     /// while copying slot values from 'curr_slots' into corresponding states
1599     /// in 'next'. 'curr_slots' should be the slot values corresponding to
1600     /// 'sid'.
1601     ///
1602     /// The given 'stack' is used to perform a depth first traversal of the
1603     /// NFA by recursively following all epsilon transitions out of 'sid'.
1604     /// Conditional epsilon transitions are followed if and only if they are
1605     /// satisfied for the position 'at' in the 'input' haystack.
1606     ///
1607     /// While this routine may write to 'curr_slots', once it returns, any
1608     /// writes are undone and the original values (even if absent) are
1609     /// restored.
1610     #[cfg_attr(feature = "perf-inline", inline(always))]
epsilon_closure( &self, stack: &mut Vec<FollowEpsilon>, curr_slots: &mut [Option<NonMaxUsize>], next: &mut ActiveStates, input: &Input<'_>, at: usize, sid: StateID, )1611     fn epsilon_closure(
1612         &self,
1613         stack: &mut Vec<FollowEpsilon>,
1614         curr_slots: &mut [Option<NonMaxUsize>],
1615         next: &mut ActiveStates,
1616         input: &Input<'_>,
1617         at: usize,
1618         sid: StateID,
1619     ) {
1620         instrument!(|c| {
1621             c.record_closure(sid);
1622             c.record_stack_push(sid);
1623         });
1624         stack.push(FollowEpsilon::Explore(sid));
1625         while let Some(frame) = stack.pop() {
1626             match frame {
1627                 FollowEpsilon::RestoreCapture { slot, offset: pos } => {
1628                     curr_slots[slot] = pos;
1629                 }
1630                 FollowEpsilon::Explore(sid) => {
1631                     self.epsilon_closure_explore(
1632                         stack, curr_slots, next, input, at, sid,
1633                     );
1634                 }
1635             }
1636         }
1637     }
1638 
1639     /// Explore all of the epsilon transitions out of 'sid'. This is mostly
1640     /// split out from 'epsilon_closure' in order to clearly delineate
1641     /// the actual work of computing an epsilon closure from the stack
1642     /// book-keeping.
1643     ///
1644     /// This will push any additional explorations needed on to 'stack'.
1645     ///
1646     /// 'curr_slots' should refer to the slots for the currently active NFA
1647     /// state. That is, the current state we are stepping through. These
1648     /// slots are mutated in place as new 'Captures' states are traversed
1649     /// during epsilon closure, but the slots are restored to their original
1650     /// values once the full epsilon closure is completed. The ultimate use of
1651     /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that
1652     /// the capturing group spans are forwarded from the currently active state
1653     /// to the next.
1654     ///
1655     /// 'next' refers to the next set of active states. Computing an epsilon
1656     /// closure may increase the next set of active states.
1657     ///
1658     /// 'input' refers to the caller's input configuration and 'at' refers to
1659     /// the current position in the haystack. These are used to check whether
1660     /// conditional epsilon transitions (like look-around) are satisfied at
1661     /// the current position. If they aren't, then the epsilon closure won't
1662     /// include them.
1663     #[cfg_attr(feature = "perf-inline", inline(always))]
epsilon_closure_explore( &self, stack: &mut Vec<FollowEpsilon>, curr_slots: &mut [Option<NonMaxUsize>], next: &mut ActiveStates, input: &Input<'_>, at: usize, mut sid: StateID, )1664     fn epsilon_closure_explore(
1665         &self,
1666         stack: &mut Vec<FollowEpsilon>,
1667         curr_slots: &mut [Option<NonMaxUsize>],
1668         next: &mut ActiveStates,
1669         input: &Input<'_>,
1670         at: usize,
1671         mut sid: StateID,
1672     ) {
1673         // We can avoid pushing some state IDs on to our stack in precisely
1674         // the cases where a 'push(x)' would be immediately followed by a 'x
1675         // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
1676         // to be the next state ID we want to explore once we're done with
1677         // our initial exploration. In practice, this avoids a lot of stack
1678         // thrashing.
1679         loop {
1680             instrument!(|c| c.record_set_insert(sid));
1681             // Record this state as part of our next set of active states. If
1682             // we've already explored it, then no need to do it again.
1683             if !next.set.insert(sid) {
1684                 return;
1685             }
1686             match *self.nfa.state(sid) {
1687                 State::Fail
1688                 | State::Match { .. }
1689                 | State::ByteRange { .. }
1690                 | State::Sparse { .. }
1691                 | State::Dense { .. } => {
1692                     next.slot_table.for_state(sid).copy_from_slice(curr_slots);
1693                     return;
1694                 }
1695                 State::Look { look, next } => {
1696                     // OK because we don't permit building a searcher with a
1697                     // Unicode word boundary if the requisite Unicode data is
1698                     // unavailable.
1699                     if !self.nfa.look_matcher().matches_inline(
1700                         look,
1701                         input.haystack(),
1702                         at,
1703                     ) {
1704                         return;
1705                     }
1706                     sid = next;
1707                 }
1708                 State::Union { ref alternates } => {
1709                     sid = match alternates.get(0) {
1710                         None => return,
1711                         Some(&sid) => sid,
1712                     };
1713                     instrument!(|c| {
1714                         for &alt in &alternates[1..] {
1715                             c.record_stack_push(alt);
1716                         }
1717                     });
1718                     stack.extend(
1719                         alternates[1..]
1720                             .iter()
1721                             .copied()
1722                             .rev()
1723                             .map(FollowEpsilon::Explore),
1724                     );
1725                 }
1726                 State::BinaryUnion { alt1, alt2 } => {
1727                     sid = alt1;
1728                     instrument!(|c| c.record_stack_push(sid));
1729                     stack.push(FollowEpsilon::Explore(alt2));
1730                 }
1731                 State::Capture { next, slot, .. } => {
1732                     // There's no need to do anything with slots that
1733                     // ultimately won't be copied into the caller-provided
1734                     // 'Captures' value. So we just skip dealing with them at
1735                     // all.
1736                     if slot.as_usize() < curr_slots.len() {
1737                         instrument!(|c| c.record_stack_push(sid));
1738                         stack.push(FollowEpsilon::RestoreCapture {
1739                             slot,
1740                             offset: curr_slots[slot],
1741                         });
1742                         // OK because length of a slice must fit into an isize.
1743                         curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap());
1744                     }
1745                     sid = next;
1746                 }
1747             }
1748         }
1749     }
1750 
1751     /// Return the starting configuration of a PikeVM search.
1752     ///
1753     /// The "start config" is basically whether the search should be anchored
1754     /// or not and the NFA state ID at which to begin the search. The state ID
1755     /// returned always corresponds to an anchored starting state even when the
1756     /// search is unanchored. This is because the PikeVM search loop deals with
1757     /// unanchored searches with an explicit epsilon closure out of the start
1758     /// state.
1759     ///
1760     /// This routine accounts for both the caller's `Input` configuration
1761     /// and the pattern itself. For example, even if the caller asks for an
1762     /// unanchored search, if the pattern itself is anchored, then this will
1763     /// always return 'true' because implementing an unanchored search in that
1764     /// case would be incorrect.
1765     ///
1766     /// Similarly, if the caller requests an anchored search for a particular
1767     /// pattern, then the starting state ID returned will reflect that.
1768     ///
1769     /// If a pattern ID is given in the input configuration that is not in
1770     /// this regex, then `None` is returned.
start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)>1771     fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> {
1772         match input.get_anchored() {
1773             // Only way we're unanchored is if both the caller asked for an
1774             // unanchored search *and* the pattern is itself not anchored.
1775             Anchored::No => Some((
1776                 self.nfa.is_always_start_anchored(),
1777                 self.nfa.start_anchored(),
1778             )),
1779             Anchored::Yes => Some((true, self.nfa.start_anchored())),
1780             Anchored::Pattern(pid) => {
1781                 Some((true, self.nfa.start_pattern(pid)?))
1782             }
1783         }
1784     }
1785 }
1786 
1787 /// An iterator over all non-overlapping matches for a particular search.
1788 ///
1789 /// The iterator yields a [`Match`] value until no more matches could be found.
1790 ///
1791 /// The lifetime parameters are as follows:
1792 ///
1793 /// * `'r` represents the lifetime of the PikeVM.
1794 /// * `'c` represents the lifetime of the PikeVM's cache.
1795 /// * `'h` represents the lifetime of the haystack being searched.
1796 ///
1797 /// This iterator can be created with the [`PikeVM::find_iter`] method.
1798 #[derive(Debug)]
1799 pub struct FindMatches<'r, 'c, 'h> {
1800     re: &'r PikeVM,
1801     cache: &'c mut Cache,
1802     caps: Captures,
1803     it: iter::Searcher<'h>,
1804 }
1805 
1806 impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
1807     type Item = Match;
1808 
1809     #[inline]
next(&mut self) -> Option<Match>1810     fn next(&mut self) -> Option<Match> {
1811         // Splitting 'self' apart seems necessary to appease borrowck.
1812         let FindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1813             *self;
1814         // 'advance' converts errors into panics, which is OK here because
1815         // the PikeVM can never return an error.
1816         it.advance(|input| {
1817             re.search(cache, input, caps);
1818             Ok(caps.get_match())
1819         })
1820     }
1821 }
1822 
1823 /// An iterator over all non-overlapping leftmost matches, with their capturing
1824 /// groups, for a particular search.
1825 ///
1826 /// The iterator yields a [`Captures`] value until no more matches could be
1827 /// found.
1828 ///
1829 /// The lifetime parameters are as follows:
1830 ///
1831 /// * `'r` represents the lifetime of the PikeVM.
1832 /// * `'c` represents the lifetime of the PikeVM's cache.
1833 /// * `'h` represents the lifetime of the haystack being searched.
1834 ///
1835 /// This iterator can be created with the [`PikeVM::captures_iter`] method.
1836 #[derive(Debug)]
1837 pub struct CapturesMatches<'r, 'c, 'h> {
1838     re: &'r PikeVM,
1839     cache: &'c mut Cache,
1840     caps: Captures,
1841     it: iter::Searcher<'h>,
1842 }
1843 
1844 impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> {
1845     type Item = Captures;
1846 
1847     #[inline]
next(&mut self) -> Option<Captures>1848     fn next(&mut self) -> Option<Captures> {
1849         // Splitting 'self' apart seems necessary to appease borrowck.
1850         let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1851             *self;
1852         // 'advance' converts errors into panics, which is OK here because
1853         // the PikeVM can never return an error.
1854         it.advance(|input| {
1855             re.search(cache, input, caps);
1856             Ok(caps.get_match())
1857         });
1858         if caps.is_match() {
1859             Some(caps.clone())
1860         } else {
1861             None
1862         }
1863     }
1864 }
1865 
1866 /// A cache represents mutable state that a [`PikeVM`] requires during a
1867 /// search.
1868 ///
1869 /// For a given [`PikeVM`], its corresponding cache may be created either via
1870 /// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in
1871 /// every way, except the former does not require explicitly importing `Cache`.
1872 ///
1873 /// A particular `Cache` is coupled with the [`PikeVM`] from which it
1874 /// was created. It may only be used with that `PikeVM`. A cache and its
1875 /// allocations may be re-purposed via [`Cache::reset`], in which case, it can
1876 /// only be used with the new `PikeVM` (and not the old one).
1877 #[derive(Clone, Debug)]
1878 pub struct Cache {
1879     /// Stack used while computing epsilon closure. This effectively lets us
1880     /// move what is more naturally expressed through recursion to a stack
1881     /// on the heap.
1882     stack: Vec<FollowEpsilon>,
1883     /// The current active states being explored for the current byte in the
1884     /// haystack.
1885     curr: ActiveStates,
1886     /// The next set of states we're building that will be explored for the
1887     /// next byte in the haystack.
1888     next: ActiveStates,
1889 }
1890 
1891 impl Cache {
1892     /// Create a new [`PikeVM`] cache.
1893     ///
1894     /// A potentially more convenient routine to create a cache is
1895     /// [`PikeVM::create_cache`], as it does not require also importing the
1896     /// `Cache` type.
1897     ///
1898     /// If you want to reuse the returned `Cache` with some other `PikeVM`,
1899     /// then you must call [`Cache::reset`] with the desired `PikeVM`.
new(re: &PikeVM) -> Cache1900     pub fn new(re: &PikeVM) -> Cache {
1901         Cache {
1902             stack: vec![],
1903             curr: ActiveStates::new(re),
1904             next: ActiveStates::new(re),
1905         }
1906     }
1907 
1908     /// Reset this cache such that it can be used for searching with a
1909     /// different [`PikeVM`].
1910     ///
1911     /// A cache reset permits reusing memory already allocated in this cache
1912     /// with a different `PikeVM`.
1913     ///
1914     /// # Example
1915     ///
1916     /// This shows how to re-purpose a cache for use with a different `PikeVM`.
1917     ///
1918     /// ```
1919     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1920     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
1921     ///
1922     /// let re1 = PikeVM::new(r"\w")?;
1923     /// let re2 = PikeVM::new(r"\W")?;
1924     ///
1925     /// let mut cache = re1.create_cache();
1926     /// assert_eq!(
1927     ///     Some(Match::must(0, 0..2)),
1928     ///     re1.find_iter(&mut cache, "Δ").next(),
1929     /// );
1930     ///
1931     /// // Using 'cache' with re2 is not allowed. It may result in panics or
1932     /// // incorrect results. In order to re-purpose the cache, we must reset
1933     /// // it with the PikeVM we'd like to use it with.
1934     /// //
1935     /// // Similarly, after this reset, using the cache with 're1' is also not
1936     /// // allowed.
1937     /// cache.reset(&re2);
1938     /// assert_eq!(
1939     ///     Some(Match::must(0, 0..3)),
1940     ///     re2.find_iter(&mut cache, "☃").next(),
1941     /// );
1942     ///
1943     /// # Ok::<(), Box<dyn std::error::Error>>(())
1944     /// ```
reset(&mut self, re: &PikeVM)1945     pub fn reset(&mut self, re: &PikeVM) {
1946         self.curr.reset(re);
1947         self.next.reset(re);
1948     }
1949 
1950     /// Returns the heap memory usage, in bytes, of this cache.
1951     ///
1952     /// This does **not** include the stack size used up by this cache. To
1953     /// compute that, use `std::mem::size_of::<Cache>()`.
memory_usage(&self) -> usize1954     pub fn memory_usage(&self) -> usize {
1955         use core::mem::size_of;
1956         (self.stack.len() * size_of::<FollowEpsilon>())
1957             + self.curr.memory_usage()
1958             + self.next.memory_usage()
1959     }
1960 
1961     /// Clears this cache. This should be called at the start of every search
1962     /// to ensure we start with a clean slate.
1963     ///
1964     /// This also sets the length of the capturing groups used in the current
1965     /// search. This permits an optimization where by 'SlotTable::for_state'
1966     /// only returns the number of slots equivalent to the number of slots
1967     /// given in the 'Captures' value. This may be less than the total number
1968     /// of possible slots, e.g., when one only wants to track overall match
1969     /// offsets. This in turn permits less copying of capturing group spans
1970     /// in the PikeVM.
setup_search(&mut self, captures_slot_len: usize)1971     fn setup_search(&mut self, captures_slot_len: usize) {
1972         self.stack.clear();
1973         self.curr.setup_search(captures_slot_len);
1974         self.next.setup_search(captures_slot_len);
1975     }
1976 }
1977 
1978 /// A set of active states used to "simulate" the execution of an NFA via the
1979 /// PikeVM.
1980 ///
1981 /// There are two sets of these used during NFA simulation. One set corresponds
1982 /// to the "current" set of states being traversed for the current position
1983 /// in a haystack. The other set corresponds to the "next" set of states being
1984 /// built, which will become the new "current" set for the next position in the
1985 /// haystack. These two sets correspond to CLIST and NLIST in Thompson's
1986 /// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
1987 ///
1988 /// In addition to representing a set of NFA states, this also maintains slot
1989 /// values for each state. These slot values are what turn the NFA simulation
1990 /// into the "Pike VM." Namely, they track capturing group values for each
1991 /// state. During the computation of epsilon closure, we copy slot values from
1992 /// states in the "current" set to the "next" set. Eventually, once a match
1993 /// is found, the slot values for that match state are what we write to the
1994 /// caller provided 'Captures' value.
1995 #[derive(Clone, Debug)]
1996 struct ActiveStates {
1997     /// The set of active NFA states. This set preserves insertion order, which
1998     /// is critical for simulating the match semantics of backtracking regex
1999     /// engines.
2000     set: SparseSet,
2001     /// The slots for every NFA state, where each slot stores a (possibly
2002     /// absent) offset. Every capturing group has two slots. One for a start
2003     /// offset and one for an end offset.
2004     slot_table: SlotTable,
2005 }
2006 
2007 impl ActiveStates {
2008     /// Create a new set of active states for the given PikeVM. The active
2009     /// states returned may only be used with the given PikeVM. (Use 'reset'
2010     /// to re-purpose the allocation for a different PikeVM.)
new(re: &PikeVM) -> ActiveStates2011     fn new(re: &PikeVM) -> ActiveStates {
2012         let mut active = ActiveStates {
2013             set: SparseSet::new(0),
2014             slot_table: SlotTable::new(),
2015         };
2016         active.reset(re);
2017         active
2018     }
2019 
2020     /// Reset this set of active states such that it can be used with the given
2021     /// PikeVM (and only that PikeVM).
reset(&mut self, re: &PikeVM)2022     fn reset(&mut self, re: &PikeVM) {
2023         self.set.resize(re.get_nfa().states().len());
2024         self.slot_table.reset(re);
2025     }
2026 
2027     /// Return the heap memory usage, in bytes, used by this set of active
2028     /// states.
2029     ///
2030     /// This does not include the stack size of this value.
memory_usage(&self) -> usize2031     fn memory_usage(&self) -> usize {
2032         self.set.memory_usage() + self.slot_table.memory_usage()
2033     }
2034 
2035     /// Setup this set of active states for a new search. The given slot
2036     /// length should be the number of slots in a caller provided 'Captures'
2037     /// (and may be zero).
setup_search(&mut self, captures_slot_len: usize)2038     fn setup_search(&mut self, captures_slot_len: usize) {
2039         self.set.clear();
2040         self.slot_table.setup_search(captures_slot_len);
2041     }
2042 }
2043 
2044 /// A table of slots, where each row represent a state in an NFA. Thus, the
2045 /// table has room for storing slots for every single state in an NFA.
2046 ///
2047 /// This table is represented with a single contiguous allocation. In general,
2048 /// the notion of "capturing group" doesn't really exist at this level of
2049 /// abstraction, hence the name "slot" instead. (Indeed, every capturing group
2050 /// maps to a pair of slots, one for the start offset and one for the end
2051 /// offset.) Slots are indexed by the 'Captures' NFA state.
2052 ///
2053 /// N.B. Not every state actually needs a row of slots. Namely, states that
2054 /// only have epsilon transitions currently never have anything written to
2055 /// their rows in this table. Thus, the table is somewhat wasteful in its heap
2056 /// usage. However, it is important to maintain fast random access by state
2057 /// ID, which means one giant table tends to work well. RE2 takes a different
2058 /// approach here and allocates each row as its own reference counted thing.
2059 /// I explored such a strategy at one point here, but couldn't get it to work
2060 /// well using entirely safe code. (To the ambitious reader: I encourage you to
2061 /// re-litigate that experiment.) I very much wanted to stick to safe code, but
2062 /// could be convinced otherwise if there was a solid argument and the safety
2063 /// was encapsulated well.
2064 #[derive(Clone, Debug)]
2065 struct SlotTable {
2066     /// The actual table of offsets.
2067     table: Vec<Option<NonMaxUsize>>,
2068     /// The number of slots per state, i.e., the table's stride or the length
2069     /// of each row.
2070     slots_per_state: usize,
2071     /// The number of slots in the caller-provided 'Captures' value for the
2072     /// current search. Setting this to 'slots_per_state' is always correct,
2073     /// but may be wasteful.
2074     slots_for_captures: usize,
2075 }
2076 
2077 impl SlotTable {
2078     /// Create a new slot table.
2079     ///
2080     /// One should call 'reset' with the corresponding PikeVM before use.
new() -> SlotTable2081     fn new() -> SlotTable {
2082         SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
2083     }
2084 
2085     /// Reset this slot table such that it can be used with the given PikeVM
2086     /// (and only that PikeVM).
reset(&mut self, re: &PikeVM)2087     fn reset(&mut self, re: &PikeVM) {
2088         let nfa = re.get_nfa();
2089         self.slots_per_state = nfa.group_info().slot_len();
2090         // This is always correct, but may be reduced for a particular search
2091         // if a 'Captures' has fewer slots, e.g., none at all or only slots
2092         // for tracking the overall match instead of all slots for every
2093         // group.
2094         self.slots_for_captures = core::cmp::max(
2095             self.slots_per_state,
2096             nfa.pattern_len().checked_mul(2).unwrap(),
2097         );
2098         let len = nfa
2099             .states()
2100             .len()
2101             .checked_mul(self.slots_per_state)
2102             // Add space to account for scratch space used during a search.
2103             .and_then(|x| x.checked_add(self.slots_for_captures))
2104             // It seems like this could actually panic on legitimate inputs on
2105             // 32-bit targets, and very likely to panic on 16-bit. Should we
2106             // somehow convert this to an error? What about something similar
2107             // for the lazy DFA cache? If you're tripping this assert, please
2108             // file a bug.
2109             .expect("slot table length doesn't overflow");
2110         // This happens about as often as a regex is compiled, so it probably
2111         // should be at debug level, but I found it quite distracting and not
2112         // particularly useful.
2113         trace!(
2114             "resizing PikeVM active states table to {} entries \
2115              (slots_per_state={})",
2116             len,
2117             self.slots_per_state,
2118         );
2119         self.table.resize(len, None);
2120     }
2121 
2122     /// Return the heap memory usage, in bytes, used by this slot table.
2123     ///
2124     /// This does not include the stack size of this value.
memory_usage(&self) -> usize2125     fn memory_usage(&self) -> usize {
2126         self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
2127     }
2128 
2129     /// Perform any per-search setup for this slot table.
2130     ///
2131     /// In particular, this sets the length of the number of slots used in the
2132     /// 'Captures' given by the caller (if any at all). This number may be
2133     /// smaller than the total number of slots available, e.g., when the caller
2134     /// is only interested in tracking the overall match and not the spans of
2135     /// every matching capturing group. Only tracking the overall match can
2136     /// save a substantial amount of time copying capturing spans during a
2137     /// search.
setup_search(&mut self, captures_slot_len: usize)2138     fn setup_search(&mut self, captures_slot_len: usize) {
2139         self.slots_for_captures = captures_slot_len;
2140     }
2141 
2142     /// Return a mutable slice of the slots for the given state.
2143     ///
2144     /// Note that the length of the slice returned may be less than the total
2145     /// number of slots available for this state. In particular, the length
2146     /// always matches the number of slots indicated via 'setup_search'.
for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>]2147     fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
2148         let i = sid.as_usize() * self.slots_per_state;
2149         &mut self.table[i..i + self.slots_for_captures]
2150     }
2151 
2152     /// Return a slice of slots of appropriate length where every slot offset
2153     /// is guaranteed to be absent. This is useful in cases where you need to
2154     /// compute an epsilon closure outside of the user supplied regex, and thus
2155     /// never want it to have any capturing slots set.
all_absent(&mut self) -> &mut [Option<NonMaxUsize>]2156     fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
2157         let i = self.table.len() - self.slots_for_captures;
2158         &mut self.table[i..i + self.slots_for_captures]
2159     }
2160 }
2161 
2162 /// Represents a stack frame for use while computing an epsilon closure.
2163 ///
2164 /// (An "epsilon closure" refers to the set of reachable NFA states from a
2165 /// single state without consuming any input. That is, the set of all epsilon
2166 /// transitions not only from that single state, but from every other state
2167 /// reachable by an epsilon transition as well. This is why it's called a
2168 /// "closure." Computing an epsilon closure is also done during DFA
2169 /// determinization! Compare and contrast the epsilon closure here in this
2170 /// PikeVM and the one used for determinization in crate::util::determinize.)
2171 ///
2172 /// Computing the epsilon closure in a Thompson NFA proceeds via a depth
2173 /// first traversal over all epsilon transitions from a particular state.
2174 /// (A depth first traversal is important because it emulates the same priority
2175 /// of matches that is typically found in backtracking regex engines.) This
2176 /// depth first traversal is naturally expressed using recursion, but to avoid
2177 /// a call stack size proportional to the size of a regex, we put our stack on
2178 /// the heap instead.
2179 ///
2180 /// This stack thus consists of call frames. The typical call frame is
2181 /// `Explore`, which instructs epsilon closure to explore the epsilon
2182 /// transitions from that state. (Subsequent epsilon transitions are then
2183 /// pushed on to the stack as more `Explore` frames.) If the state ID being
2184 /// explored has no epsilon transitions, then the capturing group slots are
2185 /// copied from the original state that sparked the epsilon closure (from the
2186 /// 'step' routine) to the state ID being explored. This way, capturing group
2187 /// slots are forwarded from the previous state to the next.
2188 ///
2189 /// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to
2190 /// set the position for a particular slot back to some particular offset. This
2191 /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
2192 /// set the offset of the slot indicated in `Capture` to the current offset,
2193 /// and then push the old offset on to the stack as a `RestoreCapture` frame.
2194 /// Thus, the new offset is only used until the epsilon closure reverts back to
2195 /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
2196 /// transition its "scope" to only states that come "after" it during depth
2197 /// first traversal.
2198 #[derive(Clone, Debug)]
2199 enum FollowEpsilon {
2200     /// Explore the epsilon transitions from a state ID.
2201     Explore(StateID),
2202     /// Reset the given `slot` to the given `offset` (which might be `None`).
2203     RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
2204 }
2205 
2206 /// A set of counters that "instruments" a PikeVM search. To enable this, you
2207 /// must enable the 'internal-instrument-pikevm' feature. Then run your Rust
2208 /// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in
2209 /// the environment. The metrics collected will be dumped automatically for
2210 /// every search executed by the PikeVM.
2211 ///
2212 /// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an
2213 /// absolute decrease in wall-clock performance, even if the 'trace' log level
2214 /// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't
2215 /// enabled.) The main point of instrumentation is to get counts of various
2216 /// events that occur during the PikeVM's execution.
2217 ///
2218 /// This is a somewhat hacked together collection of metrics that are useful
2219 /// to gather from a PikeVM search. In particular, it lets us scrutinize the
2220 /// performance profile of a search beyond what general purpose profiling tools
2221 /// give us. Namely, we orient the profiling data around the specific states of
2222 /// the NFA.
2223 ///
2224 /// In other words, this lets us see which parts of the NFA graph are most
2225 /// frequently activated. This then provides direction for optimization
2226 /// opportunities.
2227 ///
2228 /// The really sad part about this is that it absolutely clutters up the PikeVM
2229 /// implementation. :'( Another approach would be to just manually add this
2230 /// code in whenever I want this kind of profiling data, but it's complicated
2231 /// and tedious enough that I went with this approach... for now.
2232 ///
2233 /// When instrumentation is enabled (which also turns on 'logging'), then a
2234 /// `Counters` is initialized for every search and `trace`'d just before the
2235 /// search returns to the caller.
2236 ///
2237 /// Tip: When debugging performance problems with the PikeVM, it's best to try
2238 /// to work with an NFA that is as small as possible. Otherwise the state graph
2239 /// is likely to be too big to digest.
2240 #[cfg(feature = "internal-instrument-pikevm")]
2241 #[derive(Clone, Debug)]
2242 struct Counters {
2243     /// The number of times the NFA is in a particular permutation of states.
2244     state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>,
2245     /// The number of times 'step' is called for a particular state ID (which
2246     /// indexes this array).
2247     steps: Vec<u64>,
2248     /// The number of times an epsilon closure was computed for a state.
2249     closures: Vec<u64>,
2250     /// The number of times a particular state ID is pushed on to a stack while
2251     /// computing an epsilon closure.
2252     stack_pushes: Vec<u64>,
2253     /// The number of times a particular state ID is inserted into a sparse set
2254     /// while computing an epsilon closure.
2255     set_inserts: Vec<u64>,
2256 }
2257 
2258 #[cfg(feature = "internal-instrument-pikevm")]
2259 impl Counters {
empty() -> Counters2260     fn empty() -> Counters {
2261         Counters {
2262             state_sets: alloc::collections::BTreeMap::new(),
2263             steps: vec![],
2264             closures: vec![],
2265             stack_pushes: vec![],
2266             set_inserts: vec![],
2267         }
2268     }
2269 
reset(&mut self, nfa: &NFA)2270     fn reset(&mut self, nfa: &NFA) {
2271         let len = nfa.states().len();
2272 
2273         self.state_sets.clear();
2274 
2275         self.steps.clear();
2276         self.steps.resize(len, 0);
2277 
2278         self.closures.clear();
2279         self.closures.resize(len, 0);
2280 
2281         self.stack_pushes.clear();
2282         self.stack_pushes.resize(len, 0);
2283 
2284         self.set_inserts.clear();
2285         self.set_inserts.resize(len, 0);
2286     }
2287 
eprint(&self, nfa: &NFA)2288     fn eprint(&self, nfa: &NFA) {
2289         trace!("===== START PikeVM Instrumentation Output =====");
2290         // We take the top-K most occurring state sets. Otherwise the output
2291         // is likely to be overwhelming. And we probably only care about the
2292         // most frequently occurring ones anyway.
2293         const LIMIT: usize = 20;
2294         let mut set_counts =
2295             self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>();
2296         set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count));
2297         trace!("## PikeVM frequency of state sets (top {})", LIMIT);
2298         for (set, count) in set_counts.iter().take(LIMIT) {
2299             trace!("{:?}: {}", set, count);
2300         }
2301         if set_counts.len() > LIMIT {
2302             trace!(
2303                 "... {} sets omitted (out of {} total)",
2304                 set_counts.len() - LIMIT,
2305                 set_counts.len(),
2306             );
2307         }
2308 
2309         trace!("");
2310         trace!("## PikeVM total frequency of events");
2311         trace!(
2312             "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}",
2313             self.steps.iter().copied().sum::<u64>(),
2314             self.closures.iter().copied().sum::<u64>(),
2315             self.stack_pushes.iter().copied().sum::<u64>(),
2316             self.set_inserts.iter().copied().sum::<u64>(),
2317         );
2318 
2319         trace!("");
2320         trace!("## PikeVM frequency of events broken down by state");
2321         for sid in 0..self.steps.len() {
2322             trace!(
2323                 "{:06}: steps: {}, closures: {}, \
2324                  stack-pushes: {}, set-inserts: {}",
2325                 sid,
2326                 self.steps[sid],
2327                 self.closures[sid],
2328                 self.stack_pushes[sid],
2329                 self.set_inserts[sid],
2330             );
2331         }
2332 
2333         trace!("");
2334         trace!("## NFA debug display");
2335         trace!("{:?}", nfa);
2336         trace!("===== END PikeVM Instrumentation Output =====");
2337     }
2338 
record_state_set(&mut self, set: &SparseSet)2339     fn record_state_set(&mut self, set: &SparseSet) {
2340         let set = set.iter().collect::<Vec<StateID>>();
2341         *self.state_sets.entry(set).or_insert(0) += 1;
2342     }
2343 
record_step(&mut self, sid: StateID)2344     fn record_step(&mut self, sid: StateID) {
2345         self.steps[sid] += 1;
2346     }
2347 
record_closure(&mut self, sid: StateID)2348     fn record_closure(&mut self, sid: StateID) {
2349         self.closures[sid] += 1;
2350     }
2351 
record_stack_push(&mut self, sid: StateID)2352     fn record_stack_push(&mut self, sid: StateID) {
2353         self.stack_pushes[sid] += 1;
2354     }
2355 
record_set_insert(&mut self, sid: StateID)2356     fn record_set_insert(&mut self, sid: StateID) {
2357         self.set_inserts[sid] += 1;
2358     }
2359 }
2360