1 /*!
2 An NFA backed bounded backtracker for executing regex searches with capturing
3 groups.
4 
5 This module provides a [`BoundedBacktracker`] that works by simulating an NFA
6 using the classical backtracking algorithm with a twist: it avoids redoing
7 work that it has done before and thereby avoids worst case exponential time.
8 In exchange, it can only be used on "short" haystacks. Its advantage is that
9 is can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases
10 because it does less book-keeping.
11 */
12 
13 use alloc::{vec, vec::Vec};
14 
15 use crate::{
16     nfa::thompson::{self, BuildError, State, NFA},
17     util::{
18         captures::Captures,
19         empty, iter,
20         prefilter::Prefilter,
21         primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
22         search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
23     },
24 };
25 
26 /// Returns the minimum visited capacity for the given haystack.
27 ///
28 /// This function can be used as the argument to [`Config::visited_capacity`]
29 /// in order to guarantee that a backtracking search for the given `input`
30 /// won't return an error when using a [`BoundedBacktracker`] built from the
31 /// given `NFA`.
32 ///
33 /// This routine exists primarily as a way to test that the bounded backtracker
34 /// works correctly when its capacity is set to the smallest possible amount.
35 /// Still, it may be useful in cases where you know you want to use the bounded
36 /// backtracker for a specific input, and just need to know what visited
37 /// capacity to provide to make it work.
38 ///
39 /// Be warned that this number could be quite large as it is multiplicative in
40 /// the size the given NFA and haystack.
min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize41 pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize {
42     div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8)
43 }
44 
45 /// The configuration used for building a bounded backtracker.
46 ///
47 /// A bounded backtracker configuration is a simple data object that is
48 /// typically used with [`Builder::configure`].
49 #[derive(Clone, Debug, Default)]
50 pub struct Config {
51     pre: Option<Option<Prefilter>>,
52     visited_capacity: Option<usize>,
53 }
54 
55 impl Config {
56     /// Return a new default regex configuration.
new() -> Config57     pub fn new() -> Config {
58         Config::default()
59     }
60 
61     /// Set a prefilter to be used whenever a start state is entered.
62     ///
63     /// A [`Prefilter`] in this context is meant to accelerate searches by
64     /// looking for literal prefixes that every match for the corresponding
65     /// pattern (or patterns) must start with. Once a prefilter produces a
66     /// match, the underlying search routine continues on to try and confirm
67     /// the match.
68     ///
69     /// Be warned that setting a prefilter does not guarantee that the search
70     /// will be faster. While it's usually a good bet, if the prefilter
71     /// produces a lot of false positive candidates (i.e., positions matched
72     /// by the prefilter but not by the regex), then the overall result can
73     /// be slower than if you had just executed the regex engine without any
74     /// prefilters.
75     ///
76     /// By default no prefilter is set.
77     ///
78     /// # Example
79     ///
80     /// ```
81     /// use regex_automata::{
82     ///     nfa::thompson::backtrack::BoundedBacktracker,
83     ///     util::prefilter::Prefilter,
84     ///     Input, Match, MatchKind,
85     /// };
86     ///
87     /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
88     /// let re = BoundedBacktracker::builder()
89     ///     .configure(BoundedBacktracker::config().prefilter(pre))
90     ///     .build(r"(foo|bar)[a-z]+")?;
91     /// let mut cache = re.create_cache();
92     /// let input = Input::new("foo1 barfox bar");
93     /// assert_eq!(
94     ///     Some(Match::must(0, 5..11)),
95     ///     re.try_find(&mut cache, input)?,
96     /// );
97     ///
98     /// # Ok::<(), Box<dyn std::error::Error>>(())
99     /// ```
100     ///
101     /// Be warned though that an incorrect prefilter can lead to incorrect
102     /// results!
103     ///
104     /// ```
105     /// use regex_automata::{
106     ///     nfa::thompson::backtrack::BoundedBacktracker,
107     ///     util::prefilter::Prefilter,
108     ///     Input, HalfMatch, MatchKind,
109     /// };
110     ///
111     /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
112     /// let re = BoundedBacktracker::builder()
113     ///     .configure(BoundedBacktracker::config().prefilter(pre))
114     ///     .build(r"(foo|bar)[a-z]+")?;
115     /// let mut cache = re.create_cache();
116     /// let input = Input::new("foo1 barfox bar");
117     /// // No match reported even though there clearly is one!
118     /// assert_eq!(None, re.try_find(&mut cache, input)?);
119     ///
120     /// # Ok::<(), Box<dyn std::error::Error>>(())
121     /// ```
prefilter(mut self, pre: Option<Prefilter>) -> Config122     pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
123         self.pre = Some(pre);
124         self
125     }
126 
127     /// Set the visited capacity used to bound backtracking.
128     ///
129     /// The visited capacity represents the amount of heap memory (in bytes) to
130     /// allocate toward tracking which parts of the backtracking search have
131     /// been done before. The heap memory needed for any particular search is
132     /// proportional to `haystack.len() * nfa.states().len()`, which an be
133     /// quite large. Therefore, the bounded backtracker is typically only able
134     /// to run on shorter haystacks.
135     ///
136     /// For a given regex, increasing the visited capacity means that the
137     /// maximum haystack length that can be searched is increased. The
138     /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum.
139     ///
140     /// The default capacity is a reasonable but empirically chosen size.
141     ///
142     /// # Example
143     ///
144     /// As with other regex engines, Unicode is what tends to make the bounded
145     /// backtracker less useful by making the maximum haystack length quite
146     /// small. If necessary, increasing the visited capacity using this routine
147     /// will increase the maximum haystack length at the cost of using more
148     /// memory.
149     ///
150     /// Note though that the specific maximum values here are not an API
151     /// guarantee. The default visited capacity is subject to change and not
152     /// covered by semver.
153     ///
154     /// ```
155     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
156     /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
157     ///
158     /// // Unicode inflates the size of the underlying NFA quite a bit, and
159     /// // thus means that the backtracker can only handle smaller haystacks,
160     /// // assuming that the visited capacity remains unchanged.
161     /// let re = BoundedBacktracker::new(r"\w+")?;
162     /// assert!(re.max_haystack_len() <= 7_000);
163     /// // But we can increase the visited capacity to handle bigger haystacks!
164     /// let re = BoundedBacktracker::builder()
165     ///     .configure(BoundedBacktracker::config().visited_capacity(1<<20))
166     ///     .build(r"\w+")?;
167     /// assert!(re.max_haystack_len() >= 25_000);
168     /// assert!(re.max_haystack_len() <= 28_000);
169     /// # Ok::<(), Box<dyn std::error::Error>>(())
170     /// ```
visited_capacity(mut self, capacity: usize) -> Config171     pub fn visited_capacity(mut self, capacity: usize) -> Config {
172         self.visited_capacity = Some(capacity);
173         self
174     }
175 
176     /// Returns the prefilter set in this configuration, if one at all.
get_prefilter(&self) -> Option<&Prefilter>177     pub fn get_prefilter(&self) -> Option<&Prefilter> {
178         self.pre.as_ref().unwrap_or(&None).as_ref()
179     }
180 
181     /// Returns the configured visited capacity.
182     ///
183     /// Note that the actual capacity used may be slightly bigger than the
184     /// configured capacity.
get_visited_capacity(&self) -> usize185     pub fn get_visited_capacity(&self) -> usize {
186         const DEFAULT: usize = 256 * (1 << 10); // 256 KB
187         self.visited_capacity.unwrap_or(DEFAULT)
188     }
189 
190     /// Overwrite the default configuration such that the options in `o` are
191     /// always used. If an option in `o` is not set, then the corresponding
192     /// option in `self` is used. If it's not set in `self` either, then it
193     /// remains not set.
overwrite(&self, o: Config) -> Config194     pub(crate) fn overwrite(&self, o: Config) -> Config {
195         Config {
196             pre: o.pre.or_else(|| self.pre.clone()),
197             visited_capacity: o.visited_capacity.or(self.visited_capacity),
198         }
199     }
200 }
201 
202 /// A builder for a bounded backtracker.
203 ///
204 /// This builder permits configuring options for the syntax of a pattern, the
205 /// NFA construction and the `BoundedBacktracker` construction. This builder
206 /// is different from a general purpose regex builder in that it permits fine
207 /// grain configuration of the construction process. The trade off for this is
208 /// complexity, and the possibility of setting a configuration that might not
209 /// make sense. For example, there are two different UTF-8 modes:
210 ///
211 /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
212 /// whether the pattern itself can contain sub-expressions that match invalid
213 /// UTF-8.
214 /// * [`thompson::Config::utf8`] controls how the regex iterators themselves
215 /// advance the starting position of the next search when a match with zero
216 /// length is found.
217 ///
218 /// Generally speaking, callers will want to either enable all of these or
219 /// disable all of these.
220 ///
221 /// # Example
222 ///
223 /// This example shows how to disable UTF-8 mode in the syntax and the regex
224 /// itself. This is generally what you want for matching on arbitrary bytes.
225 ///
226 /// ```
227 /// use regex_automata::{
228 ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
229 ///     util::syntax,
230 ///     Match,
231 /// };
232 ///
233 /// let re = BoundedBacktracker::builder()
234 ///     .syntax(syntax::Config::new().utf8(false))
235 ///     .thompson(thompson::Config::new().utf8(false))
236 ///     .build(r"foo(?-u:[^b])ar.*")?;
237 /// let mut cache = re.create_cache();
238 ///
239 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
240 /// let expected = Some(Ok(Match::must(0, 1..9)));
241 /// let got = re.try_find_iter(&mut cache, haystack).next();
242 /// assert_eq!(expected, got);
243 /// // Notice that `(?-u:[^b])` matches invalid UTF-8,
244 /// // but the subsequent `.*` does not! Disabling UTF-8
245 /// // on the syntax permits this.
246 /// //
247 /// // N.B. This example does not show the impact of
248 /// // disabling UTF-8 mode on a BoundedBacktracker Config, since that
249 /// // only impacts regexes that can produce matches of
250 /// // length 0.
251 /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]);
252 ///
253 /// # Ok::<(), Box<dyn std::error::Error>>(())
254 /// ```
255 #[derive(Clone, Debug)]
256 pub struct Builder {
257     config: Config,
258     #[cfg(feature = "syntax")]
259     thompson: thompson::Compiler,
260 }
261 
262 impl Builder {
263     /// Create a new BoundedBacktracker builder with its default configuration.
new() -> Builder264     pub fn new() -> Builder {
265         Builder {
266             config: Config::default(),
267             #[cfg(feature = "syntax")]
268             thompson: thompson::Compiler::new(),
269         }
270     }
271 
272     /// Build a `BoundedBacktracker` from the given pattern.
273     ///
274     /// If there was a problem parsing or compiling the pattern, then an error
275     /// is returned.
276     #[cfg(feature = "syntax")]
build( &self, pattern: &str, ) -> Result<BoundedBacktracker, BuildError>277     pub fn build(
278         &self,
279         pattern: &str,
280     ) -> Result<BoundedBacktracker, BuildError> {
281         self.build_many(&[pattern])
282     }
283 
284     /// Build a `BoundedBacktracker` from the given patterns.
285     #[cfg(feature = "syntax")]
build_many<P: AsRef<str>>( &self, patterns: &[P], ) -> Result<BoundedBacktracker, BuildError>286     pub fn build_many<P: AsRef<str>>(
287         &self,
288         patterns: &[P],
289     ) -> Result<BoundedBacktracker, BuildError> {
290         let nfa = self.thompson.build_many(patterns)?;
291         self.build_from_nfa(nfa)
292     }
293 
294     /// Build a `BoundedBacktracker` directly from its NFA.
295     ///
296     /// Note that when using this method, any configuration that applies to the
297     /// construction of the NFA itself will of course be ignored, since the NFA
298     /// given here is already built.
build_from_nfa( &self, nfa: NFA, ) -> Result<BoundedBacktracker, BuildError>299     pub fn build_from_nfa(
300         &self,
301         nfa: NFA,
302     ) -> Result<BoundedBacktracker, BuildError> {
303         nfa.look_set_any().available().map_err(BuildError::word)?;
304         Ok(BoundedBacktracker { config: self.config.clone(), nfa })
305     }
306 
307     /// Apply the given `BoundedBacktracker` configuration options to this
308     /// builder.
configure(&mut self, config: Config) -> &mut Builder309     pub fn configure(&mut self, config: Config) -> &mut Builder {
310         self.config = self.config.overwrite(config);
311         self
312     }
313 
314     /// Set the syntax configuration for this builder using
315     /// [`syntax::Config`](crate::util::syntax::Config).
316     ///
317     /// This permits setting things like case insensitivity, Unicode and multi
318     /// line mode.
319     ///
320     /// These settings only apply when constructing a `BoundedBacktracker`
321     /// directly from a pattern.
322     #[cfg(feature = "syntax")]
syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder323     pub fn syntax(
324         &mut self,
325         config: crate::util::syntax::Config,
326     ) -> &mut Builder {
327         self.thompson.syntax(config);
328         self
329     }
330 
331     /// Set the Thompson NFA configuration for this builder using
332     /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
333     ///
334     /// This permits setting things like if additional time should be spent
335     /// shrinking the size of the NFA.
336     ///
337     /// These settings only apply when constructing a `BoundedBacktracker`
338     /// directly from a pattern.
339     #[cfg(feature = "syntax")]
thompson(&mut self, config: thompson::Config) -> &mut Builder340     pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
341         self.thompson.configure(config);
342         self
343     }
344 }
345 
346 /// A backtracking regex engine that bounds its execution to avoid exponential
347 /// blow-up.
348 ///
349 /// This regex engine only implements leftmost-first match semantics and
350 /// only supports leftmost searches. It effectively does the same thing as a
351 /// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because
352 /// it doesn't have to worry about copying capturing group spans for most NFA
353 /// states. Instead, the backtracker can maintain one set of captures (provided
354 /// by the caller) and never needs to copy them. In exchange, the backtracker
355 /// bounds itself to ensure it doesn't exhibit worst case exponential time.
356 /// This results in the backtracker only being able to handle short haystacks
357 /// given reasonable memory usage.
358 ///
359 /// # Searches may return an error!
360 ///
361 /// By design, this backtracking regex engine is bounded. This bound is
362 /// implemented by not visiting any combination of NFA state ID and position
363 /// in a haystack more than once. Thus, the total memory required to bound
364 /// backtracking is proportional to `haystack.len() * nfa.states().len()`.
365 /// This can obviously get quite large, since large haystacks aren't terribly
366 /// uncommon. To avoid using exorbitant memory, the capacity is bounded by
367 /// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total
368 /// capacity required for a particular regex and a haystack exceeds this
369 /// capacity, then the search routine will return an error.
370 ///
371 /// Unlike other regex engines that may return an error at search time (like
372 /// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded
373 /// backtracker will work for every haystack. Therefore, this regex engine
374 /// _only_ exposes fallible search routines to avoid the footgun of panicking
375 /// when running a search on a haystack that is too big.
376 ///
377 /// If one wants to use the fallible search APIs without handling the
378 /// error, the only way to guarantee an error won't occur from the
379 /// haystack length is to ensure the haystack length does not exceed
380 /// [`BoundedBacktracker::max_haystack_len`].
381 ///
382 /// # Example: Unicode word boundaries
383 ///
384 /// This example shows that the bounded backtracker implements Unicode word
385 /// boundaries correctly by default.
386 ///
387 /// ```
388 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
389 /// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
390 ///
391 /// let re = BoundedBacktracker::new(r"\b\w+\b")?;
392 /// let mut cache = re.create_cache();
393 ///
394 /// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс");
395 /// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next());
396 /// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next());
397 /// assert_eq!(None, it.next());
398 /// # Ok::<(), Box<dyn std::error::Error>>(())
399 /// ```
400 ///
401 /// # Example: multiple regex patterns
402 ///
403 /// The bounded backtracker supports searching for multiple patterns
404 /// simultaneously, just like other regex engines. Note though that because it
405 /// uses a backtracking strategy, this regex engine is unlikely to scale well
406 /// as more patterns are added. But then again, as more patterns are added, the
407 /// maximum haystack length allowed will also shorten (assuming the visited
408 /// capacity remains invariant).
409 ///
410 /// ```
411 /// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
412 ///
413 /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
414 /// let mut cache = re.create_cache();
415 ///
416 /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
417 /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
418 /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
419 /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
420 /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
421 /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
422 /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
423 /// assert_eq!(None, it.next());
424 /// # Ok::<(), Box<dyn std::error::Error>>(())
425 /// ```
426 #[derive(Clone, Debug)]
427 pub struct BoundedBacktracker {
428     config: Config,
429     nfa: NFA,
430 }
431 
432 impl BoundedBacktracker {
433     /// Parse the given regular expression using the default configuration and
434     /// return the corresponding `BoundedBacktracker`.
435     ///
436     /// If you want a non-default configuration, then use the [`Builder`] to
437     /// set your own configuration.
438     ///
439     /// # Example
440     ///
441     /// ```
442     /// use regex_automata::{
443     ///     nfa::thompson::backtrack::BoundedBacktracker,
444     ///     Match,
445     /// };
446     ///
447     /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
448     /// let mut cache = re.create_cache();
449     /// assert_eq!(
450     ///     Some(Ok(Match::must(0, 3..14))),
451     ///     re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(),
452     /// );
453     /// # Ok::<(), Box<dyn std::error::Error>>(())
454     /// ```
455     #[cfg(feature = "syntax")]
new(pattern: &str) -> Result<BoundedBacktracker, BuildError>456     pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> {
457         BoundedBacktracker::builder().build(pattern)
458     }
459 
460     /// Like `new`, but parses multiple patterns into a single "multi regex."
461     /// This similarly uses the default regex configuration.
462     ///
463     /// # Example
464     ///
465     /// ```
466     /// use regex_automata::{
467     ///     nfa::thompson::backtrack::BoundedBacktracker,
468     ///     Match,
469     /// };
470     ///
471     /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
472     /// let mut cache = re.create_cache();
473     ///
474     /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
475     /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
476     /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
477     /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
478     /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
479     /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
480     /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
481     /// assert_eq!(None, it.next());
482     /// # Ok::<(), Box<dyn std::error::Error>>(())
483     /// ```
484     #[cfg(feature = "syntax")]
new_many<P: AsRef<str>>( patterns: &[P], ) -> Result<BoundedBacktracker, BuildError>485     pub fn new_many<P: AsRef<str>>(
486         patterns: &[P],
487     ) -> Result<BoundedBacktracker, BuildError> {
488         BoundedBacktracker::builder().build_many(patterns)
489     }
490 
491     /// # Example
492     ///
493     /// This shows how to hand assemble a regular expression via its HIR,
494     /// compile an NFA from it and build a BoundedBacktracker from the NFA.
495     ///
496     /// ```
497     /// use regex_automata::{
498     ///     nfa::thompson::{NFA, backtrack::BoundedBacktracker},
499     ///     Match,
500     /// };
501     /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
502     ///
503     /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
504     ///     ClassBytesRange::new(b'0', b'9'),
505     ///     ClassBytesRange::new(b'A', b'Z'),
506     ///     ClassBytesRange::new(b'_', b'_'),
507     ///     ClassBytesRange::new(b'a', b'z'),
508     /// ])));
509     ///
510     /// let config = NFA::config().nfa_size_limit(Some(1_000));
511     /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
512     ///
513     /// let re = BoundedBacktracker::new_from_nfa(nfa)?;
514     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
515     /// let expected = Some(Match::must(0, 3..4));
516     /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?;
517     /// assert_eq!(expected, caps.get_match());
518     ///
519     /// # Ok::<(), Box<dyn std::error::Error>>(())
520     /// ```
new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError>521     pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> {
522         BoundedBacktracker::builder().build_from_nfa(nfa)
523     }
524 
525     /// Create a new `BoundedBacktracker` that matches every input.
526     ///
527     /// # Example
528     ///
529     /// ```
530     /// use regex_automata::{
531     ///     nfa::thompson::backtrack::BoundedBacktracker,
532     ///     Match,
533     /// };
534     ///
535     /// let re = BoundedBacktracker::always_match()?;
536     /// let mut cache = re.create_cache();
537     ///
538     /// let expected = Some(Ok(Match::must(0, 0..0)));
539     /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next());
540     /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next());
541     /// # Ok::<(), Box<dyn std::error::Error>>(())
542     /// ```
always_match() -> Result<BoundedBacktracker, BuildError>543     pub fn always_match() -> Result<BoundedBacktracker, BuildError> {
544         let nfa = thompson::NFA::always_match();
545         BoundedBacktracker::new_from_nfa(nfa)
546     }
547 
548     /// Create a new `BoundedBacktracker` that never matches any input.
549     ///
550     /// # Example
551     ///
552     /// ```
553     /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
554     ///
555     /// let re = BoundedBacktracker::never_match()?;
556     /// let mut cache = re.create_cache();
557     ///
558     /// assert_eq!(None, re.try_find_iter(&mut cache, "").next());
559     /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next());
560     /// # Ok::<(), Box<dyn std::error::Error>>(())
561     /// ```
never_match() -> Result<BoundedBacktracker, BuildError>562     pub fn never_match() -> Result<BoundedBacktracker, BuildError> {
563         let nfa = thompson::NFA::never_match();
564         BoundedBacktracker::new_from_nfa(nfa)
565     }
566 
567     /// Return a default configuration for a `BoundedBacktracker`.
568     ///
569     /// This is a convenience routine to avoid needing to import the `Config`
570     /// type when customizing the construction of a `BoundedBacktracker`.
571     ///
572     /// # Example
573     ///
574     /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
575     /// disabled, zero-width matches that split a codepoint are allowed.
576     /// Otherwise they are never reported.
577     ///
578     /// In the code below, notice that `""` is permitted to match positions
579     /// that split the encoding of a codepoint.
580     ///
581     /// ```
582     /// use regex_automata::{
583     ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
584     ///     Match,
585     /// };
586     ///
587     /// let re = BoundedBacktracker::builder()
588     ///     .thompson(thompson::Config::new().utf8(false))
589     ///     .build(r"")?;
590     /// let mut cache = re.create_cache();
591     ///
592     /// let haystack = "a☃z";
593     /// let mut it = re.try_find_iter(&mut cache, haystack);
594     /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next());
595     /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next());
596     /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next());
597     /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next());
598     /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next());
599     /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next());
600     /// assert_eq!(None, it.next());
601     ///
602     /// # Ok::<(), Box<dyn std::error::Error>>(())
603     /// ```
config() -> Config604     pub fn config() -> Config {
605         Config::new()
606     }
607 
608     /// Return a builder for configuring the construction of a
609     /// `BoundedBacktracker`.
610     ///
611     /// This is a convenience routine to avoid needing to import the
612     /// [`Builder`] type in common cases.
613     ///
614     /// # Example
615     ///
616     /// This example shows how to use the builder to disable UTF-8 mode
617     /// everywhere.
618     ///
619     /// ```
620     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
621     /// use regex_automata::{
622     ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
623     ///     util::syntax,
624     ///     Match,
625     /// };
626     ///
627     /// let re = BoundedBacktracker::builder()
628     ///     .syntax(syntax::Config::new().utf8(false))
629     ///     .thompson(thompson::Config::new().utf8(false))
630     ///     .build(r"foo(?-u:[^b])ar.*")?;
631     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
632     ///
633     /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
634     /// let expected = Some(Match::must(0, 1..9));
635     /// re.try_captures(&mut cache, haystack, &mut caps)?;
636     /// assert_eq!(expected, caps.get_match());
637     ///
638     /// # Ok::<(), Box<dyn std::error::Error>>(())
639     /// ```
builder() -> Builder640     pub fn builder() -> Builder {
641         Builder::new()
642     }
643 
644     /// Create a new cache for this regex.
645     ///
646     /// The cache returned should only be used for searches for this
647     /// regex. If you want to reuse the cache for another regex, then you
648     /// must call [`Cache::reset`] with that regex (or, equivalently,
649     /// [`BoundedBacktracker::reset_cache`]).
create_cache(&self) -> Cache650     pub fn create_cache(&self) -> Cache {
651         Cache::new(self)
652     }
653 
654     /// Create a new empty set of capturing groups that is guaranteed to be
655     /// valid for the search APIs on this `BoundedBacktracker`.
656     ///
657     /// A `Captures` value created for a specific `BoundedBacktracker` cannot
658     /// be used with any other `BoundedBacktracker`.
659     ///
660     /// This is a convenience function for [`Captures::all`]. See the
661     /// [`Captures`] documentation for an explanation of its alternative
662     /// constructors that permit the `BoundedBacktracker` to do less work
663     /// during a search, and thus might make it faster.
create_captures(&self) -> Captures664     pub fn create_captures(&self) -> Captures {
665         Captures::all(self.get_nfa().group_info().clone())
666     }
667 
668     /// Reset the given cache such that it can be used for searching with the
669     /// this `BoundedBacktracker` (and only this `BoundedBacktracker`).
670     ///
671     /// A cache reset permits reusing memory already allocated in this cache
672     /// with a different `BoundedBacktracker`.
673     ///
674     /// # Example
675     ///
676     /// This shows how to re-purpose a cache for use with a different
677     /// `BoundedBacktracker`.
678     ///
679     /// ```
680     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
681     /// use regex_automata::{
682     ///     nfa::thompson::backtrack::BoundedBacktracker,
683     ///     Match,
684     /// };
685     ///
686     /// let re1 = BoundedBacktracker::new(r"\w")?;
687     /// let re2 = BoundedBacktracker::new(r"\W")?;
688     ///
689     /// let mut cache = re1.create_cache();
690     /// assert_eq!(
691     ///     Some(Ok(Match::must(0, 0..2))),
692     ///     re1.try_find_iter(&mut cache, "Δ").next(),
693     /// );
694     ///
695     /// // Using 'cache' with re2 is not allowed. It may result in panics or
696     /// // incorrect results. In order to re-purpose the cache, we must reset
697     /// // it with the BoundedBacktracker we'd like to use it with.
698     /// //
699     /// // Similarly, after this reset, using the cache with 're1' is also not
700     /// // allowed.
701     /// cache.reset(&re2);
702     /// assert_eq!(
703     ///     Some(Ok(Match::must(0, 0..3))),
704     ///     re2.try_find_iter(&mut cache, "☃").next(),
705     /// );
706     ///
707     /// # Ok::<(), Box<dyn std::error::Error>>(())
708     /// ```
reset_cache(&self, cache: &mut Cache)709     pub fn reset_cache(&self, cache: &mut Cache) {
710         cache.reset(self);
711     }
712 
713     /// Returns the total number of patterns compiled into this
714     /// `BoundedBacktracker`.
715     ///
716     /// In the case of a `BoundedBacktracker` that contains no patterns, this
717     /// returns `0`.
718     ///
719     /// # Example
720     ///
721     /// This example shows the pattern length for a `BoundedBacktracker` that
722     /// never matches:
723     ///
724     /// ```
725     /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
726     ///
727     /// let re = BoundedBacktracker::never_match()?;
728     /// assert_eq!(re.pattern_len(), 0);
729     /// # Ok::<(), Box<dyn std::error::Error>>(())
730     /// ```
731     ///
732     /// And another example for a `BoundedBacktracker` that matches at every
733     /// position:
734     ///
735     /// ```
736     /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
737     ///
738     /// let re = BoundedBacktracker::always_match()?;
739     /// assert_eq!(re.pattern_len(), 1);
740     /// # Ok::<(), Box<dyn std::error::Error>>(())
741     /// ```
742     ///
743     /// And finally, a `BoundedBacktracker` that was constructed from multiple
744     /// patterns:
745     ///
746     /// ```
747     /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
748     ///
749     /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
750     /// assert_eq!(re.pattern_len(), 3);
751     /// # Ok::<(), Box<dyn std::error::Error>>(())
752     /// ```
pattern_len(&self) -> usize753     pub fn pattern_len(&self) -> usize {
754         self.nfa.pattern_len()
755     }
756 
757     /// Return the config for this `BoundedBacktracker`.
758     #[inline]
get_config(&self) -> &Config759     pub fn get_config(&self) -> &Config {
760         &self.config
761     }
762 
763     /// Returns a reference to the underlying NFA.
764     #[inline]
get_nfa(&self) -> &NFA765     pub fn get_nfa(&self) -> &NFA {
766         &self.nfa
767     }
768 
769     /// Returns the maximum haystack length supported by this backtracker.
770     ///
771     /// This routine is a function of both [`Config::visited_capacity`] and the
772     /// internal size of the backtracker's NFA.
773     ///
774     /// # Example
775     ///
776     /// This example shows how the maximum haystack length can vary depending
777     /// on the size of the regex itself. Note though that the specific maximum
778     /// values here are not an API guarantee. The default visited capacity is
779     /// subject to change and not covered by semver.
780     ///
781     /// ```
782     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
783     /// use regex_automata::{
784     ///     nfa::thompson::backtrack::BoundedBacktracker,
785     ///     Match, MatchError,
786     /// };
787     ///
788     /// // If you're only using ASCII, you get a big budget.
789     /// let re = BoundedBacktracker::new(r"(?-u)\w+")?;
790     /// let mut cache = re.create_cache();
791     /// assert_eq!(re.max_haystack_len(), 299_592);
792     /// // Things work up to the max.
793     /// let mut haystack = "a".repeat(299_592);
794     /// let expected = Some(Ok(Match::must(0, 0..299_592)));
795     /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
796     /// // But you'll get an error if you provide a haystack that's too big.
797     /// // Notice that we use the 'try_find_iter' routine instead, which
798     /// // yields Result<Match, MatchError> instead of Match.
799     /// haystack.push('a');
800     /// let expected = Some(Err(MatchError::haystack_too_long(299_593)));
801     /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
802     ///
803     /// // Unicode inflates the size of the underlying NFA quite a bit, and
804     /// // thus means that the backtracker can only handle smaller haystacks,
805     /// // assuming that the visited capacity remains unchanged.
806     /// let re = BoundedBacktracker::new(r"\w+")?;
807     /// assert!(re.max_haystack_len() <= 7_000);
808     /// // But we can increase the visited capacity to handle bigger haystacks!
809     /// let re = BoundedBacktracker::builder()
810     ///     .configure(BoundedBacktracker::config().visited_capacity(1<<20))
811     ///     .build(r"\w+")?;
812     /// assert!(re.max_haystack_len() >= 25_000);
813     /// assert!(re.max_haystack_len() <= 28_000);
814     /// # Ok::<(), Box<dyn std::error::Error>>(())
815     /// ```
816     #[inline]
max_haystack_len(&self) -> usize817     pub fn max_haystack_len(&self) -> usize {
818         // The capacity given in the config is "bytes of heap memory," but the
819         // capacity we use here is "number of bits." So convert the capacity in
820         // bytes to the capacity in bits.
821         let capacity = 8 * self.get_config().get_visited_capacity();
822         let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
823         let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
824         // It's possible for `real_capacity` to be smaller than the number of
825         // NFA states for particularly large regexes, so we saturate towards
826         // zero.
827         (real_capacity / self.nfa.states().len()).saturating_sub(1)
828     }
829 }
830 
831 impl BoundedBacktracker {
832     /// Returns true if and only if this regex matches the given haystack.
833     ///
834     /// In the case of a backtracking regex engine, and unlike most other
835     /// regex engines in this crate, short circuiting isn't practical. However,
836     /// this routine may still be faster because it instructs backtracking to
837     /// not keep track of any capturing groups.
838     ///
839     /// # Errors
840     ///
841     /// This routine only errors if the search could not complete. For this
842     /// backtracking regex engine, this only occurs when the haystack length
843     /// exceeds [`BoundedBacktracker::max_haystack_len`].
844     ///
845     /// When a search cannot complete, callers cannot know whether a match
846     /// exists or not.
847     ///
848     /// # Example
849     ///
850     /// ```
851     /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
852     ///
853     /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
854     /// let mut cache = re.create_cache();
855     ///
856     /// assert!(re.try_is_match(&mut cache, "foo12345bar")?);
857     /// assert!(!re.try_is_match(&mut cache, "foobar")?);
858     /// # Ok::<(), Box<dyn std::error::Error>>(())
859     /// ```
860     ///
861     /// # Example: consistency with search APIs
862     ///
863     /// `is_match` is guaranteed to return `true` whenever `find` returns a
864     /// match. This includes searches that are executed entirely within a
865     /// codepoint:
866     ///
867     /// ```
868     /// use regex_automata::{
869     ///     nfa::thompson::backtrack::BoundedBacktracker,
870     ///     Input,
871     /// };
872     ///
873     /// let re = BoundedBacktracker::new("a*")?;
874     /// let mut cache = re.create_cache();
875     ///
876     /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
877     /// # Ok::<(), Box<dyn std::error::Error>>(())
878     /// ```
879     ///
880     /// Notice that when UTF-8 mode is disabled, then the above reports a
881     /// match because the restriction against zero-width matches that split a
882     /// codepoint has been lifted:
883     ///
884     /// ```
885     /// use regex_automata::{
886     ///     nfa::thompson::{backtrack::BoundedBacktracker, NFA},
887     ///     Input,
888     /// };
889     ///
890     /// let re = BoundedBacktracker::builder()
891     ///     .thompson(NFA::config().utf8(false))
892     ///     .build("a*")?;
893     /// let mut cache = re.create_cache();
894     ///
895     /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
896     /// # Ok::<(), Box<dyn std::error::Error>>(())
897     /// ```
898     #[inline]
try_is_match<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, ) -> Result<bool, MatchError>899     pub fn try_is_match<'h, I: Into<Input<'h>>>(
900         &self,
901         cache: &mut Cache,
902         input: I,
903     ) -> Result<bool, MatchError> {
904         let input = input.into().earliest(true);
905         self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some())
906     }
907 
908     /// Executes a leftmost forward search and returns a `Match` if one exists.
909     ///
910     /// This routine only includes the overall match span. To get
911     /// access to the individual spans of each capturing group, use
912     /// [`BoundedBacktracker::try_captures`].
913     ///
914     /// # Errors
915     ///
916     /// This routine only errors if the search could not complete. For this
917     /// backtracking regex engine, this only occurs when the haystack length
918     /// exceeds [`BoundedBacktracker::max_haystack_len`].
919     ///
920     /// When a search cannot complete, callers cannot know whether a match
921     /// exists or not.
922     ///
923     /// # Example
924     ///
925     /// ```
926     /// use regex_automata::{
927     ///     nfa::thompson::backtrack::BoundedBacktracker,
928     ///     Match,
929     /// };
930     ///
931     /// let re = BoundedBacktracker::new("foo[0-9]+")?;
932     /// let mut cache = re.create_cache();
933     /// let expected = Match::must(0, 0..8);
934     /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?);
935     ///
936     /// # Ok::<(), Box<dyn std::error::Error>>(())
937     /// ```
938     #[inline]
try_find<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, ) -> Result<Option<Match>, MatchError>939     pub fn try_find<'h, I: Into<Input<'h>>>(
940         &self,
941         cache: &mut Cache,
942         input: I,
943     ) -> Result<Option<Match>, MatchError> {
944         let input = input.into();
945         if self.get_nfa().pattern_len() == 1 {
946             let mut slots = [None, None];
947             let pid = match self.try_search_slots(cache, &input, &mut slots)? {
948                 None => return Ok(None),
949                 Some(pid) => pid,
950             };
951             let start = match slots[0] {
952                 None => return Ok(None),
953                 Some(s) => s.get(),
954             };
955             let end = match slots[1] {
956                 None => return Ok(None),
957                 Some(s) => s.get(),
958             };
959             return Ok(Some(Match::new(pid, Span { start, end })));
960         }
961         let ginfo = self.get_nfa().group_info();
962         let slots_len = ginfo.implicit_slot_len();
963         let mut slots = vec![None; slots_len];
964         let pid = match self.try_search_slots(cache, &input, &mut slots)? {
965             None => return Ok(None),
966             Some(pid) => pid,
967         };
968         let start = match slots[pid.as_usize() * 2] {
969             None => return Ok(None),
970             Some(s) => s.get(),
971         };
972         let end = match slots[pid.as_usize() * 2 + 1] {
973             None => return Ok(None),
974             Some(s) => s.get(),
975         };
976         Ok(Some(Match::new(pid, Span { start, end })))
977     }
978 
979     /// Executes a leftmost forward search and writes the spans of capturing
980     /// groups that participated in a match into the provided [`Captures`]
981     /// value. If no match was found, then [`Captures::is_match`] is guaranteed
982     /// to return `false`.
983     ///
984     /// # Errors
985     ///
986     /// This routine only errors if the search could not complete. For this
987     /// backtracking regex engine, this only occurs when the haystack length
988     /// exceeds [`BoundedBacktracker::max_haystack_len`].
989     ///
990     /// When a search cannot complete, callers cannot know whether a match
991     /// exists or not.
992     ///
993     /// # Example
994     ///
995     /// ```
996     /// use regex_automata::{
997     ///     nfa::thompson::backtrack::BoundedBacktracker,
998     ///     Span,
999     /// };
1000     ///
1001     /// let re = BoundedBacktracker::new(
1002     ///     r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$",
1003     /// )?;
1004     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1005     ///
1006     /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?;
1007     /// assert!(caps.is_match());
1008     /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
1009     /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
1010     /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
1011     ///
1012     /// # Ok::<(), Box<dyn std::error::Error>>(())
1013     /// ```
1014     #[inline]
try_captures<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, caps: &mut Captures, ) -> Result<(), MatchError>1015     pub fn try_captures<'h, I: Into<Input<'h>>>(
1016         &self,
1017         cache: &mut Cache,
1018         input: I,
1019         caps: &mut Captures,
1020     ) -> Result<(), MatchError> {
1021         self.try_search(cache, &input.into(), caps)
1022     }
1023 
1024     /// Returns an iterator over all non-overlapping leftmost matches in the
1025     /// given bytes. If no match exists, then the iterator yields no elements.
1026     ///
1027     /// If the regex engine returns an error at any point, then the iterator
1028     /// will yield that error.
1029     ///
1030     /// # Example
1031     ///
1032     /// ```
1033     /// use regex_automata::{
1034     ///     nfa::thompson::backtrack::BoundedBacktracker,
1035     ///     Match, MatchError,
1036     /// };
1037     ///
1038     /// let re = BoundedBacktracker::new("foo[0-9]+")?;
1039     /// let mut cache = re.create_cache();
1040     ///
1041     /// let text = "foo1 foo12 foo123";
1042     /// let result: Result<Vec<Match>, MatchError> = re
1043     ///     .try_find_iter(&mut cache, text)
1044     ///     .collect();
1045     /// let matches = result?;
1046     /// assert_eq!(matches, vec![
1047     ///     Match::must(0, 0..4),
1048     ///     Match::must(0, 5..10),
1049     ///     Match::must(0, 11..17),
1050     /// ]);
1051     /// # Ok::<(), Box<dyn std::error::Error>>(())
1052     /// ```
1053     #[inline]
try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, input: I, ) -> TryFindMatches<'r, 'c, 'h>1054     pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1055         &'r self,
1056         cache: &'c mut Cache,
1057         input: I,
1058     ) -> TryFindMatches<'r, 'c, 'h> {
1059         let caps = Captures::matches(self.get_nfa().group_info().clone());
1060         let it = iter::Searcher::new(input.into());
1061         TryFindMatches { re: self, cache, caps, it }
1062     }
1063 
1064     /// Returns an iterator over all non-overlapping `Captures` values. If no
1065     /// match exists, then the iterator yields no elements.
1066     ///
1067     /// This yields the same matches as [`BoundedBacktracker::try_find_iter`],
1068     /// but it includes the spans of all capturing groups that participate in
1069     /// each match.
1070     ///
1071     /// If the regex engine returns an error at any point, then the iterator
1072     /// will yield that error.
1073     ///
1074     /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
1075     /// how to correctly iterate over all matches in a haystack while avoiding
1076     /// the creation of a new `Captures` value for every match. (Which you are
1077     /// forced to do with an `Iterator`.)
1078     ///
1079     /// # Example
1080     ///
1081     /// ```
1082     /// use regex_automata::{
1083     ///     nfa::thompson::backtrack::BoundedBacktracker,
1084     ///     Span,
1085     /// };
1086     ///
1087     /// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?;
1088     /// let mut cache = re.create_cache();
1089     ///
1090     /// let text = "foo1 foo12 foo123";
1091     /// let mut spans = vec![];
1092     /// for result in re.try_captures_iter(&mut cache, text) {
1093     ///     let caps = result?;
1094     ///     // The unwrap is OK since 'numbers' matches if the pattern matches.
1095     ///     spans.push(caps.get_group_by_name("numbers").unwrap());
1096     /// }
1097     /// assert_eq!(spans, vec![
1098     ///     Span::from(3..4),
1099     ///     Span::from(8..10),
1100     ///     Span::from(14..17),
1101     /// ]);
1102     /// # Ok::<(), Box<dyn std::error::Error>>(())
1103     /// ```
1104     #[inline]
try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, input: I, ) -> TryCapturesMatches<'r, 'c, 'h>1105     pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1106         &'r self,
1107         cache: &'c mut Cache,
1108         input: I,
1109     ) -> TryCapturesMatches<'r, 'c, 'h> {
1110         let caps = self.create_captures();
1111         let it = iter::Searcher::new(input.into());
1112         TryCapturesMatches { re: self, cache, caps, it }
1113     }
1114 }
1115 
1116 impl BoundedBacktracker {
1117     /// Executes a leftmost forward search and writes the spans of capturing
1118     /// groups that participated in a match into the provided [`Captures`]
1119     /// value. If no match was found, then [`Captures::is_match`] is guaranteed
1120     /// to return `false`.
1121     ///
1122     /// This is like [`BoundedBacktracker::try_captures`], but it accepts a
1123     /// concrete `&Input` instead of an `Into<Input>`.
1124     ///
1125     /// # Errors
1126     ///
1127     /// This routine only errors if the search could not complete. For this
1128     /// backtracking regex engine, this only occurs when the haystack length
1129     /// exceeds [`BoundedBacktracker::max_haystack_len`].
1130     ///
1131     /// When a search cannot complete, callers cannot know whether a match
1132     /// exists or not.
1133     ///
1134     /// # Example: specific pattern search
1135     ///
1136     /// This example shows how to build a multi bounded backtracker that
1137     /// permits searching for specific patterns.
1138     ///
1139     /// ```
1140     /// use regex_automata::{
1141     ///     nfa::thompson::backtrack::BoundedBacktracker,
1142     ///     Anchored, Input, Match, PatternID,
1143     /// };
1144     ///
1145     /// let re = BoundedBacktracker::new_many(&[
1146     ///     "[a-z0-9]{6}",
1147     ///     "[a-z][a-z0-9]{5}",
1148     /// ])?;
1149     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1150     /// let haystack = "foo123";
1151     ///
1152     /// // Since we are using the default leftmost-first match and both
1153     /// // patterns match at the same starting position, only the first pattern
1154     /// // will be returned in this case when doing a search for any of the
1155     /// // patterns.
1156     /// let expected = Some(Match::must(0, 0..6));
1157     /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?;
1158     /// assert_eq!(expected, caps.get_match());
1159     ///
1160     /// // But if we want to check whether some other pattern matches, then we
1161     /// // can provide its pattern ID.
1162     /// let expected = Some(Match::must(1, 0..6));
1163     /// let input = Input::new(haystack)
1164     ///     .anchored(Anchored::Pattern(PatternID::must(1)));
1165     /// re.try_search(&mut cache, &input, &mut caps)?;
1166     /// assert_eq!(expected, caps.get_match());
1167     ///
1168     /// # Ok::<(), Box<dyn std::error::Error>>(())
1169     /// ```
1170     ///
1171     /// # Example: specifying the bounds of a search
1172     ///
1173     /// This example shows how providing the bounds of a search can produce
1174     /// different results than simply sub-slicing the haystack.
1175     ///
1176     /// ```
1177     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1178     /// use regex_automata::{
1179     ///     nfa::thompson::backtrack::BoundedBacktracker,
1180     ///     Match, Input,
1181     /// };
1182     ///
1183     /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?;
1184     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1185     /// let haystack = "foo123bar";
1186     ///
1187     /// // Since we sub-slice the haystack, the search doesn't know about
1188     /// // the larger context and assumes that `123` is surrounded by word
1189     /// // boundaries. And of course, the match position is reported relative
1190     /// // to the sub-slice as well, which means we get `0..3` instead of
1191     /// // `3..6`.
1192     /// let expected = Some(Match::must(0, 0..3));
1193     /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?;
1194     /// assert_eq!(expected, caps.get_match());
1195     ///
1196     /// // But if we provide the bounds of the search within the context of the
1197     /// // entire haystack, then the search can take the surrounding context
1198     /// // into account. (And if we did find a match, it would be reported
1199     /// // as a valid offset into `haystack` instead of its sub-slice.)
1200     /// let expected = None;
1201     /// re.try_search(
1202     ///     &mut cache, &Input::new(haystack).range(3..6), &mut caps,
1203     /// )?;
1204     /// assert_eq!(expected, caps.get_match());
1205     ///
1206     /// # Ok::<(), Box<dyn std::error::Error>>(())
1207     /// ```
1208     #[inline]
try_search( &self, cache: &mut Cache, input: &Input<'_>, caps: &mut Captures, ) -> Result<(), MatchError>1209     pub fn try_search(
1210         &self,
1211         cache: &mut Cache,
1212         input: &Input<'_>,
1213         caps: &mut Captures,
1214     ) -> Result<(), MatchError> {
1215         caps.set_pattern(None);
1216         let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1217         caps.set_pattern(pid);
1218         Ok(())
1219     }
1220 
1221     /// Executes a leftmost forward search and writes the spans of capturing
1222     /// groups that participated in a match into the provided `slots`, and
1223     /// returns the matching pattern ID. The contents of the slots for patterns
1224     /// other than the matching pattern are unspecified. If no match was found,
1225     /// then `None` is returned and the contents of all `slots` is unspecified.
1226     ///
1227     /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw
1228     /// slots slice instead of a `Captures` value. This is useful in contexts
1229     /// where you don't want or need to allocate a `Captures`.
1230     ///
1231     /// It is legal to pass _any_ number of slots to this routine. If the regex
1232     /// engine would otherwise write a slot offset that doesn't fit in the
1233     /// provided slice, then it is simply skipped. In general though, there are
1234     /// usually three slice lengths you might want to use:
1235     ///
1236     /// * An empty slice, if you only care about which pattern matched.
1237     /// * A slice with
1238     /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
1239     /// slots, if you only care about the overall match spans for each matching
1240     /// pattern.
1241     /// * A slice with
1242     /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1243     /// permits recording match offsets for every capturing group in every
1244     /// pattern.
1245     ///
1246     /// # Errors
1247     ///
1248     /// This routine only errors if the search could not complete. For this
1249     /// backtracking regex engine, this only occurs when the haystack length
1250     /// exceeds [`BoundedBacktracker::max_haystack_len`].
1251     ///
1252     /// When a search cannot complete, callers cannot know whether a match
1253     /// exists or not.
1254     ///
1255     /// # Example
1256     ///
1257     /// This example shows how to find the overall match offsets in a
1258     /// multi-pattern search without allocating a `Captures` value. Indeed, we
1259     /// can put our slots right on the stack.
1260     ///
1261     /// ```
1262     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1263     /// use regex_automata::{
1264     ///     nfa::thompson::backtrack::BoundedBacktracker,
1265     ///     PatternID, Input,
1266     /// };
1267     ///
1268     /// let re = BoundedBacktracker::new_many(&[
1269     ///     r"\pL+",
1270     ///     r"\d+",
1271     /// ])?;
1272     /// let mut cache = re.create_cache();
1273     /// let input = Input::new("!@#123");
1274     ///
1275     /// // We only care about the overall match offsets here, so we just
1276     /// // allocate two slots for each pattern. Each slot records the start
1277     /// // and end of the match.
1278     /// let mut slots = [None; 4];
1279     /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1280     /// assert_eq!(Some(PatternID::must(1)), pid);
1281     ///
1282     /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1283     /// // See 'GroupInfo' for more details on the mapping between groups and
1284     /// // slot indices.
1285     /// let slot_start = pid.unwrap().as_usize() * 2;
1286     /// let slot_end = slot_start + 1;
1287     /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
1288     /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
1289     ///
1290     /// # Ok::<(), Box<dyn std::error::Error>>(())
1291     /// ```
1292     #[inline]
try_search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Result<Option<PatternID>, MatchError>1293     pub fn try_search_slots(
1294         &self,
1295         cache: &mut Cache,
1296         input: &Input<'_>,
1297         slots: &mut [Option<NonMaxUsize>],
1298     ) -> Result<Option<PatternID>, MatchError> {
1299         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1300         if !utf8empty {
1301             let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1302             return Ok(maybe_hm.map(|hm| hm.pattern()));
1303         }
1304         // See PikeVM::try_search_slots for why we do this.
1305         let min = self.get_nfa().group_info().implicit_slot_len();
1306         if slots.len() >= min {
1307             let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1308             return Ok(maybe_hm.map(|hm| hm.pattern()));
1309         }
1310         if self.get_nfa().pattern_len() == 1 {
1311             let mut enough = [None, None];
1312             let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1313             // This is OK because we know `enough_slots` is strictly bigger
1314             // than `slots`, otherwise this special case isn't reached.
1315             slots.copy_from_slice(&enough[..slots.len()]);
1316             return Ok(got.map(|hm| hm.pattern()));
1317         }
1318         let mut enough = vec![None; min];
1319         let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1320         // This is OK because we know `enough_slots` is strictly bigger than
1321         // `slots`, otherwise this special case isn't reached.
1322         slots.copy_from_slice(&enough[..slots.len()]);
1323         Ok(got.map(|hm| hm.pattern()))
1324     }
1325 
1326     /// This is the actual implementation of `try_search_slots_imp` that
1327     /// doesn't account for the special case when 1) the NFA has UTF-8 mode
1328     /// enabled, 2) the NFA can match the empty string and 3) the caller has
1329     /// provided an insufficient number of slots to record match offsets.
1330     #[inline(never)]
try_search_slots_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Result<Option<HalfMatch>, MatchError>1331     fn try_search_slots_imp(
1332         &self,
1333         cache: &mut Cache,
1334         input: &Input<'_>,
1335         slots: &mut [Option<NonMaxUsize>],
1336     ) -> Result<Option<HalfMatch>, MatchError> {
1337         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1338         let hm = match self.search_imp(cache, input, slots)? {
1339             None => return Ok(None),
1340             Some(hm) if !utf8empty => return Ok(Some(hm)),
1341             Some(hm) => hm,
1342         };
1343         empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1344             Ok(self
1345                 .search_imp(cache, input, slots)?
1346                 .map(|hm| (hm, hm.offset())))
1347         })
1348     }
1349 
1350     /// The implementation of standard leftmost backtracking search.
1351     ///
1352     /// Capturing group spans are written to 'caps', but only if requested.
1353     /// 'caps' can be one of three things: 1) totally empty, in which case, we
1354     /// only report the pattern that matched or 2) only has slots for recording
1355     /// the overall match offsets for any pattern or 3) has all slots available
1356     /// for recording the spans of any groups participating in a match.
search_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Result<Option<HalfMatch>, MatchError>1357     fn search_imp(
1358         &self,
1359         cache: &mut Cache,
1360         input: &Input<'_>,
1361         slots: &mut [Option<NonMaxUsize>],
1362     ) -> Result<Option<HalfMatch>, MatchError> {
1363         // Unlike in the PikeVM, we write our capturing group spans directly
1364         // into the caller's captures groups. So we have to make sure we're
1365         // starting with a blank slate first. In the PikeVM, we avoid this
1366         // by construction: the spans that are copied to every slot in the
1367         // 'Captures' value already account for presence/absence. In this
1368         // backtracker, we write directly into the caller provided slots, where
1369         // as in the PikeVM, we write into scratch space first and only copy
1370         // them to the caller provided slots when a match is found.
1371         for slot in slots.iter_mut() {
1372             *slot = None;
1373         }
1374         cache.setup_search(&self, input)?;
1375         if input.is_done() {
1376             return Ok(None);
1377         }
1378         let (anchored, start_id) = match input.get_anchored() {
1379             // Only way we're unanchored is if both the caller asked for an
1380             // unanchored search *and* the pattern is itself not anchored.
1381             Anchored::No => (
1382                 self.nfa.is_always_start_anchored(),
1383                 // We always use the anchored starting state here, even if
1384                 // doing an unanchored search. The "unanchored" part of it is
1385                 // implemented in the loop below, by simply trying the next
1386                 // byte offset if the previous backtracking exploration failed.
1387                 self.nfa.start_anchored(),
1388             ),
1389             Anchored::Yes => (true, self.nfa.start_anchored()),
1390             Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) {
1391                 None => return Ok(None),
1392                 Some(sid) => (true, sid),
1393             },
1394         };
1395         if anchored {
1396             let at = input.start();
1397             return Ok(self.backtrack(cache, input, at, start_id, slots));
1398         }
1399         let pre = self.get_config().get_prefilter();
1400         let mut at = input.start();
1401         while at <= input.end() {
1402             if let Some(ref pre) = pre {
1403                 let span = Span::from(at..input.end());
1404                 match pre.find(input.haystack(), span) {
1405                     None => break,
1406                     Some(ref span) => at = span.start,
1407                 }
1408             }
1409             if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
1410             {
1411                 return Ok(Some(hm));
1412             }
1413             at += 1;
1414         }
1415         Ok(None)
1416     }
1417 
1418     /// Look for a match starting at `at` in `input` and write the matching
1419     /// pattern ID and group spans to `caps`. The search uses `start_id` as its
1420     /// starting state in the underlying NFA.
1421     ///
1422     /// If no match was found, then the caller should increment `at` and try
1423     /// at the next position.
1424     #[cfg_attr(feature = "perf-inline", inline(always))]
backtrack( &self, cache: &mut Cache, input: &Input<'_>, at: usize, start_id: StateID, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch>1425     fn backtrack(
1426         &self,
1427         cache: &mut Cache,
1428         input: &Input<'_>,
1429         at: usize,
1430         start_id: StateID,
1431         slots: &mut [Option<NonMaxUsize>],
1432     ) -> Option<HalfMatch> {
1433         cache.stack.push(Frame::Step { sid: start_id, at });
1434         while let Some(frame) = cache.stack.pop() {
1435             match frame {
1436                 Frame::Step { sid, at } => {
1437                     if let Some(hm) = self.step(cache, input, sid, at, slots) {
1438                         return Some(hm);
1439                     }
1440                 }
1441                 Frame::RestoreCapture { slot, offset } => {
1442                     slots[slot] = offset;
1443                 }
1444             }
1445         }
1446         None
1447     }
1448 
1449     // LAMENTATION: The actual backtracking search is implemented in about
1450     // 75 lines below. Yet this file is over 2,000 lines long. What have I
1451     // done?
1452 
1453     /// Execute a "step" in the backtracing algorithm.
1454     ///
1455     /// A "step" is somewhat of a misnomer, because this routine keeps going
1456     /// until it either runs out of things to try or fins a match. In the
1457     /// former case, it may have pushed some things on to the backtracking
1458     /// stack, in which case, those will be tried next as part of the
1459     /// 'backtrack' routine above.
1460     #[cfg_attr(feature = "perf-inline", inline(always))]
step( &self, cache: &mut Cache, input: &Input<'_>, mut sid: StateID, mut at: usize, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch>1461     fn step(
1462         &self,
1463         cache: &mut Cache,
1464         input: &Input<'_>,
1465         mut sid: StateID,
1466         mut at: usize,
1467         slots: &mut [Option<NonMaxUsize>],
1468     ) -> Option<HalfMatch> {
1469         loop {
1470             if !cache.visited.insert(sid, at - input.start()) {
1471                 return None;
1472             }
1473             match *self.nfa.state(sid) {
1474                 State::ByteRange { ref trans } => {
1475                     // Why do we need this? Unlike other regex engines in this
1476                     // crate, the backtracker can steam roll ahead in the
1477                     // haystack outside of the main loop over the bytes in the
1478                     // haystack. While 'trans.matches()' below handles the case
1479                     // of 'at' being out of bounds of 'input.haystack()', we
1480                     // also need to handle the case of 'at' going out of bounds
1481                     // of the span the caller asked to search.
1482                     //
1483                     // We should perhaps make the 'trans.matches()' API accept
1484                     // an '&Input' instead of a '&[u8]'. Or at least, add a new
1485                     // API that does it.
1486                     if at >= input.end() {
1487                         return None;
1488                     }
1489                     if !trans.matches(input.haystack(), at) {
1490                         return None;
1491                     }
1492                     sid = trans.next;
1493                     at += 1;
1494                 }
1495                 State::Sparse(ref sparse) => {
1496                     if at >= input.end() {
1497                         return None;
1498                     }
1499                     sid = sparse.matches(input.haystack(), at)?;
1500                     at += 1;
1501                 }
1502                 State::Dense(ref dense) => {
1503                     if at >= input.end() {
1504                         return None;
1505                     }
1506                     sid = dense.matches(input.haystack(), at)?;
1507                     at += 1;
1508                 }
1509                 State::Look { look, next } => {
1510                     // OK because we don't permit building a searcher with a
1511                     // Unicode word boundary if the requisite Unicode data is
1512                     // unavailable.
1513                     if !self.nfa.look_matcher().matches_inline(
1514                         look,
1515                         input.haystack(),
1516                         at,
1517                     ) {
1518                         return None;
1519                     }
1520                     sid = next;
1521                 }
1522                 State::Union { ref alternates } => {
1523                     sid = match alternates.get(0) {
1524                         None => return None,
1525                         Some(&sid) => sid,
1526                     };
1527                     cache.stack.extend(
1528                         alternates[1..]
1529                             .iter()
1530                             .copied()
1531                             .rev()
1532                             .map(|sid| Frame::Step { sid, at }),
1533                     );
1534                 }
1535                 State::BinaryUnion { alt1, alt2 } => {
1536                     sid = alt1;
1537                     cache.stack.push(Frame::Step { sid: alt2, at });
1538                 }
1539                 State::Capture { next, slot, .. } => {
1540                     if slot.as_usize() < slots.len() {
1541                         cache.stack.push(Frame::RestoreCapture {
1542                             slot,
1543                             offset: slots[slot],
1544                         });
1545                         slots[slot] = NonMaxUsize::new(at);
1546                     }
1547                     sid = next;
1548                 }
1549                 State::Fail => return None,
1550                 State::Match { pattern_id } => {
1551                     return Some(HalfMatch::new(pattern_id, at));
1552                 }
1553             }
1554         }
1555     }
1556 }
1557 
1558 /// An iterator over all non-overlapping matches for a fallible search.
1559 ///
1560 /// The iterator yields a `Result<Match, MatchError` value until no more
1561 /// matches could be found.
1562 ///
1563 /// The lifetime parameters are as follows:
1564 ///
1565 /// * `'r` represents the lifetime of the BoundedBacktracker.
1566 /// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1567 /// * `'h` represents the lifetime of the haystack being searched.
1568 ///
1569 /// This iterator can be created with the [`BoundedBacktracker::try_find_iter`]
1570 /// method.
1571 #[derive(Debug)]
1572 pub struct TryFindMatches<'r, 'c, 'h> {
1573     re: &'r BoundedBacktracker,
1574     cache: &'c mut Cache,
1575     caps: Captures,
1576     it: iter::Searcher<'h>,
1577 }
1578 
1579 impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
1580     type Item = Result<Match, MatchError>;
1581 
1582     #[inline]
next(&mut self) -> Option<Result<Match, MatchError>>1583     fn next(&mut self) -> Option<Result<Match, MatchError>> {
1584         // Splitting 'self' apart seems necessary to appease borrowck.
1585         let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1586             *self;
1587         it.try_advance(|input| {
1588             re.try_search(cache, input, caps)?;
1589             Ok(caps.get_match())
1590         })
1591         .transpose()
1592     }
1593 }
1594 
1595 /// An iterator over all non-overlapping leftmost matches, with their capturing
1596 /// groups, for a fallible search.
1597 ///
1598 /// The iterator yields a `Result<Captures, MatchError>` value until no more
1599 /// matches could be found.
1600 ///
1601 /// The lifetime parameters are as follows:
1602 ///
1603 /// * `'r` represents the lifetime of the BoundedBacktracker.
1604 /// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1605 /// * `'h` represents the lifetime of the haystack being searched.
1606 ///
1607 /// This iterator can be created with the
1608 /// [`BoundedBacktracker::try_captures_iter`] method.
1609 #[derive(Debug)]
1610 pub struct TryCapturesMatches<'r, 'c, 'h> {
1611     re: &'r BoundedBacktracker,
1612     cache: &'c mut Cache,
1613     caps: Captures,
1614     it: iter::Searcher<'h>,
1615 }
1616 
1617 impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> {
1618     type Item = Result<Captures, MatchError>;
1619 
1620     #[inline]
next(&mut self) -> Option<Result<Captures, MatchError>>1621     fn next(&mut self) -> Option<Result<Captures, MatchError>> {
1622         // Splitting 'self' apart seems necessary to appease borrowck.
1623         let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1624             *self;
1625         let _ = it
1626             .try_advance(|input| {
1627                 re.try_search(cache, input, caps)?;
1628                 Ok(caps.get_match())
1629             })
1630             .transpose()?;
1631         if caps.is_match() {
1632             Some(Ok(caps.clone()))
1633         } else {
1634             None
1635         }
1636     }
1637 }
1638 
1639 /// A cache represents mutable state that a [`BoundedBacktracker`] requires
1640 /// during a search.
1641 ///
1642 /// For a given [`BoundedBacktracker`], its corresponding cache may be created
1643 /// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`].
1644 /// They are equivalent in every way, except the former does not require
1645 /// explicitly importing `Cache`.
1646 ///
1647 /// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which
1648 /// it was created. It may only be used with that `BoundedBacktracker`. A cache
1649 /// and its allocations may be re-purposed via [`Cache::reset`], in which case,
1650 /// it can only be used with the new `BoundedBacktracker` (and not the old
1651 /// one).
1652 #[derive(Clone, Debug)]
1653 pub struct Cache {
1654     /// Stack used on the heap for doing backtracking instead of the
1655     /// traditional recursive approach. We don't want recursion because then
1656     /// we're likely to hit a stack overflow for bigger regexes.
1657     stack: Vec<Frame>,
1658     /// The set of (StateID, HaystackOffset) pairs that have been visited
1659     /// by the backtracker within a single search. If such a pair has been
1660     /// visited, then we avoid doing the work for that pair again. This is
1661     /// what "bounds" the backtracking and prevents it from having worst case
1662     /// exponential time.
1663     visited: Visited,
1664 }
1665 
1666 impl Cache {
1667     /// Create a new [`BoundedBacktracker`] cache.
1668     ///
1669     /// A potentially more convenient routine to create a cache is
1670     /// [`BoundedBacktracker::create_cache`], as it does not require also
1671     /// importing the `Cache` type.
1672     ///
1673     /// If you want to reuse the returned `Cache` with some other
1674     /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the
1675     /// desired `BoundedBacktracker`.
new(re: &BoundedBacktracker) -> Cache1676     pub fn new(re: &BoundedBacktracker) -> Cache {
1677         Cache { stack: vec![], visited: Visited::new(re) }
1678     }
1679 
1680     /// Reset this cache such that it can be used for searching with different
1681     /// [`BoundedBacktracker`].
1682     ///
1683     /// A cache reset permits reusing memory already allocated in this cache
1684     /// with a different `BoundedBacktracker`.
1685     ///
1686     /// # Example
1687     ///
1688     /// This shows how to re-purpose a cache for use with a different
1689     /// `BoundedBacktracker`.
1690     ///
1691     /// ```
1692     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1693     /// use regex_automata::{
1694     ///     nfa::thompson::backtrack::BoundedBacktracker,
1695     ///     Match,
1696     /// };
1697     ///
1698     /// let re1 = BoundedBacktracker::new(r"\w")?;
1699     /// let re2 = BoundedBacktracker::new(r"\W")?;
1700     ///
1701     /// let mut cache = re1.create_cache();
1702     /// assert_eq!(
1703     ///     Some(Ok(Match::must(0, 0..2))),
1704     ///     re1.try_find_iter(&mut cache, "Δ").next(),
1705     /// );
1706     ///
1707     /// // Using 'cache' with re2 is not allowed. It may result in panics or
1708     /// // incorrect results. In order to re-purpose the cache, we must reset
1709     /// // it with the BoundedBacktracker we'd like to use it with.
1710     /// //
1711     /// // Similarly, after this reset, using the cache with 're1' is also not
1712     /// // allowed.
1713     /// cache.reset(&re2);
1714     /// assert_eq!(
1715     ///     Some(Ok(Match::must(0, 0..3))),
1716     ///     re2.try_find_iter(&mut cache, "☃").next(),
1717     /// );
1718     ///
1719     /// # Ok::<(), Box<dyn std::error::Error>>(())
1720     /// ```
reset(&mut self, re: &BoundedBacktracker)1721     pub fn reset(&mut self, re: &BoundedBacktracker) {
1722         self.visited.reset(re);
1723     }
1724 
1725     /// Returns the heap memory usage, in bytes, of this cache.
1726     ///
1727     /// This does **not** include the stack size used up by this cache. To
1728     /// compute that, use `std::mem::size_of::<Cache>()`.
memory_usage(&self) -> usize1729     pub fn memory_usage(&self) -> usize {
1730         self.stack.len() * core::mem::size_of::<Frame>()
1731             + self.visited.memory_usage()
1732     }
1733 
1734     /// Clears this cache. This should be called at the start of every search
1735     /// to ensure we start with a clean slate.
1736     ///
1737     /// This also sets the length of the capturing groups used in the current
1738     /// search. This permits an optimization where by 'SlotTable::for_state'
1739     /// only returns the number of slots equivalent to the number of slots
1740     /// given in the 'Captures' value. This may be less than the total number
1741     /// of possible slots, e.g., when one only wants to track overall match
1742     /// offsets. This in turn permits less copying of capturing group spans
1743     /// in the BoundedBacktracker.
setup_search( &mut self, re: &BoundedBacktracker, input: &Input<'_>, ) -> Result<(), MatchError>1744     fn setup_search(
1745         &mut self,
1746         re: &BoundedBacktracker,
1747         input: &Input<'_>,
1748     ) -> Result<(), MatchError> {
1749         self.stack.clear();
1750         self.visited.setup_search(re, input)?;
1751         Ok(())
1752     }
1753 }
1754 
1755 /// Represents a stack frame on the heap while doing backtracking.
1756 ///
1757 /// Instead of using explicit recursion for backtracking, we use a stack on
1758 /// the heap to keep track of things that we want to explore if the current
1759 /// backtracking branch turns out to not lead to a match.
1760 #[derive(Clone, Debug)]
1761 enum Frame {
1762     /// Look for a match starting at `sid` and the given position in the
1763     /// haystack.
1764     Step { sid: StateID, at: usize },
1765     /// Reset the given `slot` to the given `offset` (which might be `None`).
1766     /// This effectively gives a "scope" to capturing groups, such that an
1767     /// offset for a particular group only gets returned if the match goes
1768     /// through that capturing group. If backtracking ends up going down a
1769     /// different branch that results in a different offset (or perhaps none at
1770     /// all), then this "restore capture" frame will cause the offset to get
1771     /// reset.
1772     RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
1773 }
1774 
1775 /// A bitset that keeps track of whether a particular (StateID, offset) has
1776 /// been considered during backtracking. If it has already been visited, then
1777 /// backtracking skips it. This is what gives backtracking its "bound."
1778 #[derive(Clone, Debug)]
1779 struct Visited {
1780     /// The actual underlying bitset. Each element in the bitset corresponds
1781     /// to a particular (StateID, offset) pair. States correspond to the rows
1782     /// and the offsets correspond to the columns.
1783     ///
1784     /// If our underlying NFA has N states and the haystack we're searching
1785     /// has M bytes, then we have N*(M+1) entries in our bitset table. The
1786     /// M+1 occurs because our matches are delayed by one byte (to support
1787     /// look-around), and so we need to handle the end position itself rather
1788     /// than stopping just before the end. (If there is no end position, then
1789     /// it's treated as "end-of-input," which is matched by things like '$'.)
1790     ///
1791     /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize))
1792     /// blocks.
1793     ///
1794     /// We use 'usize' to represent our blocks because it makes some of the
1795     /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for
1796     /// our block, we'd either need to cast u32s to usizes or usizes to u32s.
1797     bitset: Vec<usize>,
1798     /// The stride represents one plus length of the haystack we're searching
1799     /// (as described above). The stride must be initialized for each search.
1800     stride: usize,
1801 }
1802 
1803 impl Visited {
1804     /// The size of each block, in bits.
1805     const BLOCK_SIZE: usize = 8 * core::mem::size_of::<usize>();
1806 
1807     /// Create a new visited set for the given backtracker.
1808     ///
1809     /// The set is ready to use, but must be setup at the beginning of each
1810     /// search by calling `setup_search`.
new(re: &BoundedBacktracker) -> Visited1811     fn new(re: &BoundedBacktracker) -> Visited {
1812         let mut visited = Visited { bitset: vec![], stride: 0 };
1813         visited.reset(re);
1814         visited
1815     }
1816 
1817     /// Insert the given (StateID, offset) pair into this set. If it already
1818     /// exists, then this is a no-op and it returns false. Otherwise this
1819     /// returns true.
insert(&mut self, sid: StateID, at: usize) -> bool1820     fn insert(&mut self, sid: StateID, at: usize) -> bool {
1821         let table_index = sid.as_usize() * self.stride + at;
1822         let block_index = table_index / Visited::BLOCK_SIZE;
1823         let bit = table_index % Visited::BLOCK_SIZE;
1824         let block_with_bit = 1 << bit;
1825         if self.bitset[block_index] & block_with_bit != 0 {
1826             return false;
1827         }
1828         self.bitset[block_index] |= block_with_bit;
1829         true
1830     }
1831 
1832     /// Reset this visited set to work with the given bounded backtracker.
reset(&mut self, _: &BoundedBacktracker)1833     fn reset(&mut self, _: &BoundedBacktracker) {
1834         self.bitset.truncate(0);
1835     }
1836 
1837     /// Setup this visited set to work for a search using the given NFA
1838     /// and input configuration. The NFA must be the same NFA used by the
1839     /// BoundedBacktracker given to Visited::reset. Failing to call this might
1840     /// result in panics or silently incorrect search behavior.
setup_search( &mut self, re: &BoundedBacktracker, input: &Input<'_>, ) -> Result<(), MatchError>1841     fn setup_search(
1842         &mut self,
1843         re: &BoundedBacktracker,
1844         input: &Input<'_>,
1845     ) -> Result<(), MatchError> {
1846         // Our haystack length is only the length of the span of the entire
1847         // haystack that we'll be searching.
1848         let haylen = input.get_span().len();
1849         let err = || MatchError::haystack_too_long(haylen);
1850         // Our stride is one more than the length of the input because our main
1851         // search loop includes the position at input.end(). (And it does this
1852         // because matches are delayed by one byte to account for look-around.)
1853         self.stride = haylen + 1;
1854         let needed_capacity =
1855             match re.get_nfa().states().len().checked_mul(self.stride) {
1856                 None => return Err(err()),
1857                 Some(capacity) => capacity,
1858             };
1859         let max_capacity = 8 * re.get_config().get_visited_capacity();
1860         if needed_capacity > max_capacity {
1861             return Err(err());
1862         }
1863         let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE);
1864         self.bitset.truncate(needed_blocks);
1865         for block in self.bitset.iter_mut() {
1866             *block = 0;
1867         }
1868         if needed_blocks > self.bitset.len() {
1869             self.bitset.resize(needed_blocks, 0);
1870         }
1871         Ok(())
1872     }
1873 
1874     /// Return the heap memory usage, in bytes, of this visited set.
memory_usage(&self) -> usize1875     fn memory_usage(&self) -> usize {
1876         self.bitset.len() * core::mem::size_of::<usize>()
1877     }
1878 }
1879 
1880 /// Integer division, but rounds up instead of down.
div_ceil(lhs: usize, rhs: usize) -> usize1881 fn div_ceil(lhs: usize, rhs: usize) -> usize {
1882     if lhs % rhs == 0 {
1883         lhs / rhs
1884     } else {
1885         (lhs / rhs) + 1
1886     }
1887 }
1888 
1889 #[cfg(test)]
1890 mod tests {
1891     use super::*;
1892 
1893     // This is a regression test for the maximum haystack length computation.
1894     // Previously, it assumed that the total capacity of the backtracker's
1895     // bitset would always be greater than the number of NFA states. But there
1896     // is of course no guarantee that this is true. This regression test
1897     // ensures that not only does `max_haystack_len` not panic, but that it
1898     // should return `0`.
1899     #[cfg(feature = "syntax")]
1900     #[test]
max_haystack_len_overflow()1901     fn max_haystack_len_overflow() {
1902         let re = BoundedBacktracker::builder()
1903             .configure(BoundedBacktracker::config().visited_capacity(10))
1904             .build(r"[0-9A-Za-z]{100}")
1905             .unwrap();
1906         assert_eq!(0, re.max_haystack_len());
1907     }
1908 }
1909