1 /*!
2 An NFA backed bounded backtracker for executing regex searches with capturing
3 groups.
4
5 This module provides a [`BoundedBacktracker`] that works by simulating an NFA
6 using the classical backtracking algorithm with a twist: it avoids redoing
7 work that it has done before and thereby avoids worst case exponential time.
8 In exchange, it can only be used on "short" haystacks. Its advantage is that
9 is can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases
10 because it does less book-keeping.
11 */
12
13 use alloc::{vec, vec::Vec};
14
15 use crate::{
16 nfa::thompson::{self, BuildError, State, NFA},
17 util::{
18 captures::Captures,
19 empty, iter,
20 prefilter::Prefilter,
21 primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
22 search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
23 },
24 };
25
26 /// Returns the minimum visited capacity for the given haystack.
27 ///
28 /// This function can be used as the argument to [`Config::visited_capacity`]
29 /// in order to guarantee that a backtracking search for the given `input`
30 /// won't return an error when using a [`BoundedBacktracker`] built from the
31 /// given `NFA`.
32 ///
33 /// This routine exists primarily as a way to test that the bounded backtracker
34 /// works correctly when its capacity is set to the smallest possible amount.
35 /// Still, it may be useful in cases where you know you want to use the bounded
36 /// backtracker for a specific input, and just need to know what visited
37 /// capacity to provide to make it work.
38 ///
39 /// Be warned that this number could be quite large as it is multiplicative in
40 /// the size the given NFA and haystack.
min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize41 pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize {
42 div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8)
43 }
44
45 /// The configuration used for building a bounded backtracker.
46 ///
47 /// A bounded backtracker configuration is a simple data object that is
48 /// typically used with [`Builder::configure`].
49 #[derive(Clone, Debug, Default)]
50 pub struct Config {
51 pre: Option<Option<Prefilter>>,
52 visited_capacity: Option<usize>,
53 }
54
55 impl Config {
56 /// Return a new default regex configuration.
new() -> Config57 pub fn new() -> Config {
58 Config::default()
59 }
60
61 /// Set a prefilter to be used whenever a start state is entered.
62 ///
63 /// A [`Prefilter`] in this context is meant to accelerate searches by
64 /// looking for literal prefixes that every match for the corresponding
65 /// pattern (or patterns) must start with. Once a prefilter produces a
66 /// match, the underlying search routine continues on to try and confirm
67 /// the match.
68 ///
69 /// Be warned that setting a prefilter does not guarantee that the search
70 /// will be faster. While it's usually a good bet, if the prefilter
71 /// produces a lot of false positive candidates (i.e., positions matched
72 /// by the prefilter but not by the regex), then the overall result can
73 /// be slower than if you had just executed the regex engine without any
74 /// prefilters.
75 ///
76 /// By default no prefilter is set.
77 ///
78 /// # Example
79 ///
80 /// ```
81 /// use regex_automata::{
82 /// nfa::thompson::backtrack::BoundedBacktracker,
83 /// util::prefilter::Prefilter,
84 /// Input, Match, MatchKind,
85 /// };
86 ///
87 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
88 /// let re = BoundedBacktracker::builder()
89 /// .configure(BoundedBacktracker::config().prefilter(pre))
90 /// .build(r"(foo|bar)[a-z]+")?;
91 /// let mut cache = re.create_cache();
92 /// let input = Input::new("foo1 barfox bar");
93 /// assert_eq!(
94 /// Some(Match::must(0, 5..11)),
95 /// re.try_find(&mut cache, input)?,
96 /// );
97 ///
98 /// # Ok::<(), Box<dyn std::error::Error>>(())
99 /// ```
100 ///
101 /// Be warned though that an incorrect prefilter can lead to incorrect
102 /// results!
103 ///
104 /// ```
105 /// use regex_automata::{
106 /// nfa::thompson::backtrack::BoundedBacktracker,
107 /// util::prefilter::Prefilter,
108 /// Input, HalfMatch, MatchKind,
109 /// };
110 ///
111 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
112 /// let re = BoundedBacktracker::builder()
113 /// .configure(BoundedBacktracker::config().prefilter(pre))
114 /// .build(r"(foo|bar)[a-z]+")?;
115 /// let mut cache = re.create_cache();
116 /// let input = Input::new("foo1 barfox bar");
117 /// // No match reported even though there clearly is one!
118 /// assert_eq!(None, re.try_find(&mut cache, input)?);
119 ///
120 /// # Ok::<(), Box<dyn std::error::Error>>(())
121 /// ```
prefilter(mut self, pre: Option<Prefilter>) -> Config122 pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
123 self.pre = Some(pre);
124 self
125 }
126
127 /// Set the visited capacity used to bound backtracking.
128 ///
129 /// The visited capacity represents the amount of heap memory (in bytes) to
130 /// allocate toward tracking which parts of the backtracking search have
131 /// been done before. The heap memory needed for any particular search is
132 /// proportional to `haystack.len() * nfa.states().len()`, which an be
133 /// quite large. Therefore, the bounded backtracker is typically only able
134 /// to run on shorter haystacks.
135 ///
136 /// For a given regex, increasing the visited capacity means that the
137 /// maximum haystack length that can be searched is increased. The
138 /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum.
139 ///
140 /// The default capacity is a reasonable but empirically chosen size.
141 ///
142 /// # Example
143 ///
144 /// As with other regex engines, Unicode is what tends to make the bounded
145 /// backtracker less useful by making the maximum haystack length quite
146 /// small. If necessary, increasing the visited capacity using this routine
147 /// will increase the maximum haystack length at the cost of using more
148 /// memory.
149 ///
150 /// Note though that the specific maximum values here are not an API
151 /// guarantee. The default visited capacity is subject to change and not
152 /// covered by semver.
153 ///
154 /// ```
155 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
156 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
157 ///
158 /// // Unicode inflates the size of the underlying NFA quite a bit, and
159 /// // thus means that the backtracker can only handle smaller haystacks,
160 /// // assuming that the visited capacity remains unchanged.
161 /// let re = BoundedBacktracker::new(r"\w+")?;
162 /// assert!(re.max_haystack_len() <= 7_000);
163 /// // But we can increase the visited capacity to handle bigger haystacks!
164 /// let re = BoundedBacktracker::builder()
165 /// .configure(BoundedBacktracker::config().visited_capacity(1<<20))
166 /// .build(r"\w+")?;
167 /// assert!(re.max_haystack_len() >= 25_000);
168 /// assert!(re.max_haystack_len() <= 28_000);
169 /// # Ok::<(), Box<dyn std::error::Error>>(())
170 /// ```
visited_capacity(mut self, capacity: usize) -> Config171 pub fn visited_capacity(mut self, capacity: usize) -> Config {
172 self.visited_capacity = Some(capacity);
173 self
174 }
175
176 /// Returns the prefilter set in this configuration, if one at all.
get_prefilter(&self) -> Option<&Prefilter>177 pub fn get_prefilter(&self) -> Option<&Prefilter> {
178 self.pre.as_ref().unwrap_or(&None).as_ref()
179 }
180
181 /// Returns the configured visited capacity.
182 ///
183 /// Note that the actual capacity used may be slightly bigger than the
184 /// configured capacity.
get_visited_capacity(&self) -> usize185 pub fn get_visited_capacity(&self) -> usize {
186 const DEFAULT: usize = 256 * (1 << 10); // 256 KB
187 self.visited_capacity.unwrap_or(DEFAULT)
188 }
189
190 /// Overwrite the default configuration such that the options in `o` are
191 /// always used. If an option in `o` is not set, then the corresponding
192 /// option in `self` is used. If it's not set in `self` either, then it
193 /// remains not set.
overwrite(&self, o: Config) -> Config194 pub(crate) fn overwrite(&self, o: Config) -> Config {
195 Config {
196 pre: o.pre.or_else(|| self.pre.clone()),
197 visited_capacity: o.visited_capacity.or(self.visited_capacity),
198 }
199 }
200 }
201
202 /// A builder for a bounded backtracker.
203 ///
204 /// This builder permits configuring options for the syntax of a pattern, the
205 /// NFA construction and the `BoundedBacktracker` construction. This builder
206 /// is different from a general purpose regex builder in that it permits fine
207 /// grain configuration of the construction process. The trade off for this is
208 /// complexity, and the possibility of setting a configuration that might not
209 /// make sense. For example, there are two different UTF-8 modes:
210 ///
211 /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
212 /// whether the pattern itself can contain sub-expressions that match invalid
213 /// UTF-8.
214 /// * [`thompson::Config::utf8`] controls how the regex iterators themselves
215 /// advance the starting position of the next search when a match with zero
216 /// length is found.
217 ///
218 /// Generally speaking, callers will want to either enable all of these or
219 /// disable all of these.
220 ///
221 /// # Example
222 ///
223 /// This example shows how to disable UTF-8 mode in the syntax and the regex
224 /// itself. This is generally what you want for matching on arbitrary bytes.
225 ///
226 /// ```
227 /// use regex_automata::{
228 /// nfa::thompson::{self, backtrack::BoundedBacktracker},
229 /// util::syntax,
230 /// Match,
231 /// };
232 ///
233 /// let re = BoundedBacktracker::builder()
234 /// .syntax(syntax::Config::new().utf8(false))
235 /// .thompson(thompson::Config::new().utf8(false))
236 /// .build(r"foo(?-u:[^b])ar.*")?;
237 /// let mut cache = re.create_cache();
238 ///
239 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
240 /// let expected = Some(Ok(Match::must(0, 1..9)));
241 /// let got = re.try_find_iter(&mut cache, haystack).next();
242 /// assert_eq!(expected, got);
243 /// // Notice that `(?-u:[^b])` matches invalid UTF-8,
244 /// // but the subsequent `.*` does not! Disabling UTF-8
245 /// // on the syntax permits this.
246 /// //
247 /// // N.B. This example does not show the impact of
248 /// // disabling UTF-8 mode on a BoundedBacktracker Config, since that
249 /// // only impacts regexes that can produce matches of
250 /// // length 0.
251 /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]);
252 ///
253 /// # Ok::<(), Box<dyn std::error::Error>>(())
254 /// ```
255 #[derive(Clone, Debug)]
256 pub struct Builder {
257 config: Config,
258 #[cfg(feature = "syntax")]
259 thompson: thompson::Compiler,
260 }
261
262 impl Builder {
263 /// Create a new BoundedBacktracker builder with its default configuration.
new() -> Builder264 pub fn new() -> Builder {
265 Builder {
266 config: Config::default(),
267 #[cfg(feature = "syntax")]
268 thompson: thompson::Compiler::new(),
269 }
270 }
271
272 /// Build a `BoundedBacktracker` from the given pattern.
273 ///
274 /// If there was a problem parsing or compiling the pattern, then an error
275 /// is returned.
276 #[cfg(feature = "syntax")]
build( &self, pattern: &str, ) -> Result<BoundedBacktracker, BuildError>277 pub fn build(
278 &self,
279 pattern: &str,
280 ) -> Result<BoundedBacktracker, BuildError> {
281 self.build_many(&[pattern])
282 }
283
284 /// Build a `BoundedBacktracker` from the given patterns.
285 #[cfg(feature = "syntax")]
build_many<P: AsRef<str>>( &self, patterns: &[P], ) -> Result<BoundedBacktracker, BuildError>286 pub fn build_many<P: AsRef<str>>(
287 &self,
288 patterns: &[P],
289 ) -> Result<BoundedBacktracker, BuildError> {
290 let nfa = self.thompson.build_many(patterns)?;
291 self.build_from_nfa(nfa)
292 }
293
294 /// Build a `BoundedBacktracker` directly from its NFA.
295 ///
296 /// Note that when using this method, any configuration that applies to the
297 /// construction of the NFA itself will of course be ignored, since the NFA
298 /// given here is already built.
build_from_nfa( &self, nfa: NFA, ) -> Result<BoundedBacktracker, BuildError>299 pub fn build_from_nfa(
300 &self,
301 nfa: NFA,
302 ) -> Result<BoundedBacktracker, BuildError> {
303 nfa.look_set_any().available().map_err(BuildError::word)?;
304 Ok(BoundedBacktracker { config: self.config.clone(), nfa })
305 }
306
307 /// Apply the given `BoundedBacktracker` configuration options to this
308 /// builder.
configure(&mut self, config: Config) -> &mut Builder309 pub fn configure(&mut self, config: Config) -> &mut Builder {
310 self.config = self.config.overwrite(config);
311 self
312 }
313
314 /// Set the syntax configuration for this builder using
315 /// [`syntax::Config`](crate::util::syntax::Config).
316 ///
317 /// This permits setting things like case insensitivity, Unicode and multi
318 /// line mode.
319 ///
320 /// These settings only apply when constructing a `BoundedBacktracker`
321 /// directly from a pattern.
322 #[cfg(feature = "syntax")]
syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder323 pub fn syntax(
324 &mut self,
325 config: crate::util::syntax::Config,
326 ) -> &mut Builder {
327 self.thompson.syntax(config);
328 self
329 }
330
331 /// Set the Thompson NFA configuration for this builder using
332 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
333 ///
334 /// This permits setting things like if additional time should be spent
335 /// shrinking the size of the NFA.
336 ///
337 /// These settings only apply when constructing a `BoundedBacktracker`
338 /// directly from a pattern.
339 #[cfg(feature = "syntax")]
thompson(&mut self, config: thompson::Config) -> &mut Builder340 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
341 self.thompson.configure(config);
342 self
343 }
344 }
345
346 /// A backtracking regex engine that bounds its execution to avoid exponential
347 /// blow-up.
348 ///
349 /// This regex engine only implements leftmost-first match semantics and
350 /// only supports leftmost searches. It effectively does the same thing as a
351 /// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because
352 /// it doesn't have to worry about copying capturing group spans for most NFA
353 /// states. Instead, the backtracker can maintain one set of captures (provided
354 /// by the caller) and never needs to copy them. In exchange, the backtracker
355 /// bounds itself to ensure it doesn't exhibit worst case exponential time.
356 /// This results in the backtracker only being able to handle short haystacks
357 /// given reasonable memory usage.
358 ///
359 /// # Searches may return an error!
360 ///
361 /// By design, this backtracking regex engine is bounded. This bound is
362 /// implemented by not visiting any combination of NFA state ID and position
363 /// in a haystack more than once. Thus, the total memory required to bound
364 /// backtracking is proportional to `haystack.len() * nfa.states().len()`.
365 /// This can obviously get quite large, since large haystacks aren't terribly
366 /// uncommon. To avoid using exorbitant memory, the capacity is bounded by
367 /// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total
368 /// capacity required for a particular regex and a haystack exceeds this
369 /// capacity, then the search routine will return an error.
370 ///
371 /// Unlike other regex engines that may return an error at search time (like
372 /// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded
373 /// backtracker will work for every haystack. Therefore, this regex engine
374 /// _only_ exposes fallible search routines to avoid the footgun of panicking
375 /// when running a search on a haystack that is too big.
376 ///
377 /// If one wants to use the fallible search APIs without handling the
378 /// error, the only way to guarantee an error won't occur from the
379 /// haystack length is to ensure the haystack length does not exceed
380 /// [`BoundedBacktracker::max_haystack_len`].
381 ///
382 /// # Example: Unicode word boundaries
383 ///
384 /// This example shows that the bounded backtracker implements Unicode word
385 /// boundaries correctly by default.
386 ///
387 /// ```
388 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
389 /// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
390 ///
391 /// let re = BoundedBacktracker::new(r"\b\w+\b")?;
392 /// let mut cache = re.create_cache();
393 ///
394 /// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс");
395 /// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next());
396 /// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next());
397 /// assert_eq!(None, it.next());
398 /// # Ok::<(), Box<dyn std::error::Error>>(())
399 /// ```
400 ///
401 /// # Example: multiple regex patterns
402 ///
403 /// The bounded backtracker supports searching for multiple patterns
404 /// simultaneously, just like other regex engines. Note though that because it
405 /// uses a backtracking strategy, this regex engine is unlikely to scale well
406 /// as more patterns are added. But then again, as more patterns are added, the
407 /// maximum haystack length allowed will also shorten (assuming the visited
408 /// capacity remains invariant).
409 ///
410 /// ```
411 /// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
412 ///
413 /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
414 /// let mut cache = re.create_cache();
415 ///
416 /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
417 /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
418 /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
419 /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
420 /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
421 /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
422 /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
423 /// assert_eq!(None, it.next());
424 /// # Ok::<(), Box<dyn std::error::Error>>(())
425 /// ```
426 #[derive(Clone, Debug)]
427 pub struct BoundedBacktracker {
428 config: Config,
429 nfa: NFA,
430 }
431
432 impl BoundedBacktracker {
433 /// Parse the given regular expression using the default configuration and
434 /// return the corresponding `BoundedBacktracker`.
435 ///
436 /// If you want a non-default configuration, then use the [`Builder`] to
437 /// set your own configuration.
438 ///
439 /// # Example
440 ///
441 /// ```
442 /// use regex_automata::{
443 /// nfa::thompson::backtrack::BoundedBacktracker,
444 /// Match,
445 /// };
446 ///
447 /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
448 /// let mut cache = re.create_cache();
449 /// assert_eq!(
450 /// Some(Ok(Match::must(0, 3..14))),
451 /// re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(),
452 /// );
453 /// # Ok::<(), Box<dyn std::error::Error>>(())
454 /// ```
455 #[cfg(feature = "syntax")]
new(pattern: &str) -> Result<BoundedBacktracker, BuildError>456 pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> {
457 BoundedBacktracker::builder().build(pattern)
458 }
459
460 /// Like `new`, but parses multiple patterns into a single "multi regex."
461 /// This similarly uses the default regex configuration.
462 ///
463 /// # Example
464 ///
465 /// ```
466 /// use regex_automata::{
467 /// nfa::thompson::backtrack::BoundedBacktracker,
468 /// Match,
469 /// };
470 ///
471 /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
472 /// let mut cache = re.create_cache();
473 ///
474 /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
475 /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
476 /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
477 /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
478 /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
479 /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
480 /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
481 /// assert_eq!(None, it.next());
482 /// # Ok::<(), Box<dyn std::error::Error>>(())
483 /// ```
484 #[cfg(feature = "syntax")]
new_many<P: AsRef<str>>( patterns: &[P], ) -> Result<BoundedBacktracker, BuildError>485 pub fn new_many<P: AsRef<str>>(
486 patterns: &[P],
487 ) -> Result<BoundedBacktracker, BuildError> {
488 BoundedBacktracker::builder().build_many(patterns)
489 }
490
491 /// # Example
492 ///
493 /// This shows how to hand assemble a regular expression via its HIR,
494 /// compile an NFA from it and build a BoundedBacktracker from the NFA.
495 ///
496 /// ```
497 /// use regex_automata::{
498 /// nfa::thompson::{NFA, backtrack::BoundedBacktracker},
499 /// Match,
500 /// };
501 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
502 ///
503 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
504 /// ClassBytesRange::new(b'0', b'9'),
505 /// ClassBytesRange::new(b'A', b'Z'),
506 /// ClassBytesRange::new(b'_', b'_'),
507 /// ClassBytesRange::new(b'a', b'z'),
508 /// ])));
509 ///
510 /// let config = NFA::config().nfa_size_limit(Some(1_000));
511 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
512 ///
513 /// let re = BoundedBacktracker::new_from_nfa(nfa)?;
514 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
515 /// let expected = Some(Match::must(0, 3..4));
516 /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?;
517 /// assert_eq!(expected, caps.get_match());
518 ///
519 /// # Ok::<(), Box<dyn std::error::Error>>(())
520 /// ```
new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError>521 pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> {
522 BoundedBacktracker::builder().build_from_nfa(nfa)
523 }
524
525 /// Create a new `BoundedBacktracker` that matches every input.
526 ///
527 /// # Example
528 ///
529 /// ```
530 /// use regex_automata::{
531 /// nfa::thompson::backtrack::BoundedBacktracker,
532 /// Match,
533 /// };
534 ///
535 /// let re = BoundedBacktracker::always_match()?;
536 /// let mut cache = re.create_cache();
537 ///
538 /// let expected = Some(Ok(Match::must(0, 0..0)));
539 /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next());
540 /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next());
541 /// # Ok::<(), Box<dyn std::error::Error>>(())
542 /// ```
always_match() -> Result<BoundedBacktracker, BuildError>543 pub fn always_match() -> Result<BoundedBacktracker, BuildError> {
544 let nfa = thompson::NFA::always_match();
545 BoundedBacktracker::new_from_nfa(nfa)
546 }
547
548 /// Create a new `BoundedBacktracker` that never matches any input.
549 ///
550 /// # Example
551 ///
552 /// ```
553 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
554 ///
555 /// let re = BoundedBacktracker::never_match()?;
556 /// let mut cache = re.create_cache();
557 ///
558 /// assert_eq!(None, re.try_find_iter(&mut cache, "").next());
559 /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next());
560 /// # Ok::<(), Box<dyn std::error::Error>>(())
561 /// ```
never_match() -> Result<BoundedBacktracker, BuildError>562 pub fn never_match() -> Result<BoundedBacktracker, BuildError> {
563 let nfa = thompson::NFA::never_match();
564 BoundedBacktracker::new_from_nfa(nfa)
565 }
566
567 /// Return a default configuration for a `BoundedBacktracker`.
568 ///
569 /// This is a convenience routine to avoid needing to import the `Config`
570 /// type when customizing the construction of a `BoundedBacktracker`.
571 ///
572 /// # Example
573 ///
574 /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
575 /// disabled, zero-width matches that split a codepoint are allowed.
576 /// Otherwise they are never reported.
577 ///
578 /// In the code below, notice that `""` is permitted to match positions
579 /// that split the encoding of a codepoint.
580 ///
581 /// ```
582 /// use regex_automata::{
583 /// nfa::thompson::{self, backtrack::BoundedBacktracker},
584 /// Match,
585 /// };
586 ///
587 /// let re = BoundedBacktracker::builder()
588 /// .thompson(thompson::Config::new().utf8(false))
589 /// .build(r"")?;
590 /// let mut cache = re.create_cache();
591 ///
592 /// let haystack = "a☃z";
593 /// let mut it = re.try_find_iter(&mut cache, haystack);
594 /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next());
595 /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next());
596 /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next());
597 /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next());
598 /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next());
599 /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next());
600 /// assert_eq!(None, it.next());
601 ///
602 /// # Ok::<(), Box<dyn std::error::Error>>(())
603 /// ```
config() -> Config604 pub fn config() -> Config {
605 Config::new()
606 }
607
608 /// Return a builder for configuring the construction of a
609 /// `BoundedBacktracker`.
610 ///
611 /// This is a convenience routine to avoid needing to import the
612 /// [`Builder`] type in common cases.
613 ///
614 /// # Example
615 ///
616 /// This example shows how to use the builder to disable UTF-8 mode
617 /// everywhere.
618 ///
619 /// ```
620 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
621 /// use regex_automata::{
622 /// nfa::thompson::{self, backtrack::BoundedBacktracker},
623 /// util::syntax,
624 /// Match,
625 /// };
626 ///
627 /// let re = BoundedBacktracker::builder()
628 /// .syntax(syntax::Config::new().utf8(false))
629 /// .thompson(thompson::Config::new().utf8(false))
630 /// .build(r"foo(?-u:[^b])ar.*")?;
631 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
632 ///
633 /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
634 /// let expected = Some(Match::must(0, 1..9));
635 /// re.try_captures(&mut cache, haystack, &mut caps)?;
636 /// assert_eq!(expected, caps.get_match());
637 ///
638 /// # Ok::<(), Box<dyn std::error::Error>>(())
639 /// ```
builder() -> Builder640 pub fn builder() -> Builder {
641 Builder::new()
642 }
643
644 /// Create a new cache for this regex.
645 ///
646 /// The cache returned should only be used for searches for this
647 /// regex. If you want to reuse the cache for another regex, then you
648 /// must call [`Cache::reset`] with that regex (or, equivalently,
649 /// [`BoundedBacktracker::reset_cache`]).
create_cache(&self) -> Cache650 pub fn create_cache(&self) -> Cache {
651 Cache::new(self)
652 }
653
654 /// Create a new empty set of capturing groups that is guaranteed to be
655 /// valid for the search APIs on this `BoundedBacktracker`.
656 ///
657 /// A `Captures` value created for a specific `BoundedBacktracker` cannot
658 /// be used with any other `BoundedBacktracker`.
659 ///
660 /// This is a convenience function for [`Captures::all`]. See the
661 /// [`Captures`] documentation for an explanation of its alternative
662 /// constructors that permit the `BoundedBacktracker` to do less work
663 /// during a search, and thus might make it faster.
create_captures(&self) -> Captures664 pub fn create_captures(&self) -> Captures {
665 Captures::all(self.get_nfa().group_info().clone())
666 }
667
668 /// Reset the given cache such that it can be used for searching with the
669 /// this `BoundedBacktracker` (and only this `BoundedBacktracker`).
670 ///
671 /// A cache reset permits reusing memory already allocated in this cache
672 /// with a different `BoundedBacktracker`.
673 ///
674 /// # Example
675 ///
676 /// This shows how to re-purpose a cache for use with a different
677 /// `BoundedBacktracker`.
678 ///
679 /// ```
680 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
681 /// use regex_automata::{
682 /// nfa::thompson::backtrack::BoundedBacktracker,
683 /// Match,
684 /// };
685 ///
686 /// let re1 = BoundedBacktracker::new(r"\w")?;
687 /// let re2 = BoundedBacktracker::new(r"\W")?;
688 ///
689 /// let mut cache = re1.create_cache();
690 /// assert_eq!(
691 /// Some(Ok(Match::must(0, 0..2))),
692 /// re1.try_find_iter(&mut cache, "Δ").next(),
693 /// );
694 ///
695 /// // Using 'cache' with re2 is not allowed. It may result in panics or
696 /// // incorrect results. In order to re-purpose the cache, we must reset
697 /// // it with the BoundedBacktracker we'd like to use it with.
698 /// //
699 /// // Similarly, after this reset, using the cache with 're1' is also not
700 /// // allowed.
701 /// cache.reset(&re2);
702 /// assert_eq!(
703 /// Some(Ok(Match::must(0, 0..3))),
704 /// re2.try_find_iter(&mut cache, "☃").next(),
705 /// );
706 ///
707 /// # Ok::<(), Box<dyn std::error::Error>>(())
708 /// ```
reset_cache(&self, cache: &mut Cache)709 pub fn reset_cache(&self, cache: &mut Cache) {
710 cache.reset(self);
711 }
712
713 /// Returns the total number of patterns compiled into this
714 /// `BoundedBacktracker`.
715 ///
716 /// In the case of a `BoundedBacktracker` that contains no patterns, this
717 /// returns `0`.
718 ///
719 /// # Example
720 ///
721 /// This example shows the pattern length for a `BoundedBacktracker` that
722 /// never matches:
723 ///
724 /// ```
725 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
726 ///
727 /// let re = BoundedBacktracker::never_match()?;
728 /// assert_eq!(re.pattern_len(), 0);
729 /// # Ok::<(), Box<dyn std::error::Error>>(())
730 /// ```
731 ///
732 /// And another example for a `BoundedBacktracker` that matches at every
733 /// position:
734 ///
735 /// ```
736 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
737 ///
738 /// let re = BoundedBacktracker::always_match()?;
739 /// assert_eq!(re.pattern_len(), 1);
740 /// # Ok::<(), Box<dyn std::error::Error>>(())
741 /// ```
742 ///
743 /// And finally, a `BoundedBacktracker` that was constructed from multiple
744 /// patterns:
745 ///
746 /// ```
747 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
748 ///
749 /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
750 /// assert_eq!(re.pattern_len(), 3);
751 /// # Ok::<(), Box<dyn std::error::Error>>(())
752 /// ```
pattern_len(&self) -> usize753 pub fn pattern_len(&self) -> usize {
754 self.nfa.pattern_len()
755 }
756
757 /// Return the config for this `BoundedBacktracker`.
758 #[inline]
get_config(&self) -> &Config759 pub fn get_config(&self) -> &Config {
760 &self.config
761 }
762
763 /// Returns a reference to the underlying NFA.
764 #[inline]
get_nfa(&self) -> &NFA765 pub fn get_nfa(&self) -> &NFA {
766 &self.nfa
767 }
768
769 /// Returns the maximum haystack length supported by this backtracker.
770 ///
771 /// This routine is a function of both [`Config::visited_capacity`] and the
772 /// internal size of the backtracker's NFA.
773 ///
774 /// # Example
775 ///
776 /// This example shows how the maximum haystack length can vary depending
777 /// on the size of the regex itself. Note though that the specific maximum
778 /// values here are not an API guarantee. The default visited capacity is
779 /// subject to change and not covered by semver.
780 ///
781 /// ```
782 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
783 /// use regex_automata::{
784 /// nfa::thompson::backtrack::BoundedBacktracker,
785 /// Match, MatchError,
786 /// };
787 ///
788 /// // If you're only using ASCII, you get a big budget.
789 /// let re = BoundedBacktracker::new(r"(?-u)\w+")?;
790 /// let mut cache = re.create_cache();
791 /// assert_eq!(re.max_haystack_len(), 299_592);
792 /// // Things work up to the max.
793 /// let mut haystack = "a".repeat(299_592);
794 /// let expected = Some(Ok(Match::must(0, 0..299_592)));
795 /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
796 /// // But you'll get an error if you provide a haystack that's too big.
797 /// // Notice that we use the 'try_find_iter' routine instead, which
798 /// // yields Result<Match, MatchError> instead of Match.
799 /// haystack.push('a');
800 /// let expected = Some(Err(MatchError::haystack_too_long(299_593)));
801 /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
802 ///
803 /// // Unicode inflates the size of the underlying NFA quite a bit, and
804 /// // thus means that the backtracker can only handle smaller haystacks,
805 /// // assuming that the visited capacity remains unchanged.
806 /// let re = BoundedBacktracker::new(r"\w+")?;
807 /// assert!(re.max_haystack_len() <= 7_000);
808 /// // But we can increase the visited capacity to handle bigger haystacks!
809 /// let re = BoundedBacktracker::builder()
810 /// .configure(BoundedBacktracker::config().visited_capacity(1<<20))
811 /// .build(r"\w+")?;
812 /// assert!(re.max_haystack_len() >= 25_000);
813 /// assert!(re.max_haystack_len() <= 28_000);
814 /// # Ok::<(), Box<dyn std::error::Error>>(())
815 /// ```
816 #[inline]
max_haystack_len(&self) -> usize817 pub fn max_haystack_len(&self) -> usize {
818 // The capacity given in the config is "bytes of heap memory," but the
819 // capacity we use here is "number of bits." So convert the capacity in
820 // bytes to the capacity in bits.
821 let capacity = 8 * self.get_config().get_visited_capacity();
822 let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
823 let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
824 // It's possible for `real_capacity` to be smaller than the number of
825 // NFA states for particularly large regexes, so we saturate towards
826 // zero.
827 (real_capacity / self.nfa.states().len()).saturating_sub(1)
828 }
829 }
830
831 impl BoundedBacktracker {
832 /// Returns true if and only if this regex matches the given haystack.
833 ///
834 /// In the case of a backtracking regex engine, and unlike most other
835 /// regex engines in this crate, short circuiting isn't practical. However,
836 /// this routine may still be faster because it instructs backtracking to
837 /// not keep track of any capturing groups.
838 ///
839 /// # Errors
840 ///
841 /// This routine only errors if the search could not complete. For this
842 /// backtracking regex engine, this only occurs when the haystack length
843 /// exceeds [`BoundedBacktracker::max_haystack_len`].
844 ///
845 /// When a search cannot complete, callers cannot know whether a match
846 /// exists or not.
847 ///
848 /// # Example
849 ///
850 /// ```
851 /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
852 ///
853 /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
854 /// let mut cache = re.create_cache();
855 ///
856 /// assert!(re.try_is_match(&mut cache, "foo12345bar")?);
857 /// assert!(!re.try_is_match(&mut cache, "foobar")?);
858 /// # Ok::<(), Box<dyn std::error::Error>>(())
859 /// ```
860 ///
861 /// # Example: consistency with search APIs
862 ///
863 /// `is_match` is guaranteed to return `true` whenever `find` returns a
864 /// match. This includes searches that are executed entirely within a
865 /// codepoint:
866 ///
867 /// ```
868 /// use regex_automata::{
869 /// nfa::thompson::backtrack::BoundedBacktracker,
870 /// Input,
871 /// };
872 ///
873 /// let re = BoundedBacktracker::new("a*")?;
874 /// let mut cache = re.create_cache();
875 ///
876 /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
877 /// # Ok::<(), Box<dyn std::error::Error>>(())
878 /// ```
879 ///
880 /// Notice that when UTF-8 mode is disabled, then the above reports a
881 /// match because the restriction against zero-width matches that split a
882 /// codepoint has been lifted:
883 ///
884 /// ```
885 /// use regex_automata::{
886 /// nfa::thompson::{backtrack::BoundedBacktracker, NFA},
887 /// Input,
888 /// };
889 ///
890 /// let re = BoundedBacktracker::builder()
891 /// .thompson(NFA::config().utf8(false))
892 /// .build("a*")?;
893 /// let mut cache = re.create_cache();
894 ///
895 /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
896 /// # Ok::<(), Box<dyn std::error::Error>>(())
897 /// ```
898 #[inline]
try_is_match<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, ) -> Result<bool, MatchError>899 pub fn try_is_match<'h, I: Into<Input<'h>>>(
900 &self,
901 cache: &mut Cache,
902 input: I,
903 ) -> Result<bool, MatchError> {
904 let input = input.into().earliest(true);
905 self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some())
906 }
907
908 /// Executes a leftmost forward search and returns a `Match` if one exists.
909 ///
910 /// This routine only includes the overall match span. To get
911 /// access to the individual spans of each capturing group, use
912 /// [`BoundedBacktracker::try_captures`].
913 ///
914 /// # Errors
915 ///
916 /// This routine only errors if the search could not complete. For this
917 /// backtracking regex engine, this only occurs when the haystack length
918 /// exceeds [`BoundedBacktracker::max_haystack_len`].
919 ///
920 /// When a search cannot complete, callers cannot know whether a match
921 /// exists or not.
922 ///
923 /// # Example
924 ///
925 /// ```
926 /// use regex_automata::{
927 /// nfa::thompson::backtrack::BoundedBacktracker,
928 /// Match,
929 /// };
930 ///
931 /// let re = BoundedBacktracker::new("foo[0-9]+")?;
932 /// let mut cache = re.create_cache();
933 /// let expected = Match::must(0, 0..8);
934 /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?);
935 ///
936 /// # Ok::<(), Box<dyn std::error::Error>>(())
937 /// ```
938 #[inline]
try_find<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, ) -> Result<Option<Match>, MatchError>939 pub fn try_find<'h, I: Into<Input<'h>>>(
940 &self,
941 cache: &mut Cache,
942 input: I,
943 ) -> Result<Option<Match>, MatchError> {
944 let input = input.into();
945 if self.get_nfa().pattern_len() == 1 {
946 let mut slots = [None, None];
947 let pid = match self.try_search_slots(cache, &input, &mut slots)? {
948 None => return Ok(None),
949 Some(pid) => pid,
950 };
951 let start = match slots[0] {
952 None => return Ok(None),
953 Some(s) => s.get(),
954 };
955 let end = match slots[1] {
956 None => return Ok(None),
957 Some(s) => s.get(),
958 };
959 return Ok(Some(Match::new(pid, Span { start, end })));
960 }
961 let ginfo = self.get_nfa().group_info();
962 let slots_len = ginfo.implicit_slot_len();
963 let mut slots = vec![None; slots_len];
964 let pid = match self.try_search_slots(cache, &input, &mut slots)? {
965 None => return Ok(None),
966 Some(pid) => pid,
967 };
968 let start = match slots[pid.as_usize() * 2] {
969 None => return Ok(None),
970 Some(s) => s.get(),
971 };
972 let end = match slots[pid.as_usize() * 2 + 1] {
973 None => return Ok(None),
974 Some(s) => s.get(),
975 };
976 Ok(Some(Match::new(pid, Span { start, end })))
977 }
978
979 /// Executes a leftmost forward search and writes the spans of capturing
980 /// groups that participated in a match into the provided [`Captures`]
981 /// value. If no match was found, then [`Captures::is_match`] is guaranteed
982 /// to return `false`.
983 ///
984 /// # Errors
985 ///
986 /// This routine only errors if the search could not complete. For this
987 /// backtracking regex engine, this only occurs when the haystack length
988 /// exceeds [`BoundedBacktracker::max_haystack_len`].
989 ///
990 /// When a search cannot complete, callers cannot know whether a match
991 /// exists or not.
992 ///
993 /// # Example
994 ///
995 /// ```
996 /// use regex_automata::{
997 /// nfa::thompson::backtrack::BoundedBacktracker,
998 /// Span,
999 /// };
1000 ///
1001 /// let re = BoundedBacktracker::new(
1002 /// r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$",
1003 /// )?;
1004 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1005 ///
1006 /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?;
1007 /// assert!(caps.is_match());
1008 /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
1009 /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
1010 /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
1011 ///
1012 /// # Ok::<(), Box<dyn std::error::Error>>(())
1013 /// ```
1014 #[inline]
try_captures<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, input: I, caps: &mut Captures, ) -> Result<(), MatchError>1015 pub fn try_captures<'h, I: Into<Input<'h>>>(
1016 &self,
1017 cache: &mut Cache,
1018 input: I,
1019 caps: &mut Captures,
1020 ) -> Result<(), MatchError> {
1021 self.try_search(cache, &input.into(), caps)
1022 }
1023
1024 /// Returns an iterator over all non-overlapping leftmost matches in the
1025 /// given bytes. If no match exists, then the iterator yields no elements.
1026 ///
1027 /// If the regex engine returns an error at any point, then the iterator
1028 /// will yield that error.
1029 ///
1030 /// # Example
1031 ///
1032 /// ```
1033 /// use regex_automata::{
1034 /// nfa::thompson::backtrack::BoundedBacktracker,
1035 /// Match, MatchError,
1036 /// };
1037 ///
1038 /// let re = BoundedBacktracker::new("foo[0-9]+")?;
1039 /// let mut cache = re.create_cache();
1040 ///
1041 /// let text = "foo1 foo12 foo123";
1042 /// let result: Result<Vec<Match>, MatchError> = re
1043 /// .try_find_iter(&mut cache, text)
1044 /// .collect();
1045 /// let matches = result?;
1046 /// assert_eq!(matches, vec![
1047 /// Match::must(0, 0..4),
1048 /// Match::must(0, 5..10),
1049 /// Match::must(0, 11..17),
1050 /// ]);
1051 /// # Ok::<(), Box<dyn std::error::Error>>(())
1052 /// ```
1053 #[inline]
try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, input: I, ) -> TryFindMatches<'r, 'c, 'h>1054 pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1055 &'r self,
1056 cache: &'c mut Cache,
1057 input: I,
1058 ) -> TryFindMatches<'r, 'c, 'h> {
1059 let caps = Captures::matches(self.get_nfa().group_info().clone());
1060 let it = iter::Searcher::new(input.into());
1061 TryFindMatches { re: self, cache, caps, it }
1062 }
1063
1064 /// Returns an iterator over all non-overlapping `Captures` values. If no
1065 /// match exists, then the iterator yields no elements.
1066 ///
1067 /// This yields the same matches as [`BoundedBacktracker::try_find_iter`],
1068 /// but it includes the spans of all capturing groups that participate in
1069 /// each match.
1070 ///
1071 /// If the regex engine returns an error at any point, then the iterator
1072 /// will yield that error.
1073 ///
1074 /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
1075 /// how to correctly iterate over all matches in a haystack while avoiding
1076 /// the creation of a new `Captures` value for every match. (Which you are
1077 /// forced to do with an `Iterator`.)
1078 ///
1079 /// # Example
1080 ///
1081 /// ```
1082 /// use regex_automata::{
1083 /// nfa::thompson::backtrack::BoundedBacktracker,
1084 /// Span,
1085 /// };
1086 ///
1087 /// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?;
1088 /// let mut cache = re.create_cache();
1089 ///
1090 /// let text = "foo1 foo12 foo123";
1091 /// let mut spans = vec![];
1092 /// for result in re.try_captures_iter(&mut cache, text) {
1093 /// let caps = result?;
1094 /// // The unwrap is OK since 'numbers' matches if the pattern matches.
1095 /// spans.push(caps.get_group_by_name("numbers").unwrap());
1096 /// }
1097 /// assert_eq!(spans, vec![
1098 /// Span::from(3..4),
1099 /// Span::from(8..10),
1100 /// Span::from(14..17),
1101 /// ]);
1102 /// # Ok::<(), Box<dyn std::error::Error>>(())
1103 /// ```
1104 #[inline]
try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, input: I, ) -> TryCapturesMatches<'r, 'c, 'h>1105 pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1106 &'r self,
1107 cache: &'c mut Cache,
1108 input: I,
1109 ) -> TryCapturesMatches<'r, 'c, 'h> {
1110 let caps = self.create_captures();
1111 let it = iter::Searcher::new(input.into());
1112 TryCapturesMatches { re: self, cache, caps, it }
1113 }
1114 }
1115
1116 impl BoundedBacktracker {
1117 /// Executes a leftmost forward search and writes the spans of capturing
1118 /// groups that participated in a match into the provided [`Captures`]
1119 /// value. If no match was found, then [`Captures::is_match`] is guaranteed
1120 /// to return `false`.
1121 ///
1122 /// This is like [`BoundedBacktracker::try_captures`], but it accepts a
1123 /// concrete `&Input` instead of an `Into<Input>`.
1124 ///
1125 /// # Errors
1126 ///
1127 /// This routine only errors if the search could not complete. For this
1128 /// backtracking regex engine, this only occurs when the haystack length
1129 /// exceeds [`BoundedBacktracker::max_haystack_len`].
1130 ///
1131 /// When a search cannot complete, callers cannot know whether a match
1132 /// exists or not.
1133 ///
1134 /// # Example: specific pattern search
1135 ///
1136 /// This example shows how to build a multi bounded backtracker that
1137 /// permits searching for specific patterns.
1138 ///
1139 /// ```
1140 /// use regex_automata::{
1141 /// nfa::thompson::backtrack::BoundedBacktracker,
1142 /// Anchored, Input, Match, PatternID,
1143 /// };
1144 ///
1145 /// let re = BoundedBacktracker::new_many(&[
1146 /// "[a-z0-9]{6}",
1147 /// "[a-z][a-z0-9]{5}",
1148 /// ])?;
1149 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1150 /// let haystack = "foo123";
1151 ///
1152 /// // Since we are using the default leftmost-first match and both
1153 /// // patterns match at the same starting position, only the first pattern
1154 /// // will be returned in this case when doing a search for any of the
1155 /// // patterns.
1156 /// let expected = Some(Match::must(0, 0..6));
1157 /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?;
1158 /// assert_eq!(expected, caps.get_match());
1159 ///
1160 /// // But if we want to check whether some other pattern matches, then we
1161 /// // can provide its pattern ID.
1162 /// let expected = Some(Match::must(1, 0..6));
1163 /// let input = Input::new(haystack)
1164 /// .anchored(Anchored::Pattern(PatternID::must(1)));
1165 /// re.try_search(&mut cache, &input, &mut caps)?;
1166 /// assert_eq!(expected, caps.get_match());
1167 ///
1168 /// # Ok::<(), Box<dyn std::error::Error>>(())
1169 /// ```
1170 ///
1171 /// # Example: specifying the bounds of a search
1172 ///
1173 /// This example shows how providing the bounds of a search can produce
1174 /// different results than simply sub-slicing the haystack.
1175 ///
1176 /// ```
1177 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1178 /// use regex_automata::{
1179 /// nfa::thompson::backtrack::BoundedBacktracker,
1180 /// Match, Input,
1181 /// };
1182 ///
1183 /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?;
1184 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1185 /// let haystack = "foo123bar";
1186 ///
1187 /// // Since we sub-slice the haystack, the search doesn't know about
1188 /// // the larger context and assumes that `123` is surrounded by word
1189 /// // boundaries. And of course, the match position is reported relative
1190 /// // to the sub-slice as well, which means we get `0..3` instead of
1191 /// // `3..6`.
1192 /// let expected = Some(Match::must(0, 0..3));
1193 /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?;
1194 /// assert_eq!(expected, caps.get_match());
1195 ///
1196 /// // But if we provide the bounds of the search within the context of the
1197 /// // entire haystack, then the search can take the surrounding context
1198 /// // into account. (And if we did find a match, it would be reported
1199 /// // as a valid offset into `haystack` instead of its sub-slice.)
1200 /// let expected = None;
1201 /// re.try_search(
1202 /// &mut cache, &Input::new(haystack).range(3..6), &mut caps,
1203 /// )?;
1204 /// assert_eq!(expected, caps.get_match());
1205 ///
1206 /// # Ok::<(), Box<dyn std::error::Error>>(())
1207 /// ```
1208 #[inline]
try_search( &self, cache: &mut Cache, input: &Input<'_>, caps: &mut Captures, ) -> Result<(), MatchError>1209 pub fn try_search(
1210 &self,
1211 cache: &mut Cache,
1212 input: &Input<'_>,
1213 caps: &mut Captures,
1214 ) -> Result<(), MatchError> {
1215 caps.set_pattern(None);
1216 let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1217 caps.set_pattern(pid);
1218 Ok(())
1219 }
1220
1221 /// Executes a leftmost forward search and writes the spans of capturing
1222 /// groups that participated in a match into the provided `slots`, and
1223 /// returns the matching pattern ID. The contents of the slots for patterns
1224 /// other than the matching pattern are unspecified. If no match was found,
1225 /// then `None` is returned and the contents of all `slots` is unspecified.
1226 ///
1227 /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw
1228 /// slots slice instead of a `Captures` value. This is useful in contexts
1229 /// where you don't want or need to allocate a `Captures`.
1230 ///
1231 /// It is legal to pass _any_ number of slots to this routine. If the regex
1232 /// engine would otherwise write a slot offset that doesn't fit in the
1233 /// provided slice, then it is simply skipped. In general though, there are
1234 /// usually three slice lengths you might want to use:
1235 ///
1236 /// * An empty slice, if you only care about which pattern matched.
1237 /// * A slice with
1238 /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
1239 /// slots, if you only care about the overall match spans for each matching
1240 /// pattern.
1241 /// * A slice with
1242 /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1243 /// permits recording match offsets for every capturing group in every
1244 /// pattern.
1245 ///
1246 /// # Errors
1247 ///
1248 /// This routine only errors if the search could not complete. For this
1249 /// backtracking regex engine, this only occurs when the haystack length
1250 /// exceeds [`BoundedBacktracker::max_haystack_len`].
1251 ///
1252 /// When a search cannot complete, callers cannot know whether a match
1253 /// exists or not.
1254 ///
1255 /// # Example
1256 ///
1257 /// This example shows how to find the overall match offsets in a
1258 /// multi-pattern search without allocating a `Captures` value. Indeed, we
1259 /// can put our slots right on the stack.
1260 ///
1261 /// ```
1262 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1263 /// use regex_automata::{
1264 /// nfa::thompson::backtrack::BoundedBacktracker,
1265 /// PatternID, Input,
1266 /// };
1267 ///
1268 /// let re = BoundedBacktracker::new_many(&[
1269 /// r"\pL+",
1270 /// r"\d+",
1271 /// ])?;
1272 /// let mut cache = re.create_cache();
1273 /// let input = Input::new("!@#123");
1274 ///
1275 /// // We only care about the overall match offsets here, so we just
1276 /// // allocate two slots for each pattern. Each slot records the start
1277 /// // and end of the match.
1278 /// let mut slots = [None; 4];
1279 /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1280 /// assert_eq!(Some(PatternID::must(1)), pid);
1281 ///
1282 /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1283 /// // See 'GroupInfo' for more details on the mapping between groups and
1284 /// // slot indices.
1285 /// let slot_start = pid.unwrap().as_usize() * 2;
1286 /// let slot_end = slot_start + 1;
1287 /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
1288 /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
1289 ///
1290 /// # Ok::<(), Box<dyn std::error::Error>>(())
1291 /// ```
1292 #[inline]
try_search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Result<Option<PatternID>, MatchError>1293 pub fn try_search_slots(
1294 &self,
1295 cache: &mut Cache,
1296 input: &Input<'_>,
1297 slots: &mut [Option<NonMaxUsize>],
1298 ) -> Result<Option<PatternID>, MatchError> {
1299 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1300 if !utf8empty {
1301 let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1302 return Ok(maybe_hm.map(|hm| hm.pattern()));
1303 }
1304 // See PikeVM::try_search_slots for why we do this.
1305 let min = self.get_nfa().group_info().implicit_slot_len();
1306 if slots.len() >= min {
1307 let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1308 return Ok(maybe_hm.map(|hm| hm.pattern()));
1309 }
1310 if self.get_nfa().pattern_len() == 1 {
1311 let mut enough = [None, None];
1312 let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1313 // This is OK because we know `enough_slots` is strictly bigger
1314 // than `slots`, otherwise this special case isn't reached.
1315 slots.copy_from_slice(&enough[..slots.len()]);
1316 return Ok(got.map(|hm| hm.pattern()));
1317 }
1318 let mut enough = vec![None; min];
1319 let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1320 // This is OK because we know `enough_slots` is strictly bigger than
1321 // `slots`, otherwise this special case isn't reached.
1322 slots.copy_from_slice(&enough[..slots.len()]);
1323 Ok(got.map(|hm| hm.pattern()))
1324 }
1325
1326 /// This is the actual implementation of `try_search_slots_imp` that
1327 /// doesn't account for the special case when 1) the NFA has UTF-8 mode
1328 /// enabled, 2) the NFA can match the empty string and 3) the caller has
1329 /// provided an insufficient number of slots to record match offsets.
1330 #[inline(never)]
try_search_slots_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Result<Option<HalfMatch>, MatchError>1331 fn try_search_slots_imp(
1332 &self,
1333 cache: &mut Cache,
1334 input: &Input<'_>,
1335 slots: &mut [Option<NonMaxUsize>],
1336 ) -> Result<Option<HalfMatch>, MatchError> {
1337 let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1338 let hm = match self.search_imp(cache, input, slots)? {
1339 None => return Ok(None),
1340 Some(hm) if !utf8empty => return Ok(Some(hm)),
1341 Some(hm) => hm,
1342 };
1343 empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1344 Ok(self
1345 .search_imp(cache, input, slots)?
1346 .map(|hm| (hm, hm.offset())))
1347 })
1348 }
1349
1350 /// The implementation of standard leftmost backtracking search.
1351 ///
1352 /// Capturing group spans are written to 'caps', but only if requested.
1353 /// 'caps' can be one of three things: 1) totally empty, in which case, we
1354 /// only report the pattern that matched or 2) only has slots for recording
1355 /// the overall match offsets for any pattern or 3) has all slots available
1356 /// for recording the spans of any groups participating in a match.
search_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Result<Option<HalfMatch>, MatchError>1357 fn search_imp(
1358 &self,
1359 cache: &mut Cache,
1360 input: &Input<'_>,
1361 slots: &mut [Option<NonMaxUsize>],
1362 ) -> Result<Option<HalfMatch>, MatchError> {
1363 // Unlike in the PikeVM, we write our capturing group spans directly
1364 // into the caller's captures groups. So we have to make sure we're
1365 // starting with a blank slate first. In the PikeVM, we avoid this
1366 // by construction: the spans that are copied to every slot in the
1367 // 'Captures' value already account for presence/absence. In this
1368 // backtracker, we write directly into the caller provided slots, where
1369 // as in the PikeVM, we write into scratch space first and only copy
1370 // them to the caller provided slots when a match is found.
1371 for slot in slots.iter_mut() {
1372 *slot = None;
1373 }
1374 cache.setup_search(&self, input)?;
1375 if input.is_done() {
1376 return Ok(None);
1377 }
1378 let (anchored, start_id) = match input.get_anchored() {
1379 // Only way we're unanchored is if both the caller asked for an
1380 // unanchored search *and* the pattern is itself not anchored.
1381 Anchored::No => (
1382 self.nfa.is_always_start_anchored(),
1383 // We always use the anchored starting state here, even if
1384 // doing an unanchored search. The "unanchored" part of it is
1385 // implemented in the loop below, by simply trying the next
1386 // byte offset if the previous backtracking exploration failed.
1387 self.nfa.start_anchored(),
1388 ),
1389 Anchored::Yes => (true, self.nfa.start_anchored()),
1390 Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) {
1391 None => return Ok(None),
1392 Some(sid) => (true, sid),
1393 },
1394 };
1395 if anchored {
1396 let at = input.start();
1397 return Ok(self.backtrack(cache, input, at, start_id, slots));
1398 }
1399 let pre = self.get_config().get_prefilter();
1400 let mut at = input.start();
1401 while at <= input.end() {
1402 if let Some(ref pre) = pre {
1403 let span = Span::from(at..input.end());
1404 match pre.find(input.haystack(), span) {
1405 None => break,
1406 Some(ref span) => at = span.start,
1407 }
1408 }
1409 if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
1410 {
1411 return Ok(Some(hm));
1412 }
1413 at += 1;
1414 }
1415 Ok(None)
1416 }
1417
1418 /// Look for a match starting at `at` in `input` and write the matching
1419 /// pattern ID and group spans to `caps`. The search uses `start_id` as its
1420 /// starting state in the underlying NFA.
1421 ///
1422 /// If no match was found, then the caller should increment `at` and try
1423 /// at the next position.
1424 #[cfg_attr(feature = "perf-inline", inline(always))]
backtrack( &self, cache: &mut Cache, input: &Input<'_>, at: usize, start_id: StateID, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch>1425 fn backtrack(
1426 &self,
1427 cache: &mut Cache,
1428 input: &Input<'_>,
1429 at: usize,
1430 start_id: StateID,
1431 slots: &mut [Option<NonMaxUsize>],
1432 ) -> Option<HalfMatch> {
1433 cache.stack.push(Frame::Step { sid: start_id, at });
1434 while let Some(frame) = cache.stack.pop() {
1435 match frame {
1436 Frame::Step { sid, at } => {
1437 if let Some(hm) = self.step(cache, input, sid, at, slots) {
1438 return Some(hm);
1439 }
1440 }
1441 Frame::RestoreCapture { slot, offset } => {
1442 slots[slot] = offset;
1443 }
1444 }
1445 }
1446 None
1447 }
1448
1449 // LAMENTATION: The actual backtracking search is implemented in about
1450 // 75 lines below. Yet this file is over 2,000 lines long. What have I
1451 // done?
1452
1453 /// Execute a "step" in the backtracing algorithm.
1454 ///
1455 /// A "step" is somewhat of a misnomer, because this routine keeps going
1456 /// until it either runs out of things to try or fins a match. In the
1457 /// former case, it may have pushed some things on to the backtracking
1458 /// stack, in which case, those will be tried next as part of the
1459 /// 'backtrack' routine above.
1460 #[cfg_attr(feature = "perf-inline", inline(always))]
step( &self, cache: &mut Cache, input: &Input<'_>, mut sid: StateID, mut at: usize, slots: &mut [Option<NonMaxUsize>], ) -> Option<HalfMatch>1461 fn step(
1462 &self,
1463 cache: &mut Cache,
1464 input: &Input<'_>,
1465 mut sid: StateID,
1466 mut at: usize,
1467 slots: &mut [Option<NonMaxUsize>],
1468 ) -> Option<HalfMatch> {
1469 loop {
1470 if !cache.visited.insert(sid, at - input.start()) {
1471 return None;
1472 }
1473 match *self.nfa.state(sid) {
1474 State::ByteRange { ref trans } => {
1475 // Why do we need this? Unlike other regex engines in this
1476 // crate, the backtracker can steam roll ahead in the
1477 // haystack outside of the main loop over the bytes in the
1478 // haystack. While 'trans.matches()' below handles the case
1479 // of 'at' being out of bounds of 'input.haystack()', we
1480 // also need to handle the case of 'at' going out of bounds
1481 // of the span the caller asked to search.
1482 //
1483 // We should perhaps make the 'trans.matches()' API accept
1484 // an '&Input' instead of a '&[u8]'. Or at least, add a new
1485 // API that does it.
1486 if at >= input.end() {
1487 return None;
1488 }
1489 if !trans.matches(input.haystack(), at) {
1490 return None;
1491 }
1492 sid = trans.next;
1493 at += 1;
1494 }
1495 State::Sparse(ref sparse) => {
1496 if at >= input.end() {
1497 return None;
1498 }
1499 sid = sparse.matches(input.haystack(), at)?;
1500 at += 1;
1501 }
1502 State::Dense(ref dense) => {
1503 if at >= input.end() {
1504 return None;
1505 }
1506 sid = dense.matches(input.haystack(), at)?;
1507 at += 1;
1508 }
1509 State::Look { look, next } => {
1510 // OK because we don't permit building a searcher with a
1511 // Unicode word boundary if the requisite Unicode data is
1512 // unavailable.
1513 if !self.nfa.look_matcher().matches_inline(
1514 look,
1515 input.haystack(),
1516 at,
1517 ) {
1518 return None;
1519 }
1520 sid = next;
1521 }
1522 State::Union { ref alternates } => {
1523 sid = match alternates.get(0) {
1524 None => return None,
1525 Some(&sid) => sid,
1526 };
1527 cache.stack.extend(
1528 alternates[1..]
1529 .iter()
1530 .copied()
1531 .rev()
1532 .map(|sid| Frame::Step { sid, at }),
1533 );
1534 }
1535 State::BinaryUnion { alt1, alt2 } => {
1536 sid = alt1;
1537 cache.stack.push(Frame::Step { sid: alt2, at });
1538 }
1539 State::Capture { next, slot, .. } => {
1540 if slot.as_usize() < slots.len() {
1541 cache.stack.push(Frame::RestoreCapture {
1542 slot,
1543 offset: slots[slot],
1544 });
1545 slots[slot] = NonMaxUsize::new(at);
1546 }
1547 sid = next;
1548 }
1549 State::Fail => return None,
1550 State::Match { pattern_id } => {
1551 return Some(HalfMatch::new(pattern_id, at));
1552 }
1553 }
1554 }
1555 }
1556 }
1557
1558 /// An iterator over all non-overlapping matches for a fallible search.
1559 ///
1560 /// The iterator yields a `Result<Match, MatchError` value until no more
1561 /// matches could be found.
1562 ///
1563 /// The lifetime parameters are as follows:
1564 ///
1565 /// * `'r` represents the lifetime of the BoundedBacktracker.
1566 /// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1567 /// * `'h` represents the lifetime of the haystack being searched.
1568 ///
1569 /// This iterator can be created with the [`BoundedBacktracker::try_find_iter`]
1570 /// method.
1571 #[derive(Debug)]
1572 pub struct TryFindMatches<'r, 'c, 'h> {
1573 re: &'r BoundedBacktracker,
1574 cache: &'c mut Cache,
1575 caps: Captures,
1576 it: iter::Searcher<'h>,
1577 }
1578
1579 impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
1580 type Item = Result<Match, MatchError>;
1581
1582 #[inline]
next(&mut self) -> Option<Result<Match, MatchError>>1583 fn next(&mut self) -> Option<Result<Match, MatchError>> {
1584 // Splitting 'self' apart seems necessary to appease borrowck.
1585 let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1586 *self;
1587 it.try_advance(|input| {
1588 re.try_search(cache, input, caps)?;
1589 Ok(caps.get_match())
1590 })
1591 .transpose()
1592 }
1593 }
1594
1595 /// An iterator over all non-overlapping leftmost matches, with their capturing
1596 /// groups, for a fallible search.
1597 ///
1598 /// The iterator yields a `Result<Captures, MatchError>` value until no more
1599 /// matches could be found.
1600 ///
1601 /// The lifetime parameters are as follows:
1602 ///
1603 /// * `'r` represents the lifetime of the BoundedBacktracker.
1604 /// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1605 /// * `'h` represents the lifetime of the haystack being searched.
1606 ///
1607 /// This iterator can be created with the
1608 /// [`BoundedBacktracker::try_captures_iter`] method.
1609 #[derive(Debug)]
1610 pub struct TryCapturesMatches<'r, 'c, 'h> {
1611 re: &'r BoundedBacktracker,
1612 cache: &'c mut Cache,
1613 caps: Captures,
1614 it: iter::Searcher<'h>,
1615 }
1616
1617 impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> {
1618 type Item = Result<Captures, MatchError>;
1619
1620 #[inline]
next(&mut self) -> Option<Result<Captures, MatchError>>1621 fn next(&mut self) -> Option<Result<Captures, MatchError>> {
1622 // Splitting 'self' apart seems necessary to appease borrowck.
1623 let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1624 *self;
1625 let _ = it
1626 .try_advance(|input| {
1627 re.try_search(cache, input, caps)?;
1628 Ok(caps.get_match())
1629 })
1630 .transpose()?;
1631 if caps.is_match() {
1632 Some(Ok(caps.clone()))
1633 } else {
1634 None
1635 }
1636 }
1637 }
1638
1639 /// A cache represents mutable state that a [`BoundedBacktracker`] requires
1640 /// during a search.
1641 ///
1642 /// For a given [`BoundedBacktracker`], its corresponding cache may be created
1643 /// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`].
1644 /// They are equivalent in every way, except the former does not require
1645 /// explicitly importing `Cache`.
1646 ///
1647 /// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which
1648 /// it was created. It may only be used with that `BoundedBacktracker`. A cache
1649 /// and its allocations may be re-purposed via [`Cache::reset`], in which case,
1650 /// it can only be used with the new `BoundedBacktracker` (and not the old
1651 /// one).
1652 #[derive(Clone, Debug)]
1653 pub struct Cache {
1654 /// Stack used on the heap for doing backtracking instead of the
1655 /// traditional recursive approach. We don't want recursion because then
1656 /// we're likely to hit a stack overflow for bigger regexes.
1657 stack: Vec<Frame>,
1658 /// The set of (StateID, HaystackOffset) pairs that have been visited
1659 /// by the backtracker within a single search. If such a pair has been
1660 /// visited, then we avoid doing the work for that pair again. This is
1661 /// what "bounds" the backtracking and prevents it from having worst case
1662 /// exponential time.
1663 visited: Visited,
1664 }
1665
1666 impl Cache {
1667 /// Create a new [`BoundedBacktracker`] cache.
1668 ///
1669 /// A potentially more convenient routine to create a cache is
1670 /// [`BoundedBacktracker::create_cache`], as it does not require also
1671 /// importing the `Cache` type.
1672 ///
1673 /// If you want to reuse the returned `Cache` with some other
1674 /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the
1675 /// desired `BoundedBacktracker`.
new(re: &BoundedBacktracker) -> Cache1676 pub fn new(re: &BoundedBacktracker) -> Cache {
1677 Cache { stack: vec![], visited: Visited::new(re) }
1678 }
1679
1680 /// Reset this cache such that it can be used for searching with different
1681 /// [`BoundedBacktracker`].
1682 ///
1683 /// A cache reset permits reusing memory already allocated in this cache
1684 /// with a different `BoundedBacktracker`.
1685 ///
1686 /// # Example
1687 ///
1688 /// This shows how to re-purpose a cache for use with a different
1689 /// `BoundedBacktracker`.
1690 ///
1691 /// ```
1692 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1693 /// use regex_automata::{
1694 /// nfa::thompson::backtrack::BoundedBacktracker,
1695 /// Match,
1696 /// };
1697 ///
1698 /// let re1 = BoundedBacktracker::new(r"\w")?;
1699 /// let re2 = BoundedBacktracker::new(r"\W")?;
1700 ///
1701 /// let mut cache = re1.create_cache();
1702 /// assert_eq!(
1703 /// Some(Ok(Match::must(0, 0..2))),
1704 /// re1.try_find_iter(&mut cache, "Δ").next(),
1705 /// );
1706 ///
1707 /// // Using 'cache' with re2 is not allowed. It may result in panics or
1708 /// // incorrect results. In order to re-purpose the cache, we must reset
1709 /// // it with the BoundedBacktracker we'd like to use it with.
1710 /// //
1711 /// // Similarly, after this reset, using the cache with 're1' is also not
1712 /// // allowed.
1713 /// cache.reset(&re2);
1714 /// assert_eq!(
1715 /// Some(Ok(Match::must(0, 0..3))),
1716 /// re2.try_find_iter(&mut cache, "☃").next(),
1717 /// );
1718 ///
1719 /// # Ok::<(), Box<dyn std::error::Error>>(())
1720 /// ```
reset(&mut self, re: &BoundedBacktracker)1721 pub fn reset(&mut self, re: &BoundedBacktracker) {
1722 self.visited.reset(re);
1723 }
1724
1725 /// Returns the heap memory usage, in bytes, of this cache.
1726 ///
1727 /// This does **not** include the stack size used up by this cache. To
1728 /// compute that, use `std::mem::size_of::<Cache>()`.
memory_usage(&self) -> usize1729 pub fn memory_usage(&self) -> usize {
1730 self.stack.len() * core::mem::size_of::<Frame>()
1731 + self.visited.memory_usage()
1732 }
1733
1734 /// Clears this cache. This should be called at the start of every search
1735 /// to ensure we start with a clean slate.
1736 ///
1737 /// This also sets the length of the capturing groups used in the current
1738 /// search. This permits an optimization where by 'SlotTable::for_state'
1739 /// only returns the number of slots equivalent to the number of slots
1740 /// given in the 'Captures' value. This may be less than the total number
1741 /// of possible slots, e.g., when one only wants to track overall match
1742 /// offsets. This in turn permits less copying of capturing group spans
1743 /// in the BoundedBacktracker.
setup_search( &mut self, re: &BoundedBacktracker, input: &Input<'_>, ) -> Result<(), MatchError>1744 fn setup_search(
1745 &mut self,
1746 re: &BoundedBacktracker,
1747 input: &Input<'_>,
1748 ) -> Result<(), MatchError> {
1749 self.stack.clear();
1750 self.visited.setup_search(re, input)?;
1751 Ok(())
1752 }
1753 }
1754
1755 /// Represents a stack frame on the heap while doing backtracking.
1756 ///
1757 /// Instead of using explicit recursion for backtracking, we use a stack on
1758 /// the heap to keep track of things that we want to explore if the current
1759 /// backtracking branch turns out to not lead to a match.
1760 #[derive(Clone, Debug)]
1761 enum Frame {
1762 /// Look for a match starting at `sid` and the given position in the
1763 /// haystack.
1764 Step { sid: StateID, at: usize },
1765 /// Reset the given `slot` to the given `offset` (which might be `None`).
1766 /// This effectively gives a "scope" to capturing groups, such that an
1767 /// offset for a particular group only gets returned if the match goes
1768 /// through that capturing group. If backtracking ends up going down a
1769 /// different branch that results in a different offset (or perhaps none at
1770 /// all), then this "restore capture" frame will cause the offset to get
1771 /// reset.
1772 RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
1773 }
1774
1775 /// A bitset that keeps track of whether a particular (StateID, offset) has
1776 /// been considered during backtracking. If it has already been visited, then
1777 /// backtracking skips it. This is what gives backtracking its "bound."
1778 #[derive(Clone, Debug)]
1779 struct Visited {
1780 /// The actual underlying bitset. Each element in the bitset corresponds
1781 /// to a particular (StateID, offset) pair. States correspond to the rows
1782 /// and the offsets correspond to the columns.
1783 ///
1784 /// If our underlying NFA has N states and the haystack we're searching
1785 /// has M bytes, then we have N*(M+1) entries in our bitset table. The
1786 /// M+1 occurs because our matches are delayed by one byte (to support
1787 /// look-around), and so we need to handle the end position itself rather
1788 /// than stopping just before the end. (If there is no end position, then
1789 /// it's treated as "end-of-input," which is matched by things like '$'.)
1790 ///
1791 /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize))
1792 /// blocks.
1793 ///
1794 /// We use 'usize' to represent our blocks because it makes some of the
1795 /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for
1796 /// our block, we'd either need to cast u32s to usizes or usizes to u32s.
1797 bitset: Vec<usize>,
1798 /// The stride represents one plus length of the haystack we're searching
1799 /// (as described above). The stride must be initialized for each search.
1800 stride: usize,
1801 }
1802
1803 impl Visited {
1804 /// The size of each block, in bits.
1805 const BLOCK_SIZE: usize = 8 * core::mem::size_of::<usize>();
1806
1807 /// Create a new visited set for the given backtracker.
1808 ///
1809 /// The set is ready to use, but must be setup at the beginning of each
1810 /// search by calling `setup_search`.
new(re: &BoundedBacktracker) -> Visited1811 fn new(re: &BoundedBacktracker) -> Visited {
1812 let mut visited = Visited { bitset: vec![], stride: 0 };
1813 visited.reset(re);
1814 visited
1815 }
1816
1817 /// Insert the given (StateID, offset) pair into this set. If it already
1818 /// exists, then this is a no-op and it returns false. Otherwise this
1819 /// returns true.
insert(&mut self, sid: StateID, at: usize) -> bool1820 fn insert(&mut self, sid: StateID, at: usize) -> bool {
1821 let table_index = sid.as_usize() * self.stride + at;
1822 let block_index = table_index / Visited::BLOCK_SIZE;
1823 let bit = table_index % Visited::BLOCK_SIZE;
1824 let block_with_bit = 1 << bit;
1825 if self.bitset[block_index] & block_with_bit != 0 {
1826 return false;
1827 }
1828 self.bitset[block_index] |= block_with_bit;
1829 true
1830 }
1831
1832 /// Reset this visited set to work with the given bounded backtracker.
reset(&mut self, _: &BoundedBacktracker)1833 fn reset(&mut self, _: &BoundedBacktracker) {
1834 self.bitset.truncate(0);
1835 }
1836
1837 /// Setup this visited set to work for a search using the given NFA
1838 /// and input configuration. The NFA must be the same NFA used by the
1839 /// BoundedBacktracker given to Visited::reset. Failing to call this might
1840 /// result in panics or silently incorrect search behavior.
setup_search( &mut self, re: &BoundedBacktracker, input: &Input<'_>, ) -> Result<(), MatchError>1841 fn setup_search(
1842 &mut self,
1843 re: &BoundedBacktracker,
1844 input: &Input<'_>,
1845 ) -> Result<(), MatchError> {
1846 // Our haystack length is only the length of the span of the entire
1847 // haystack that we'll be searching.
1848 let haylen = input.get_span().len();
1849 let err = || MatchError::haystack_too_long(haylen);
1850 // Our stride is one more than the length of the input because our main
1851 // search loop includes the position at input.end(). (And it does this
1852 // because matches are delayed by one byte to account for look-around.)
1853 self.stride = haylen + 1;
1854 let needed_capacity =
1855 match re.get_nfa().states().len().checked_mul(self.stride) {
1856 None => return Err(err()),
1857 Some(capacity) => capacity,
1858 };
1859 let max_capacity = 8 * re.get_config().get_visited_capacity();
1860 if needed_capacity > max_capacity {
1861 return Err(err());
1862 }
1863 let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE);
1864 self.bitset.truncate(needed_blocks);
1865 for block in self.bitset.iter_mut() {
1866 *block = 0;
1867 }
1868 if needed_blocks > self.bitset.len() {
1869 self.bitset.resize(needed_blocks, 0);
1870 }
1871 Ok(())
1872 }
1873
1874 /// Return the heap memory usage, in bytes, of this visited set.
memory_usage(&self) -> usize1875 fn memory_usage(&self) -> usize {
1876 self.bitset.len() * core::mem::size_of::<usize>()
1877 }
1878 }
1879
1880 /// Integer division, but rounds up instead of down.
div_ceil(lhs: usize, rhs: usize) -> usize1881 fn div_ceil(lhs: usize, rhs: usize) -> usize {
1882 if lhs % rhs == 0 {
1883 lhs / rhs
1884 } else {
1885 (lhs / rhs) + 1
1886 }
1887 }
1888
1889 #[cfg(test)]
1890 mod tests {
1891 use super::*;
1892
1893 // This is a regression test for the maximum haystack length computation.
1894 // Previously, it assumed that the total capacity of the backtracker's
1895 // bitset would always be greater than the number of NFA states. But there
1896 // is of course no guarantee that this is true. This regression test
1897 // ensures that not only does `max_haystack_len` not panic, but that it
1898 // should return `0`.
1899 #[cfg(feature = "syntax")]
1900 #[test]
max_haystack_len_overflow()1901 fn max_haystack_len_overflow() {
1902 let re = BoundedBacktracker::builder()
1903 .configure(BoundedBacktracker::config().visited_capacity(10))
1904 .build(r"[0-9A-Za-z]{100}")
1905 .unwrap();
1906 assert_eq!(0, re.max_haystack_len());
1907 }
1908 }
1909