1 /*!
2 Types and routines specific to lazy DFAs.
3 
4 This module is the home of [`hybrid::dfa::DFA`](DFA).
5 
6 This module also contains a [`hybrid::dfa::Builder`](Builder) and a
7 [`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
8 */
9 
10 use core::{iter, mem::size_of};
11 
12 use alloc::vec::Vec;
13 
14 use crate::{
15     hybrid::{
16         error::{BuildError, CacheError, StartError},
17         id::{LazyStateID, LazyStateIDError},
18         search,
19     },
20     nfa::thompson,
21     util::{
22         alphabet::{self, ByteClasses, ByteSet},
23         determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
24         empty,
25         prefilter::Prefilter,
26         primitives::{PatternID, StateID as NFAStateID},
27         search::{
28             Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
29         },
30         sparse_set::SparseSets,
31         start::{self, Start, StartByteMap},
32     },
33 };
34 
35 /// The minimum number of states that a lazy DFA's cache size must support.
36 ///
37 /// This is checked at time of construction to ensure that at least some small
38 /// number of states can fit in the given capacity allotment. If we can't fit
39 /// at least this number of states, then the thinking is that it's pretty
40 /// senseless to use the lazy DFA. More to the point, parts of the code do
41 /// assume that the cache can fit at least some small number of states.
42 const MIN_STATES: usize = SENTINEL_STATES + 2;
43 
44 /// The number of "sentinel" states that get added to every lazy DFA.
45 ///
46 /// These are special states indicating status conditions of a search: unknown,
47 /// dead and quit. These states in particular also use zero NFA states, so
48 /// their memory usage is quite small. This is relevant for computing the
49 /// minimum memory needed for a lazy DFA cache.
50 const SENTINEL_STATES: usize = 3;
51 
52 /// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
53 ///
54 /// A lazy DFA is a DFA that builds itself at search time. It otherwise has
55 /// very similar characteristics to a [`dense::DFA`](crate::dfa::dense::DFA).
56 /// Indeed, both support precisely the same regex features with precisely the
57 /// same semantics.
58 ///
59 /// Whereas a `dense::DFA` must be completely built to handle any input before
60 /// it may be used for search, a lazy DFA starts off effectively empty. During
61 /// a search, a lazy DFA will build itself depending on whether it has already
62 /// computed the next transition or not. If it has, then it looks a lot like
63 /// a `dense::DFA` internally: it does a very fast table based access to find
64 /// the next transition. Otherwise, if the state hasn't been computed, then it
65 /// does determinization _for that specific transition_ to compute the next DFA
66 /// state.
67 ///
68 /// The main selling point of a lazy DFA is that, in practice, it has
69 /// the performance profile of a `dense::DFA` without the weakness of it
70 /// taking worst case exponential time to build. Indeed, for each byte of
71 /// input, the lazy DFA will construct at most one new DFA state. Thus, a
72 /// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
73 /// pattern.len()` and `n ~ haystack.len()`).
74 ///
75 /// The main downsides of a lazy DFA are:
76 ///
77 /// 1. It requires mutable "cache" space during search. This is where the
78 /// transition table, among other things, is stored.
79 /// 2. In pathological cases (e.g., if the cache is too small), it will run
80 /// out of room and either require a bigger cache capacity or will repeatedly
81 /// clear the cache and thus repeatedly regenerate DFA states. Overall, this
82 /// will tend to be slower than a typical NFA simulation.
83 ///
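/// For the second downside, the cache capacity can be raised, and the lazy
/// DFA can be told to give up if it must clear its cache too often. Below is
/// a minimal sketch of such a configuration; the capacity and clear-count
/// thresholds are arbitrary and purely illustrative.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::builder()
///     .configure(
///         DFA::config()
///             // Allow roughly 10MB for the lazily built transition table.
///             .cache_capacity(10 * (1 << 20))
///             // Return an error instead of thrashing if the cache must be
///             // cleared more than five times.
///             .minimum_cache_clear_count(Some(5)),
///     )
///     .build("foo[0-9]+")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(
///     &mut cache, &Input::new("foo12345"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///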
84 /// # Capabilities
85 ///
86 /// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
87 /// operations:
88 ///
89 /// 1. Detection of a match.
90 /// 2. Location of the end of a match.
91 /// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
92 /// is reported as well.
93 ///
94 /// A notable absence from the above list of capabilities is the location of
95 /// the *start* of a match. In order to provide both the start and end of
96 /// a match, *two* lazy DFAs are required. This functionality is provided by a
97 /// [`Regex`](crate::hybrid::regex::Regex).
98 ///
99 /// # Example
100 ///
101 /// This shows how to build a lazy DFA with the default configuration and
102 /// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
103 /// a cache and pass it to our search routine.
104 ///
105 /// ```
106 /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
107 ///
108 /// let dfa = DFA::new("foo[0-9]+")?;
109 /// let mut cache = dfa.create_cache();
110 ///
111 /// let expected = Some(HalfMatch::must(0, 8));
112 /// assert_eq!(expected, dfa.try_search_fwd(
113 ///     &mut cache, &Input::new("foo12345"))?,
114 /// );
115 /// # Ok::<(), Box<dyn std::error::Error>>(())
116 /// ```
117 #[derive(Clone, Debug)]
118 pub struct DFA {
119     config: Config,
120     nfa: thompson::NFA,
121     stride2: usize,
122     start_map: StartByteMap,
123     classes: ByteClasses,
124     quitset: ByteSet,
125     cache_capacity: usize,
126 }
127 
128 impl DFA {
129     /// Parse the given regular expression using a default configuration and
130     /// return the corresponding lazy DFA.
131     ///
132     /// If you want a non-default configuration, then use the [`Builder`] to
133     /// set your own configuration.
134     ///
135     /// # Example
136     ///
137     /// ```
138     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
139     ///
140     /// let dfa = DFA::new("foo[0-9]+bar")?;
141     /// let mut cache = dfa.create_cache();
142     ///
143     /// let expected = HalfMatch::must(0, 11);
144     /// assert_eq!(
145     ///     Some(expected),
146     ///     dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
147     /// );
148     /// # Ok::<(), Box<dyn std::error::Error>>(())
149     /// ```
150     #[cfg(feature = "syntax")]
151     pub fn new(pattern: &str) -> Result<DFA, BuildError> {
152         DFA::builder().build(pattern)
153     }
154 
155     /// Parse the given regular expressions using a default configuration and
156     /// return the corresponding lazy multi-DFA.
157     ///
158     /// If you want a non-default configuration, then use the [`Builder`] to
159     /// set your own configuration.
160     ///
161     /// # Example
162     ///
163     /// ```
164     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
165     ///
166     /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
167     /// let mut cache = dfa.create_cache();
168     ///
169     /// let expected = HalfMatch::must(1, 3);
170     /// assert_eq!(
171     ///     Some(expected),
172     ///     dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
173     /// );
174     /// # Ok::<(), Box<dyn std::error::Error>>(())
175     /// ```
176     #[cfg(feature = "syntax")]
177     pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
178         DFA::builder().build_many(patterns)
179     }
180 
181     /// Create a new lazy DFA that matches every input.
182     ///
183     /// # Example
184     ///
185     /// ```
186     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
187     ///
188     /// let dfa = DFA::always_match()?;
189     /// let mut cache = dfa.create_cache();
190     ///
191     /// let expected = HalfMatch::must(0, 0);
192     /// assert_eq!(Some(expected), dfa.try_search_fwd(
193     ///     &mut cache, &Input::new(""))?,
194     /// );
195     /// assert_eq!(Some(expected), dfa.try_search_fwd(
196     ///     &mut cache, &Input::new("foo"))?,
197     /// );
198     /// # Ok::<(), Box<dyn std::error::Error>>(())
199     /// ```
200     pub fn always_match() -> Result<DFA, BuildError> {
201         let nfa = thompson::NFA::always_match();
202         Builder::new().build_from_nfa(nfa)
203     }
204 
205     /// Create a new lazy DFA that never matches any input.
206     ///
207     /// # Example
208     ///
209     /// ```
210     /// use regex_automata::{hybrid::dfa::DFA, Input};
211     ///
212     /// let dfa = DFA::never_match()?;
213     /// let mut cache = dfa.create_cache();
214     ///
215     /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
216     /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
217     /// # Ok::<(), Box<dyn std::error::Error>>(())
218     /// ```
219     pub fn never_match() -> Result<DFA, BuildError> {
220         let nfa = thompson::NFA::never_match();
221         Builder::new().build_from_nfa(nfa)
222     }
223 
224     /// Return a default configuration for a `DFA`.
225     ///
226     /// This is a convenience routine to avoid needing to import the [`Config`]
227     /// type when customizing the construction of a lazy DFA.
228     ///
229     /// # Example
230     ///
231     /// This example shows how to build a lazy DFA that heuristically supports
232     /// Unicode word boundaries.
233     ///
234     /// ```
235     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
236     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
237     ///
238     /// let re = DFA::builder()
239     ///     .configure(DFA::config().unicode_word_boundary(true))
240     ///     .build(r"\b\w+\b")?;
241     /// let mut cache = re.create_cache();
242     ///
243     /// // Since our haystack is all ASCII, the DFA search sees this and knows
244     /// // it is legal to interpret Unicode word boundaries as ASCII word
245     /// // boundaries.
246     /// let input = Input::new("!!foo!!");
247     /// let expected = HalfMatch::must(0, 5);
248     /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
249     ///
250     /// // But if our haystack contains non-ASCII, then the search will fail
251     /// // with an error.
252     /// let input = Input::new("!!βββ!!");
253     /// let expected = MatchError::quit(b'\xCE', 2);
254     /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
255     ///
256     /// # Ok::<(), Box<dyn std::error::Error>>(())
257     /// ```
258     pub fn config() -> Config {
259         Config::new()
260     }
261 
262     /// Return a builder for configuring the construction of a `DFA`.
263     ///
264     /// This is a convenience routine to avoid needing to import the
265     /// [`Builder`] type in common cases.
266     ///
267     /// # Example
268     ///
269     /// This example shows how to use the builder to disable UTF-8 mode
270     /// everywhere for lazy DFAs.
271     ///
272     /// ```
273     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
274     /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
275     ///
276     /// let re = DFA::builder()
277     ///     .syntax(syntax::Config::new().utf8(false))
278     ///     .build(r"foo(?-u:[^b])ar.*")?;
279     /// let mut cache = re.create_cache();
280     ///
281     /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n");
282     /// let expected = Some(HalfMatch::must(0, 9));
283     /// let got = re.try_search_fwd(&mut cache, &input)?;
284     /// assert_eq!(expected, got);
285     ///
286     /// # Ok::<(), Box<dyn std::error::Error>>(())
287     /// ```
288     pub fn builder() -> Builder {
289         Builder::new()
290     }
291 
292     /// Create a new cache for this lazy DFA.
293     ///
294     /// The cache returned should only be used for searches for this
295     /// lazy DFA. If you want to reuse the cache for another DFA, then
296     /// you must call [`Cache::reset`] with that DFA (or, equivalently,
297     /// [`DFA::reset_cache`]).
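    ///
    /// # Example
    ///
    /// A minimal sketch of creating and using a cache (the pattern here is
    /// purely illustrative); note that each concurrently executing search
    /// needs its own cache:
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(
    ///     &mut cache, &Input::new("foo12345"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```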
298     pub fn create_cache(&self) -> Cache {
299         Cache::new(self)
300     }
301 
302     /// Reset the given cache such that it can be used for searching with
303     /// this lazy DFA (and only this DFA).
304     ///
305     /// A cache reset permits reusing memory already allocated in this cache
306     /// with a different lazy DFA.
307     ///
308     /// Resetting a cache sets its "clear count" to 0. This is relevant if the
309     /// lazy DFA has been configured to "give up" after it has cleared the
310     /// cache a certain number of times.
311     ///
312     /// Any lazy state ID generated by the cache prior to resetting it is
313     /// invalid after the reset.
314     ///
315     /// # Example
316     ///
317     /// This shows how to re-purpose a cache for use with a different DFA.
318     ///
319     /// ```
320     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
321     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
322     ///
323     /// let dfa1 = DFA::new(r"\w")?;
324     /// let dfa2 = DFA::new(r"\W")?;
325     ///
326     /// let mut cache = dfa1.create_cache();
327     /// assert_eq!(
328     ///     Some(HalfMatch::must(0, 2)),
329     ///     dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
330     /// );
331     ///
332     /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
333     /// // incorrect results. In order to re-purpose the cache, we must reset
334     /// // it with the DFA we'd like to use it with.
335     /// //
336     /// // Similarly, after this reset, using the cache with 'dfa1' is also not
337     /// // allowed.
338     /// dfa2.reset_cache(&mut cache);
339     /// assert_eq!(
340     ///     Some(HalfMatch::must(0, 3)),
341     ///     dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
342     /// );
343     ///
344     /// # Ok::<(), Box<dyn std::error::Error>>(())
345     /// ```
346     pub fn reset_cache(&self, cache: &mut Cache) {
347         Lazy::new(self, cache).reset_cache()
348     }
349 
350     /// Returns the total number of patterns compiled into this lazy DFA.
351     ///
352     /// In the case of a DFA that contains no patterns, this returns `0`.
353     ///
354     /// # Example
355     ///
356     /// This example shows the pattern length for a DFA that never matches:
357     ///
358     /// ```
359     /// use regex_automata::hybrid::dfa::DFA;
360     ///
361     /// let dfa = DFA::never_match()?;
362     /// assert_eq!(dfa.pattern_len(), 0);
363     /// # Ok::<(), Box<dyn std::error::Error>>(())
364     /// ```
365     ///
366     /// And another example for a DFA that matches at every position:
367     ///
368     /// ```
369     /// use regex_automata::hybrid::dfa::DFA;
370     ///
371     /// let dfa = DFA::always_match()?;
372     /// assert_eq!(dfa.pattern_len(), 1);
373     /// # Ok::<(), Box<dyn std::error::Error>>(())
374     /// ```
375     ///
376     /// And finally, a DFA that was constructed from multiple patterns:
377     ///
378     /// ```
379     /// use regex_automata::hybrid::dfa::DFA;
380     ///
381     /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
382     /// assert_eq!(dfa.pattern_len(), 3);
383     /// # Ok::<(), Box<dyn std::error::Error>>(())
384     /// ```
385     pub fn pattern_len(&self) -> usize {
386         self.nfa.pattern_len()
387     }
388 
389     /// Returns the equivalence classes that make up the alphabet for this DFA.
390     ///
391     /// Unless [`Config::byte_classes`] was disabled, it is possible that
392     /// multiple distinct bytes are grouped into the same equivalence class
393     /// if it is impossible for them to discriminate between a match and a
394     /// non-match. This has the effect of reducing the overall alphabet size
395     /// and in turn potentially substantially reducing the size of the DFA's
396     /// transition table.
397     ///
398     /// The downside of using equivalence classes like this is that every state
399     /// transition will automatically use this map to convert an arbitrary
400     /// byte to its corresponding equivalence class. In practice this has a
401     /// negligible impact on performance.
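    ///
    /// # Example
    ///
    /// A small sketch showing how distinct bytes collapse into a handful of
    /// equivalence classes (the exact number of classes depends on the
    /// pattern):
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new("[a-z]+")?;
    /// let classes = dfa.byte_classes();
    /// // For this pattern, 'a' and 'z' cannot discriminate between a match
    /// // and a non-match, so they share an equivalence class.
    /// assert_eq!(classes.get(b'a'), classes.get(b'z'));
    /// // The effective alphabet is much smaller than 256.
    /// assert!(classes.alphabet_len() < 256);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```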
402     pub fn byte_classes(&self) -> &ByteClasses {
403         &self.classes
404     }
405 
406     /// Returns this lazy DFA's configuration.
407     pub fn get_config(&self) -> &Config {
408         &self.config
409     }
410 
411     /// Returns a reference to the underlying NFA.
412     pub fn get_nfa(&self) -> &thompson::NFA {
413         &self.nfa
414     }
415 
416     /// Returns the stride, as a base-2 exponent, required for these
417     /// equivalence classes.
418     ///
419     /// The stride is always the smallest power of 2 that is greater than or
420     /// equal to the alphabet length. This is done so that converting between
421     /// state IDs and indices can be done with shifts alone, which is much
422     /// faster than integer division.
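    ///
    /// For example, an alphabet length of 40 yields a stride of 64 and a
    /// `stride2` of 6, so converting a state ID to a table index is a shift
    /// right by 6 and converting back is a shift left by 6, with no division
    /// involved.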
423     fn stride2(&self) -> usize {
424         self.stride2
425     }
426 
427     /// Returns the total stride for every state in this lazy DFA. This
428     /// corresponds to the total number of transitions used by each state in
429     /// this DFA's transition table.
430     fn stride(&self) -> usize {
431         1 << self.stride2()
432     }
433 
434     /// Returns the memory usage, in bytes, of this lazy DFA.
435     ///
436     /// This does **not** include the stack size used up by this lazy DFA. To
437     /// compute that, use `std::mem::size_of::<DFA>()`. This also does not
438     /// include the size of the `Cache` used.
439     ///
440     /// This also does not include any heap memory used by the NFA inside of
441     /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and
442     /// thus not owned by this hybrid NFA/DFA. More practically, several regex
443     /// engines in this crate embed an NFA, and reporting the NFA's memory
444     /// usage in all of them would likely result in reporting higher heap
445     /// memory than is actually used.
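    ///
    /// # Example
    ///
    /// A short sketch contrasting this with the cache's own accounting; the
    /// pattern is arbitrary and `Cache::memory_usage` reports the heap memory
    /// used by the cache itself:
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    /// let cache = dfa.create_cache();
    /// // The DFA itself reports no heap usage; the transition table lives
    /// // in the cache.
    /// assert_eq!(0, dfa.memory_usage());
    /// assert!(cache.memory_usage() > 0);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```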
446     pub fn memory_usage(&self) -> usize {
447         // The only thing that uses heap memory in a DFA is the NFA. But the
448         // NFA has shared ownership, so reporting its memory as part of the
449         // hybrid DFA is likely to lead to double-counting the NFA memory
450         // somehow. In particular, this DFA does not really own an NFA, so
451         // including it in the DFA's memory usage doesn't seem semantically
452         // correct.
453         0
454     }
455 }
456 
457 impl DFA {
458     /// Executes a forward search and returns the end position of the leftmost
459     /// match that is found. If no match exists, then `None` is returned.
460     ///
461     /// In particular, this method continues searching even after it enters
462     /// a match state. The search only terminates once it has reached the
463     /// end of the input or when it has entered a dead or quit state. Upon
464     /// termination, the position of the last byte seen while still in a match
465     /// state is returned.
466     ///
467     /// # Errors
468     ///
469     /// This routine errors if the search could not complete. This can occur
470     /// in a number of circumstances:
471     ///
472     /// * The configuration of the lazy DFA may permit it to "quit" the search.
473     /// For example, setting quit bytes or enabling heuristic support for
474     /// Unicode word boundaries. The default configuration does not enable any
475     /// option that could result in the lazy DFA quitting.
476     /// * The configuration of the lazy DFA may also permit it to "give up"
477     /// on a search if it makes ineffective use of its transition table
478     /// cache. This is not enabled in the default configuration, although
479     /// it is typically a good idea to enable it.
480     /// * When the provided `Input` configuration is not supported. For
481     /// example, by providing an unsupported anchor mode.
482     ///
483     /// When a search returns an error, callers cannot know whether a match
484     /// exists or not.
485     ///
486     /// # Example
487     ///
488     /// This example shows how to run a basic search.
489     ///
490     /// ```
491     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
492     ///
493     /// let dfa = DFA::new("foo[0-9]+")?;
494     /// let mut cache = dfa.create_cache();
495     /// let expected = HalfMatch::must(0, 8);
496     /// assert_eq!(Some(expected), dfa.try_search_fwd(
497     ///     &mut cache, &Input::new("foo12345"))?,
498     /// );
499     ///
500     /// // Even though a match is found after reading the first byte (`a`),
501     /// // the leftmost first match semantics demand that we find the earliest
502     /// // match that prefers earlier parts of the pattern over later parts.
503     /// let dfa = DFA::new("abc|a")?;
504     /// let mut cache = dfa.create_cache();
505     /// let expected = HalfMatch::must(0, 3);
506     /// assert_eq!(Some(expected), dfa.try_search_fwd(
507     ///     &mut cache, &Input::new("abc"))?,
508     /// );
509     ///
510     /// # Ok::<(), Box<dyn std::error::Error>>(())
511     /// ```
512     ///
513     /// # Example: specific pattern search
514     ///
515     /// This example shows how to build a lazy multi-DFA that permits searching
516     /// for specific patterns.
517     ///
518     /// ```
519     /// use regex_automata::{
520     ///     hybrid::dfa::DFA,
521     ///     Anchored, HalfMatch, PatternID, Input,
522     /// };
523     ///
524     /// let dfa = DFA::builder()
525     ///     .configure(DFA::config().starts_for_each_pattern(true))
526     ///     .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
527     /// let mut cache = dfa.create_cache();
528     /// let haystack = "foo123";
529     ///
530     /// // Since we are using the default leftmost-first match and both
531     /// // patterns match at the same starting position, only the first pattern
532     /// // will be returned in this case when doing a search for any of the
533     /// // patterns.
534     /// let expected = Some(HalfMatch::must(0, 6));
535     /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
536     /// assert_eq!(expected, got);
537     ///
538     /// // But if we want to check whether some other pattern matches, then we
539     /// // can provide its pattern ID.
540     /// let expected = Some(HalfMatch::must(1, 6));
541     /// let input = Input::new(haystack)
542     ///     .anchored(Anchored::Pattern(PatternID::must(1)));
543     /// let got = dfa.try_search_fwd(&mut cache, &input)?;
544     /// assert_eq!(expected, got);
545     ///
546     /// # Ok::<(), Box<dyn std::error::Error>>(())
547     /// ```
548     ///
549     /// # Example: specifying the bounds of a search
550     ///
551     /// This example shows how providing the bounds of a search can produce
552     /// different results than simply sub-slicing the haystack.
553     ///
554     /// ```
555     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
556     ///
557     /// // N.B. We disable Unicode here so that we use a simple ASCII word
558     /// // boundary. Alternatively, we could enable heuristic support for
559     /// // Unicode word boundaries since our haystack is pure ASCII.
560     /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
561     /// let mut cache = dfa.create_cache();
562     /// let haystack = "foo123bar";
563     ///
564     /// // Since we sub-slice the haystack, the search doesn't know about the
565     /// // larger context and assumes that `123` is surrounded by word
566     /// // boundaries. And of course, the match position is reported relative
567     /// // to the sub-slice as well, which means we get `3` instead of `6`.
568     /// let expected = Some(HalfMatch::must(0, 3));
569     /// let got = dfa.try_search_fwd(
570     ///     &mut cache,
571     ///     &Input::new(&haystack[3..6]),
572     /// )?;
573     /// assert_eq!(expected, got);
574     ///
575     /// // But if we provide the bounds of the search within the context of the
576     /// // entire haystack, then the search can take the surrounding context
577     /// // into account. (And if we did find a match, it would be reported
578     /// // as a valid offset into `haystack` instead of its sub-slice.)
579     /// let expected = None;
580     /// let got = dfa.try_search_fwd(
581     ///     &mut cache,
582     ///     &Input::new(haystack).range(3..6),
583     /// )?;
584     /// assert_eq!(expected, got);
585     ///
586     /// # Ok::<(), Box<dyn std::error::Error>>(())
587     /// ```
588     #[inline]
589     pub fn try_search_fwd(
590         &self,
591         cache: &mut Cache,
592         input: &Input<'_>,
593     ) -> Result<Option<HalfMatch>, MatchError> {
594         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
595         let hm = match search::find_fwd(self, cache, input)? {
596             None => return Ok(None),
597             Some(hm) if !utf8empty => return Ok(Some(hm)),
598             Some(hm) => hm,
599         };
600         // We get to this point when we know our DFA can match the empty string
601         // AND when UTF-8 mode is enabled. In this case, we skip any matches
602         // whose offset splits a codepoint. Such a match is necessarily a
603         // zero-width match, because UTF-8 mode requires the underlying NFA
604         // to be built such that all non-empty matches span valid UTF-8.
605         // Therefore, any match that ends in the middle of a codepoint cannot
606         // be part of a span of valid UTF-8 and thus must be an empty match.
607         // In such cases, we skip it, so as not to report matches that split a
608         // codepoint.
609         //
610         // Note that this is not a checked assumption. Callers *can* provide an
611         // NFA with UTF-8 mode enabled but produces non-empty matches that span
612         // invalid UTF-8. But doing so is documented to result in unspecified
613         // behavior.
614         empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
615             let got = search::find_fwd(self, cache, input)?;
616             Ok(got.map(|hm| (hm, hm.offset())))
617         })
618     }
619 
620     /// Executes a reverse search and returns the start position of the
621     /// leftmost match that is found. If no match exists, then `None` is
622     /// returned.
623     ///
624     /// # Errors
625     ///
626     /// This routine errors if the search could not complete. This can occur
627     /// in a number of circumstances:
628     ///
629     /// * The configuration of the lazy DFA may permit it to "quit" the search.
630     /// For example, setting quit bytes or enabling heuristic support for
631     /// Unicode word boundaries. The default configuration does not enable any
632     /// option that could result in the lazy DFA quitting.
633     /// * The configuration of the lazy DFA may also permit it to "give up"
634     /// on a search if it makes ineffective use of its transition table
635     /// cache. This is not enabled in the default configuration, although
636     /// it is typically a good idea to enable it.
637     /// * When the provided `Input` configuration is not supported. For
638     /// example, by providing an unsupported anchor mode.
639     ///
640     /// When a search returns an error, callers cannot know whether a match
641     /// exists or not.
642     ///
643     /// # Example
644     ///
645     /// This routine is principally useful when used in
646     /// conjunction with the
647     /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
648     /// configuration. In general, it's unlikely to be correct to use both
649     /// `try_search_fwd` and `try_search_rev` with the same DFA since any
650     /// particular DFA will only support searching in one direction with
651     /// respect to the pattern.
652     ///
653     /// ```
654     /// use regex_automata::{
655     ///     nfa::thompson,
656     ///     hybrid::dfa::DFA,
657     ///     HalfMatch, Input,
658     /// };
659     ///
660     /// let dfa = DFA::builder()
661     ///     .thompson(thompson::Config::new().reverse(true))
662     ///     .build("foo[0-9]+")?;
663     /// let mut cache = dfa.create_cache();
664     /// let expected = HalfMatch::must(0, 0);
665     /// assert_eq!(
666     ///     Some(expected),
667     ///     dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?,
668     /// );
669     ///
670     /// // Even though a match is found after reading the last byte (`c`),
671     /// // the leftmost first match semantics demand that we find the earliest
672     /// // match that prefers earlier parts of the pattern over later parts.
673     /// let dfa = DFA::builder()
674     ///     .thompson(thompson::Config::new().reverse(true))
675     ///     .build("abc|c")?;
676     /// let mut cache = dfa.create_cache();
677     /// let expected = HalfMatch::must(0, 0);
678     /// assert_eq!(Some(expected), dfa.try_search_rev(
679     ///     &mut cache, &Input::new("abc"))?,
680     /// );
681     ///
682     /// # Ok::<(), Box<dyn std::error::Error>>(())
683     /// ```
684     ///
685     /// # Example: UTF-8 mode
686     ///
687     /// This example demonstrates that UTF-8 mode applies to reverse
688     /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
689     /// matches reported must correspond to valid UTF-8 spans. This includes
690     /// prohibiting zero-width matches that split a codepoint.
691     ///
692     /// UTF-8 mode is enabled by default. Notice below how the only zero-width
693     /// matches reported are those at UTF-8 boundaries:
694     ///
695     /// ```
696     /// use regex_automata::{
697     ///     hybrid::dfa::DFA,
698     ///     nfa::thompson,
699     ///     HalfMatch, Input, MatchKind,
700     /// };
701     ///
702     /// let dfa = DFA::builder()
703     ///     .thompson(thompson::Config::new().reverse(true))
704     ///     .build(r"")?;
705     /// let mut cache = dfa.create_cache();
706     ///
707     /// // Run the reverse DFA to collect all matches.
708     /// let mut input = Input::new("☃");
709     /// let mut matches = vec![];
710     /// loop {
711     ///     match dfa.try_search_rev(&mut cache, &input)? {
712     ///         None => break,
713     ///         Some(hm) => {
714     ///             matches.push(hm);
715     ///             if hm.offset() == 0 || input.end() == 0 {
716     ///                 break;
717     ///             } else if hm.offset() < input.end() {
718     ///                 input.set_end(hm.offset());
719     ///             } else {
720     ///                 // This is only necessary to handle zero-width
721     ///                 // matches, which of course occur in this example.
722     ///                 // Without this, the search would never advance
723     ///                 // backwards beyond the initial match.
724     ///                 input.set_end(input.end() - 1);
725     ///             }
726     ///         }
727     ///     }
728     /// }
729     ///
730     /// // No matches split a codepoint.
731     /// let expected = vec![
732     ///     HalfMatch::must(0, 3),
733     ///     HalfMatch::must(0, 0),
734     /// ];
735     /// assert_eq!(expected, matches);
736     ///
737     /// # Ok::<(), Box<dyn std::error::Error>>(())
738     /// ```
739     ///
740     /// Now let's look at the same example, but with UTF-8 mode on the
741     /// underlying NFA disabled:
742     ///
743     /// ```
744     /// use regex_automata::{
745     ///     hybrid::dfa::DFA,
746     ///     nfa::thompson,
747     ///     HalfMatch, Input, MatchKind,
748     /// };
749     ///
750     /// let dfa = DFA::builder()
751     ///     .thompson(thompson::Config::new().reverse(true).utf8(false))
752     ///     .build(r"")?;
753     /// let mut cache = dfa.create_cache();
754     ///
755     /// // Run the reverse DFA to collect all matches.
756     /// let mut input = Input::new("☃");
757     /// let mut matches = vec![];
758     /// loop {
759     ///     match dfa.try_search_rev(&mut cache, &input)? {
760     ///         None => break,
761     ///         Some(hm) => {
762     ///             matches.push(hm);
763     ///             if hm.offset() == 0 || input.end() == 0 {
764     ///                 break;
765     ///             } else if hm.offset() < input.end() {
766     ///                 input.set_end(hm.offset());
767     ///             } else {
768     ///                 // This is only necessary to handle zero-width
769     ///                 // matches, which of course occur in this example.
770     ///                 // Without this, the search would never advance
771     ///                 // backwards beyond the initial match.
772     ///                 input.set_end(input.end() - 1);
773     ///             }
774     ///         }
775     ///     }
776     /// }
777     ///
778     /// // No matches split a codepoint.
779     /// let expected = vec![
780     ///     HalfMatch::must(0, 3),
781     ///     HalfMatch::must(0, 2),
782     ///     HalfMatch::must(0, 1),
783     ///     HalfMatch::must(0, 0),
784     /// ];
785     /// assert_eq!(expected, matches);
786     ///
787     /// # Ok::<(), Box<dyn std::error::Error>>(())
788     /// ```
789     #[inline]
790     pub fn try_search_rev(
791         &self,
792         cache: &mut Cache,
793         input: &Input<'_>,
794     ) -> Result<Option<HalfMatch>, MatchError> {
795         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
796         let hm = match search::find_rev(self, cache, input)? {
797             None => return Ok(None),
798             Some(hm) if !utf8empty => return Ok(Some(hm)),
799             Some(hm) => hm,
800         };
801         empty::skip_splits_rev(input, hm, hm.offset(), |input| {
802             let got = search::find_rev(self, cache, input)?;
803             Ok(got.map(|hm| (hm, hm.offset())))
804         })
805     }
806 
807     /// Executes an overlapping forward search and returns the end position of
808     /// matches as they are found. If no match exists, then `None` is returned.
809     ///
810     /// This routine is principally only useful when searching for multiple
811     /// patterns on inputs where multiple patterns may match the same regions
812     /// of text. In particular, callers must preserve the automaton's search
813     /// state from prior calls so that the implementation knows where the last
814     /// match occurred.
815     ///
816     /// When using this routine to implement an iterator of overlapping
817     /// matches, the `start` of the search should remain invariant throughout
818     /// iteration. The `OverlappingState` given to the search will keep track
819     /// of the current position of the search. (This is because multiple
820     /// matches may be reported at the same position, so only the search
821     /// implementation itself knows when to advance the position.)
822     ///
823     /// If for some reason you want the search to forget about its previous
824     /// state and restart the search at a particular position, then setting the
825     /// state to [`OverlappingState::start`] will accomplish that.
826     ///
827     /// # Errors
828     ///
829     /// This routine errors if the search could not complete. This can occur
830     /// in a number of circumstances:
831     ///
832     /// * The configuration of the lazy DFA may permit it to "quit" the search.
833     /// For example, setting quit bytes or enabling heuristic support for
834     /// Unicode word boundaries. The default configuration does not enable any
835     /// option that could result in the lazy DFA quitting.
836     /// * The configuration of the lazy DFA may also permit it to "give up"
837     /// on a search if it makes ineffective use of its transition table
838     /// cache. This is not enabled in the default configuration, although
839     /// it is typically a good idea to enable it.
840     /// * When the provided `Input` configuration is not supported. For
841     /// example, by providing an unsupported anchor mode.
842     ///
843     /// When a search returns an error, callers cannot know whether a match
844     /// exists or not.
845     ///
846     /// # Example
847     ///
848     /// This example shows how to run a basic overlapping search. Notice
849     /// that we build the automaton with a `MatchKind::All` configuration.
850     /// Overlapping searches are unlikely to work as one would expect when
851     /// using the default `MatchKind::LeftmostFirst` match semantics, since
852     /// leftmost-first matching is fundamentally incompatible with overlapping
853     /// searches. Namely, overlapping searches need to report matches as they
854     /// are seen, whereas leftmost-first searches will continue searching even
855     /// after a match has been observed in order to find the conventional end
856     /// position of the match. More concretely, leftmost-first searches use
857     /// dead states to terminate a search after a specific match can no longer
858     /// be extended. Overlapping searches instead do the opposite by continuing
859     /// the search to find totally new matches (potentially of other patterns).
860     ///
861     /// ```
862     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
863     /// use regex_automata::{
864     ///     hybrid::dfa::{DFA, OverlappingState},
865     ///     HalfMatch, Input, MatchKind,
866     /// };
867     ///
868     /// let dfa = DFA::builder()
869     ///     .configure(DFA::config().match_kind(MatchKind::All))
870     ///     .build_many(&[r"\w+$", r"\S+$"])?;
871     /// let mut cache = dfa.create_cache();
872     ///
873     /// let haystack = "@foo";
874     /// let mut state = OverlappingState::start();
875     ///
876     /// let expected = Some(HalfMatch::must(1, 4));
877     /// dfa.try_search_overlapping_fwd(
878     ///     &mut cache, &Input::new(haystack), &mut state,
879     /// )?;
880     /// assert_eq!(expected, state.get_match());
881     ///
882     /// // The first pattern also matches at the same position, so re-running
883     /// // the search will yield another match. Notice also that the first
884     /// // pattern is returned after the second. This is because the second
885     /// // pattern begins its match before the first, is therefore an earlier
886     /// // match and is thus reported first.
887     /// let expected = Some(HalfMatch::must(0, 4));
888     /// dfa.try_search_overlapping_fwd(
889     ///     &mut cache, &Input::new(haystack), &mut state,
890     /// )?;
891     /// assert_eq!(expected, state.get_match());
892     ///
893     /// # Ok::<(), Box<dyn std::error::Error>>(())
894     /// ```
895     #[inline]
896     pub fn try_search_overlapping_fwd(
897         &self,
898         cache: &mut Cache,
899         input: &Input<'_>,
900         state: &mut OverlappingState,
901     ) -> Result<(), MatchError> {
902         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
903         search::find_overlapping_fwd(self, cache, input, state)?;
904         match state.get_match() {
905             None => Ok(()),
906             Some(_) if !utf8empty => Ok(()),
907             Some(_) => skip_empty_utf8_splits_overlapping(
908                 input,
909                 state,
910                 |input, state| {
911                     search::find_overlapping_fwd(self, cache, input, state)
912                 },
913             ),
914         }
915     }
916 
917     /// Executes a reverse overlapping search and returns the start position
918     /// of the leftmost match that is found. If no match exists, then
919     /// `None` is returned.
920     ///
921     /// When using this routine to implement an iterator of overlapping
922     /// matches, the `start` of the search should remain invariant throughout
923     /// iteration. The `OverlappingState` given to the search will keep track
924     /// of the current position of the search. (This is because multiple
925     /// matches may be reported at the same position, so only the search
926     /// implementation itself knows when to advance the position.)
927     ///
928     /// If for some reason you want the search to forget about its previous
929     /// state and restart the search at a particular position, then setting the
930     /// state to [`OverlappingState::start`] will accomplish that.
931     ///
932     /// # Errors
933     ///
934     /// This routine errors if the search could not complete. This can occur
935     /// in a number of circumstances:
936     ///
937     /// * The configuration of the lazy DFA may permit it to "quit" the search.
938     /// For example, setting quit bytes or enabling heuristic support for
939     /// Unicode word boundaries. The default configuration does not enable any
940     /// option that could result in the lazy DFA quitting.
941     /// * The configuration of the lazy DFA may also permit it to "give up"
942     /// on a search if it makes ineffective use of its transition table
943     /// cache. This is not enabled in the default configuration, although
944     /// it is typically a good idea to enable it.
945     /// * When the provided `Input` configuration is not supported. For
946     /// example, by providing an unsupported anchor mode.
947     ///
948     /// When a search returns an error, callers cannot know whether a match
949     /// exists or not.
950     ///
951     /// # Example: UTF-8 mode
952     ///
953     /// This example demonstrates that UTF-8 mode applies to reverse
954     /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
955     /// matches reported must correspond to valid UTF-8 spans. This includes
956     /// prohibiting zero-width matches that split a codepoint.
957     ///
958     /// UTF-8 mode is enabled by default. Notice below how the only zero-width
959     /// matches reported are those at UTF-8 boundaries:
960     ///
961     /// ```
962     /// use regex_automata::{
963     ///     hybrid::dfa::{DFA, OverlappingState},
964     ///     nfa::thompson,
965     ///     HalfMatch, Input, MatchKind,
966     /// };
967     ///
968     /// let dfa = DFA::builder()
969     ///     .configure(DFA::config().match_kind(MatchKind::All))
970     ///     .thompson(thompson::Config::new().reverse(true))
971     ///     .build_many(&[r"", r"☃"])?;
972     /// let mut cache = dfa.create_cache();
973     ///
974     /// // Run the reverse DFA to collect all matches.
975     /// let input = Input::new("☃");
976     /// let mut state = OverlappingState::start();
977     /// let mut matches = vec![];
978     /// loop {
979     ///     dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
980     ///     match state.get_match() {
981     ///         None => break,
982     ///         Some(hm) => matches.push(hm),
983     ///     }
984     /// }
985     ///
986     /// // No matches split a codepoint.
987     /// let expected = vec![
988     ///     HalfMatch::must(0, 3),
989     ///     HalfMatch::must(1, 0),
990     ///     HalfMatch::must(0, 0),
991     /// ];
992     /// assert_eq!(expected, matches);
993     ///
994     /// # Ok::<(), Box<dyn std::error::Error>>(())
995     /// ```
996     ///
997     /// Now let's look at the same example, but with UTF-8 mode on the
998     /// underlying NFA disabled:
999     ///
1000     /// ```
1001     /// use regex_automata::{
1002     ///     hybrid::dfa::{DFA, OverlappingState},
1003     ///     nfa::thompson,
1004     ///     HalfMatch, Input, MatchKind,
1005     /// };
1006     ///
1007     /// let dfa = DFA::builder()
1008     ///     .configure(DFA::config().match_kind(MatchKind::All))
1009     ///     .thompson(thompson::Config::new().reverse(true).utf8(false))
1010     ///     .build_many(&[r"", r"☃"])?;
1011     /// let mut cache = dfa.create_cache();
1012     ///
1013     /// // Run the reverse DFA to collect all matches.
1014     /// let input = Input::new("☃");
1015     /// let mut state = OverlappingState::start();
1016     /// let mut matches = vec![];
1017     /// loop {
1018     ///     dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
1019     ///     match state.get_match() {
1020     ///         None => break,
1021     ///         Some(hm) => matches.push(hm),
1022     ///     }
1023     /// }
1024     ///
1025     /// // Now *all* positions match, even within a codepoint,
1026     /// // because we lifted the requirement that matches
1027     /// // correspond to valid UTF-8 spans.
1028     /// let expected = vec![
1029     ///     HalfMatch::must(0, 3),
1030     ///     HalfMatch::must(0, 2),
1031     ///     HalfMatch::must(0, 1),
1032     ///     HalfMatch::must(1, 0),
1033     ///     HalfMatch::must(0, 0),
1034     /// ];
1035     /// assert_eq!(expected, matches);
1036     ///
1037     /// # Ok::<(), Box<dyn std::error::Error>>(())
1038     /// ```
1039     #[inline]
1040     pub fn try_search_overlapping_rev(
1041         &self,
1042         cache: &mut Cache,
1043         input: &Input<'_>,
1044         state: &mut OverlappingState,
1045     ) -> Result<(), MatchError> {
1046         let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1047         search::find_overlapping_rev(self, cache, input, state)?;
1048         match state.get_match() {
1049             None => Ok(()),
1050             Some(_) if !utf8empty => Ok(()),
1051             Some(_) => skip_empty_utf8_splits_overlapping(
1052                 input,
1053                 state,
1054                 |input, state| {
1055                     search::find_overlapping_rev(self, cache, input, state)
1056                 },
1057             ),
1058         }
1059     }
1060 
1061     /// Writes the set of patterns that match anywhere in the given search
1062     /// configuration to `patset`. If multiple patterns match at the same
1063     /// position and the underlying DFA supports overlapping matches, then all
1064     /// matching patterns are written to the given set.
1065     ///
1066     /// Unless all of the patterns in this DFA are anchored, this will,
1067     /// generally speaking, visit every byte in the haystack.
1068     ///
1069     /// This search routine *does not* clear the pattern set. This gives some
1070     /// flexibility to the caller (e.g., running multiple searches with the
1071     /// same pattern set), but does make the API bug-prone if you're reusing
1072     /// the same pattern set for multiple searches but intended them to be
1073     /// independent.
1074     ///
1075     /// If a pattern ID matched but the given `PatternSet` does not have
1076     /// sufficient capacity to store it, then it is not inserted and silently
1077     /// dropped.
1078     ///
1079     /// # Errors
1080     ///
1081     /// This routine errors if the search could not complete. This can occur
1082     /// in a number of circumstances:
1083     ///
1084     /// * The configuration of the lazy DFA may permit it to "quit" the search.
1085     /// For example, setting quit bytes or enabling heuristic support for
1086     /// Unicode word boundaries. The default configuration does not enable any
1087     /// option that could result in the lazy DFA quitting.
1088     /// * The configuration of the lazy DFA may also permit it to "give up"
1089     /// on a search if it makes ineffective use of its transition table
1090     /// cache. This is not enabled in the default configuration, although
1091     /// it is typically a good idea to enable it.
1092     /// * When the provided `Input` configuration is not supported. For
1093     /// example, by providing an unsupported anchor mode.
1094     ///
1095     /// When a search returns an error, callers cannot know whether a match
1096     /// exists or not.
1097     ///
1098     /// # Example
1099     ///
1100     /// This example shows how to find all matching patterns in a haystack,
1101     /// even when some patterns match at the same position as other patterns.
1102     ///
1103     /// ```
1104     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1105     /// use regex_automata::{
1106     ///     hybrid::dfa::DFA,
1107     ///     Input, MatchKind, PatternSet,
1108     /// };
1109     ///
1110     /// let patterns = &[
1111     ///     r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
1112     /// ];
1113     /// let dfa = DFA::builder()
1114     ///     .configure(DFA::config().match_kind(MatchKind::All))
1115     ///     .build_many(patterns)?;
1116     /// let mut cache = dfa.create_cache();
1117     ///
1118     /// let input = Input::new("foobar");
1119     /// let mut patset = PatternSet::new(dfa.pattern_len());
1120     /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?;
1121     /// let expected = vec![0, 2, 3, 4, 6];
1122     /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
1123     /// assert_eq!(expected, got);
1124     ///
1125     /// # Ok::<(), Box<dyn std::error::Error>>(())
1126     /// ```
1127     #[inline]
1128     pub fn try_which_overlapping_matches(
1129         &self,
1130         cache: &mut Cache,
1131         input: &Input<'_>,
1132         patset: &mut PatternSet,
1133     ) -> Result<(), MatchError> {
1134         let mut state = OverlappingState::start();
1135         while let Some(m) = {
1136             self.try_search_overlapping_fwd(cache, input, &mut state)?;
1137             state.get_match()
1138         } {
1139             let _ = patset.try_insert(m.pattern());
1140             // There's nothing left to find, so we can stop. Or the caller
1141             // asked us to.
1142             if patset.is_full() || input.get_earliest() {
1143                 break;
1144             }
1145         }
1146         Ok(())
1147     }
1148 }
1149 
1150 impl DFA {
1151     /// Transitions from the current state to the next state, given the next
1152     /// byte of input.
1153     ///
1154     /// The given cache is used to either reuse pre-computed state
1155     /// transitions, or to store this newly computed transition for future
1156     /// reuse. Thus, this routine guarantees that it will never return a state
1157     /// ID that has an "unknown" tag.
1158     ///
1159     /// # State identifier validity
1160     ///
1161     /// The only valid value for `current` is the lazy state ID returned
1162     /// by the most recent call to `next_state`, `next_state_untagged`,
1163     /// `next_state_untagged_unchecked`, `start_state_forward` or
1164     /// `start_state_reverse` for the given `cache`. Any state ID returned from
1165     /// prior calls to these routines (with the same `cache`) is considered
1166     /// invalid (even if it gives an appearance of working). State IDs returned
1167     /// from _any_ prior call for different `cache` values are also always
1168     /// invalid.
1169     ///
1170     /// The returned ID is always a valid ID when `current` refers to a valid
1171     /// ID. Moreover, this routine is defined for all possible values of
1172     /// `input`.
1173     ///
1174     /// These validity rules are not checked, even in debug mode. Callers are
1175     /// required to uphold these rules themselves.
1176     ///
1177     /// Violating these state ID validity rules will not sacrifice memory
1178     /// safety, but _may_ produce an incorrect result or a panic.
1179     ///
1180     /// # Panics
1181     ///
1182     /// If the given ID does not refer to a valid state, then this routine
1183     /// may panic but it also may not panic and instead return an invalid or
1184     /// incorrect ID.
1185     ///
1186     /// # Example
1187     ///
1188     /// This shows a simplistic example for walking a lazy DFA for a given
1189     /// haystack by using the `next_state` method.
1190     ///
1191     /// ```
1192     /// use regex_automata::{hybrid::dfa::DFA, Input};
1193     ///
1194     /// let dfa = DFA::new(r"[a-z]+r")?;
1195     /// let mut cache = dfa.create_cache();
1196     /// let haystack = "bar".as_bytes();
1197     ///
1198     /// // The start state is determined by inspecting the position and the
1199     /// // initial bytes of the haystack.
1200     /// let mut sid = dfa.start_state_forward(
1201     ///     &mut cache, &Input::new(haystack),
1202     /// )?;
1203     /// // Walk all the bytes in the haystack.
1204     /// for &b in haystack {
1205     ///     sid = dfa.next_state(&mut cache, sid, b)?;
1206     /// }
1207     /// // Matches are always delayed by 1 byte, so we must explicitly walk the
1208     /// // special "EOI" transition at the end of the search.
1209     /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1210     /// assert!(sid.is_match());
1211     ///
1212     /// # Ok::<(), Box<dyn std::error::Error>>(())
1213     /// ```
1214     #[inline]
1215     pub fn next_state(
1216         &self,
1217         cache: &mut Cache,
1218         current: LazyStateID,
1219         input: u8,
1220     ) -> Result<LazyStateID, CacheError> {
1221         let class = usize::from(self.classes.get(input));
1222         let offset = current.as_usize_untagged() + class;
1223         let sid = cache.trans[offset];
1224         if !sid.is_unknown() {
1225             return Ok(sid);
1226         }
1227         let unit = alphabet::Unit::u8(input);
1228         Lazy::new(self, cache).cache_next_state(current, unit)
1229     }
1230 
1231     /// Transitions from the current state to the next state, given the next
1232     /// byte of input and a state ID that is not tagged.
1233     ///
1234     /// The only reason to use this routine is performance. In particular, the
1235     /// `next_state` method needs to do some additional checks, among them
1236     /// accounting for identifiers of states that are not yet computed. In
1237     /// such a case, the transition is computed on the fly. However, if it is
1238     /// known that the `current` state ID is untagged, then these checks can be
1239     /// omitted.
1240     ///
1241     /// Since this routine does not compute states on the fly, it does not
1242     /// modify the cache and thus cannot return an error. Consequently, `cache`
1243     /// does not need to be mutable and it is possible for this routine to
1244     /// return a state ID corresponding to the special "unknown" state. In
1245     /// this case, it is the caller's responsibility to use the prior state
1246     /// ID and `input` with `next_state` in order to force the computation of
1247     /// the unknown transition. Otherwise, trying to use the "unknown" state
1248     /// ID will just result in transitioning back to itself, and thus never
1249     /// terminating. (This is technically a special exemption to the state ID
1250     /// validity rules, but is permissible since this routine is guaranteed to
1251     /// never mutate the given `cache`, and thus the identifier is guaranteed
1252     /// to remain valid.)
1253     ///
1254     /// See [`LazyStateID`] for more details on what it means for a state ID
1255     /// to be tagged. Also, see
1256     /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
1257     /// for this same idea, but with bounds checks forcefully elided.
1258     ///
1259     /// # State identifier validity
1260     ///
1261     /// The only valid value for `current` is an **untagged** lazy
1262     /// state ID returned by the most recent call to `next_state`,
1263     /// `next_state_untagged`, `next_state_untagged_unchecked`,
1264     /// `start_state_forward` or `start_state_reverse` for the given `cache`.
1265     /// Any state ID returned from prior calls to these routines (with the
1266     /// same `cache`) is considered invalid (even if it gives an appearance
1267     /// of working). State IDs returned from _any_ prior call for different
1268     /// `cache` values are also always invalid.
1269     ///
1270     /// The returned ID is always a valid ID when `current` refers to a valid
1271     /// ID, although it may be tagged. Moreover, this routine is defined for
1272     /// all possible values of `input`.
1273     ///
1274     /// Not all validity rules are checked, even in debug mode. Callers are
1275     /// required to uphold these rules themselves.
1276     ///
1277     /// Violating these state ID validity rules will not sacrifice memory
1278     /// safety, but _may_ produce an incorrect result or a panic.
1279     ///
1280     /// # Panics
1281     ///
1282     /// If the given ID does not refer to a valid state, then this routine
1283     /// may panic but it also may not panic and instead return an invalid or
1284     /// incorrect ID.
1285     ///
1286     /// # Example
1287     ///
1288     /// This shows a simplistic example for walking a lazy DFA for a given
1289     /// haystack by using the `next_state_untagged` method where possible.
1290     ///
1291     /// ```
1292     /// use regex_automata::{hybrid::dfa::DFA, Input};
1293     ///
1294     /// let dfa = DFA::new(r"[a-z]+r")?;
1295     /// let mut cache = dfa.create_cache();
1296     /// let haystack = "bar".as_bytes();
1297     ///
1298     /// // The start state is determined by inspecting the position and the
1299     /// // initial bytes of the haystack.
1300     /// let mut sid = dfa.start_state_forward(
1301     ///     &mut cache, &Input::new(haystack),
1302     /// )?;
1303     /// // Walk all the bytes in the haystack.
1304     /// let mut at = 0;
1305     /// while at < haystack.len() {
1306     ///     if sid.is_tagged() {
1307     ///         sid = dfa.next_state(&mut cache, sid, haystack[at])?;
    ///         at += 1;
1308     ///     } else {
1309     ///         let mut prev_sid = sid;
1310     ///         // We attempt to chew through as much as we can while moving
1311     ///         // through untagged state IDs. Thus, the transition function
1312     ///         // does less work on average per byte. (Unrolling this loop
1313     ///         // may help even more.)
1314     ///         while at < haystack.len() {
1315     ///             prev_sid = sid;
1316     ///             sid = dfa.next_state_untagged(
1317     ///                 &mut cache, sid, haystack[at],
1318     ///             );
1319     ///             at += 1;
1320     ///             if sid.is_tagged() {
1321     ///                 break;
1322     ///             }
1323     ///         }
1324     ///         // We must ensure that we never proceed to the next iteration
1325     ///         // with an unknown state ID. If we don't account for this
1326     ///         // case, then search isn't guaranteed to terminate since all
1327     ///         // transitions on the unknown state loop back to itself.
1328     ///         if sid.is_unknown() {
1329     ///             sid = dfa.next_state(
1330     ///                 &mut cache, prev_sid, haystack[at - 1],
1331     ///             )?;
1332     ///         }
1333     ///     }
1334     /// }
1335     /// // Matches are always delayed by 1 byte, so we must explicitly walk the
1336     /// // special "EOI" transition at the end of the search.
1337     /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1338     /// assert!(sid.is_match());
1339     ///
1340     /// # Ok::<(), Box<dyn std::error::Error>>(())
1341     /// ```
1342     #[inline]
1343     pub fn next_state_untagged(
1344         &self,
1345         cache: &Cache,
1346         current: LazyStateID,
1347         input: u8,
1348     ) -> LazyStateID {
1349         debug_assert!(!current.is_tagged());
1350         let class = usize::from(self.classes.get(input));
1351         let offset = current.as_usize_unchecked() + class;
1352         cache.trans[offset]
1353     }
1354 
1355     /// Transitions from the current state to the next state, eliding bounds
1356     /// checks, given the next byte of input and a state ID that is not tagged.
1357     ///
1358     /// The only reason to use this routine is performance. In particular, the
1359     /// `next_state` method needs to do some additional checks, among them
1360     /// accounting for identifiers of states that are not yet computed. In
1361     /// such a case, the transition is computed on the fly. However, if it is
1362     /// known that the `current` state ID is untagged, then these checks can be
1363     /// omitted.
1364     ///
1365     /// Since this routine does not compute states on the fly, it does not
1366     /// modify the cache and thus cannot return an error. Consequently, `cache`
1367     /// does not need to be mutable and it is possible for this routine to
1368     /// return a state ID corresponding to the special "unknown" state. In
1369     /// this case, it is the caller's responsibility to use the prior state
1370     /// ID and `input` with `next_state` in order to force the computation of
1371     /// the unknown transition. Otherwise, trying to use the "unknown" state
1372     /// ID will just result in transitioning back to itself, and thus never
1373     /// terminating. (This is technically a special exemption to the state ID
1374     /// validity rules, but is permissible since this routine is guaranteed to
1375     /// never mutate the given `cache`, and thus the identifier is guaranteed
1376     /// to remain valid.)
1377     ///
1378     /// See [`LazyStateID`] for more details on what it means for a state ID
1379     /// to be tagged. Also, see
1380     /// [`next_state_untagged`](DFA::next_state_untagged)
1381     /// for this same idea, but with memory safety guaranteed by retaining
1382     /// bounds checks.
1383     ///
1384     /// # State identifier validity
1385     ///
1386     /// The only valid value for `current` is an **untagged** lazy
1387     /// state ID returned by the most recent call to `next_state`,
1388     /// `next_state_untagged`, `next_state_untagged_unchecked`,
1389     /// `start_state_forward` or `start_state_reverse` for the given `cache`.
1390     /// Any state ID returned from prior calls to these routines (with the
1391     /// same `cache`) is considered invalid (even if it gives an appearance
1392     /// of working). State IDs returned from _any_ prior call for different
1393     /// `cache` values are also always invalid.
1394     ///
1395     /// The returned ID is always a valid ID when `current` refers to a valid
1396     /// ID, although it may be tagged. Moreover, this routine is defined for
1397     /// all possible values of `input`.
1398     ///
1399     /// Not all validity rules are checked, even in debug mode. Callers are
1400     /// required to uphold these rules themselves.
1401     ///
1402     /// Violating these state ID validity rules will not sacrifice memory
1403     /// safety, but _may_ produce an incorrect result or a panic.
1404     ///
1405     /// # Safety
1406     ///
1407     /// Callers of this method must guarantee that `current` refers to a valid
1408     /// state ID according to the rules described above. If `current` is not a
1409     /// valid state ID for this automaton, then calling this routine may result
1410     /// in undefined behavior.
1411     ///
1412     /// If `current` is valid, then the ID returned is valid for all possible
1413     /// values of `input`.
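    ///
    /// # Example
    ///
    /// A minimal sketch of how this routine might be used, adapted from the
    /// example for [`next_state_untagged`](DFA::next_state_untagged): the
    /// bounds-checked call is swapped for the unchecked one, and the safety
    /// obligation is discharged by only ever passing the most recently
    /// returned untagged state ID.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, Input};
    ///
    /// let dfa = DFA::new(r"[a-z]+r")?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = "bar".as_bytes();
    ///
    /// let mut sid = dfa.start_state_forward(
    ///     &mut cache, &Input::new(haystack),
    /// )?;
    /// let mut at = 0;
    /// while at < haystack.len() {
    ///     if sid.is_tagged() {
    ///         sid = dfa.next_state(&mut cache, sid, haystack[at])?;
    ///         at += 1;
    ///     } else {
    ///         let prev_sid = sid;
    ///         // SAFETY: `sid` is untagged and is the ID returned by the most
    ///         // recent transition (or start state) call for this `cache`, so
    ///         // it is valid according to the rules above.
    ///         sid = unsafe {
    ///             dfa.next_state_untagged_unchecked(&cache, sid, haystack[at])
    ///         };
    ///         at += 1;
    ///         // An unknown ID must be resolved with `next_state` before it
    ///         // is used again, otherwise the search would never terminate.
    ///         if sid.is_unknown() {
    ///             sid = dfa.next_state(&mut cache, prev_sid, haystack[at - 1])?;
    ///         }
    ///     }
    /// }
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
    /// assert!(sid.is_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```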
1414     #[inline]
1415     pub unsafe fn next_state_untagged_unchecked(
1416         &self,
1417         cache: &Cache,
1418         current: LazyStateID,
1419         input: u8,
1420     ) -> LazyStateID {
1421         debug_assert!(!current.is_tagged());
1422         let class = usize::from(self.classes.get(input));
1423         let offset = current.as_usize_unchecked() + class;
1424         *cache.trans.get_unchecked(offset)
1425     }
1426 
1427     /// Transitions from the current state to the next state for the special
1428     /// EOI symbol.
1429     ///
1430     /// The given cache is used to either reuse pre-computed state
1431     /// transitions, or to store this newly computed transition for future
1432     /// reuse. Thus, this routine guarantees that it will never return a state
1433     /// ID that has an "unknown" tag.
1434     ///
1435     /// This routine must be called at the end of every search in a correct
1436     /// implementation of search. Namely, lazy DFAs in this crate delay matches
1437     /// by one byte in order to support look-around operators. Thus, after
1438     /// reaching the end of a haystack, a search implementation must follow one
1439     /// last EOI transition.
1440     ///
1441     /// It is best to think of EOI as an additional symbol in the alphabet of a
1442     /// DFA that is distinct from every other symbol. That is, the alphabet of
1443     /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
1444     /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
1445     /// physical alphabet size may be smaller because of alphabet compression
1446     /// via equivalence classes, but EOI is always represented somehow in the
1447     /// alphabet.)
1448     ///
1449     /// # State identifier validity
1450     ///
1451     /// The only valid value for `current` is the lazy state ID returned
1452     /// by the most recent call to `next_state`, `next_state_untagged`,
1453     /// `next_state_untagged_unchecked`, `start_state_forward` or
1454     /// `start_state_reverse` for the given `cache`. Any state ID returned from
1455     /// prior calls to these routines (with the same `cache`) is considered
1456     /// invalid (even if it gives an appearance of working). State IDs returned
1457     /// from _any_ prior call for different `cache` values are also always
1458     /// invalid.
1459     ///
1460     /// The returned ID is always a valid ID when `current` refers to a valid
1461     /// ID.
1462     ///
1463     /// These validity rules are not checked, even in debug mode. Callers are
1464     /// required to uphold these rules themselves.
1465     ///
1466     /// Violating these state ID validity rules will not sacrifice memory
1467     /// safety, but _may_ produce an incorrect result or a panic.
1468     ///
1469     /// # Panics
1470     ///
1471     /// If the given ID does not refer to a valid state, then this routine
1472     /// may panic but it also may not panic and instead return an invalid or
1473     /// incorrect ID.
1474     ///
1475     /// # Example
1476     ///
1477     /// This shows a simplistic example for walking a DFA for a given haystack,
1478     /// and then finishing the search with the final EOI transition.
1479     ///
1480     /// ```
1481     /// use regex_automata::{hybrid::dfa::DFA, Input};
1482     ///
1483     /// let dfa = DFA::new(r"[a-z]+r")?;
1484     /// let mut cache = dfa.create_cache();
1485     /// let haystack = "bar".as_bytes();
1486     ///
1487     /// // The start state is determined by inspecting the position and the
1488     /// // initial bytes of the haystack.
1489     /// let mut sid = dfa.start_state_forward(
1490     ///     &mut cache, &Input::new(haystack),
1491     /// )?;
1492     /// // Walk all the bytes in the haystack.
1493     /// for &b in haystack {
1494     ///     sid = dfa.next_state(&mut cache, sid, b)?;
1495     /// }
1496     /// // Matches are always delayed by 1 byte, so we must explicitly walk
1497     /// // the special "EOI" transition at the end of the search. Without this
1498     /// // final transition, the assert below will fail since the DFA will not
1499     /// // have entered a match state yet!
1500     /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1501     /// assert!(sid.is_match());
1502     ///
1503     /// # Ok::<(), Box<dyn std::error::Error>>(())
1504     /// ```
1505     #[inline]
1506     pub fn next_eoi_state(
1507         &self,
1508         cache: &mut Cache,
1509         current: LazyStateID,
1510     ) -> Result<LazyStateID, CacheError> {
1511         let eoi = self.classes.eoi().as_usize();
1512         let offset = current.as_usize_untagged() + eoi;
1513         let sid = cache.trans[offset];
1514         if !sid.is_unknown() {
1515             return Ok(sid);
1516         }
1517         let unit = self.classes.eoi();
1518         Lazy::new(self, cache).cache_next_state(current, unit)
1519     }
1520 
1521     /// Return the ID of the start state for this lazy DFA for the given
1522     /// starting configuration.
1523     ///
1524     /// Unlike typical DFA implementations, the start state for DFAs in this
1525     /// crate is dependent on a few different factors:
1526     ///
1527     /// * The [`Anchored`] mode of the search. Unanchored, anchored and
1528     /// anchored searches for a specific [`PatternID`] all use different start
1529     /// states.
1530     /// * Whether a "look-behind" byte exists. For example, the `^` anchor
1531     /// matches if and only if there is no look-behind byte.
1532     /// * The specific value of that look-behind byte. For example, a `(?m:^)`
1533     /// assertion only matches when there is either no look-behind byte, or
1534     /// when the look-behind byte is a line terminator.
1535     ///
1536     /// The [starting configuration](start::Config) provides the above
1537     /// information.
1538     ///
1539     /// This routine can be used for either forward or reverse searches.
1540     /// Although, as a convenience, if you have an [`Input`], then it
1541     /// may be more succinct to use [`DFA::start_state_forward`] or
1542     /// [`DFA::start_state_reverse`]. Note, for example, that the convenience
1543     /// routines return a [`MatchError`] on failure whereas this routine
1544     /// returns a [`StartError`].
1545     ///
1546     /// # Errors
1547     ///
1548     /// This may return a [`StartError`] if the search needs to give up when
1549     /// determining the start state (for example, if it sees a "quit" byte
1550     /// or if the cache has become inefficient). This can also return an
1551     /// error if the given configuration contains an unsupported [`Anchored`]
1552     /// configuration.
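    ///
    /// # Example
    ///
    /// A minimal sketch of building a [start configuration](start::Config) by
    /// hand, assuming the `start::Config::new().anchored(..).look_behind(..)`
    /// builder exposed in this crate's `util::start` module.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, util::start, Anchored};
    ///
    /// let dfa = DFA::new(r"(?m)^quux")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// // Equivalent to beginning an unanchored forward search immediately
    /// // after a line terminator.
    /// let config = start::Config::new()
    ///     .anchored(Anchored::No)
    ///     .look_behind(Some(b'\n'));
    /// let sid = dfa.start_state(&mut cache, &config)?;
    /// // A start state is never the special "unknown" state.
    /// assert!(!sid.is_unknown());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```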
1553     #[cfg_attr(feature = "perf-inline", inline(always))]
1554     pub fn start_state(
1555         &self,
1556         cache: &mut Cache,
1557         config: &start::Config,
1558     ) -> Result<LazyStateID, StartError> {
1559         let lazy = LazyRef::new(self, cache);
1560         let anchored = config.get_anchored();
1561         let start = match config.get_look_behind() {
1562             None => Start::Text,
1563             Some(byte) => {
1564                 if !self.quitset.is_empty() && self.quitset.contains(byte) {
1565                     return Err(StartError::quit(byte));
1566                 }
1567                 self.start_map.get(byte)
1568             }
1569         };
1570         let start_id = lazy.get_cached_start_id(anchored, start)?;
1571         if !start_id.is_unknown() {
1572             return Ok(start_id);
1573         }
1574         Lazy::new(self, cache).cache_start_group(anchored, start)
1575     }
1576 
1577     /// Return the ID of the start state for this lazy DFA when executing a
1578     /// forward search.
1579     ///
1580     /// This is a convenience routine for calling [`DFA::start_state`] that
1581     /// converts the given [`Input`] to a [start configuration](start::Config).
1582     /// Additionally, if an error occurs, it is converted from a [`StartError`]
1583     /// to a [`MatchError`] using the offset information in the given
1584     /// [`Input`].
1585     ///
1586     /// # Errors
1587     ///
1588     /// This may return a [`MatchError`] if the search needs to give up when
1589     /// determining the start state (for example, if it sees a "quit" byte or
1590     /// if the cache has become inefficient). This can also return an error if
1591     /// the given `Input` contains an unsupported [`Anchored`] configuration.
1592     #[cfg_attr(feature = "perf-inline", inline(always))]
1593     pub fn start_state_forward(
1594         &self,
1595         cache: &mut Cache,
1596         input: &Input<'_>,
1597     ) -> Result<LazyStateID, MatchError> {
1598         let config = start::Config::from_input_forward(input);
1599         self.start_state(cache, &config).map_err(|err| match err {
1600             StartError::Cache { .. } => MatchError::gave_up(input.start()),
1601             StartError::Quit { byte } => {
1602                 let offset = input
1603                     .start()
1604                     .checked_sub(1)
1605                     .expect("no quit in start without look-behind");
1606                 MatchError::quit(byte, offset)
1607             }
1608             StartError::UnsupportedAnchored { mode } => {
1609                 MatchError::unsupported_anchored(mode)
1610             }
1611         })
1612     }
1613 
1614     /// Return the ID of the start state for this lazy DFA when executing a
1615     /// reverse search.
1616     ///
1617     /// This is a convenience routine for calling [`DFA::start_state`] that
1618     /// converts the given [`Input`] to a [start configuration](start::Config).
1619     /// Additionally, if an error occurs, it is converted from a [`StartError`]
1620     /// to a [`MatchError`] using the offset information in the given
1621     /// [`Input`].
1622     ///
1623     /// # Errors
1624     ///
1625     /// This may return a [`MatchError`] if the search needs to give up when
1626     /// determining the start state (for example, if it sees a "quit" byte or
1627     /// if the cache has become inefficient). This can also return an error if
1628     /// the given `Input` contains an unsupported [`Anchored`] configuration.
1629     #[cfg_attr(feature = "perf-inline", inline(always))]
1630     pub fn start_state_reverse(
1631         &self,
1632         cache: &mut Cache,
1633         input: &Input<'_>,
1634     ) -> Result<LazyStateID, MatchError> {
1635         let config = start::Config::from_input_reverse(input);
1636         self.start_state(cache, &config).map_err(|err| match err {
1637             StartError::Cache { .. } => MatchError::gave_up(input.end()),
1638             StartError::Quit { byte } => {
1639                 let offset = input.end();
1640                 MatchError::quit(byte, offset)
1641             }
1642             StartError::UnsupportedAnchored { mode } => {
1643                 MatchError::unsupported_anchored(mode)
1644             }
1645         })
1646     }
1647 
1648     /// Returns the total number of patterns that match in this state.
1649     ///
1650     /// If the lazy DFA was compiled with one pattern, then this must
1651     /// necessarily always return `1` for all match states.
1652     ///
1653     /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
1654     /// indices up to (but not including) the length returned by this routine
1655     /// without panicking.
1656     ///
1657     /// # Panics
1658     ///
1659     /// If the given state is not a match state, then this may either panic
1660     /// or return an incorrect result.
1661     ///
1662     /// # Example
1663     ///
1664     /// This example shows a simple instance of implementing overlapping
1665     /// matches. In particular, it shows not only how to determine how many
1666     /// patterns have matched in a particular state, but also how to access
1667     /// which specific patterns have matched.
1668     ///
1669     /// Notice that we must use [`MatchKind::All`] when building the DFA. If we
1670     /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be
1671     /// constructed in a way that supports overlapping matches. (It would only
1672     /// report a single pattern that matches at any particular point in time.)
1673     ///
1674     /// Another thing to take note of is the patterns used and the order in
1675     /// which the pattern IDs are reported. In the example below, pattern `3`
1676     /// is yielded first. Why? Because it corresponds to the match that
1677     /// appears first. Namely, the `@` symbol is part of `\S+` but not part
1678     /// of any of the other patterns. Since the `\S+` pattern has a match that
1679     /// starts to the left of any other pattern, its ID is returned before any
1680     /// other.
1681     ///
1682     /// ```
1683     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1684     /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind};
1685     ///
1686     /// let dfa = DFA::builder()
1687     ///     .configure(DFA::config().match_kind(MatchKind::All))
1688     ///     .build_many(&[
1689     ///         r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
1690     ///     ])?;
1691     /// let mut cache = dfa.create_cache();
1692     /// let haystack = "@bar".as_bytes();
1693     ///
1694     /// // The start state is determined by inspecting the position and the
1695     /// // initial bytes of the haystack.
1696     /// let mut sid = dfa.start_state_forward(
1697     ///     &mut cache, &Input::new(haystack),
1698     /// )?;
1699     /// // Walk all the bytes in the haystack.
1700     /// for &b in haystack {
1701     ///     sid = dfa.next_state(&mut cache, sid, b)?;
1702     /// }
1703     /// sid = dfa.next_eoi_state(&mut cache, sid)?;
1704     ///
1705     /// assert!(sid.is_match());
1706     /// assert_eq!(dfa.match_len(&mut cache, sid), 3);
1707     /// // The following calls are guaranteed to not panic since `match_len`
1708     /// // returned `3` above.
1709     /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
1710     /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
1711     /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
1712     ///
1713     /// # Ok::<(), Box<dyn std::error::Error>>(())
1714     /// ```
1715     #[inline]
1716     pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize {
1717         assert!(id.is_match());
1718         LazyRef::new(self, cache).get_cached_state(id).match_len()
1719     }
1720 
1721     /// Returns the pattern ID corresponding to the given match index in the
1722     /// given state.
1723     ///
1724     /// See [`DFA::match_len`] for an example of how to use this method
1725     /// correctly. Note that if you know your lazy DFA is configured with a
1726     /// single pattern, then this routine is never necessary since it will
1727     /// always return a pattern ID of `0` for an index of `0` when `id`
1728     /// corresponds to a match state.
1729     ///
1730     /// Typically, this routine is used when implementing an overlapping
1731     /// search, as the example for `DFA::match_len` does.
1732     ///
1733     /// # Panics
1734     ///
1735     /// If the state ID is not a match state or if the match index is out
1736     /// of bounds for the given state, then this routine may either panic
1737     /// or produce an incorrect result. If the state ID is correct and the
1738     /// match index is correct, then this routine always produces a valid
1739     /// `PatternID`.
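    ///
    /// # Example
    ///
    /// A minimal sketch of the single-pattern case described above, reusing
    /// the walking loop from the [`DFA::next_state`] example.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, Input};
    ///
    /// let dfa = DFA::new(r"[a-z]+r")?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = "bar".as_bytes();
    ///
    /// let mut sid = dfa.start_state_forward(
    ///     &mut cache, &Input::new(haystack),
    /// )?;
    /// for &b in haystack.iter() {
    ///     sid = dfa.next_state(&mut cache, sid, b)?;
    /// }
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
    /// assert!(sid.is_match());
    /// // For a DFA with a single pattern, the only valid match index is 0,
    /// // and it always corresponds to pattern 0.
    /// assert_eq!(0, dfa.match_pattern(&cache, sid, 0).as_usize());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```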
1740     #[inline]
1741     pub fn match_pattern(
1742         &self,
1743         cache: &Cache,
1744         id: LazyStateID,
1745         match_index: usize,
1746     ) -> PatternID {
1747         // This is an optimization for the very common case of a DFA with a
1748         // single pattern. This conditional avoids a somewhat more costly path
1749         // that finds the pattern ID from the corresponding `State`, which
1750         // requires a bit of slicing/pointer-chasing. This optimization tends
1751         // to only matter when matches are frequent.
1752         if self.pattern_len() == 1 {
1753             return PatternID::ZERO;
1754         }
1755         LazyRef::new(self, cache)
1756             .get_cached_state(id)
1757             .match_pattern(match_index)
1758     }
1759 }
1760 
1761 /// A cache represents a partially computed DFA.
1762 ///
1763 /// A cache is the key component that differentiates a classical DFA and a
1764 /// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
1765 /// complete transition table that can handle all possible inputs, a hybrid
1766 /// NFA/DFA starts with an empty transition table and builds only the parts
1767 /// required during search. The parts that are built are stored in a cache. For
1768 /// this reason, a cache is a required parameter for nearly every operation on
1769 /// a [`DFA`].
1770 ///
1771 /// Caches can be created from their corresponding DFA via
1772 /// [`DFA::create_cache`]. A cache can only be used with either the DFA that
1773 /// created it, or the DFA that was most recently used to reset it with
1774 /// [`Cache::reset`]. Using a cache with any other DFA may result in panics
1775 /// or incorrect results.
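///
/// # Example
///
/// A minimal sketch of the typical flow: build a DFA, create its cache and
/// then pass that cache to every search call.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new(r"[0-9]+")?;
/// let mut cache = dfa.create_cache();
///
/// let got = dfa.try_search_fwd(&mut cache, &Input::new("foo123"))?;
/// assert_eq!(Some(HalfMatch::must(0, 6)), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```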
1776 #[derive(Clone, Debug)]
1777 pub struct Cache {
1778     // N.B. If you're looking to understand how determinization works, it
1779     // is probably simpler to first grok src/dfa/determinize.rs, since that
1780     // doesn't have the "laziness" component.
1781     /// The transition table.
1782     ///
1783     /// Given a `current` LazyStateID and an `input` byte, the next state can
1784     /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
1785     /// that no multiplication is used. That's because state identifiers are
1786     /// "premultiplied."
1787     ///
1788     /// Note that the next state may be the "unknown" state. In this case, the
1789     /// next state is not known and determinization for `current` on `input`
1790     /// must be performed.
1791     trans: Vec<LazyStateID>,
1792     /// The starting states for this DFA.
1793     ///
1794     /// These are computed lazily. Initially, these are all set to "unknown"
1795     /// lazy state IDs.
1796     ///
1797     /// When 'starts_for_each_pattern' is disabled (the default), then the size
1798     /// of this is constrained to the possible starting configurations based
1799     /// on the search parameters. (At time of writing, that's 4.) However,
1800     /// when starting states for each pattern is enabled, then there are N
1801     /// additional groups of starting states, where each group reflects the
1802     /// different possible configurations and N is the number of patterns.
1803     starts: Vec<LazyStateID>,
1804     /// A sequence of NFA/DFA powerset states that have been computed for this
1805     /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
1806     /// tagged LazyStateID can be used to index this sequence by converting it
1807     /// to its untagged form.)
1808     states: Vec<State>,
1809     /// A map from states to their corresponding IDs. This map may be accessed
1810     /// via the raw byte representation of a state, which means that a `State`
1811     /// does not need to be allocated to determine whether it already exists
1812     /// in this map. Indeed, the existence of such a state is what determines
1813     /// whether we allocate a new `State` or not.
1814     ///
1815     /// The higher level idea here is that we do just enough determinization
1816     /// for a state to check whether we've already computed it. If we have,
1817     /// then we can save a little (albeit not much) work. The real savings is
1818     /// in memory usage. If we never checked for trivially duplicate states,
1819     /// then our memory usage would explode to unreasonable levels.
1820     states_to_id: StateMap,
1821     /// Sparse sets used to track which NFA states have been visited during
1822     /// various traversals.
1823     sparses: SparseSets,
1824     /// Scratch space for traversing the NFA graph. (We use space on the heap
1825     /// instead of the call stack.)
1826     stack: Vec<NFAStateID>,
1827     /// Scratch space for building a NFA/DFA powerset state. This is used to
1828     /// help amortize allocation since not every powerset state generated is
1829     /// added to the cache. In particular, if it already exists in the cache,
1830     /// then there is no need to allocate a new `State` for it.
1831     scratch_state_builder: StateBuilderEmpty,
1832     /// A simple abstraction for handling the saving of at most a single state
1833     /// across a cache clearing. This is required for correctness. Namely, if
1834     /// adding a new state after clearing the cache fails, then the caller
1835     /// must retain the ability to continue using the state ID given. The
1836     /// state corresponding to the state ID is what we preserve across cache
1837     /// clearings.
1838     state_saver: StateSaver,
1839     /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
1840     /// track this as new states are added since states use a variable amount
1841     /// of heap. Tracking this as we add states makes it possible to compute
1842     /// the total amount of memory used by the determinizer in constant time.
1843     memory_usage_state: usize,
1844     /// The number of times the cache has been cleared. When a minimum cache
1845     /// clear count is set, then the cache will return an error instead of
1846     /// clearing the cache if the count has been exceeded.
1847     clear_count: usize,
1848     /// The total number of bytes searched since the last time this cache was
1849     /// cleared, not including the current search.
1850     ///
1851     /// This can be added to the length of the current search to get the true
1852     /// total number of bytes searched.
1853     ///
1854     /// This is generally only non-zero when the
1855     /// `Cache::search_{start,update,finish}` APIs are used to track search
1856     /// progress.
1857     bytes_searched: usize,
1858     /// The progress of the current search.
1859     ///
1860     /// This is only non-`None` when callers utilize the `Cache::search_start`,
1861     /// `Cache::search_update` and `Cache::search_finish` APIs.
1862     ///
1863     /// The purpose of recording search progress is to be able to make a
1864     /// determination about the efficiency of the cache. Namely, by keeping
1865     /// track of the number of bytes searched relative to the number of cached
    /// states, the lazy DFA can decide whether it should give up or keep going.
1866     progress: Option<SearchProgress>,
1867 }
1868 
1869 impl Cache {
1870     /// Create a new cache for the given lazy DFA.
1871     ///
1872     /// The cache returned should only be used for searches for the given DFA.
1873     /// If you want to reuse the cache for another DFA, then you must call
1874     /// [`Cache::reset`] with that DFA.
1875     pub fn new(dfa: &DFA) -> Cache {
1876         let mut cache = Cache {
1877             trans: alloc::vec![],
1878             starts: alloc::vec![],
1879             states: alloc::vec![],
1880             states_to_id: StateMap::new(),
1881             sparses: SparseSets::new(dfa.get_nfa().states().len()),
1882             stack: alloc::vec![],
1883             scratch_state_builder: StateBuilderEmpty::new(),
1884             state_saver: StateSaver::none(),
1885             memory_usage_state: 0,
1886             clear_count: 0,
1887             bytes_searched: 0,
1888             progress: None,
1889         };
1890         debug!("pre-init lazy DFA cache size: {}", cache.memory_usage());
1891         Lazy { dfa, cache: &mut cache }.init_cache();
1892         debug!("post-init lazy DFA cache size: {}", cache.memory_usage());
1893         cache
1894     }
1895 
1896     /// Reset this cache such that it can be used for searching with the given
1897     /// lazy DFA (and only that DFA).
1898     ///
1899     /// A cache reset permits reusing memory already allocated in this cache
1900     /// with a different lazy DFA.
1901     ///
1902     /// Resetting a cache sets its "clear count" to 0. This is relevant if the
1903     /// lazy DFA has been configured to "give up" after it has cleared the
1904     /// cache a certain number of times.
1905     ///
1906     /// Any lazy state ID generated by the cache prior to resetting it is
1907     /// invalid after the reset.
1908     ///
1909     /// # Example
1910     ///
1911     /// This shows how to re-purpose a cache for use with a different DFA.
1912     ///
1913     /// ```
1914     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1915     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
1916     ///
1917     /// let dfa1 = DFA::new(r"\w")?;
1918     /// let dfa2 = DFA::new(r"\W")?;
1919     ///
1920     /// let mut cache = dfa1.create_cache();
1921     /// assert_eq!(
1922     ///     Some(HalfMatch::must(0, 2)),
1923     ///     dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
1924     /// );
1925     ///
1926     /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
1927     /// // incorrect results. In order to re-purpose the cache, we must reset
1928     /// // it with the DFA we'd like to use it with.
1929     /// //
1930     /// // Similarly, after this reset, using the cache with 'dfa1' is also not
1931     /// // allowed.
1932     /// cache.reset(&dfa2);
1933     /// assert_eq!(
1934     ///     Some(HalfMatch::must(0, 3)),
1935     ///     dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
1936     /// );
1937     ///
1938     /// # Ok::<(), Box<dyn std::error::Error>>(())
1939     /// ```
1940     pub fn reset(&mut self, dfa: &DFA) {
1941         Lazy::new(dfa, self).reset_cache()
1942     }
1943 
1944     /// Initializes a new search starting at the given position.
1945     ///
1946     /// If a previous search was unfinished, then it is finished automatically
1947     /// and a new search is begun.
1948     ///
1949     /// Note that keeping track of search progress is _not necessary_
1950     /// for correct implementations of search using a lazy DFA. Keeping
1951     /// track of search progress is only necessary if you want the
1952     /// [`Config::minimum_bytes_per_state`] configuration knob to work.
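    ///
    /// # Example
    ///
    /// A minimal sketch of threading the progress APIs through a simple
    /// search loop, adapted from the [`DFA::next_state`] example.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, Input};
    ///
    /// let dfa = DFA::new(r"[a-z]+r")?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = "bar".as_bytes();
    ///
    /// // Record where this search starts.
    /// cache.search_start(0);
    /// let mut sid = dfa.start_state_forward(
    ///     &mut cache, &Input::new(haystack),
    /// )?;
    /// for (i, &b) in haystack.iter().enumerate() {
    ///     sid = dfa.next_state(&mut cache, sid, b)?;
    ///     // Report how far the search has gotten so far.
    ///     cache.search_update(i + 1);
    /// }
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
    /// // Mark the search as finished at the end of the haystack.
    /// cache.search_finish(haystack.len());
    ///
    /// assert!(sid.is_match());
    /// assert_eq!(3, cache.search_total_len());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```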
1953     #[inline]
1954     pub fn search_start(&mut self, at: usize) {
1955         // If a previous search wasn't marked as finished, then finish it
1956         // now automatically.
1957         if let Some(p) = self.progress.take() {
1958             self.bytes_searched += p.len();
1959         }
1960         self.progress = Some(SearchProgress { start: at, at });
1961     }
1962 
1963     /// Updates the current search to indicate that it has searched to the
1964     /// current position.
1965     ///
1966     /// No special care needs to be taken for reverse searches. Namely, the
1967     /// position given may be _less than_ the starting position of the search.
1968     ///
1969     /// # Panics
1970     ///
1971     /// This panics if no search has been started by [`Cache::search_start`].
1972     #[inline]
1973     pub fn search_update(&mut self, at: usize) {
1974         let p =
1975             self.progress.as_mut().expect("no in-progress search to update");
1976         p.at = at;
1977     }
1978 
1979     /// Indicates that a search has finished at the given position.
1980     ///
1981     /// # Panics
1982     ///
1983     /// This panics if no search has been started by [`Cache::search_start`].
1984     #[inline]
1985     pub fn search_finish(&mut self, at: usize) {
1986         let mut p =
1987             self.progress.take().expect("no in-progress search to finish");
1988         p.at = at;
1989         self.bytes_searched += p.len();
1990     }
1991 
1992     /// Returns the total number of bytes that have been searched since this
1993     /// cache was last cleared.
1994     ///
1995     /// This is useful for determining the efficiency of the cache. For
1996     /// example, the lazy DFA uses this value in conjunction with the
1997     /// [`Config::minimum_bytes_per_state`] knob to help determine whether it
1998     /// should quit searching.
1999     ///
2000     /// This always returns `0` if search progress isn't being tracked. Note
2001     /// that the lazy DFA search routines in this crate always track search
2002     /// progress.
2003     pub fn search_total_len(&self) -> usize {
2004         self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len())
2005     }
2006 
2007     /// Returns the total number of times this cache has been cleared since it
2008     /// was either created or last reset.
2009     ///
2010     /// This is useful for informational purposes or if you want to change
2011     /// search strategies based on the number of times the cache has been
2012     /// cleared.
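    ///
    /// # Example
    ///
    /// A minimal sketch: a freshly created cache has never been cleared.
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new(r"foo")?;
    /// let cache = dfa.create_cache();
    /// assert_eq!(0, cache.clear_count());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```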
2013     pub fn clear_count(&self) -> usize {
2014         self.clear_count
2015     }
2016 
2017     /// Returns the heap memory usage, in bytes, of this cache.
2018     ///
2019     /// This does **not** include the stack size used up by this cache. To
2020     /// compute that, use `std::mem::size_of::<Cache>()`.
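    ///
    /// # Example
    ///
    /// A minimal sketch. The exact number depends on the DFA, but even a
    /// freshly created cache uses some heap for its sentinel states, start
    /// states and scratch space.
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new(r"\w+")?;
    /// let cache = dfa.create_cache();
    /// assert!(cache.memory_usage() > 0);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```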
2021     pub fn memory_usage(&self) -> usize {
2022         const ID_SIZE: usize = size_of::<LazyStateID>();
2023         const STATE_SIZE: usize = size_of::<State>();
2024 
2025         // NOTE: If you make changes to the below, then
2026         // 'minimum_cache_capacity' should be updated correspondingly.
2027 
2028         self.trans.len() * ID_SIZE
2029         + self.starts.len() * ID_SIZE
2030         + self.states.len() * STATE_SIZE
2031         // Maps likely use more memory than this, but it's probably close.
2032         + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
2033         + self.sparses.memory_usage()
2034         + self.stack.capacity() * ID_SIZE
2035         + self.scratch_state_builder.capacity()
2036         // Heap memory used by 'State' in both 'states' and 'states_to_id'.
2037         + self.memory_usage_state
2038     }
2039 }
2040 
2041 /// Keeps track of the progress of the current search.
2042 ///
2043 /// This is updated via the `Cache::search_{start,update,finish}` APIs to
2044 /// record how many bytes have been searched. This permits computing a
2045 /// heuristic that represents the efficiency of a cache, and thus helps inform
2046 /// whether the lazy DFA should give up or not.
2047 #[derive(Clone, Debug)]
2048 struct SearchProgress {
2049     start: usize,
2050     at: usize,
2051 }
2052 
2053 impl SearchProgress {
2054     /// Returns the length, in bytes, of this search so far.
2055     ///
2056     /// This automatically handles the case of a reverse search, where `at`
2057     /// is likely to be less than `start`.
2058     fn len(&self) -> usize {
2059         if self.start <= self.at {
2060             self.at - self.start
2061         } else {
2062             self.start - self.at
2063         }
2064     }
2065 }
2066 
2067 /// A map from states to state identifiers. When using std, we use a standard
2068 /// hashmap, since it's a bit faster for this use case. (Other maps, like
2069 /// ones based on FNV, have not yet been benchmarked.)
2070 ///
2071 /// The main purpose of this map is to reuse states where possible. This won't
2072 /// fully minimize the DFA, but it works well in a lot of cases.
2073 #[cfg(feature = "std")]
2074 type StateMap = std::collections::HashMap<State, LazyStateID>;
2075 #[cfg(not(feature = "std"))]
2076 type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
2077 
2078 /// A type that groups methods that require the base NFA/DFA and writable
2079 /// access to the cache.
2080 #[derive(Debug)]
2081 struct Lazy<'i, 'c> {
2082     dfa: &'i DFA,
2083     cache: &'c mut Cache,
2084 }
2085 
2086 impl<'i, 'c> Lazy<'i, 'c> {
2087     /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2088     fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
2089         Lazy { dfa, cache }
2090     }
2091 
2092     /// Return an immutable view by downgrading a writable cache to a read-only
2093     /// cache.
2094     fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
2095         LazyRef::new(self.dfa, self.cache)
2096     }
2097 
2098     /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
2099     /// like 'next_state' and 'next_eoi_state' that are called in critical
2100     /// areas. The idea is to let the optimizer focus on the other areas of
2101     /// those methods as the hot path.
2102     ///
2103     /// Here's an example that justifies 'inline(never)'
2104     ///
2105     /// ```ignore
2106     /// regex-cli find match hybrid \
2107     ///   --cache-capacity 100000000 \
2108     ///   -p '\pL{100}' \
2109     ///   all-codepoints-utf8-100x
2110     /// ```
2111     ///
2112     /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
2113     /// codepoint, in sequence, repeated 100 times.
2114     ///
2115     /// With 'inline(never)' hyperfine reports 1.1s per run. With
2116     /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
2117     #[cold]
2118     #[inline(never)]
2119     fn cache_next_state(
2120         &mut self,
2121         mut current: LazyStateID,
2122         unit: alphabet::Unit,
2123     ) -> Result<LazyStateID, CacheError> {
2124         let stride2 = self.dfa.stride2();
2125         let empty_builder = self.get_state_builder();
2126         let builder = determinize::next(
2127             self.dfa.get_nfa(),
2128             self.dfa.get_config().get_match_kind(),
2129             &mut self.cache.sparses,
2130             &mut self.cache.stack,
2131             &self.cache.states[current.as_usize_untagged() >> stride2],
2132             unit,
2133             empty_builder,
2134         );
2135         let save_state = !self.as_ref().state_builder_fits_in_cache(&builder);
2136         if save_state {
2137             self.save_state(current);
2138         }
2139         let next = self.add_builder_state(builder, |sid| sid)?;
2140         if save_state {
2141             current = self.saved_state_id();
2142         }
2143         // This is the payoff. The next time 'next_state' is called with this
2144         // state and alphabet unit, it will find this transition and avoid
2145         // having to re-determinize this transition.
2146         self.set_transition(current, unit, next);
2147         Ok(next)
2148     }
2149 
2150     /// Compute and cache the starting state for the given pattern ID (if
2151     /// present) and the starting configuration.
2152     ///
2153     /// This panics if a pattern ID is given and the DFA isn't configured to
2154     /// build anchored start states for each pattern.
2155     ///
2156     /// This will never return an unknown lazy state ID.
2157     ///
2158     /// If caching this state would otherwise result in a cache that has been
2159     /// cleared too many times, then an error is returned.
2160     #[cold]
2161     #[inline(never)]
2162     fn cache_start_group(
2163         &mut self,
2164         anchored: Anchored,
2165         start: Start,
2166     ) -> Result<LazyStateID, StartError> {
2167         let nfa_start_id = match anchored {
2168             Anchored::No => self.dfa.get_nfa().start_unanchored(),
2169             Anchored::Yes => self.dfa.get_nfa().start_anchored(),
2170             Anchored::Pattern(pid) => {
2171                 if !self.dfa.get_config().get_starts_for_each_pattern() {
2172                     return Err(StartError::unsupported_anchored(anchored));
2173                 }
2174                 match self.dfa.get_nfa().start_pattern(pid) {
2175                     None => return Ok(self.as_ref().dead_id()),
2176                     Some(sid) => sid,
2177                 }
2178             }
2179         };
2180 
2181         let id = self
2182             .cache_start_one(nfa_start_id, start)
2183             .map_err(StartError::cache)?;
2184         self.set_start_state(anchored, start, id);
2185         Ok(id)
2186     }
2187 
2188     /// Compute and cache the starting state for the given NFA state ID and the
2189     /// starting configuration. The NFA state ID might be one of the following:
2190     ///
2191     /// 1) An unanchored start state to match any pattern.
2192     /// 2) An anchored start state to match any pattern.
2193     /// 3) An anchored start state for a particular pattern.
2194     ///
2195     /// This will never return an unknown lazy state ID.
2196     ///
2197     /// If caching this state would otherwise result in a cache that has been
2198     /// cleared too many times, then an error is returned.
2199     fn cache_start_one(
2200         &mut self,
2201         nfa_start_id: NFAStateID,
2202         start: Start,
2203     ) -> Result<LazyStateID, CacheError> {
2204         let mut builder_matches = self.get_state_builder().into_matches();
2205         determinize::set_lookbehind_from_start(
2206             self.dfa.get_nfa(),
2207             &start,
2208             &mut builder_matches,
2209         );
2210         self.cache.sparses.set1.clear();
2211         determinize::epsilon_closure(
2212             self.dfa.get_nfa(),
2213             nfa_start_id,
2214             builder_matches.look_have(),
2215             &mut self.cache.stack,
2216             &mut self.cache.sparses.set1,
2217         );
2218         let mut builder = builder_matches.into_nfa();
2219         determinize::add_nfa_states(
2220             &self.dfa.get_nfa(),
2221             &self.cache.sparses.set1,
2222             &mut builder,
2223         );
2224         let tag_starts = self.dfa.get_config().get_specialize_start_states();
2225         self.add_builder_state(builder, |id| {
2226             if tag_starts {
2227                 id.to_start()
2228             } else {
2229                 id
2230             }
2231         })
2232     }
2233 
2234     /// Either add the given builder state to this cache, or return an ID to an
2235     /// equivalent state already in this cache.
2236     ///
2237     /// In the case where no equivalent state exists, the idmap function given
2238     /// may be used to transform the identifier allocated. This is useful if
2239     /// the caller needs to tag the ID with additional information.
2240     ///
2241     /// This will never return an unknown lazy state ID.
2242     ///
2243     /// If caching this state would otherwise result in a cache that has been
2244     /// cleared too many times, then an error is returned.
2245     fn add_builder_state(
2246         &mut self,
2247         builder: StateBuilderNFA,
2248         idmap: impl Fn(LazyStateID) -> LazyStateID,
2249     ) -> Result<LazyStateID, CacheError> {
2250         if let Some(&cached_id) =
2251             self.cache.states_to_id.get(builder.as_bytes())
2252         {
2253             // Since we have a cached state, put the constructed state's
2254             // memory back into our scratch space, so that it can be reused.
2255             self.put_state_builder(builder);
2256             return Ok(cached_id);
2257         }
2258         let result = self.add_state(builder.to_state(), idmap);
2259         self.put_state_builder(builder);
2260         result
2261     }
2262 
2263     /// Allocate a new state ID and add the given state to this cache.
2264     ///
2265     /// The idmap function given may be used to transform the identifier
2266     /// allocated. This is useful if the caller needs to tag the ID with
2267     /// additional information.
2268     ///
2269     /// This will never return an unknown lazy state ID.
2270     ///
2271     /// If caching this state would otherwise result in a cache that has been
2272     /// cleared too many times, then an error is returned.
2273     fn add_state(
2274         &mut self,
2275         state: State,
2276         idmap: impl Fn(LazyStateID) -> LazyStateID,
2277     ) -> Result<LazyStateID, CacheError> {
2278         if !self.as_ref().state_fits_in_cache(&state) {
2279             self.try_clear_cache()?;
2280         }
2281         // It's important for this to come second, since the above may clear
2282         // the cache. If we clear the cache after ID generation, then the ID
2283         // is likely bunk since it would have been generated based on a larger
2284         // transition table.
2285         let mut id = idmap(self.next_state_id()?);
2286         if state.is_match() {
2287             id = id.to_match();
2288         }
2289         // Add room in the transition table. Since this is a fresh state, all
2290         // of its transitions are unknown.
2291         self.cache.trans.extend(
2292             iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
2293         );
2294         // When we add a sentinel state, we never want to set any quit
2295         // transitions. Technically, this is harmless, since sentinel states
2296         // have all of their transitions set to loop back to themselves. But
2297         // when creating sentinel states before the quit sentinel state,
2298         // this will try to call 'set_transition' on a state ID that doesn't
2299         // actually exist yet, which isn't allowed. So we just skip doing so
2300         // entirely.
2301         if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
2302             let quit_id = self.as_ref().quit_id();
2303             for b in self.dfa.quitset.iter() {
2304                 self.set_transition(id, alphabet::Unit::u8(b), quit_id);
2305             }
2306         }
2307         self.cache.memory_usage_state += state.memory_usage();
2308         self.cache.states.push(state.clone());
2309         self.cache.states_to_id.insert(state, id);
2310         Ok(id)
2311     }
2312 
2313     /// Allocate a new state ID.
2314     ///
2315     /// This will never return an unknown lazy state ID.
2316     ///
2317     /// If caching this state would otherwise result in a cache that has been
2318     /// cleared too many times, then an error is returned.
2319     fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
2320         let sid = match LazyStateID::new(self.cache.trans.len()) {
2321             Ok(sid) => sid,
2322             Err(_) => {
2323                 self.try_clear_cache()?;
2324                 // This has to pass since we check at construction time that
2325                 // the ID capacity can fit at least MIN_STATES states.
2326                 LazyStateID::new(self.cache.trans.len()).unwrap()
2327             }
2328         };
2329         Ok(sid)
2330     }
2331 
2332     /// Attempt to clear the cache used by this lazy DFA.
2333     ///
2334     /// If clearing the cache exceeds the minimum number of required cache
2335     /// clearings, then this will return a cache error. In this case,
2336     /// callers should bubble this up as the cache can't be used until it is
2337     /// reset. Implementations of search should convert this error into a
2338     /// [`MatchError::gave_up`].
2339     ///
2340     /// If 'self.state_saver' is set to save a state, then this state is
2341     /// persisted through cache clearing. Otherwise, the cache is returned to
2342     /// its state after initialization with two exceptions: its clear count
2343     /// is incremented and some of its memory likely has additional capacity.
2344     /// That is, clearing a cache does _not_ release memory.
2345     ///
2346     /// Otherwise, any lazy state ID generated by the cache prior to resetting
2347     /// it is invalid after the reset.
2348     fn try_clear_cache(&mut self) -> Result<(), CacheError> {
2349         let c = self.dfa.get_config();
2350         if let Some(min_count) = c.get_minimum_cache_clear_count() {
2351             if self.cache.clear_count >= min_count {
2352                 if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() {
2353                     let len = self.cache.search_total_len();
2354                     let min_bytes =
2355                         min_bytes_per.saturating_mul(self.cache.states.len());
2356                     // If we've searched 0 bytes then probably something has
2357                     // gone wrong and the lazy DFA search implementation isn't
2358                     // correctly updating the search progress state.
2359                     if len == 0 {
2360                         trace!(
2361                             "number of bytes searched is 0, but \
2362                              a minimum bytes per state searched ({}) is \
2363                              enabled, maybe Cache::search_update \
2364                              is not being used?",
2365                             min_bytes_per,
2366                         );
2367                     }
2368                     if len < min_bytes {
2369                         trace!(
2370                             "lazy DFA cache has been cleared {} times, \
2371                              which exceeds the limit of {}, \
2372                              AND its bytes searched per state is less \
2373                              than the configured minimum of {}, \
2374                              therefore lazy DFA is giving up \
2375                              (bytes searched since cache clear = {}, \
2376                               number of states = {})",
2377                             self.cache.clear_count,
2378                             min_count,
2379                             min_bytes_per,
2380                             len,
2381                             self.cache.states.len(),
2382                         );
2383                         return Err(CacheError::bad_efficiency());
2384                     } else {
2385                         trace!(
2386                             "lazy DFA cache has been cleared {} times, \
2387                              which exceeds the limit of {}, \
2388                              AND its bytes searched per state is greater \
2389                              than the configured minimum of {}, \
2390                              therefore lazy DFA is continuing! \
2391                              (bytes searched since cache clear = {}, \
2392                               number of states = {})",
2393                             self.cache.clear_count,
2394                             min_count,
2395                             min_bytes_per,
2396                             len,
2397                             self.cache.states.len(),
2398                         );
2399                     }
2400                 } else {
2401                     trace!(
2402                         "lazy DFA cache has been cleared {} times, \
2403                          which exceeds the limit of {}, \
2404                          since there is no configured bytes per state \
2405                          minimum, lazy DFA is giving up",
2406                         self.cache.clear_count,
2407                         min_count,
2408                     );
2409                     return Err(CacheError::too_many_cache_clears());
2410                 }
2411             }
2412         }
2413         self.clear_cache();
2414         Ok(())
2415     }
2416 
2417     /// Clears _and_ resets the cache. Resetting the cache means that no
2418     /// states are persisted and the clear count is reset to 0. No heap memory
2419     /// is released.
2420     ///
2421     /// Note that the caller may reset a cache with a different DFA than what
2422     /// it was created from. In which case, the cache can now be used with the
2423     /// new DFA (and not the old DFA).
2424     fn reset_cache(&mut self) {
2425         self.cache.state_saver = StateSaver::none();
2426         self.clear_cache();
2427         // If a new DFA is used, it might have a different number of NFA
2428         // states, so we need to make sure our sparse sets have the appropriate
2429         // size.
2430         self.cache.sparses.resize(self.dfa.get_nfa().states().len());
2431         self.cache.clear_count = 0;
2432         self.cache.progress = None;
2433     }
2434 
2435     /// Clear the cache used by this lazy DFA.
2436     ///
2437     /// If 'self.state_saver' is set to save a state, then this state is
2438     /// persisted through cache clearing. Otherwise, the cache is returned to
2439     /// its state after initialization with two exceptions: its clear count
2440     /// is incremented and some of its memory likely has additional capacity.
2441     /// That is, clearing a cache does _not_ release memory.
2442     ///
2443     /// In either case, any lazy state ID generated by the cache prior to
2444     /// clearing it is invalid after the clearing.
2445     fn clear_cache(&mut self) {
2446         self.cache.trans.clear();
2447         self.cache.starts.clear();
2448         self.cache.states.clear();
2449         self.cache.states_to_id.clear();
2450         self.cache.memory_usage_state = 0;
2451         self.cache.clear_count += 1;
2452         self.cache.bytes_searched = 0;
2453         if let Some(ref mut progress) = self.cache.progress {
2454             progress.start = progress.at;
2455         }
2456         trace!(
2457             "lazy DFA cache has been cleared (count: {})",
2458             self.cache.clear_count
2459         );
2460         self.init_cache();
2461         // If the state we want to save is one of the sentinel
2462         // (unknown/dead/quit) states, then 'init_cache' adds those back, and
2463         // their identifier values remain invariant. So there's no need to add
2464         // it again. (And indeed, doing so would be incorrect!)
2465         if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
2466             // If the state is one of the special sentinel states, then it is
2467             // automatically added by cache initialization and its ID always
2468             // remains the same. With that said, this should never occur since
2469             // the sentinel states are all loop states back to themselves. So
2470             // we should never be in a position where we're attempting to save
2471             // a sentinel state since we never compute transitions out of a
2472             // sentinel state.
2473             assert!(
2474                 !self.as_ref().is_sentinel(old_id),
2475                 "cannot save sentinel state"
2476             );
2477             let new_id = self
2478                 .add_state(state, |id| {
2479                     if old_id.is_start() {
2480                         // We don't need to consult the
2481                         // 'specialize_start_states' config knob here, because
2482                         // if it's disabled, old_id.is_start() will never
2483                         // return true.
2484                         id.to_start()
2485                     } else {
2486                         id
2487                     }
2488                 })
2489                 // The unwrap here is OK because lazy DFA creation ensures that
2490                 // we have room in the cache to add MIN_STATES states. Since
2491                 // 'init_cache' above adds 3, this adds a 4th.
2492                 .expect("adding one state after cache clear must work");
2493             self.cache.state_saver = StateSaver::Saved(new_id);
2494         }
2495     }
2496 
2497     /// Initialize this cache from emptiness to a place where it can be used
2498     /// for search.
2499     ///
2500     /// This is called both at cache creation time and after the cache has been
2501     /// cleared.
2502     ///
2503     /// Primarily, this adds the three sentinel states and allocates some
2504     /// initial memory.
2505     fn init_cache(&mut self) {
2506         // Why multiply by 2 here? Because we make room for both the unanchored
2507         // and anchored start states. Unanchored is first and then anchored.
2508         let mut starts_len = Start::len().checked_mul(2).unwrap();
2509         // ... but if we also want start states for every pattern, we make room
2510         // for that too.
2511         if self.dfa.get_config().get_starts_for_each_pattern() {
2512             starts_len += Start::len() * self.dfa.pattern_len();
2513         }
2514         self.cache
2515             .starts
2516             .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
2517         // This is the set of NFA states that corresponds to each of our three
2518         // sentinel states: the empty set.
2519         let dead = State::dead();
2520         // This sets up some states that we use as sentinels that are present
2521         // in every DFA. While it would be technically possible to implement
2522         // this DFA without explicitly putting these states in the transition
2523         // table, this is convenient to do to make `next_state` correct for all
2524         // valid state IDs without needing explicit conditionals to special
2525         // case these sentinel states.
2526         //
2527         // All three of these states are "dead" states. That is, all of
2528         // them transition only to themselves. So once you enter one of
2529         // these states, it's impossible to leave them. Thus, any correct
2530         // search routine must explicitly check for these state types. (Sans
2531         // `unknown`, since that is only used internally to represent missing
2532         // states.)
2533         let unk_id =
2534             self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
2535         let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
2536         let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
2537         assert_eq!(unk_id, self.as_ref().unknown_id());
2538         assert_eq!(dead_id, self.as_ref().dead_id());
2539         assert_eq!(quit_id, self.as_ref().quit_id());
2540         // The idea here is that if you start in an unknown/dead/quit state and
2541         // try to transition on them, then you should end up where you started.
2542         self.set_all_transitions(unk_id, unk_id);
2543         self.set_all_transitions(dead_id, dead_id);
2544         self.set_all_transitions(quit_id, quit_id);
2545         // All of these states are technically equivalent from the FSM
2546         // perspective, so putting all three of them in the cache isn't
2547         // possible. (They are distinct merely because we use their
2548         // identifiers as sentinels to mean something, as indicated by the
2549         // names.) Moreover, we wouldn't want to do that. Unknown and quit
2550         // states are special in that they are artificial constructions
2551         // of this implementation. But dead states are a natural part of
2552         // determinization. When you reach a point in the NFA where you cannot
2553         // go anywhere else, a dead state will naturally arise and we MUST
2554         // reuse the canonical dead state that we've created here. Why? Because
2555         // it is the state ID that tells the search routine whether a state is
2556         // dead or not, and thus, whether to stop the search. Having a bunch of
2557         // distinct dead states would be quite wasteful!
2558         self.cache.states_to_id.insert(dead, dead_id);
2559     }
2560 
2561     /// Save the state corresponding to the ID given such that the state
2562     /// persists through a cache clearing.
2563     ///
2564     /// While the state may persist, the ID may not. In order to discover the
2565     /// new state ID, one must call 'saved_state_id' after a cache clearing.
2566     fn save_state(&mut self, id: LazyStateID) {
2567         let state = self.as_ref().get_cached_state(id).clone();
2568         self.cache.state_saver = StateSaver::ToSave { id, state };
2569     }
2570 
2571     /// Returns the updated lazy state ID for a state that was persisted
2572     /// through a cache clearing.
2573     ///
2574     /// It is only correct to call this routine when both a state has been
2575     /// saved and the cache has just been cleared. Otherwise, this panics.
2576     fn saved_state_id(&mut self) -> LazyStateID {
2577         self.cache
2578             .state_saver
2579             .take_saved()
2580             .expect("state saver does not have saved state ID")
2581     }
2582 
2583     /// Set all transitions on the state 'from' to 'to'.
2584     fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
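        // One representative unit per equivalence class is enough: the
        // transition table is indexed by class (see 'set_transition'), so
        // setting a transition for a class's representative sets it for every
        // byte in that class.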
2585         for unit in self.dfa.classes.representatives(..) {
2586             self.set_transition(from, unit, to);
2587         }
2588     }
2589 
2590     /// Set the transition on 'from' for 'unit' to 'to'.
2591     ///
2592     /// This panics if either 'from' or 'to' is invalid.
2593     ///
2594     /// All unit values are OK.
2595     fn set_transition(
2596         &mut self,
2597         from: LazyStateID,
2598         unit: alphabet::Unit,
2599         to: LazyStateID,
2600     ) {
2601         assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from);
2602         assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to);
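        // The untagged 'from' ID is premultiplied, i.e., it is the offset of
        // that state's row in the transition table, so adding the unit's
        // equivalence class index selects the column for 'unit'.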
2603         let offset =
2604             from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
2605         self.cache.trans[offset] = to;
2606     }
2607 
2608     /// Set the start ID for the given pattern ID (if given) and starting
2609     /// configuration to the ID given.
2610     ///
2611     /// This panics if 'id' is not valid or if a pattern ID is given and
2612     /// 'starts_for_each_pattern' is not enabled.
2613     fn set_start_state(
2614         &mut self,
2615         anchored: Anchored,
2616         start: Start,
2617         id: LazyStateID,
2618     ) {
2619         assert!(self.as_ref().is_valid(id));
2620         let start_index = start.as_usize();
2621         let index = match anchored {
2622             Anchored::No => start_index,
2623             Anchored::Yes => Start::len() + start_index,
2624             Anchored::Pattern(pid) => {
2625                 assert!(
2626                     self.dfa.get_config().get_starts_for_each_pattern(),
2627                     "attempted to search for a specific pattern \
2628                      without enabling starts_for_each_pattern",
2629                 );
2630                 let pid = pid.as_usize();
2631                 (2 * Start::len()) + (Start::len() * pid) + start_index
2632             }
2633         };
2634         self.cache.starts[index] = id;
2635     }
2636 
2637     /// Returns a state builder from this DFA that might have existing
2638     /// capacity. This helps avoid allocs in cases where a state is built that
2639     /// turns out to already be cached.
2640     ///
2641     /// Callers must put the state builder back with 'put_state_builder',
2642     /// otherwise the allocation reuse won't work.
2643     fn get_state_builder(&mut self) -> StateBuilderEmpty {
2644         core::mem::replace(
2645             &mut self.cache.scratch_state_builder,
2646             StateBuilderEmpty::new(),
2647         )
2648     }
2649 
2650     /// Puts the given state builder back into this DFA for reuse.
2651     ///
2652     /// Note that building a 'State' from a builder always creates a new alloc,
2653     /// so callers should always put the builder back.
2654     fn put_state_builder(&mut self, builder: StateBuilderNFA) {
2655         let _ = core::mem::replace(
2656             &mut self.cache.scratch_state_builder,
2657             builder.clear(),
2658         );
2659     }
2660 }
2661 
2662 /// A type that groups methods that require the base NFA/DFA and read-only
2663 /// access to the cache.
2664 #[derive(Debug)]
2665 struct LazyRef<'i, 'c> {
2666     dfa: &'i DFA,
2667     cache: &'c Cache,
2668 }
2669 
2670 impl<'i, 'c> LazyRef<'i, 'c> {
2671     /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
2672     fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
2673         LazyRef { dfa, cache }
2674     }
2675 
2676     /// Return the ID of the start state for the given configuration.
2677     ///
2678     /// If the start state has not yet been computed, then this returns an
2679     /// unknown lazy state ID.
2680     #[cfg_attr(feature = "perf-inline", inline(always))]
2681     fn get_cached_start_id(
2682         &self,
2683         anchored: Anchored,
2684         start: Start,
2685     ) -> Result<LazyStateID, StartError> {
2686         let start_index = start.as_usize();
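        // This indexing must mirror the layout written by 'set_start_state'
        // (and sized by 'init_cache'): the unanchored block, then the
        // anchored block, then one block per pattern.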
2687         let index = match anchored {
2688             Anchored::No => start_index,
2689             Anchored::Yes => Start::len() + start_index,
2690             Anchored::Pattern(pid) => {
2691                 if !self.dfa.get_config().get_starts_for_each_pattern() {
2692                     return Err(StartError::unsupported_anchored(anchored));
2693                 }
2694                 if pid.as_usize() >= self.dfa.pattern_len() {
2695                     return Ok(self.dead_id());
2696                 }
2697                 (2 * Start::len())
2698                     + (Start::len() * pid.as_usize())
2699                     + start_index
2700             }
2701         };
2702         Ok(self.cache.starts[index])
2703     }
2704 
2705     /// Return the cached NFA/DFA powerset state for the given ID.
2706     ///
2707     /// This panics if the given ID does not address a valid state.
2708     fn get_cached_state(&self, sid: LazyStateID) -> &State {
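        // An untagged lazy state ID is premultiplied by the stride, so
        // shifting it right by 'stride2' recovers the state's index into
        // 'cache.states'.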
2709         let index = sid.as_usize_untagged() >> self.dfa.stride2();
2710         &self.cache.states[index]
2711     }
2712 
2713     /// Returns true if and only if the given ID corresponds to a "sentinel"
2714     /// state.
2715     ///
2716     /// A sentinel state is a state that signifies a special condition of
2717     /// search, and where every transition maps back to itself. See LazyStateID
2718     /// for more details. Note that start and match states are _not_ sentinels
2719     /// since they may otherwise be real states with non-trivial transitions.
2720     /// The purpose of sentinel states is purely to indicate something. Their
2721     /// transitions are not meant to be followed.
2722     fn is_sentinel(&self, id: LazyStateID) -> bool {
2723         id == self.unknown_id() || id == self.dead_id() || id == self.quit_id()
2724     }
2725 
2726     /// Returns the ID of the unknown state for this lazy DFA.
2727     fn unknown_id(&self) -> LazyStateID {
2728         // This unwrap is OK since 0 is always a valid state ID.
2729         LazyStateID::new(0).unwrap().to_unknown()
2730     }
2731 
2732     /// Returns the ID of the dead state for this lazy DFA.
2733     fn dead_id(&self) -> LazyStateID {
2734         // This unwrap is OK since the maximum value here is 1 * 512 = 512,
2735         // which is <= 2047 (the maximum state ID on 16-bit systems). Where
2736         // 512 is the worst case for our equivalence classes (every byte is a
2737         // distinct class).
2738         LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
2739     }
2740 
2741     /// Returns the ID of the quit state for this lazy DFA.
2742     fn quit_id(&self) -> LazyStateID {
2743         // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
2744         // which is <= 2047 (the maximum state ID on 16-bit systems). Where
2745         // 512 is the worst case for our equivalence classes (every byte is a
2746         // distinct class).
2747         LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
2748     }
2749 
2750     /// Returns true if and only if the given ID is valid.
2751     ///
2752     /// An ID is valid if it is both a valid index into the transition table
2753     /// and is a multiple of the DFA's stride.
2754     fn is_valid(&self, id: LazyStateID) -> bool {
2755         let id = id.as_usize_untagged();
2756         id < self.cache.trans.len() && id % self.dfa.stride() == 0
2757     }
2758 
2759     /// Returns true if adding the state given would fit in this cache.
2760     fn state_fits_in_cache(&self, state: &State) -> bool {
2761         let needed = self.cache.memory_usage()
2762             + self.memory_usage_for_one_more_state(state.memory_usage());
2763         trace!(
2764             "lazy DFA cache capacity check: {:?} ?<=? {:?}",
2765             needed,
2766             self.dfa.cache_capacity
2767         );
2768         needed <= self.dfa.cache_capacity
2769     }
2770 
2771     /// Returns true if adding the state to be built by the given builder would
2772     /// fit in this cache.
2773     fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
2774         let needed = self.cache.memory_usage()
2775             + self.memory_usage_for_one_more_state(state.as_bytes().len());
2776         needed <= self.dfa.cache_capacity
2777     }
2778 
2779     /// Returns the additional memory usage, in bytes, required to add one more
2780     /// state to this cache. The given size should be the heap size, in bytes,
2781     /// that would be used by the new state being added.
2782     fn memory_usage_for_one_more_state(
2783         &self,
2784         state_heap_size: usize,
2785     ) -> usize {
2786         const ID_SIZE: usize = size_of::<LazyStateID>();
2787         const STATE_SIZE: usize = size_of::<State>();
2788 
2789         self.dfa.stride() * ID_SIZE // additional space needed in trans table
2790         + STATE_SIZE // space in cache.states
2791         + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
2792         + state_heap_size // heap memory used by state itself
2793     }
2794 }
2795 
2796 /// A simple type that encapsulates the saving of a state ID through a cache
2797 /// clearing.
2798 ///
2799 /// A state ID can be marked for saving with ToSave, while a state ID can be
2800 /// saved itself with Saved.
2801 #[derive(Clone, Debug)]
2802 enum StateSaver {
2803     /// An empty state saver. In this case, no states (other than the special
2804     /// sentinel states) are preserved after clearing the cache.
2805     None,
2806     /// An ID of a state (and the state itself) that should be preserved after
2807     /// the lazy DFA's cache has been cleared. After clearing, the updated ID
2808     /// is stored in 'Saved' since it may have changed.
2809     ToSave { id: LazyStateID, state: State },
2810     /// The ID of a state that has been persisted through a lazy DFA
2811     /// cache clearing. The ID recorded here corresponds to an ID that was
2812     /// once marked as ToSave. The IDs are likely not equivalent even though
2813     /// the states they point to are.
2814     Saved(LazyStateID),
2815 }
2816 
2817 impl StateSaver {
2818     /// Create an empty state saver.
2819     fn none() -> StateSaver {
2820         StateSaver::None
2821     }
2822 
2823     /// Replace this state saver with an empty saver, and if this saver is a
2824     /// request to save a state, return that request.
2825     fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
2826         match core::mem::replace(self, StateSaver::None) {
2827             StateSaver::None | StateSaver::Saved(_) => None,
2828             StateSaver::ToSave { id, state } => Some((id, state)),
2829         }
2830     }
2831 
2832     /// Replace this state saver with an empty saver, and if this saver is a
2833     /// saved state (or a request to save a state), return that state's ID.
2834     ///
2835     /// The idea here is that a request to save a state isn't necessarily
2836     /// honored because it might not be needed. For example, some higher level code
2837     /// might request a state to be saved on the off chance that the cache gets
2838     /// cleared when a new state is added at a lower level. But if that new
2839     /// state is never added, then the cache is never cleared and the state and
2840     /// its ID remain unchanged.
2841     fn take_saved(&mut self) -> Option<LazyStateID> {
2842         match core::mem::replace(self, StateSaver::None) {
2843             StateSaver::None => None,
2844             StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id),
2845         }
2846     }
2847 }
2848 
2849 /// The configuration used for building a lazy DFA.
2850 ///
2851 /// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
2852 /// advantage of the former is that it often lets you avoid importing the
2853 /// `Config` type directly.
2854 ///
2855 /// A lazy DFA configuration is a simple data object that is typically used
2856 /// with [`Builder::configure`].
2857 ///
2858 /// The default configuration guarantees that a search will never return a
2859 /// "gave up" or "quit" error, although it is possible for a search to fail
2860 /// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
2861 /// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
2862 #[derive(Clone, Debug, Default)]
2863 pub struct Config {
2864     // As with other configuration types in this crate, we put all our knobs
2865     // in options so that we can distinguish between "default" and "not set."
2866     // This makes it possible to easily combine multiple configurations
2867     // without default values overwriting explicitly specified values. See the
2868     // 'overwrite' method.
2869     //
2870     // For docs on the fields below, see the corresponding method setters.
2871     match_kind: Option<MatchKind>,
2872     pre: Option<Option<Prefilter>>,
2873     starts_for_each_pattern: Option<bool>,
2874     byte_classes: Option<bool>,
2875     unicode_word_boundary: Option<bool>,
2876     quitset: Option<ByteSet>,
2877     specialize_start_states: Option<bool>,
2878     cache_capacity: Option<usize>,
2879     skip_cache_capacity_check: Option<bool>,
2880     minimum_cache_clear_count: Option<Option<usize>>,
2881     minimum_bytes_per_state: Option<Option<usize>>,
2882 }
2883 
2884 impl Config {
2885     /// Return a new default lazy DFA builder configuration.
2886     pub fn new() -> Config {
2887         Config::default()
2888     }
2889 
2890     /// Set the desired match semantics.
2891     ///
2892     /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
2893     /// match semantics of Perl-like regex engines. That is, when multiple
2894     /// patterns would match at the same leftmost position, the pattern that
2895     /// appears first in the concrete syntax is chosen.
2896     ///
2897     /// Currently, the only other kind of match semantics supported is
2898     /// [`MatchKind::All`]. This corresponds to classical DFA construction
2899     /// where all possible matches are added to the lazy DFA.
2900     ///
2901     /// Typically, `All` is used when one wants to execute an overlapping
2902     /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
2903     /// sense to use `All` with the various "leftmost" find routines, since the
2904     /// leftmost routines depend on the `LeftmostFirst` automata construction
2905     /// strategy. Specifically, `LeftmostFirst` adds dead states to the
2906     /// lazy DFA as a way to terminate the search and report a match.
2907     /// `LeftmostFirst` also supports non-greedy matches using this strategy
2908     /// whereas `All` does not.
2909     ///
2910     /// # Example: overlapping search
2911     ///
2912     /// This example shows the typical use of `MatchKind::All`, which is to
2913     /// report overlapping matches.
2914     ///
2915     /// ```
2916     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
2917     /// use regex_automata::{
2918     ///     hybrid::dfa::{DFA, OverlappingState},
2919     ///     HalfMatch, Input, MatchKind,
2920     /// };
2921     ///
2922     /// let dfa = DFA::builder()
2923     ///     .configure(DFA::config().match_kind(MatchKind::All))
2924     ///     .build_many(&[r"\w+$", r"\S+$"])?;
2925     /// let mut cache = dfa.create_cache();
2926     /// let haystack = "@foo";
2927     /// let mut state = OverlappingState::start();
2928     ///
2929     /// let expected = Some(HalfMatch::must(1, 4));
2930     /// dfa.try_search_overlapping_fwd(
2931     ///     &mut cache, &Input::new(haystack), &mut state,
2932     /// )?;
2933     /// assert_eq!(expected, state.get_match());
2934     ///
2935     /// // The first pattern also matches at the same position, so re-running
2936     /// // the search will yield another match. Notice also that the first
2937     /// // pattern is returned after the second. This is because the second
2938     /// // pattern begins its match before the first, is therefore an earlier
2939     /// // match and is thus reported first.
2940     /// let expected = Some(HalfMatch::must(0, 4));
2941     /// dfa.try_search_overlapping_fwd(
2942     ///     &mut cache, &Input::new(haystack), &mut state,
2943     /// )?;
2944     /// assert_eq!(expected, state.get_match());
2945     ///
2946     /// # Ok::<(), Box<dyn std::error::Error>>(())
2947     /// ```
2948     ///
2949     /// # Example: reverse automaton to find start of match
2950     ///
2951     /// Another example for using `MatchKind::All` is for constructing a
2952     /// reverse automaton to find the start of a match. `All` semantics are
2953     /// used for this in order to find the longest possible match, which
2954     /// corresponds to the leftmost starting position.
2955     ///
2956     /// Note that if you need the starting position then
2957     /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
2958     /// for you, so it's usually not necessary to do this yourself.
2959     ///
2960     /// ```
2961     /// use regex_automata::{
2962     ///     hybrid::dfa::DFA,
2963     ///     nfa::thompson::NFA,
2964     ///     Anchored, HalfMatch, Input, MatchKind,
2965     /// };
2966     ///
2967     /// let input = Input::new("123foobar456");
2968     /// let pattern = r"[a-z]+r";
2969     ///
2970     /// let dfa_fwd = DFA::new(pattern)?;
2971     /// let dfa_rev = DFA::builder()
2972     ///     .thompson(NFA::config().reverse(true))
2973     ///     .configure(DFA::config().match_kind(MatchKind::All))
2974     ///     .build(pattern)?;
2975     /// let mut cache_fwd = dfa_fwd.create_cache();
2976     /// let mut cache_rev = dfa_rev.create_cache();
2977     ///
2978     /// let expected_fwd = HalfMatch::must(0, 9);
2979     /// let expected_rev = HalfMatch::must(0, 3);
2980     /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap();
2981     /// // Here we don't specify the pattern to search for since there's only
2982     /// // one pattern and we're doing a leftmost search. But if this were an
2983     /// // overlapping search, you'd need to specify the pattern that matched
2984     /// // in the forward direction. (Otherwise, you might wind up finding the
2985     /// // starting position of a match of some other pattern.) That in turn
2986     /// // requires building the reverse automaton with starts_for_each_pattern
2987     /// // enabled.
2988     /// let input = input
2989     ///     .clone()
2990     ///     .range(..got_fwd.offset())
2991     ///     .anchored(Anchored::Yes);
2992     /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap();
2993     /// assert_eq!(expected_fwd, got_fwd);
2994     /// assert_eq!(expected_rev, got_rev);
2995     ///
2996     /// # Ok::<(), Box<dyn std::error::Error>>(())
2997     /// ```
2998     pub fn match_kind(mut self, kind: MatchKind) -> Config {
2999         self.match_kind = Some(kind);
3000         self
3001     }
3002 
3003     /// Set a prefilter to be used whenever a start state is entered.
3004     ///
3005     /// A [`Prefilter`] in this context is meant to accelerate searches by
3006     /// looking for literal prefixes that every match for the corresponding
3007     /// pattern (or patterns) must start with. Once a prefilter produces a
3008     /// match, the underlying search routine continues on to try and confirm
3009     /// the match.
3010     ///
3011     /// Be warned that setting a prefilter does not guarantee that the search
3012     /// will be faster. While it's usually a good bet, if the prefilter
3013     /// produces a lot of false positive candidates (i.e., positions matched
3014     /// by the prefilter but not by the regex), then the overall result can
3015     /// be slower than if you had just executed the regex engine without any
3016     /// prefilters.
3017     ///
3018     /// Note that unless [`Config::specialize_start_states`] has been
3019     /// explicitly set, then setting this will also enable (when `pre` is
3020     /// `Some`) or disable (when `pre` is `None`) start state specialization.
3021     /// This occurs because without start state specialization, a prefilter
3022     /// is likely to be less effective. And without a prefilter, start state
3023     /// specialization is usually pointless.
3024     ///
3025     /// By default no prefilter is set.
3026     ///
3027     /// # Example
3028     ///
3029     /// ```
3030     /// use regex_automata::{
3031     ///     hybrid::dfa::DFA,
3032     ///     util::prefilter::Prefilter,
3033     ///     Input, HalfMatch, MatchKind,
3034     /// };
3035     ///
3036     /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
3037     /// let re = DFA::builder()
3038     ///     .configure(DFA::config().prefilter(pre))
3039     ///     .build(r"(foo|bar)[a-z]+")?;
3040     /// let mut cache = re.create_cache();
3041     /// let input = Input::new("foo1 barfox bar");
3042     /// assert_eq!(
3043     ///     Some(HalfMatch::must(0, 11)),
3044     ///     re.try_search_fwd(&mut cache, &input)?,
3045     /// );
3046     ///
3047     /// # Ok::<(), Box<dyn std::error::Error>>(())
3048     /// ```
3049     ///
3050     /// Be warned though that an incorrect prefilter can lead to incorrect
3051     /// results!
3052     ///
3053     /// ```
3054     /// use regex_automata::{
3055     ///     hybrid::dfa::DFA,
3056     ///     util::prefilter::Prefilter,
3057     ///     Input, HalfMatch, MatchKind,
3058     /// };
3059     ///
3060     /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
3061     /// let re = DFA::builder()
3062     ///     .configure(DFA::config().prefilter(pre))
3063     ///     .build(r"(foo|bar)[a-z]+")?;
3064     /// let mut cache = re.create_cache();
3065     /// let input = Input::new("foo1 barfox bar");
3066     /// assert_eq!(
3067     ///     // No match reported even though there clearly is one!
3068     ///     None,
3069     ///     re.try_search_fwd(&mut cache, &input)?,
3070     /// );
3071     ///
3072     /// # Ok::<(), Box<dyn std::error::Error>>(())
3073     /// ```
3074     pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
3075         self.pre = Some(pre);
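        // As documented above: if the caller hasn't explicitly configured
        // start state specialization, enable it when a prefilter is present
        // and disable it otherwise.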
3076         if self.specialize_start_states.is_none() {
3077             self.specialize_start_states =
3078                 Some(self.get_prefilter().is_some());
3079         }
3080         self
3081     }
3082 
3083     /// Whether to compile a separate start state for each pattern in the
3084     /// lazy DFA.
3085     ///
3086     /// When enabled, a separate **anchored** start state is added for each
3087     /// pattern in the lazy DFA. When this start state is used, then the DFA
3088     /// will only search for matches for the pattern specified, even if there
3089     /// are other patterns in the DFA.
3090     ///
3091     /// The main downside of this option is that it can potentially increase
3092     /// the size of the DFA and/or increase the time it takes to build the
3093     /// DFA at search time. However, since this is configuration for a lazy
3094     /// DFA, these states aren't actually built unless they're used. Enabling
3095     /// this isn't necessarily free, however, as it may result in higher cache
3096     /// usage.
3097     ///
3098     /// There are a few reasons one might want to enable this (it's disabled
3099     /// by default):
3100     ///
3101     /// 1. When looking for the start of an overlapping match (using a reverse
3102     /// DFA), doing it correctly requires starting the reverse search using the
3103     /// starting state of the pattern that matched in the forward direction.
3104     /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
3105     /// will automatically enable this option when building the reverse DFA
3106     /// internally.
3107     /// 2. When you want to use a DFA with multiple patterns to both search
3108     /// for matches of any pattern or to search for anchored matches of one
3109     /// particular pattern while using the same DFA. (Otherwise, you would need
3110     /// to compile a new DFA for each pattern.)
3111     ///
3112     /// By default this is disabled.
3113     ///
3114     /// # Example
3115     ///
3116     /// This example shows how to use this option to permit the same lazy DFA
3117     /// to run both general searches for any pattern and anchored searches for
3118     /// a specific pattern.
3119     ///
3120     /// ```
3121     /// use regex_automata::{
3122     ///     hybrid::dfa::DFA,
3123     ///     Anchored, HalfMatch, Input, PatternID,
3124     /// };
3125     ///
3126     /// let dfa = DFA::builder()
3127     ///     .configure(DFA::config().starts_for_each_pattern(true))
3128     ///     .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?;
3129     /// let mut cache = dfa.create_cache();
3130     /// let haystack = "bar foo123";
3131     ///
3132     /// // Here's a normal unanchored search that looks for any pattern.
3133     /// let expected = HalfMatch::must(0, 10);
3134     /// let input = Input::new(haystack);
3135     /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3136     /// // We can also do a normal anchored search for any pattern. Since it's
3137     /// // an anchored search, we position the start of the search where we
3138     /// // know the match will begin.
3139     /// let expected = HalfMatch::must(0, 10);
3140     /// let input = Input::new(haystack).range(4..);
3141     /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3142     /// // Since we compiled anchored start states for each pattern, we can
3143     /// // also look for matches of other patterns explicitly, even if a
3144     /// // different pattern would have normally matched.
3145     /// let expected = HalfMatch::must(1, 10);
3146     /// let input = Input::new(haystack)
3147     ///     .range(4..)
3148     ///     .anchored(Anchored::Pattern(PatternID::must(1)));
3149     /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
3150     ///
3151     /// # Ok::<(), Box<dyn std::error::Error>>(())
3152     /// ```
3153     pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
3154         self.starts_for_each_pattern = Some(yes);
3155         self
3156     }
3157 
3158     /// Whether to attempt to shrink the size of the lazy DFA's alphabet or
3159     /// not.
3160     ///
3161     /// This option is enabled by default and should never be disabled unless
3162     /// one is debugging the lazy DFA.
3163     ///
3164     /// When enabled, the lazy DFA will use a map from all possible bytes
3165     /// to their corresponding equivalence class. Each equivalence class
3166     /// represents a set of bytes that does not discriminate between a match
3167     /// and a non-match in the DFA. For example, the pattern `[ab]+` has at
3168     /// least two equivalence classes: a set containing `a` and `b` and a set
3169     /// containing every byte except for `a` and `b`. `a` and `b` are in the
3170     /// same equivalence classes because they never discriminate between a
3171     /// match and a non-match.
3172     ///
3173     /// The advantage of this map is that the size of the transition table
3174     /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
3175     /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
3176     /// equivalence classes (rounded up to the nearest power of 2). As a
3177     /// result, total space usage can decrease substantially. Moreover, since a
3178     /// smaller alphabet is used, DFA compilation during search becomes faster
3179     /// as well since it will potentially be able to reuse a single transition
3180     /// for multiple bytes.
3181     ///
3182     /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling
3183     /// this does not yield any speed advantages. Namely, even when this is
3184     /// disabled, a byte class map is still used while searching. The only
3185     /// difference is that every byte will be forced into its own distinct
3186     /// equivalence class. This is useful for debugging the actual generated
3187     /// transitions because it lets one see the transitions defined on actual
3188     /// bytes instead of the equivalence classes.
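    ///
    /// # Example
    ///
    /// A minimal sketch showing that disabling byte classes leaves match
    /// results unchanged; only the internal representation of the transition
    /// table differs.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().byte_classes(false))
    ///     .build(r"[ab]+")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = Some(HalfMatch::must(0, 4));
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new("abba"))?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```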
3189     pub fn byte_classes(mut self, yes: bool) -> Config {
3190         self.byte_classes = Some(yes);
3191         self
3192     }
3193 
3194     /// Heuristically enable Unicode word boundaries.
3195     ///
3196     /// When set, this will attempt to implement Unicode word boundaries as if
3197     /// they were ASCII word boundaries. This only works when the search input
3198     /// is ASCII only. If a non-ASCII byte is observed while searching, then a
3199     /// [`MatchError::quit`] error is returned.
3200     ///
3201     /// A possible alternative to enabling this option is to simply use an
3202     /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
3203     /// option is if you absolutely need Unicode support. This option lets one
3204     /// use a fast search implementation (a DFA) for some potentially very
3205     /// common cases, while providing the option to fall back to some other
3206     /// regex engine to handle the general case when an error is returned.
3207     ///
3208     /// If the pattern provided has no Unicode word boundary in it, then this
3209     /// option has no effect. (That is, quitting on a non-ASCII byte only
3210     /// occurs when this option is enabled _and_ a Unicode word boundary is
3211     /// present in the pattern.)
3212     ///
3213     /// This is almost equivalent to setting all non-ASCII bytes to be quit
3214     /// bytes. The only difference is that this will cause non-ASCII bytes to
3215     /// be quit bytes _only_ when a Unicode word boundary is present in the
3216     /// pattern.
3217     ///
3218     /// When enabling this option, callers _must_ be prepared to
3219     /// handle a [`MatchError`] error during search. When using a
3220     /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3221     /// `try_` suite of methods. Alternatively, if callers can guarantee that
3222     /// their input is ASCII only, then a [`MatchError::quit`] error will never
3223     /// be returned while searching.
3224     ///
3225     /// This is disabled by default.
3226     ///
3227     /// # Example
3228     ///
3229     /// This example shows how to heuristically enable Unicode word boundaries
3230     /// in a pattern. It also shows what happens when a search comes across a
3231     /// non-ASCII byte.
3232     ///
3233     /// ```
3234     /// use regex_automata::{
3235     ///     hybrid::dfa::DFA,
3236     ///     HalfMatch, Input, MatchError,
3237     /// };
3238     ///
3239     /// let dfa = DFA::builder()
3240     ///     .configure(DFA::config().unicode_word_boundary(true))
3241     ///     .build(r"\b[0-9]+\b")?;
3242     /// let mut cache = dfa.create_cache();
3243     ///
3244     /// // The match occurs before the search ever observes the snowman
3245     /// // character, so no error occurs.
3246     /// let haystack = "foo 123  ☃";
3247     /// let expected = Some(HalfMatch::must(0, 7));
3248     /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3249     /// assert_eq!(expected, got);
3250     ///
3251     /// // Notice that this search fails, even though the snowman character
3252     /// // occurs after the ending match offset. This is because search
3253     /// // routines read one byte past the end of the search to account for
3254     /// // look-around, and indeed, this is required here to determine whether
3255     /// // the trailing \b matches.
3256     /// let haystack = "foo 123 ☃";
3257     /// let expected = MatchError::quit(0xE2, 8);
3258     /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack));
3259     /// assert_eq!(Err(expected), got);
3260     ///
3261     /// // Another example is executing a search where the span of the haystack
3262     /// // we specify is all ASCII, but there is non-ASCII just before it. This
3263     /// // correctly also reports an error.
3264     /// let input = Input::new("β123").range(2..);
3265     /// let expected = MatchError::quit(0xB2, 1);
3266     /// let got = dfa.try_search_fwd(&mut cache, &input);
3267     /// assert_eq!(Err(expected), got);
3268     ///
3269     /// // And similarly for the trailing word boundary.
3270     /// let input = Input::new("123β").range(..3);
3271     /// let expected = MatchError::quit(0xCE, 3);
3272     /// let got = dfa.try_search_fwd(&mut cache, &input);
3273     /// assert_eq!(Err(expected), got);
3274     ///
3275     /// # Ok::<(), Box<dyn std::error::Error>>(())
3276     /// ```
3277     pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
3278         // We have a separate option for this instead of just setting the
3279         // appropriate quit bytes here because we don't want to set quit bytes
3280         // for every regex. We only want to set them when the regex contains a
3281         // Unicode word boundary.
3282         self.unicode_word_boundary = Some(yes);
3283         self
3284     }
3285 
3286     /// Add a "quit" byte to the lazy DFA.
3287     ///
3288     /// When a quit byte is seen during search time, then search will return a
3289     /// [`MatchError::quit`] error indicating the offset at which the search
3290     /// stopped.
3291     ///
3292     /// A quit byte will always overrule any other aspects of a regex. For
3293     /// example, if the `x` byte is added as a quit byte and the regex `\w` is
3294     /// used, then observing `x` will cause the search to quit immediately
3295     /// despite the fact that `x` is in the `\w` class.
3296     ///
3297     /// This mechanism is primarily useful for heuristically enabling certain
3298     /// features like Unicode word boundaries in a DFA. Namely, if the input
3299     /// to search is ASCII, then a Unicode word boundary can be implemented
3300     /// via an ASCII word boundary with no change in semantics. Thus, a DFA
3301     /// can attempt to match a Unicode word boundary but give up as soon as it
3302     /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
3303     /// to be quit bytes, then Unicode word boundaries will be permitted when
3304     /// building lazy DFAs. Of course, callers should enable
3305     /// [`Config::unicode_word_boundary`] if they want this behavior instead.
3306     /// (The advantage being that non-ASCII quit bytes will only be added if a
3307     /// Unicode word boundary is in the pattern.)
3308     ///
3309     /// When enabling this option, callers _must_ be prepared to
3310     /// handle a [`MatchError`] error during search. When using a
3311     /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
3312     /// `try_` suite of methods.
3313     ///
3314     /// By default, there are no quit bytes set.
3315     ///
3316     /// # Panics
3317     ///
3318     /// This panics if heuristic Unicode word boundaries are enabled and any
3319     /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
3320     /// Unicode word boundaries requires setting every non-ASCII byte to a quit
3321     /// byte. So if the caller attempts to undo any of that, then this will
3322     /// panic.
3323     ///
3324     /// # Example
3325     ///
3326     /// This example shows how to cause a search to terminate if it sees a
3327     /// `\n` byte. This could be useful if, for example, you wanted to prevent
3328     /// a user supplied pattern from matching across a line boundary.
3329     ///
3330     /// ```
3331     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3332     /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3333     ///
3334     /// let dfa = DFA::builder()
3335     ///     .configure(DFA::config().quit(b'\n', true))
3336     ///     .build(r"foo\p{any}+bar")?;
3337     /// let mut cache = dfa.create_cache();
3338     ///
3339     /// let haystack = "foo\nbar";
3340     /// // Normally this would produce a match, since \p{any} contains '\n'.
3341     /// // But since we instructed the automaton to enter a quit state if a
3342     /// // '\n' is observed, this produces a match error instead.
3343     /// let expected = MatchError::quit(b'\n', 3);
3344     /// let got = dfa.try_search_fwd(
3345     ///     &mut cache,
3346     ///     &Input::new(haystack),
3347     /// ).unwrap_err();
3348     /// assert_eq!(expected, got);
3349     ///
3350     /// # Ok::<(), Box<dyn std::error::Error>>(())
3351     /// ```
3352     pub fn quit(mut self, byte: u8, yes: bool) -> Config {
3353         if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
3354             panic!(
3355                 "cannot set non-ASCII byte to be non-quit when \
3356                  Unicode word boundaries are enabled"
3357             );
3358         }
3359         if self.quitset.is_none() {
3360             self.quitset = Some(ByteSet::empty());
3361         }
3362         if yes {
3363             self.quitset.as_mut().unwrap().add(byte);
3364         } else {
3365             self.quitset.as_mut().unwrap().remove(byte);
3366         }
3367         self
3368     }
3369 
3370     /// Enable specializing start states in the lazy DFA.
3371     ///
3372     /// When start states are specialized, an implementor of a search routine
3373     /// using a lazy DFA can tell when the search has entered a starting state.
3374     /// When start states aren't specialized, then it is impossible to know
3375     /// whether the search has entered a start state.
3376     ///
3377     /// Ideally, this option wouldn't need to exist and we could always
3378     /// specialize start states. The problem is that start states can be quite
3379     /// active. This in turn means that an efficient search routine is likely
3380     /// to ping-pong between a heavily optimized hot loop that handles most
3381     /// states and a less optimized, specialized handling of start states.
3382     /// This causes branches to get heavily mispredicted and overall can
3383     /// materially decrease throughput. Therefore, specializing start states
3384     /// should only be enabled when it is needed.
3385     ///
3386     /// Knowing whether a search is in a start state is typically useful when a
3387     /// prefilter is active for the search. A prefilter is typically only run
3388     /// when in a start state and a prefilter can greatly accelerate a search.
3389     /// Therefore, the possible cost of specializing start states is worth it
3390     /// in this case. Otherwise, if you have no prefilter, there is likely no
3391     /// reason to specialize start states.
3392     ///
3393     /// This is disabled by default, but note that it is automatically
3394     /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
3395     /// `specialize_start_states` has already been set, [`Config::prefilter`]
3396     /// will automatically enable or disable it based on whether a prefilter
3397     /// is present or not, respectively. This is done because a prefilter's
3398     /// effectiveness is rooted in being executed whenever the DFA is in a
3399     /// start state, and that's only possible to do when they are specialized.
3400     ///
3401     /// Note that it is plausibly reasonable to _disable_ this option
3402     /// explicitly while _enabling_ a prefilter. In that case, a prefilter
3403     /// will still be run at the beginning of a search, but never again. This
3404     /// in theory could strike a good balance if you're in a situation where a
3405     /// prefilter is likely to produce many false positive candidates.
3406     ///
3407     /// # Example
3408     ///
3409     /// This example shows how to enable start state specialization and then
3410     /// shows how to check whether a state is a start state or not.
3411     ///
3412     /// ```
3413     /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3414     ///
3415     /// let dfa = DFA::builder()
3416     ///     .configure(DFA::config().specialize_start_states(true))
3417     ///     .build(r"[a-z]+")?;
3418     /// let mut cache = dfa.create_cache();
3419     ///
3420     /// let haystack = "123 foobar 4567".as_bytes();
3421     /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3422     /// // The ID returned by 'start_state_forward' will always be tagged as
3423     /// // a start state when start state specialization is enabled.
3424     /// assert!(sid.is_tagged());
3425     /// assert!(sid.is_start());
3426     ///
3427     /// # Ok::<(), Box<dyn std::error::Error>>(())
3428     /// ```
3429     ///
3430     /// Compare the above with the default lazy DFA configuration where
3431     /// start states are _not_ specialized. In this case, the start state
3432     /// is not tagged and `sid.is_start()` returns false.
3433     ///
3434     /// ```
3435     /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input};
3436     ///
3437     /// let dfa = DFA::new(r"[a-z]+")?;
3438     /// let mut cache = dfa.create_cache();
3439     ///
3440     /// let haystack = "123 foobar 4567".as_bytes();
3441     /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?;
3442     /// // Start states are not tagged in the default configuration!
3443     /// assert!(!sid.is_tagged());
3444     /// assert!(!sid.is_start());
3445     ///
3446     /// # Ok::<(), Box<dyn std::error::Error>>(())
3447     /// ```
3448     pub fn specialize_start_states(mut self, yes: bool) -> Config {
3449         self.specialize_start_states = Some(yes);
3450         self
3451     }
3452 
3453     /// Sets the maximum amount of heap memory, in bytes, to allocate to the
3454     /// cache for use during a lazy DFA search. If the lazy DFA would otherwise
3455     /// use more heap memory, then, depending on other configuration knobs,
3456     /// either stop the search and return an error or clear the cache and
3457     /// continue the search.
3458     ///
3459     /// The default cache capacity is some "reasonable" number that will
3460     /// accommodate most regular expressions. You may find that if you need
3461     /// to build a large DFA then it may be necessary to increase the cache
3462     /// capacity.
3463     ///
3464     /// Note that while building a lazy DFA will do a "minimum" check to ensure
3465     /// the capacity is big enough, this is more or less about correctness.
3466     /// If the cache is bigger than the minimum but still "too small," then the
3467     /// lazy DFA could wind up spending a lot of time clearing the cache and
3468     /// recomputing transitions, thus negating the performance benefits of a
3469     /// lazy DFA. Thus, setting the cache capacity is mostly an experimental
3470     /// endeavor. For most common patterns, however, the default should be
3471     /// sufficient.
3472     ///
3473     /// For more details on how the lazy DFA's cache is used, see the
3474     /// documentation for [`Cache`].
3475     ///
3476     /// # Example
3477     ///
3478     /// This example shows what happens if the configured cache capacity is
3479     /// too small. In such cases, one can override the cache capacity to make
3480     /// it bigger. Alternatively, one might want to use less memory by setting
3481     /// a smaller cache capacity.
3482     ///
3483     /// ```
3484     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3485     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3486     ///
3487     /// let pattern = r"\p{L}{1000}";
3488     ///
3489     /// // The default cache capacity is likely too small to deal with regexes
3490     /// // that are very large. Large repetitions of large Unicode character
3491     /// // classes are a common way to make very large regexes.
3492     /// let _ = DFA::new(pattern).unwrap_err();
3493     /// // Bump up the capacity to something bigger.
3494     /// let dfa = DFA::builder()
3495     ///     .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
3496     ///     .build(pattern)?;
3497     /// let mut cache = dfa.create_cache();
3498     ///
3499     /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
3500     /// let expected = Some(HalfMatch::must(0, 2000));
3501     /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3502     /// assert_eq!(expected, got);
3503     ///
3504     /// # Ok::<(), Box<dyn std::error::Error>>(())
3505     /// ```
3506     pub fn cache_capacity(mut self, bytes: usize) -> Config {
3507         self.cache_capacity = Some(bytes);
3508         self
3509     }
3510 
3511     /// Configures construction of a lazy DFA to use the minimum cache capacity
3512     /// if the configured capacity is otherwise too small for the provided NFA.
3513     ///
3514     /// This is useful if you never want lazy DFA construction to fail because
3515     /// of a capacity that is too small.
3516     ///
3517     /// In general, this option is typically not a good idea. In particular,
3518     /// while a minimum cache capacity does permit the lazy DFA to function
3519     /// where it otherwise couldn't, it's plausible that it may not function
3520     /// well if it's constantly running out of room. In that case, the speed
3521     /// advantages of the lazy DFA may be negated. On the other hand, the
3522     /// "minimum" cache capacity computed may not be completely accurate and
3523     /// could actually be bigger than what is really necessary. Therefore, it
3524     /// is plausible that using the minimum cache capacity could still result
3525     /// in very good performance.
3526     ///
3527     /// This is disabled by default.
3528     ///
3529     /// # Example
3530     ///
3531     /// This example shows what happens if the configured cache capacity is
3532     /// too small. In such cases, one could override the capacity explicitly.
3533     /// An alternative, demonstrated here, lets us force construction to use
3534     /// the minimum cache capacity if the configured capacity is otherwise
3535     /// too small.
3536     ///
3537     /// ```
3538     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3539     /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
3540     ///
3541     /// let pattern = r"\p{L}{1000}";
3542     ///
3543     /// // The default cache capacity is likely too small to deal with regexes
3544     /// // that are very large. Large repetitions of large Unicode character
3545     /// // classes are a common way to make very large regexes.
3546     /// let _ = DFA::new(pattern).unwrap_err();
3547     /// // Configure construction such that it automatically selects the minimum
3548     /// // cache capacity if it would otherwise be too small.
3549     /// let dfa = DFA::builder()
3550     ///     .configure(DFA::config().skip_cache_capacity_check(true))
3551     ///     .build(pattern)?;
3552     /// let mut cache = dfa.create_cache();
3553     ///
3554     /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
3555     /// let expected = Some(HalfMatch::must(0, 2000));
3556     /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
3557     /// assert_eq!(expected, got);
3558     ///
3559     /// # Ok::<(), Box<dyn std::error::Error>>(())
3560     /// ```
3561     pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
3562         self.skip_cache_capacity_check = Some(yes);
3563         self
3564     }
3565 
3566     /// Configure a lazy DFA search to quit after a certain number of cache
3567     /// clearings.
3568     ///
3569     /// When a minimum is set, then a lazy DFA search will *possibly* "give
3570     /// up" after the minimum number of cache clearings has occurred. This is
3571     /// typically useful in scenarios where callers want to detect whether the
3572     /// lazy DFA search is "efficient" or not. If the cache is cleared too many
3573     /// times, this is a good indicator that it is not efficient, and thus, the
3574     /// caller may wish to use some other regex engine.
3575     ///
3576     /// Note that the number of times a cache is cleared is a property of
3577     /// the cache itself. Thus, if a cache is used in a subsequent search
3578     /// with a similarly configured lazy DFA, then it could cause the
3579     /// search to "give up" if the cache needed to be cleared, depending
3580     /// on its internal count and configured minimum. The cache clear
3581     /// count can only be reset to `0` via [`DFA::reset_cache`] (or
3582     /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
3583     /// you're using the `Regex` API).
3584     ///
3585     /// By default, no minimum is configured. Thus, a lazy DFA search will
3586     /// never give up due to cache clearings. If you do set this option, you
3587     /// might consider also setting [`Config::minimum_bytes_per_state`] in
3588     /// order for the lazy DFA to take efficiency into account before giving
3589     /// up.
3590     ///
3591     /// # Example
3592     ///
3593     /// This example uses a somewhat pathological configuration to demonstrate
3594     /// the _possible_ behavior of cache clearing and how it might result
3595     /// in a search that returns an error.
3596     ///
3597     /// It is important to note that the precise mechanics of how and when
3598     /// a cache gets cleared are an implementation detail.
3599     ///
3600     /// ```
3601     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
3602     /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind};
3603     ///
3604     /// // This is a carefully chosen regex. The idea is to pick one
3605     /// // that requires some decent number of states (hence the bounded
3606     /// // repetition). But we specifically choose to create a class with an
3607     /// // ASCII letter and a non-ASCII letter so that we can check that no new
3608     /// // states are created once the cache is full. Namely, if we fill up the
3609     /// // cache on a haystack of 'a's, then in order to match one 'β', a new
3610     /// // state will need to be created since a 'β' is encoded with multiple
3611     /// // bytes. Since there's no room for this state, the search should quit
3612     /// // at the very first position.
3613     /// let pattern = r"[aβ]{100}";
3614     /// let dfa = DFA::builder()
3615     ///     .configure(
3616     ///         // Configure it so that we have the minimum cache capacity
3617     ///         // possible. And that if any clearings occur, the search quits.
3618     ///         DFA::config()
3619     ///             .skip_cache_capacity_check(true)
3620     ///             .cache_capacity(0)
3621     ///             .minimum_cache_clear_count(Some(0)),
3622     ///     )
3623     ///     .build(pattern)?;
3624     /// let mut cache = dfa.create_cache();
3625     ///
3626     /// // Our search will give up before reaching the end!
3627     /// let haystack = "a".repeat(101).into_bytes();
3628     /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3629     /// assert!(matches!(
3630     ///     *result.unwrap_err().kind(),
3631     ///     MatchErrorKind::GaveUp { .. },
3632     /// ));
3633     ///
3634     /// // Now that we know the cache is full, if we search a haystack that we
3635     /// // know will require creating at least one new state, it should not
3636     /// // be able to make much progress.
3637     /// let haystack = "β".repeat(101).into_bytes();
3638     /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3639     /// assert!(matches!(
3640     ///     *result.unwrap_err().kind(),
3641     ///     MatchErrorKind::GaveUp { .. },
3642     /// ));
3643     ///
3644     /// // If we reset the cache, then we should be able to create more states
3645     /// // and make more progress with searching for betas.
3646     /// cache.reset(&dfa);
3647     /// let haystack = "β".repeat(101).into_bytes();
3648     /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3649     /// assert!(matches!(
3650     ///     *result.unwrap_err().kind(),
3651     ///     MatchErrorKind::GaveUp { .. },
3652     /// ));
3653     ///
3654     /// // ... switching back to ASCII still makes progress since it just needs
3655     /// // to set transitions on existing states!
3656     /// let haystack = "a".repeat(101).into_bytes();
3657     /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
3658     /// assert!(matches!(
3659     ///     *result.unwrap_err().kind(),
3660     ///     MatchErrorKind::GaveUp { .. },
3661     /// ));
3662     ///
3663     /// # Ok::<(), Box<dyn std::error::Error>>(())
3664     /// ```
3665     pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
3666         self.minimum_cache_clear_count = Some(min);
3667         self
3668     }
3669 
3670     /// Configure a lazy DFA search to quit only when its efficiency drops
3671     /// below the given minimum.
3672     ///
3673     /// The efficiency of the cache is measured by the number of bytes of
3674     /// haystack searched per DFA state compiled. For example, if the efficiency
3675     /// is 2, then it means the lazy DFA is creating a new DFA state after
3676     /// searching approximately 2 bytes in a haystack. Generally speaking, 2
3677     /// is quite bad and it's likely that even a slower regex engine like the
3678     /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster.
3679     ///
3680     /// This has no effect if [`Config::minimum_cache_clear_count`] is not set.
3681     /// Namely, this option only kicks in when the cache has been cleared more
3682     /// than the minimum number. If no minimum is set, then the cache is simply
3683     /// cleared whenever it fills up and it is impossible for the lazy DFA to
3684     /// quit due to ineffective use of the cache.
3685     ///
3686     /// In general, if one is setting [`Config::minimum_cache_clear_count`],
3687     /// then one should probably also set this knob as well. The reason is
3688     /// that the absolute number of times the cache is cleared is generally
3689     /// not a great predictor of efficiency. For example, if a new DFA state
3690     /// is created for every 1,000 bytes searched, then it wouldn't be hard
3691     /// for the cache to get cleared more than `N` times and then cause the
3692     /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite
3693     /// good from a performance perspective, and it's likely that the lazy
3694     /// DFA should continue searching, even if it requires clearing the cache
3695     /// occasionally.
3696     ///
3697     /// Finally, note that if you're implementing your own lazy DFA search
3698     /// routine and also want this efficiency check to work correctly, then
3699     /// you'll need to use the following routines to record search progress:
3700     ///
3701     /// * Call [`Cache::search_start`] at the beginning of every search.
3702     /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is
3703     /// called.
3704     /// * Call [`Cache::search_finish`] before completing a search. (It is
3705     /// not strictly necessary to call this when an error is returned, as
3706     /// `Cache::search_start` will automatically finish the previous search
3707     /// for you. But calling it where possible before returning helps improve
3708     /// the accuracy of how many bytes have actually been searched.)
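    ///
    /// # Example
    ///
    /// This example sketches one way to combine this knob with
    /// [`Config::minimum_cache_clear_count`] so that a search only gives up
    /// when the cache has been cleared repeatedly *and* is being used
    /// inefficiently. The specific thresholds are arbitrary and chosen purely
    /// for illustration.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// // Give up only if the cache has been cleared at least 10 times and,
    /// // at that point, fewer than 10 bytes of haystack are being searched
    /// // per new DFA state created.
    /// let dfa = DFA::builder()
    ///     .configure(
    ///         DFA::config()
    ///             .minimum_cache_clear_count(Some(10))
    ///             .minimum_bytes_per_state(Some(10)),
    ///     )
    ///     .build(r"[a-z]+")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// // A small search like this never clears the cache, so it is
    /// // unaffected by the thresholds above.
    /// let expected = Some(HalfMatch::must(0, 3));
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new("abc 123"))?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```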
3709     pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config {
3710         self.minimum_bytes_per_state = Some(min);
3711         self
3712     }
3713 
3714     /// Returns the match semantics set in this configuration.
3715     pub fn get_match_kind(&self) -> MatchKind {
3716         self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
3717     }
3718 
3719     /// Returns the prefilter set in this configuration, if one at all.
3720     pub fn get_prefilter(&self) -> Option<&Prefilter> {
3721         self.pre.as_ref().unwrap_or(&None).as_ref()
3722     }
3723 
3724     /// Returns whether this configuration has enabled anchored starting states
3725     /// for every pattern in the DFA.
3726     pub fn get_starts_for_each_pattern(&self) -> bool {
3727         self.starts_for_each_pattern.unwrap_or(false)
3728     }
3729 
3730     /// Returns whether this configuration has enabled byte classes or not.
3731     /// This is typically a debugging oriented option, as disabling it confers
3732     /// no speed benefit.
3733     pub fn get_byte_classes(&self) -> bool {
3734         self.byte_classes.unwrap_or(true)
3735     }
3736 
3737     /// Returns whether this configuration has enabled heuristic Unicode word
3738     /// boundary support. When enabled, it is possible for a search to return
3739     /// an error.
3740     pub fn get_unicode_word_boundary(&self) -> bool {
3741         self.unicode_word_boundary.unwrap_or(false)
3742     }
3743 
3744     /// Returns whether this configuration will instruct the lazy DFA to enter
3745     /// a quit state whenever the given byte is seen during a search. When at
3746     /// least one byte has this enabled, it is possible for a search to return
3747     /// an error.
3748     pub fn get_quit(&self, byte: u8) -> bool {
3749         self.quitset.map_or(false, |q| q.contains(byte))
3750     }
3751 
3752     /// Returns whether this configuration will instruct the lazy DFA to
3753     /// "specialize" start states. When enabled, the lazy DFA will tag start
3754     /// states so that search routines using the lazy DFA can detect when
3755     /// it's in a start state and do some kind of optimization (like run a
3756     /// prefilter).
3757     pub fn get_specialize_start_states(&self) -> bool {
3758         self.specialize_start_states.unwrap_or(false)
3759     }
3760 
3761     /// Returns the cache capacity set on this configuration.
3762     pub fn get_cache_capacity(&self) -> usize {
3763         self.cache_capacity.unwrap_or(2 * (1 << 20))
3764     }
3765 
3766     /// Returns whether the cache capacity check should be skipped.
3767     pub fn get_skip_cache_capacity_check(&self) -> bool {
3768         self.skip_cache_capacity_check.unwrap_or(false)
3769     }
3770 
3771     /// Returns, if set, the minimum number of times the cache must be cleared
3772     /// before a lazy DFA search can give up. When no minimum is set, then a
3773     /// search will never quit and will always clear the cache whenever it
3774     /// fills up.
3775     pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
3776         self.minimum_cache_clear_count.unwrap_or(None)
3777     }
3778 
3779     /// Returns, if set, the minimum number of bytes per state that need to be
3780     /// processed in order for the lazy DFA to keep going. If fewer bytes per
3781     /// state are processed than this minimum (and the cache has been cleared a
3782     /// minimum number of times), then the lazy DFA will return a "gave up" error.
3783     pub fn get_minimum_bytes_per_state(&self) -> Option<usize> {
3784         self.minimum_bytes_per_state.unwrap_or(None)
3785     }
3786 
3787     /// Returns the minimum lazy DFA cache capacity required for the given NFA.
3788     ///
3789     /// The cache capacity required for a particular NFA may change without
3790     /// notice. Callers should not rely on it being stable.
3791     ///
3792     /// This is useful for informational purposes, but can also be useful for
3793     /// other reasons. For example, one might want to check the minimum cache
3794     /// capacity themselves, or set the cache capacity based on the computed
3795     /// minimum.
3796     ///
3797     /// This may return an error if this configuration does not support all of
3798     /// the instructions used in the given NFA. For example, if the NFA has a
3799     /// Unicode word boundary but this configuration does not enable heuristic
3800     /// support for Unicode word boundaries.
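    ///
    /// # Example
    ///
    /// This example sketches one way to size the cache based on the computed
    /// minimum rather than relying on the default capacity. The doubling of
    /// the minimum is arbitrary and purely illustrative.
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::dfa::DFA,
    ///     nfa::thompson,
    ///     HalfMatch, Input,
    /// };
    ///
    /// let nfa = thompson::Compiler::new()
    ///     .build(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
    /// let min = DFA::config().get_minimum_cache_capacity(&nfa)?;
    /// // Give the lazy DFA some headroom beyond the bare minimum.
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().cache_capacity(2 * min))
    ///     .build_from_nfa(nfa)?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = Some(HalfMatch::must(0, 10));
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new("2023-08-01"))?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```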
3801     pub fn get_minimum_cache_capacity(
3802         &self,
3803         nfa: &thompson::NFA,
3804     ) -> Result<usize, BuildError> {
3805         let quitset = self.quit_set_from_nfa(nfa)?;
3806         let classes = self.byte_classes_from_nfa(nfa, &quitset);
3807         let starts = self.get_starts_for_each_pattern();
3808         Ok(minimum_cache_capacity(nfa, &classes, starts))
3809     }
3810 
3811     /// Returns the byte class map used during search from the given NFA.
3812     ///
3813     /// If byte classes are disabled on this configuration, then a map is
3814     /// returned that puts each byte in its own equivalence class.
3815     fn byte_classes_from_nfa(
3816         &self,
3817         nfa: &thompson::NFA,
3818         quit: &ByteSet,
3819     ) -> ByteClasses {
3820         if !self.get_byte_classes() {
3821             // The lazy DFA will always use the equivalence class map, but
3822             // enabling this option is useful for debugging. Namely, this will
3823             // cause all transitions to be defined over their actual bytes
3824             // instead of an opaque equivalence class identifier. The former is
3825             // much easier to grok as a human.
3826             ByteClasses::singletons()
3827         } else {
3828             let mut set = nfa.byte_class_set().clone();
3829             // It is important to distinguish any "quit" bytes from all other
3830             // bytes. Otherwise, a non-quit byte may end up in the same class
3831             // as a quit byte, and thus cause the DFA to stop when it shouldn't.
3832             //
3833             // Test case:
3834             //
3835             //   regex-cli find match hybrid --unicode-word-boundary \
3836             //     -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
3837             if !quit.is_empty() {
3838                 set.add_set(&quit);
3839             }
3840             set.byte_classes()
3841         }
3842     }
3843 
3844     /// Return the quit set for this configuration and the given NFA.
3845     ///
3846     /// This may return an error if the NFA is incompatible with this
3847     /// configuration's quit set. For example, if the NFA has a Unicode word
3848     /// boundary and the quit set doesn't include non-ASCII bytes.
3849     fn quit_set_from_nfa(
3850         &self,
3851         nfa: &thompson::NFA,
3852     ) -> Result<ByteSet, BuildError> {
3853         let mut quit = self.quitset.unwrap_or(ByteSet::empty());
3854         if nfa.look_set_any().contains_word_unicode() {
3855             if self.get_unicode_word_boundary() {
3856                 for b in 0x80..=0xFF {
3857                     quit.add(b);
3858                 }
3859             } else {
3860                 // If heuristic support for Unicode word boundaries wasn't
3861                 // enabled, then we can still check if our quit set is correct.
3862                 // If the caller set their quit bytes in a way that causes the
3863                 // DFA to quit on at least all non-ASCII bytes, then that's all
3864                 // we need for heuristic support to work.
3865                 if !quit.contains_range(0x80, 0xFF) {
3866                     return Err(
3867                         BuildError::unsupported_dfa_word_boundary_unicode(),
3868                     );
3869                 }
3870             }
3871         }
3872         Ok(quit)
3873     }
3874 
3875     /// Overwrite the default configuration such that the options in `o` are
3876     /// always used. If an option in `o` is not set, then the corresponding
3877     /// option in `self` is used. If it's not set in `self` either, then it
3878     /// remains not set.
3879     fn overwrite(&self, o: Config) -> Config {
3880         Config {
3881             match_kind: o.match_kind.or(self.match_kind),
3882             pre: o.pre.or_else(|| self.pre.clone()),
3883             starts_for_each_pattern: o
3884                 .starts_for_each_pattern
3885                 .or(self.starts_for_each_pattern),
3886             byte_classes: o.byte_classes.or(self.byte_classes),
3887             unicode_word_boundary: o
3888                 .unicode_word_boundary
3889                 .or(self.unicode_word_boundary),
3890             quitset: o.quitset.or(self.quitset),
3891             specialize_start_states: o
3892                 .specialize_start_states
3893                 .or(self.specialize_start_states),
3894             cache_capacity: o.cache_capacity.or(self.cache_capacity),
3895             skip_cache_capacity_check: o
3896                 .skip_cache_capacity_check
3897                 .or(self.skip_cache_capacity_check),
3898             minimum_cache_clear_count: o
3899                 .minimum_cache_clear_count
3900                 .or(self.minimum_cache_clear_count),
3901             minimum_bytes_per_state: o
3902                 .minimum_bytes_per_state
3903                 .or(self.minimum_bytes_per_state),
3904         }
3905     }
3906 }
3907 
3908 /// A builder for constructing a lazy deterministic finite automaton from
3909 /// regular expressions.
3910 ///
3911 /// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The
3912 /// advantage of the former is that it often lets you avoid importing the
3913 /// `Builder` type directly.
3914 ///
3915 /// This builder provides two main things:
3916 ///
3917 /// 1. It provides a few different `build` routines for actually constructing
3918 /// a DFA from different kinds of inputs. The most convenient is
3919 /// [`Builder::build`], which builds a DFA directly from a pattern string. The
3920 /// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
3921 /// from an NFA.
3922 /// 2. The builder permits configuring a number of things.
3923 /// [`Builder::configure`] is used with [`Config`] to configure aspects of
3924 /// the DFA and the construction process itself. [`Builder::syntax`] and
3925 /// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
3926 /// construction, respectively. The syntax and thompson configurations only
3927 /// apply when building from a pattern string.
3928 ///
3929 /// This builder always constructs a *single* lazy DFA. As such, this builder
3930 /// can only be used to construct regexes that either detect the presence
3931 /// of a match or find the end location of a match. A single DFA cannot
3932 /// produce both the start and end of a match. For that information, use a
3933 /// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured
3934 /// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason
3935 /// to use a DFA directly is if the end location of a match is enough for your
3936 /// use case. Namely, a `Regex` will construct two lazy DFAs instead of one,
3937 /// since a second reverse DFA is needed to find the start of a match.
3938 ///
3939 /// # Example
3940 ///
3941 /// This example shows how to build a lazy DFA that uses a tiny cache capacity
3942 /// and completely disables Unicode. That is:
3943 ///
3944 /// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
3945 ///   and `\b` are ASCII-only while `.` matches any byte except for `\n`
3946 ///   (instead of any UTF-8 encoding of a Unicode scalar value except for
3947 ///   `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
3948 /// * The pattern itself is permitted to match invalid UTF-8. For example,
3949 ///   things like `[^a]` that match any byte except for `a` are permitted.
3950 ///
3951 /// ```
3952 /// use regex_automata::{
3953 ///     hybrid::dfa::DFA,
3954 ///     nfa::thompson,
3955 ///     util::syntax,
3956 ///     HalfMatch, Input,
3957 /// };
3958 ///
3959 /// let dfa = DFA::builder()
3960 ///     .configure(DFA::config().cache_capacity(5_000))
3961 ///     .thompson(thompson::Config::new().utf8(false))
3962 ///     .syntax(syntax::Config::new().unicode(false).utf8(false))
3963 ///     .build(r"foo[^b]ar.*")?;
3964 /// let mut cache = dfa.create_cache();
3965 ///
3966 /// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
3967 /// let expected = Some(HalfMatch::must(0, 10));
3968 /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
3969 /// assert_eq!(expected, got);
3970 ///
3971 /// # Ok::<(), Box<dyn std::error::Error>>(())
3972 /// ```
3973 #[derive(Clone, Debug)]
3974 pub struct Builder {
3975     config: Config,
3976     #[cfg(feature = "syntax")]
3977     thompson: thompson::Compiler,
3978 }
3979 
3980 impl Builder {
3981     /// Create a new lazy DFA builder with the default configuration.
3982     pub fn new() -> Builder {
3983         Builder {
3984             config: Config::default(),
3985             #[cfg(feature = "syntax")]
3986             thompson: thompson::Compiler::new(),
3987         }
3988     }
3989 
3990     /// Build a lazy DFA from the given pattern.
3991     ///
3992     /// If there was a problem parsing or compiling the pattern, then an error
3993     /// is returned.
3994     #[cfg(feature = "syntax")]
3995     pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
3996         self.build_many(&[pattern])
3997     }
3998 
3999     /// Build a lazy DFA from the given patterns.
4000     ///
4001     /// When matches are returned, the pattern ID corresponds to the index of
4002     /// the pattern in the slice given.
4003     #[cfg(feature = "syntax")]
4004     pub fn build_many<P: AsRef<str>>(
4005         &self,
4006         patterns: &[P],
4007     ) -> Result<DFA, BuildError> {
4008         let nfa = self
4009             .thompson
4010             .clone()
4011             // We can always forcefully disable captures because DFAs do not
4012             // support them.
4013             .configure(
4014                 thompson::Config::new()
4015                     .which_captures(thompson::WhichCaptures::None),
4016             )
4017             .build_many(patterns)
4018             .map_err(BuildError::nfa)?;
4019         self.build_from_nfa(nfa)
4020     }
4021 
4022     /// Build a DFA from the given NFA.
4023     ///
4024     /// Note that this requires owning a `thompson::NFA`. While this may force
4025     /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs
4026     /// are defined internally to support shared ownership such that cloning is
4027     /// very cheap.
4028     ///
4029     /// # Example
4030     ///
4031     /// This example shows how to build a lazy DFA if you already have an NFA
4032     /// in hand.
4033     ///
4034     /// ```
4035     /// use regex_automata::{
4036     ///     hybrid::dfa::DFA,
4037     ///     nfa::thompson,
4038     ///     HalfMatch, Input,
4039     /// };
4040     ///
4041     /// let haystack = "foo123bar";
4042     ///
4043     /// // This shows how to set non-default options for building an NFA.
4044     /// let nfa = thompson::Compiler::new()
4045     ///     .configure(thompson::Config::new().shrink(true))
4046     ///     .build(r"[0-9]+")?;
4047     /// let dfa = DFA::builder().build_from_nfa(nfa)?;
4048     /// let mut cache = dfa.create_cache();
4049     /// let expected = Some(HalfMatch::must(0, 6));
4050     /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
4051     /// assert_eq!(expected, got);
4052     ///
4053     /// # Ok::<(), Box<dyn std::error::Error>>(())
4054     /// ```
4055     pub fn build_from_nfa(
4056         &self,
4057         nfa: thompson::NFA,
4058     ) -> Result<DFA, BuildError> {
4059         let quitset = self.config.quit_set_from_nfa(&nfa)?;
4060         let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
4061         // Check that we can fit at least a few states into our cache,
4062         // otherwise it's pretty senseless to use the lazy DFA. This does have
4063         // a possible failure mode though. This assumes the maximum size of a
4064         // state in powerset space (so, the total number of NFA states), which
4065         // may never actually materialize, and could be quite a bit larger
4066         // than the actual biggest state. If this turns out to be a problem,
4067         // we could expose a knob that disables this check. But if so, we have
4068         // to be careful not to panic in other areas of the code (the cache
4069         // clearing and init code) that tend to assume some minimum useful
4070         // cache capacity.
4071         let min_cache = minimum_cache_capacity(
4072             &nfa,
4073             &classes,
4074             self.config.get_starts_for_each_pattern(),
4075         );
4076         let mut cache_capacity = self.config.get_cache_capacity();
4077         if cache_capacity < min_cache {
4078             // When the caller has asked us to skip the cache capacity check,
4079             // then we simply force the cache capacity to its minimum amount
4080             // and mush on.
4081             if self.config.get_skip_cache_capacity_check() {
4082                 debug!(
4083                     "given capacity ({}) is too small, \
4084                      since skip_cache_capacity_check is enabled, \
4085                      setting cache capacity to minimum ({})",
4086                     cache_capacity, min_cache,
4087                 );
4088                 cache_capacity = min_cache;
4089             } else {
4090                 return Err(BuildError::insufficient_cache_capacity(
4091                     min_cache,
4092                     cache_capacity,
4093                 ));
4094             }
4095         }
4096         // We also need to check that we can fit at least some small number
4097         // of states in our state ID space. This is unlikely to trigger in
4098         // >=32-bit systems, but 16-bit systems have a pretty small state ID
4099         // space since a number of bits are used up as sentinels.
4100         if let Err(err) = minimum_lazy_state_id(&classes) {
4101             return Err(BuildError::insufficient_state_id_capacity(err));
4102         }
4103         let stride2 = classes.stride2();
4104         let start_map = StartByteMap::new(nfa.look_matcher());
4105         Ok(DFA {
4106             config: self.config.clone(),
4107             nfa,
4108             stride2,
4109             start_map,
4110             classes,
4111             quitset,
4112             cache_capacity,
4113         })
4114     }
4115 
4116     /// Apply the given lazy DFA configuration options to this builder.
4117     pub fn configure(&mut self, config: Config) -> &mut Builder {
4118         self.config = self.config.overwrite(config);
4119         self
4120     }
4121 
4122     /// Set the syntax configuration for this builder using
4123     /// [`syntax::Config`](crate::util::syntax::Config).
4124     ///
4125     /// This permits setting things like case insensitivity, Unicode and multi
4126     /// line mode.
4127     ///
4128     /// These settings only apply when constructing a lazy DFA directly from a
4129     /// pattern.
4130     #[cfg(feature = "syntax")]
4131     pub fn syntax(
4132         &mut self,
4133         config: crate::util::syntax::Config,
4134     ) -> &mut Builder {
4135         self.thompson.syntax(config);
4136         self
4137     }
4138 
4139     /// Set the Thompson NFA configuration for this builder using
4140     /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
4141     ///
4142     /// This permits setting things like whether the DFA should match the regex
4143     /// in reverse or if additional time should be spent shrinking the size of
4144     /// the NFA.
4145     ///
4146     /// These settings only apply when constructing a DFA directly from a
4147     /// pattern.
4148     #[cfg(feature = "syntax")]
4149     pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
4150         self.thompson.configure(config);
4151         self
4152     }
4153 }
4154 
4155 /// Represents the current state of an overlapping search.
4156 ///
4157 /// This is used for overlapping searches since they need to know something
4158 /// about the previous search. For example, when multiple patterns match at the
4159 /// same position, this state tracks the last reported pattern so that the next
4160 /// search knows whether to report another matching pattern or continue with
4161 /// the search at the next position. Additionally, it also tracks which state
4162 /// the last search call terminated in.
4163 ///
4164 /// This type provides few introspection capabilities. The only thing a
4165 /// caller can do is construct it and pass it around to permit search routines
4166 /// to use it to track state, and also ask whether a match has been found.
4167 ///
4168 /// Callers should always provide a fresh state constructed via
4169 /// [`OverlappingState::start`] when starting a new search. Reusing state from
4170 /// a previous search may result in incorrect results.
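///
/// # Example
///
/// The sketch below shows one way to drive an overlapping search to
/// completion by reusing the same state across repeated calls to an
/// overlapping search routine such as `DFA::try_search_overlapping_fwd`. The
/// patterns and haystack are arbitrary and chosen only for illustration.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
///     hybrid::dfa::{DFA, OverlappingState},
///     Input, MatchKind,
/// };
///
/// // Overlapping searches are most useful with "all" match semantics.
/// let dfa = DFA::builder()
///     .configure(DFA::config().match_kind(MatchKind::All))
///     .build_many(&[r"\w+$", r"\S+$"])?;
/// let mut cache = dfa.create_cache();
/// let haystack = "@foo";
///
/// // A fresh state is used for each new overlapping search.
/// let mut state = OverlappingState::start();
/// let mut got = vec![];
/// loop {
///     dfa.try_search_overlapping_fwd(
///         &mut cache, &Input::new(haystack), &mut state,
///     )?;
///     match state.get_match() {
///         None => break,
///         Some(hm) => got.push((hm.pattern().as_usize(), hm.offset())),
///     }
/// }
/// // Both patterns match at the end of the haystack. Sort the results so
/// // that this example does not depend on the order in which matches are
/// // reported.
/// got.sort();
/// assert_eq!(got, vec![(0, 4), (1, 4)]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```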
4171 #[derive(Clone, Debug, Eq, PartialEq)]
4172 pub struct OverlappingState {
4173     /// The match reported by the most recent overlapping search to use this
4174     /// state.
4175     ///
4176     /// If a search does not find any matches, then it is expected to clear
4177     /// this value.
4178     pub(crate) mat: Option<HalfMatch>,
4179     /// The state ID of the state that the search was in when the call
4180     /// terminated. When this is a match state, `mat` must be set to a
4181     /// non-None value.
4182     ///
4183     /// A `None` value indicates the start state of the corresponding
4184     /// automaton. We cannot use the actual ID, since any one automaton may
4185     /// have many start states, and which one is in use depends on several
4186     /// search-time factors.
4187     pub(crate) id: Option<LazyStateID>,
4188     /// The position of the search.
4189     ///
4190     /// When `id` is None (i.e., we are starting a search), this is set to
4191     /// the beginning of the search as given by the caller regardless of its
4192     /// current value. Subsequent calls to an overlapping search pick up at
4193     /// this offset.
4194     pub(crate) at: usize,
4195     /// The index into the matching patterns of the next match to report if the
4196     /// current state is a match state. Note that this may be 1 greater than
4197     /// the total number of matches to report for the current match state. (In
4198     /// which case, no more matches should be reported at the current position
4199     /// and the search should advance to the next position.)
4200     pub(crate) next_match_index: Option<usize>,
4201     /// This is set to true when a reverse overlapping search has entered its
4202     /// EOI transitions.
4203     ///
4204     /// This isn't used in a forward search because it knows to stop once the
4205     /// position exceeds the end of the search range. In a reverse search,
4206     /// since we use unsigned offsets, we don't "know" once we've gone past
4207     /// `0`. So the only way to detect it is with this extra flag. The reverse
4208     /// overlapping search knows to terminate specifically after it has
4209     /// reported all matches after following the EOI transition.
4210     pub(crate) rev_eoi: bool,
4211 }
4212 
4213 impl OverlappingState {
4214     /// Create a new overlapping state that begins at the start state of any
4215     /// automaton.
4216     pub fn start() -> OverlappingState {
4217         OverlappingState {
4218             mat: None,
4219             id: None,
4220             at: 0,
4221             next_match_index: None,
4222             rev_eoi: false,
4223         }
4224     }
4225 
4226     /// Return the match result of the most recent search to execute with this
4227     /// state.
4228     ///
4229     /// A search will clear this result automatically, such that if no
4230     /// match is found, this will correctly report `None`.
4231     pub fn get_match(&self) -> Option<HalfMatch> {
4232         self.mat
4233     }
4234 }
4235 
4236 /// Runs the given overlapping `search` function (forwards or backwards) until
4237 /// a match is found whose offset does not split a codepoint.
4238 ///
4239 /// This is *not* always correct to call. It should only be called when the
4240 /// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width
4241 /// matches. Calling this when both of those things aren't true might result
4242 /// in legitimate matches getting skipped.
4243 #[cold]
4244 #[inline(never)]
4245 fn skip_empty_utf8_splits_overlapping<F>(
4246     input: &Input<'_>,
4247     state: &mut OverlappingState,
4248     mut search: F,
4249 ) -> Result<(), MatchError>
4250 where
4251     F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
4252 {
4253     // Note that this routine works for forwards and reverse searches
4254     // even though there's no code here to handle those cases. That's
4255     // because overlapping searches drive themselves to completion via
4256     // `OverlappingState`. So all we have to do is push it until no matches are
4257     // found.
4258 
4259     let mut hm = match state.get_match() {
4260         None => return Ok(()),
4261         Some(hm) => hm,
4262     };
4263     if input.get_anchored().is_anchored() {
4264         if !input.is_char_boundary(hm.offset()) {
4265             state.mat = None;
4266         }
4267         return Ok(());
4268     }
4269     while !input.is_char_boundary(hm.offset()) {
4270         search(input, state)?;
4271         hm = match state.get_match() {
4272             None => return Ok(()),
4273             Some(hm) => hm,
4274         };
4275     }
4276     Ok(())
4277 }
4278 
4279 /// Based on the minimum number of states required for a useful lazy DFA cache,
4280 /// this returns the minimum lazy state ID that must be representable.
4281 ///
4282 /// This is unlikely to have any impact on 32-bit systems (or higher), but
4283 /// on 16-bit systems, the lazy state ID space is quite constrained and thus
4284 /// may be insufficient if our MIN_STATES value is (for some reason) too high.
4285 fn minimum_lazy_state_id(
4286     classes: &ByteClasses,
4287 ) -> Result<LazyStateID, LazyStateIDError> {
4288     let stride = 1 << classes.stride2();
4289     let min_state_index = MIN_STATES.checked_sub(1).unwrap();
4290     LazyStateID::new(min_state_index * stride)
4291 }
4292 
4293 /// Based on the minimum number of states required for a useful lazy DFA cache,
4294 /// this returns a heuristic minimum number of bytes of heap space required.
4295 ///
4296 /// This is a "heuristic" because the minimum it returns is likely bigger than
4297 /// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses
4298 /// the maximum number of NFA states (all of them). This is likely bigger
4299 /// than what is required in practice. Computing the true minimum effectively
4300 /// requires determinization, which is probably too much work to do for a
4301 /// simple check like this.
4302 ///
4303 /// One of the issues with this approach IMO is that it requires that this
4304 /// be in sync with the calculation above for computing how much heap memory
4305 /// the DFA cache uses. If we get it wrong, it's possible for example for the
4306 /// minimum to be smaller than the computed heap memory, and thus, it may be
4307 /// the case that we can't add the required minimum number of states. That in
4308 /// turn will make the lazy DFA panic because we assume that we can add at least a
4309 /// minimum number of states.
4310 ///
4311 /// Another approach would be to always allow the minimum number of states to
4312 /// be added to the lazy DFA cache, even if it exceeds the configured cache
4313 /// limit. This does mean that the limit isn't really a limit in all cases,
4314 /// which is unfortunate. But it does at least guarantee that the lazy DFA can
4315 /// always make progress, even if it is slow. (This approach is very similar to
4316 /// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't
4317 /// rely on cache size calculation. Instead, it would just always permit a
4318 /// minimum number of states to be added.)
4319 fn minimum_cache_capacity(
4320     nfa: &thompson::NFA,
4321     classes: &ByteClasses,
4322     starts_for_each_pattern: bool,
4323 ) -> usize {
4324     const ID_SIZE: usize = size_of::<LazyStateID>();
4325     const STATE_SIZE: usize = size_of::<State>();
4326 
4327     let stride = 1 << classes.stride2();
4328     let states_len = nfa.states().len();
4329     let sparses = 2 * states_len * NFAStateID::SIZE;
4330     let trans = MIN_STATES * stride * ID_SIZE;
4331 
4332     let mut starts = Start::len() * ID_SIZE;
4333     if starts_for_each_pattern {
4334         starts += (Start::len() * nfa.pattern_len()) * ID_SIZE;
4335     }
4336 
4337     // The min number of states HAS to be at least 4: we have 3 sentinel states
4338     // and then we need space for one more when we save a state after clearing
4339     // the cache. We also need space for one more, otherwise we get stuck in a
4340     // loop where we try to add a 5th state, which gets rejected, which clears
4341     // the cache, which adds back a saved state (4th total state) which then
4342     // tries to add the 5th state again.
4343     assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5");
4344     // The minimum number of non-sentinel states. We consider this separately
4345     // because sentinel states are much smaller in that they contain no NFA
4346     // states. Given our aggressive calculation here, it's worth being more
4347     // precise with the number of states we need.
4348     let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap();
4349 
4350     // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of
4351     // patterns, followed by 32-bit encodings of patterns and then delta
4352     // varint encodings of NFA state IDs. We use the worst case (which isn't
4353     // technically possible) of 5 bytes for each NFA state ID.
4354     //
4355     // HOWEVER, three of the states needed by a lazy DFA are just the sentinel
4356     // unknown, dead and quit states. Those states have a known size and it is
4357     // small.
4358     let dead_state_size = State::dead().memory_usage();
4359     let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5);
4360     let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size))
4361         + (non_sentinel * (STATE_SIZE + max_state_size));
4362     // NOTE: We don't double count heap memory used by State for this map since
4363     // we use reference counting to avoid doubling memory usage. (This tends to
4364     // be where most memory is allocated in the cache.)
4365     let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE);
4366     let stack = states_len * NFAStateID::SIZE;
4367     let scratch_state_builder = max_state_size;
4368 
4369     trans
4370         + starts
4371         + states
4372         + states_to_sid
4373         + sparses
4374         + stack
4375         + scratch_state_builder
4376 }
4377 
4378 #[cfg(all(test, feature = "syntax"))]
4379 mod tests {
4380     use super::*;
4381 
4382     // Tests that we handle heuristic Unicode word boundary support in reverse
4383     // DFAs in the specific case of contextual searches.
4384     //
4385     // I wrote this test when I discovered a bug in how heuristic word
4386     // boundaries were handled. Namely, that the starting state selection
4387     // didn't consider the DFA's quit byte set when looking at the byte
4388     // immediately before the start of the search (or immediately after the
4389     // end of the search in the case of a reverse search). As a result, it was
4390     // possible for '\b[0-9]+\b' to match the '123' in 'β123' because the
4391     // trailing \xB2 byte in the 'β' codepoint would be treated as a non-word
4392     // character. But of course, this search should trigger the DFA to quit,
4393     // since there is a non-ASCII byte in consideration.
4394     //
4395     // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set
4396     // if it wasn't empty. The forward case is tested in the doc test for the
4397     // Config::unicode_word_boundary API. We test the reverse case here, which
4398     // is sufficiently niche that it doesn't really belong in a doc test.
4399     #[test]
4400     fn heuristic_unicode_reverse() {
4401         let dfa = DFA::builder()
4402             .configure(DFA::config().unicode_word_boundary(true))
4403             .thompson(thompson::Config::new().reverse(true))
4404             .build(r"\b[0-9]+\b")
4405             .unwrap();
4406         let mut cache = dfa.create_cache();
4407 
4408         let input = Input::new("β123").range(2..);
4409         let expected = MatchError::quit(0xB2, 1);
4410         let got = dfa.try_search_rev(&mut cache, &input);
4411         assert_eq!(Err(expected), got);
4412 
4413         let input = Input::new("123β").range(..3);
4414         let expected = MatchError::quit(0xCE, 3);
4415         let got = dfa.try_search_rev(&mut cache, &input);
4416         assert_eq!(Err(expected), got);
4417     }
4418 }
4419