1 use core::{borrow::Borrow, cell::RefCell};
2 
3 use alloc::{sync::Arc, vec, vec::Vec};
4 
5 use regex_syntax::{
6     hir::{self, Hir},
7     utf8::{Utf8Range, Utf8Sequences},
8     ParserBuilder,
9 };
10 
11 use crate::{
12     nfa::thompson::{
13         builder::Builder,
14         error::BuildError,
15         literal_trie::LiteralTrie,
16         map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
17         nfa::{Transition, NFA},
18         range_trie::RangeTrie,
19     },
20     util::{
21         look::{Look, LookMatcher},
22         primitives::{PatternID, StateID},
23     },
24 };
25 
/// The configuration used for a Thompson NFA compiler.
///
/// Every field is an `Option` so that `overwrite` can distinguish "explicitly
/// set" from "left at the default"; the `get_*` accessors apply the defaults.
#[derive(Clone, Debug, Default)]
pub struct Config {
    /// Whether to guarantee match spans fall on valid UTF-8 boundaries.
    /// Defaults to `true` when unset.
    utf8: Option<bool>,
    /// Whether to compile the NFA in reverse (for building reverse DFAs).
    /// Defaults to `false` when unset.
    reverse: Option<bool>,
    /// An approximate limit, in bytes of heap, on the size of the compiled
    /// NFA. The nested `Option` distinguishes "unset" from an explicit
    /// `Some(limit)`/`None` (no limit). Defaults to no limit.
    nfa_size_limit: Option<Option<usize>>,
    /// Whether to spend extra effort shrinking the NFA during compilation.
    /// Defaults to `false` when unset.
    shrink: Option<bool>,
    /// Which kinds of capture states to compile into the NFA. Defaults to
    /// `WhichCaptures::All` when unset.
    which_captures: Option<WhichCaptures>,
    /// The matcher to use for look-around assertions. Defaults to
    /// `LookMatcher::default()` when unset.
    look_matcher: Option<LookMatcher>,
    /// Test-only knob: whether to compile an unanchored prefix into the NFA.
    #[cfg(test)]
    unanchored_prefix: Option<bool>,
}
38 
39 impl Config {
40     /// Return a new default Thompson NFA compiler configuration.
new() -> Config41     pub fn new() -> Config {
42         Config::default()
43     }
44 
45     /// Whether to enable UTF-8 mode during search or not.
46     ///
47     /// A regex engine is said to be in UTF-8 mode when it guarantees that
48     /// all matches returned by it have spans consisting of only valid UTF-8.
49     /// That is, it is impossible for a match span to be returned that
50     /// contains any invalid UTF-8.
51     ///
52     /// UTF-8 mode generally consists of two things:
53     ///
54     /// 1. Whether the NFA's states are constructed such that all paths to a
55     /// match state that consume at least one byte always correspond to valid
56     /// UTF-8.
57     /// 2. Whether all paths to a match state that do _not_ consume any bytes
58     /// should always correspond to valid UTF-8 boundaries.
59     ///
60     /// (1) is a guarantee made by whoever constructs the NFA.
61     /// If you're parsing a regex from its concrete syntax, then
62     /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
63     /// this guarantee for you. It does it by returning an error if the regex
    /// pattern could ever report a non-empty match span that contains invalid
65     /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
66     /// successfully parses, then you're guaranteed that the corresponding NFA
67     /// will only ever report non-empty match spans containing valid UTF-8.
68     ///
69     /// (2) is a trickier guarantee because it cannot be enforced by the NFA
70     /// state graph itself. Consider, for example, the regex `a*`. It matches
71     /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where
72     /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
73     /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
74     /// guarantee must be made at a higher level than the NFA state graph
75     /// itself. This crate deals with this case in each regex engine. Namely,
76     /// when a zero-width match that splits a codepoint is found and UTF-8
77     /// mode enabled, then it is ignored and the engine moves on looking for
78     /// the next match.
79     ///
80     /// Thus, UTF-8 mode is both a promise that the NFA built only reports
81     /// non-empty matches that are valid UTF-8, and an *instruction* to regex
82     /// engines that empty matches that split codepoints should be banned.
83     ///
84     /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
85     /// it only makes sense to enable this option when you *know* your haystack
86     /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
87     /// searching a haystack that contains invalid UTF-8 leads to **unspecified
88     /// behavior**.
89     ///
90     /// Therefore, it may make sense to enable `syntax::Config::utf8` while
91     /// simultaneously *disabling* this option. That would ensure all non-empty
92     /// match spans are valid UTF-8, but that empty match spans may still split
93     /// a codepoint or match at other places that aren't valid UTF-8.
94     ///
95     /// In general, this mode is only relevant if your regex can match the
96     /// empty string. Most regexes don't.
97     ///
98     /// This is enabled by default.
99     ///
100     /// # Example
101     ///
102     /// This example shows how UTF-8 mode can impact the match spans that may
103     /// be reported in certain cases.
104     ///
105     /// ```
106     /// use regex_automata::{
107     ///     nfa::thompson::{self, pikevm::PikeVM},
108     ///     Match, Input,
109     /// };
110     ///
111     /// let re = PikeVM::new("")?;
112     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
113     ///
114     /// // UTF-8 mode is enabled by default.
115     /// let mut input = Input::new("☃");
116     /// re.search(&mut cache, &input, &mut caps);
117     /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
118     ///
119     /// // Even though an empty regex matches at 1..1, our next match is
120     /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
121     /// // three bytes long).
122     /// input.set_start(1);
123     /// re.search(&mut cache, &input, &mut caps);
124     /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
125     ///
126     /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
127     /// let re = PikeVM::builder()
128     ///     .thompson(thompson::Config::new().utf8(false))
129     ///     .build("")?;
130     /// re.search(&mut cache, &input, &mut caps);
131     /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
132     ///
133     /// input.set_start(2);
134     /// re.search(&mut cache, &input, &mut caps);
135     /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
136     ///
137     /// input.set_start(3);
138     /// re.search(&mut cache, &input, &mut caps);
139     /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
140     ///
141     /// input.set_start(4);
142     /// re.search(&mut cache, &input, &mut caps);
143     /// assert_eq!(None, caps.get_match());
144     ///
145     /// # Ok::<(), Box<dyn std::error::Error>>(())
146     /// ```
utf8(mut self, yes: bool) -> Config147     pub fn utf8(mut self, yes: bool) -> Config {
148         self.utf8 = Some(yes);
149         self
150     }
151 
152     /// Reverse the NFA.
153     ///
    /// An NFA reversal is performed by reversing all of the concatenated
155     /// sub-expressions in the original pattern, recursively. (Look around
156     /// operators are also inverted.) The resulting NFA can be used to match
157     /// the pattern starting from the end of a string instead of the beginning
158     /// of a string.
159     ///
160     /// Reversing the NFA is useful for building a reverse DFA, which is most
161     /// useful for finding the start of a match after its ending position has
162     /// been found. NFA execution engines typically do not work on reverse
163     /// NFAs. For example, currently, the Pike VM reports the starting location
164     /// of matches without a reverse NFA.
165     ///
166     /// Currently, enabling this setting requires disabling the
167     /// [`captures`](Config::captures) setting. If both are enabled, then the
168     /// compiler will return an error. It is expected that this limitation will
169     /// be lifted in the future.
170     ///
171     /// This is disabled by default.
172     ///
173     /// # Example
174     ///
175     /// This example shows how to build a DFA from a reverse NFA, and then use
176     /// the DFA to search backwards.
177     ///
178     /// ```
179     /// use regex_automata::{
180     ///     dfa::{self, Automaton},
181     ///     nfa::thompson::{NFA, WhichCaptures},
182     ///     HalfMatch, Input,
183     /// };
184     ///
185     /// let dfa = dfa::dense::Builder::new()
186     ///     .thompson(NFA::config()
187     ///         .which_captures(WhichCaptures::None)
188     ///         .reverse(true)
189     ///     )
190     ///     .build("baz[0-9]+")?;
191     /// let expected = Some(HalfMatch::must(0, 3));
192     /// assert_eq!(
193     ///     expected,
194     ///     dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
195     /// );
196     ///
197     /// # Ok::<(), Box<dyn std::error::Error>>(())
198     /// ```
reverse(mut self, yes: bool) -> Config199     pub fn reverse(mut self, yes: bool) -> Config {
200         self.reverse = Some(yes);
201         self
202     }
203 
204     /// Sets an approximate size limit on the total heap used by the NFA being
205     /// compiled.
206     ///
207     /// This permits imposing constraints on the size of a compiled NFA. This
208     /// may be useful in contexts where the regex pattern is untrusted and one
209     /// wants to avoid using too much memory.
210     ///
211     /// This size limit does not apply to auxiliary heap used during
212     /// compilation that is not part of the built NFA.
213     ///
214     /// Note that this size limit is applied during compilation in order for
215     /// the limit to prevent too much heap from being used. However, the
216     /// implementation may use an intermediate NFA representation that is
217     /// otherwise slightly bigger than the final public form. Since the size
218     /// limit may be applied to an intermediate representation, there is not
219     /// necessarily a precise correspondence between the configured size limit
220     /// and the heap usage of the final NFA.
221     ///
222     /// There is no size limit by default.
223     ///
224     /// # Example
225     ///
226     /// This example demonstrates how Unicode mode can greatly increase the
227     /// size of the NFA.
228     ///
229     /// ```
230     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
231     /// use regex_automata::nfa::thompson::NFA;
232     ///
233     /// // 300KB isn't enough!
234     /// NFA::compiler()
235     ///     .configure(NFA::config().nfa_size_limit(Some(300_000)))
236     ///     .build(r"\w{20}")
237     ///     .unwrap_err();
238     ///
239     /// // ... but 400KB probably is.
240     /// let nfa = NFA::compiler()
241     ///     .configure(NFA::config().nfa_size_limit(Some(400_000)))
242     ///     .build(r"\w{20}")?;
243     ///
244     /// assert_eq!(nfa.pattern_len(), 1);
245     ///
246     /// # Ok::<(), Box<dyn std::error::Error>>(())
247     /// ```
nfa_size_limit(mut self, bytes: Option<usize>) -> Config248     pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
249         self.nfa_size_limit = Some(bytes);
250         self
251     }
252 
253     /// Apply best effort heuristics to shrink the NFA at the expense of more
254     /// time/memory.
255     ///
256     /// Generally speaking, if one is using an NFA to compile a DFA, then the
257     /// extra time used to shrink the NFA will be more than made up for during
258     /// DFA construction (potentially by a lot). In other words, enabling this
259     /// can substantially decrease the overall amount of time it takes to build
260     /// a DFA.
261     ///
262     /// A reason to keep this disabled is if you want to compile an NFA and
263     /// start using it as quickly as possible without needing to build a DFA,
264     /// and you don't mind using a bit of extra memory for the NFA. e.g., for
265     /// an NFA simulation or for a lazy DFA.
266     ///
267     /// NFA shrinking is currently most useful when compiling a reverse
268     /// NFA with large Unicode character classes. In particular, it trades
269     /// additional CPU time during NFA compilation in favor of generating fewer
270     /// NFA states.
271     ///
272     /// This is disabled by default because it can increase compile times
273     /// quite a bit if you aren't building a full DFA.
274     ///
275     /// # Example
276     ///
277     /// This example shows that NFA shrinking can lead to substantial space
278     /// savings in some cases. Notice that, as noted above, we build a reverse
279     /// DFA and use a pattern with a large Unicode character class.
280     ///
281     /// ```
282     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
283     /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
284     ///
285     /// // Currently we have to disable captures when enabling reverse NFA.
286     /// let config = NFA::config()
287     ///     .which_captures(WhichCaptures::None)
288     ///     .reverse(true);
289     /// let not_shrunk = NFA::compiler()
290     ///     .configure(config.clone().shrink(false))
291     ///     .build(r"\w")?;
292     /// let shrunk = NFA::compiler()
293     ///     .configure(config.clone().shrink(true))
294     ///     .build(r"\w")?;
295     ///
296     /// // While a specific shrink factor is not guaranteed, the savings can be
297     /// // considerable in some cases.
298     /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
299     ///
300     /// # Ok::<(), Box<dyn std::error::Error>>(())
301     /// ```
shrink(mut self, yes: bool) -> Config302     pub fn shrink(mut self, yes: bool) -> Config {
303         self.shrink = Some(yes);
304         self
305     }
306 
307     /// Whether to include 'Capture' states in the NFA.
308     ///
309     /// Currently, enabling this setting requires disabling the
310     /// [`reverse`](Config::reverse) setting. If both are enabled, then the
311     /// compiler will return an error. It is expected that this limitation will
312     /// be lifted in the future.
313     ///
314     /// This is enabled by default.
315     ///
316     /// # Example
317     ///
318     /// This example demonstrates that some regex engines, like the Pike VM,
319     /// require capturing states to be present in the NFA to report match
320     /// offsets.
321     ///
322     /// (Note that since this method is deprecated, the example below uses
323     /// [`Config::which_captures`] to disable capture states.)
324     ///
325     /// ```
326     /// use regex_automata::nfa::thompson::{
327     ///     pikevm::PikeVM,
328     ///     NFA,
329     ///     WhichCaptures,
330     /// };
331     ///
332     /// let re = PikeVM::builder()
333     ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
334     ///     .build(r"[a-z]+")?;
335     /// let mut cache = re.create_cache();
336     ///
337     /// assert!(re.is_match(&mut cache, "abc"));
338     /// assert_eq!(None, re.find(&mut cache, "abc"));
339     ///
340     /// # Ok::<(), Box<dyn std::error::Error>>(())
341     /// ```
342     #[deprecated(since = "0.3.5", note = "use which_captures instead")]
captures(self, yes: bool) -> Config343     pub fn captures(self, yes: bool) -> Config {
344         self.which_captures(if yes {
345             WhichCaptures::All
346         } else {
347             WhichCaptures::None
348         })
349     }
350 
351     /// Configures what kinds of capture groups are compiled into
352     /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
353     /// Thompson NFA.
354     ///
355     /// Currently, using any option except for [`WhichCaptures::None`] requires
356     /// disabling the [`reverse`](Config::reverse) setting. If both are
357     /// enabled, then the compiler will return an error. It is expected that
358     /// this limitation will be lifted in the future.
359     ///
360     /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
    /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
362     /// overhead of capture states for explicit groups. Usually this occurs
363     /// when one wants to use the `PikeVM` only for determining the overall
364     /// match. Otherwise, the `PikeVM` could use much more memory than is
365     /// necessary.
366     ///
367     /// # Example
368     ///
369     /// This example demonstrates that some regex engines, like the Pike VM,
370     /// require capturing states to be present in the NFA to report match
371     /// offsets.
372     ///
373     /// ```
374     /// use regex_automata::nfa::thompson::{
375     ///     pikevm::PikeVM,
376     ///     NFA,
377     ///     WhichCaptures,
378     /// };
379     ///
380     /// let re = PikeVM::builder()
381     ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
382     ///     .build(r"[a-z]+")?;
383     /// let mut cache = re.create_cache();
384     ///
385     /// assert!(re.is_match(&mut cache, "abc"));
386     /// assert_eq!(None, re.find(&mut cache, "abc"));
387     ///
388     /// # Ok::<(), Box<dyn std::error::Error>>(())
389     /// ```
390     ///
391     /// The same applies to the bounded backtracker:
392     ///
393     /// ```
394     /// use regex_automata::nfa::thompson::{
395     ///     backtrack::BoundedBacktracker,
396     ///     NFA,
397     ///     WhichCaptures,
398     /// };
399     ///
400     /// let re = BoundedBacktracker::builder()
401     ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
402     ///     .build(r"[a-z]+")?;
403     /// let mut cache = re.create_cache();
404     ///
405     /// assert!(re.try_is_match(&mut cache, "abc")?);
406     /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
407     ///
408     /// # Ok::<(), Box<dyn std::error::Error>>(())
409     /// ```
which_captures(mut self, which_captures: WhichCaptures) -> Config410     pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
411         self.which_captures = Some(which_captures);
412         self
413     }
414 
415     /// Sets the look-around matcher that should be used with this NFA.
416     ///
417     /// A look-around matcher determines how to match look-around assertions.
418     /// In particular, some assertions are configurable. For example, the
419     /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
420     /// from the default of `\n` to any other byte.
421     ///
422     /// # Example
423     ///
424     /// This shows how to change the line terminator for multi-line assertions.
425     ///
426     /// ```
427     /// use regex_automata::{
428     ///     nfa::thompson::{self, pikevm::PikeVM},
429     ///     util::look::LookMatcher,
430     ///     Match, Input,
431     /// };
432     ///
433     /// let mut lookm = LookMatcher::new();
434     /// lookm.set_line_terminator(b'\x00');
435     ///
436     /// let re = PikeVM::builder()
437     ///     .thompson(thompson::Config::new().look_matcher(lookm))
438     ///     .build(r"(?m)^[a-z]+$")?;
439     /// let mut cache = re.create_cache();
440     ///
441     /// // Multi-line assertions now use NUL as a terminator.
442     /// assert_eq!(
443     ///     Some(Match::must(0, 1..4)),
444     ///     re.find(&mut cache, b"\x00abc\x00"),
445     /// );
446     /// // ... and \n is no longer recognized as a terminator.
447     /// assert_eq!(
448     ///     None,
449     ///     re.find(&mut cache, b"\nabc\n"),
450     /// );
451     ///
452     /// # Ok::<(), Box<dyn std::error::Error>>(())
453     /// ```
look_matcher(mut self, m: LookMatcher) -> Config454     pub fn look_matcher(mut self, m: LookMatcher) -> Config {
455         self.look_matcher = Some(m);
456         self
457     }
458 
459     /// Whether to compile an unanchored prefix into this NFA.
460     ///
461     /// This is enabled by default. It is made available for tests only to make
462     /// it easier to unit test the output of the compiler.
463     #[cfg(test)]
unanchored_prefix(mut self, yes: bool) -> Config464     fn unanchored_prefix(mut self, yes: bool) -> Config {
465         self.unanchored_prefix = Some(yes);
466         self
467     }
468 
469     /// Returns whether this configuration has enabled UTF-8 mode.
get_utf8(&self) -> bool470     pub fn get_utf8(&self) -> bool {
471         self.utf8.unwrap_or(true)
472     }
473 
474     /// Returns whether this configuration has enabled reverse NFA compilation.
get_reverse(&self) -> bool475     pub fn get_reverse(&self) -> bool {
476         self.reverse.unwrap_or(false)
477     }
478 
479     /// Return the configured NFA size limit, if it exists, in the number of
480     /// bytes of heap used.
get_nfa_size_limit(&self) -> Option<usize>481     pub fn get_nfa_size_limit(&self) -> Option<usize> {
482         self.nfa_size_limit.unwrap_or(None)
483     }
484 
485     /// Return whether NFA shrinking is enabled.
get_shrink(&self) -> bool486     pub fn get_shrink(&self) -> bool {
487         self.shrink.unwrap_or(false)
488     }
489 
490     /// Return whether NFA compilation is configured to produce capture states.
491     #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
get_captures(&self) -> bool492     pub fn get_captures(&self) -> bool {
493         self.get_which_captures().is_any()
494     }
495 
496     /// Return what kinds of capture states will be compiled into an NFA.
get_which_captures(&self) -> WhichCaptures497     pub fn get_which_captures(&self) -> WhichCaptures {
498         self.which_captures.unwrap_or(WhichCaptures::All)
499     }
500 
501     /// Return the look-around matcher for this NFA.
get_look_matcher(&self) -> LookMatcher502     pub fn get_look_matcher(&self) -> LookMatcher {
503         self.look_matcher.clone().unwrap_or(LookMatcher::default())
504     }
505 
    /// Return whether NFA compilation is configured to include an unanchored
    /// prefix.
    ///
    /// This is always true when not in test mode, since the knob for
    /// disabling the unanchored prefix only exists for tests.
    fn get_unanchored_prefix(&self) -> bool {
        // The `unanchored_prefix` field only exists in test builds; outside
        // of tests, the unanchored prefix is unconditionally enabled.
        #[cfg(test)]
        {
            self.unanchored_prefix.unwrap_or(true)
        }
        #[cfg(not(test))]
        {
            true
        }
    }
520 
    /// Overwrite the default configuration such that the options in `o` are
    /// always used. If an option in `o` is not set, then the corresponding
    /// option in `self` is used. If it's not set in `self` either, then it
    /// remains not set.
    pub(crate) fn overwrite(&self, o: Config) -> Config {
        Config {
            utf8: o.utf8.or(self.utf8),
            reverse: o.reverse.or(self.reverse),
            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
            shrink: o.shrink.or(self.shrink),
            which_captures: o.which_captures.or(self.which_captures),
            // `or_else` avoids cloning `self`'s matcher when `o` already
            // provides one. (The other fields are `Copy`, so plain `or` is
            // fine for them.)
            look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
            #[cfg(test)]
            unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
        }
    }
537 }
538 
/// A configuration indicating which kinds of
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
///
/// This configuration can be used with [`Config::which_captures`] to control
/// which capture states are compiled into a Thompson NFA.
///
/// The default configuration is [`WhichCaptures::All`].
#[derive(Clone, Copy, Debug)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// if one is only trying to build a DFA) or if they aren't supported (for
    /// example, a reverse NFA).
    None,
}

impl Default for WhichCaptures {
    /// The default is to include all capture states.
    fn default() -> WhichCaptures {
        WhichCaptures::All
    }
}

impl WhichCaptures {
    /// Returns true if this configuration indicates that no capture states
    /// should be produced in an NFA.
    pub fn is_none(&self) -> bool {
        match *self {
            WhichCaptures::None => true,
            _ => false,
        }
    }

    /// Returns true if this configuration indicates that some capture states
    /// should be added to an NFA. Note that this might only include capture
    /// states for implicit capture groups.
    pub fn is_any(&self) -> bool {
        !self.is_none()
    }
}
589 
590 /*
591 This compiler below uses Thompson's construction algorithm. The compiler takes
592 a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
593 is structured in a way that permits it to be executed by a virtual machine and
594 also used to efficiently build a DFA.
595 
The compiler deals with a slightly expanded set of NFA states compared to what
is in a final NFA (as exhibited by builder::State and nfa::State). Notably, a
598 compiler state includes an empty node that has exactly one unconditional
599 epsilon transition to the next state. In other words, it's a "goto" instruction
600 if one views Thompson's NFA as a set of bytecode instructions. These goto
601 instructions are removed in a subsequent phase before returning the NFA to the
602 caller. The purpose of these empty nodes is that they make the construction
603 algorithm substantially simpler to implement. We remove them before returning
604 to the caller because they can represent substantial overhead when traversing
605 the NFA graph (either while searching using the NFA directly or while building
606 a DFA).
607 
608 In the future, it would be nice to provide a Glushkov compiler as well, as it
609 would work well as a bit-parallel NFA for smaller regexes. But the Thompson
610 construction is one I'm more familiar with and seems more straight-forward to
611 deal with when it comes to large Unicode character classes.
612 
613 Internally, the compiler uses interior mutability to improve composition in the
614 face of the borrow checker. In particular, we'd really like to be able to write
615 things like this:
616 
617     self.c_concat(exprs.iter().map(|e| self.c(e)))
618 
619 Which elegantly uses iterators to build up a sequence of compiled regex
620 sub-expressions and then hands it off to the concatenating compiler routine.
621 Without interior mutability, the borrow checker won't let us borrow `self`
622 mutably both inside and outside the closure at the same time.
623 */
624 
625 /// A builder for compiling an NFA from a regex's high-level intermediate
626 /// representation (HIR).
627 ///
628 /// This compiler provides a way to translate a parsed regex pattern into an
629 /// NFA state graph. The NFA state graph can either be used directly to execute
630 /// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
631 ///
632 /// This compiler provides APIs both for compiling regex patterns directly from
633 /// their concrete syntax, or via a [`regex_syntax::hir::Hir`].
634 ///
635 /// This compiler has various options that may be configured via
636 /// [`thompson::Config`](Config).
637 ///
638 /// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
639 /// A `Builder` provides a lower level API that is uncoupled from a regex
640 /// pattern's concrete syntax or even its HIR. Instead, it permits stitching
641 /// together an NFA by hand. See its docs for examples.
642 ///
643 /// # Example: compilation from concrete syntax
644 ///
645 /// This shows how to compile an NFA from a pattern string while setting a size
646 /// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
647 ///
648 /// ```
649 /// use regex_automata::{
650 ///     nfa::thompson::{NFA, pikevm::PikeVM},
651 ///     Match,
652 /// };
653 ///
654 /// let config = NFA::config().nfa_size_limit(Some(1_000));
655 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
656 ///
657 /// let re = PikeVM::new_from_nfa(nfa)?;
658 /// let mut cache = re.create_cache();
659 /// let mut caps = re.create_captures();
660 /// let expected = Some(Match::must(0, 3..4));
661 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
662 /// assert_eq!(expected, caps.get_match());
663 ///
664 /// # Ok::<(), Box<dyn std::error::Error>>(())
665 /// ```
666 ///
667 /// # Example: compilation from HIR
668 ///
669 /// This shows how to hand assemble a regular expression via its HIR, and then
670 /// compile an NFA directly from it.
671 ///
672 /// ```
673 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
674 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
675 ///
676 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
677 ///     ClassBytesRange::new(b'0', b'9'),
678 ///     ClassBytesRange::new(b'A', b'Z'),
679 ///     ClassBytesRange::new(b'_', b'_'),
680 ///     ClassBytesRange::new(b'a', b'z'),
681 /// ])));
682 ///
683 /// let config = NFA::config().nfa_size_limit(Some(1_000));
684 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
685 ///
686 /// let re = PikeVM::new_from_nfa(nfa)?;
687 /// let mut cache = re.create_cache();
688 /// let mut caps = re.create_captures();
689 /// let expected = Some(Match::must(0, 3..4));
690 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
691 /// assert_eq!(expected, caps.get_match());
692 ///
693 /// # Ok::<(), Box<dyn std::error::Error>>(())
694 /// ```
#[derive(Clone, Debug)]
pub struct Compiler {
    /// A regex parser, used when compiling an NFA directly from a pattern
    /// string.
    parser: ParserBuilder,
    /// The compiler configuration.
    config: Config,
    /// The builder for actually constructing an NFA. This provides a
    /// convenient abstraction for writing a compiler.
    ///
    /// This and the fields below use `RefCell` for interior mutability so
    /// that compilation routines compose through `&self`; see the module
    /// comment above for the rationale.
    builder: RefCell<Builder>,
    /// State used for compiling character classes to UTF-8 byte automata.
    /// State is not retained between character class compilations. This just
    /// serves to amortize allocation to the extent possible.
    utf8_state: RefCell<Utf8State>,
    /// State used for arranging character classes in reverse into a trie.
    trie_state: RefCell<RangeTrie>,
    /// State used for caching common suffixes when compiling reverse UTF-8
    /// automata (for Unicode character classes).
    utf8_suffix: RefCell<Utf8SuffixMap>,
}
715 
impl Compiler {
    /// Create a new NFA builder with its default configuration.
    pub fn new() -> Compiler {
        Compiler {
            parser: ParserBuilder::new(),
            config: Config::default(),
            builder: RefCell::new(Builder::new()),
            utf8_state: RefCell::new(Utf8State::new()),
            trie_state: RefCell::new(RangeTrie::new()),
            // 1000 is the capacity of the bounded suffix cache used when
            // compiling reverse UTF-8 automata.
            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
        }
    }

    /// Compile the given regular expression pattern into an NFA.
    ///
    /// If there was a problem parsing the regex, then that error is returned.
    ///
    /// Otherwise, if there was a problem building the NFA, then an error is
    /// returned. The only error that can occur is if the compiled regex would
    /// exceed the size limits configured on this builder, or if any part of
    /// the NFA would exceed the integer representations used. (For example,
    /// too many states might plausibly occur on a 16-bit target.)
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(0, 3..4));
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
        // Single-pattern compilation is just multi-pattern compilation with
        // one pattern, which gets pattern ID 0.
        self.build_many(&[pattern])
    }

    /// Compile the given regular expression patterns into a single NFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many(&[
    ///     r"(?-u)\s",
    ///     r"(?-u)\w",
    /// ])?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<NFA, BuildError> {
        let mut hirs = vec![];
        // Parse every pattern up front, in order, so that pattern IDs
        // correspond to indices in the slice given. Parsing stops at the
        // first syntax error.
        for p in patterns {
            hirs.push(
                self.parser
                    .build()
                    .parse(p.as_ref())
                    .map_err(BuildError::syntax)?,
            );
            debug!("parsed: {:?}", p.as_ref());
        }
        self.build_many_from_hir(&hirs)
    }

    /// Compile the given high level intermediate representation of a regular
    /// expression into an NFA.
    ///
    /// If there was a problem building the NFA, then an error is returned. The
    /// only error that can occur is if the compiled regex would exceed the
    /// size limits configured on this builder, or if any part of the NFA would
    /// exceed the integer representations used. (For example, too many states
    /// might plausibly occur on a 16-bit target.)
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///     ClassBytesRange::new(b'0', b'9'),
    ///     ClassBytesRange::new(b'A', b'Z'),
    ///     ClassBytesRange::new(b'_', b'_'),
    ///     ClassBytesRange::new(b'a', b'z'),
    /// ])));
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(0, 3..4));
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
        self.build_many_from_hir(&[expr])
    }

    /// Compile the given high level intermediate representations of regular
    /// expressions into a single NFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hirs = &[
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'\t', b'\r'),
    ///         ClassBytesRange::new(b' ', b' '),
    ///     ]))),
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'0', b'9'),
    ///         ClassBytesRange::new(b'A', b'Z'),
    ///         ClassBytesRange::new(b'_', b'_'),
    ///         ClassBytesRange::new(b'a', b'z'),
    ///     ]))),
    /// ];
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many_from_hir<H: Borrow<Hir>>(
        &self,
        exprs: &[H],
    ) -> Result<NFA, BuildError> {
        self.compile(exprs)
    }

    /// Apply the given NFA configuration options to this builder.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::nfa::thompson::NFA;
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
    /// assert_eq!(nfa.pattern_len(), 1);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn configure(&mut self, config: Config) -> &mut Compiler {
        // 'overwrite' layers the given config over the existing one, so
        // options explicitly set in 'config' take precedence.
        self.config = self.config.overwrite(config);
        self
    }

    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This permits setting things like case insensitivity, Unicode and multi
    /// line mode.
    ///
    /// This syntax configuration only applies when an NFA is built directly
    /// from a pattern string. If an NFA is built from an HIR, then all syntax
    /// settings are ignored.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
    ///
    /// let syntax_config = syntax::Config::new().unicode(false);
    /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
    /// // If Unicode were enabled, the number of states would be much bigger.
    /// assert!(nfa.states().len() < 15);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn syntax(
        &mut self,
        config: crate::util::syntax::Config,
    ) -> &mut Compiler {
        // Syntax options only influence the parser, and thus only apply
        // when building an NFA from a pattern string.
        config.apply(&mut self.parser);
        self
    }
}
932 
933 impl Compiler {
934     /// Compile the sequence of HIR expressions given. Pattern IDs are
935     /// allocated starting from 0, in correspondence with the slice given.
936     ///
937     /// It is legal to provide an empty slice. In that case, the NFA returned
938     /// has no patterns and will never match anything.
compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError>939     fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
940         if exprs.len() > PatternID::LIMIT {
941             return Err(BuildError::too_many_patterns(exprs.len()));
942         }
943         if self.config.get_reverse()
944             && self.config.get_which_captures().is_any()
945         {
946             return Err(BuildError::unsupported_captures());
947         }
948 
949         self.builder.borrow_mut().clear();
950         self.builder.borrow_mut().set_utf8(self.config.get_utf8());
951         self.builder.borrow_mut().set_reverse(self.config.get_reverse());
952         self.builder
953             .borrow_mut()
954             .set_look_matcher(self.config.get_look_matcher());
955         self.builder
956             .borrow_mut()
957             .set_size_limit(self.config.get_nfa_size_limit())?;
958 
959         // We always add an unanchored prefix unless we were specifically told
960         // not to (for tests only), or if we know that the regex is anchored
961         // for all matches. When an unanchored prefix is not added, then the
962         // NFA's anchored and unanchored start states are equivalent.
963         let all_anchored = exprs.iter().all(|e| {
964             let props = e.borrow().properties();
965             if self.config.get_reverse() {
966                 props.look_set_suffix().contains(hir::Look::End)
967             } else {
968                 props.look_set_prefix().contains(hir::Look::Start)
969             }
970         });
971         let anchored = !self.config.get_unanchored_prefix() || all_anchored;
972         let unanchored_prefix = if anchored {
973             self.c_empty()?
974         } else {
975             self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
976         };
977 
978         let compiled = self.c_alt_iter(exprs.iter().map(|e| {
979             let _ = self.start_pattern()?;
980             let one = self.c_cap(0, None, e.borrow())?;
981             let match_state_id = self.add_match()?;
982             self.patch(one.end, match_state_id)?;
983             let _ = self.finish_pattern(one.start)?;
984             Ok(ThompsonRef { start: one.start, end: match_state_id })
985         }))?;
986         self.patch(unanchored_prefix.end, compiled.start)?;
987         let nfa = self
988             .builder
989             .borrow_mut()
990             .build(compiled.start, unanchored_prefix.start)?;
991 
992         debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
993         Ok(nfa)
994     }
995 
996     /// Compile an arbitrary HIR expression.
c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError>997     fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
998         use regex_syntax::hir::{Class, HirKind::*};
999 
1000         match *expr.kind() {
1001             Empty => self.c_empty(),
1002             Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
1003             Class(Class::Bytes(ref c)) => self.c_byte_class(c),
1004             Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
1005             Look(ref look) => self.c_look(look),
1006             Repetition(ref rep) => self.c_repetition(rep),
1007             Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
1008             Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
1009             Alternation(ref es) => self.c_alt_slice(es),
1010         }
1011     }
1012 
1013     /// Compile a concatenation of the sub-expressions yielded by the given
1014     /// iterator. If the iterator yields no elements, then this compiles down
1015     /// to an "empty" state that always matches.
1016     ///
1017     /// If the compiler is in reverse mode, then the expressions given are
1018     /// automatically compiled in reverse.
c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> where I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,1019     fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1020     where
1021         I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1022     {
1023         let first = if self.is_reverse() { it.next_back() } else { it.next() };
1024         let ThompsonRef { start, mut end } = match first {
1025             Some(result) => result?,
1026             None => return self.c_empty(),
1027         };
1028         loop {
1029             let next =
1030                 if self.is_reverse() { it.next_back() } else { it.next() };
1031             let compiled = match next {
1032                 Some(result) => result?,
1033                 None => break,
1034             };
1035             self.patch(end, compiled.start)?;
1036             end = compiled.end;
1037         }
1038         Ok(ThompsonRef { start, end })
1039     }
1040 
1041     /// Compile an alternation of the given HIR values.
1042     ///
1043     /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
1044     /// of an iterator of compiled NFA subgraphs. The point of accepting a
1045     /// slice here is that it opens up some optimization opportunities. For
1046     /// example, if all of the HIR values are literals, then this routine might
1047     /// re-shuffle them to make NFA epsilon closures substantially faster.
c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError>1048     fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
1049         // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
1050         let literal_count = exprs
1051             .iter()
1052             .filter(|e| {
1053                 matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
1054             })
1055             .count();
1056         if literal_count <= 1 || literal_count < exprs.len() {
1057             return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
1058         }
1059 
1060         let mut trie = if self.is_reverse() {
1061             LiteralTrie::reverse()
1062         } else {
1063             LiteralTrie::forward()
1064         };
1065         for expr in exprs.iter() {
1066             let literal = match *expr.kind() {
1067                 hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
1068                 _ => unreachable!(),
1069             };
1070             trie.add(literal)?;
1071         }
1072         trie.compile(&mut self.builder.borrow_mut())
1073     }
1074 
1075     /// Compile an alternation, where each element yielded by the given
1076     /// iterator represents an item in the alternation. If the iterator yields
1077     /// no elements, then this compiles down to a "fail" state.
1078     ///
1079     /// In an alternation, expressions appearing earlier are "preferred" at
1080     /// match time over expressions appearing later. At least, this is true
1081     /// when using "leftmost first" match semantics. (If "leftmost longest" are
1082     /// ever added in the future, then this preference order of priority would
1083     /// not apply in that mode.)
c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> where I: Iterator<Item = Result<ThompsonRef, BuildError>>,1084     fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1085     where
1086         I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1087     {
1088         let first = match it.next() {
1089             None => return self.c_fail(),
1090             Some(result) => result?,
1091         };
1092         let second = match it.next() {
1093             None => return Ok(first),
1094             Some(result) => result?,
1095         };
1096 
1097         let union = self.add_union()?;
1098         let end = self.add_empty()?;
1099         self.patch(union, first.start)?;
1100         self.patch(first.end, end)?;
1101         self.patch(union, second.start)?;
1102         self.patch(second.end, end)?;
1103         for result in it {
1104             let compiled = result?;
1105             self.patch(union, compiled.start)?;
1106             self.patch(compiled.end, end)?;
1107         }
1108         Ok(ThompsonRef { start: union, end })
1109     }
1110 
1111     /// Compile the given capture sub-expression. `expr` should be the
1112     /// sub-expression contained inside the capture. If "capture" states are
1113     /// enabled, then they are added as appropriate.
1114     ///
1115     /// This accepts the pieces of a capture instead of a `hir::Capture` so
1116     /// that it's easy to manufacture a "fake" group when necessary, e.g., for
1117     /// adding the entire pattern as if it were a group in order to create
1118     /// appropriate "capture" states in the NFA.
c_cap( &self, index: u32, name: Option<&str>, expr: &Hir, ) -> Result<ThompsonRef, BuildError>1119     fn c_cap(
1120         &self,
1121         index: u32,
1122         name: Option<&str>,
1123         expr: &Hir,
1124     ) -> Result<ThompsonRef, BuildError> {
1125         match self.config.get_which_captures() {
1126             // No capture states means we always skip them.
1127             WhichCaptures::None => return self.c(expr),
1128             // Implicit captures states means we only add when index==0 since
1129             // index==0 implies the group is implicit.
1130             WhichCaptures::Implicit if index > 0 => return self.c(expr),
1131             _ => {}
1132         }
1133 
1134         let start = self.add_capture_start(index, name)?;
1135         let inner = self.c(expr)?;
1136         let end = self.add_capture_end(index)?;
1137         self.patch(start, inner.start)?;
1138         self.patch(inner.end, end)?;
1139         Ok(ThompsonRef { start, end })
1140     }
1141 
1142     /// Compile the given repetition expression. This handles all types of
1143     /// repetitions and greediness.
c_repetition( &self, rep: &hir::Repetition, ) -> Result<ThompsonRef, BuildError>1144     fn c_repetition(
1145         &self,
1146         rep: &hir::Repetition,
1147     ) -> Result<ThompsonRef, BuildError> {
1148         match (rep.min, rep.max) {
1149             (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
1150             (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
1151             (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
1152             (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
1153         }
1154     }
1155 
1156     /// Compile the given expression such that it matches at least `min` times,
1157     /// but no more than `max` times.
1158     ///
1159     /// When `greedy` is true, then the preference is for the expression to
1160     /// match as much as possible. Otherwise, it will match as little as
1161     /// possible.
c_bounded( &self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result<ThompsonRef, BuildError>1162     fn c_bounded(
1163         &self,
1164         expr: &Hir,
1165         greedy: bool,
1166         min: u32,
1167         max: u32,
1168     ) -> Result<ThompsonRef, BuildError> {
1169         let prefix = self.c_exactly(expr, min)?;
1170         if min == max {
1171             return Ok(prefix);
1172         }
1173 
1174         // It is tempting here to compile the rest here as a concatenation
1175         // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
1176         // were `aaa?a?a?`. The problem here is that it leads to this program:
1177         //
1178         //     >000000: 61 => 01
1179         //      000001: 61 => 02
1180         //      000002: union(03, 04)
1181         //      000003: 61 => 04
1182         //      000004: union(05, 06)
1183         //      000005: 61 => 06
1184         //      000006: union(07, 08)
1185         //      000007: 61 => 08
1186         //      000008: MATCH
1187         //
1188         // And effectively, once you hit state 2, the epsilon closure will
1189         // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
1190         // to instead compile it like so:
1191         //
1192         //     >000000: 61 => 01
1193         //      000001: 61 => 02
1194         //      000002: union(03, 08)
1195         //      000003: 61 => 04
1196         //      000004: union(05, 08)
1197         //      000005: 61 => 06
1198         //      000006: union(07, 08)
1199         //      000007: 61 => 08
1200         //      000008: MATCH
1201         //
1202         // So that the epsilon closure of state 2 is now just 3 and 8.
1203         let empty = self.add_empty()?;
1204         let mut prev_end = prefix.end;
1205         for _ in min..max {
1206             let union = if greedy {
1207                 self.add_union()
1208             } else {
1209                 self.add_union_reverse()
1210             }?;
1211             let compiled = self.c(expr)?;
1212             self.patch(prev_end, union)?;
1213             self.patch(union, compiled.start)?;
1214             self.patch(union, empty)?;
1215             prev_end = compiled.end;
1216         }
1217         self.patch(prev_end, empty)?;
1218         Ok(ThompsonRef { start: prefix.start, end: empty })
1219     }
1220 
1221     /// Compile the given expression such that it may be matched `n` or more
1222     /// times, where `n` can be any integer. (Although a particularly large
1223     /// integer is likely to run afoul of any configured size limits.)
1224     ///
1225     /// When `greedy` is true, then the preference is for the expression to
1226     /// match as much as possible. Otherwise, it will match as little as
1227     /// possible.
c_at_least( &self, expr: &Hir, greedy: bool, n: u32, ) -> Result<ThompsonRef, BuildError>1228     fn c_at_least(
1229         &self,
1230         expr: &Hir,
1231         greedy: bool,
1232         n: u32,
1233     ) -> Result<ThompsonRef, BuildError> {
1234         if n == 0 {
1235             // When the expression cannot match the empty string, then we
1236             // can get away with something much simpler: just one 'alt'
1237             // instruction that optionally repeats itself. But if the expr
1238             // can match the empty string... see below.
1239             if expr.properties().minimum_len().map_or(false, |len| len > 0) {
1240                 let union = if greedy {
1241                     self.add_union()
1242                 } else {
1243                     self.add_union_reverse()
1244                 }?;
1245                 let compiled = self.c(expr)?;
1246                 self.patch(union, compiled.start)?;
1247                 self.patch(compiled.end, union)?;
1248                 return Ok(ThompsonRef { start: union, end: union });
1249             }
1250 
1251             // What's going on here? Shouldn't x* be simpler than this? It
1252             // turns out that when implementing leftmost-first (Perl-like)
1253             // match semantics, x* results in an incorrect preference order
1254             // when computing the transitive closure of states if and only if
1255             // 'x' can match the empty string. So instead, we compile x* as
1256             // (x+)?, which preserves the correct preference order.
1257             //
1258             // See: https://github.com/rust-lang/regex/issues/779
1259             let compiled = self.c(expr)?;
1260             let plus = if greedy {
1261                 self.add_union()
1262             } else {
1263                 self.add_union_reverse()
1264             }?;
1265             self.patch(compiled.end, plus)?;
1266             self.patch(plus, compiled.start)?;
1267 
1268             let question = if greedy {
1269                 self.add_union()
1270             } else {
1271                 self.add_union_reverse()
1272             }?;
1273             let empty = self.add_empty()?;
1274             self.patch(question, compiled.start)?;
1275             self.patch(question, empty)?;
1276             self.patch(plus, empty)?;
1277             Ok(ThompsonRef { start: question, end: empty })
1278         } else if n == 1 {
1279             let compiled = self.c(expr)?;
1280             let union = if greedy {
1281                 self.add_union()
1282             } else {
1283                 self.add_union_reverse()
1284             }?;
1285             self.patch(compiled.end, union)?;
1286             self.patch(union, compiled.start)?;
1287             Ok(ThompsonRef { start: compiled.start, end: union })
1288         } else {
1289             let prefix = self.c_exactly(expr, n - 1)?;
1290             let last = self.c(expr)?;
1291             let union = if greedy {
1292                 self.add_union()
1293             } else {
1294                 self.add_union_reverse()
1295             }?;
1296             self.patch(prefix.end, last.start)?;
1297             self.patch(last.end, union)?;
1298             self.patch(union, last.start)?;
1299             Ok(ThompsonRef { start: prefix.start, end: union })
1300         }
1301     }
1302 
1303     /// Compile the given expression such that it may be matched zero or one
1304     /// times.
1305     ///
1306     /// When `greedy` is true, then the preference is for the expression to
1307     /// match as much as possible. Otherwise, it will match as little as
1308     /// possible.
c_zero_or_one( &self, expr: &Hir, greedy: bool, ) -> Result<ThompsonRef, BuildError>1309     fn c_zero_or_one(
1310         &self,
1311         expr: &Hir,
1312         greedy: bool,
1313     ) -> Result<ThompsonRef, BuildError> {
1314         let union =
1315             if greedy { self.add_union() } else { self.add_union_reverse() }?;
1316         let compiled = self.c(expr)?;
1317         let empty = self.add_empty()?;
1318         self.patch(union, compiled.start)?;
1319         self.patch(union, empty)?;
1320         self.patch(compiled.end, empty)?;
1321         Ok(ThompsonRef { start: union, end: empty })
1322     }
1323 
1324     /// Compile the given HIR expression exactly `n` times.
c_exactly( &self, expr: &Hir, n: u32, ) -> Result<ThompsonRef, BuildError>1325     fn c_exactly(
1326         &self,
1327         expr: &Hir,
1328         n: u32,
1329     ) -> Result<ThompsonRef, BuildError> {
1330         let it = (0..n).map(|_| self.c(expr));
1331         self.c_concat(it)
1332     }
1333 
1334     /// Compile the given byte oriented character class.
1335     ///
1336     /// This uses "sparse" states to represent an alternation between ranges in
1337     /// this character class. We can use "sparse" states instead of stitching
1338     /// together a "union" state because all ranges in a character class have
1339     /// equal priority *and* are non-overlapping (thus, only one can match, so
1340     /// there's never a question of priority in the first place). This saves a
1341     /// fair bit of overhead when traversing an NFA.
1342     ///
1343     /// This routine compiles an empty character class into a "fail" state.
c_byte_class( &self, cls: &hir::ClassBytes, ) -> Result<ThompsonRef, BuildError>1344     fn c_byte_class(
1345         &self,
1346         cls: &hir::ClassBytes,
1347     ) -> Result<ThompsonRef, BuildError> {
1348         let end = self.add_empty()?;
1349         let mut trans = Vec::with_capacity(cls.ranges().len());
1350         for r in cls.iter() {
1351             trans.push(Transition {
1352                 start: r.start(),
1353                 end: r.end(),
1354                 next: end,
1355             });
1356         }
1357         Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1358     }
1359 
1360     /// Compile the given Unicode character class.
1361     ///
1362     /// This routine specifically tries to use various types of compression,
1363     /// since UTF-8 automata of large classes can get quite large. The specific
1364     /// type of compression used depends on forward vs reverse compilation, and
1365     /// whether NFA shrinking is enabled or not.
1366     ///
1367     /// Aside from repetitions causing lots of repeat group, this is like the
1368     /// single most expensive part of regex compilation. Therefore, a large part
1369     /// of the expense of compilation may be reduce by disabling Unicode in the
1370     /// pattern.
1371     ///
1372     /// This routine compiles an empty character class into a "fail" state.
c_unicode_class( &self, cls: &hir::ClassUnicode, ) -> Result<ThompsonRef, BuildError>1373     fn c_unicode_class(
1374         &self,
1375         cls: &hir::ClassUnicode,
1376     ) -> Result<ThompsonRef, BuildError> {
1377         // If all we have are ASCII ranges wrapped in a Unicode package, then
1378         // there is zero reason to bring out the big guns. We can fit all ASCII
1379         // ranges within a single sparse state.
1380         if cls.is_ascii() {
1381             let end = self.add_empty()?;
1382             let mut trans = Vec::with_capacity(cls.ranges().len());
1383             for r in cls.iter() {
1384                 // The unwraps below are OK because we've verified that this
1385                 // class only contains ASCII codepoints.
1386                 trans.push(Transition {
1387                     // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
1388                     start: u8::try_from(u32::from(r.start())).unwrap(),
1389                     end: u8::try_from(u32::from(r.end())).unwrap(),
1390                     next: end,
1391                 });
1392             }
1393             Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1394         } else if self.is_reverse() {
1395             if !self.config.get_shrink() {
1396                 // When we don't want to spend the extra time shrinking, we
1397                 // compile the UTF-8 automaton in reverse using something like
1398                 // the "naive" approach, but will attempt to re-use common
1399                 // suffixes.
1400                 self.c_unicode_class_reverse_with_suffix(cls)
1401             } else {
1402                 // When we want to shrink our NFA for reverse UTF-8 automata,
1403                 // we cannot feed UTF-8 sequences directly to the UTF-8
1404                 // compiler, since the UTF-8 compiler requires all sequences
1405                 // to be lexicographically sorted. Instead, we organize our
1406                 // sequences into a range trie, which can then output our
1407                 // sequences in the correct order. Unfortunately, building the
1408                 // range trie is fairly expensive (but not nearly as expensive
1409                 // as building a DFA). Hence the reason why the 'shrink' option
1410                 // exists, so that this path can be toggled off. For example,
1411                 // we might want to turn this off if we know we won't be
1412                 // compiling a DFA.
1413                 let mut trie = self.trie_state.borrow_mut();
1414                 trie.clear();
1415 
1416                 for rng in cls.iter() {
1417                     for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
1418                         seq.reverse();
1419                         trie.insert(seq.as_slice());
1420                     }
1421                 }
1422                 let mut builder = self.builder.borrow_mut();
1423                 let mut utf8_state = self.utf8_state.borrow_mut();
1424                 let mut utf8c =
1425                     Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
1426                 trie.iter(|seq| {
1427                     utf8c.add(&seq)?;
1428                     Ok(())
1429                 })?;
1430                 utf8c.finish()
1431             }
1432         } else {
1433             // In the forward direction, we always shrink our UTF-8 automata
1434             // because we can stream it right into the UTF-8 compiler. There
1435             // is almost no downside (in either memory or time) to using this
1436             // approach.
1437             let mut builder = self.builder.borrow_mut();
1438             let mut utf8_state = self.utf8_state.borrow_mut();
1439             let mut utf8c =
1440                 Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
1441             for rng in cls.iter() {
1442                 for seq in Utf8Sequences::new(rng.start(), rng.end()) {
1443                     utf8c.add(seq.as_slice())?;
1444                 }
1445             }
1446             utf8c.finish()
1447         }
1448 
1449         // For reference, the code below is the "naive" version of compiling a
1450         // UTF-8 automaton. It is deliciously simple (and works for both the
1451         // forward and reverse cases), but will unfortunately produce very
1452         // large NFAs. When compiling a forward automaton, the size difference
1453         // can sometimes be an order of magnitude. For example, the '\w' regex
1454         // will generate about ~3000 NFA states using the naive approach below,
1455         // but only 283 states when using the approach above. This is because
1456         // the approach above actually compiles a *minimal* (or near minimal,
1457         // because of the bounded hashmap for reusing equivalent states) UTF-8
1458         // automaton.
1459         //
1460         // The code below is kept as a reference point in order to make it
1461         // easier to understand the higher level goal here. Although, it will
1462         // almost certainly bit-rot, so keep that in mind. Also, if you try to
1463         // use it, some of the tests in this module will fail because they look
        // for terser byte code produced by the more optimized handling above.
1465         // But the integration test suite should still pass.
1466         //
1467         // One good example of the substantial difference this can make is to
1468         // compare and contrast performance of the Pike VM when the code below
1469         // is active vs the code above. Here's an example to try:
1470         //
1471         //   regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
1472         //
1473         // With Unicode classes generated below, this search takes about 45s on
1474         // my machine. But with the compressed version above, the search takes
1475         // only around 1.4s. The NFA is also 20% smaller. This is in part due
1476         // to the compression, but also because of the utilization of 'sparse'
1477         // NFA states. They lead to much less state shuffling during the NFA
1478         // search.
1479         /*
1480         let it = cls
1481             .iter()
1482             .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
1483             .map(|seq| {
1484                 let it = seq
1485                     .as_slice()
1486                     .iter()
1487                     .map(|rng| self.c_range(rng.start, rng.end));
1488                 self.c_concat(it)
1489             });
1490         self.c_alt_iter(it)
1491         */
1492     }
1493 
1494     /// Compile the given Unicode character class in reverse with suffix
1495     /// caching.
1496     ///
1497     /// This is a "quick" way to compile large Unicode classes into reverse
1498     /// UTF-8 automata while doing a small amount of compression on that
1499     /// automata by reusing common suffixes.
1500     ///
1501     /// A more comprehensive compression scheme can be accomplished by using
1502     /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
1503     /// rqanges, and then use Daciuk's algorithm via `Utf8Compiler`.
1504     ///
1505     /// This is the technique used when "NFA shrinking" is disabled.
1506     ///
1507     /// (This also tries to use "sparse" states where possible, just like
1508     /// `c_byte_class` does.)
c_unicode_class_reverse_with_suffix( &self, cls: &hir::ClassUnicode, ) -> Result<ThompsonRef, BuildError>1509     fn c_unicode_class_reverse_with_suffix(
1510         &self,
1511         cls: &hir::ClassUnicode,
1512     ) -> Result<ThompsonRef, BuildError> {
1513         // N.B. It would likely be better to cache common *prefixes* in the
1514         // reverse direction, but it's not quite clear how to do that. The
1515         // advantage of caching suffixes is that it does give us a win, and
1516         // has a very small additional overhead.
1517         let mut cache = self.utf8_suffix.borrow_mut();
1518         cache.clear();
1519 
1520         let union = self.add_union()?;
1521         let alt_end = self.add_empty()?;
1522         for urng in cls.iter() {
1523             for seq in Utf8Sequences::new(urng.start(), urng.end()) {
1524                 let mut end = alt_end;
1525                 for brng in seq.as_slice() {
1526                     let key = Utf8SuffixKey {
1527                         from: end,
1528                         start: brng.start,
1529                         end: brng.end,
1530                     };
1531                     let hash = cache.hash(&key);
1532                     if let Some(id) = cache.get(&key, hash) {
1533                         end = id;
1534                         continue;
1535                     }
1536 
1537                     let compiled = self.c_range(brng.start, brng.end)?;
1538                     self.patch(compiled.end, end)?;
1539                     end = compiled.start;
1540                     cache.set(key, hash, end);
1541                 }
1542                 self.patch(union, end)?;
1543             }
1544         }
1545         Ok(ThompsonRef { start: union, end: alt_end })
1546     }
1547 
1548     /// Compile the given HIR look-around assertion to an NFA look-around
1549     /// assertion.
c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError>1550     fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
1551         let look = match *anchor {
1552             hir::Look::Start => Look::Start,
1553             hir::Look::End => Look::End,
1554             hir::Look::StartLF => Look::StartLF,
1555             hir::Look::EndLF => Look::EndLF,
1556             hir::Look::StartCRLF => Look::StartCRLF,
1557             hir::Look::EndCRLF => Look::EndCRLF,
1558             hir::Look::WordAscii => Look::WordAscii,
1559             hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
1560             hir::Look::WordUnicode => Look::WordUnicode,
1561             hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1562             hir::Look::WordStartAscii => Look::WordStartAscii,
1563             hir::Look::WordEndAscii => Look::WordEndAscii,
1564             hir::Look::WordStartUnicode => Look::WordStartUnicode,
1565             hir::Look::WordEndUnicode => Look::WordEndUnicode,
1566             hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
1567             hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
1568             hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
1569             hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
1570         };
1571         let id = self.add_look(look)?;
1572         Ok(ThompsonRef { start: id, end: id })
1573     }
1574 
1575     /// Compile the given byte string to a concatenation of bytes.
c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError>1576     fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
1577         self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
1578     }
1579 
1580     /// Compile a "range" state with one transition that may only be followed
1581     /// if the input byte is in the (inclusive) range given.
1582     ///
1583     /// Both the `start` and `end` locations point to the state created.
1584     /// Callers will likely want to keep the `start`, but patch the `end` to
1585     /// point to some other state.
c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError>1586     fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
1587         let id = self.add_range(start, end)?;
1588         Ok(ThompsonRef { start: id, end: id })
1589     }
1590 
1591     /// Compile an "empty" state with one unconditional epsilon transition.
1592     ///
1593     /// Both the `start` and `end` locations point to the state created.
1594     /// Callers will likely want to keep the `start`, but patch the `end` to
1595     /// point to some other state.
c_empty(&self) -> Result<ThompsonRef, BuildError>1596     fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
1597         let id = self.add_empty()?;
1598         Ok(ThompsonRef { start: id, end: id })
1599     }
1600 
1601     /// Compile a "fail" state that can never have any outgoing transitions.
c_fail(&self) -> Result<ThompsonRef, BuildError>1602     fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
1603         let id = self.add_fail()?;
1604         Ok(ThompsonRef { start: id, end: id })
1605     }
1606 
1607     // The below helpers are meant to be simple wrappers around the
1608     // corresponding Builder methods. For the most part, they let us write
1609     // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
1610     // the latter is a mouthful. Some of the methods do inject a little bit
1611     // of extra logic. e.g., Flipping look-around operators when compiling in
1612     // reverse mode.
1613 
patch(&self, from: StateID, to: StateID) -> Result<(), BuildError>1614     fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
1615         self.builder.borrow_mut().patch(from, to)
1616     }
1617 
start_pattern(&self) -> Result<PatternID, BuildError>1618     fn start_pattern(&self) -> Result<PatternID, BuildError> {
1619         self.builder.borrow_mut().start_pattern()
1620     }
1621 
finish_pattern( &self, start_id: StateID, ) -> Result<PatternID, BuildError>1622     fn finish_pattern(
1623         &self,
1624         start_id: StateID,
1625     ) -> Result<PatternID, BuildError> {
1626         self.builder.borrow_mut().finish_pattern(start_id)
1627     }
1628 
add_empty(&self) -> Result<StateID, BuildError>1629     fn add_empty(&self) -> Result<StateID, BuildError> {
1630         self.builder.borrow_mut().add_empty()
1631     }
1632 
add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError>1633     fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
1634         self.builder.borrow_mut().add_range(Transition {
1635             start,
1636             end,
1637             next: StateID::ZERO,
1638         })
1639     }
1640 
add_sparse( &self, ranges: Vec<Transition>, ) -> Result<StateID, BuildError>1641     fn add_sparse(
1642         &self,
1643         ranges: Vec<Transition>,
1644     ) -> Result<StateID, BuildError> {
1645         self.builder.borrow_mut().add_sparse(ranges)
1646     }
1647 
add_look(&self, mut look: Look) -> Result<StateID, BuildError>1648     fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
1649         if self.is_reverse() {
1650             look = look.reversed();
1651         }
1652         self.builder.borrow_mut().add_look(StateID::ZERO, look)
1653     }
1654 
add_union(&self) -> Result<StateID, BuildError>1655     fn add_union(&self) -> Result<StateID, BuildError> {
1656         self.builder.borrow_mut().add_union(vec![])
1657     }
1658 
add_union_reverse(&self) -> Result<StateID, BuildError>1659     fn add_union_reverse(&self) -> Result<StateID, BuildError> {
1660         self.builder.borrow_mut().add_union_reverse(vec![])
1661     }
1662 
add_capture_start( &self, capture_index: u32, name: Option<&str>, ) -> Result<StateID, BuildError>1663     fn add_capture_start(
1664         &self,
1665         capture_index: u32,
1666         name: Option<&str>,
1667     ) -> Result<StateID, BuildError> {
1668         let name = name.map(|n| Arc::from(n));
1669         self.builder.borrow_mut().add_capture_start(
1670             StateID::ZERO,
1671             capture_index,
1672             name,
1673         )
1674     }
1675 
add_capture_end( &self, capture_index: u32, ) -> Result<StateID, BuildError>1676     fn add_capture_end(
1677         &self,
1678         capture_index: u32,
1679     ) -> Result<StateID, BuildError> {
1680         self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
1681     }
1682 
add_fail(&self) -> Result<StateID, BuildError>1683     fn add_fail(&self) -> Result<StateID, BuildError> {
1684         self.builder.borrow_mut().add_fail()
1685     }
1686 
add_match(&self) -> Result<StateID, BuildError>1687     fn add_match(&self) -> Result<StateID, BuildError> {
1688         self.builder.borrow_mut().add_match()
1689     }
1690 
    /// Returns true if and only if this compiler is configured to build a
    /// reverse NFA.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
1694 }
1695 
/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The initial state of this sub-graph.
    pub(crate) start: StateID,
    /// The final state of this sub-graph.
    pub(crate) end: StateID,
}
1704 
/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
/// from a lexicographically sorted sequence of strings in linear time.
///
/// The trick here is that any Unicode codepoint range can be converted to
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
/// together via an alternation is trivial, and indeed, it works. However,
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
/// minimal because we use a bounded cache of previously built DFA states.)
///
/// The drawback is that this sadly doesn't work for reverse automata, since
/// the ranges are no longer in lexicographic order. For that, we invented the
/// range trie (which gets its own module). Once a range trie is built, we then
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
///
/// The high level idea is described here:
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
///
/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Scratch space (state cache plus a stack of uncompiled nodes),
    /// cleared on construction and reusable across compilations.
    state: &'a mut Utf8State,
    /// The final state of the automaton being built; it becomes the `end`
    /// of the compiled `ThompsonRef`.
    target: StateID,
}
1731 
/// Reusable scratch space for `Utf8Compiler`.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache mapping already-compiled transition sets to their
    /// NFA state IDs, so equivalent states are reused instead of duplicated.
    compiled: Utf8BoundedMap,
    /// A stack of nodes that have not yet been frozen into NFA states.
    uncompiled: Vec<Utf8Node>,
}
1737 
/// A single node on the `Utf8State::uncompiled` stack.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Transitions whose target states have already been determined.
    trans: Vec<Transition>,
    /// A pending transition whose target state is not yet known.
    last: Option<Utf8LastTransition>,
}
1743 
/// A transition whose (inclusive) byte range is known, but whose target
/// state has not yet been determined.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    /// Inclusive lower bound of the byte range.
    start: u8,
    /// Inclusive upper bound of the byte range.
    end: u8,
}
1749 
1750 impl Utf8State {
new() -> Utf8State1751     fn new() -> Utf8State {
1752         Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1753     }
1754 
clear(&mut self)1755     fn clear(&mut self) {
1756         self.compiled.clear();
1757         self.uncompiled.clear();
1758     }
1759 }
1760 
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that adds states to `builder`, using
    /// `state` as scratch space (which is cleared before use).
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        // This empty state serves as the shared final state of the
        // automaton being built.
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        // Seed the uncompiled stack with the root node.
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Compile all remaining uncompiled nodes and return the start/end of
    /// the finished automaton.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add one UTF-8 sequence (a path of byte ranges) to the automaton.
    ///
    /// Sequences are expected to arrive in lexicographic order; each call
    /// reuses the prefix shared with the previously added sequence.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Count how many leading byte ranges are identical to the pending
        // transitions of the nodes currently on the stack, i.e., the prefix
        // shared with the previously added sequence.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        // A sequence must differ from its predecessor somewhere; otherwise
        // the input wasn't sorted/deduplicated.
        assert!(prefix_len < ranges.len());
        // Everything past the shared prefix can never change again, so
        // freeze it into NFA states now...
        self.compile_from(prefix_len)?;
        // ...and push this sequence's unique suffix onto the stack.
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile every node above stack depth `from` (deepest last), wiring
    /// each popped node's pending transition to the state compiled after it.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        // The node at depth `from` stays on the stack; only its pending
        // transition gets resolved to point at what we just compiled.
        self.top_last_freeze(next);
        Ok(())
    }

    /// Turn a frozen node's transitions into an NFA sparse state, reusing
    /// a previously compiled equivalent state when the bounded cache has
    /// one. This reuse is what makes the automaton (near) minimal.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the given non-empty suffix of byte ranges onto the uncompiled
    /// stack: the first range becomes the top node's pending transition,
    /// and each remaining range becomes a new node.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        // The top node's pending slot must be free; `compile_from` resolved
        // it before this call.
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions onto the uncompiled stack.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top node, resolving its pending transition to `next`, and
    /// return its finished transitions.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node. It must be the only node left, with no pending
    /// transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Resolve the pending transition of the top-of-stack node to `next`
    /// without popping it.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
1866 
1867 impl Utf8Node {
set_last_transition(&mut self, next: StateID)1868     fn set_last_transition(&mut self, next: StateID) {
1869         if let Some(last) = self.last.take() {
1870             self.trans.push(Transition {
1871                 start: last.start,
1872                 end: last.end,
1873                 next,
1874             });
1875         }
1876     }
1877 }
1878 
1879 #[cfg(test)]
1880 mod tests {
1881     use alloc::vec;
1882 
1883     use crate::{
1884         nfa::thompson::{SparseTransitions, State},
1885         util::primitives::SmallIndex,
1886     };
1887 
1888     use super::*;
1889 
build(pattern: &str) -> NFA1890     fn build(pattern: &str) -> NFA {
1891         NFA::compiler()
1892             .configure(
1893                 NFA::config()
1894                     .which_captures(WhichCaptures::None)
1895                     .unanchored_prefix(false),
1896             )
1897             .build(pattern)
1898             .unwrap()
1899     }
1900 
    /// Shorthand for constructing a `PatternID` in tests.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }
1904 
    /// Shorthand for constructing a `StateID` in tests.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }
1908 
s_byte(byte: u8, next: usize) -> State1909     fn s_byte(byte: u8, next: usize) -> State {
1910         let next = sid(next);
1911         let trans = Transition { start: byte, end: byte, next };
1912         State::ByteRange { trans }
1913     }
1914 
s_range(start: u8, end: u8, next: usize) -> State1915     fn s_range(start: u8, end: u8, next: usize) -> State {
1916         let next = sid(next);
1917         let trans = Transition { start, end, next };
1918         State::ByteRange { trans }
1919     }
1920 
s_sparse(transitions: &[(u8, u8, usize)]) -> State1921     fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
1922         let transitions = transitions
1923             .iter()
1924             .map(|&(start, end, next)| Transition {
1925                 start,
1926                 end,
1927                 next: sid(next),
1928             })
1929             .collect();
1930         State::Sparse(SparseTransitions { transitions })
1931     }
1932 
s_look(look: Look, next: usize) -> State1933     fn s_look(look: Look, next: usize) -> State {
1934         let next = sid(next);
1935         State::Look { look, next }
1936     }
1937 
s_bin_union(alt1: usize, alt2: usize) -> State1938     fn s_bin_union(alt1: usize, alt2: usize) -> State {
1939         State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
1940     }
1941 
s_union(alts: &[usize]) -> State1942     fn s_union(alts: &[usize]) -> State {
1943         State::Union {
1944             alternates: alts
1945                 .iter()
1946                 .map(|&id| sid(id))
1947                 .collect::<Vec<StateID>>()
1948                 .into_boxed_slice(),
1949         }
1950     }
1951 
s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State1952     fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
1953         State::Capture {
1954             next: sid(next),
1955             pattern_id: pid(pattern),
1956             group_index: SmallIndex::new(index).unwrap(),
1957             slot: SmallIndex::new(slot).unwrap(),
1958         }
1959     }
1960 
    /// An expected fail state.
    fn s_fail() -> State {
        State::Fail
    }
1964 
    /// An expected match state for pattern `id`.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }
1968 
    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // States 0 and 1 form a lazy loop over any byte (0-255):
                // the union tries the pattern branch (state 2) first.
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }
1987 
    // A leading `^` anchor makes the unanchored prefix unnecessary.
    #[test]
    fn compile_no_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
        );
    }
1999 
    // A trailing `$` anchor does NOT remove the unanchored prefix, since the
    // match may still start anywhere.
    #[test]
    fn compile_yes_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_look(Look::End, 4),
                s_match(0),
            ]
        );
    }
2017 
    // In reverse mode, a leading `^` behaves like a trailing anchor, so the
    // unanchored prefix is kept.
    #[test]
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                // Anchors get flipped in a reverse automaton.
                s_look(Look::End, 4),
                s_match(0),
            ],
        );
    }
2040 
    // In reverse mode, a trailing `$` behaves like a leading anchor, so the
    // unanchored prefix is dropped.
    #[test]
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // Anchors get flipped in a reverse automaton.
                s_look(Look::Start, 1),
                s_byte(b'a', 2),
                s_match(0),
            ],
        );
    }
2061 
    // The empty regex compiles to a single match state.
    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }
2066 
    // Literals compile to a chain of single-byte states, one per byte of
    // the (UTF-8 encoded) literal.
    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        // A multi-byte codepoint gets one state per UTF-8 byte.
        assert_eq!(
            build("☃").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }
2091 
    // A contiguous ASCII range compiles to a single range state; disjoint
    // ranges compile to a single sparse state.
    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }
2103 
    // Multi-byte Unicode classes compile to compressed UTF-8 automata
    // where shared prefix/suffix states are reused.
    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                // Both encodings funnel into the same match state (5).
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        assert_eq!(
            build(r"[a-z☃]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }
2132 
    // Greedy `a?` and non-greedy `a??` differ only in the order of the
    // union's branches.
    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }
2144 
    // With captures disabled, groups add no states of their own.
    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }
2160 
    // Alternations of single bytes collapse to a range or sparse state;
    // an empty branch introduces a binary union.
    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }
2185 
    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // One three-way union fanning out to the three (reversed)
                // three-byte codepoint encodings, all converging on the
                // match state (10).
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }
2219 
2220     #[test]
compile_many_start_pattern()2221     fn compile_many_start_pattern() {
2222         let nfa = NFA::compiler()
2223             .configure(
2224                 NFA::config()
2225                     .which_captures(WhichCaptures::None)
2226                     .unanchored_prefix(false),
2227             )
2228             .build_many(&["a", "b"])
2229             .unwrap();
2230         assert_eq!(
2231             nfa.states(),
2232             &[
2233                 s_byte(b'a', 1),
2234                 s_match(0),
2235                 s_byte(b'b', 3),
2236                 s_match(1),
2237                 s_bin_union(0, 2),
2238             ]
2239         );
2240         assert_eq!(nfa.start_anchored().as_usize(), 4);
2241         assert_eq!(nfa.start_unanchored().as_usize(), 4);
2242         // Test that the start states for each individual pattern are correct.
2243         assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
2244         assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
2245     }
2246 
2247     // This tests that our compiler can handle an empty character class. At the
2248     // time of writing, the regex parser forbids it, so the only way to test it
2249     // is to provide a hand written HIR.
2250     #[test]
empty_class_bytes()2251     fn empty_class_bytes() {
2252         use regex_syntax::hir::{Class, ClassBytes, Hir};
2253 
2254         let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
2255         let config = NFA::config()
2256             .which_captures(WhichCaptures::None)
2257             .unanchored_prefix(false);
2258         let nfa =
2259             NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
2260         assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
2261     }
2262 
2263     // Like empty_class_bytes, but for a Unicode class.
2264     #[test]
empty_class_unicode()2265     fn empty_class_unicode() {
2266         use regex_syntax::hir::{Class, ClassUnicode, Hir};
2267 
2268         let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
2269         let config = NFA::config()
2270             .which_captures(WhichCaptures::None)
2271             .unanchored_prefix(false);
2272         let nfa =
2273             NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
2274         assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
2275     }
2276 
2277     #[test]
compile_captures_all()2278     fn compile_captures_all() {
2279         let nfa = NFA::compiler()
2280             .configure(
2281                 NFA::config()
2282                     .unanchored_prefix(false)
2283                     .which_captures(WhichCaptures::All),
2284             )
2285             .build("a(b)c")
2286             .unwrap();
2287         assert_eq!(
2288             nfa.states(),
2289             &[
2290                 s_cap(1, 0, 0, 0),
2291                 s_byte(b'a', 2),
2292                 s_cap(3, 0, 1, 2),
2293                 s_byte(b'b', 4),
2294                 s_cap(5, 0, 1, 3),
2295                 s_byte(b'c', 6),
2296                 s_cap(7, 0, 0, 1),
2297                 s_match(0)
2298             ]
2299         );
2300         let ginfo = nfa.group_info();
2301         assert_eq!(2, ginfo.all_group_len());
2302     }
2303 
2304     #[test]
compile_captures_implicit()2305     fn compile_captures_implicit() {
2306         let nfa = NFA::compiler()
2307             .configure(
2308                 NFA::config()
2309                     .unanchored_prefix(false)
2310                     .which_captures(WhichCaptures::Implicit),
2311             )
2312             .build("a(b)c")
2313             .unwrap();
2314         assert_eq!(
2315             nfa.states(),
2316             &[
2317                 s_cap(1, 0, 0, 0),
2318                 s_byte(b'a', 2),
2319                 s_byte(b'b', 3),
2320                 s_byte(b'c', 4),
2321                 s_cap(5, 0, 0, 1),
2322                 s_match(0)
2323             ]
2324         );
2325         let ginfo = nfa.group_info();
2326         assert_eq!(1, ginfo.all_group_len());
2327     }
2328 
2329     #[test]
compile_captures_none()2330     fn compile_captures_none() {
2331         let nfa = NFA::compiler()
2332             .configure(
2333                 NFA::config()
2334                     .unanchored_prefix(false)
2335                     .which_captures(WhichCaptures::None),
2336             )
2337             .build("a(b)c")
2338             .unwrap();
2339         assert_eq!(
2340             nfa.states(),
2341             &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
2342         );
2343         let ginfo = nfa.group_info();
2344         assert_eq!(0, ginfo.all_group_len());
2345     }
2346 }
2347