#[cfg(feature = "std")]
use dense::{self, DenseDFA};
use dfa::DFA;
#[cfg(feature = "std")]
use error::Result;
#[cfg(feature = "std")]
use sparse::SparseDFA;
#[cfg(feature = "std")]
use state_id::StateID;

/// A regular expression that uses deterministic finite automata for fast
/// searching.
///
/// A regular expression is made up of two DFAs, a "forward" DFA and a
/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
/// match while the reverse DFA is responsible for detecting the start of a
/// match. Thus, in order to find the bounds of any given match, a forward
/// search must first be run followed by a reverse search. A match found by
/// the forward DFA guarantees that the reverse DFA will also find a match.
///
/// The type of the DFA used by a `Regex` corresponds to the `D` type
/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
/// search faster, while sparse DFAs use less memory but search more slowly.
///
/// By default, a regex's DFA type parameter is set to
/// `DenseDFA<Vec<usize>, usize>`. For most in-memory workloads, this is the
/// most convenient type and gives the best search performance.
///
/// # Sparse DFAs
///
/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
/// enough to build corresponding sparse DFAs, and then build a regex from
/// them:
///
/// ```
/// use regex_automata::Regex;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// // First, build a regex that uses dense DFAs.
/// let dense_re = Regex::new("foo[0-9]+")?;
///
/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
/// let fwd = dense_re.forward().to_sparse()?;
/// let rev = dense_re.reverse().to_sparse()?;
///
/// // Third, build a new regex from the constituent sparse DFAs.
/// let sparse_re = Regex::from_dfas(fwd, rev);
///
/// // A regex that uses sparse DFAs can be used just like one that uses
/// // dense DFAs.
/// assert_eq!(true, sparse_re.is_match(b"foo123"));
/// # Ok(()) }; example().unwrap()
/// ```
#[cfg(feature = "std")]
#[derive(Clone, Debug)]
pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
    forward: D,
    reverse: D,
}

/// A regular expression that uses deterministic finite automata for fast
/// searching.
///
/// A regular expression is made up of two DFAs, a "forward" DFA and a
/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
/// match while the reverse DFA is responsible for detecting the start of a
/// match. Thus, in order to find the bounds of any given match, a forward
/// search must first be run followed by a reverse search. A match found by
/// the forward DFA guarantees that the reverse DFA will also find a match.
///
/// The type of the DFA used by a `Regex` corresponds to the `D` type
/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
/// search faster, while sparse DFAs use less memory but search more slowly.
///
/// When using this crate without the standard library, the `Regex` type has
/// no default type parameter.
///
/// # Sparse DFAs
///
/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
/// enough to build corresponding sparse DFAs, and then build a regex from
/// them:
///
/// ```
/// use regex_automata::Regex;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// // First, build a regex that uses dense DFAs.
/// let dense_re = Regex::new("foo[0-9]+")?;
///
/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
/// let fwd = dense_re.forward().to_sparse()?;
/// let rev = dense_re.reverse().to_sparse()?;
///
/// // Third, build a new regex from the constituent sparse DFAs.
/// let sparse_re = Regex::from_dfas(fwd, rev);
///
/// // A regex that uses sparse DFAs can be used just like one that uses
/// // dense DFAs.
/// assert_eq!(true, sparse_re.is_match(b"foo123"));
/// # Ok(()) }; example().unwrap()
/// ```
#[cfg(not(feature = "std"))]
#[derive(Clone, Debug)]
pub struct Regex<D> {
    forward: D,
    reverse: D,
}

#[cfg(feature = "std")]
impl Regex {
    /// Parse the given regular expression using a default configuration and
    /// return the corresponding regex.
    ///
    /// The default configuration uses `usize` for state IDs, premultiplies
    /// them and reduces the alphabet size by splitting bytes into equivalence
    /// classes. The underlying DFAs are *not* minimized.
    ///
    /// If you want a non-default configuration, then use the
    /// [`RegexBuilder`](struct.RegexBuilder.html)
    /// to set your own configuration.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+bar")?;
    /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn new(pattern: &str) -> Result<Regex> {
        RegexBuilder::new().build(pattern)
    }
}

#[cfg(feature = "std")]
impl Regex<SparseDFA<Vec<u8>, usize>> {
    /// Parse the given regular expression using a default configuration and
    /// return the corresponding regex using sparse DFAs.
    ///
    /// The default configuration uses `usize` for state IDs and reduces the
    /// alphabet size by splitting bytes into equivalence classes. The
    /// underlying DFAs are *not* minimized.
    ///
    /// If you want a non-default configuration, then use the
    /// [`RegexBuilder`](struct.RegexBuilder.html)
    /// to set your own configuration.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
    /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn new_sparse(
        pattern: &str,
    ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
        RegexBuilder::new().build_sparse(pattern)
    }
}

impl<D: DFA> Regex<D> {
    /// Returns true if and only if the given bytes match.
    ///
    /// This routine may short-circuit if it knows that scanning future input
    /// will never lead to a different result. In particular, if the underlying
    /// DFA enters a match state or a dead state, then this routine will return
    /// `true` or `false`, respectively, without inspecting any future input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+bar")?;
    /// assert_eq!(true, re.is_match(b"foo12345bar"));
    /// assert_eq!(false, re.is_match(b"foobar"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn is_match(&self, input: &[u8]) -> bool {
        self.is_match_at(input, 0)
    }

    /// Returns the first position at which a match is found.
    ///
    /// This routine stops scanning input in precisely the same circumstances
    /// as `is_match`. The key difference is that this routine returns the
    /// position at which it stopped scanning input if and only if a match
    /// was found. If no match is found, then `None` is returned.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(Some(4), re.shortest_match(b"foo12345"));
    ///
    /// // Normally, the end of the leftmost first match here would be 3,
    /// // but the shortest match semantics detect a match earlier.
    /// let re = Regex::new("abc|a")?;
    /// assert_eq!(Some(1), re.shortest_match(b"abc"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
        self.shortest_match_at(input, 0)
    }

    /// Returns the start and end offset of the leftmost first match. If no
    /// match exists, then `None` is returned.
    ///
    /// The "leftmost first" match corresponds to the match with the smallest
    /// starting offset, but where the end offset is determined by preferring
    /// earlier branches in the original regular expression. For example,
    /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will
    /// match `Samwise` in `Samwise`.
    ///
    /// Generally speaking, the "leftmost first" match is how most backtracking
    /// regular expression engines tend to work. This is in contrast to
    /// POSIX-style regular expressions that yield "leftmost longest" matches.
    /// Namely, both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
    /// leftmost longest semantics.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
    ///
    /// // Even though a match is found after reading the first byte (`a`),
    /// // the leftmost first match semantics demand that we find the earliest
    /// // match that prefers earlier parts of the pattern over later parts.
    /// let re = Regex::new("abc|a")?;
    /// assert_eq!(Some((0, 3)), re.find(b"abc"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
        self.find_at(input, 0)
    }

    /// Returns the same as `is_match`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
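    ///
    /// # Example
    ///
    /// A small sketch of searching from a non-zero offset, assuming the
    /// default (unanchored) configuration built by `Regex::new`:
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, re.is_match_at(b"zzzfoo123", 0));
    /// // Starting past the beginning of `foo` means no match can be found.
    /// assert_eq!(false, re.is_match_at(b"zzzfoo123", 4));
    /// # Ok(()) }; example().unwrap()
    /// ```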
    pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
        self.forward().is_match_at(input, start)
    }

    /// Returns the same as `shortest_match`, but starts the search at the
    /// given offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
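    ///
    /// # Example
    ///
    /// A brief sketch, assuming the default configuration built by
    /// `Regex::new`:
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("abc|a")?;
    /// assert_eq!(Some(1), re.shortest_match_at(b"abc", 0));
    /// // Starting at offset 1 leaves only `bc`, which never matches.
    /// assert_eq!(None, re.shortest_match_at(b"abc", 1));
    /// # Ok(()) }; example().unwrap()
    /// ```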
    pub fn shortest_match_at(
        &self,
        input: &[u8],
        start: usize,
    ) -> Option<usize> {
        self.forward().shortest_match_at(input, start)
    }

    /// Returns the same as `find`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
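    ///
    /// # Example
    ///
    /// A small sketch, assuming the default (unanchored) configuration; note
    /// that the returned offsets are absolute positions in `input`:
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+")?;
    /// let text = b"foo123 foo456";
    /// // Starting the search after the first match finds the second one.
    /// assert_eq!(Some((7, 13)), re.find_at(text, 6));
    /// # Ok(()) }; example().unwrap()
    /// ```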
    pub fn find_at(
        &self,
        input: &[u8],
        start: usize,
    ) -> Option<(usize, usize)> {
        let end = match self.forward().find_at(input, start) {
            None => return None,
            Some(end) => end,
        };
        let start = self
            .reverse()
            .rfind(&input[start..end])
            .map(|i| start + i)
            .expect("reverse search must match if forward search does");
        Some((start, end))
    }

    /// Returns an iterator over all non-overlapping leftmost first matches
    /// in the given bytes. If no match exists, then the iterator yields no
    /// elements.
    ///
    /// Note that if the regex can match the empty string, then it is
    /// possible for the iterator to yield a zero-width match at a location
    /// that is not a valid UTF-8 boundary (for example, between the code units
    /// of a UTF-8 encoded codepoint). This can happen regardless of whether
    /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
    /// was enabled or not.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+")?;
    /// let text = b"foo1 foo12 foo123";
    /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
    /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]);
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
        Matches::new(self, input)
    }

    /// Build a new regex from its constituent forward and reverse DFAs.
    ///
    /// This is useful when deserializing a regex from some arbitrary
    /// memory region. This is also useful for building regexes from other
    /// types of DFAs.
    ///
    /// # Example
    ///
    /// This example is a bit contrived. The usual use of these methods
    /// would involve serializing `initial_re` somewhere and then deserializing
    /// it later to build a regex.
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let initial_re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
    ///
    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
    /// let re = Regex::from_dfas(fwd, rev);
    /// assert_eq!(true, re.is_match(b"foo123"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    ///
    /// This example shows how you might build smaller DFAs, and then use those
    /// smaller DFAs to build a new regex.
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let initial_re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
    ///
    /// let fwd = initial_re.forward().to_u16()?;
    /// let rev = initial_re.reverse().to_u16()?;
    /// let re = Regex::from_dfas(fwd, rev);
    /// assert_eq!(true, re.is_match(b"foo123"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    ///
    /// This example shows how to build a `Regex` that uses sparse DFAs instead
    /// of dense DFAs:
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let initial_re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
    ///
    /// let fwd = initial_re.forward().to_sparse()?;
    /// let rev = initial_re.reverse().to_sparse()?;
    /// let re = Regex::from_dfas(fwd, rev);
    /// assert_eq!(true, re.is_match(b"foo123"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
        Regex { forward, reverse }
    }

    /// Return the underlying DFA responsible for forward matching.
    pub fn forward(&self) -> &D {
        &self.forward
    }

    /// Return the underlying DFA responsible for reverse matching.
    pub fn reverse(&self) -> &D {
        &self.reverse
    }
}

/// An iterator over all non-overlapping matches for a particular search.
///
/// The iterator yields a `(usize, usize)` value until no more matches can be
/// found. The first `usize` is the start of the match (inclusive) while the
/// second `usize` is the end of the match (exclusive).
///
/// `D` is the type of the underlying DFA used by the regex. The lifetime
/// variables are as follows:
///
/// * `'r` is the lifetime of the regular expression value itself.
/// * `'t` is the lifetime of the text being searched.
#[derive(Clone, Debug)]
pub struct Matches<'r, 't, D: DFA + 'r> {
    re: &'r Regex<D>,
    text: &'t [u8],
    last_end: usize,
    last_match: Option<usize>,
}

impl<'r, 't, D: DFA> Matches<'r, 't, D> {
    fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
        Matches { re, text, last_end: 0, last_match: None }
    }
}

impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
    type Item = (usize, usize);

    fn next(&mut self) -> Option<(usize, usize)> {
        if self.last_end > self.text.len() {
            return None;
        }
        let (s, e) = match self.re.find_at(self.text, self.last_end) {
            None => return None,
            Some((s, e)) => (s, e),
        };
        if s == e {
            // This is an empty match. To ensure we make progress, start
            // the next search at the smallest possible starting position
            // of the next match following this one.
            self.last_end = e + 1;
            // Don't accept empty matches immediately following a match.
            // Just move on to the next match.
            if Some(e) == self.last_match {
                return self.next();
            }
        } else {
            self.last_end = e;
        }
        self.last_match = Some(e);
        Some((s, e))
    }
}

/// A builder for a regex based on deterministic finite automata.
///
/// This builder permits configuring several aspects of the construction
/// process such as case insensitivity, Unicode support and various options
/// that impact the size of the underlying DFAs. In some cases, options (like
/// performing DFA minimization) can come with a substantial additional cost.
///
/// This builder generally constructs two DFAs, where one is responsible for
/// finding the end of a match and the other is responsible for finding the
/// start of a match. If you only need to detect whether something matched,
/// or only the end of a match, then you should use a
/// [`dense::Builder`](dense/struct.Builder.html)
/// to construct a single DFA, which is cheaper than building two DFAs.
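///
/// # Example
///
/// A brief sketch of chaining a couple of the options documented below
/// before building:
///
/// ```
/// use regex_automata::RegexBuilder;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let re = RegexBuilder::new()
///     .case_insensitive(true)
///     .build("foo[0-9]+")?;
/// assert_eq!(Some((0, 6)), re.find(b"FOO123"));
/// # Ok(()) }; example().unwrap()
/// ```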
#[cfg(feature = "std")]
#[derive(Clone, Debug)]
pub struct RegexBuilder {
    dfa: dense::Builder,
}

#[cfg(feature = "std")]
impl RegexBuilder {
    /// Create a new regex builder with the default configuration.
    pub fn new() -> RegexBuilder {
        RegexBuilder { dfa: dense::Builder::new() }
    }

    /// Build a regex from the given pattern.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    pub fn build(&self, pattern: &str) -> Result<Regex> {
        self.build_with_size::<usize>(pattern)
    }

    /// Build a regex from the given pattern using sparse DFAs.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    pub fn build_sparse(
        &self,
        pattern: &str,
    ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
        self.build_with_size_sparse::<usize>(pattern)
    }

    /// Build a regex from the given pattern using a specific representation
    /// for the underlying DFA state IDs.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    ///
    /// The representation of state IDs is determined by the `S` type
    /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
    /// or `usize`, where `usize` is the default used for `build`. The purpose
    /// of specifying a representation for state IDs is to reduce the memory
    /// footprint of the underlying DFAs.
    ///
    /// When using this routine, the chosen state ID representation will be
    /// used throughout determinization and minimization, if minimization was
    /// requested. Even if the minimized DFAs can fit into the chosen state ID
    /// representation but the initial determinized DFA cannot, this will
    /// still return an error. To get a minimized DFA with a smaller state ID
    /// representation, first build it with a bigger state ID representation,
    /// and then shrink the sizes of the DFAs using one of its conversion
    /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
    /// Finally, reconstitute the regex via
    /// [`Regex::from_dfas`](struct.Regex.html#method.from_dfas).
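    ///
    /// # Example
    ///
    /// A hedged sketch of building a regex whose DFAs use `u16` state IDs;
    /// this fails if either DFA needs more distinct state IDs than `u16` can
    /// represent:
    ///
    /// ```
    /// use regex_automata::RegexBuilder;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = RegexBuilder::new().build_with_size::<u16>("foo[0-9]+")?;
    /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
    /// # Ok(()) }; example().unwrap()
    /// ```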
    pub fn build_with_size<S: StateID>(
        &self,
        pattern: &str,
    ) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
        let forward = self.dfa.build_with_size(pattern)?;
        let reverse = self
            .dfa
            .clone()
            .anchored(true)
            .reverse(true)
            .longest_match(true)
            .build_with_size(pattern)?;
        Ok(Regex::from_dfas(forward, reverse))
    }

    /// Build a regex from the given pattern using sparse DFAs and a specific
    /// representation for the underlying DFA state IDs.
    pub fn build_with_size_sparse<S: StateID>(
        &self,
        pattern: &str,
    ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
        let re = self.build_with_size(pattern)?;
        let fwd = re.forward().to_sparse()?;
        let rev = re.reverse().to_sparse()?;
        Ok(Regex::from_dfas(fwd, rev))
    }

    /// Set whether matching must be anchored at the beginning of the input.
    ///
    /// When enabled, a match must begin at the start of the input. When
    /// disabled, the regex will act as if the pattern started with a `.*?`,
    /// which enables a match to appear anywhere.
    ///
    /// By default this is disabled.
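    ///
    /// # Example
    ///
    /// A small sketch of the difference anchoring makes, with everything
    /// else left at its default:
    ///
    /// ```
    /// use regex_automata::RegexBuilder;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = RegexBuilder::new().anchored(true).build("foo")?;
    /// assert_eq!(true, re.is_match(b"foobar"));
    /// assert_eq!(false, re.is_match(b"barfoo"));
    /// # Ok(()) }; example().unwrap()
    /// ```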
    pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.anchored(yes);
        self
    }

    /// Enable or disable the case insensitive flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `i` flag.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.case_insensitive(yes);
        self
    }

    /// Enable verbose mode in the regular expression.
    ///
    /// When enabled, verbose mode permits insignificant whitespace in many
    /// places in the regular expression, as well as comments. Comments are
    /// started using `#` and continue until the end of the line.
    ///
    /// By default, this is disabled. It may be selectively enabled in the
    /// regular expression by using the `x` flag regardless of this setting.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.ignore_whitespace(yes);
        self
    }

    /// Enable or disable the "dot matches any character" flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `s` flag.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.dot_matches_new_line(yes);
        self
    }

    /// Enable or disable the "swap greed" flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `U` flag.
    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.swap_greed(yes);
        self
    }

    /// Enable or disable the Unicode flag (`u`) by default.
    ///
    /// By default this is **enabled**. It may alternatively be selectively
    /// disabled in the regular expression itself via the `u` flag.
    ///
    /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
    /// default), a regular expression will fail to parse if Unicode mode is
    /// disabled and a sub-expression could possibly match invalid UTF-8.
    pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.unicode(yes);
        self
    }

    /// When enabled, the builder will permit the construction of a regular
    /// expression that may match invalid UTF-8.
    ///
    /// When disabled (the default), the builder is guaranteed to produce a
    /// regex that will only ever match valid UTF-8 (otherwise, the builder
    /// will return an error).
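    ///
    /// # Example
    ///
    /// A hedged sketch: with Unicode mode disabled inline, a class that can
    /// match arbitrary bytes (and hence invalid UTF-8) is rejected unless
    /// this option is enabled:
    ///
    /// ```
    /// use regex_automata::RegexBuilder;
    ///
    /// assert!(RegexBuilder::new().build(r"(?-u:[^a])").is_err());
    /// assert!(
    ///     RegexBuilder::new()
    ///         .allow_invalid_utf8(true)
    ///         .build(r"(?-u:[^a])")
    ///         .is_ok()
    /// );
    /// ```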
    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.allow_invalid_utf8(yes);
        self
    }

    /// Set the nesting limit used for the regular expression parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow when building a finite automaton from a regular expression's
    /// abstract syntax tree. In particular, construction currently uses
    /// recursion. In the future, the implementation may stop using recursion
    /// and this option will no longer be necessary.
    ///
    /// This limit is not checked until the entire AST is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since the parser will
    /// limit itself to heap space proportional to the length of the pattern
    /// string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation AST item, which results
    /// in a nest depth of `1`. In general, a nest limit is not something that
    /// manifests in an obvious way in the concrete syntax; therefore, it
    /// should not be used in a granular way.
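    ///
    /// # Example
    ///
    /// A brief sketch based on the `a` vs. `ab` case described above:
    ///
    /// ```
    /// use regex_automata::RegexBuilder;
    ///
    /// assert!(RegexBuilder::new().nest_limit(0).build("a").is_ok());
    /// assert!(RegexBuilder::new().nest_limit(0).build("ab").is_err());
    /// ```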
    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
        self.dfa.nest_limit(limit);
        self
    }

    /// Minimize the underlying DFAs.
    ///
    /// When enabled, the DFAs powering the resulting regex will be minimized
    /// such that they are as small as possible.
    ///
    /// Whether one enables minimization or not depends on the types of costs
    /// you're willing to pay and how much you care about its benefits. In
    /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
    /// space, where `n` is the number of DFA states and `k` is the alphabet
    /// size. In practice, minimization can be quite costly in terms of both
    /// space and time, so it should only be done if you're willing to wait
    /// longer to produce a DFA. In general, you might want a minimal DFA in
    /// the following circumstances:
    ///
    /// 1. You would like to optimize for the size of the automaton. This can
    ///    manifest in one of two ways. Firstly, if you're converting the
    ///    DFA into Rust code (or a table embedded in the code), then a minimal
    ///    DFA will translate into a corresponding reduction in code size, and
    ///    thus, also the final compiled binary size. Secondly, if you are
    ///    building many DFAs and putting them on the heap, you'll be able to
    ///    fit more if they are smaller. Note though that building a minimal
    ///    DFA itself requires additional space; you only realize the space
    ///    savings once the minimal DFA is constructed (at which point, the
    ///    space used for minimization is freed).
    /// 2. You've observed that a smaller DFA results in faster match
    ///    performance. Naively, this isn't guaranteed since there is no
    ///    inherent difference between matching with a bigger-than-minimal
    ///    DFA and a minimal DFA. However, a smaller DFA may make use of your
    ///    CPU's cache more efficiently.
    /// 3. You are trying to establish an equivalence between regular
    ///    languages. The standard method for this is to build a minimal DFA
    ///    for each language and then compare them. If the DFAs are equivalent
    ///    (up to state renaming), then the languages are equivalent.
    ///
    /// This option is disabled by default.
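    ///
    /// # Example
    ///
    /// A minimal sketch of enabling minimization; the resulting regex
    /// matches exactly the same inputs, it just may be backed by smaller
    /// DFAs:
    ///
    /// ```
    /// use regex_automata::RegexBuilder;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = RegexBuilder::new().minimize(true).build("foo[0-9]+")?;
    /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
    /// # Ok(()) }; example().unwrap()
    /// ```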
    pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.minimize(yes);
        self
    }

    /// Premultiply state identifiers in the underlying DFA transition tables.
    ///
    /// When enabled, state identifiers are premultiplied to point to their
    /// corresponding row in the DFA's transition table. That is, given the
    /// `i`th state, its corresponding premultiplied identifier is `i * k`
    /// where `k` is the alphabet size of the DFA. (The alphabet size is at
    /// most 256, but is in practice smaller if byte classes are enabled.)
    ///
    /// When state identifiers are not premultiplied, then the identifier of
    /// the `i`th state is `i`.
    ///
    /// The advantage of premultiplying state identifiers is that it saves
    /// a multiplication instruction per byte when searching with the DFA.
    /// This has been observed to lead to a 20% performance benefit in
    /// micro-benchmarks.
    ///
    /// The primary disadvantage of premultiplying state identifiers is
    /// that they require a larger integer size to represent. For example,
    /// if your DFA has 200 states, then its premultiplied form requires
    /// 16 bits to represent every possible state identifier, whereas its
    /// non-premultiplied form only requires 8 bits.
    ///
    /// This option is enabled by default.
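    ///
    /// # Example
    ///
    /// A hedged sketch: disabling premultiplication keeps state identifiers
    /// small, which may allow a tiny pattern like the one below to fit into
    /// `u8` state IDs (whether a given pattern fits depends on how many
    /// states its DFAs need):
    ///
    /// ```
    /// use regex_automata::RegexBuilder;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = RegexBuilder::new()
    ///     .premultiply(false)
    ///     .build_with_size::<u8>("foo[0-9]+")?;
    /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
    /// # Ok(()) }; example().unwrap()
    /// ```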
    pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.premultiply(yes);
        self
    }

    /// Shrink the size of the underlying DFA alphabet by mapping bytes to
    /// their equivalence classes.
    ///
    /// When enabled, each DFA will use a map from all possible bytes to their
    /// corresponding equivalence class. Each equivalence class represents a
    /// set of bytes that does not discriminate between a match and a non-match
    /// in the DFA. For example, the pattern `[ab]+` has at least two
    /// equivalence classes: a set containing `a` and `b` and a set containing
    /// every byte except for `a` and `b`. `a` and `b` are in the same
    /// equivalence class because they never discriminate between a match
    /// and a non-match.
    ///
    /// The advantage of this map is that the size of the transition table can
    /// be reduced drastically from `#states * 256 * sizeof(id)` to
    /// `#states * k * sizeof(id)` where `k` is the number of equivalence
    /// classes. As a result, total space usage can decrease substantially.
    /// Moreover, since a smaller alphabet is used, compilation becomes faster
    /// as well.
    ///
    /// The disadvantage of this map is that every byte searched must be
    /// passed through this map before it can be used to determine the next
    /// transition. This has a small match time performance cost.
    ///
    /// This option is enabled by default.
    pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.byte_classes(yes);
        self
    }

    /// Apply best effort heuristics to shrink the NFA at the expense of more
    /// time/memory.
    ///
    /// This may be exposed in the future, but for now is exported for use in
    /// the `regex-automata-debug` tool.
    #[doc(hidden)]
    pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.shrink(yes);
        self
    }
}

#[cfg(feature = "std")]
impl Default for RegexBuilder {
    fn default() -> RegexBuilder {
        RegexBuilder::new()
    }
}