1 /*!
2 Utilities for dealing with the syntax of a regular expression.
3 
This module exposes a [`Config`] type that wraps the configuration for a
[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder), along with a
handful of convenience routines for parsing patterns into
[`Hir`](regex_syntax::hir::Hir) values. The purpose of
7 this wrapper is to make configuring syntax options very similar to how other
8 configuration is done throughout this crate. Namely, instead of duplicating
9 syntax options across every builder (of which there are many), we instead
10 create small config objects like this one that can be passed around and
11 composed.
12 */
13 
14 use alloc::{vec, vec::Vec};
15 
16 use regex_syntax::{
17     ast,
18     hir::{self, Hir},
19     Error, ParserBuilder,
20 };
21 
22 /// A convenience routine for parsing a pattern into an HIR value with the
23 /// default configuration.
24 ///
25 /// # Example
26 ///
27 /// This shows how to parse a pattern into an HIR value:
28 ///
29 /// ```
30 /// use regex_automata::util::syntax;
31 ///
32 /// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
33 /// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
34 ///
35 /// # Ok::<(), Box<dyn std::error::Error>>(())
36 /// ```
parse(pattern: &str) -> Result<Hir, Error>37 pub fn parse(pattern: &str) -> Result<Hir, Error> {
38     parse_with(pattern, &Config::default())
39 }
40 
41 /// A convenience routine for parsing many patterns into HIR value with the
42 /// default configuration.
43 ///
44 /// # Example
45 ///
46 /// This shows how to parse many patterns into an corresponding HIR values:
47 ///
48 /// ```
49 /// use {
50 ///     regex_automata::util::syntax,
51 ///     regex_syntax::hir::Properties,
52 /// };
53 ///
54 /// let hirs = syntax::parse_many(&[
55 ///     r"([a-z]+)|([0-9]+)",
56 ///     r"foo(A-Z]+)bar",
57 /// ])?;
58 /// let props = Properties::union(hirs.iter().map(|h| h.properties()));
59 /// assert_eq!(Some(1), props.static_explicit_captures_len());
60 ///
61 /// # Ok::<(), Box<dyn std::error::Error>>(())
62 /// ```
parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error>63 pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
64     parse_many_with(patterns, &Config::default())
65 }
66 
67 /// A convenience routine for parsing a pattern into an HIR value using a
68 /// `Config`.
69 ///
70 /// # Example
71 ///
72 /// This shows how to parse a pattern into an HIR value with a non-default
73 /// configuration:
74 ///
75 /// ```
76 /// use regex_automata::util::syntax;
77 ///
78 /// let hir = syntax::parse_with(
79 ///     r"^[a-z]+$",
80 ///     &syntax::Config::new().multi_line(true).crlf(true),
81 /// )?;
82 /// assert!(hir.properties().look_set().contains_anchor_crlf());
83 ///
84 /// # Ok::<(), Box<dyn std::error::Error>>(())
85 /// ```
parse_with(pattern: &str, config: &Config) -> Result<Hir, Error>86 pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
87     let mut builder = ParserBuilder::new();
88     config.apply(&mut builder);
89     builder.build().parse(pattern)
90 }
91 
92 /// A convenience routine for parsing many patterns into HIR values using a
93 /// `Config`.
94 ///
95 /// # Example
96 ///
97 /// This shows how to parse many patterns into an corresponding HIR values
98 /// with a non-default configuration:
99 ///
100 /// ```
101 /// use {
102 ///     regex_automata::util::syntax,
103 ///     regex_syntax::hir::Properties,
104 /// };
105 ///
106 /// let patterns = &[
107 ///     r"([a-z]+)|([0-9]+)",
108 ///     r"\W",
109 ///     r"foo(A-Z]+)bar",
110 /// ];
111 /// let config = syntax::Config::new().unicode(false).utf8(false);
112 /// let hirs = syntax::parse_many_with(patterns, &config)?;
113 /// let props = Properties::union(hirs.iter().map(|h| h.properties()));
114 /// assert!(!props.is_utf8());
115 ///
116 /// # Ok::<(), Box<dyn std::error::Error>>(())
117 /// ```
parse_many_with<P: AsRef<str>>( patterns: &[P], config: &Config, ) -> Result<Vec<Hir>, Error>118 pub fn parse_many_with<P: AsRef<str>>(
119     patterns: &[P],
120     config: &Config,
121 ) -> Result<Vec<Hir>, Error> {
122     let mut builder = ParserBuilder::new();
123     config.apply(&mut builder);
124     let mut hirs = vec![];
125     for p in patterns.iter() {
126         hirs.push(builder.build().parse(p.as_ref())?);
127     }
128     Ok(hirs)
129 }
130 
/// A common set of configuration options that apply to the syntax of a regex.
///
/// This represents a group of configuration options that specifically apply
/// to how the concrete syntax of a regular expression is interpreted. In
/// particular, they are generally forwarded to the
/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
/// in the
/// [`regex-syntax`](https://docs.rs/regex-syntax)
/// crate when building a regex from its concrete syntax directly.
///
/// These options are defined as a group since they apply to every regex engine
/// in this crate. Instead of re-defining them on every engine's builder, they
/// are instead provided here as one cohesive unit.
///
/// Each field has a builder-style setter of the same name and a `get_`
/// prefixed getter on this type.
#[derive(Clone, Copy, Debug)]
pub struct Config {
    /// Whether the case insensitive (`i`) flag is enabled by default.
    case_insensitive: bool,
    /// Whether the multi-line (`m`) flag is enabled by default.
    multi_line: bool,
    /// Whether the "dot matches new line" (`s`) flag is enabled by default.
    dot_matches_new_line: bool,
    /// Whether the CRLF mode (`R`) flag is enabled by default.
    crlf: bool,
    /// The byte that `.` does not match when "dot matches new line" is
    /// disabled. Defaults to `\n`.
    line_terminator: u8,
    /// Whether the swap greed (`U`) flag is enabled by default.
    swap_greed: bool,
    /// Whether verbose mode (`x`) is enabled by default.
    ignore_whitespace: bool,
    /// Whether the Unicode (`u`) flag is enabled by default. Defaults to
    /// `true`.
    unicode: bool,
    /// Whether patterns are required to only match valid UTF-8. Defaults to
    /// `true`.
    utf8: bool,
    /// The nesting limit handed to the regex parser. Defaults to `250`.
    nest_limit: u32,
    /// Whether octal syntax is supported.
    octal: bool,
}
158 
159 impl Config {
160     /// Return a new default syntax configuration.
new() -> Config161     pub fn new() -> Config {
162         // These defaults match the ones used in regex-syntax.
163         Config {
164             case_insensitive: false,
165             multi_line: false,
166             dot_matches_new_line: false,
167             crlf: false,
168             line_terminator: b'\n',
169             swap_greed: false,
170             ignore_whitespace: false,
171             unicode: true,
172             utf8: true,
173             nest_limit: 250,
174             octal: false,
175         }
176     }
177 
178     /// Enable or disable the case insensitive flag by default.
179     ///
180     /// When Unicode mode is enabled, case insensitivity is Unicode-aware.
181     /// Specifically, it will apply the "simple" case folding rules as
182     /// specified by Unicode.
183     ///
184     /// By default this is disabled. It may alternatively be selectively
185     /// enabled in the regular expression itself via the `i` flag.
case_insensitive(mut self, yes: bool) -> Config186     pub fn case_insensitive(mut self, yes: bool) -> Config {
187         self.case_insensitive = yes;
188         self
189     }
190 
191     /// Enable or disable the multi-line matching flag by default.
192     ///
193     /// When this is enabled, the `^` and `$` look-around assertions will
194     /// match immediately after and immediately before a new line character,
195     /// respectively. Note that the `\A` and `\z` look-around assertions are
196     /// unaffected by this setting and always correspond to matching at the
197     /// beginning and end of the input.
198     ///
199     /// By default this is disabled. It may alternatively be selectively
200     /// enabled in the regular expression itself via the `m` flag.
multi_line(mut self, yes: bool) -> Config201     pub fn multi_line(mut self, yes: bool) -> Config {
202         self.multi_line = yes;
203         self
204     }
205 
206     /// Enable or disable the "dot matches any character" flag by default.
207     ///
208     /// When this is enabled, `.` will match any character. When it's disabled,
209     /// then `.` will match any character except for a new line character.
210     ///
211     /// Note that `.` is impacted by whether the "unicode" setting is enabled
212     /// or not. When Unicode is enabled (the default), `.` will match any UTF-8
213     /// encoding of any Unicode scalar value (sans a new line, depending on
214     /// whether this "dot matches new line" option is enabled). When Unicode
215     /// mode is disabled, `.` will match any byte instead. Because of this,
216     /// when Unicode mode is disabled, `.` can only be used when the "allow
217     /// invalid UTF-8" option is enabled, since `.` could otherwise match
218     /// invalid UTF-8.
219     ///
220     /// By default this is disabled. It may alternatively be selectively
221     /// enabled in the regular expression itself via the `s` flag.
dot_matches_new_line(mut self, yes: bool) -> Config222     pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
223         self.dot_matches_new_line = yes;
224         self
225     }
226 
227     /// Enable or disable the "CRLF mode" flag by default.
228     ///
229     /// By default this is disabled. It may alternatively be selectively
230     /// enabled in the regular expression itself via the `R` flag.
231     ///
232     /// When CRLF mode is enabled, the following happens:
233     ///
234     /// * Unless `dot_matches_new_line` is enabled, `.` will match any character
235     /// except for `\r` and `\n`.
236     /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
237     /// `\r` and `\n` as line terminators. And in particular, neither will
238     /// match between a `\r` and a `\n`.
crlf(mut self, yes: bool) -> Config239     pub fn crlf(mut self, yes: bool) -> Config {
240         self.crlf = yes;
241         self
242     }
243 
244     /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
245     ///
246     /// Namely, instead of `.` (by default) matching everything except for `\n`,
247     /// this will cause `.` to match everything except for the byte given.
248     ///
249     /// If `.` is used in a context where Unicode mode is enabled and this byte
250     /// isn't ASCII, then an error will be returned. When Unicode mode is
251     /// disabled, then any byte is permitted, but will return an error if UTF-8
252     /// mode is enabled and it is a non-ASCII byte.
253     ///
254     /// In short, any ASCII value for a line terminator is always okay. But a
255     /// non-ASCII byte might result in an error depending on whether Unicode
256     /// mode or UTF-8 mode are enabled.
257     ///
258     /// Note that if `R` mode is enabled then it always takes precedence and
259     /// the line terminator will be treated as `\r` and `\n` simultaneously.
260     ///
261     /// Note also that this *doesn't* impact the look-around assertions
262     /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
263     /// configuration in the regex engine itself.
line_terminator(mut self, byte: u8) -> Config264     pub fn line_terminator(mut self, byte: u8) -> Config {
265         self.line_terminator = byte;
266         self
267     }
268 
269     /// Enable or disable the "swap greed" flag by default.
270     ///
271     /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
272     /// will become greedy.
273     ///
274     /// By default this is disabled. It may alternatively be selectively
275     /// enabled in the regular expression itself via the `U` flag.
swap_greed(mut self, yes: bool) -> Config276     pub fn swap_greed(mut self, yes: bool) -> Config {
277         self.swap_greed = yes;
278         self
279     }
280 
281     /// Enable verbose mode in the regular expression.
282     ///
283     /// When enabled, verbose mode permits insigificant whitespace in many
284     /// places in the regular expression, as well as comments. Comments are
285     /// started using `#` and continue until the end of the line.
286     ///
287     /// By default, this is disabled. It may be selectively enabled in the
288     /// regular expression by using the `x` flag regardless of this setting.
ignore_whitespace(mut self, yes: bool) -> Config289     pub fn ignore_whitespace(mut self, yes: bool) -> Config {
290         self.ignore_whitespace = yes;
291         self
292     }
293 
294     /// Enable or disable the Unicode flag (`u`) by default.
295     ///
296     /// By default this is **enabled**. It may alternatively be selectively
297     /// disabled in the regular expression itself via the `u` flag.
298     ///
299     /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
300     /// default), a regular expression will fail to parse if Unicode mode is
301     /// disabled and a sub-expression could possibly match invalid UTF-8.
302     ///
303     /// **WARNING**: Unicode mode can greatly increase the size of the compiled
304     /// DFA, which can noticeably impact both memory usage and compilation
305     /// time. This is especially noticeable if your regex contains character
306     /// classes like `\w` that are impacted by whether Unicode is enabled or
307     /// not. If Unicode is not necessary, you are encouraged to disable it.
unicode(mut self, yes: bool) -> Config308     pub fn unicode(mut self, yes: bool) -> Config {
309         self.unicode = yes;
310         self
311     }
312 
313     /// When disabled, the builder will permit the construction of a regular
314     /// expression that may match invalid UTF-8.
315     ///
316     /// For example, when [`Config::unicode`] is disabled, then
317     /// expressions like `[^a]` may match invalid UTF-8 since they can match
318     /// any single byte that is not `a`. By default, these sub-expressions
319     /// are disallowed to avoid returning offsets that split a UTF-8
320     /// encoded codepoint. However, in cases where matching at arbitrary
321     /// locations is desired, this option can be disabled to permit all such
322     /// sub-expressions.
323     ///
324     /// When enabled (the default), the builder is guaranteed to produce a
325     /// regex that will only ever match valid UTF-8 (otherwise, the builder
326     /// will return an error).
utf8(mut self, yes: bool) -> Config327     pub fn utf8(mut self, yes: bool) -> Config {
328         self.utf8 = yes;
329         self
330     }
331 
332     /// Set the nesting limit used for the regular expression parser.
333     ///
334     /// The nesting limit controls how deep the abstract syntax tree is allowed
335     /// to be. If the AST exceeds the given limit (e.g., with too many nested
336     /// groups), then an error is returned by the parser.
337     ///
338     /// The purpose of this limit is to act as a heuristic to prevent stack
339     /// overflow when building a finite automaton from a regular expression's
340     /// abstract syntax tree. In particular, construction currently uses
341     /// recursion. In the future, the implementation may stop using recursion
342     /// and this option will no longer be necessary.
343     ///
344     /// This limit is not checked until the entire AST is parsed. Therefore,
345     /// if callers want to put a limit on the amount of heap space used, then
346     /// they should impose a limit on the length, in bytes, of the concrete
347     /// pattern string. In particular, this is viable since the parser will
348     /// limit itself to heap space proportional to the length of the pattern
349     /// string.
350     ///
351     /// Note that a nest limit of `0` will return a nest limit error for most
352     /// patterns but not all. For example, a nest limit of `0` permits `a` but
353     /// not `ab`, since `ab` requires a concatenation AST item, which results
354     /// in a nest depth of `1`. In general, a nest limit is not something that
355     /// manifests in an obvious way in the concrete syntax, therefore, it
356     /// should not be used in a granular way.
nest_limit(mut self, limit: u32) -> Config357     pub fn nest_limit(mut self, limit: u32) -> Config {
358         self.nest_limit = limit;
359         self
360     }
361 
362     /// Whether to support octal syntax or not.
363     ///
364     /// Octal syntax is a little-known way of uttering Unicode codepoints in
365     /// a regular expression. For example, `a`, `\x61`, `\u0061` and
366     /// `\141` are all equivalent regular expressions, where the last example
367     /// shows octal syntax.
368     ///
369     /// While supporting octal syntax isn't in and of itself a problem, it does
370     /// make good error messages harder. That is, in PCRE based regex engines,
371     /// syntax like `\1` invokes a backreference, which is explicitly
372     /// unsupported in Rust's regex engine. However, many users expect it to
373     /// be supported. Therefore, when octal support is disabled, the error
374     /// message will explicitly mention that backreferences aren't supported.
375     ///
376     /// Octal syntax is disabled by default.
octal(mut self, yes: bool) -> Config377     pub fn octal(mut self, yes: bool) -> Config {
378         self.octal = yes;
379         self
380     }
381 
382     /// Returns whether "unicode" mode is enabled.
get_unicode(&self) -> bool383     pub fn get_unicode(&self) -> bool {
384         self.unicode
385     }
386 
387     /// Returns whether "case insensitive" mode is enabled.
get_case_insensitive(&self) -> bool388     pub fn get_case_insensitive(&self) -> bool {
389         self.case_insensitive
390     }
391 
392     /// Returns whether "multi line" mode is enabled.
get_multi_line(&self) -> bool393     pub fn get_multi_line(&self) -> bool {
394         self.multi_line
395     }
396 
397     /// Returns whether "dot matches new line" mode is enabled.
get_dot_matches_new_line(&self) -> bool398     pub fn get_dot_matches_new_line(&self) -> bool {
399         self.dot_matches_new_line
400     }
401 
402     /// Returns whether "CRLF" mode is enabled.
get_crlf(&self) -> bool403     pub fn get_crlf(&self) -> bool {
404         self.crlf
405     }
406 
407     /// Returns the line terminator in this syntax configuration.
get_line_terminator(&self) -> u8408     pub fn get_line_terminator(&self) -> u8 {
409         self.line_terminator
410     }
411 
412     /// Returns whether "swap greed" mode is enabled.
get_swap_greed(&self) -> bool413     pub fn get_swap_greed(&self) -> bool {
414         self.swap_greed
415     }
416 
417     /// Returns whether "ignore whitespace" mode is enabled.
get_ignore_whitespace(&self) -> bool418     pub fn get_ignore_whitespace(&self) -> bool {
419         self.ignore_whitespace
420     }
421 
422     /// Returns whether UTF-8 mode is enabled.
get_utf8(&self) -> bool423     pub fn get_utf8(&self) -> bool {
424         self.utf8
425     }
426 
427     /// Returns the "nest limit" setting.
get_nest_limit(&self) -> u32428     pub fn get_nest_limit(&self) -> u32 {
429         self.nest_limit
430     }
431 
432     /// Returns whether "octal" mode is enabled.
get_octal(&self) -> bool433     pub fn get_octal(&self) -> bool {
434         self.octal
435     }
436 
437     /// Applies this configuration to the given parser.
apply(&self, builder: &mut ParserBuilder)438     pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
439         builder
440             .unicode(self.unicode)
441             .case_insensitive(self.case_insensitive)
442             .multi_line(self.multi_line)
443             .dot_matches_new_line(self.dot_matches_new_line)
444             .crlf(self.crlf)
445             .line_terminator(self.line_terminator)
446             .swap_greed(self.swap_greed)
447             .ignore_whitespace(self.ignore_whitespace)
448             .utf8(self.utf8)
449             .nest_limit(self.nest_limit)
450             .octal(self.octal);
451     }
452 
453     /// Applies this configuration to the given AST parser.
apply_ast(&self, builder: &mut ast::parse::ParserBuilder)454     pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) {
455         builder
456             .ignore_whitespace(self.ignore_whitespace)
457             .nest_limit(self.nest_limit)
458             .octal(self.octal);
459     }
460 
461     /// Applies this configuration to the given AST-to-HIR translator.
apply_hir( &self, builder: &mut hir::translate::TranslatorBuilder, )462     pub(crate) fn apply_hir(
463         &self,
464         builder: &mut hir::translate::TranslatorBuilder,
465     ) {
466         builder
467             .unicode(self.unicode)
468             .case_insensitive(self.case_insensitive)
469             .multi_line(self.multi_line)
470             .crlf(self.crlf)
471             .dot_matches_new_line(self.dot_matches_new_line)
472             .line_terminator(self.line_terminator)
473             .swap_greed(self.swap_greed)
474             .utf8(self.utf8);
475     }
476 }
477 
478 impl Default for Config {
default() -> Config479     fn default() -> Config {
480         Config::new()
481     }
482 }
483