1 /*!
2 Utilities for dealing with the syntax of a regular expression.
3
This module exposes a [`Config`] type that itself represents a wrapper around
the configuration for a
[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder), along with a few
convenience routines ([`parse`], [`parse_many`], [`parse_with`] and
[`parse_many_with`]) for parsing patterns into HIR values. The purpose of the
`Config` wrapper is to make configuring syntax options very similar to how
other configuration is done throughout this crate. Namely, instead of
duplicating syntax options across every builder (of which there are many), we
instead create small config objects like this one that can be passed around
and composed.
12 */
13
14 use alloc::{vec, vec::Vec};
15
16 use regex_syntax::{
17 ast,
18 hir::{self, Hir},
19 Error, ParserBuilder,
20 };
21
22 /// A convenience routine for parsing a pattern into an HIR value with the
23 /// default configuration.
24 ///
25 /// # Example
26 ///
27 /// This shows how to parse a pattern into an HIR value:
28 ///
29 /// ```
30 /// use regex_automata::util::syntax;
31 ///
32 /// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
33 /// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
34 ///
35 /// # Ok::<(), Box<dyn std::error::Error>>(())
36 /// ```
parse(pattern: &str) -> Result<Hir, Error>37 pub fn parse(pattern: &str) -> Result<Hir, Error> {
38 parse_with(pattern, &Config::default())
39 }
40
41 /// A convenience routine for parsing many patterns into HIR value with the
42 /// default configuration.
43 ///
44 /// # Example
45 ///
46 /// This shows how to parse many patterns into an corresponding HIR values:
47 ///
48 /// ```
49 /// use {
50 /// regex_automata::util::syntax,
51 /// regex_syntax::hir::Properties,
52 /// };
53 ///
54 /// let hirs = syntax::parse_many(&[
55 /// r"([a-z]+)|([0-9]+)",
56 /// r"foo(A-Z]+)bar",
57 /// ])?;
58 /// let props = Properties::union(hirs.iter().map(|h| h.properties()));
59 /// assert_eq!(Some(1), props.static_explicit_captures_len());
60 ///
61 /// # Ok::<(), Box<dyn std::error::Error>>(())
62 /// ```
parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error>63 pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
64 parse_many_with(patterns, &Config::default())
65 }
66
67 /// A convenience routine for parsing a pattern into an HIR value using a
68 /// `Config`.
69 ///
70 /// # Example
71 ///
72 /// This shows how to parse a pattern into an HIR value with a non-default
73 /// configuration:
74 ///
75 /// ```
76 /// use regex_automata::util::syntax;
77 ///
78 /// let hir = syntax::parse_with(
79 /// r"^[a-z]+$",
80 /// &syntax::Config::new().multi_line(true).crlf(true),
81 /// )?;
82 /// assert!(hir.properties().look_set().contains_anchor_crlf());
83 ///
84 /// # Ok::<(), Box<dyn std::error::Error>>(())
85 /// ```
parse_with(pattern: &str, config: &Config) -> Result<Hir, Error>86 pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
87 let mut builder = ParserBuilder::new();
88 config.apply(&mut builder);
89 builder.build().parse(pattern)
90 }
91
92 /// A convenience routine for parsing many patterns into HIR values using a
93 /// `Config`.
94 ///
95 /// # Example
96 ///
97 /// This shows how to parse many patterns into an corresponding HIR values
98 /// with a non-default configuration:
99 ///
100 /// ```
101 /// use {
102 /// regex_automata::util::syntax,
103 /// regex_syntax::hir::Properties,
104 /// };
105 ///
106 /// let patterns = &[
107 /// r"([a-z]+)|([0-9]+)",
108 /// r"\W",
109 /// r"foo(A-Z]+)bar",
110 /// ];
111 /// let config = syntax::Config::new().unicode(false).utf8(false);
112 /// let hirs = syntax::parse_many_with(patterns, &config)?;
113 /// let props = Properties::union(hirs.iter().map(|h| h.properties()));
114 /// assert!(!props.is_utf8());
115 ///
116 /// # Ok::<(), Box<dyn std::error::Error>>(())
117 /// ```
parse_many_with<P: AsRef<str>>( patterns: &[P], config: &Config, ) -> Result<Vec<Hir>, Error>118 pub fn parse_many_with<P: AsRef<str>>(
119 patterns: &[P],
120 config: &Config,
121 ) -> Result<Vec<Hir>, Error> {
122 let mut builder = ParserBuilder::new();
123 config.apply(&mut builder);
124 let mut hirs = vec![];
125 for p in patterns.iter() {
126 hirs.push(builder.build().parse(p.as_ref())?);
127 }
128 Ok(hirs)
129 }
130
/// The syntax options shared by every regex engine in this crate.
///
/// Each option here controls some aspect of how the concrete syntax of a
/// regular expression is interpreted. In general, the options are forwarded
/// to the [`ParserBuilder`](regex_syntax::ParserBuilder) in the
/// [`regex-syntax`](https://docs.rs/regex-syntax) crate when a regex is built
/// directly from its concrete syntax.
///
/// The options are bundled into a single type because they apply to every
/// regex engine in this crate. Rather than re-defining them on each engine's
/// builder, they are provided here as one cohesive unit.
#[derive(Clone, Copy, Debug)]
pub struct Config {
    // NOTE: the derived `Debug` output lists fields in declaration order, so
    // reordering these fields changes that output.
    //
    // Default state of the case insensitive (`i`) flag.
    case_insensitive: bool,
    // Default state of the multi-line (`m`) flag.
    multi_line: bool,
    // Default state of the "dot matches new line" (`s`) flag.
    dot_matches_new_line: bool,
    // Default state of the CRLF mode (`R`) flag.
    crlf: bool,
    // The byte that `.` refuses to match when it doesn't match everything.
    line_terminator: u8,
    // Default state of the swap greed (`U`) flag.
    swap_greed: bool,
    // Default state of verbose mode (the `x` flag).
    ignore_whitespace: bool,
    // Default state of the Unicode (`u`) flag.
    unicode: bool,
    // Whether patterns are restricted to matching only valid UTF-8.
    utf8: bool,
    // The nesting limit forwarded to the parser.
    nest_limit: u32,
    // Whether octal escape syntax is supported.
    octal: bool,
}
158
159 impl Config {
160 /// Return a new default syntax configuration.
new() -> Config161 pub fn new() -> Config {
162 // These defaults match the ones used in regex-syntax.
163 Config {
164 case_insensitive: false,
165 multi_line: false,
166 dot_matches_new_line: false,
167 crlf: false,
168 line_terminator: b'\n',
169 swap_greed: false,
170 ignore_whitespace: false,
171 unicode: true,
172 utf8: true,
173 nest_limit: 250,
174 octal: false,
175 }
176 }
177
178 /// Enable or disable the case insensitive flag by default.
179 ///
180 /// When Unicode mode is enabled, case insensitivity is Unicode-aware.
181 /// Specifically, it will apply the "simple" case folding rules as
182 /// specified by Unicode.
183 ///
184 /// By default this is disabled. It may alternatively be selectively
185 /// enabled in the regular expression itself via the `i` flag.
case_insensitive(mut self, yes: bool) -> Config186 pub fn case_insensitive(mut self, yes: bool) -> Config {
187 self.case_insensitive = yes;
188 self
189 }
190
191 /// Enable or disable the multi-line matching flag by default.
192 ///
193 /// When this is enabled, the `^` and `$` look-around assertions will
194 /// match immediately after and immediately before a new line character,
195 /// respectively. Note that the `\A` and `\z` look-around assertions are
196 /// unaffected by this setting and always correspond to matching at the
197 /// beginning and end of the input.
198 ///
199 /// By default this is disabled. It may alternatively be selectively
200 /// enabled in the regular expression itself via the `m` flag.
multi_line(mut self, yes: bool) -> Config201 pub fn multi_line(mut self, yes: bool) -> Config {
202 self.multi_line = yes;
203 self
204 }
205
206 /// Enable or disable the "dot matches any character" flag by default.
207 ///
208 /// When this is enabled, `.` will match any character. When it's disabled,
209 /// then `.` will match any character except for a new line character.
210 ///
211 /// Note that `.` is impacted by whether the "unicode" setting is enabled
212 /// or not. When Unicode is enabled (the default), `.` will match any UTF-8
213 /// encoding of any Unicode scalar value (sans a new line, depending on
214 /// whether this "dot matches new line" option is enabled). When Unicode
215 /// mode is disabled, `.` will match any byte instead. Because of this,
216 /// when Unicode mode is disabled, `.` can only be used when the "allow
217 /// invalid UTF-8" option is enabled, since `.` could otherwise match
218 /// invalid UTF-8.
219 ///
220 /// By default this is disabled. It may alternatively be selectively
221 /// enabled in the regular expression itself via the `s` flag.
dot_matches_new_line(mut self, yes: bool) -> Config222 pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
223 self.dot_matches_new_line = yes;
224 self
225 }
226
227 /// Enable or disable the "CRLF mode" flag by default.
228 ///
229 /// By default this is disabled. It may alternatively be selectively
230 /// enabled in the regular expression itself via the `R` flag.
231 ///
232 /// When CRLF mode is enabled, the following happens:
233 ///
234 /// * Unless `dot_matches_new_line` is enabled, `.` will match any character
235 /// except for `\r` and `\n`.
236 /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
237 /// `\r` and `\n` as line terminators. And in particular, neither will
238 /// match between a `\r` and a `\n`.
crlf(mut self, yes: bool) -> Config239 pub fn crlf(mut self, yes: bool) -> Config {
240 self.crlf = yes;
241 self
242 }
243
244 /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
245 ///
246 /// Namely, instead of `.` (by default) matching everything except for `\n`,
247 /// this will cause `.` to match everything except for the byte given.
248 ///
249 /// If `.` is used in a context where Unicode mode is enabled and this byte
250 /// isn't ASCII, then an error will be returned. When Unicode mode is
251 /// disabled, then any byte is permitted, but will return an error if UTF-8
252 /// mode is enabled and it is a non-ASCII byte.
253 ///
254 /// In short, any ASCII value for a line terminator is always okay. But a
255 /// non-ASCII byte might result in an error depending on whether Unicode
256 /// mode or UTF-8 mode are enabled.
257 ///
258 /// Note that if `R` mode is enabled then it always takes precedence and
259 /// the line terminator will be treated as `\r` and `\n` simultaneously.
260 ///
261 /// Note also that this *doesn't* impact the look-around assertions
262 /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
263 /// configuration in the regex engine itself.
line_terminator(mut self, byte: u8) -> Config264 pub fn line_terminator(mut self, byte: u8) -> Config {
265 self.line_terminator = byte;
266 self
267 }
268
269 /// Enable or disable the "swap greed" flag by default.
270 ///
271 /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
272 /// will become greedy.
273 ///
274 /// By default this is disabled. It may alternatively be selectively
275 /// enabled in the regular expression itself via the `U` flag.
swap_greed(mut self, yes: bool) -> Config276 pub fn swap_greed(mut self, yes: bool) -> Config {
277 self.swap_greed = yes;
278 self
279 }
280
281 /// Enable verbose mode in the regular expression.
282 ///
283 /// When enabled, verbose mode permits insigificant whitespace in many
284 /// places in the regular expression, as well as comments. Comments are
285 /// started using `#` and continue until the end of the line.
286 ///
287 /// By default, this is disabled. It may be selectively enabled in the
288 /// regular expression by using the `x` flag regardless of this setting.
ignore_whitespace(mut self, yes: bool) -> Config289 pub fn ignore_whitespace(mut self, yes: bool) -> Config {
290 self.ignore_whitespace = yes;
291 self
292 }
293
294 /// Enable or disable the Unicode flag (`u`) by default.
295 ///
296 /// By default this is **enabled**. It may alternatively be selectively
297 /// disabled in the regular expression itself via the `u` flag.
298 ///
299 /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
300 /// default), a regular expression will fail to parse if Unicode mode is
301 /// disabled and a sub-expression could possibly match invalid UTF-8.
302 ///
303 /// **WARNING**: Unicode mode can greatly increase the size of the compiled
304 /// DFA, which can noticeably impact both memory usage and compilation
305 /// time. This is especially noticeable if your regex contains character
306 /// classes like `\w` that are impacted by whether Unicode is enabled or
307 /// not. If Unicode is not necessary, you are encouraged to disable it.
unicode(mut self, yes: bool) -> Config308 pub fn unicode(mut self, yes: bool) -> Config {
309 self.unicode = yes;
310 self
311 }
312
313 /// When disabled, the builder will permit the construction of a regular
314 /// expression that may match invalid UTF-8.
315 ///
316 /// For example, when [`Config::unicode`] is disabled, then
317 /// expressions like `[^a]` may match invalid UTF-8 since they can match
318 /// any single byte that is not `a`. By default, these sub-expressions
319 /// are disallowed to avoid returning offsets that split a UTF-8
320 /// encoded codepoint. However, in cases where matching at arbitrary
321 /// locations is desired, this option can be disabled to permit all such
322 /// sub-expressions.
323 ///
324 /// When enabled (the default), the builder is guaranteed to produce a
325 /// regex that will only ever match valid UTF-8 (otherwise, the builder
326 /// will return an error).
utf8(mut self, yes: bool) -> Config327 pub fn utf8(mut self, yes: bool) -> Config {
328 self.utf8 = yes;
329 self
330 }
331
332 /// Set the nesting limit used for the regular expression parser.
333 ///
334 /// The nesting limit controls how deep the abstract syntax tree is allowed
335 /// to be. If the AST exceeds the given limit (e.g., with too many nested
336 /// groups), then an error is returned by the parser.
337 ///
338 /// The purpose of this limit is to act as a heuristic to prevent stack
339 /// overflow when building a finite automaton from a regular expression's
340 /// abstract syntax tree. In particular, construction currently uses
341 /// recursion. In the future, the implementation may stop using recursion
342 /// and this option will no longer be necessary.
343 ///
344 /// This limit is not checked until the entire AST is parsed. Therefore,
345 /// if callers want to put a limit on the amount of heap space used, then
346 /// they should impose a limit on the length, in bytes, of the concrete
347 /// pattern string. In particular, this is viable since the parser will
348 /// limit itself to heap space proportional to the length of the pattern
349 /// string.
350 ///
351 /// Note that a nest limit of `0` will return a nest limit error for most
352 /// patterns but not all. For example, a nest limit of `0` permits `a` but
353 /// not `ab`, since `ab` requires a concatenation AST item, which results
354 /// in a nest depth of `1`. In general, a nest limit is not something that
355 /// manifests in an obvious way in the concrete syntax, therefore, it
356 /// should not be used in a granular way.
nest_limit(mut self, limit: u32) -> Config357 pub fn nest_limit(mut self, limit: u32) -> Config {
358 self.nest_limit = limit;
359 self
360 }
361
362 /// Whether to support octal syntax or not.
363 ///
364 /// Octal syntax is a little-known way of uttering Unicode codepoints in
365 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
366 /// `\141` are all equivalent regular expressions, where the last example
367 /// shows octal syntax.
368 ///
369 /// While supporting octal syntax isn't in and of itself a problem, it does
370 /// make good error messages harder. That is, in PCRE based regex engines,
371 /// syntax like `\1` invokes a backreference, which is explicitly
372 /// unsupported in Rust's regex engine. However, many users expect it to
373 /// be supported. Therefore, when octal support is disabled, the error
374 /// message will explicitly mention that backreferences aren't supported.
375 ///
376 /// Octal syntax is disabled by default.
octal(mut self, yes: bool) -> Config377 pub fn octal(mut self, yes: bool) -> Config {
378 self.octal = yes;
379 self
380 }
381
382 /// Returns whether "unicode" mode is enabled.
get_unicode(&self) -> bool383 pub fn get_unicode(&self) -> bool {
384 self.unicode
385 }
386
387 /// Returns whether "case insensitive" mode is enabled.
get_case_insensitive(&self) -> bool388 pub fn get_case_insensitive(&self) -> bool {
389 self.case_insensitive
390 }
391
392 /// Returns whether "multi line" mode is enabled.
get_multi_line(&self) -> bool393 pub fn get_multi_line(&self) -> bool {
394 self.multi_line
395 }
396
397 /// Returns whether "dot matches new line" mode is enabled.
get_dot_matches_new_line(&self) -> bool398 pub fn get_dot_matches_new_line(&self) -> bool {
399 self.dot_matches_new_line
400 }
401
402 /// Returns whether "CRLF" mode is enabled.
get_crlf(&self) -> bool403 pub fn get_crlf(&self) -> bool {
404 self.crlf
405 }
406
407 /// Returns the line terminator in this syntax configuration.
get_line_terminator(&self) -> u8408 pub fn get_line_terminator(&self) -> u8 {
409 self.line_terminator
410 }
411
412 /// Returns whether "swap greed" mode is enabled.
get_swap_greed(&self) -> bool413 pub fn get_swap_greed(&self) -> bool {
414 self.swap_greed
415 }
416
417 /// Returns whether "ignore whitespace" mode is enabled.
get_ignore_whitespace(&self) -> bool418 pub fn get_ignore_whitespace(&self) -> bool {
419 self.ignore_whitespace
420 }
421
422 /// Returns whether UTF-8 mode is enabled.
get_utf8(&self) -> bool423 pub fn get_utf8(&self) -> bool {
424 self.utf8
425 }
426
427 /// Returns the "nest limit" setting.
get_nest_limit(&self) -> u32428 pub fn get_nest_limit(&self) -> u32 {
429 self.nest_limit
430 }
431
432 /// Returns whether "octal" mode is enabled.
get_octal(&self) -> bool433 pub fn get_octal(&self) -> bool {
434 self.octal
435 }
436
437 /// Applies this configuration to the given parser.
apply(&self, builder: &mut ParserBuilder)438 pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
439 builder
440 .unicode(self.unicode)
441 .case_insensitive(self.case_insensitive)
442 .multi_line(self.multi_line)
443 .dot_matches_new_line(self.dot_matches_new_line)
444 .crlf(self.crlf)
445 .line_terminator(self.line_terminator)
446 .swap_greed(self.swap_greed)
447 .ignore_whitespace(self.ignore_whitespace)
448 .utf8(self.utf8)
449 .nest_limit(self.nest_limit)
450 .octal(self.octal);
451 }
452
453 /// Applies this configuration to the given AST parser.
apply_ast(&self, builder: &mut ast::parse::ParserBuilder)454 pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) {
455 builder
456 .ignore_whitespace(self.ignore_whitespace)
457 .nest_limit(self.nest_limit)
458 .octal(self.octal);
459 }
460
461 /// Applies this configuration to the given AST-to-HIR translator.
apply_hir( &self, builder: &mut hir::translate::TranslatorBuilder, )462 pub(crate) fn apply_hir(
463 &self,
464 builder: &mut hir::translate::TranslatorBuilder,
465 ) {
466 builder
467 .unicode(self.unicode)
468 .case_insensitive(self.case_insensitive)
469 .multi_line(self.multi_line)
470 .crlf(self.crlf)
471 .dot_matches_new_line(self.dot_matches_new_line)
472 .line_terminator(self.line_terminator)
473 .swap_greed(self.swap_greed)
474 .utf8(self.utf8);
475 }
476 }
477
478 impl Default for Config {
default() -> Config479 fn default() -> Config {
480 Config::new()
481 }
482 }
483