1 /*!
2 Defines a high-level intermediate representation for regular expressions.
3 */
4 use std::char;
5 use std::cmp;
6 use std::error;
7 use std::fmt;
8 use std::result;
9 use std::u8;
10 
11 use crate::ast::Span;
12 use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
13 use crate::unicode;
14 
15 pub use crate::hir::visitor::{visit, Visitor};
16 pub use crate::unicode::CaseFoldError;
17 
18 mod interval;
19 pub mod literal;
20 pub mod print;
21 pub mod translate;
22 mod visitor;
23 
24 /// An error that can occur while translating an `Ast` to a `Hir`.
25 #[derive(Clone, Debug, Eq, PartialEq)]
26 pub struct Error {
27     /// The kind of error.
28     kind: ErrorKind,
29     /// The original pattern that the translator's Ast was parsed from. Every
30     /// span in an error is a valid range into this string.
31     pattern: String,
32     /// The span of this error, derived from the Ast given to the translator.
33     span: Span,
34 }
35 
36 impl Error {
37     /// Return the type of this error.
kind(&self) -> &ErrorKind38     pub fn kind(&self) -> &ErrorKind {
39         &self.kind
40     }
41 
42     /// The original pattern string in which this error occurred.
43     ///
44     /// Every span reported by this error is reported in terms of this string.
pattern(&self) -> &str45     pub fn pattern(&self) -> &str {
46         &self.pattern
47     }
48 
49     /// Return the span at which this error occurred.
span(&self) -> &Span50     pub fn span(&self) -> &Span {
51         &self.span
52     }
53 }
54 
55 /// The type of an error that occurred while building an `Hir`.
56 #[derive(Clone, Debug, Eq, PartialEq)]
57 pub enum ErrorKind {
58     /// This error occurs when a Unicode feature is used when Unicode
59     /// support is disabled. For example `(?-u:\pL)` would trigger this error.
60     UnicodeNotAllowed,
61     /// This error occurs when translating a pattern that could match a byte
62     /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
63     InvalidUtf8,
64     /// This occurs when an unrecognized Unicode property name could not
65     /// be found.
66     UnicodePropertyNotFound,
67     /// This occurs when an unrecognized Unicode property value could not
68     /// be found.
69     UnicodePropertyValueNotFound,
70     /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
71     /// `\d`) could not be found. This can occur when the `unicode-perl`
72     /// crate feature is not enabled.
73     UnicodePerlClassNotFound,
74     /// This occurs when the Unicode simple case mapping tables are not
75     /// available, and the regular expression required Unicode aware case
76     /// insensitivity.
77     UnicodeCaseUnavailable,
78     /// This occurs when the translator attempts to construct a character class
79     /// that is empty.
80     ///
81     /// Note that this restriction in the translator may be removed in the
82     /// future.
83     EmptyClassNotAllowed,
84     /// Hints that destructuring should not be exhaustive.
85     ///
86     /// This enum may grow additional variants, so this makes sure clients
87     /// don't count on exhaustive matching. (Otherwise, adding a new variant
88     /// could break existing code.)
89     #[doc(hidden)]
90     __Nonexhaustive,
91 }
92 
93 impl ErrorKind {
94     // TODO: Remove this method entirely on the next breaking semver release.
95     #[allow(deprecated)]
description(&self) -> &str96     fn description(&self) -> &str {
97         use self::ErrorKind::*;
98         match *self {
99             UnicodeNotAllowed => "Unicode not allowed here",
100             InvalidUtf8 => "pattern can match invalid UTF-8",
101             UnicodePropertyNotFound => "Unicode property not found",
102             UnicodePropertyValueNotFound => "Unicode property value not found",
103             UnicodePerlClassNotFound => {
104                 "Unicode-aware Perl class not found \
105                  (make sure the unicode-perl feature is enabled)"
106             }
107             UnicodeCaseUnavailable => {
108                 "Unicode-aware case insensitivity matching is not available \
109                  (make sure the unicode-case feature is enabled)"
110             }
111             EmptyClassNotAllowed => "empty character classes are not allowed",
112             __Nonexhaustive => unreachable!(),
113         }
114     }
115 }
116 
117 impl error::Error for Error {
118     // TODO: Remove this method entirely on the next breaking semver release.
119     #[allow(deprecated)]
description(&self) -> &str120     fn description(&self) -> &str {
121         self.kind.description()
122     }
123 }
124 
125 impl fmt::Display for Error {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result126     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127         crate::error::Formatter::from(self).fmt(f)
128     }
129 }
130 
131 impl fmt::Display for ErrorKind {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result132     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133         // TODO: Remove this on the next breaking semver release.
134         #[allow(deprecated)]
135         f.write_str(self.description())
136     }
137 }
138 
139 /// A high-level intermediate representation (HIR) for a regular expression.
140 ///
141 /// The HIR of a regular expression represents an intermediate step between its
142 /// abstract syntax (a structured description of the concrete syntax) and
143 /// compiled byte codes. The purpose of HIR is to make regular expressions
144 /// easier to analyze. In particular, the AST is much more complex than the
145 /// HIR. For example, while an AST supports arbitrarily nested character
146 /// classes, the HIR will flatten all nested classes into a single set. The HIR
147 /// will also "compile away" every flag present in the concrete syntax. For
148 /// example, users of HIR expressions never need to worry about case folding;
149 /// it is handled automatically by the translator (e.g., by translating `(?i)A`
150 /// to `[aA]`).
151 ///
152 /// If the HIR was produced by a translator that disallows invalid UTF-8, then
153 /// the HIR is guaranteed to match UTF-8 exclusively.
154 ///
155 /// This type defines its own destructor that uses constant stack space and
156 /// heap space proportional to the size of the HIR.
157 ///
158 /// The specific type of an HIR expression can be accessed via its `kind`
159 /// or `into_kind` methods. This extra level of indirection exists for two
160 /// reasons:
161 ///
162 /// 1. Construction of an HIR expression *must* use the constructor methods
163 ///    on this `Hir` type instead of building the `HirKind` values directly.
164 ///    This permits construction to enforce invariants like "concatenations
165 ///    always consist of two or more sub-expressions."
166 /// 2. Every HIR expression contains attributes that are defined inductively,
167 ///    and can be computed cheaply during the construction process. For
168 ///    example, one such attribute is whether the expression must match at the
169 ///    beginning of the text.
170 ///
171 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
172 /// expression pattern string, and uses constant stack space and heap space
173 /// proportional to the size of the `Hir`.
174 #[derive(Clone, Debug, Eq, PartialEq)]
175 pub struct Hir {
176     /// The underlying HIR kind.
177     kind: HirKind,
178     /// Analysis info about this HIR, computed during construction.
179     info: HirInfo,
180 }
181 
182 /// The kind of an arbitrary `Hir` expression.
183 #[derive(Clone, Debug, Eq, PartialEq)]
184 pub enum HirKind {
185     /// The empty regular expression, which matches everything, including the
186     /// empty string.
187     Empty,
188     /// A single literal character that matches exactly this character.
189     Literal(Literal),
190     /// A single character class that matches any of the characters in the
191     /// class. A class can either consist of Unicode scalar values as
192     /// characters, or it can use bytes.
193     Class(Class),
194     /// An anchor assertion. An anchor assertion match always has zero length.
195     Anchor(Anchor),
196     /// A word boundary assertion, which may or may not be Unicode aware. A
197     /// word boundary assertion match always has zero length.
198     WordBoundary(WordBoundary),
199     /// A repetition operation applied to a child expression.
200     Repetition(Repetition),
201     /// A possibly capturing group, which contains a child expression.
202     Group(Group),
203     /// A concatenation of expressions. A concatenation always has at least two
204     /// child expressions.
205     ///
206     /// A concatenation matches only if each of its child expression matches
207     /// one after the other.
208     Concat(Vec<Hir>),
209     /// An alternation of expressions. An alternation always has at least two
210     /// child expressions.
211     ///
212     /// An alternation matches only if at least one of its child expression
213     /// matches. If multiple expressions match, then the leftmost is preferred.
214     Alternation(Vec<Hir>),
215 }
216 
217 impl Hir {
218     /// Returns a reference to the underlying HIR kind.
kind(&self) -> &HirKind219     pub fn kind(&self) -> &HirKind {
220         &self.kind
221     }
222 
223     /// Consumes ownership of this HIR expression and returns its underlying
224     /// `HirKind`.
into_kind(mut self) -> HirKind225     pub fn into_kind(mut self) -> HirKind {
226         use std::mem;
227         mem::replace(&mut self.kind, HirKind::Empty)
228     }
229 
230     /// Returns an empty HIR expression.
231     ///
232     /// An empty HIR expression always matches, including the empty string.
empty() -> Hir233     pub fn empty() -> Hir {
234         let mut info = HirInfo::new();
235         info.set_always_utf8(true);
236         info.set_all_assertions(true);
237         info.set_anchored_start(false);
238         info.set_anchored_end(false);
239         info.set_line_anchored_start(false);
240         info.set_line_anchored_end(false);
241         info.set_any_anchored_start(false);
242         info.set_any_anchored_end(false);
243         info.set_match_empty(true);
244         info.set_literal(false);
245         info.set_alternation_literal(false);
246         Hir { kind: HirKind::Empty, info }
247     }
248 
249     /// Creates a literal HIR expression.
250     ///
251     /// If the given literal has a `Byte` variant with an ASCII byte, then this
252     /// method panics. This enforces the invariant that `Byte` variants are
253     /// only used to express matching of invalid UTF-8.
literal(lit: Literal) -> Hir254     pub fn literal(lit: Literal) -> Hir {
255         if let Literal::Byte(b) = lit {
256             assert!(b > 0x7F);
257         }
258 
259         let mut info = HirInfo::new();
260         info.set_always_utf8(lit.is_unicode());
261         info.set_all_assertions(false);
262         info.set_anchored_start(false);
263         info.set_anchored_end(false);
264         info.set_line_anchored_start(false);
265         info.set_line_anchored_end(false);
266         info.set_any_anchored_start(false);
267         info.set_any_anchored_end(false);
268         info.set_match_empty(false);
269         info.set_literal(true);
270         info.set_alternation_literal(true);
271         Hir { kind: HirKind::Literal(lit), info }
272     }
273 
274     /// Creates a class HIR expression.
class(class: Class) -> Hir275     pub fn class(class: Class) -> Hir {
276         let mut info = HirInfo::new();
277         info.set_always_utf8(class.is_always_utf8());
278         info.set_all_assertions(false);
279         info.set_anchored_start(false);
280         info.set_anchored_end(false);
281         info.set_line_anchored_start(false);
282         info.set_line_anchored_end(false);
283         info.set_any_anchored_start(false);
284         info.set_any_anchored_end(false);
285         info.set_match_empty(false);
286         info.set_literal(false);
287         info.set_alternation_literal(false);
288         Hir { kind: HirKind::Class(class), info }
289     }
290 
291     /// Creates an anchor assertion HIR expression.
anchor(anchor: Anchor) -> Hir292     pub fn anchor(anchor: Anchor) -> Hir {
293         let mut info = HirInfo::new();
294         info.set_always_utf8(true);
295         info.set_all_assertions(true);
296         info.set_anchored_start(false);
297         info.set_anchored_end(false);
298         info.set_line_anchored_start(false);
299         info.set_line_anchored_end(false);
300         info.set_any_anchored_start(false);
301         info.set_any_anchored_end(false);
302         info.set_match_empty(true);
303         info.set_literal(false);
304         info.set_alternation_literal(false);
305         if let Anchor::StartText = anchor {
306             info.set_anchored_start(true);
307             info.set_line_anchored_start(true);
308             info.set_any_anchored_start(true);
309         }
310         if let Anchor::EndText = anchor {
311             info.set_anchored_end(true);
312             info.set_line_anchored_end(true);
313             info.set_any_anchored_end(true);
314         }
315         if let Anchor::StartLine = anchor {
316             info.set_line_anchored_start(true);
317         }
318         if let Anchor::EndLine = anchor {
319             info.set_line_anchored_end(true);
320         }
321         Hir { kind: HirKind::Anchor(anchor), info }
322     }
323 
324     /// Creates a word boundary assertion HIR expression.
word_boundary(word_boundary: WordBoundary) -> Hir325     pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
326         let mut info = HirInfo::new();
327         info.set_always_utf8(true);
328         info.set_all_assertions(true);
329         info.set_anchored_start(false);
330         info.set_anchored_end(false);
331         info.set_line_anchored_start(false);
332         info.set_line_anchored_end(false);
333         info.set_any_anchored_start(false);
334         info.set_any_anchored_end(false);
335         info.set_literal(false);
336         info.set_alternation_literal(false);
337         // A negated word boundary matches '', so that's fine. But \b does not
338         // match \b, so why do we say it can match the empty string? Well,
339         // because, if you search for \b against 'a', it will report [0, 0) and
340         // [1, 1) as matches, and both of those matches correspond to the empty
341         // string. Thus, only *certain* empty strings match \b, which similarly
342         // applies to \B.
343         info.set_match_empty(true);
344         // Negated ASCII word boundaries can match invalid UTF-8.
345         if let WordBoundary::AsciiNegate = word_boundary {
346             info.set_always_utf8(false);
347         }
348         Hir { kind: HirKind::WordBoundary(word_boundary), info }
349     }
350 
351     /// Creates a repetition HIR expression.
repetition(rep: Repetition) -> Hir352     pub fn repetition(rep: Repetition) -> Hir {
353         let mut info = HirInfo::new();
354         info.set_always_utf8(rep.hir.is_always_utf8());
355         info.set_all_assertions(rep.hir.is_all_assertions());
356         // If this operator can match the empty string, then it can never
357         // be anchored.
358         info.set_anchored_start(
359             !rep.is_match_empty() && rep.hir.is_anchored_start(),
360         );
361         info.set_anchored_end(
362             !rep.is_match_empty() && rep.hir.is_anchored_end(),
363         );
364         info.set_line_anchored_start(
365             !rep.is_match_empty() && rep.hir.is_anchored_start(),
366         );
367         info.set_line_anchored_end(
368             !rep.is_match_empty() && rep.hir.is_anchored_end(),
369         );
370         info.set_any_anchored_start(rep.hir.is_any_anchored_start());
371         info.set_any_anchored_end(rep.hir.is_any_anchored_end());
372         info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
373         info.set_literal(false);
374         info.set_alternation_literal(false);
375         Hir { kind: HirKind::Repetition(rep), info }
376     }
377 
378     /// Creates a group HIR expression.
group(group: Group) -> Hir379     pub fn group(group: Group) -> Hir {
380         let mut info = HirInfo::new();
381         info.set_always_utf8(group.hir.is_always_utf8());
382         info.set_all_assertions(group.hir.is_all_assertions());
383         info.set_anchored_start(group.hir.is_anchored_start());
384         info.set_anchored_end(group.hir.is_anchored_end());
385         info.set_line_anchored_start(group.hir.is_line_anchored_start());
386         info.set_line_anchored_end(group.hir.is_line_anchored_end());
387         info.set_any_anchored_start(group.hir.is_any_anchored_start());
388         info.set_any_anchored_end(group.hir.is_any_anchored_end());
389         info.set_match_empty(group.hir.is_match_empty());
390         info.set_literal(false);
391         info.set_alternation_literal(false);
392         Hir { kind: HirKind::Group(group), info }
393     }
394 
395     /// Returns the concatenation of the given expressions.
396     ///
397     /// This flattens the concatenation as appropriate.
concat(mut exprs: Vec<Hir>) -> Hir398     pub fn concat(mut exprs: Vec<Hir>) -> Hir {
399         match exprs.len() {
400             0 => Hir::empty(),
401             1 => exprs.pop().unwrap(),
402             _ => {
403                 let mut info = HirInfo::new();
404                 info.set_always_utf8(true);
405                 info.set_all_assertions(true);
406                 info.set_any_anchored_start(false);
407                 info.set_any_anchored_end(false);
408                 info.set_match_empty(true);
409                 info.set_literal(true);
410                 info.set_alternation_literal(true);
411 
412                 // Some attributes require analyzing all sub-expressions.
413                 for e in &exprs {
414                     let x = info.is_always_utf8() && e.is_always_utf8();
415                     info.set_always_utf8(x);
416 
417                     let x = info.is_all_assertions() && e.is_all_assertions();
418                     info.set_all_assertions(x);
419 
420                     let x = info.is_any_anchored_start()
421                         || e.is_any_anchored_start();
422                     info.set_any_anchored_start(x);
423 
424                     let x =
425                         info.is_any_anchored_end() || e.is_any_anchored_end();
426                     info.set_any_anchored_end(x);
427 
428                     let x = info.is_match_empty() && e.is_match_empty();
429                     info.set_match_empty(x);
430 
431                     let x = info.is_literal() && e.is_literal();
432                     info.set_literal(x);
433 
434                     let x = info.is_alternation_literal()
435                         && e.is_alternation_literal();
436                     info.set_alternation_literal(x);
437                 }
438                 // Anchored attributes require something slightly more
439                 // sophisticated. Normally, WLOG, to determine whether an
440                 // expression is anchored to the start, we'd only need to check
441                 // the first expression of a concatenation. However,
442                 // expressions like `$\b^` are still anchored to the start,
443                 // but the first expression in the concatenation *isn't*
444                 // anchored to the start. So the "first" expression to look at
445                 // is actually one that is either not an assertion or is
446                 // specifically the StartText assertion.
447                 info.set_anchored_start(
448                     exprs
449                         .iter()
450                         .take_while(|e| {
451                             e.is_anchored_start() || e.is_all_assertions()
452                         })
453                         .any(|e| e.is_anchored_start()),
454                 );
455                 // Similarly for the end anchor, but in reverse.
456                 info.set_anchored_end(
457                     exprs
458                         .iter()
459                         .rev()
460                         .take_while(|e| {
461                             e.is_anchored_end() || e.is_all_assertions()
462                         })
463                         .any(|e| e.is_anchored_end()),
464                 );
465                 // Repeat the process for line anchors.
466                 info.set_line_anchored_start(
467                     exprs
468                         .iter()
469                         .take_while(|e| {
470                             e.is_line_anchored_start() || e.is_all_assertions()
471                         })
472                         .any(|e| e.is_line_anchored_start()),
473                 );
474                 info.set_line_anchored_end(
475                     exprs
476                         .iter()
477                         .rev()
478                         .take_while(|e| {
479                             e.is_line_anchored_end() || e.is_all_assertions()
480                         })
481                         .any(|e| e.is_line_anchored_end()),
482                 );
483                 Hir { kind: HirKind::Concat(exprs), info }
484             }
485         }
486     }
487 
488     /// Returns the alternation of the given expressions.
489     ///
490     /// This flattens the alternation as appropriate.
alternation(mut exprs: Vec<Hir>) -> Hir491     pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
492         match exprs.len() {
493             0 => Hir::empty(),
494             1 => exprs.pop().unwrap(),
495             _ => {
496                 let mut info = HirInfo::new();
497                 info.set_always_utf8(true);
498                 info.set_all_assertions(true);
499                 info.set_anchored_start(true);
500                 info.set_anchored_end(true);
501                 info.set_line_anchored_start(true);
502                 info.set_line_anchored_end(true);
503                 info.set_any_anchored_start(false);
504                 info.set_any_anchored_end(false);
505                 info.set_match_empty(false);
506                 info.set_literal(false);
507                 info.set_alternation_literal(true);
508 
509                 // Some attributes require analyzing all sub-expressions.
510                 for e in &exprs {
511                     let x = info.is_always_utf8() && e.is_always_utf8();
512                     info.set_always_utf8(x);
513 
514                     let x = info.is_all_assertions() && e.is_all_assertions();
515                     info.set_all_assertions(x);
516 
517                     let x = info.is_anchored_start() && e.is_anchored_start();
518                     info.set_anchored_start(x);
519 
520                     let x = info.is_anchored_end() && e.is_anchored_end();
521                     info.set_anchored_end(x);
522 
523                     let x = info.is_line_anchored_start()
524                         && e.is_line_anchored_start();
525                     info.set_line_anchored_start(x);
526 
527                     let x = info.is_line_anchored_end()
528                         && e.is_line_anchored_end();
529                     info.set_line_anchored_end(x);
530 
531                     let x = info.is_any_anchored_start()
532                         || e.is_any_anchored_start();
533                     info.set_any_anchored_start(x);
534 
535                     let x =
536                         info.is_any_anchored_end() || e.is_any_anchored_end();
537                     info.set_any_anchored_end(x);
538 
539                     let x = info.is_match_empty() || e.is_match_empty();
540                     info.set_match_empty(x);
541 
542                     let x = info.is_alternation_literal() && e.is_literal();
543                     info.set_alternation_literal(x);
544                 }
545                 Hir { kind: HirKind::Alternation(exprs), info }
546             }
547         }
548     }
549 
550     /// Build an HIR expression for `.`.
551     ///
552     /// A `.` expression matches any character except for `\n`. To build an
553     /// expression that matches any character, including `\n`, use the `any`
554     /// method.
555     ///
556     /// If `bytes` is `true`, then this assumes characters are limited to a
557     /// single byte.
dot(bytes: bool) -> Hir558     pub fn dot(bytes: bool) -> Hir {
559         if bytes {
560             let mut cls = ClassBytes::empty();
561             cls.push(ClassBytesRange::new(b'\0', b'\x09'));
562             cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
563             Hir::class(Class::Bytes(cls))
564         } else {
565             let mut cls = ClassUnicode::empty();
566             cls.push(ClassUnicodeRange::new('\0', '\x09'));
567             cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
568             Hir::class(Class::Unicode(cls))
569         }
570     }
571 
572     /// Build an HIR expression for `(?s).`.
573     ///
574     /// A `(?s).` expression matches any character, including `\n`. To build an
575     /// expression that matches any character except for `\n`, then use the
576     /// `dot` method.
577     ///
578     /// If `bytes` is `true`, then this assumes characters are limited to a
579     /// single byte.
any(bytes: bool) -> Hir580     pub fn any(bytes: bool) -> Hir {
581         if bytes {
582             let mut cls = ClassBytes::empty();
583             cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
584             Hir::class(Class::Bytes(cls))
585         } else {
586             let mut cls = ClassUnicode::empty();
587             cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
588             Hir::class(Class::Unicode(cls))
589         }
590     }
591 
592     /// Return true if and only if this HIR will always match valid UTF-8.
593     ///
594     /// When this returns false, then it is possible for this HIR expression
595     /// to match invalid UTF-8.
is_always_utf8(&self) -> bool596     pub fn is_always_utf8(&self) -> bool {
597         self.info.is_always_utf8()
598     }
599 
600     /// Returns true if and only if this entire HIR expression is made up of
601     /// zero-width assertions.
602     ///
603     /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
604     /// not `^a`.
is_all_assertions(&self) -> bool605     pub fn is_all_assertions(&self) -> bool {
606         self.info.is_all_assertions()
607     }
608 
609     /// Return true if and only if this HIR is required to match from the
610     /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
611     /// `^foo|^bar` but not `^foo|bar`.
is_anchored_start(&self) -> bool612     pub fn is_anchored_start(&self) -> bool {
613         self.info.is_anchored_start()
614     }
615 
616     /// Return true if and only if this HIR is required to match at the end
617     /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
618     /// `foo$|bar$` but not `foo$|bar`.
is_anchored_end(&self) -> bool619     pub fn is_anchored_end(&self) -> bool {
620         self.info.is_anchored_end()
621     }
622 
623     /// Return true if and only if this HIR is required to match from the
624     /// beginning of text or the beginning of a line. This includes expressions
625     /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
626     /// but not `^foo|bar` or `(?m)^foo|bar`.
627     ///
628     /// Note that if `is_anchored_start` is `true`, then
629     /// `is_line_anchored_start` will also be `true`. The reverse implication
630     /// is not true. For example, `(?m)^foo` is line anchored, but not
631     /// `is_anchored_start`.
is_line_anchored_start(&self) -> bool632     pub fn is_line_anchored_start(&self) -> bool {
633         self.info.is_line_anchored_start()
634     }
635 
636     /// Return true if and only if this HIR is required to match at the
637     /// end of text or the end of a line. This includes expressions like
638     /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
639     /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
640     ///
641     /// Note that if `is_anchored_end` is `true`, then
642     /// `is_line_anchored_end` will also be `true`. The reverse implication
643     /// is not true. For example, `(?m)foo$` is line anchored, but not
644     /// `is_anchored_end`.
is_line_anchored_end(&self) -> bool645     pub fn is_line_anchored_end(&self) -> bool {
646         self.info.is_line_anchored_end()
647     }
648 
649     /// Return true if and only if this HIR contains any sub-expression that
650     /// is required to match at the beginning of text. Specifically, this
651     /// returns true if the `^` symbol (when multiline mode is disabled) or the
652     /// `\A` escape appear anywhere in the regex.
is_any_anchored_start(&self) -> bool653     pub fn is_any_anchored_start(&self) -> bool {
654         self.info.is_any_anchored_start()
655     }
656 
657     /// Return true if and only if this HIR contains any sub-expression that is
658     /// required to match at the end of text. Specifically, this returns true
659     /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
660     /// appear anywhere in the regex.
is_any_anchored_end(&self) -> bool661     pub fn is_any_anchored_end(&self) -> bool {
662         self.info.is_any_anchored_end()
663     }
664 
665     /// Return true if and only if the empty string is part of the language
666     /// matched by this regular expression.
667     ///
668     /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
669     /// and `\B`, but not `a` or `a+`.
is_match_empty(&self) -> bool670     pub fn is_match_empty(&self) -> bool {
671         self.info.is_match_empty()
672     }
673 
674     /// Return true if and only if this HIR is a simple literal. This is only
675     /// true when this HIR expression is either itself a `Literal` or a
676     /// concatenation of only `Literal`s.
677     ///
678     /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
679     /// `` are not (even though that contain sub-expressions that are literals).
is_literal(&self) -> bool680     pub fn is_literal(&self) -> bool {
681         self.info.is_literal()
682     }
683 
684     /// Return true if and only if this HIR is either a simple literal or an
685     /// alternation of simple literals. This is only
686     /// true when this HIR expression is either itself a `Literal` or a
687     /// concatenation of only `Literal`s or an alternation of only `Literal`s.
688     ///
689     /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
690     /// literals, but `f+`, `(foo)`, `foo()`, ``
691     /// are not (even though that contain sub-expressions that are literals).
is_alternation_literal(&self) -> bool692     pub fn is_alternation_literal(&self) -> bool {
693         self.info.is_alternation_literal()
694     }
695 }
696 
697 impl HirKind {
698     /// Return true if and only if this HIR is the empty regular expression.
699     ///
700     /// Note that this is not defined inductively. That is, it only tests if
701     /// this kind is the `Empty` variant. To get the inductive definition,
702     /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
is_empty(&self) -> bool703     pub fn is_empty(&self) -> bool {
704         match *self {
705             HirKind::Empty => true,
706             _ => false,
707         }
708     }
709 
710     /// Returns true if and only if this kind has any (including possibly
711     /// empty) subexpressions.
has_subexprs(&self) -> bool712     pub fn has_subexprs(&self) -> bool {
713         match *self {
714             HirKind::Empty
715             | HirKind::Literal(_)
716             | HirKind::Class(_)
717             | HirKind::Anchor(_)
718             | HirKind::WordBoundary(_) => false,
719             HirKind::Group(_)
720             | HirKind::Repetition(_)
721             | HirKind::Concat(_)
722             | HirKind::Alternation(_) => true,
723         }
724     }
725 }
726 
727 /// Print a display representation of this Hir.
728 ///
729 /// The result of this is a valid regular expression pattern string.
730 ///
731 /// This implementation uses constant stack space and heap space proportional
732 /// to the size of the `Hir`.
733 impl fmt::Display for Hir {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result734     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
735         use crate::hir::print::Printer;
736         Printer::new().print(self, f)
737     }
738 }
739 
740 /// The high-level intermediate representation of a literal.
741 ///
742 /// A literal corresponds to a single character, where a character is either
743 /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
744 /// are preferred whenever possible. In particular, a `Byte` variant is only
745 /// ever produced when it could match invalid UTF-8.
746 #[derive(Clone, Debug, Eq, PartialEq)]
747 pub enum Literal {
748     /// A single character represented by a Unicode scalar value.
749     Unicode(char),
750     /// A single character represented by an arbitrary byte.
751     Byte(u8),
752 }
753 
754 impl Literal {
755     /// Returns true if and only if this literal corresponds to a Unicode
756     /// scalar value.
is_unicode(&self) -> bool757     pub fn is_unicode(&self) -> bool {
758         match *self {
759             Literal::Unicode(_) => true,
760             Literal::Byte(b) if b <= 0x7F => true,
761             Literal::Byte(_) => false,
762         }
763     }
764 }
765 
766 /// The high-level intermediate representation of a character class.
767 ///
768 /// A character class corresponds to a set of characters. A character is either
769 /// defined by a Unicode scalar value or a byte. Unicode characters are used
770 /// by default, while bytes are used when Unicode mode (via the `u` flag) is
771 /// disabled.
772 ///
773 /// A character class, regardless of its character type, is represented by a
774 /// sequence of non-overlapping non-adjacent ranges of characters.
775 ///
776 /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
777 /// be produced even when it exclusively matches valid UTF-8. This is because
778 /// a `Bytes` variant represents an intention by the author of the regular
779 /// expression to disable Unicode mode, which in turn impacts the semantics of
780 /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
781 /// match the same set of strings.
782 #[derive(Clone, Debug, Eq, PartialEq)]
783 pub enum Class {
784     /// A set of characters represented by Unicode scalar values.
785     Unicode(ClassUnicode),
786     /// A set of characters represented by arbitrary bytes (one byte per
787     /// character).
788     Bytes(ClassBytes),
789 }
790 
791 impl Class {
792     /// Apply Unicode simple case folding to this character class, in place.
793     /// The character class will be expanded to include all simple case folded
794     /// character variants.
795     ///
796     /// If this is a byte oriented character class, then this will be limited
797     /// to the ASCII ranges `A-Z` and `a-z`.
case_fold_simple(&mut self)798     pub fn case_fold_simple(&mut self) {
799         match *self {
800             Class::Unicode(ref mut x) => x.case_fold_simple(),
801             Class::Bytes(ref mut x) => x.case_fold_simple(),
802         }
803     }
804 
805     /// Negate this character class in place.
806     ///
807     /// After completion, this character class will contain precisely the
808     /// characters that weren't previously in the class.
negate(&mut self)809     pub fn negate(&mut self) {
810         match *self {
811             Class::Unicode(ref mut x) => x.negate(),
812             Class::Bytes(ref mut x) => x.negate(),
813         }
814     }
815 
816     /// Returns true if and only if this character class will only ever match
817     /// valid UTF-8.
818     ///
819     /// A character class can match invalid UTF-8 only when the following
820     /// conditions are met:
821     ///
822     /// 1. The translator was configured to permit generating an expression
823     ///    that can match invalid UTF-8. (By default, this is disabled.)
824     /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
825     ///    syntax or in the parser builder. By default, Unicode mode is
826     ///    enabled.
is_always_utf8(&self) -> bool827     pub fn is_always_utf8(&self) -> bool {
828         match *self {
829             Class::Unicode(_) => true,
830             Class::Bytes(ref x) => x.is_all_ascii(),
831         }
832     }
833 }
834 
835 /// A set of characters represented by Unicode scalar values.
836 #[derive(Clone, Debug, Eq, PartialEq)]
837 pub struct ClassUnicode {
838     set: IntervalSet<ClassUnicodeRange>,
839 }
840 
841 impl ClassUnicode {
842     /// Create a new class from a sequence of ranges.
843     ///
844     /// The given ranges do not need to be in any specific order, and ranges
845     /// may overlap.
new<I>(ranges: I) -> ClassUnicode where I: IntoIterator<Item = ClassUnicodeRange>,846     pub fn new<I>(ranges: I) -> ClassUnicode
847     where
848         I: IntoIterator<Item = ClassUnicodeRange>,
849     {
850         ClassUnicode { set: IntervalSet::new(ranges) }
851     }
852 
853     /// Create a new class with no ranges.
empty() -> ClassUnicode854     pub fn empty() -> ClassUnicode {
855         ClassUnicode::new(vec![])
856     }
857 
858     /// Add a new range to this set.
push(&mut self, range: ClassUnicodeRange)859     pub fn push(&mut self, range: ClassUnicodeRange) {
860         self.set.push(range);
861     }
862 
863     /// Return an iterator over all ranges in this class.
864     ///
865     /// The iterator yields ranges in ascending order.
iter(&self) -> ClassUnicodeIter<'_>866     pub fn iter(&self) -> ClassUnicodeIter<'_> {
867         ClassUnicodeIter(self.set.iter())
868     }
869 
870     /// Return the underlying ranges as a slice.
ranges(&self) -> &[ClassUnicodeRange]871     pub fn ranges(&self) -> &[ClassUnicodeRange] {
872         self.set.intervals()
873     }
874 
875     /// Expand this character class such that it contains all case folded
876     /// characters, according to Unicode's "simple" mapping. For example, if
877     /// this class consists of the range `a-z`, then applying case folding will
878     /// result in the class containing both the ranges `a-z` and `A-Z`.
879     ///
880     /// # Panics
881     ///
882     /// This routine panics when the case mapping data necessary for this
883     /// routine to complete is unavailable. This occurs when the `unicode-case`
884     /// feature is not enabled.
885     ///
886     /// Callers should prefer using `try_case_fold_simple` instead, which will
887     /// return an error instead of panicking.
case_fold_simple(&mut self)888     pub fn case_fold_simple(&mut self) {
889         self.set
890             .case_fold_simple()
891             .expect("unicode-case feature must be enabled");
892     }
893 
894     /// Expand this character class such that it contains all case folded
895     /// characters, according to Unicode's "simple" mapping. For example, if
896     /// this class consists of the range `a-z`, then applying case folding will
897     /// result in the class containing both the ranges `a-z` and `A-Z`.
898     ///
899     /// # Error
900     ///
901     /// This routine returns an error when the case mapping data necessary
902     /// for this routine to complete is unavailable. This occurs when the
903     /// `unicode-case` feature is not enabled.
try_case_fold_simple( &mut self, ) -> result::Result<(), CaseFoldError>904     pub fn try_case_fold_simple(
905         &mut self,
906     ) -> result::Result<(), CaseFoldError> {
907         self.set.case_fold_simple()
908     }
909 
910     /// Negate this character class.
911     ///
912     /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
913     /// set, then it will not be in this set after negation.
negate(&mut self)914     pub fn negate(&mut self) {
915         self.set.negate();
916     }
917 
918     /// Union this character class with the given character class, in place.
union(&mut self, other: &ClassUnicode)919     pub fn union(&mut self, other: &ClassUnicode) {
920         self.set.union(&other.set);
921     }
922 
923     /// Intersect this character class with the given character class, in
924     /// place.
intersect(&mut self, other: &ClassUnicode)925     pub fn intersect(&mut self, other: &ClassUnicode) {
926         self.set.intersect(&other.set);
927     }
928 
929     /// Subtract the given character class from this character class, in place.
difference(&mut self, other: &ClassUnicode)930     pub fn difference(&mut self, other: &ClassUnicode) {
931         self.set.difference(&other.set);
932     }
933 
934     /// Compute the symmetric difference of the given character classes, in
935     /// place.
936     ///
937     /// This computes the symmetric difference of two character classes. This
938     /// removes all elements in this class that are also in the given class,
939     /// but all adds all elements from the given class that aren't in this
940     /// class. That is, the class will contain all elements in either class,
941     /// but will not contain any elements that are in both classes.
symmetric_difference(&mut self, other: &ClassUnicode)942     pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
943         self.set.symmetric_difference(&other.set);
944     }
945 
946     /// Returns true if and only if this character class will either match
947     /// nothing or only ASCII bytes. Stated differently, this returns false
948     /// if and only if this class contains a non-ASCII codepoint.
is_all_ascii(&self) -> bool949     pub fn is_all_ascii(&self) -> bool {
950         self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
951     }
952 }
953 
954 /// An iterator over all ranges in a Unicode character class.
955 ///
956 /// The lifetime `'a` refers to the lifetime of the underlying class.
957 #[derive(Debug)]
958 pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
959 
960 impl<'a> Iterator for ClassUnicodeIter<'a> {
961     type Item = &'a ClassUnicodeRange;
962 
next(&mut self) -> Option<&'a ClassUnicodeRange>963     fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
964         self.0.next()
965     }
966 }
967 
968 /// A single range of characters represented by Unicode scalar values.
969 ///
970 /// The range is closed. That is, the start and end of the range are included
971 /// in the range.
972 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
973 pub struct ClassUnicodeRange {
974     start: char,
975     end: char,
976 }
977 
978 impl fmt::Debug for ClassUnicodeRange {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result979     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
980         let start = if !self.start.is_whitespace() && !self.start.is_control()
981         {
982             self.start.to_string()
983         } else {
984             format!("0x{:X}", self.start as u32)
985         };
986         let end = if !self.end.is_whitespace() && !self.end.is_control() {
987             self.end.to_string()
988         } else {
989             format!("0x{:X}", self.end as u32)
990         };
991         f.debug_struct("ClassUnicodeRange")
992             .field("start", &start)
993             .field("end", &end)
994             .finish()
995     }
996 }
997 
998 impl Interval for ClassUnicodeRange {
999     type Bound = char;
1000 
1001     #[inline]
lower(&self) -> char1002     fn lower(&self) -> char {
1003         self.start
1004     }
1005     #[inline]
upper(&self) -> char1006     fn upper(&self) -> char {
1007         self.end
1008     }
1009     #[inline]
set_lower(&mut self, bound: char)1010     fn set_lower(&mut self, bound: char) {
1011         self.start = bound;
1012     }
1013     #[inline]
set_upper(&mut self, bound: char)1014     fn set_upper(&mut self, bound: char) {
1015         self.end = bound;
1016     }
1017 
1018     /// Apply simple case folding to this Unicode scalar value range.
1019     ///
1020     /// Additional ranges are appended to the given vector. Canonical ordering
1021     /// is *not* maintained in the given vector.
case_fold_simple( &self, ranges: &mut Vec<ClassUnicodeRange>, ) -> Result<(), unicode::CaseFoldError>1022     fn case_fold_simple(
1023         &self,
1024         ranges: &mut Vec<ClassUnicodeRange>,
1025     ) -> Result<(), unicode::CaseFoldError> {
1026         if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1027             return Ok(());
1028         }
1029         let start = self.start as u32;
1030         let end = (self.end as u32).saturating_add(1);
1031         let mut next_simple_cp = None;
1032         for cp in (start..end).filter_map(char::from_u32) {
1033             if next_simple_cp.map_or(false, |next| cp < next) {
1034                 continue;
1035             }
1036             let it = match unicode::simple_fold(cp)? {
1037                 Ok(it) => it,
1038                 Err(next) => {
1039                     next_simple_cp = next;
1040                     continue;
1041                 }
1042             };
1043             for cp_folded in it {
1044                 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1045             }
1046         }
1047         Ok(())
1048     }
1049 }
1050 
1051 impl ClassUnicodeRange {
1052     /// Create a new Unicode scalar value range for a character class.
1053     ///
1054     /// The returned range is always in a canonical form. That is, the range
1055     /// returned always satisfies the invariant that `start <= end`.
new(start: char, end: char) -> ClassUnicodeRange1056     pub fn new(start: char, end: char) -> ClassUnicodeRange {
1057         ClassUnicodeRange::create(start, end)
1058     }
1059 
1060     /// Return the start of this range.
1061     ///
1062     /// The start of a range is always less than or equal to the end of the
1063     /// range.
start(&self) -> char1064     pub fn start(&self) -> char {
1065         self.start
1066     }
1067 
1068     /// Return the end of this range.
1069     ///
1070     /// The end of a range is always greater than or equal to the start of the
1071     /// range.
end(&self) -> char1072     pub fn end(&self) -> char {
1073         self.end
1074     }
1075 }
1076 
1077 /// A set of characters represented by arbitrary bytes (where one byte
1078 /// corresponds to one character).
1079 #[derive(Clone, Debug, Eq, PartialEq)]
1080 pub struct ClassBytes {
1081     set: IntervalSet<ClassBytesRange>,
1082 }
1083 
1084 impl ClassBytes {
1085     /// Create a new class from a sequence of ranges.
1086     ///
1087     /// The given ranges do not need to be in any specific order, and ranges
1088     /// may overlap.
new<I>(ranges: I) -> ClassBytes where I: IntoIterator<Item = ClassBytesRange>,1089     pub fn new<I>(ranges: I) -> ClassBytes
1090     where
1091         I: IntoIterator<Item = ClassBytesRange>,
1092     {
1093         ClassBytes { set: IntervalSet::new(ranges) }
1094     }
1095 
1096     /// Create a new class with no ranges.
empty() -> ClassBytes1097     pub fn empty() -> ClassBytes {
1098         ClassBytes::new(vec![])
1099     }
1100 
1101     /// Add a new range to this set.
push(&mut self, range: ClassBytesRange)1102     pub fn push(&mut self, range: ClassBytesRange) {
1103         self.set.push(range);
1104     }
1105 
1106     /// Return an iterator over all ranges in this class.
1107     ///
1108     /// The iterator yields ranges in ascending order.
iter(&self) -> ClassBytesIter<'_>1109     pub fn iter(&self) -> ClassBytesIter<'_> {
1110         ClassBytesIter(self.set.iter())
1111     }
1112 
1113     /// Return the underlying ranges as a slice.
ranges(&self) -> &[ClassBytesRange]1114     pub fn ranges(&self) -> &[ClassBytesRange] {
1115         self.set.intervals()
1116     }
1117 
1118     /// Expand this character class such that it contains all case folded
1119     /// characters. For example, if this class consists of the range `a-z`,
1120     /// then applying case folding will result in the class containing both the
1121     /// ranges `a-z` and `A-Z`.
1122     ///
1123     /// Note that this only applies ASCII case folding, which is limited to the
1124     /// characters `a-z` and `A-Z`.
case_fold_simple(&mut self)1125     pub fn case_fold_simple(&mut self) {
1126         self.set.case_fold_simple().expect("ASCII case folding never fails");
1127     }
1128 
1129     /// Negate this byte class.
1130     ///
1131     /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1132     /// will not be in this set after negation.
negate(&mut self)1133     pub fn negate(&mut self) {
1134         self.set.negate();
1135     }
1136 
1137     /// Union this byte class with the given byte class, in place.
union(&mut self, other: &ClassBytes)1138     pub fn union(&mut self, other: &ClassBytes) {
1139         self.set.union(&other.set);
1140     }
1141 
1142     /// Intersect this byte class with the given byte class, in place.
intersect(&mut self, other: &ClassBytes)1143     pub fn intersect(&mut self, other: &ClassBytes) {
1144         self.set.intersect(&other.set);
1145     }
1146 
1147     /// Subtract the given byte class from this byte class, in place.
difference(&mut self, other: &ClassBytes)1148     pub fn difference(&mut self, other: &ClassBytes) {
1149         self.set.difference(&other.set);
1150     }
1151 
1152     /// Compute the symmetric difference of the given byte classes, in place.
1153     ///
1154     /// This computes the symmetric difference of two byte classes. This
1155     /// removes all elements in this class that are also in the given class,
1156     /// but all adds all elements from the given class that aren't in this
1157     /// class. That is, the class will contain all elements in either class,
1158     /// but will not contain any elements that are in both classes.
symmetric_difference(&mut self, other: &ClassBytes)1159     pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1160         self.set.symmetric_difference(&other.set);
1161     }
1162 
1163     /// Returns true if and only if this character class will either match
1164     /// nothing or only ASCII bytes. Stated differently, this returns false
1165     /// if and only if this class contains a non-ASCII byte.
is_all_ascii(&self) -> bool1166     pub fn is_all_ascii(&self) -> bool {
1167         self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1168     }
1169 }
1170 
1171 /// An iterator over all ranges in a byte character class.
1172 ///
1173 /// The lifetime `'a` refers to the lifetime of the underlying class.
1174 #[derive(Debug)]
1175 pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1176 
1177 impl<'a> Iterator for ClassBytesIter<'a> {
1178     type Item = &'a ClassBytesRange;
1179 
next(&mut self) -> Option<&'a ClassBytesRange>1180     fn next(&mut self) -> Option<&'a ClassBytesRange> {
1181         self.0.next()
1182     }
1183 }
1184 
1185 /// A single range of characters represented by arbitrary bytes.
1186 ///
1187 /// The range is closed. That is, the start and end of the range are included
1188 /// in the range.
1189 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1190 pub struct ClassBytesRange {
1191     start: u8,
1192     end: u8,
1193 }
1194 
1195 impl Interval for ClassBytesRange {
1196     type Bound = u8;
1197 
1198     #[inline]
lower(&self) -> u81199     fn lower(&self) -> u8 {
1200         self.start
1201     }
1202     #[inline]
upper(&self) -> u81203     fn upper(&self) -> u8 {
1204         self.end
1205     }
1206     #[inline]
set_lower(&mut self, bound: u8)1207     fn set_lower(&mut self, bound: u8) {
1208         self.start = bound;
1209     }
1210     #[inline]
set_upper(&mut self, bound: u8)1211     fn set_upper(&mut self, bound: u8) {
1212         self.end = bound;
1213     }
1214 
1215     /// Apply simple case folding to this byte range. Only ASCII case mappings
1216     /// (for a-z) are applied.
1217     ///
1218     /// Additional ranges are appended to the given vector. Canonical ordering
1219     /// is *not* maintained in the given vector.
case_fold_simple( &self, ranges: &mut Vec<ClassBytesRange>, ) -> Result<(), unicode::CaseFoldError>1220     fn case_fold_simple(
1221         &self,
1222         ranges: &mut Vec<ClassBytesRange>,
1223     ) -> Result<(), unicode::CaseFoldError> {
1224         if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1225             let lower = cmp::max(self.start, b'a');
1226             let upper = cmp::min(self.end, b'z');
1227             ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1228         }
1229         if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1230             let lower = cmp::max(self.start, b'A');
1231             let upper = cmp::min(self.end, b'Z');
1232             ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1233         }
1234         Ok(())
1235     }
1236 }
1237 
1238 impl ClassBytesRange {
1239     /// Create a new byte range for a character class.
1240     ///
1241     /// The returned range is always in a canonical form. That is, the range
1242     /// returned always satisfies the invariant that `start <= end`.
new(start: u8, end: u8) -> ClassBytesRange1243     pub fn new(start: u8, end: u8) -> ClassBytesRange {
1244         ClassBytesRange::create(start, end)
1245     }
1246 
1247     /// Return the start of this range.
1248     ///
1249     /// The start of a range is always less than or equal to the end of the
1250     /// range.
start(&self) -> u81251     pub fn start(&self) -> u8 {
1252         self.start
1253     }
1254 
1255     /// Return the end of this range.
1256     ///
1257     /// The end of a range is always greater than or equal to the start of the
1258     /// range.
end(&self) -> u81259     pub fn end(&self) -> u8 {
1260         self.end
1261     }
1262 }
1263 
1264 impl fmt::Debug for ClassBytesRange {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1265     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1266         let mut debug = f.debug_struct("ClassBytesRange");
1267         if self.start <= 0x7F {
1268             debug.field("start", &(self.start as char));
1269         } else {
1270             debug.field("start", &self.start);
1271         }
1272         if self.end <= 0x7F {
1273             debug.field("end", &(self.end as char));
1274         } else {
1275             debug.field("end", &self.end);
1276         }
1277         debug.finish()
1278     }
1279 }
1280 
1281 /// The high-level intermediate representation for an anchor assertion.
1282 ///
1283 /// A matching anchor assertion is always zero-length.
1284 #[derive(Clone, Debug, Eq, PartialEq)]
1285 pub enum Anchor {
1286     /// Match the beginning of a line or the beginning of text. Specifically,
1287     /// this matches at the starting position of the input, or at the position
1288     /// immediately following a `\n` character.
1289     StartLine,
1290     /// Match the end of a line or the end of text. Specifically,
1291     /// this matches at the end position of the input, or at the position
1292     /// immediately preceding a `\n` character.
1293     EndLine,
1294     /// Match the beginning of text. Specifically, this matches at the starting
1295     /// position of the input.
1296     StartText,
1297     /// Match the end of text. Specifically, this matches at the ending
1298     /// position of the input.
1299     EndText,
1300 }
1301 
1302 /// The high-level intermediate representation for a word-boundary assertion.
1303 ///
1304 /// A matching word boundary assertion is always zero-length.
1305 #[derive(Clone, Debug, Eq, PartialEq)]
1306 pub enum WordBoundary {
1307     /// Match a Unicode-aware word boundary. That is, this matches a position
1308     /// where the left adjacent character and right adjacent character
1309     /// correspond to a word and non-word or a non-word and word character.
1310     Unicode,
1311     /// Match a Unicode-aware negation of a word boundary.
1312     UnicodeNegate,
1313     /// Match an ASCII-only word boundary. That is, this matches a position
1314     /// where the left adjacent character and right adjacent character
1315     /// correspond to a word and non-word or a non-word and word character.
1316     Ascii,
1317     /// Match an ASCII-only negation of a word boundary.
1318     AsciiNegate,
1319 }
1320 
1321 impl WordBoundary {
1322     /// Returns true if and only if this word boundary assertion is negated.
is_negated(&self) -> bool1323     pub fn is_negated(&self) -> bool {
1324         match *self {
1325             WordBoundary::Unicode | WordBoundary::Ascii => false,
1326             WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
1327         }
1328     }
1329 }
1330 
1331 /// The high-level intermediate representation for a group.
1332 ///
1333 /// This represents one of three possible group types:
1334 ///
1335 /// 1. A non-capturing group (e.g., `(?:expr)`).
1336 /// 2. A capturing group (e.g., `(expr)`).
1337 /// 3. A named capturing group (e.g., `(?P<name>expr)`).
1338 #[derive(Clone, Debug, Eq, PartialEq)]
1339 pub struct Group {
1340     /// The kind of this group. If it is a capturing group, then the kind
1341     /// contains the capture group index (and the name, if it is a named
1342     /// group).
1343     pub kind: GroupKind,
1344     /// The expression inside the capturing group, which may be empty.
1345     pub hir: Box<Hir>,
1346 }
1347 
1348 /// The kind of group.
1349 #[derive(Clone, Debug, Eq, PartialEq)]
1350 pub enum GroupKind {
1351     /// A normal unnamed capturing group.
1352     ///
1353     /// The value is the capture index of the group.
1354     CaptureIndex(u32),
1355     /// A named capturing group.
1356     CaptureName {
1357         /// The name of the group.
1358         name: String,
1359         /// The capture index of the group.
1360         index: u32,
1361     },
1362     /// A non-capturing group.
1363     NonCapturing,
1364 }
1365 
1366 /// The high-level intermediate representation of a repetition operator.
1367 ///
1368 /// A repetition operator permits the repetition of an arbitrary
1369 /// sub-expression.
1370 #[derive(Clone, Debug, Eq, PartialEq)]
1371 pub struct Repetition {
1372     /// The kind of this repetition operator.
1373     pub kind: RepetitionKind,
1374     /// Whether this repetition operator is greedy or not. A greedy operator
1375     /// will match as much as it can. A non-greedy operator will match as
1376     /// little as it can.
1377     ///
1378     /// Typically, operators are greedy by default and are only non-greedy when
1379     /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1380     /// not. However, this can be inverted via the `U` "ungreedy" flag.
1381     pub greedy: bool,
1382     /// The expression being repeated.
1383     pub hir: Box<Hir>,
1384 }
1385 
1386 impl Repetition {
1387     /// Returns true if and only if this repetition operator makes it possible
1388     /// to match the empty string.
1389     ///
1390     /// Note that this is not defined inductively. For example, while `a*`
1391     /// will report `true`, `()+` will not, even though `()` matches the empty
1392     /// string and one or more occurrences of something that matches the empty
1393     /// string will always match the empty string. In order to get the
1394     /// inductive definition, see the corresponding method on
1395     /// [`Hir`](struct.Hir.html).
is_match_empty(&self) -> bool1396     pub fn is_match_empty(&self) -> bool {
1397         match self.kind {
1398             RepetitionKind::ZeroOrOne => true,
1399             RepetitionKind::ZeroOrMore => true,
1400             RepetitionKind::OneOrMore => false,
1401             RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1402             RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1403             RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1404         }
1405     }
1406 }
1407 
1408 /// The kind of a repetition operator.
1409 #[derive(Clone, Debug, Eq, PartialEq)]
1410 pub enum RepetitionKind {
1411     /// Matches a sub-expression zero or one times.
1412     ZeroOrOne,
1413     /// Matches a sub-expression zero or more times.
1414     ZeroOrMore,
1415     /// Matches a sub-expression one or more times.
1416     OneOrMore,
1417     /// Matches a sub-expression within a bounded range of times.
1418     Range(RepetitionRange),
1419 }
1420 
1421 /// The kind of a counted repetition operator.
1422 #[derive(Clone, Debug, Eq, PartialEq)]
1423 pub enum RepetitionRange {
1424     /// Matches a sub-expression exactly this many times.
1425     Exactly(u32),
1426     /// Matches a sub-expression at least this many times.
1427     AtLeast(u32),
1428     /// Matches a sub-expression at least `m` times and at most `n` times.
1429     Bounded(u32, u32),
1430 }
1431 
1432 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1433 /// space but heap space proportional to the depth of the total `Hir`.
1434 impl Drop for Hir {
drop(&mut self)1435     fn drop(&mut self) {
1436         use std::mem;
1437 
1438         match *self.kind() {
1439             HirKind::Empty
1440             | HirKind::Literal(_)
1441             | HirKind::Class(_)
1442             | HirKind::Anchor(_)
1443             | HirKind::WordBoundary(_) => return,
1444             HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1445             HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1446             HirKind::Concat(ref x) if x.is_empty() => return,
1447             HirKind::Alternation(ref x) if x.is_empty() => return,
1448             _ => {}
1449         }
1450 
1451         let mut stack = vec![mem::replace(self, Hir::empty())];
1452         while let Some(mut expr) = stack.pop() {
1453             match expr.kind {
1454                 HirKind::Empty
1455                 | HirKind::Literal(_)
1456                 | HirKind::Class(_)
1457                 | HirKind::Anchor(_)
1458                 | HirKind::WordBoundary(_) => {}
1459                 HirKind::Group(ref mut x) => {
1460                     stack.push(mem::replace(&mut x.hir, Hir::empty()));
1461                 }
1462                 HirKind::Repetition(ref mut x) => {
1463                     stack.push(mem::replace(&mut x.hir, Hir::empty()));
1464                 }
1465                 HirKind::Concat(ref mut x) => {
1466                     stack.extend(x.drain(..));
1467                 }
1468                 HirKind::Alternation(ref mut x) => {
1469                     stack.extend(x.drain(..));
1470                 }
1471             }
1472         }
1473     }
1474 }
1475 
1476 /// A type that documents various attributes of an HIR expression.
1477 ///
1478 /// These attributes are typically defined inductively on the HIR.
1479 #[derive(Clone, Debug, Eq, PartialEq)]
1480 struct HirInfo {
1481     /// Represent yes/no questions by a bitfield to conserve space, since
1482     /// this is included in every HIR expression.
1483     ///
1484     /// If more attributes need to be added, it is OK to increase the size of
1485     /// this as appropriate.
1486     bools: u16,
1487 }
1488 
1489 // A simple macro for defining bitfield accessors/mutators.
1490 macro_rules! define_bool {
1491     ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1492         fn $is_fn_name(&self) -> bool {
1493             self.bools & (0b1 << $bit) > 0
1494         }
1495 
1496         fn $set_fn_name(&mut self, yes: bool) {
1497             if yes {
1498                 self.bools |= 1 << $bit;
1499             } else {
1500                 self.bools &= !(1 << $bit);
1501             }
1502         }
1503     };
1504 }
1505 
1506 impl HirInfo {
new() -> HirInfo1507     fn new() -> HirInfo {
1508         HirInfo { bools: 0 }
1509     }
1510 
1511     define_bool!(0, is_always_utf8, set_always_utf8);
1512     define_bool!(1, is_all_assertions, set_all_assertions);
1513     define_bool!(2, is_anchored_start, set_anchored_start);
1514     define_bool!(3, is_anchored_end, set_anchored_end);
1515     define_bool!(4, is_line_anchored_start, set_line_anchored_start);
1516     define_bool!(5, is_line_anchored_end, set_line_anchored_end);
1517     define_bool!(6, is_any_anchored_start, set_any_anchored_start);
1518     define_bool!(7, is_any_anchored_end, set_any_anchored_end);
1519     define_bool!(8, is_match_empty, set_match_empty);
1520     define_bool!(9, is_literal, set_literal);
1521     define_bool!(10, is_alternation_literal, set_alternation_literal);
1522 }
1523 
1524 #[cfg(test)]
1525 mod tests {
1526     use super::*;
1527 
uclass(ranges: &[(char, char)]) -> ClassUnicode1528     fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1529         let ranges: Vec<ClassUnicodeRange> = ranges
1530             .iter()
1531             .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1532             .collect();
1533         ClassUnicode::new(ranges)
1534     }
1535 
bclass(ranges: &[(u8, u8)]) -> ClassBytes1536     fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1537         let ranges: Vec<ClassBytesRange> =
1538             ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
1539         ClassBytes::new(ranges)
1540     }
1541 
uranges(cls: &ClassUnicode) -> Vec<(char, char)>1542     fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1543         cls.iter().map(|x| (x.start(), x.end())).collect()
1544     }
1545 
1546     #[cfg(feature = "unicode-case")]
ucasefold(cls: &ClassUnicode) -> ClassUnicode1547     fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1548         let mut cls_ = cls.clone();
1549         cls_.case_fold_simple();
1550         cls_
1551     }
1552 
uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1553     fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1554         let mut cls_ = cls1.clone();
1555         cls_.union(cls2);
1556         cls_
1557     }
1558 
uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1559     fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1560         let mut cls_ = cls1.clone();
1561         cls_.intersect(cls2);
1562         cls_
1563     }
1564 
udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1565     fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1566         let mut cls_ = cls1.clone();
1567         cls_.difference(cls2);
1568         cls_
1569     }
1570 
usymdifference( cls1: &ClassUnicode, cls2: &ClassUnicode, ) -> ClassUnicode1571     fn usymdifference(
1572         cls1: &ClassUnicode,
1573         cls2: &ClassUnicode,
1574     ) -> ClassUnicode {
1575         let mut cls_ = cls1.clone();
1576         cls_.symmetric_difference(cls2);
1577         cls_
1578     }
1579 
unegate(cls: &ClassUnicode) -> ClassUnicode1580     fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1581         let mut cls_ = cls.clone();
1582         cls_.negate();
1583         cls_
1584     }
1585 
branges(cls: &ClassBytes) -> Vec<(u8, u8)>1586     fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1587         cls.iter().map(|x| (x.start(), x.end())).collect()
1588     }
1589 
bcasefold(cls: &ClassBytes) -> ClassBytes1590     fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1591         let mut cls_ = cls.clone();
1592         cls_.case_fold_simple();
1593         cls_
1594     }
1595 
bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1596     fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1597         let mut cls_ = cls1.clone();
1598         cls_.union(cls2);
1599         cls_
1600     }
1601 
bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1602     fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1603         let mut cls_ = cls1.clone();
1604         cls_.intersect(cls2);
1605         cls_
1606     }
1607 
bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1608     fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1609         let mut cls_ = cls1.clone();
1610         cls_.difference(cls2);
1611         cls_
1612     }
1613 
bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1614     fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1615         let mut cls_ = cls1.clone();
1616         cls_.symmetric_difference(cls2);
1617         cls_
1618     }
1619 
bnegate(cls: &ClassBytes) -> ClassBytes1620     fn bnegate(cls: &ClassBytes) -> ClassBytes {
1621         let mut cls_ = cls.clone();
1622         cls_.negate();
1623         cls_
1624     }
1625 
1626     #[test]
class_range_canonical_unicode()1627     fn class_range_canonical_unicode() {
1628         let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1629         assert_eq!('\0', range.start());
1630         assert_eq!('\u{00FF}', range.end());
1631     }
1632 
1633     #[test]
class_range_canonical_bytes()1634     fn class_range_canonical_bytes() {
1635         let range = ClassBytesRange::new(b'\xFF', b'\0');
1636         assert_eq!(b'\0', range.start());
1637         assert_eq!(b'\xFF', range.end());
1638     }
1639 
1640     #[test]
class_canonicalize_unicode()1641     fn class_canonicalize_unicode() {
1642         let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1643         let expected = vec![('a', 'c'), ('x', 'z')];
1644         assert_eq!(expected, uranges(&cls));
1645 
1646         let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1647         let expected = vec![('a', 'c'), ('x', 'z')];
1648         assert_eq!(expected, uranges(&cls));
1649 
1650         let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1651         let expected = vec![('w', 'z')];
1652         assert_eq!(expected, uranges(&cls));
1653 
1654         let cls = uclass(&[
1655             ('c', 'f'),
1656             ('a', 'g'),
1657             ('d', 'j'),
1658             ('a', 'c'),
1659             ('m', 'p'),
1660             ('l', 's'),
1661         ]);
1662         let expected = vec![('a', 'j'), ('l', 's')];
1663         assert_eq!(expected, uranges(&cls));
1664 
1665         let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1666         let expected = vec![('u', 'z')];
1667         assert_eq!(expected, uranges(&cls));
1668 
1669         let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1670         let expected = vec![('\x00', '\u{10FFFF}')];
1671         assert_eq!(expected, uranges(&cls));
1672 
1673         let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1674         let expected = vec![('a', 'b')];
1675         assert_eq!(expected, uranges(&cls));
1676     }
1677 
1678     #[test]
class_canonicalize_bytes()1679     fn class_canonicalize_bytes() {
1680         let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1681         let expected = vec![(b'a', b'c'), (b'x', b'z')];
1682         assert_eq!(expected, branges(&cls));
1683 
1684         let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1685         let expected = vec![(b'a', b'c'), (b'x', b'z')];
1686         assert_eq!(expected, branges(&cls));
1687 
1688         let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1689         let expected = vec![(b'w', b'z')];
1690         assert_eq!(expected, branges(&cls));
1691 
1692         let cls = bclass(&[
1693             (b'c', b'f'),
1694             (b'a', b'g'),
1695             (b'd', b'j'),
1696             (b'a', b'c'),
1697             (b'm', b'p'),
1698             (b'l', b's'),
1699         ]);
1700         let expected = vec![(b'a', b'j'), (b'l', b's')];
1701         assert_eq!(expected, branges(&cls));
1702 
1703         let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1704         let expected = vec![(b'u', b'z')];
1705         assert_eq!(expected, branges(&cls));
1706 
1707         let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1708         let expected = vec![(b'\x00', b'\xFF')];
1709         assert_eq!(expected, branges(&cls));
1710 
1711         let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1712         let expected = vec![(b'a', b'b')];
1713         assert_eq!(expected, branges(&cls));
1714     }
1715 
1716     #[test]
1717     #[cfg(feature = "unicode-case")]
class_case_fold_unicode()1718     fn class_case_fold_unicode() {
1719         let cls = uclass(&[
1720             ('C', 'F'),
1721             ('A', 'G'),
1722             ('D', 'J'),
1723             ('A', 'C'),
1724             ('M', 'P'),
1725             ('L', 'S'),
1726             ('c', 'f'),
1727         ]);
1728         let expected = uclass(&[
1729             ('A', 'J'),
1730             ('L', 'S'),
1731             ('a', 'j'),
1732             ('l', 's'),
1733             ('\u{17F}', '\u{17F}'),
1734         ]);
1735         assert_eq!(expected, ucasefold(&cls));
1736 
1737         let cls = uclass(&[('A', 'Z')]);
1738         let expected = uclass(&[
1739             ('A', 'Z'),
1740             ('a', 'z'),
1741             ('\u{17F}', '\u{17F}'),
1742             ('\u{212A}', '\u{212A}'),
1743         ]);
1744         assert_eq!(expected, ucasefold(&cls));
1745 
1746         let cls = uclass(&[('a', 'z')]);
1747         let expected = uclass(&[
1748             ('A', 'Z'),
1749             ('a', 'z'),
1750             ('\u{17F}', '\u{17F}'),
1751             ('\u{212A}', '\u{212A}'),
1752         ]);
1753         assert_eq!(expected, ucasefold(&cls));
1754 
1755         let cls = uclass(&[('A', 'A'), ('_', '_')]);
1756         let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1757         assert_eq!(expected, ucasefold(&cls));
1758 
1759         let cls = uclass(&[('A', 'A'), ('=', '=')]);
1760         let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1761         assert_eq!(expected, ucasefold(&cls));
1762 
1763         let cls = uclass(&[('\x00', '\x10')]);
1764         assert_eq!(cls, ucasefold(&cls));
1765 
1766         let cls = uclass(&[('k', 'k')]);
1767         let expected =
1768             uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
1769         assert_eq!(expected, ucasefold(&cls));
1770 
1771         let cls = uclass(&[('@', '@')]);
1772         assert_eq!(cls, ucasefold(&cls));
1773     }
1774 
1775     #[test]
1776     #[cfg(not(feature = "unicode-case"))]
class_case_fold_unicode_disabled()1777     fn class_case_fold_unicode_disabled() {
1778         let mut cls = uclass(&[
1779             ('C', 'F'),
1780             ('A', 'G'),
1781             ('D', 'J'),
1782             ('A', 'C'),
1783             ('M', 'P'),
1784             ('L', 'S'),
1785             ('c', 'f'),
1786         ]);
1787         assert!(cls.try_case_fold_simple().is_err());
1788     }
1789 
1790     #[test]
1791     #[should_panic]
1792     #[cfg(not(feature = "unicode-case"))]
class_case_fold_unicode_disabled_panics()1793     fn class_case_fold_unicode_disabled_panics() {
1794         let mut cls = uclass(&[
1795             ('C', 'F'),
1796             ('A', 'G'),
1797             ('D', 'J'),
1798             ('A', 'C'),
1799             ('M', 'P'),
1800             ('L', 'S'),
1801             ('c', 'f'),
1802         ]);
1803         cls.case_fold_simple();
1804     }
1805 
1806     #[test]
class_case_fold_bytes()1807     fn class_case_fold_bytes() {
1808         let cls = bclass(&[
1809             (b'C', b'F'),
1810             (b'A', b'G'),
1811             (b'D', b'J'),
1812             (b'A', b'C'),
1813             (b'M', b'P'),
1814             (b'L', b'S'),
1815             (b'c', b'f'),
1816         ]);
1817         let expected =
1818             bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
1819         assert_eq!(expected, bcasefold(&cls));
1820 
1821         let cls = bclass(&[(b'A', b'Z')]);
1822         let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1823         assert_eq!(expected, bcasefold(&cls));
1824 
1825         let cls = bclass(&[(b'a', b'z')]);
1826         let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1827         assert_eq!(expected, bcasefold(&cls));
1828 
1829         let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1830         let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1831         assert_eq!(expected, bcasefold(&cls));
1832 
1833         let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1834         let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1835         assert_eq!(expected, bcasefold(&cls));
1836 
1837         let cls = bclass(&[(b'\x00', b'\x10')]);
1838         assert_eq!(cls, bcasefold(&cls));
1839 
1840         let cls = bclass(&[(b'k', b'k')]);
1841         let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1842         assert_eq!(expected, bcasefold(&cls));
1843 
1844         let cls = bclass(&[(b'@', b'@')]);
1845         assert_eq!(cls, bcasefold(&cls));
1846     }
1847 
1848     #[test]
class_negate_unicode()1849     fn class_negate_unicode() {
1850         let cls = uclass(&[('a', 'a')]);
1851         let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
1852         assert_eq!(expected, unegate(&cls));
1853 
1854         let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1855         let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
1856         assert_eq!(expected, unegate(&cls));
1857 
1858         let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1859         let expected = uclass(&[
1860             ('\x00', '\x60'),
1861             ('\x64', '\x77'),
1862             ('\x7B', '\u{10FFFF}'),
1863         ]);
1864         assert_eq!(expected, unegate(&cls));
1865 
1866         let cls = uclass(&[('\x00', 'a')]);
1867         let expected = uclass(&[('\x62', '\u{10FFFF}')]);
1868         assert_eq!(expected, unegate(&cls));
1869 
1870         let cls = uclass(&[('a', '\u{10FFFF}')]);
1871         let expected = uclass(&[('\x00', '\x60')]);
1872         assert_eq!(expected, unegate(&cls));
1873 
1874         let cls = uclass(&[('\x00', '\u{10FFFF}')]);
1875         let expected = uclass(&[]);
1876         assert_eq!(expected, unegate(&cls));
1877 
1878         let cls = uclass(&[]);
1879         let expected = uclass(&[('\x00', '\u{10FFFF}')]);
1880         assert_eq!(expected, unegate(&cls));
1881 
1882         let cls =
1883             uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
1884         let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
1885         assert_eq!(expected, unegate(&cls));
1886 
1887         let cls = uclass(&[('\x00', '\u{D7FF}')]);
1888         let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1889         assert_eq!(expected, unegate(&cls));
1890 
1891         let cls = uclass(&[('\x00', '\u{D7FE}')]);
1892         let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
1893         assert_eq!(expected, unegate(&cls));
1894 
1895         let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1896         let expected = uclass(&[('\x00', '\u{D7FF}')]);
1897         assert_eq!(expected, unegate(&cls));
1898 
1899         let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
1900         let expected = uclass(&[('\x00', '\u{E000}')]);
1901         assert_eq!(expected, unegate(&cls));
1902     }
1903 
1904     #[test]
class_negate_bytes()1905     fn class_negate_bytes() {
1906         let cls = bclass(&[(b'a', b'a')]);
1907         let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
1908         assert_eq!(expected, bnegate(&cls));
1909 
1910         let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1911         let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
1912         assert_eq!(expected, bnegate(&cls));
1913 
1914         let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1915         let expected = bclass(&[
1916             (b'\x00', b'\x60'),
1917             (b'\x64', b'\x77'),
1918             (b'\x7B', b'\xFF'),
1919         ]);
1920         assert_eq!(expected, bnegate(&cls));
1921 
1922         let cls = bclass(&[(b'\x00', b'a')]);
1923         let expected = bclass(&[(b'\x62', b'\xFF')]);
1924         assert_eq!(expected, bnegate(&cls));
1925 
1926         let cls = bclass(&[(b'a', b'\xFF')]);
1927         let expected = bclass(&[(b'\x00', b'\x60')]);
1928         assert_eq!(expected, bnegate(&cls));
1929 
1930         let cls = bclass(&[(b'\x00', b'\xFF')]);
1931         let expected = bclass(&[]);
1932         assert_eq!(expected, bnegate(&cls));
1933 
1934         let cls = bclass(&[]);
1935         let expected = bclass(&[(b'\x00', b'\xFF')]);
1936         assert_eq!(expected, bnegate(&cls));
1937 
1938         let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
1939         let expected = bclass(&[(b'\xFE', b'\xFE')]);
1940         assert_eq!(expected, bnegate(&cls));
1941     }
1942 
1943     #[test]
class_union_unicode()1944     fn class_union_unicode() {
1945         let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1946         let cls2 = uclass(&[('a', 'z')]);
1947         let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1948         assert_eq!(expected, uunion(&cls1, &cls2));
1949     }
1950 
1951     #[test]
class_union_bytes()1952     fn class_union_bytes() {
1953         let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1954         let cls2 = bclass(&[(b'a', b'z')]);
1955         let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1956         assert_eq!(expected, bunion(&cls1, &cls2));
1957     }
1958 
1959     #[test]
class_intersect_unicode()1960     fn class_intersect_unicode() {
1961         let cls1 = uclass(&[]);
1962         let cls2 = uclass(&[('a', 'a')]);
1963         let expected = uclass(&[]);
1964         assert_eq!(expected, uintersect(&cls1, &cls2));
1965 
1966         let cls1 = uclass(&[('a', 'a')]);
1967         let cls2 = uclass(&[('a', 'a')]);
1968         let expected = uclass(&[('a', 'a')]);
1969         assert_eq!(expected, uintersect(&cls1, &cls2));
1970 
1971         let cls1 = uclass(&[('a', 'a')]);
1972         let cls2 = uclass(&[('b', 'b')]);
1973         let expected = uclass(&[]);
1974         assert_eq!(expected, uintersect(&cls1, &cls2));
1975 
1976         let cls1 = uclass(&[('a', 'a')]);
1977         let cls2 = uclass(&[('a', 'c')]);
1978         let expected = uclass(&[('a', 'a')]);
1979         assert_eq!(expected, uintersect(&cls1, &cls2));
1980 
1981         let cls1 = uclass(&[('a', 'b')]);
1982         let cls2 = uclass(&[('a', 'c')]);
1983         let expected = uclass(&[('a', 'b')]);
1984         assert_eq!(expected, uintersect(&cls1, &cls2));
1985 
1986         let cls1 = uclass(&[('a', 'b')]);
1987         let cls2 = uclass(&[('b', 'c')]);
1988         let expected = uclass(&[('b', 'b')]);
1989         assert_eq!(expected, uintersect(&cls1, &cls2));
1990 
1991         let cls1 = uclass(&[('a', 'b')]);
1992         let cls2 = uclass(&[('c', 'd')]);
1993         let expected = uclass(&[]);
1994         assert_eq!(expected, uintersect(&cls1, &cls2));
1995 
1996         let cls1 = uclass(&[('b', 'c')]);
1997         let cls2 = uclass(&[('a', 'd')]);
1998         let expected = uclass(&[('b', 'c')]);
1999         assert_eq!(expected, uintersect(&cls1, &cls2));
2000 
2001         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2002         let cls2 = uclass(&[('a', 'h')]);
2003         let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2004         assert_eq!(expected, uintersect(&cls1, &cls2));
2005 
2006         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2007         let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2008         let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2009         assert_eq!(expected, uintersect(&cls1, &cls2));
2010 
2011         let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
2012         let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
2013         let expected = uclass(&[]);
2014         assert_eq!(expected, uintersect(&cls1, &cls2));
2015 
2016         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2017         let cls2 = uclass(&[('h', 'h')]);
2018         let expected = uclass(&[('h', 'h')]);
2019         assert_eq!(expected, uintersect(&cls1, &cls2));
2020 
2021         let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2022         let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2023         let expected = uclass(&[]);
2024         assert_eq!(expected, uintersect(&cls1, &cls2));
2025 
2026         let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2027         let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2028         let expected = uclass(&[('b', 'f')]);
2029         assert_eq!(expected, uintersect(&cls1, &cls2));
2030     }
2031 
2032     #[test]
class_intersect_bytes()2033     fn class_intersect_bytes() {
2034         let cls1 = bclass(&[]);
2035         let cls2 = bclass(&[(b'a', b'a')]);
2036         let expected = bclass(&[]);
2037         assert_eq!(expected, bintersect(&cls1, &cls2));
2038 
2039         let cls1 = bclass(&[(b'a', b'a')]);
2040         let cls2 = bclass(&[(b'a', b'a')]);
2041         let expected = bclass(&[(b'a', b'a')]);
2042         assert_eq!(expected, bintersect(&cls1, &cls2));
2043 
2044         let cls1 = bclass(&[(b'a', b'a')]);
2045         let cls2 = bclass(&[(b'b', b'b')]);
2046         let expected = bclass(&[]);
2047         assert_eq!(expected, bintersect(&cls1, &cls2));
2048 
2049         let cls1 = bclass(&[(b'a', b'a')]);
2050         let cls2 = bclass(&[(b'a', b'c')]);
2051         let expected = bclass(&[(b'a', b'a')]);
2052         assert_eq!(expected, bintersect(&cls1, &cls2));
2053 
2054         let cls1 = bclass(&[(b'a', b'b')]);
2055         let cls2 = bclass(&[(b'a', b'c')]);
2056         let expected = bclass(&[(b'a', b'b')]);
2057         assert_eq!(expected, bintersect(&cls1, &cls2));
2058 
2059         let cls1 = bclass(&[(b'a', b'b')]);
2060         let cls2 = bclass(&[(b'b', b'c')]);
2061         let expected = bclass(&[(b'b', b'b')]);
2062         assert_eq!(expected, bintersect(&cls1, &cls2));
2063 
2064         let cls1 = bclass(&[(b'a', b'b')]);
2065         let cls2 = bclass(&[(b'c', b'd')]);
2066         let expected = bclass(&[]);
2067         assert_eq!(expected, bintersect(&cls1, &cls2));
2068 
2069         let cls1 = bclass(&[(b'b', b'c')]);
2070         let cls2 = bclass(&[(b'a', b'd')]);
2071         let expected = bclass(&[(b'b', b'c')]);
2072         assert_eq!(expected, bintersect(&cls1, &cls2));
2073 
2074         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2075         let cls2 = bclass(&[(b'a', b'h')]);
2076         let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2077         assert_eq!(expected, bintersect(&cls1, &cls2));
2078 
2079         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2080         let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2081         let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2082         assert_eq!(expected, bintersect(&cls1, &cls2));
2083 
2084         let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
2085         let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
2086         let expected = bclass(&[]);
2087         assert_eq!(expected, bintersect(&cls1, &cls2));
2088 
2089         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2090         let cls2 = bclass(&[(b'h', b'h')]);
2091         let expected = bclass(&[(b'h', b'h')]);
2092         assert_eq!(expected, bintersect(&cls1, &cls2));
2093 
2094         let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
2095         let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
2096         let expected = bclass(&[]);
2097         assert_eq!(expected, bintersect(&cls1, &cls2));
2098 
2099         let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
2100         let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
2101         let expected = bclass(&[(b'b', b'f')]);
2102         assert_eq!(expected, bintersect(&cls1, &cls2));
2103     }
2104 
2105     #[test]
class_difference_unicode()2106     fn class_difference_unicode() {
2107         let cls1 = uclass(&[('a', 'a')]);
2108         let cls2 = uclass(&[('a', 'a')]);
2109         let expected = uclass(&[]);
2110         assert_eq!(expected, udifference(&cls1, &cls2));
2111 
2112         let cls1 = uclass(&[('a', 'a')]);
2113         let cls2 = uclass(&[]);
2114         let expected = uclass(&[('a', 'a')]);
2115         assert_eq!(expected, udifference(&cls1, &cls2));
2116 
2117         let cls1 = uclass(&[]);
2118         let cls2 = uclass(&[('a', 'a')]);
2119         let expected = uclass(&[]);
2120         assert_eq!(expected, udifference(&cls1, &cls2));
2121 
2122         let cls1 = uclass(&[('a', 'z')]);
2123         let cls2 = uclass(&[('a', 'a')]);
2124         let expected = uclass(&[('b', 'z')]);
2125         assert_eq!(expected, udifference(&cls1, &cls2));
2126 
2127         let cls1 = uclass(&[('a', 'z')]);
2128         let cls2 = uclass(&[('z', 'z')]);
2129         let expected = uclass(&[('a', 'y')]);
2130         assert_eq!(expected, udifference(&cls1, &cls2));
2131 
2132         let cls1 = uclass(&[('a', 'z')]);
2133         let cls2 = uclass(&[('m', 'm')]);
2134         let expected = uclass(&[('a', 'l'), ('n', 'z')]);
2135         assert_eq!(expected, udifference(&cls1, &cls2));
2136 
2137         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2138         let cls2 = uclass(&[('a', 'z')]);
2139         let expected = uclass(&[]);
2140         assert_eq!(expected, udifference(&cls1, &cls2));
2141 
2142         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2143         let cls2 = uclass(&[('d', 'v')]);
2144         let expected = uclass(&[('a', 'c')]);
2145         assert_eq!(expected, udifference(&cls1, &cls2));
2146 
2147         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2148         let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
2149         let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2150         assert_eq!(expected, udifference(&cls1, &cls2));
2151 
2152         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2153         let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
2154         let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2155         assert_eq!(expected, udifference(&cls1, &cls2));
2156 
2157         let cls1 = uclass(&[('x', 'z')]);
2158         let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2159         let expected = uclass(&[('x', 'z')]);
2160         assert_eq!(expected, udifference(&cls1, &cls2));
2161 
2162         let cls1 = uclass(&[('a', 'z')]);
2163         let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2164         let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
2165         assert_eq!(expected, udifference(&cls1, &cls2));
2166     }
2167 
2168     #[test]
class_difference_bytes()2169     fn class_difference_bytes() {
2170         let cls1 = bclass(&[(b'a', b'a')]);
2171         let cls2 = bclass(&[(b'a', b'a')]);
2172         let expected = bclass(&[]);
2173         assert_eq!(expected, bdifference(&cls1, &cls2));
2174 
2175         let cls1 = bclass(&[(b'a', b'a')]);
2176         let cls2 = bclass(&[]);
2177         let expected = bclass(&[(b'a', b'a')]);
2178         assert_eq!(expected, bdifference(&cls1, &cls2));
2179 
2180         let cls1 = bclass(&[]);
2181         let cls2 = bclass(&[(b'a', b'a')]);
2182         let expected = bclass(&[]);
2183         assert_eq!(expected, bdifference(&cls1, &cls2));
2184 
2185         let cls1 = bclass(&[(b'a', b'z')]);
2186         let cls2 = bclass(&[(b'a', b'a')]);
2187         let expected = bclass(&[(b'b', b'z')]);
2188         assert_eq!(expected, bdifference(&cls1, &cls2));
2189 
2190         let cls1 = bclass(&[(b'a', b'z')]);
2191         let cls2 = bclass(&[(b'z', b'z')]);
2192         let expected = bclass(&[(b'a', b'y')]);
2193         assert_eq!(expected, bdifference(&cls1, &cls2));
2194 
2195         let cls1 = bclass(&[(b'a', b'z')]);
2196         let cls2 = bclass(&[(b'm', b'm')]);
2197         let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
2198         assert_eq!(expected, bdifference(&cls1, &cls2));
2199 
2200         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2201         let cls2 = bclass(&[(b'a', b'z')]);
2202         let expected = bclass(&[]);
2203         assert_eq!(expected, bdifference(&cls1, &cls2));
2204 
2205         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2206         let cls2 = bclass(&[(b'd', b'v')]);
2207         let expected = bclass(&[(b'a', b'c')]);
2208         assert_eq!(expected, bdifference(&cls1, &cls2));
2209 
2210         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2211         let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
2212         let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2213         assert_eq!(expected, bdifference(&cls1, &cls2));
2214 
2215         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2216         let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
2217         let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2218         assert_eq!(expected, bdifference(&cls1, &cls2));
2219 
2220         let cls1 = bclass(&[(b'x', b'z')]);
2221         let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2222         let expected = bclass(&[(b'x', b'z')]);
2223         assert_eq!(expected, bdifference(&cls1, &cls2));
2224 
2225         let cls1 = bclass(&[(b'a', b'z')]);
2226         let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2227         let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
2228         assert_eq!(expected, bdifference(&cls1, &cls2));
2229     }
2230 
2231     #[test]
class_symmetric_difference_unicode()2232     fn class_symmetric_difference_unicode() {
2233         let cls1 = uclass(&[('a', 'm')]);
2234         let cls2 = uclass(&[('g', 't')]);
2235         let expected = uclass(&[('a', 'f'), ('n', 't')]);
2236         assert_eq!(expected, usymdifference(&cls1, &cls2));
2237     }
2238 
2239     #[test]
class_symmetric_difference_bytes()2240     fn class_symmetric_difference_bytes() {
2241         let cls1 = bclass(&[(b'a', b'm')]);
2242         let cls2 = bclass(&[(b'g', b't')]);
2243         let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2244         assert_eq!(expected, bsymdifference(&cls1, &cls2));
2245     }
2246 
2247     #[test]
2248     #[should_panic]
hir_byte_literal_non_ascii()2249     fn hir_byte_literal_non_ascii() {
2250         Hir::literal(Literal::Byte(b'a'));
2251     }
2252 
2253     // We use a thread with an explicit stack size to test that our destructor
2254     // for Hir can handle arbitrarily sized expressions in constant stack
2255     // space. In case we run on a platform without threads (WASM?), we limit
2256     // this test to Windows/Unix.
2257     #[test]
2258     #[cfg(any(unix, windows))]
no_stack_overflow_on_drop()2259     fn no_stack_overflow_on_drop() {
2260         use std::thread;
2261 
2262         let run = || {
2263             let mut expr = Hir::empty();
2264             for _ in 0..100 {
2265                 expr = Hir::group(Group {
2266                     kind: GroupKind::NonCapturing,
2267                     hir: Box::new(expr),
2268                 });
2269                 expr = Hir::repetition(Repetition {
2270                     kind: RepetitionKind::ZeroOrOne,
2271                     greedy: true,
2272                     hir: Box::new(expr),
2273                 });
2274 
2275                 expr = Hir {
2276                     kind: HirKind::Concat(vec![expr]),
2277                     info: HirInfo::new(),
2278                 };
2279                 expr = Hir {
2280                     kind: HirKind::Alternation(vec![expr]),
2281                     info: HirInfo::new(),
2282                 };
2283             }
2284             assert!(!expr.kind.is_empty());
2285         };
2286 
2287         // We run our test on a thread with a small stack size so we can
2288         // force the issue more easily.
2289         //
2290         // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
2291         // for context on the specific stack size chosen here.
2292         thread::Builder::new()
2293             .stack_size(16 << 10)
2294             .spawn(run)
2295             .unwrap()
2296             .join()
2297             .unwrap();
2298     }
2299 }
2300