1 /*! 2 Defines a high-level intermediate representation for regular expressions. 3 */ 4 use std::char; 5 use std::cmp; 6 use std::error; 7 use std::fmt; 8 use std::result; 9 use std::u8; 10 11 use crate::ast::Span; 12 use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; 13 use crate::unicode; 14 15 pub use crate::hir::visitor::{visit, Visitor}; 16 pub use crate::unicode::CaseFoldError; 17 18 mod interval; 19 pub mod literal; 20 pub mod print; 21 pub mod translate; 22 mod visitor; 23 24 /// An error that can occur while translating an `Ast` to a `Hir`. 25 #[derive(Clone, Debug, Eq, PartialEq)] 26 pub struct Error { 27 /// The kind of error. 28 kind: ErrorKind, 29 /// The original pattern that the translator's Ast was parsed from. Every 30 /// span in an error is a valid range into this string. 31 pattern: String, 32 /// The span of this error, derived from the Ast given to the translator. 33 span: Span, 34 } 35 36 impl Error { 37 /// Return the type of this error. kind(&self) -> &ErrorKind38 pub fn kind(&self) -> &ErrorKind { 39 &self.kind 40 } 41 42 /// The original pattern string in which this error occurred. 43 /// 44 /// Every span reported by this error is reported in terms of this string. pattern(&self) -> &str45 pub fn pattern(&self) -> &str { 46 &self.pattern 47 } 48 49 /// Return the span at which this error occurred. span(&self) -> &Span50 pub fn span(&self) -> &Span { 51 &self.span 52 } 53 } 54 55 /// The type of an error that occurred while building an `Hir`. 56 #[derive(Clone, Debug, Eq, PartialEq)] 57 pub enum ErrorKind { 58 /// This error occurs when a Unicode feature is used when Unicode 59 /// support is disabled. For example `(?-u:\pL)` would trigger this error. 60 UnicodeNotAllowed, 61 /// This error occurs when translating a pattern that could match a byte 62 /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. 63 InvalidUtf8, 64 /// This occurs when an unrecognized Unicode property name could not 65 /// be found. 66 UnicodePropertyNotFound, 67 /// This occurs when an unrecognized Unicode property value could not 68 /// be found. 69 UnicodePropertyValueNotFound, 70 /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or 71 /// `\d`) could not be found. This can occur when the `unicode-perl` 72 /// crate feature is not enabled. 73 UnicodePerlClassNotFound, 74 /// This occurs when the Unicode simple case mapping tables are not 75 /// available, and the regular expression required Unicode aware case 76 /// insensitivity. 77 UnicodeCaseUnavailable, 78 /// This occurs when the translator attempts to construct a character class 79 /// that is empty. 80 /// 81 /// Note that this restriction in the translator may be removed in the 82 /// future. 83 EmptyClassNotAllowed, 84 /// Hints that destructuring should not be exhaustive. 85 /// 86 /// This enum may grow additional variants, so this makes sure clients 87 /// don't count on exhaustive matching. (Otherwise, adding a new variant 88 /// could break existing code.) 89 #[doc(hidden)] 90 __Nonexhaustive, 91 } 92 93 impl ErrorKind { 94 // TODO: Remove this method entirely on the next breaking semver release. 95 #[allow(deprecated)] description(&self) -> &str96 fn description(&self) -> &str { 97 use self::ErrorKind::*; 98 match *self { 99 UnicodeNotAllowed => "Unicode not allowed here", 100 InvalidUtf8 => "pattern can match invalid UTF-8", 101 UnicodePropertyNotFound => "Unicode property not found", 102 UnicodePropertyValueNotFound => "Unicode property value not found", 103 UnicodePerlClassNotFound => { 104 "Unicode-aware Perl class not found \ 105 (make sure the unicode-perl feature is enabled)" 106 } 107 UnicodeCaseUnavailable => { 108 "Unicode-aware case insensitivity matching is not available \ 109 (make sure the unicode-case feature is enabled)" 110 } 111 EmptyClassNotAllowed => "empty character classes are not allowed", 112 __Nonexhaustive => unreachable!(), 113 } 114 } 115 } 116 117 impl error::Error for Error { 118 // TODO: Remove this method entirely on the next breaking semver release. 119 #[allow(deprecated)] description(&self) -> &str120 fn description(&self) -> &str { 121 self.kind.description() 122 } 123 } 124 125 impl fmt::Display for Error { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result126 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 127 crate::error::Formatter::from(self).fmt(f) 128 } 129 } 130 131 impl fmt::Display for ErrorKind { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result132 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 133 // TODO: Remove this on the next breaking semver release. 134 #[allow(deprecated)] 135 f.write_str(self.description()) 136 } 137 } 138 139 /// A high-level intermediate representation (HIR) for a regular expression. 140 /// 141 /// The HIR of a regular expression represents an intermediate step between its 142 /// abstract syntax (a structured description of the concrete syntax) and 143 /// compiled byte codes. The purpose of HIR is to make regular expressions 144 /// easier to analyze. In particular, the AST is much more complex than the 145 /// HIR. For example, while an AST supports arbitrarily nested character 146 /// classes, the HIR will flatten all nested classes into a single set. The HIR 147 /// will also "compile away" every flag present in the concrete syntax. For 148 /// example, users of HIR expressions never need to worry about case folding; 149 /// it is handled automatically by the translator (e.g., by translating `(?i)A` 150 /// to `[aA]`). 151 /// 152 /// If the HIR was produced by a translator that disallows invalid UTF-8, then 153 /// the HIR is guaranteed to match UTF-8 exclusively. 154 /// 155 /// This type defines its own destructor that uses constant stack space and 156 /// heap space proportional to the size of the HIR. 157 /// 158 /// The specific type of an HIR expression can be accessed via its `kind` 159 /// or `into_kind` methods. This extra level of indirection exists for two 160 /// reasons: 161 /// 162 /// 1. Construction of an HIR expression *must* use the constructor methods 163 /// on this `Hir` type instead of building the `HirKind` values directly. 164 /// This permits construction to enforce invariants like "concatenations 165 /// always consist of two or more sub-expressions." 166 /// 2. Every HIR expression contains attributes that are defined inductively, 167 /// and can be computed cheaply during the construction process. For 168 /// example, one such attribute is whether the expression must match at the 169 /// beginning of the text. 170 /// 171 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular 172 /// expression pattern string, and uses constant stack space and heap space 173 /// proportional to the size of the `Hir`. 174 #[derive(Clone, Debug, Eq, PartialEq)] 175 pub struct Hir { 176 /// The underlying HIR kind. 177 kind: HirKind, 178 /// Analysis info about this HIR, computed during construction. 179 info: HirInfo, 180 } 181 182 /// The kind of an arbitrary `Hir` expression. 183 #[derive(Clone, Debug, Eq, PartialEq)] 184 pub enum HirKind { 185 /// The empty regular expression, which matches everything, including the 186 /// empty string. 187 Empty, 188 /// A single literal character that matches exactly this character. 189 Literal(Literal), 190 /// A single character class that matches any of the characters in the 191 /// class. A class can either consist of Unicode scalar values as 192 /// characters, or it can use bytes. 193 Class(Class), 194 /// An anchor assertion. An anchor assertion match always has zero length. 195 Anchor(Anchor), 196 /// A word boundary assertion, which may or may not be Unicode aware. A 197 /// word boundary assertion match always has zero length. 198 WordBoundary(WordBoundary), 199 /// A repetition operation applied to a child expression. 200 Repetition(Repetition), 201 /// A possibly capturing group, which contains a child expression. 202 Group(Group), 203 /// A concatenation of expressions. A concatenation always has at least two 204 /// child expressions. 205 /// 206 /// A concatenation matches only if each of its child expression matches 207 /// one after the other. 208 Concat(Vec<Hir>), 209 /// An alternation of expressions. An alternation always has at least two 210 /// child expressions. 211 /// 212 /// An alternation matches only if at least one of its child expression 213 /// matches. If multiple expressions match, then the leftmost is preferred. 214 Alternation(Vec<Hir>), 215 } 216 217 impl Hir { 218 /// Returns a reference to the underlying HIR kind. kind(&self) -> &HirKind219 pub fn kind(&self) -> &HirKind { 220 &self.kind 221 } 222 223 /// Consumes ownership of this HIR expression and returns its underlying 224 /// `HirKind`. into_kind(mut self) -> HirKind225 pub fn into_kind(mut self) -> HirKind { 226 use std::mem; 227 mem::replace(&mut self.kind, HirKind::Empty) 228 } 229 230 /// Returns an empty HIR expression. 231 /// 232 /// An empty HIR expression always matches, including the empty string. empty() -> Hir233 pub fn empty() -> Hir { 234 let mut info = HirInfo::new(); 235 info.set_always_utf8(true); 236 info.set_all_assertions(true); 237 info.set_anchored_start(false); 238 info.set_anchored_end(false); 239 info.set_line_anchored_start(false); 240 info.set_line_anchored_end(false); 241 info.set_any_anchored_start(false); 242 info.set_any_anchored_end(false); 243 info.set_match_empty(true); 244 info.set_literal(false); 245 info.set_alternation_literal(false); 246 Hir { kind: HirKind::Empty, info } 247 } 248 249 /// Creates a literal HIR expression. 250 /// 251 /// If the given literal has a `Byte` variant with an ASCII byte, then this 252 /// method panics. This enforces the invariant that `Byte` variants are 253 /// only used to express matching of invalid UTF-8. literal(lit: Literal) -> Hir254 pub fn literal(lit: Literal) -> Hir { 255 if let Literal::Byte(b) = lit { 256 assert!(b > 0x7F); 257 } 258 259 let mut info = HirInfo::new(); 260 info.set_always_utf8(lit.is_unicode()); 261 info.set_all_assertions(false); 262 info.set_anchored_start(false); 263 info.set_anchored_end(false); 264 info.set_line_anchored_start(false); 265 info.set_line_anchored_end(false); 266 info.set_any_anchored_start(false); 267 info.set_any_anchored_end(false); 268 info.set_match_empty(false); 269 info.set_literal(true); 270 info.set_alternation_literal(true); 271 Hir { kind: HirKind::Literal(lit), info } 272 } 273 274 /// Creates a class HIR expression. class(class: Class) -> Hir275 pub fn class(class: Class) -> Hir { 276 let mut info = HirInfo::new(); 277 info.set_always_utf8(class.is_always_utf8()); 278 info.set_all_assertions(false); 279 info.set_anchored_start(false); 280 info.set_anchored_end(false); 281 info.set_line_anchored_start(false); 282 info.set_line_anchored_end(false); 283 info.set_any_anchored_start(false); 284 info.set_any_anchored_end(false); 285 info.set_match_empty(false); 286 info.set_literal(false); 287 info.set_alternation_literal(false); 288 Hir { kind: HirKind::Class(class), info } 289 } 290 291 /// Creates an anchor assertion HIR expression. anchor(anchor: Anchor) -> Hir292 pub fn anchor(anchor: Anchor) -> Hir { 293 let mut info = HirInfo::new(); 294 info.set_always_utf8(true); 295 info.set_all_assertions(true); 296 info.set_anchored_start(false); 297 info.set_anchored_end(false); 298 info.set_line_anchored_start(false); 299 info.set_line_anchored_end(false); 300 info.set_any_anchored_start(false); 301 info.set_any_anchored_end(false); 302 info.set_match_empty(true); 303 info.set_literal(false); 304 info.set_alternation_literal(false); 305 if let Anchor::StartText = anchor { 306 info.set_anchored_start(true); 307 info.set_line_anchored_start(true); 308 info.set_any_anchored_start(true); 309 } 310 if let Anchor::EndText = anchor { 311 info.set_anchored_end(true); 312 info.set_line_anchored_end(true); 313 info.set_any_anchored_end(true); 314 } 315 if let Anchor::StartLine = anchor { 316 info.set_line_anchored_start(true); 317 } 318 if let Anchor::EndLine = anchor { 319 info.set_line_anchored_end(true); 320 } 321 Hir { kind: HirKind::Anchor(anchor), info } 322 } 323 324 /// Creates a word boundary assertion HIR expression. word_boundary(word_boundary: WordBoundary) -> Hir325 pub fn word_boundary(word_boundary: WordBoundary) -> Hir { 326 let mut info = HirInfo::new(); 327 info.set_always_utf8(true); 328 info.set_all_assertions(true); 329 info.set_anchored_start(false); 330 info.set_anchored_end(false); 331 info.set_line_anchored_start(false); 332 info.set_line_anchored_end(false); 333 info.set_any_anchored_start(false); 334 info.set_any_anchored_end(false); 335 info.set_literal(false); 336 info.set_alternation_literal(false); 337 // A negated word boundary matches '', so that's fine. But \b does not 338 // match \b, so why do we say it can match the empty string? Well, 339 // because, if you search for \b against 'a', it will report [0, 0) and 340 // [1, 1) as matches, and both of those matches correspond to the empty 341 // string. Thus, only *certain* empty strings match \b, which similarly 342 // applies to \B. 343 info.set_match_empty(true); 344 // Negated ASCII word boundaries can match invalid UTF-8. 345 if let WordBoundary::AsciiNegate = word_boundary { 346 info.set_always_utf8(false); 347 } 348 Hir { kind: HirKind::WordBoundary(word_boundary), info } 349 } 350 351 /// Creates a repetition HIR expression. repetition(rep: Repetition) -> Hir352 pub fn repetition(rep: Repetition) -> Hir { 353 let mut info = HirInfo::new(); 354 info.set_always_utf8(rep.hir.is_always_utf8()); 355 info.set_all_assertions(rep.hir.is_all_assertions()); 356 // If this operator can match the empty string, then it can never 357 // be anchored. 358 info.set_anchored_start( 359 !rep.is_match_empty() && rep.hir.is_anchored_start(), 360 ); 361 info.set_anchored_end( 362 !rep.is_match_empty() && rep.hir.is_anchored_end(), 363 ); 364 info.set_line_anchored_start( 365 !rep.is_match_empty() && rep.hir.is_anchored_start(), 366 ); 367 info.set_line_anchored_end( 368 !rep.is_match_empty() && rep.hir.is_anchored_end(), 369 ); 370 info.set_any_anchored_start(rep.hir.is_any_anchored_start()); 371 info.set_any_anchored_end(rep.hir.is_any_anchored_end()); 372 info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); 373 info.set_literal(false); 374 info.set_alternation_literal(false); 375 Hir { kind: HirKind::Repetition(rep), info } 376 } 377 378 /// Creates a group HIR expression. group(group: Group) -> Hir379 pub fn group(group: Group) -> Hir { 380 let mut info = HirInfo::new(); 381 info.set_always_utf8(group.hir.is_always_utf8()); 382 info.set_all_assertions(group.hir.is_all_assertions()); 383 info.set_anchored_start(group.hir.is_anchored_start()); 384 info.set_anchored_end(group.hir.is_anchored_end()); 385 info.set_line_anchored_start(group.hir.is_line_anchored_start()); 386 info.set_line_anchored_end(group.hir.is_line_anchored_end()); 387 info.set_any_anchored_start(group.hir.is_any_anchored_start()); 388 info.set_any_anchored_end(group.hir.is_any_anchored_end()); 389 info.set_match_empty(group.hir.is_match_empty()); 390 info.set_literal(false); 391 info.set_alternation_literal(false); 392 Hir { kind: HirKind::Group(group), info } 393 } 394 395 /// Returns the concatenation of the given expressions. 396 /// 397 /// This flattens the concatenation as appropriate. concat(mut exprs: Vec<Hir>) -> Hir398 pub fn concat(mut exprs: Vec<Hir>) -> Hir { 399 match exprs.len() { 400 0 => Hir::empty(), 401 1 => exprs.pop().unwrap(), 402 _ => { 403 let mut info = HirInfo::new(); 404 info.set_always_utf8(true); 405 info.set_all_assertions(true); 406 info.set_any_anchored_start(false); 407 info.set_any_anchored_end(false); 408 info.set_match_empty(true); 409 info.set_literal(true); 410 info.set_alternation_literal(true); 411 412 // Some attributes require analyzing all sub-expressions. 413 for e in &exprs { 414 let x = info.is_always_utf8() && e.is_always_utf8(); 415 info.set_always_utf8(x); 416 417 let x = info.is_all_assertions() && e.is_all_assertions(); 418 info.set_all_assertions(x); 419 420 let x = info.is_any_anchored_start() 421 || e.is_any_anchored_start(); 422 info.set_any_anchored_start(x); 423 424 let x = 425 info.is_any_anchored_end() || e.is_any_anchored_end(); 426 info.set_any_anchored_end(x); 427 428 let x = info.is_match_empty() && e.is_match_empty(); 429 info.set_match_empty(x); 430 431 let x = info.is_literal() && e.is_literal(); 432 info.set_literal(x); 433 434 let x = info.is_alternation_literal() 435 && e.is_alternation_literal(); 436 info.set_alternation_literal(x); 437 } 438 // Anchored attributes require something slightly more 439 // sophisticated. Normally, WLOG, to determine whether an 440 // expression is anchored to the start, we'd only need to check 441 // the first expression of a concatenation. However, 442 // expressions like `$\b^` are still anchored to the start, 443 // but the first expression in the concatenation *isn't* 444 // anchored to the start. So the "first" expression to look at 445 // is actually one that is either not an assertion or is 446 // specifically the StartText assertion. 447 info.set_anchored_start( 448 exprs 449 .iter() 450 .take_while(|e| { 451 e.is_anchored_start() || e.is_all_assertions() 452 }) 453 .any(|e| e.is_anchored_start()), 454 ); 455 // Similarly for the end anchor, but in reverse. 456 info.set_anchored_end( 457 exprs 458 .iter() 459 .rev() 460 .take_while(|e| { 461 e.is_anchored_end() || e.is_all_assertions() 462 }) 463 .any(|e| e.is_anchored_end()), 464 ); 465 // Repeat the process for line anchors. 466 info.set_line_anchored_start( 467 exprs 468 .iter() 469 .take_while(|e| { 470 e.is_line_anchored_start() || e.is_all_assertions() 471 }) 472 .any(|e| e.is_line_anchored_start()), 473 ); 474 info.set_line_anchored_end( 475 exprs 476 .iter() 477 .rev() 478 .take_while(|e| { 479 e.is_line_anchored_end() || e.is_all_assertions() 480 }) 481 .any(|e| e.is_line_anchored_end()), 482 ); 483 Hir { kind: HirKind::Concat(exprs), info } 484 } 485 } 486 } 487 488 /// Returns the alternation of the given expressions. 489 /// 490 /// This flattens the alternation as appropriate. alternation(mut exprs: Vec<Hir>) -> Hir491 pub fn alternation(mut exprs: Vec<Hir>) -> Hir { 492 match exprs.len() { 493 0 => Hir::empty(), 494 1 => exprs.pop().unwrap(), 495 _ => { 496 let mut info = HirInfo::new(); 497 info.set_always_utf8(true); 498 info.set_all_assertions(true); 499 info.set_anchored_start(true); 500 info.set_anchored_end(true); 501 info.set_line_anchored_start(true); 502 info.set_line_anchored_end(true); 503 info.set_any_anchored_start(false); 504 info.set_any_anchored_end(false); 505 info.set_match_empty(false); 506 info.set_literal(false); 507 info.set_alternation_literal(true); 508 509 // Some attributes require analyzing all sub-expressions. 510 for e in &exprs { 511 let x = info.is_always_utf8() && e.is_always_utf8(); 512 info.set_always_utf8(x); 513 514 let x = info.is_all_assertions() && e.is_all_assertions(); 515 info.set_all_assertions(x); 516 517 let x = info.is_anchored_start() && e.is_anchored_start(); 518 info.set_anchored_start(x); 519 520 let x = info.is_anchored_end() && e.is_anchored_end(); 521 info.set_anchored_end(x); 522 523 let x = info.is_line_anchored_start() 524 && e.is_line_anchored_start(); 525 info.set_line_anchored_start(x); 526 527 let x = info.is_line_anchored_end() 528 && e.is_line_anchored_end(); 529 info.set_line_anchored_end(x); 530 531 let x = info.is_any_anchored_start() 532 || e.is_any_anchored_start(); 533 info.set_any_anchored_start(x); 534 535 let x = 536 info.is_any_anchored_end() || e.is_any_anchored_end(); 537 info.set_any_anchored_end(x); 538 539 let x = info.is_match_empty() || e.is_match_empty(); 540 info.set_match_empty(x); 541 542 let x = info.is_alternation_literal() && e.is_literal(); 543 info.set_alternation_literal(x); 544 } 545 Hir { kind: HirKind::Alternation(exprs), info } 546 } 547 } 548 } 549 550 /// Build an HIR expression for `.`. 551 /// 552 /// A `.` expression matches any character except for `\n`. To build an 553 /// expression that matches any character, including `\n`, use the `any` 554 /// method. 555 /// 556 /// If `bytes` is `true`, then this assumes characters are limited to a 557 /// single byte. dot(bytes: bool) -> Hir558 pub fn dot(bytes: bool) -> Hir { 559 if bytes { 560 let mut cls = ClassBytes::empty(); 561 cls.push(ClassBytesRange::new(b'\0', b'\x09')); 562 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); 563 Hir::class(Class::Bytes(cls)) 564 } else { 565 let mut cls = ClassUnicode::empty(); 566 cls.push(ClassUnicodeRange::new('\0', '\x09')); 567 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); 568 Hir::class(Class::Unicode(cls)) 569 } 570 } 571 572 /// Build an HIR expression for `(?s).`. 573 /// 574 /// A `(?s).` expression matches any character, including `\n`. To build an 575 /// expression that matches any character except for `\n`, then use the 576 /// `dot` method. 577 /// 578 /// If `bytes` is `true`, then this assumes characters are limited to a 579 /// single byte. any(bytes: bool) -> Hir580 pub fn any(bytes: bool) -> Hir { 581 if bytes { 582 let mut cls = ClassBytes::empty(); 583 cls.push(ClassBytesRange::new(b'\0', b'\xFF')); 584 Hir::class(Class::Bytes(cls)) 585 } else { 586 let mut cls = ClassUnicode::empty(); 587 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); 588 Hir::class(Class::Unicode(cls)) 589 } 590 } 591 592 /// Return true if and only if this HIR will always match valid UTF-8. 593 /// 594 /// When this returns false, then it is possible for this HIR expression 595 /// to match invalid UTF-8. is_always_utf8(&self) -> bool596 pub fn is_always_utf8(&self) -> bool { 597 self.info.is_always_utf8() 598 } 599 600 /// Returns true if and only if this entire HIR expression is made up of 601 /// zero-width assertions. 602 /// 603 /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but 604 /// not `^a`. is_all_assertions(&self) -> bool605 pub fn is_all_assertions(&self) -> bool { 606 self.info.is_all_assertions() 607 } 608 609 /// Return true if and only if this HIR is required to match from the 610 /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, 611 /// `^foo|^bar` but not `^foo|bar`. is_anchored_start(&self) -> bool612 pub fn is_anchored_start(&self) -> bool { 613 self.info.is_anchored_start() 614 } 615 616 /// Return true if and only if this HIR is required to match at the end 617 /// of text. This includes expressions like `foo$`, `(foo|bar)$`, 618 /// `foo$|bar$` but not `foo$|bar`. is_anchored_end(&self) -> bool619 pub fn is_anchored_end(&self) -> bool { 620 self.info.is_anchored_end() 621 } 622 623 /// Return true if and only if this HIR is required to match from the 624 /// beginning of text or the beginning of a line. This includes expressions 625 /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar` 626 /// but not `^foo|bar` or `(?m)^foo|bar`. 627 /// 628 /// Note that if `is_anchored_start` is `true`, then 629 /// `is_line_anchored_start` will also be `true`. The reverse implication 630 /// is not true. For example, `(?m)^foo` is line anchored, but not 631 /// `is_anchored_start`. is_line_anchored_start(&self) -> bool632 pub fn is_line_anchored_start(&self) -> bool { 633 self.info.is_line_anchored_start() 634 } 635 636 /// Return true if and only if this HIR is required to match at the 637 /// end of text or the end of a line. This includes expressions like 638 /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`, 639 /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`. 640 /// 641 /// Note that if `is_anchored_end` is `true`, then 642 /// `is_line_anchored_end` will also be `true`. The reverse implication 643 /// is not true. For example, `(?m)foo$` is line anchored, but not 644 /// `is_anchored_end`. is_line_anchored_end(&self) -> bool645 pub fn is_line_anchored_end(&self) -> bool { 646 self.info.is_line_anchored_end() 647 } 648 649 /// Return true if and only if this HIR contains any sub-expression that 650 /// is required to match at the beginning of text. Specifically, this 651 /// returns true if the `^` symbol (when multiline mode is disabled) or the 652 /// `\A` escape appear anywhere in the regex. is_any_anchored_start(&self) -> bool653 pub fn is_any_anchored_start(&self) -> bool { 654 self.info.is_any_anchored_start() 655 } 656 657 /// Return true if and only if this HIR contains any sub-expression that is 658 /// required to match at the end of text. Specifically, this returns true 659 /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape 660 /// appear anywhere in the regex. is_any_anchored_end(&self) -> bool661 pub fn is_any_anchored_end(&self) -> bool { 662 self.info.is_any_anchored_end() 663 } 664 665 /// Return true if and only if the empty string is part of the language 666 /// matched by this regular expression. 667 /// 668 /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b` 669 /// and `\B`, but not `a` or `a+`. is_match_empty(&self) -> bool670 pub fn is_match_empty(&self) -> bool { 671 self.info.is_match_empty() 672 } 673 674 /// Return true if and only if this HIR is a simple literal. This is only 675 /// true when this HIR expression is either itself a `Literal` or a 676 /// concatenation of only `Literal`s. 677 /// 678 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, 679 /// `` are not (even though that contain sub-expressions that are literals). is_literal(&self) -> bool680 pub fn is_literal(&self) -> bool { 681 self.info.is_literal() 682 } 683 684 /// Return true if and only if this HIR is either a simple literal or an 685 /// alternation of simple literals. This is only 686 /// true when this HIR expression is either itself a `Literal` or a 687 /// concatenation of only `Literal`s or an alternation of only `Literal`s. 688 /// 689 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation 690 /// literals, but `f+`, `(foo)`, `foo()`, `` 691 /// are not (even though that contain sub-expressions that are literals). is_alternation_literal(&self) -> bool692 pub fn is_alternation_literal(&self) -> bool { 693 self.info.is_alternation_literal() 694 } 695 } 696 697 impl HirKind { 698 /// Return true if and only if this HIR is the empty regular expression. 699 /// 700 /// Note that this is not defined inductively. That is, it only tests if 701 /// this kind is the `Empty` variant. To get the inductive definition, 702 /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). is_empty(&self) -> bool703 pub fn is_empty(&self) -> bool { 704 match *self { 705 HirKind::Empty => true, 706 _ => false, 707 } 708 } 709 710 /// Returns true if and only if this kind has any (including possibly 711 /// empty) subexpressions. has_subexprs(&self) -> bool712 pub fn has_subexprs(&self) -> bool { 713 match *self { 714 HirKind::Empty 715 | HirKind::Literal(_) 716 | HirKind::Class(_) 717 | HirKind::Anchor(_) 718 | HirKind::WordBoundary(_) => false, 719 HirKind::Group(_) 720 | HirKind::Repetition(_) 721 | HirKind::Concat(_) 722 | HirKind::Alternation(_) => true, 723 } 724 } 725 } 726 727 /// Print a display representation of this Hir. 728 /// 729 /// The result of this is a valid regular expression pattern string. 730 /// 731 /// This implementation uses constant stack space and heap space proportional 732 /// to the size of the `Hir`. 733 impl fmt::Display for Hir { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result734 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 735 use crate::hir::print::Printer; 736 Printer::new().print(self, f) 737 } 738 } 739 740 /// The high-level intermediate representation of a literal. 741 /// 742 /// A literal corresponds to a single character, where a character is either 743 /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters 744 /// are preferred whenever possible. In particular, a `Byte` variant is only 745 /// ever produced when it could match invalid UTF-8. 746 #[derive(Clone, Debug, Eq, PartialEq)] 747 pub enum Literal { 748 /// A single character represented by a Unicode scalar value. 749 Unicode(char), 750 /// A single character represented by an arbitrary byte. 751 Byte(u8), 752 } 753 754 impl Literal { 755 /// Returns true if and only if this literal corresponds to a Unicode 756 /// scalar value. is_unicode(&self) -> bool757 pub fn is_unicode(&self) -> bool { 758 match *self { 759 Literal::Unicode(_) => true, 760 Literal::Byte(b) if b <= 0x7F => true, 761 Literal::Byte(_) => false, 762 } 763 } 764 } 765 766 /// The high-level intermediate representation of a character class. 767 /// 768 /// A character class corresponds to a set of characters. A character is either 769 /// defined by a Unicode scalar value or a byte. Unicode characters are used 770 /// by default, while bytes are used when Unicode mode (via the `u` flag) is 771 /// disabled. 772 /// 773 /// A character class, regardless of its character type, is represented by a 774 /// sequence of non-overlapping non-adjacent ranges of characters. 775 /// 776 /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may 777 /// be produced even when it exclusively matches valid UTF-8. This is because 778 /// a `Bytes` variant represents an intention by the author of the regular 779 /// expression to disable Unicode mode, which in turn impacts the semantics of 780 /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not 781 /// match the same set of strings. 782 #[derive(Clone, Debug, Eq, PartialEq)] 783 pub enum Class { 784 /// A set of characters represented by Unicode scalar values. 785 Unicode(ClassUnicode), 786 /// A set of characters represented by arbitrary bytes (one byte per 787 /// character). 788 Bytes(ClassBytes), 789 } 790 791 impl Class { 792 /// Apply Unicode simple case folding to this character class, in place. 793 /// The character class will be expanded to include all simple case folded 794 /// character variants. 795 /// 796 /// If this is a byte oriented character class, then this will be limited 797 /// to the ASCII ranges `A-Z` and `a-z`. case_fold_simple(&mut self)798 pub fn case_fold_simple(&mut self) { 799 match *self { 800 Class::Unicode(ref mut x) => x.case_fold_simple(), 801 Class::Bytes(ref mut x) => x.case_fold_simple(), 802 } 803 } 804 805 /// Negate this character class in place. 806 /// 807 /// After completion, this character class will contain precisely the 808 /// characters that weren't previously in the class. negate(&mut self)809 pub fn negate(&mut self) { 810 match *self { 811 Class::Unicode(ref mut x) => x.negate(), 812 Class::Bytes(ref mut x) => x.negate(), 813 } 814 } 815 816 /// Returns true if and only if this character class will only ever match 817 /// valid UTF-8. 818 /// 819 /// A character class can match invalid UTF-8 only when the following 820 /// conditions are met: 821 /// 822 /// 1. The translator was configured to permit generating an expression 823 /// that can match invalid UTF-8. (By default, this is disabled.) 824 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete 825 /// syntax or in the parser builder. By default, Unicode mode is 826 /// enabled. is_always_utf8(&self) -> bool827 pub fn is_always_utf8(&self) -> bool { 828 match *self { 829 Class::Unicode(_) => true, 830 Class::Bytes(ref x) => x.is_all_ascii(), 831 } 832 } 833 } 834 835 /// A set of characters represented by Unicode scalar values. 836 #[derive(Clone, Debug, Eq, PartialEq)] 837 pub struct ClassUnicode { 838 set: IntervalSet<ClassUnicodeRange>, 839 } 840 841 impl ClassUnicode { 842 /// Create a new class from a sequence of ranges. 843 /// 844 /// The given ranges do not need to be in any specific order, and ranges 845 /// may overlap. new<I>(ranges: I) -> ClassUnicode where I: IntoIterator<Item = ClassUnicodeRange>,846 pub fn new<I>(ranges: I) -> ClassUnicode 847 where 848 I: IntoIterator<Item = ClassUnicodeRange>, 849 { 850 ClassUnicode { set: IntervalSet::new(ranges) } 851 } 852 853 /// Create a new class with no ranges. empty() -> ClassUnicode854 pub fn empty() -> ClassUnicode { 855 ClassUnicode::new(vec![]) 856 } 857 858 /// Add a new range to this set. push(&mut self, range: ClassUnicodeRange)859 pub fn push(&mut self, range: ClassUnicodeRange) { 860 self.set.push(range); 861 } 862 863 /// Return an iterator over all ranges in this class. 864 /// 865 /// The iterator yields ranges in ascending order. iter(&self) -> ClassUnicodeIter<'_>866 pub fn iter(&self) -> ClassUnicodeIter<'_> { 867 ClassUnicodeIter(self.set.iter()) 868 } 869 870 /// Return the underlying ranges as a slice. ranges(&self) -> &[ClassUnicodeRange]871 pub fn ranges(&self) -> &[ClassUnicodeRange] { 872 self.set.intervals() 873 } 874 875 /// Expand this character class such that it contains all case folded 876 /// characters, according to Unicode's "simple" mapping. For example, if 877 /// this class consists of the range `a-z`, then applying case folding will 878 /// result in the class containing both the ranges `a-z` and `A-Z`. 879 /// 880 /// # Panics 881 /// 882 /// This routine panics when the case mapping data necessary for this 883 /// routine to complete is unavailable. This occurs when the `unicode-case` 884 /// feature is not enabled. 885 /// 886 /// Callers should prefer using `try_case_fold_simple` instead, which will 887 /// return an error instead of panicking. case_fold_simple(&mut self)888 pub fn case_fold_simple(&mut self) { 889 self.set 890 .case_fold_simple() 891 .expect("unicode-case feature must be enabled"); 892 } 893 894 /// Expand this character class such that it contains all case folded 895 /// characters, according to Unicode's "simple" mapping. For example, if 896 /// this class consists of the range `a-z`, then applying case folding will 897 /// result in the class containing both the ranges `a-z` and `A-Z`. 898 /// 899 /// # Error 900 /// 901 /// This routine returns an error when the case mapping data necessary 902 /// for this routine to complete is unavailable. This occurs when the 903 /// `unicode-case` feature is not enabled. try_case_fold_simple( &mut self, ) -> result::Result<(), CaseFoldError>904 pub fn try_case_fold_simple( 905 &mut self, 906 ) -> result::Result<(), CaseFoldError> { 907 self.set.case_fold_simple() 908 } 909 910 /// Negate this character class. 911 /// 912 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this 913 /// set, then it will not be in this set after negation. negate(&mut self)914 pub fn negate(&mut self) { 915 self.set.negate(); 916 } 917 918 /// Union this character class with the given character class, in place. union(&mut self, other: &ClassUnicode)919 pub fn union(&mut self, other: &ClassUnicode) { 920 self.set.union(&other.set); 921 } 922 923 /// Intersect this character class with the given character class, in 924 /// place. intersect(&mut self, other: &ClassUnicode)925 pub fn intersect(&mut self, other: &ClassUnicode) { 926 self.set.intersect(&other.set); 927 } 928 929 /// Subtract the given character class from this character class, in place. difference(&mut self, other: &ClassUnicode)930 pub fn difference(&mut self, other: &ClassUnicode) { 931 self.set.difference(&other.set); 932 } 933 934 /// Compute the symmetric difference of the given character classes, in 935 /// place. 936 /// 937 /// This computes the symmetric difference of two character classes. This 938 /// removes all elements in this class that are also in the given class, 939 /// but all adds all elements from the given class that aren't in this 940 /// class. That is, the class will contain all elements in either class, 941 /// but will not contain any elements that are in both classes. symmetric_difference(&mut self, other: &ClassUnicode)942 pub fn symmetric_difference(&mut self, other: &ClassUnicode) { 943 self.set.symmetric_difference(&other.set); 944 } 945 946 /// Returns true if and only if this character class will either match 947 /// nothing or only ASCII bytes. Stated differently, this returns false 948 /// if and only if this class contains a non-ASCII codepoint. is_all_ascii(&self) -> bool949 pub fn is_all_ascii(&self) -> bool { 950 self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') 951 } 952 } 953 954 /// An iterator over all ranges in a Unicode character class. 955 /// 956 /// The lifetime `'a` refers to the lifetime of the underlying class. 957 #[derive(Debug)] 958 pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); 959 960 impl<'a> Iterator for ClassUnicodeIter<'a> { 961 type Item = &'a ClassUnicodeRange; 962 next(&mut self) -> Option<&'a ClassUnicodeRange>963 fn next(&mut self) -> Option<&'a ClassUnicodeRange> { 964 self.0.next() 965 } 966 } 967 968 /// A single range of characters represented by Unicode scalar values. 969 /// 970 /// The range is closed. That is, the start and end of the range are included 971 /// in the range. 972 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] 973 pub struct ClassUnicodeRange { 974 start: char, 975 end: char, 976 } 977 978 impl fmt::Debug for ClassUnicodeRange { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result979 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 980 let start = if !self.start.is_whitespace() && !self.start.is_control() 981 { 982 self.start.to_string() 983 } else { 984 format!("0x{:X}", self.start as u32) 985 }; 986 let end = if !self.end.is_whitespace() && !self.end.is_control() { 987 self.end.to_string() 988 } else { 989 format!("0x{:X}", self.end as u32) 990 }; 991 f.debug_struct("ClassUnicodeRange") 992 .field("start", &start) 993 .field("end", &end) 994 .finish() 995 } 996 } 997 998 impl Interval for ClassUnicodeRange { 999 type Bound = char; 1000 1001 #[inline] lower(&self) -> char1002 fn lower(&self) -> char { 1003 self.start 1004 } 1005 #[inline] upper(&self) -> char1006 fn upper(&self) -> char { 1007 self.end 1008 } 1009 #[inline] set_lower(&mut self, bound: char)1010 fn set_lower(&mut self, bound: char) { 1011 self.start = bound; 1012 } 1013 #[inline] set_upper(&mut self, bound: char)1014 fn set_upper(&mut self, bound: char) { 1015 self.end = bound; 1016 } 1017 1018 /// Apply simple case folding to this Unicode scalar value range. 1019 /// 1020 /// Additional ranges are appended to the given vector. Canonical ordering 1021 /// is *not* maintained in the given vector. case_fold_simple( &self, ranges: &mut Vec<ClassUnicodeRange>, ) -> Result<(), unicode::CaseFoldError>1022 fn case_fold_simple( 1023 &self, 1024 ranges: &mut Vec<ClassUnicodeRange>, 1025 ) -> Result<(), unicode::CaseFoldError> { 1026 if !unicode::contains_simple_case_mapping(self.start, self.end)? { 1027 return Ok(()); 1028 } 1029 let start = self.start as u32; 1030 let end = (self.end as u32).saturating_add(1); 1031 let mut next_simple_cp = None; 1032 for cp in (start..end).filter_map(char::from_u32) { 1033 if next_simple_cp.map_or(false, |next| cp < next) { 1034 continue; 1035 } 1036 let it = match unicode::simple_fold(cp)? { 1037 Ok(it) => it, 1038 Err(next) => { 1039 next_simple_cp = next; 1040 continue; 1041 } 1042 }; 1043 for cp_folded in it { 1044 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); 1045 } 1046 } 1047 Ok(()) 1048 } 1049 } 1050 1051 impl ClassUnicodeRange { 1052 /// Create a new Unicode scalar value range for a character class. 1053 /// 1054 /// The returned range is always in a canonical form. That is, the range 1055 /// returned always satisfies the invariant that `start <= end`. new(start: char, end: char) -> ClassUnicodeRange1056 pub fn new(start: char, end: char) -> ClassUnicodeRange { 1057 ClassUnicodeRange::create(start, end) 1058 } 1059 1060 /// Return the start of this range. 1061 /// 1062 /// The start of a range is always less than or equal to the end of the 1063 /// range. start(&self) -> char1064 pub fn start(&self) -> char { 1065 self.start 1066 } 1067 1068 /// Return the end of this range. 1069 /// 1070 /// The end of a range is always greater than or equal to the start of the 1071 /// range. end(&self) -> char1072 pub fn end(&self) -> char { 1073 self.end 1074 } 1075 } 1076 1077 /// A set of characters represented by arbitrary bytes (where one byte 1078 /// corresponds to one character). 1079 #[derive(Clone, Debug, Eq, PartialEq)] 1080 pub struct ClassBytes { 1081 set: IntervalSet<ClassBytesRange>, 1082 } 1083 1084 impl ClassBytes { 1085 /// Create a new class from a sequence of ranges. 1086 /// 1087 /// The given ranges do not need to be in any specific order, and ranges 1088 /// may overlap. new<I>(ranges: I) -> ClassBytes where I: IntoIterator<Item = ClassBytesRange>,1089 pub fn new<I>(ranges: I) -> ClassBytes 1090 where 1091 I: IntoIterator<Item = ClassBytesRange>, 1092 { 1093 ClassBytes { set: IntervalSet::new(ranges) } 1094 } 1095 1096 /// Create a new class with no ranges. empty() -> ClassBytes1097 pub fn empty() -> ClassBytes { 1098 ClassBytes::new(vec![]) 1099 } 1100 1101 /// Add a new range to this set. push(&mut self, range: ClassBytesRange)1102 pub fn push(&mut self, range: ClassBytesRange) { 1103 self.set.push(range); 1104 } 1105 1106 /// Return an iterator over all ranges in this class. 1107 /// 1108 /// The iterator yields ranges in ascending order. iter(&self) -> ClassBytesIter<'_>1109 pub fn iter(&self) -> ClassBytesIter<'_> { 1110 ClassBytesIter(self.set.iter()) 1111 } 1112 1113 /// Return the underlying ranges as a slice. ranges(&self) -> &[ClassBytesRange]1114 pub fn ranges(&self) -> &[ClassBytesRange] { 1115 self.set.intervals() 1116 } 1117 1118 /// Expand this character class such that it contains all case folded 1119 /// characters. For example, if this class consists of the range `a-z`, 1120 /// then applying case folding will result in the class containing both the 1121 /// ranges `a-z` and `A-Z`. 1122 /// 1123 /// Note that this only applies ASCII case folding, which is limited to the 1124 /// characters `a-z` and `A-Z`. case_fold_simple(&mut self)1125 pub fn case_fold_simple(&mut self) { 1126 self.set.case_fold_simple().expect("ASCII case folding never fails"); 1127 } 1128 1129 /// Negate this byte class. 1130 /// 1131 /// For all `b` where `b` is a any byte, if `b` was in this set, then it 1132 /// will not be in this set after negation. negate(&mut self)1133 pub fn negate(&mut self) { 1134 self.set.negate(); 1135 } 1136 1137 /// Union this byte class with the given byte class, in place. union(&mut self, other: &ClassBytes)1138 pub fn union(&mut self, other: &ClassBytes) { 1139 self.set.union(&other.set); 1140 } 1141 1142 /// Intersect this byte class with the given byte class, in place. intersect(&mut self, other: &ClassBytes)1143 pub fn intersect(&mut self, other: &ClassBytes) { 1144 self.set.intersect(&other.set); 1145 } 1146 1147 /// Subtract the given byte class from this byte class, in place. difference(&mut self, other: &ClassBytes)1148 pub fn difference(&mut self, other: &ClassBytes) { 1149 self.set.difference(&other.set); 1150 } 1151 1152 /// Compute the symmetric difference of the given byte classes, in place. 1153 /// 1154 /// This computes the symmetric difference of two byte classes. This 1155 /// removes all elements in this class that are also in the given class, 1156 /// but all adds all elements from the given class that aren't in this 1157 /// class. That is, the class will contain all elements in either class, 1158 /// but will not contain any elements that are in both classes. symmetric_difference(&mut self, other: &ClassBytes)1159 pub fn symmetric_difference(&mut self, other: &ClassBytes) { 1160 self.set.symmetric_difference(&other.set); 1161 } 1162 1163 /// Returns true if and only if this character class will either match 1164 /// nothing or only ASCII bytes. Stated differently, this returns false 1165 /// if and only if this class contains a non-ASCII byte. is_all_ascii(&self) -> bool1166 pub fn is_all_ascii(&self) -> bool { 1167 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) 1168 } 1169 } 1170 1171 /// An iterator over all ranges in a byte character class. 1172 /// 1173 /// The lifetime `'a` refers to the lifetime of the underlying class. 1174 #[derive(Debug)] 1175 pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); 1176 1177 impl<'a> Iterator for ClassBytesIter<'a> { 1178 type Item = &'a ClassBytesRange; 1179 next(&mut self) -> Option<&'a ClassBytesRange>1180 fn next(&mut self) -> Option<&'a ClassBytesRange> { 1181 self.0.next() 1182 } 1183 } 1184 1185 /// A single range of characters represented by arbitrary bytes. 1186 /// 1187 /// The range is closed. That is, the start and end of the range are included 1188 /// in the range. 1189 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] 1190 pub struct ClassBytesRange { 1191 start: u8, 1192 end: u8, 1193 } 1194 1195 impl Interval for ClassBytesRange { 1196 type Bound = u8; 1197 1198 #[inline] lower(&self) -> u81199 fn lower(&self) -> u8 { 1200 self.start 1201 } 1202 #[inline] upper(&self) -> u81203 fn upper(&self) -> u8 { 1204 self.end 1205 } 1206 #[inline] set_lower(&mut self, bound: u8)1207 fn set_lower(&mut self, bound: u8) { 1208 self.start = bound; 1209 } 1210 #[inline] set_upper(&mut self, bound: u8)1211 fn set_upper(&mut self, bound: u8) { 1212 self.end = bound; 1213 } 1214 1215 /// Apply simple case folding to this byte range. Only ASCII case mappings 1216 /// (for a-z) are applied. 1217 /// 1218 /// Additional ranges are appended to the given vector. Canonical ordering 1219 /// is *not* maintained in the given vector. case_fold_simple( &self, ranges: &mut Vec<ClassBytesRange>, ) -> Result<(), unicode::CaseFoldError>1220 fn case_fold_simple( 1221 &self, 1222 ranges: &mut Vec<ClassBytesRange>, 1223 ) -> Result<(), unicode::CaseFoldError> { 1224 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { 1225 let lower = cmp::max(self.start, b'a'); 1226 let upper = cmp::min(self.end, b'z'); 1227 ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); 1228 } 1229 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { 1230 let lower = cmp::max(self.start, b'A'); 1231 let upper = cmp::min(self.end, b'Z'); 1232 ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); 1233 } 1234 Ok(()) 1235 } 1236 } 1237 1238 impl ClassBytesRange { 1239 /// Create a new byte range for a character class. 1240 /// 1241 /// The returned range is always in a canonical form. That is, the range 1242 /// returned always satisfies the invariant that `start <= end`. new(start: u8, end: u8) -> ClassBytesRange1243 pub fn new(start: u8, end: u8) -> ClassBytesRange { 1244 ClassBytesRange::create(start, end) 1245 } 1246 1247 /// Return the start of this range. 1248 /// 1249 /// The start of a range is always less than or equal to the end of the 1250 /// range. start(&self) -> u81251 pub fn start(&self) -> u8 { 1252 self.start 1253 } 1254 1255 /// Return the end of this range. 1256 /// 1257 /// The end of a range is always greater than or equal to the start of the 1258 /// range. end(&self) -> u81259 pub fn end(&self) -> u8 { 1260 self.end 1261 } 1262 } 1263 1264 impl fmt::Debug for ClassBytesRange { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1265 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1266 let mut debug = f.debug_struct("ClassBytesRange"); 1267 if self.start <= 0x7F { 1268 debug.field("start", &(self.start as char)); 1269 } else { 1270 debug.field("start", &self.start); 1271 } 1272 if self.end <= 0x7F { 1273 debug.field("end", &(self.end as char)); 1274 } else { 1275 debug.field("end", &self.end); 1276 } 1277 debug.finish() 1278 } 1279 } 1280 1281 /// The high-level intermediate representation for an anchor assertion. 1282 /// 1283 /// A matching anchor assertion is always zero-length. 1284 #[derive(Clone, Debug, Eq, PartialEq)] 1285 pub enum Anchor { 1286 /// Match the beginning of a line or the beginning of text. Specifically, 1287 /// this matches at the starting position of the input, or at the position 1288 /// immediately following a `\n` character. 1289 StartLine, 1290 /// Match the end of a line or the end of text. Specifically, 1291 /// this matches at the end position of the input, or at the position 1292 /// immediately preceding a `\n` character. 1293 EndLine, 1294 /// Match the beginning of text. Specifically, this matches at the starting 1295 /// position of the input. 1296 StartText, 1297 /// Match the end of text. Specifically, this matches at the ending 1298 /// position of the input. 1299 EndText, 1300 } 1301 1302 /// The high-level intermediate representation for a word-boundary assertion. 1303 /// 1304 /// A matching word boundary assertion is always zero-length. 1305 #[derive(Clone, Debug, Eq, PartialEq)] 1306 pub enum WordBoundary { 1307 /// Match a Unicode-aware word boundary. That is, this matches a position 1308 /// where the left adjacent character and right adjacent character 1309 /// correspond to a word and non-word or a non-word and word character. 1310 Unicode, 1311 /// Match a Unicode-aware negation of a word boundary. 1312 UnicodeNegate, 1313 /// Match an ASCII-only word boundary. That is, this matches a position 1314 /// where the left adjacent character and right adjacent character 1315 /// correspond to a word and non-word or a non-word and word character. 1316 Ascii, 1317 /// Match an ASCII-only negation of a word boundary. 1318 AsciiNegate, 1319 } 1320 1321 impl WordBoundary { 1322 /// Returns true if and only if this word boundary assertion is negated. is_negated(&self) -> bool1323 pub fn is_negated(&self) -> bool { 1324 match *self { 1325 WordBoundary::Unicode | WordBoundary::Ascii => false, 1326 WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, 1327 } 1328 } 1329 } 1330 1331 /// The high-level intermediate representation for a group. 1332 /// 1333 /// This represents one of three possible group types: 1334 /// 1335 /// 1. A non-capturing group (e.g., `(?:expr)`). 1336 /// 2. A capturing group (e.g., `(expr)`). 1337 /// 3. A named capturing group (e.g., `(?P<name>expr)`). 1338 #[derive(Clone, Debug, Eq, PartialEq)] 1339 pub struct Group { 1340 /// The kind of this group. If it is a capturing group, then the kind 1341 /// contains the capture group index (and the name, if it is a named 1342 /// group). 1343 pub kind: GroupKind, 1344 /// The expression inside the capturing group, which may be empty. 1345 pub hir: Box<Hir>, 1346 } 1347 1348 /// The kind of group. 1349 #[derive(Clone, Debug, Eq, PartialEq)] 1350 pub enum GroupKind { 1351 /// A normal unnamed capturing group. 1352 /// 1353 /// The value is the capture index of the group. 1354 CaptureIndex(u32), 1355 /// A named capturing group. 1356 CaptureName { 1357 /// The name of the group. 1358 name: String, 1359 /// The capture index of the group. 1360 index: u32, 1361 }, 1362 /// A non-capturing group. 1363 NonCapturing, 1364 } 1365 1366 /// The high-level intermediate representation of a repetition operator. 1367 /// 1368 /// A repetition operator permits the repetition of an arbitrary 1369 /// sub-expression. 1370 #[derive(Clone, Debug, Eq, PartialEq)] 1371 pub struct Repetition { 1372 /// The kind of this repetition operator. 1373 pub kind: RepetitionKind, 1374 /// Whether this repetition operator is greedy or not. A greedy operator 1375 /// will match as much as it can. A non-greedy operator will match as 1376 /// little as it can. 1377 /// 1378 /// Typically, operators are greedy by default and are only non-greedy when 1379 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is 1380 /// not. However, this can be inverted via the `U` "ungreedy" flag. 1381 pub greedy: bool, 1382 /// The expression being repeated. 1383 pub hir: Box<Hir>, 1384 } 1385 1386 impl Repetition { 1387 /// Returns true if and only if this repetition operator makes it possible 1388 /// to match the empty string. 1389 /// 1390 /// Note that this is not defined inductively. For example, while `a*` 1391 /// will report `true`, `()+` will not, even though `()` matches the empty 1392 /// string and one or more occurrences of something that matches the empty 1393 /// string will always match the empty string. In order to get the 1394 /// inductive definition, see the corresponding method on 1395 /// [`Hir`](struct.Hir.html). is_match_empty(&self) -> bool1396 pub fn is_match_empty(&self) -> bool { 1397 match self.kind { 1398 RepetitionKind::ZeroOrOne => true, 1399 RepetitionKind::ZeroOrMore => true, 1400 RepetitionKind::OneOrMore => false, 1401 RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, 1402 RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, 1403 RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, 1404 } 1405 } 1406 } 1407 1408 /// The kind of a repetition operator. 1409 #[derive(Clone, Debug, Eq, PartialEq)] 1410 pub enum RepetitionKind { 1411 /// Matches a sub-expression zero or one times. 1412 ZeroOrOne, 1413 /// Matches a sub-expression zero or more times. 1414 ZeroOrMore, 1415 /// Matches a sub-expression one or more times. 1416 OneOrMore, 1417 /// Matches a sub-expression within a bounded range of times. 1418 Range(RepetitionRange), 1419 } 1420 1421 /// The kind of a counted repetition operator. 1422 #[derive(Clone, Debug, Eq, PartialEq)] 1423 pub enum RepetitionRange { 1424 /// Matches a sub-expression exactly this many times. 1425 Exactly(u32), 1426 /// Matches a sub-expression at least this many times. 1427 AtLeast(u32), 1428 /// Matches a sub-expression at least `m` times and at most `n` times. 1429 Bounded(u32, u32), 1430 } 1431 1432 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack 1433 /// space but heap space proportional to the depth of the total `Hir`. 1434 impl Drop for Hir { drop(&mut self)1435 fn drop(&mut self) { 1436 use std::mem; 1437 1438 match *self.kind() { 1439 HirKind::Empty 1440 | HirKind::Literal(_) 1441 | HirKind::Class(_) 1442 | HirKind::Anchor(_) 1443 | HirKind::WordBoundary(_) => return, 1444 HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, 1445 HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, 1446 HirKind::Concat(ref x) if x.is_empty() => return, 1447 HirKind::Alternation(ref x) if x.is_empty() => return, 1448 _ => {} 1449 } 1450 1451 let mut stack = vec![mem::replace(self, Hir::empty())]; 1452 while let Some(mut expr) = stack.pop() { 1453 match expr.kind { 1454 HirKind::Empty 1455 | HirKind::Literal(_) 1456 | HirKind::Class(_) 1457 | HirKind::Anchor(_) 1458 | HirKind::WordBoundary(_) => {} 1459 HirKind::Group(ref mut x) => { 1460 stack.push(mem::replace(&mut x.hir, Hir::empty())); 1461 } 1462 HirKind::Repetition(ref mut x) => { 1463 stack.push(mem::replace(&mut x.hir, Hir::empty())); 1464 } 1465 HirKind::Concat(ref mut x) => { 1466 stack.extend(x.drain(..)); 1467 } 1468 HirKind::Alternation(ref mut x) => { 1469 stack.extend(x.drain(..)); 1470 } 1471 } 1472 } 1473 } 1474 } 1475 1476 /// A type that documents various attributes of an HIR expression. 1477 /// 1478 /// These attributes are typically defined inductively on the HIR. 1479 #[derive(Clone, Debug, Eq, PartialEq)] 1480 struct HirInfo { 1481 /// Represent yes/no questions by a bitfield to conserve space, since 1482 /// this is included in every HIR expression. 1483 /// 1484 /// If more attributes need to be added, it is OK to increase the size of 1485 /// this as appropriate. 1486 bools: u16, 1487 } 1488 1489 // A simple macro for defining bitfield accessors/mutators. 1490 macro_rules! define_bool { 1491 ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { 1492 fn $is_fn_name(&self) -> bool { 1493 self.bools & (0b1 << $bit) > 0 1494 } 1495 1496 fn $set_fn_name(&mut self, yes: bool) { 1497 if yes { 1498 self.bools |= 1 << $bit; 1499 } else { 1500 self.bools &= !(1 << $bit); 1501 } 1502 } 1503 }; 1504 } 1505 1506 impl HirInfo { new() -> HirInfo1507 fn new() -> HirInfo { 1508 HirInfo { bools: 0 } 1509 } 1510 1511 define_bool!(0, is_always_utf8, set_always_utf8); 1512 define_bool!(1, is_all_assertions, set_all_assertions); 1513 define_bool!(2, is_anchored_start, set_anchored_start); 1514 define_bool!(3, is_anchored_end, set_anchored_end); 1515 define_bool!(4, is_line_anchored_start, set_line_anchored_start); 1516 define_bool!(5, is_line_anchored_end, set_line_anchored_end); 1517 define_bool!(6, is_any_anchored_start, set_any_anchored_start); 1518 define_bool!(7, is_any_anchored_end, set_any_anchored_end); 1519 define_bool!(8, is_match_empty, set_match_empty); 1520 define_bool!(9, is_literal, set_literal); 1521 define_bool!(10, is_alternation_literal, set_alternation_literal); 1522 } 1523 1524 #[cfg(test)] 1525 mod tests { 1526 use super::*; 1527 uclass(ranges: &[(char, char)]) -> ClassUnicode1528 fn uclass(ranges: &[(char, char)]) -> ClassUnicode { 1529 let ranges: Vec<ClassUnicodeRange> = ranges 1530 .iter() 1531 .map(|&(s, e)| ClassUnicodeRange::new(s, e)) 1532 .collect(); 1533 ClassUnicode::new(ranges) 1534 } 1535 bclass(ranges: &[(u8, u8)]) -> ClassBytes1536 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { 1537 let ranges: Vec<ClassBytesRange> = 1538 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect(); 1539 ClassBytes::new(ranges) 1540 } 1541 uranges(cls: &ClassUnicode) -> Vec<(char, char)>1542 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { 1543 cls.iter().map(|x| (x.start(), x.end())).collect() 1544 } 1545 1546 #[cfg(feature = "unicode-case")] ucasefold(cls: &ClassUnicode) -> ClassUnicode1547 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { 1548 let mut cls_ = cls.clone(); 1549 cls_.case_fold_simple(); 1550 cls_ 1551 } 1552 uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1553 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { 1554 let mut cls_ = cls1.clone(); 1555 cls_.union(cls2); 1556 cls_ 1557 } 1558 uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1559 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { 1560 let mut cls_ = cls1.clone(); 1561 cls_.intersect(cls2); 1562 cls_ 1563 } 1564 udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1565 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { 1566 let mut cls_ = cls1.clone(); 1567 cls_.difference(cls2); 1568 cls_ 1569 } 1570 usymdifference( cls1: &ClassUnicode, cls2: &ClassUnicode, ) -> ClassUnicode1571 fn usymdifference( 1572 cls1: &ClassUnicode, 1573 cls2: &ClassUnicode, 1574 ) -> ClassUnicode { 1575 let mut cls_ = cls1.clone(); 1576 cls_.symmetric_difference(cls2); 1577 cls_ 1578 } 1579 unegate(cls: &ClassUnicode) -> ClassUnicode1580 fn unegate(cls: &ClassUnicode) -> ClassUnicode { 1581 let mut cls_ = cls.clone(); 1582 cls_.negate(); 1583 cls_ 1584 } 1585 branges(cls: &ClassBytes) -> Vec<(u8, u8)>1586 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { 1587 cls.iter().map(|x| (x.start(), x.end())).collect() 1588 } 1589 bcasefold(cls: &ClassBytes) -> ClassBytes1590 fn bcasefold(cls: &ClassBytes) -> ClassBytes { 1591 let mut cls_ = cls.clone(); 1592 cls_.case_fold_simple(); 1593 cls_ 1594 } 1595 bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1596 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { 1597 let mut cls_ = cls1.clone(); 1598 cls_.union(cls2); 1599 cls_ 1600 } 1601 bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1602 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { 1603 let mut cls_ = cls1.clone(); 1604 cls_.intersect(cls2); 1605 cls_ 1606 } 1607 bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1608 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { 1609 let mut cls_ = cls1.clone(); 1610 cls_.difference(cls2); 1611 cls_ 1612 } 1613 bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1614 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { 1615 let mut cls_ = cls1.clone(); 1616 cls_.symmetric_difference(cls2); 1617 cls_ 1618 } 1619 bnegate(cls: &ClassBytes) -> ClassBytes1620 fn bnegate(cls: &ClassBytes) -> ClassBytes { 1621 let mut cls_ = cls.clone(); 1622 cls_.negate(); 1623 cls_ 1624 } 1625 1626 #[test] class_range_canonical_unicode()1627 fn class_range_canonical_unicode() { 1628 let range = ClassUnicodeRange::new('\u{00FF}', '\0'); 1629 assert_eq!('\0', range.start()); 1630 assert_eq!('\u{00FF}', range.end()); 1631 } 1632 1633 #[test] class_range_canonical_bytes()1634 fn class_range_canonical_bytes() { 1635 let range = ClassBytesRange::new(b'\xFF', b'\0'); 1636 assert_eq!(b'\0', range.start()); 1637 assert_eq!(b'\xFF', range.end()); 1638 } 1639 1640 #[test] class_canonicalize_unicode()1641 fn class_canonicalize_unicode() { 1642 let cls = uclass(&[('a', 'c'), ('x', 'z')]); 1643 let expected = vec![('a', 'c'), ('x', 'z')]; 1644 assert_eq!(expected, uranges(&cls)); 1645 1646 let cls = uclass(&[('x', 'z'), ('a', 'c')]); 1647 let expected = vec![('a', 'c'), ('x', 'z')]; 1648 assert_eq!(expected, uranges(&cls)); 1649 1650 let cls = uclass(&[('x', 'z'), ('w', 'y')]); 1651 let expected = vec![('w', 'z')]; 1652 assert_eq!(expected, uranges(&cls)); 1653 1654 let cls = uclass(&[ 1655 ('c', 'f'), 1656 ('a', 'g'), 1657 ('d', 'j'), 1658 ('a', 'c'), 1659 ('m', 'p'), 1660 ('l', 's'), 1661 ]); 1662 let expected = vec![('a', 'j'), ('l', 's')]; 1663 assert_eq!(expected, uranges(&cls)); 1664 1665 let cls = uclass(&[('x', 'z'), ('u', 'w')]); 1666 let expected = vec![('u', 'z')]; 1667 assert_eq!(expected, uranges(&cls)); 1668 1669 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); 1670 let expected = vec![('\x00', '\u{10FFFF}')]; 1671 assert_eq!(expected, uranges(&cls)); 1672 1673 let cls = uclass(&[('a', 'a'), ('b', 'b')]); 1674 let expected = vec![('a', 'b')]; 1675 assert_eq!(expected, uranges(&cls)); 1676 } 1677 1678 #[test] class_canonicalize_bytes()1679 fn class_canonicalize_bytes() { 1680 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); 1681 let expected = vec![(b'a', b'c'), (b'x', b'z')]; 1682 assert_eq!(expected, branges(&cls)); 1683 1684 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); 1685 let expected = vec![(b'a', b'c'), (b'x', b'z')]; 1686 assert_eq!(expected, branges(&cls)); 1687 1688 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); 1689 let expected = vec![(b'w', b'z')]; 1690 assert_eq!(expected, branges(&cls)); 1691 1692 let cls = bclass(&[ 1693 (b'c', b'f'), 1694 (b'a', b'g'), 1695 (b'd', b'j'), 1696 (b'a', b'c'), 1697 (b'm', b'p'), 1698 (b'l', b's'), 1699 ]); 1700 let expected = vec![(b'a', b'j'), (b'l', b's')]; 1701 assert_eq!(expected, branges(&cls)); 1702 1703 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); 1704 let expected = vec![(b'u', b'z')]; 1705 assert_eq!(expected, branges(&cls)); 1706 1707 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); 1708 let expected = vec![(b'\x00', b'\xFF')]; 1709 assert_eq!(expected, branges(&cls)); 1710 1711 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); 1712 let expected = vec![(b'a', b'b')]; 1713 assert_eq!(expected, branges(&cls)); 1714 } 1715 1716 #[test] 1717 #[cfg(feature = "unicode-case")] class_case_fold_unicode()1718 fn class_case_fold_unicode() { 1719 let cls = uclass(&[ 1720 ('C', 'F'), 1721 ('A', 'G'), 1722 ('D', 'J'), 1723 ('A', 'C'), 1724 ('M', 'P'), 1725 ('L', 'S'), 1726 ('c', 'f'), 1727 ]); 1728 let expected = uclass(&[ 1729 ('A', 'J'), 1730 ('L', 'S'), 1731 ('a', 'j'), 1732 ('l', 's'), 1733 ('\u{17F}', '\u{17F}'), 1734 ]); 1735 assert_eq!(expected, ucasefold(&cls)); 1736 1737 let cls = uclass(&[('A', 'Z')]); 1738 let expected = uclass(&[ 1739 ('A', 'Z'), 1740 ('a', 'z'), 1741 ('\u{17F}', '\u{17F}'), 1742 ('\u{212A}', '\u{212A}'), 1743 ]); 1744 assert_eq!(expected, ucasefold(&cls)); 1745 1746 let cls = uclass(&[('a', 'z')]); 1747 let expected = uclass(&[ 1748 ('A', 'Z'), 1749 ('a', 'z'), 1750 ('\u{17F}', '\u{17F}'), 1751 ('\u{212A}', '\u{212A}'), 1752 ]); 1753 assert_eq!(expected, ucasefold(&cls)); 1754 1755 let cls = uclass(&[('A', 'A'), ('_', '_')]); 1756 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); 1757 assert_eq!(expected, ucasefold(&cls)); 1758 1759 let cls = uclass(&[('A', 'A'), ('=', '=')]); 1760 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); 1761 assert_eq!(expected, ucasefold(&cls)); 1762 1763 let cls = uclass(&[('\x00', '\x10')]); 1764 assert_eq!(cls, ucasefold(&cls)); 1765 1766 let cls = uclass(&[('k', 'k')]); 1767 let expected = 1768 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]); 1769 assert_eq!(expected, ucasefold(&cls)); 1770 1771 let cls = uclass(&[('@', '@')]); 1772 assert_eq!(cls, ucasefold(&cls)); 1773 } 1774 1775 #[test] 1776 #[cfg(not(feature = "unicode-case"))] class_case_fold_unicode_disabled()1777 fn class_case_fold_unicode_disabled() { 1778 let mut cls = uclass(&[ 1779 ('C', 'F'), 1780 ('A', 'G'), 1781 ('D', 'J'), 1782 ('A', 'C'), 1783 ('M', 'P'), 1784 ('L', 'S'), 1785 ('c', 'f'), 1786 ]); 1787 assert!(cls.try_case_fold_simple().is_err()); 1788 } 1789 1790 #[test] 1791 #[should_panic] 1792 #[cfg(not(feature = "unicode-case"))] class_case_fold_unicode_disabled_panics()1793 fn class_case_fold_unicode_disabled_panics() { 1794 let mut cls = uclass(&[ 1795 ('C', 'F'), 1796 ('A', 'G'), 1797 ('D', 'J'), 1798 ('A', 'C'), 1799 ('M', 'P'), 1800 ('L', 'S'), 1801 ('c', 'f'), 1802 ]); 1803 cls.case_fold_simple(); 1804 } 1805 1806 #[test] class_case_fold_bytes()1807 fn class_case_fold_bytes() { 1808 let cls = bclass(&[ 1809 (b'C', b'F'), 1810 (b'A', b'G'), 1811 (b'D', b'J'), 1812 (b'A', b'C'), 1813 (b'M', b'P'), 1814 (b'L', b'S'), 1815 (b'c', b'f'), 1816 ]); 1817 let expected = 1818 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]); 1819 assert_eq!(expected, bcasefold(&cls)); 1820 1821 let cls = bclass(&[(b'A', b'Z')]); 1822 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); 1823 assert_eq!(expected, bcasefold(&cls)); 1824 1825 let cls = bclass(&[(b'a', b'z')]); 1826 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); 1827 assert_eq!(expected, bcasefold(&cls)); 1828 1829 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); 1830 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); 1831 assert_eq!(expected, bcasefold(&cls)); 1832 1833 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); 1834 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); 1835 assert_eq!(expected, bcasefold(&cls)); 1836 1837 let cls = bclass(&[(b'\x00', b'\x10')]); 1838 assert_eq!(cls, bcasefold(&cls)); 1839 1840 let cls = bclass(&[(b'k', b'k')]); 1841 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); 1842 assert_eq!(expected, bcasefold(&cls)); 1843 1844 let cls = bclass(&[(b'@', b'@')]); 1845 assert_eq!(cls, bcasefold(&cls)); 1846 } 1847 1848 #[test] class_negate_unicode()1849 fn class_negate_unicode() { 1850 let cls = uclass(&[('a', 'a')]); 1851 let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); 1852 assert_eq!(expected, unegate(&cls)); 1853 1854 let cls = uclass(&[('a', 'a'), ('b', 'b')]); 1855 let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); 1856 assert_eq!(expected, unegate(&cls)); 1857 1858 let cls = uclass(&[('a', 'c'), ('x', 'z')]); 1859 let expected = uclass(&[ 1860 ('\x00', '\x60'), 1861 ('\x64', '\x77'), 1862 ('\x7B', '\u{10FFFF}'), 1863 ]); 1864 assert_eq!(expected, unegate(&cls)); 1865 1866 let cls = uclass(&[('\x00', 'a')]); 1867 let expected = uclass(&[('\x62', '\u{10FFFF}')]); 1868 assert_eq!(expected, unegate(&cls)); 1869 1870 let cls = uclass(&[('a', '\u{10FFFF}')]); 1871 let expected = uclass(&[('\x00', '\x60')]); 1872 assert_eq!(expected, unegate(&cls)); 1873 1874 let cls = uclass(&[('\x00', '\u{10FFFF}')]); 1875 let expected = uclass(&[]); 1876 assert_eq!(expected, unegate(&cls)); 1877 1878 let cls = uclass(&[]); 1879 let expected = uclass(&[('\x00', '\u{10FFFF}')]); 1880 assert_eq!(expected, unegate(&cls)); 1881 1882 let cls = 1883 uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]); 1884 let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); 1885 assert_eq!(expected, unegate(&cls)); 1886 1887 let cls = uclass(&[('\x00', '\u{D7FF}')]); 1888 let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); 1889 assert_eq!(expected, unegate(&cls)); 1890 1891 let cls = uclass(&[('\x00', '\u{D7FE}')]); 1892 let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); 1893 assert_eq!(expected, unegate(&cls)); 1894 1895 let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); 1896 let expected = uclass(&[('\x00', '\u{D7FF}')]); 1897 assert_eq!(expected, unegate(&cls)); 1898 1899 let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); 1900 let expected = uclass(&[('\x00', '\u{E000}')]); 1901 assert_eq!(expected, unegate(&cls)); 1902 } 1903 1904 #[test] class_negate_bytes()1905 fn class_negate_bytes() { 1906 let cls = bclass(&[(b'a', b'a')]); 1907 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); 1908 assert_eq!(expected, bnegate(&cls)); 1909 1910 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); 1911 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); 1912 assert_eq!(expected, bnegate(&cls)); 1913 1914 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); 1915 let expected = bclass(&[ 1916 (b'\x00', b'\x60'), 1917 (b'\x64', b'\x77'), 1918 (b'\x7B', b'\xFF'), 1919 ]); 1920 assert_eq!(expected, bnegate(&cls)); 1921 1922 let cls = bclass(&[(b'\x00', b'a')]); 1923 let expected = bclass(&[(b'\x62', b'\xFF')]); 1924 assert_eq!(expected, bnegate(&cls)); 1925 1926 let cls = bclass(&[(b'a', b'\xFF')]); 1927 let expected = bclass(&[(b'\x00', b'\x60')]); 1928 assert_eq!(expected, bnegate(&cls)); 1929 1930 let cls = bclass(&[(b'\x00', b'\xFF')]); 1931 let expected = bclass(&[]); 1932 assert_eq!(expected, bnegate(&cls)); 1933 1934 let cls = bclass(&[]); 1935 let expected = bclass(&[(b'\x00', b'\xFF')]); 1936 assert_eq!(expected, bnegate(&cls)); 1937 1938 let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); 1939 let expected = bclass(&[(b'\xFE', b'\xFE')]); 1940 assert_eq!(expected, bnegate(&cls)); 1941 } 1942 1943 #[test] class_union_unicode()1944 fn class_union_unicode() { 1945 let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); 1946 let cls2 = uclass(&[('a', 'z')]); 1947 let expected = uclass(&[('a', 'z'), ('A', 'C')]); 1948 assert_eq!(expected, uunion(&cls1, &cls2)); 1949 } 1950 1951 #[test] class_union_bytes()1952 fn class_union_bytes() { 1953 let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); 1954 let cls2 = bclass(&[(b'a', b'z')]); 1955 let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); 1956 assert_eq!(expected, bunion(&cls1, &cls2)); 1957 } 1958 1959 #[test] class_intersect_unicode()1960 fn class_intersect_unicode() { 1961 let cls1 = uclass(&[]); 1962 let cls2 = uclass(&[('a', 'a')]); 1963 let expected = uclass(&[]); 1964 assert_eq!(expected, uintersect(&cls1, &cls2)); 1965 1966 let cls1 = uclass(&[('a', 'a')]); 1967 let cls2 = uclass(&[('a', 'a')]); 1968 let expected = uclass(&[('a', 'a')]); 1969 assert_eq!(expected, uintersect(&cls1, &cls2)); 1970 1971 let cls1 = uclass(&[('a', 'a')]); 1972 let cls2 = uclass(&[('b', 'b')]); 1973 let expected = uclass(&[]); 1974 assert_eq!(expected, uintersect(&cls1, &cls2)); 1975 1976 let cls1 = uclass(&[('a', 'a')]); 1977 let cls2 = uclass(&[('a', 'c')]); 1978 let expected = uclass(&[('a', 'a')]); 1979 assert_eq!(expected, uintersect(&cls1, &cls2)); 1980 1981 let cls1 = uclass(&[('a', 'b')]); 1982 let cls2 = uclass(&[('a', 'c')]); 1983 let expected = uclass(&[('a', 'b')]); 1984 assert_eq!(expected, uintersect(&cls1, &cls2)); 1985 1986 let cls1 = uclass(&[('a', 'b')]); 1987 let cls2 = uclass(&[('b', 'c')]); 1988 let expected = uclass(&[('b', 'b')]); 1989 assert_eq!(expected, uintersect(&cls1, &cls2)); 1990 1991 let cls1 = uclass(&[('a', 'b')]); 1992 let cls2 = uclass(&[('c', 'd')]); 1993 let expected = uclass(&[]); 1994 assert_eq!(expected, uintersect(&cls1, &cls2)); 1995 1996 let cls1 = uclass(&[('b', 'c')]); 1997 let cls2 = uclass(&[('a', 'd')]); 1998 let expected = uclass(&[('b', 'c')]); 1999 assert_eq!(expected, uintersect(&cls1, &cls2)); 2000 2001 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2002 let cls2 = uclass(&[('a', 'h')]); 2003 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2004 assert_eq!(expected, uintersect(&cls1, &cls2)); 2005 2006 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2007 let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2008 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2009 assert_eq!(expected, uintersect(&cls1, &cls2)); 2010 2011 let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); 2012 let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); 2013 let expected = uclass(&[]); 2014 assert_eq!(expected, uintersect(&cls1, &cls2)); 2015 2016 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2017 let cls2 = uclass(&[('h', 'h')]); 2018 let expected = uclass(&[('h', 'h')]); 2019 assert_eq!(expected, uintersect(&cls1, &cls2)); 2020 2021 let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); 2022 let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); 2023 let expected = uclass(&[]); 2024 assert_eq!(expected, uintersect(&cls1, &cls2)); 2025 2026 let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); 2027 let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); 2028 let expected = uclass(&[('b', 'f')]); 2029 assert_eq!(expected, uintersect(&cls1, &cls2)); 2030 } 2031 2032 #[test] class_intersect_bytes()2033 fn class_intersect_bytes() { 2034 let cls1 = bclass(&[]); 2035 let cls2 = bclass(&[(b'a', b'a')]); 2036 let expected = bclass(&[]); 2037 assert_eq!(expected, bintersect(&cls1, &cls2)); 2038 2039 let cls1 = bclass(&[(b'a', b'a')]); 2040 let cls2 = bclass(&[(b'a', b'a')]); 2041 let expected = bclass(&[(b'a', b'a')]); 2042 assert_eq!(expected, bintersect(&cls1, &cls2)); 2043 2044 let cls1 = bclass(&[(b'a', b'a')]); 2045 let cls2 = bclass(&[(b'b', b'b')]); 2046 let expected = bclass(&[]); 2047 assert_eq!(expected, bintersect(&cls1, &cls2)); 2048 2049 let cls1 = bclass(&[(b'a', b'a')]); 2050 let cls2 = bclass(&[(b'a', b'c')]); 2051 let expected = bclass(&[(b'a', b'a')]); 2052 assert_eq!(expected, bintersect(&cls1, &cls2)); 2053 2054 let cls1 = bclass(&[(b'a', b'b')]); 2055 let cls2 = bclass(&[(b'a', b'c')]); 2056 let expected = bclass(&[(b'a', b'b')]); 2057 assert_eq!(expected, bintersect(&cls1, &cls2)); 2058 2059 let cls1 = bclass(&[(b'a', b'b')]); 2060 let cls2 = bclass(&[(b'b', b'c')]); 2061 let expected = bclass(&[(b'b', b'b')]); 2062 assert_eq!(expected, bintersect(&cls1, &cls2)); 2063 2064 let cls1 = bclass(&[(b'a', b'b')]); 2065 let cls2 = bclass(&[(b'c', b'd')]); 2066 let expected = bclass(&[]); 2067 assert_eq!(expected, bintersect(&cls1, &cls2)); 2068 2069 let cls1 = bclass(&[(b'b', b'c')]); 2070 let cls2 = bclass(&[(b'a', b'd')]); 2071 let expected = bclass(&[(b'b', b'c')]); 2072 assert_eq!(expected, bintersect(&cls1, &cls2)); 2073 2074 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2075 let cls2 = bclass(&[(b'a', b'h')]); 2076 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2077 assert_eq!(expected, bintersect(&cls1, &cls2)); 2078 2079 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2080 let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2081 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2082 assert_eq!(expected, bintersect(&cls1, &cls2)); 2083 2084 let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); 2085 let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); 2086 let expected = bclass(&[]); 2087 assert_eq!(expected, bintersect(&cls1, &cls2)); 2088 2089 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2090 let cls2 = bclass(&[(b'h', b'h')]); 2091 let expected = bclass(&[(b'h', b'h')]); 2092 assert_eq!(expected, bintersect(&cls1, &cls2)); 2093 2094 let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); 2095 let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); 2096 let expected = bclass(&[]); 2097 assert_eq!(expected, bintersect(&cls1, &cls2)); 2098 2099 let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); 2100 let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); 2101 let expected = bclass(&[(b'b', b'f')]); 2102 assert_eq!(expected, bintersect(&cls1, &cls2)); 2103 } 2104 2105 #[test] class_difference_unicode()2106 fn class_difference_unicode() { 2107 let cls1 = uclass(&[('a', 'a')]); 2108 let cls2 = uclass(&[('a', 'a')]); 2109 let expected = uclass(&[]); 2110 assert_eq!(expected, udifference(&cls1, &cls2)); 2111 2112 let cls1 = uclass(&[('a', 'a')]); 2113 let cls2 = uclass(&[]); 2114 let expected = uclass(&[('a', 'a')]); 2115 assert_eq!(expected, udifference(&cls1, &cls2)); 2116 2117 let cls1 = uclass(&[]); 2118 let cls2 = uclass(&[('a', 'a')]); 2119 let expected = uclass(&[]); 2120 assert_eq!(expected, udifference(&cls1, &cls2)); 2121 2122 let cls1 = uclass(&[('a', 'z')]); 2123 let cls2 = uclass(&[('a', 'a')]); 2124 let expected = uclass(&[('b', 'z')]); 2125 assert_eq!(expected, udifference(&cls1, &cls2)); 2126 2127 let cls1 = uclass(&[('a', 'z')]); 2128 let cls2 = uclass(&[('z', 'z')]); 2129 let expected = uclass(&[('a', 'y')]); 2130 assert_eq!(expected, udifference(&cls1, &cls2)); 2131 2132 let cls1 = uclass(&[('a', 'z')]); 2133 let cls2 = uclass(&[('m', 'm')]); 2134 let expected = uclass(&[('a', 'l'), ('n', 'z')]); 2135 assert_eq!(expected, udifference(&cls1, &cls2)); 2136 2137 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); 2138 let cls2 = uclass(&[('a', 'z')]); 2139 let expected = uclass(&[]); 2140 assert_eq!(expected, udifference(&cls1, &cls2)); 2141 2142 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); 2143 let cls2 = uclass(&[('d', 'v')]); 2144 let expected = uclass(&[('a', 'c')]); 2145 assert_eq!(expected, udifference(&cls1, &cls2)); 2146 2147 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); 2148 let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); 2149 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); 2150 assert_eq!(expected, udifference(&cls1, &cls2)); 2151 2152 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); 2153 let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); 2154 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); 2155 assert_eq!(expected, udifference(&cls1, &cls2)); 2156 2157 let cls1 = uclass(&[('x', 'z')]); 2158 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); 2159 let expected = uclass(&[('x', 'z')]); 2160 assert_eq!(expected, udifference(&cls1, &cls2)); 2161 2162 let cls1 = uclass(&[('a', 'z')]); 2163 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); 2164 let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); 2165 assert_eq!(expected, udifference(&cls1, &cls2)); 2166 } 2167 2168 #[test] class_difference_bytes()2169 fn class_difference_bytes() { 2170 let cls1 = bclass(&[(b'a', b'a')]); 2171 let cls2 = bclass(&[(b'a', b'a')]); 2172 let expected = bclass(&[]); 2173 assert_eq!(expected, bdifference(&cls1, &cls2)); 2174 2175 let cls1 = bclass(&[(b'a', b'a')]); 2176 let cls2 = bclass(&[]); 2177 let expected = bclass(&[(b'a', b'a')]); 2178 assert_eq!(expected, bdifference(&cls1, &cls2)); 2179 2180 let cls1 = bclass(&[]); 2181 let cls2 = bclass(&[(b'a', b'a')]); 2182 let expected = bclass(&[]); 2183 assert_eq!(expected, bdifference(&cls1, &cls2)); 2184 2185 let cls1 = bclass(&[(b'a', b'z')]); 2186 let cls2 = bclass(&[(b'a', b'a')]); 2187 let expected = bclass(&[(b'b', b'z')]); 2188 assert_eq!(expected, bdifference(&cls1, &cls2)); 2189 2190 let cls1 = bclass(&[(b'a', b'z')]); 2191 let cls2 = bclass(&[(b'z', b'z')]); 2192 let expected = bclass(&[(b'a', b'y')]); 2193 assert_eq!(expected, bdifference(&cls1, &cls2)); 2194 2195 let cls1 = bclass(&[(b'a', b'z')]); 2196 let cls2 = bclass(&[(b'm', b'm')]); 2197 let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); 2198 assert_eq!(expected, bdifference(&cls1, &cls2)); 2199 2200 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); 2201 let cls2 = bclass(&[(b'a', b'z')]); 2202 let expected = bclass(&[]); 2203 assert_eq!(expected, bdifference(&cls1, &cls2)); 2204 2205 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); 2206 let cls2 = bclass(&[(b'd', b'v')]); 2207 let expected = bclass(&[(b'a', b'c')]); 2208 assert_eq!(expected, bdifference(&cls1, &cls2)); 2209 2210 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); 2211 let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); 2212 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); 2213 assert_eq!(expected, bdifference(&cls1, &cls2)); 2214 2215 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); 2216 let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); 2217 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); 2218 assert_eq!(expected, bdifference(&cls1, &cls2)); 2219 2220 let cls1 = bclass(&[(b'x', b'z')]); 2221 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); 2222 let expected = bclass(&[(b'x', b'z')]); 2223 assert_eq!(expected, bdifference(&cls1, &cls2)); 2224 2225 let cls1 = bclass(&[(b'a', b'z')]); 2226 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); 2227 let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); 2228 assert_eq!(expected, bdifference(&cls1, &cls2)); 2229 } 2230 2231 #[test] class_symmetric_difference_unicode()2232 fn class_symmetric_difference_unicode() { 2233 let cls1 = uclass(&[('a', 'm')]); 2234 let cls2 = uclass(&[('g', 't')]); 2235 let expected = uclass(&[('a', 'f'), ('n', 't')]); 2236 assert_eq!(expected, usymdifference(&cls1, &cls2)); 2237 } 2238 2239 #[test] class_symmetric_difference_bytes()2240 fn class_symmetric_difference_bytes() { 2241 let cls1 = bclass(&[(b'a', b'm')]); 2242 let cls2 = bclass(&[(b'g', b't')]); 2243 let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); 2244 assert_eq!(expected, bsymdifference(&cls1, &cls2)); 2245 } 2246 2247 #[test] 2248 #[should_panic] hir_byte_literal_non_ascii()2249 fn hir_byte_literal_non_ascii() { 2250 Hir::literal(Literal::Byte(b'a')); 2251 } 2252 2253 // We use a thread with an explicit stack size to test that our destructor 2254 // for Hir can handle arbitrarily sized expressions in constant stack 2255 // space. In case we run on a platform without threads (WASM?), we limit 2256 // this test to Windows/Unix. 2257 #[test] 2258 #[cfg(any(unix, windows))] no_stack_overflow_on_drop()2259 fn no_stack_overflow_on_drop() { 2260 use std::thread; 2261 2262 let run = || { 2263 let mut expr = Hir::empty(); 2264 for _ in 0..100 { 2265 expr = Hir::group(Group { 2266 kind: GroupKind::NonCapturing, 2267 hir: Box::new(expr), 2268 }); 2269 expr = Hir::repetition(Repetition { 2270 kind: RepetitionKind::ZeroOrOne, 2271 greedy: true, 2272 hir: Box::new(expr), 2273 }); 2274 2275 expr = Hir { 2276 kind: HirKind::Concat(vec![expr]), 2277 info: HirInfo::new(), 2278 }; 2279 expr = Hir { 2280 kind: HirKind::Alternation(vec![expr]), 2281 info: HirInfo::new(), 2282 }; 2283 } 2284 assert!(!expr.kind.is_empty()); 2285 }; 2286 2287 // We run our test on a thread with a small stack size so we can 2288 // force the issue more easily. 2289 // 2290 // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests' 2291 // for context on the specific stack size chosen here. 2292 thread::Builder::new() 2293 .stack_size(16 << 10) 2294 .spawn(run) 2295 .unwrap() 2296 .join() 2297 .unwrap(); 2298 } 2299 } 2300