use core::{borrow::Borrow, cell::RefCell};

use alloc::{sync::Arc, vec, vec::Vec};

use regex_syntax::{
    hir::{self, Hir},
    utf8::{Utf8Range, Utf8Sequences},
    ParserBuilder,
};

use crate::{
    nfa::thompson::{
        builder::Builder,
        error::BuildError,
        literal_trie::LiteralTrie,
        map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
        nfa::{Transition, NFA},
        range_trie::RangeTrie,
    },
    util::{
        look::{Look, LookMatcher},
        primitives::{PatternID, StateID},
    },
};

/// The configuration used for a Thompson NFA compiler.
#[derive(Clone, Debug, Default)]
pub struct Config {
    utf8: Option<bool>,
    reverse: Option<bool>,
    nfa_size_limit: Option<Option<usize>>,
    shrink: Option<bool>,
    which_captures: Option<WhichCaptures>,
    look_matcher: Option<LookMatcher>,
    #[cfg(test)]
    unanchored_prefix: Option<bool>,
}

impl Config {
    /// Return a new default Thompson NFA compiler configuration.
    pub fn new() -> Config {
        Config::default()
    }

    /// Whether to enable UTF-8 mode during search or not.
    ///
    /// A regex engine is said to be in UTF-8 mode when it guarantees that
    /// all matches returned by it have spans consisting of only valid UTF-8.
    /// That is, it is impossible for a match span to be returned that
    /// contains any invalid UTF-8.
    ///
    /// UTF-8 mode generally consists of two things:
    ///
    /// 1. Whether the NFA's states are constructed such that all paths to a
    /// match state that consume at least one byte always correspond to valid
    /// UTF-8.
    /// 2. Whether all paths to a match state that do _not_ consume any bytes
    /// should always correspond to valid UTF-8 boundaries.
    ///
    /// (1) is a guarantee made by whoever constructs the NFA.
    /// If you're parsing a regex from its concrete syntax, then
    /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
    /// this guarantee for you. It does it by returning an error if the regex
    /// pattern could ever report a non-empty match span that contains invalid
    /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
    /// successfully parses, then you're guaranteed that the corresponding NFA
    /// will only ever report non-empty match spans containing valid UTF-8.
    ///
    /// (2) is a trickier guarantee because it cannot be enforced by the NFA
    /// state graph itself. Consider, for example, the regex `a*`. It matches
    /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where
    /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
    /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
    /// guarantee must be made at a higher level than the NFA state graph
    /// itself. This crate deals with this case in each regex engine. Namely,
    /// when a zero-width match that splits a codepoint is found and UTF-8
    /// mode is enabled, then it is ignored and the engine moves on looking
    /// for the next match.
    ///
    /// Thus, UTF-8 mode is both a promise that the NFA built only reports
    /// non-empty matches that are valid UTF-8, and an *instruction* to regex
    /// engines that empty matches that split codepoints should be banned.
    ///
    /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
    /// it only makes sense to enable this option when you *know* your haystack
    /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
    /// searching a haystack that contains invalid UTF-8 leads to **unspecified
    /// behavior**.
    ///
    /// Therefore, it may make sense to enable `syntax::Config::utf8` while
    /// simultaneously *disabling* this option. That would ensure all non-empty
    /// match spans are valid UTF-8, but that empty match spans may still split
    /// a codepoint or match at other places that aren't valid UTF-8.
    ///
    /// In general, this mode is only relevant if your regex can match the
    /// empty string. Most regexes don't.
    ///
    /// This is enabled by default.
    ///
    /// # Example
    ///
    /// This example shows how UTF-8 mode can impact the match spans that may
    /// be reported in certain cases.
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::{self, pikevm::PikeVM},
    ///     Match, Input,
    /// };
    ///
    /// let re = PikeVM::new("")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    ///
    /// // UTF-8 mode is enabled by default.
    /// let mut input = Input::new("☃");
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
    ///
    /// // Even though an empty regex matches at 1..1, our next match is
    /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
    /// // three bytes long).
    /// input.set_start(1);
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
    ///
    /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
    /// let re = PikeVM::builder()
    ///     .thompson(thompson::Config::new().utf8(false))
    ///     .build("")?;
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
    ///
    /// input.set_start(2);
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
    ///
    /// input.set_start(3);
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
    ///
    /// input.set_start(4);
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(None, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn utf8(mut self, yes: bool) -> Config {
        self.utf8 = Some(yes);
        self
    }

    /// Reverse the NFA.
    ///
    /// A NFA reversal is performed by reversing all of the concatenated
    /// sub-expressions in the original pattern, recursively. (Look around
    /// operators are also inverted.) The resulting NFA can be used to match
    /// the pattern starting from the end of a string instead of the beginning
    /// of a string.
    ///
    /// Reversing the NFA is useful for building a reverse DFA, which is most
    /// useful for finding the start of a match after its ending position has
    /// been found. NFA execution engines typically do not work on reverse
    /// NFAs. For example, currently, the Pike VM reports the starting location
    /// of matches without a reverse NFA.
    ///
    /// Currently, enabling this setting requires disabling the
    /// [`captures`](Config::captures) setting. If both are enabled, then the
    /// compiler will return an error. It is expected that this limitation will
    /// be lifted in the future.
    ///
    /// This is disabled by default.
    ///
    /// # Example
    ///
    /// This example shows how to build a DFA from a reverse NFA, and then use
    /// the DFA to search backwards.
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{self, Automaton},
    ///     nfa::thompson::{NFA, WhichCaptures},
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = dfa::dense::Builder::new()
    ///     .thompson(NFA::config()
    ///         .which_captures(WhichCaptures::None)
    ///         .reverse(true)
    ///     )
    ///     .build("baz[0-9]+")?;
    /// let expected = Some(HalfMatch::must(0, 3));
    /// assert_eq!(
    ///     expected,
    ///     dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reverse(mut self, yes: bool) -> Config {
        self.reverse = Some(yes);
        self
    }

    /// Sets an approximate size limit on the total heap used by the NFA being
    /// compiled.
    ///
    /// This permits imposing constraints on the size of a compiled NFA. This
    /// may be useful in contexts where the regex pattern is untrusted and one
    /// wants to avoid using too much memory.
    ///
    /// This size limit does not apply to auxiliary heap used during
    /// compilation that is not part of the built NFA.
    ///
    /// Note that this size limit is applied during compilation in order for
    /// the limit to prevent too much heap from being used. However, the
    /// implementation may use an intermediate NFA representation that is
    /// otherwise slightly bigger than the final public form. Since the size
    /// limit may be applied to an intermediate representation, there is not
    /// necessarily a precise correspondence between the configured size limit
    /// and the heap usage of the final NFA.
    ///
    /// There is no size limit by default.
    ///
    /// # Example
    ///
    /// This example demonstrates how Unicode mode can greatly increase the
    /// size of the NFA.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::nfa::thompson::NFA;
    ///
    /// // 300KB isn't enough!
    /// NFA::compiler()
    ///     .configure(NFA::config().nfa_size_limit(Some(300_000)))
    ///     .build(r"\w{20}")
    ///     .unwrap_err();
    ///
    /// // ... but 400KB probably is.
    /// let nfa = NFA::compiler()
    ///     .configure(NFA::config().nfa_size_limit(Some(400_000)))
    ///     .build(r"\w{20}")?;
    ///
    /// assert_eq!(nfa.pattern_len(), 1);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
        self.nfa_size_limit = Some(bytes);
        self
    }

    /// Apply best effort heuristics to shrink the NFA at the expense of more
    /// time/memory.
    ///
    /// Generally speaking, if one is using an NFA to compile a DFA, then the
    /// extra time used to shrink the NFA will be more than made up for during
    /// DFA construction (potentially by a lot). In other words, enabling this
    /// can substantially decrease the overall amount of time it takes to build
    /// a DFA.
    ///
    /// A reason to keep this disabled is if you want to compile an NFA and
    /// start using it as quickly as possible without needing to build a DFA,
    /// and you don't mind using a bit of extra memory for the NFA. e.g., for
    /// an NFA simulation or for a lazy DFA.
    ///
    /// NFA shrinking is currently most useful when compiling a reverse
    /// NFA with large Unicode character classes. In particular, it trades
    /// additional CPU time during NFA compilation in favor of generating fewer
    /// NFA states.
    ///
    /// This is disabled by default because it can increase compile times
    /// quite a bit if you aren't building a full DFA.
    ///
    /// # Example
    ///
    /// This example shows that NFA shrinking can lead to substantial space
    /// savings in some cases. Notice that, as noted above, we build a reverse
    /// DFA and use a pattern with a large Unicode character class.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
    ///
    /// // Currently we have to disable captures when enabling reverse NFA.
    /// let config = NFA::config()
    ///     .which_captures(WhichCaptures::None)
    ///     .reverse(true);
    /// let not_shrunk = NFA::compiler()
    ///     .configure(config.clone().shrink(false))
    ///     .build(r"\w")?;
    /// let shrunk = NFA::compiler()
    ///     .configure(config.clone().shrink(true))
    ///     .build(r"\w")?;
    ///
    /// // While a specific shrink factor is not guaranteed, the savings can be
    /// // considerable in some cases.
    /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn shrink(mut self, yes: bool) -> Config {
        self.shrink = Some(yes);
        self
    }

    /// Whether to include 'Capture' states in the NFA.
    ///
    /// Currently, enabling this setting requires disabling the
    /// [`reverse`](Config::reverse) setting. If both are enabled, then the
    /// compiler will return an error. It is expected that this limitation will
    /// be lifted in the future.
    ///
    /// This is enabled by default.
    ///
    /// # Example
    ///
    /// This example demonstrates that some regex engines, like the Pike VM,
    /// require capturing states to be present in the NFA to report match
    /// offsets.
    ///
    /// (Note that since this method is deprecated, the example below uses
    /// [`Config::which_captures`] to disable capture states.)
    ///
    /// ```
    /// use regex_automata::nfa::thompson::{
    ///     pikevm::PikeVM,
    ///     NFA,
    ///     WhichCaptures,
    /// };
    ///
    /// let re = PikeVM::builder()
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
    ///     .build(r"[a-z]+")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.is_match(&mut cache, "abc"));
    /// assert_eq!(None, re.find(&mut cache, "abc"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[deprecated(since = "0.3.5", note = "use which_captures instead")]
    pub fn captures(self, yes: bool) -> Config {
        // Delegate to the replacement API: `true` maps to all capture states,
        // `false` maps to none.
        self.which_captures(if yes {
            WhichCaptures::All
        } else {
            WhichCaptures::None
        })
    }

    /// Configures what kinds of capture groups are compiled into
    /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
    /// Thompson NFA.
    ///
    /// Currently, using any option except for [`WhichCaptures::None`] requires
    /// disabling the [`reverse`](Config::reverse) setting. If both are
    /// enabled, then the compiler will return an error. It is expected that
    /// this limitation will be lifted in the future.
    ///
    /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
    /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
    /// overhead of capture states for explicit groups. Usually this occurs
    /// when one wants to use the `PikeVM` only for determining the overall
    /// match. Otherwise, the `PikeVM` could use much more memory than is
    /// necessary.
    ///
    /// # Example
    ///
    /// This example demonstrates that some regex engines, like the Pike VM,
    /// require capturing states to be present in the NFA to report match
    /// offsets.
    ///
    /// ```
    /// use regex_automata::nfa::thompson::{
    ///     pikevm::PikeVM,
    ///     NFA,
    ///     WhichCaptures,
    /// };
    ///
    /// let re = PikeVM::builder()
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
    ///     .build(r"[a-z]+")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.is_match(&mut cache, "abc"));
    /// assert_eq!(None, re.find(&mut cache, "abc"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// The same applies to the bounded backtracker:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::{
    ///     backtrack::BoundedBacktracker,
    ///     NFA,
    ///     WhichCaptures,
    /// };
    ///
    /// let re = BoundedBacktracker::builder()
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
    ///     .build(r"[a-z]+")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.try_is_match(&mut cache, "abc")?);
    /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
        self.which_captures = Some(which_captures);
        self
    }

    /// Sets the look-around matcher that should be used with this NFA.
    ///
    /// A look-around matcher determines how to match look-around assertions.
    /// In particular, some assertions are configurable. For example, the
    /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
    /// from the default of `\n` to any other byte.
    ///
    /// # Example
    ///
    /// This shows how to change the line terminator for multi-line assertions.
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::{self, pikevm::PikeVM},
    ///     util::look::LookMatcher,
    ///     Match, Input,
    /// };
    ///
    /// let mut lookm = LookMatcher::new();
    /// lookm.set_line_terminator(b'\x00');
    ///
    /// let re = PikeVM::builder()
    ///     .thompson(thompson::Config::new().look_matcher(lookm))
    ///     .build(r"(?m)^[a-z]+$")?;
    /// let mut cache = re.create_cache();
    ///
    /// // Multi-line assertions now use NUL as a terminator.
    /// assert_eq!(
    ///     Some(Match::must(0, 1..4)),
    ///     re.find(&mut cache, b"\x00abc\x00"),
    /// );
    /// // ... and \n is no longer recognized as a terminator.
    /// assert_eq!(
    ///     None,
    ///     re.find(&mut cache, b"\nabc\n"),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn look_matcher(mut self, m: LookMatcher) -> Config {
        self.look_matcher = Some(m);
        self
    }

    /// Whether to compile an unanchored prefix into this NFA.
    ///
    /// This is enabled by default. It is made available for tests only to make
    /// it easier to unit test the output of the compiler.
    #[cfg(test)]
    fn unanchored_prefix(mut self, yes: bool) -> Config {
        self.unanchored_prefix = Some(yes);
        self
    }

    /// Returns whether this configuration has enabled UTF-8 mode.
    pub fn get_utf8(&self) -> bool {
        self.utf8.unwrap_or(true)
    }

    /// Returns whether this configuration has enabled reverse NFA compilation.
    pub fn get_reverse(&self) -> bool {
        self.reverse.unwrap_or(false)
    }

    /// Return the configured NFA size limit, if it exists, in the number of
    /// bytes of heap used.
    pub fn get_nfa_size_limit(&self) -> Option<usize> {
        self.nfa_size_limit.unwrap_or(None)
    }

    /// Return whether NFA shrinking is enabled.
    pub fn get_shrink(&self) -> bool {
        self.shrink.unwrap_or(false)
    }

    /// Return whether NFA compilation is configured to produce capture states.
    #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
    pub fn get_captures(&self) -> bool {
        self.get_which_captures().is_any()
    }

    /// Return what kinds of capture states will be compiled into an NFA.
    pub fn get_which_captures(&self) -> WhichCaptures {
        self.which_captures.unwrap_or(WhichCaptures::All)
    }

    /// Return the look-around matcher for this NFA.
    pub fn get_look_matcher(&self) -> LookMatcher {
        self.look_matcher.clone().unwrap_or(LookMatcher::default())
    }

    /// Return whether NFA compilation is configured to include an unanchored
    /// prefix.
    ///
    /// This is always false when not in test mode.
    fn get_unanchored_prefix(&self) -> bool {
        #[cfg(test)]
        {
            self.unanchored_prefix.unwrap_or(true)
        }
        #[cfg(not(test))]
        {
            true
        }
    }

    /// Overwrite the default configuration such that the options in `o` are
    /// always used. If an option in `o` is not set, then the corresponding
    /// option in `self` is used. If it's not set in `self` either, then it
    /// remains not set.
    pub(crate) fn overwrite(&self, o: Config) -> Config {
        Config {
            utf8: o.utf8.or(self.utf8),
            reverse: o.reverse.or(self.reverse),
            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
            shrink: o.shrink.or(self.shrink),
            which_captures: o.which_captures.or(self.which_captures),
            look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
            #[cfg(test)]
            unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
        }
    }
}

/// A configuration indicating which kinds of
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
///
/// This configuration can be used with [`Config::which_captures`] to control
/// which capture states are compiled into a Thompson NFA.
///
/// The default configuration is [`WhichCaptures::All`].
#[derive(Clone, Copy, Debug)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// if one is only trying to build a DFA) or if they aren't supported (for
    /// example, a reverse NFA).
    None,
}

impl Default for WhichCaptures {
    fn default() -> WhichCaptures {
        WhichCaptures::All
    }
}

impl WhichCaptures {
    /// Returns true if this configuration indicates that no capture states
    /// should be produced in an NFA.
    pub fn is_none(&self) -> bool {
        matches!(*self, WhichCaptures::None)
    }

    /// Returns true if this configuration indicates that some capture states
    /// should be added to an NFA. Note that this might only include capture
    /// states for implicit capture groups.
    pub fn is_any(&self) -> bool {
        !self.is_none()
    }
}

/*
This compiler below uses Thompson's construction algorithm. The compiler takes
a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
is structured in a way that permits it to be executed by a virtual machine and
also used to efficiently build a DFA.

The compiler deals with a slightly expanded set of NFA states than what is
in a final NFA (as exhibited by builder::State and nfa::State). Notably a
compiler state includes an empty node that has exactly one unconditional
epsilon transition to the next state. In other words, it's a "goto" instruction
if one views Thompson's NFA as a set of bytecode instructions. These goto
instructions are removed in a subsequent phase before returning the NFA to the
caller. The purpose of these empty nodes is that they make the construction
algorithm substantially simpler to implement. We remove them before returning
to the caller because they can represent substantial overhead when traversing
the NFA graph (either while searching using the NFA directly or while building
a DFA).

In the future, it would be nice to provide a Glushkov compiler as well, as it
would work well as a bit-parallel NFA for smaller regexes.
But the Thompson
construction is one I'm more familiar with and seems more straight-forward to
deal with when it comes to large Unicode character classes.

Internally, the compiler uses interior mutability to improve composition in the
face of the borrow checker. In particular, we'd really like to be able to write
things like this:

    self.c_concat(exprs.iter().map(|e| self.c(e)))

Which elegantly uses iterators to build up a sequence of compiled regex
sub-expressions and then hands it off to the concatenating compiler routine.
Without interior mutability, the borrow checker won't let us borrow `self`
mutably both inside and outside the closure at the same time.
*/

/// A builder for compiling an NFA from a regex's high-level intermediate
/// representation (HIR).
///
/// This compiler provides a way to translate a parsed regex pattern into an
/// NFA state graph. The NFA state graph can either be used directly to execute
/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
///
/// This compiler provides APIs both for compiling regex patterns directly from
/// their concrete syntax, or via a [`regex_syntax::hir::Hir`].
///
/// This compiler has various options that may be configured via
/// [`thompson::Config`](Config).
///
/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
/// A `Builder` provides a lower level API that is uncoupled from a regex
/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
/// together an NFA by hand. See its docs for examples.
///
/// # Example: compilation from concrete syntax
///
/// This shows how to compile an NFA from a pattern string while setting a size
/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{NFA, pikevm::PikeVM},
///     Match,
/// };
///
/// let config = NFA::config().nfa_size_limit(Some(1_000));
/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
///
/// let re = PikeVM::new_from_nfa(nfa)?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
/// let expected = Some(Match::must(0, 3..4));
/// re.captures(&mut cache, "!@#A#@!", &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: compilation from HIR
///
/// This shows how to hand assemble a regular expression via its HIR, and then
/// compile an NFA directly from it.
///
/// ```
/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
///
/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
///     ClassBytesRange::new(b'0', b'9'),
///     ClassBytesRange::new(b'A', b'Z'),
///     ClassBytesRange::new(b'_', b'_'),
///     ClassBytesRange::new(b'a', b'z'),
/// ])));
///
/// let config = NFA::config().nfa_size_limit(Some(1_000));
/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
///
/// let re = PikeVM::new_from_nfa(nfa)?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
/// let expected = Some(Match::must(0, 3..4));
/// re.captures(&mut cache, "!@#A#@!", &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Compiler {
    /// A regex parser, used when compiling an NFA directly from a pattern
    /// string.
    parser: ParserBuilder,
    /// The compiler configuration.
    config: Config,
    /// The builder for actually constructing an NFA. This provides a
    /// convenient abstraction for writing a compiler.
    builder: RefCell<Builder>,
    /// State used for compiling character classes to UTF-8 byte automata.
    /// State is not retained between character class compilations. This just
    /// serves to amortize allocation to the extent possible.
    utf8_state: RefCell<Utf8State>,
    /// State used for arranging character classes in reverse into a trie.
    trie_state: RefCell<RangeTrie>,
    /// State used for caching common suffixes when compiling reverse UTF-8
    /// automata (for Unicode character classes).
    utf8_suffix: RefCell<Utf8SuffixMap>,
}

impl Compiler {
    /// Create a new NFA builder with its default configuration.
    pub fn new() -> Compiler {
        Compiler {
            parser: ParserBuilder::new(),
            config: Config::default(),
            builder: RefCell::new(Builder::new()),
            utf8_state: RefCell::new(Utf8State::new()),
            trie_state: RefCell::new(RangeTrie::new()),
            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
        }
    }

    /// Compile the given regular expression pattern into an NFA.
    ///
    /// If there was a problem parsing the regex, then that error is returned.
    ///
    /// Otherwise, if there was a problem building the NFA, then an error is
    /// returned. The only error that can occur is if the compiled regex would
    /// exceed the size limits configured on this builder, or if any part of
    /// the NFA would exceed the integer representations used. (For example,
    /// too many states might plausibly occur on a 16-bit target.)
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(0, 3..4));
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
        self.build_many(&[pattern])
    }

    /// Compile the given regular expression patterns into a single NFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many(&[
    ///     r"(?-u)\s",
    ///     r"(?-u)\w",
    /// ])?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<NFA, BuildError> {
        // Parse every pattern into an HIR first; any syntax error aborts the
        // whole compilation.
        let mut hirs = vec![];
        for p in patterns {
            hirs.push(
                self.parser
                    .build()
                    .parse(p.as_ref())
                    .map_err(BuildError::syntax)?,
            );
            debug!("parsed: {:?}", p.as_ref());
        }
        self.build_many_from_hir(&hirs)
    }

    /// Compile the given high level intermediate representation of a regular
    /// expression into an NFA.
    ///
    /// If there was a problem building the NFA, then an error is returned. The
    /// only error that can occur is if the compiled regex would exceed the
    /// size limits configured on this builder, or if any part of the NFA would
    /// exceed the integer representations used. (For example, too many states
    /// might plausibly occur on a 16-bit target.)
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///     ClassBytesRange::new(b'0', b'9'),
    ///     ClassBytesRange::new(b'A', b'Z'),
    ///     ClassBytesRange::new(b'_', b'_'),
    ///     ClassBytesRange::new(b'a', b'z'),
    /// ])));
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(0, 3..4));
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
        self.build_many_from_hir(&[expr])
    }

    /// Compile the given high level intermediate representations of regular
    /// expressions into a single NFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hirs = &[
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'\t', b'\r'),
    ///         ClassBytesRange::new(b' ', b' '),
    ///     ]))),
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'0', b'9'),
    ///         ClassBytesRange::new(b'A', b'Z'),
    ///         ClassBytesRange::new(b'_', b'_'),
    ///         ClassBytesRange::new(b'a', b'z'),
    ///     ]))),
    /// ];
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many_from_hir<H: Borrow<Hir>>(
        &self,
        exprs: &[H],
    ) -> Result<NFA, BuildError> {
        self.compile(exprs)
    }

    /// Apply the given NFA configuration options to this builder.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::nfa::thompson::NFA;
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
    /// assert_eq!(nfa.pattern_len(), 1);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn configure(&mut self, config: Config) -> &mut Compiler {
        self.config = self.config.overwrite(config);
        self
    }

    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This permits setting things like case insensitivity, Unicode and multi
    /// line mode.
    ///
    /// This syntax configuration only applies when an NFA is built directly
    /// from a pattern string. If an NFA is built from an HIR, then all syntax
    /// settings are ignored.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
    ///
    /// let syntax_config = syntax::Config::new().unicode(false);
    /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
    /// // If Unicode were enabled, the number of states would be much bigger.
    /// assert!(nfa.states().len() < 15);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn syntax(
        &mut self,
        config: crate::util::syntax::Config,
    ) -> &mut Compiler {
        config.apply(&mut self.parser);
        self
    }
}

impl Compiler {
    /// Compile the sequence of HIR expressions given. Pattern IDs are
    /// allocated starting from 0, in correspondence with the slice given.
    ///
    /// It is legal to provide an empty slice. In that case, the NFA returned
    /// has no patterns and will never match anything.
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
        // Reject pattern sets that cannot be indexed by a PatternID.
        if exprs.len() > PatternID::LIMIT {
            return Err(BuildError::too_many_patterns(exprs.len()));
        }
        // Capture states are not supported when compiling in reverse.
        if self.config.get_reverse()
            && self.config.get_which_captures().is_any()
        {
            return Err(BuildError::unsupported_captures());
        }

        // Reset the shared builder and push the relevant configuration
        // knobs down into it before compiling anything.
        self.builder.borrow_mut().clear();
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
        self.builder
            .borrow_mut()
            .set_look_matcher(self.config.get_look_matcher());
        self.builder
            .borrow_mut()
            .set_size_limit(self.config.get_nfa_size_limit())?;

        // We always add an unanchored prefix unless we were specifically told
        // not to (for tests only), or if we know that the regex is anchored
        // for all matches. When an unanchored prefix is not added, then the
        // NFA's anchored and unanchored start states are equivalent.
        let all_anchored = exprs.iter().all(|e| {
            let props = e.borrow().properties();
            if self.config.get_reverse() {
                // In reverse mode the "start" of a match is its end, so the
                // relevant anchor is '$' in the suffix.
                props.look_set_suffix().contains(hir::Look::End)
            } else {
                props.look_set_prefix().contains(hir::Look::Start)
            }
        });
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
        let unanchored_prefix = if anchored {
            self.c_empty()?
        } else {
            // The conventional '(?s-u:.)*?' prefix: non-greedy, any byte.
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
        };

        // Compile each pattern with its own implicit group (index 0) and its
        // own match state, then join all patterns into one alternation.
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
            let _ = self.start_pattern()?;
            let one = self.c_cap(0, None, e.borrow())?;
            let match_state_id = self.add_match()?;
            self.patch(one.end, match_state_id)?;
            let _ = self.finish_pattern(one.start)?;
            Ok(ThompsonRef { start: one.start, end: match_state_id })
        }))?;
        self.patch(unanchored_prefix.end, compiled.start)?;
        let nfa = self
            .builder
            .borrow_mut()
            .build(compiled.start, unanchored_prefix.start)?;

        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
        Ok(nfa)
    }

    /// Compile an arbitrary HIR expression.
    ///
    /// This simply dispatches on the HIR kind to the corresponding
    /// specialized compilation routine.
    fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
        use regex_syntax::hir::{Class, HirKind::*};

        match *expr.kind() {
            Empty => self.c_empty(),
            Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
            Class(Class::Bytes(ref c)) => self.c_byte_class(c),
            Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
            Look(ref look) => self.c_look(look),
            Repetition(ref rep) => self.c_repetition(rep),
            Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
            Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
            Alternation(ref es) => self.c_alt_slice(es),
        }
    }

    /// Compile a concatenation of the sub-expressions yielded by the given
    /// iterator. If the iterator yields no elements, then this compiles down
    /// to an "empty" state that always matches.
    ///
    /// If the compiler is in reverse mode, then the expressions given are
    /// automatically compiled in reverse.
c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> where I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,1019 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> 1020 where 1021 I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>, 1022 { 1023 let first = if self.is_reverse() { it.next_back() } else { it.next() }; 1024 let ThompsonRef { start, mut end } = match first { 1025 Some(result) => result?, 1026 None => return self.c_empty(), 1027 }; 1028 loop { 1029 let next = 1030 if self.is_reverse() { it.next_back() } else { it.next() }; 1031 let compiled = match next { 1032 Some(result) => result?, 1033 None => break, 1034 }; 1035 self.patch(end, compiled.start)?; 1036 end = compiled.end; 1037 } 1038 Ok(ThompsonRef { start, end }) 1039 } 1040 1041 /// Compile an alternation of the given HIR values. 1042 /// 1043 /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead 1044 /// of an iterator of compiled NFA subgraphs. The point of accepting a 1045 /// slice here is that it opens up some optimization opportunities. For 1046 /// example, if all of the HIR values are literals, then this routine might 1047 /// re-shuffle them to make NFA epsilon closures substantially faster. 
c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError>1048 fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> { 1049 // self.c_alt_iter(exprs.iter().map(|e| self.c(e))) 1050 let literal_count = exprs 1051 .iter() 1052 .filter(|e| { 1053 matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_))) 1054 }) 1055 .count(); 1056 if literal_count <= 1 || literal_count < exprs.len() { 1057 return self.c_alt_iter(exprs.iter().map(|e| self.c(e))); 1058 } 1059 1060 let mut trie = if self.is_reverse() { 1061 LiteralTrie::reverse() 1062 } else { 1063 LiteralTrie::forward() 1064 }; 1065 for expr in exprs.iter() { 1066 let literal = match *expr.kind() { 1067 hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes, 1068 _ => unreachable!(), 1069 }; 1070 trie.add(literal)?; 1071 } 1072 trie.compile(&mut self.builder.borrow_mut()) 1073 } 1074 1075 /// Compile an alternation, where each element yielded by the given 1076 /// iterator represents an item in the alternation. If the iterator yields 1077 /// no elements, then this compiles down to a "fail" state. 1078 /// 1079 /// In an alternation, expressions appearing earlier are "preferred" at 1080 /// match time over expressions appearing later. At least, this is true 1081 /// when using "leftmost first" match semantics. (If "leftmost longest" are 1082 /// ever added in the future, then this preference order of priority would 1083 /// not apply in that mode.) 
    fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
    where
        I: Iterator<Item = Result<ThompsonRef, BuildError>>,
    {
        // An empty alternation can never match anything: compile to "fail".
        let first = match it.next() {
            None => return self.c_fail(),
            Some(result) => result?,
        };
        // A single-element alternation is just that element.
        let second = match it.next() {
            None => return Ok(first),
            Some(result) => result?,
        };

        // Two or more alternates: one "union" state fans out to every
        // alternate and every alternate funnels into a shared empty state.
        // The order in which alternates are patched into the union state
        // encodes match preference under leftmost-first semantics.
        let union = self.add_union()?;
        let end = self.add_empty()?;
        self.patch(union, first.start)?;
        self.patch(first.end, end)?;
        self.patch(union, second.start)?;
        self.patch(second.end, end)?;
        for result in it {
            let compiled = result?;
            self.patch(union, compiled.start)?;
            self.patch(compiled.end, end)?;
        }
        Ok(ThompsonRef { start: union, end })
    }

    /// Compile the given capture sub-expression. `expr` should be the
    /// sub-expression contained inside the capture. If "capture" states are
    /// enabled, then they are added as appropriate.
    ///
    /// This accepts the pieces of a capture instead of a `hir::Capture` so
    /// that it's easy to manufacture a "fake" group when necessary, e.g., for
    /// adding the entire pattern as if it were a group in order to create
    /// appropriate "capture" states in the NFA.
    fn c_cap(
        &self,
        index: u32,
        name: Option<&str>,
        expr: &Hir,
    ) -> Result<ThompsonRef, BuildError> {
        match self.config.get_which_captures() {
            // No capture states means we always skip them.
            WhichCaptures::None => return self.c(expr),
            // Implicit captures states means we only add when index==0 since
            // index==0 implies the group is implicit.
            WhichCaptures::Implicit if index > 0 => return self.c(expr),
            _ => {}
        }

        // Bracket the compiled sub-expression with capture start/end states.
        let start = self.add_capture_start(index, name)?;
        let inner = self.c(expr)?;
        let end = self.add_capture_end(index)?;
        self.patch(start, inner.start)?;
        self.patch(inner.end, end)?;
        Ok(ThompsonRef { start, end })
    }

    /// Compile the given repetition expression. This handles all types of
    /// repetitions and greediness.
    fn c_repetition(
        &self,
        rep: &hir::Repetition,
    ) -> Result<ThompsonRef, BuildError> {
        // Dispatch on the (min, max) shape of the repetition.
        match (rep.min, rep.max) {
            (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
            (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
            (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
            (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
        }
    }

    /// Compile the given expression such that it matches at least `min` times,
    /// but no more than `max` times.
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_bounded(
        &self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> Result<ThompsonRef, BuildError> {
        // The mandatory part: 'min' copies compiled back to back.
        let prefix = self.c_exactly(expr, min)?;
        if min == max {
            return Ok(prefix);
        }

        // It is tempting here to compile the rest here as a concatenation
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
        // were `aaa?a?a?`. The problem here is that it leads to this program:
        //
        //     >000000: 61 => 01
        //     000001: 61 => 02
        //     000002: union(03, 04)
        //     000003: 61 => 04
        //     000004: union(05, 06)
        //     000005: 61 => 06
        //     000006: union(07, 08)
        //     000007: 61 => 08
        //     000008: MATCH
        //
        // And effectively, once you hit state 2, the epsilon closure will
        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
        // to instead compile it like so:
        //
        //     >000000: 61 => 01
        //     000001: 61 => 02
        //     000002: union(03, 08)
        //     000003: 61 => 04
        //     000004: union(05, 08)
        //     000005: 61 => 06
        //     000006: union(07, 08)
        //     000007: 61 => 08
        //     000008: MATCH
        //
        // So that the epsilon closure of state 2 is now just 3 and 8.
        let empty = self.add_empty()?;
        let mut prev_end = prefix.end;
        for _ in min..max {
            // Each optional copy branches between "match one more" and
            // "jump straight to the end"; greediness decides which branch
            // is preferred.
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let compiled = self.c(expr)?;
            self.patch(prev_end, union)?;
            self.patch(union, compiled.start)?;
            self.patch(union, empty)?;
            prev_end = compiled.end;
        }
        self.patch(prev_end, empty)?;
        Ok(ThompsonRef { start: prefix.start, end: empty })
    }

    /// Compile the given expression such that it may be matched `n` or more
    /// times, where `n` can be any integer. (Although a particularly large
    /// integer is likely to run afoul of any configured size limits.)
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
    fn c_at_least(
        &self,
        expr: &Hir,
        greedy: bool,
        n: u32,
    ) -> Result<ThompsonRef, BuildError> {
        if n == 0 {
            // When the expression cannot match the empty string, then we
            // can get away with something much simpler: just one 'alt'
            // instruction that optionally repeats itself. But if the expr
            // can match the empty string... see below.
            if expr.properties().minimum_len().map_or(false, |len| len > 0) {
                let union = if greedy {
                    self.add_union()
                } else {
                    self.add_union_reverse()
                }?;
                let compiled = self.c(expr)?;
                self.patch(union, compiled.start)?;
                self.patch(compiled.end, union)?;
                // The union doubles as both entry and exit: it either enters
                // the loop body or falls through to whatever comes next.
                return Ok(ThompsonRef { start: union, end: union });
            }

            // What's going on here? Shouldn't x* be simpler than this? It
            // turns out that when implementing leftmost-first (Perl-like)
            // match semantics, x* results in an incorrect preference order
            // when computing the transitive closure of states if and only if
            // 'x' can match the empty string. So instead, we compile x* as
            // (x+)?, which preserves the correct preference order.
            //
            // See: https://github.com/rust-lang/regex/issues/779
            let compiled = self.c(expr)?;
            // The 'plus' union closes the x+ loop back to the body.
            let plus = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, plus)?;
            self.patch(plus, compiled.start)?;

            // The 'question' union makes the whole x+ optional.
            let question = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let empty = self.add_empty()?;
            self.patch(question, compiled.start)?;
            self.patch(question, empty)?;
            self.patch(plus, empty)?;
            Ok(ThompsonRef { start: question, end: empty })
        } else if n == 1 {
            // x{1,} is x+: compile x once and loop it back on itself through
            // a union that also permits leaving the loop.
            let compiled = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, union)?;
            self.patch(union, compiled.start)?;
            Ok(ThompsonRef { start: compiled.start, end: union })
        } else {
            // x{n,} for n >= 2 is x{n-1} followed by x+.
            let prefix = self.c_exactly(expr, n - 1)?;
            let last = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(prefix.end, last.start)?;
            self.patch(last.end, union)?;
            self.patch(union, last.start)?;
            Ok(ThompsonRef { start: prefix.start, end: union })
        }
    }

    /// Compile the given expression such that it may be matched zero or one
    /// times.
    ///
    /// When `greedy` is true, then the preference is for the expression to
    /// match as much as possible. Otherwise, it will match as little as
    /// possible.
c_zero_or_one( &self, expr: &Hir, greedy: bool, ) -> Result<ThompsonRef, BuildError>1309 fn c_zero_or_one( 1310 &self, 1311 expr: &Hir, 1312 greedy: bool, 1313 ) -> Result<ThompsonRef, BuildError> { 1314 let union = 1315 if greedy { self.add_union() } else { self.add_union_reverse() }?; 1316 let compiled = self.c(expr)?; 1317 let empty = self.add_empty()?; 1318 self.patch(union, compiled.start)?; 1319 self.patch(union, empty)?; 1320 self.patch(compiled.end, empty)?; 1321 Ok(ThompsonRef { start: union, end: empty }) 1322 } 1323 1324 /// Compile the given HIR expression exactly `n` times. c_exactly( &self, expr: &Hir, n: u32, ) -> Result<ThompsonRef, BuildError>1325 fn c_exactly( 1326 &self, 1327 expr: &Hir, 1328 n: u32, 1329 ) -> Result<ThompsonRef, BuildError> { 1330 let it = (0..n).map(|_| self.c(expr)); 1331 self.c_concat(it) 1332 } 1333 1334 /// Compile the given byte oriented character class. 1335 /// 1336 /// This uses "sparse" states to represent an alternation between ranges in 1337 /// this character class. We can use "sparse" states instead of stitching 1338 /// together a "union" state because all ranges in a character class have 1339 /// equal priority *and* are non-overlapping (thus, only one can match, so 1340 /// there's never a question of priority in the first place). This saves a 1341 /// fair bit of overhead when traversing an NFA. 1342 /// 1343 /// This routine compiles an empty character class into a "fail" state. c_byte_class( &self, cls: &hir::ClassBytes, ) -> Result<ThompsonRef, BuildError>1344 fn c_byte_class( 1345 &self, 1346 cls: &hir::ClassBytes, 1347 ) -> Result<ThompsonRef, BuildError> { 1348 let end = self.add_empty()?; 1349 let mut trans = Vec::with_capacity(cls.ranges().len()); 1350 for r in cls.iter() { 1351 trans.push(Transition { 1352 start: r.start(), 1353 end: r.end(), 1354 next: end, 1355 }); 1356 } 1357 Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) 1358 } 1359 1360 /// Compile the given Unicode character class. 
    ///
    /// This routine specifically tries to use various types of compression,
    /// since UTF-8 automata of large classes can get quite large. The specific
    /// type of compression used depends on forward vs reverse compilation, and
    /// whether NFA shrinking is enabled or not.
    ///
    /// Aside from repetitions causing lots of repeat groups, this is like the
    /// single most expensive part of regex compilation. Therefore, a large
    /// part of the expense of compilation may be reduced by disabling Unicode
    /// in the pattern.
    ///
    /// This routine compiles an empty character class into a "fail" state.
    fn c_unicode_class(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // If all we have are ASCII ranges wrapped in a Unicode package, then
        // there is zero reason to bring out the big guns. We can fit all ASCII
        // ranges within a single sparse state.
        if cls.is_ascii() {
            let end = self.add_empty()?;
            let mut trans = Vec::with_capacity(cls.ranges().len());
            for r in cls.iter() {
                // The unwraps below are OK because we've verified that this
                // class only contains ASCII codepoints.
                trans.push(Transition {
                    // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
                    start: u8::try_from(u32::from(r.start())).unwrap(),
                    end: u8::try_from(u32::from(r.end())).unwrap(),
                    next: end,
                });
            }
            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
        } else if self.is_reverse() {
            if !self.config.get_shrink() {
                // When we don't want to spend the extra time shrinking, we
                // compile the UTF-8 automaton in reverse using something like
                // the "naive" approach, but will attempt to re-use common
                // suffixes.
                self.c_unicode_class_reverse_with_suffix(cls)
            } else {
                // When we want to shrink our NFA for reverse UTF-8 automata,
                // we cannot feed UTF-8 sequences directly to the UTF-8
                // compiler, since the UTF-8 compiler requires all sequences
                // to be lexicographically sorted. Instead, we organize our
                // sequences into a range trie, which can then output our
                // sequences in the correct order. Unfortunately, building the
                // range trie is fairly expensive (but not nearly as expensive
                // as building a DFA). Hence the reason why the 'shrink' option
                // exists, so that this path can be toggled off. For example,
                // we might want to turn this off if we know we won't be
                // compiling a DFA.
                let mut trie = self.trie_state.borrow_mut();
                trie.clear();

                for rng in cls.iter() {
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
                        seq.reverse();
                        trie.insert(seq.as_slice());
                    }
                }
                let mut builder = self.builder.borrow_mut();
                let mut utf8_state = self.utf8_state.borrow_mut();
                let mut utf8c =
                    Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
                trie.iter(|seq| {
                    utf8c.add(&seq)?;
                    Ok(())
                })?;
                utf8c.finish()
            }
        } else {
            // In the forward direction, we always shrink our UTF-8 automata
            // because we can stream it right into the UTF-8 compiler. There
            // is almost no downside (in either memory or time) to using this
            // approach.
            let mut builder = self.builder.borrow_mut();
            let mut utf8_state = self.utf8_state.borrow_mut();
            let mut utf8c =
                Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
            for rng in cls.iter() {
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
                    utf8c.add(seq.as_slice())?;
                }
            }
            utf8c.finish()
        }

        // For reference, the code below is the "naive" version of compiling a
        // UTF-8 automaton. It is deliciously simple (and works for both the
        // forward and reverse cases), but will unfortunately produce very
        // large NFAs. When compiling a forward automaton, the size difference
        // can sometimes be an order of magnitude. For example, the '\w' regex
        // will generate about ~3000 NFA states using the naive approach below,
        // but only 283 states when using the approach above. This is because
        // the approach above actually compiles a *minimal* (or near minimal,
        // because of the bounded hashmap for reusing equivalent states) UTF-8
        // automaton.
        //
        // The code below is kept as a reference point in order to make it
        // easier to understand the higher level goal here. Although, it will
        // almost certainly bit-rot, so keep that in mind. Also, if you try to
        // use it, some of the tests in this module will fail because they look
        // for terser byte code produced by the more optimized handling above.
        // But the integration test suite should still pass.
        //
        // One good example of the substantial difference this can make is to
        // compare and contrast performance of the Pike VM when the code below
        // is active vs the code above. Here's an example to try:
        //
        //   regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
        //
        // With Unicode classes generated below, this search takes about 45s on
        // my machine. But with the compressed version above, the search takes
        // only around 1.4s. The NFA is also 20% smaller. This is in part due
        // to the compression, but also because of the utilization of 'sparse'
        // NFA states. They lead to much less state shuffling during the NFA
        // search.
        /*
        let it = cls
            .iter()
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
            .map(|seq| {
                let it = seq
                    .as_slice()
                    .iter()
                    .map(|rng| self.c_range(rng.start, rng.end));
                self.c_concat(it)
            });
        self.c_alt_iter(it)
        */
    }

    /// Compile the given Unicode character class in reverse with suffix
    /// caching.
    ///
    /// This is a "quick" way to compile large Unicode classes into reverse
    /// UTF-8 automata while doing a small amount of compression on that
    /// automata by reusing common suffixes.
    ///
    /// A more comprehensive compression scheme can be accomplished by using
    /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
    /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
    ///
    /// This is the technique used when "NFA shrinking" is disabled.
    ///
    /// (This also tries to use "sparse" states where possible, just like
    /// `c_byte_class` does.)
    fn c_unicode_class_reverse_with_suffix(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // N.B. It would likely be better to cache common *prefixes* in the
        // reverse direction, but it's not quite clear how to do that. The
        // advantage of caching suffixes is that it does give us a win, and
        // has a very small additional overhead.
1517 let mut cache = self.utf8_suffix.borrow_mut(); 1518 cache.clear(); 1519 1520 let union = self.add_union()?; 1521 let alt_end = self.add_empty()?; 1522 for urng in cls.iter() { 1523 for seq in Utf8Sequences::new(urng.start(), urng.end()) { 1524 let mut end = alt_end; 1525 for brng in seq.as_slice() { 1526 let key = Utf8SuffixKey { 1527 from: end, 1528 start: brng.start, 1529 end: brng.end, 1530 }; 1531 let hash = cache.hash(&key); 1532 if let Some(id) = cache.get(&key, hash) { 1533 end = id; 1534 continue; 1535 } 1536 1537 let compiled = self.c_range(brng.start, brng.end)?; 1538 self.patch(compiled.end, end)?; 1539 end = compiled.start; 1540 cache.set(key, hash, end); 1541 } 1542 self.patch(union, end)?; 1543 } 1544 } 1545 Ok(ThompsonRef { start: union, end: alt_end }) 1546 } 1547 1548 /// Compile the given HIR look-around assertion to an NFA look-around 1549 /// assertion. c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError>1550 fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> { 1551 let look = match *anchor { 1552 hir::Look::Start => Look::Start, 1553 hir::Look::End => Look::End, 1554 hir::Look::StartLF => Look::StartLF, 1555 hir::Look::EndLF => Look::EndLF, 1556 hir::Look::StartCRLF => Look::StartCRLF, 1557 hir::Look::EndCRLF => Look::EndCRLF, 1558 hir::Look::WordAscii => Look::WordAscii, 1559 hir::Look::WordAsciiNegate => Look::WordAsciiNegate, 1560 hir::Look::WordUnicode => Look::WordUnicode, 1561 hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, 1562 hir::Look::WordStartAscii => Look::WordStartAscii, 1563 hir::Look::WordEndAscii => Look::WordEndAscii, 1564 hir::Look::WordStartUnicode => Look::WordStartUnicode, 1565 hir::Look::WordEndUnicode => Look::WordEndUnicode, 1566 hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, 1567 hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, 1568 hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, 1569 hir::Look::WordEndHalfUnicode => 
            Look::WordEndHalfUnicode,
        };
        let id = self.add_look(look)?;
        Ok(ThompsonRef { start: id, end: id })
    }

    /// Compile the given byte string to a concatenation of bytes.
    fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
        self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
    }

    /// Compile a "range" state with one transition that may only be followed
    /// if the input byte is in the (inclusive) range given.
    ///
    /// Both the `start` and `end` locations point to the state created.
    /// Callers will likely want to keep the `start`, but patch the `end` to
    /// point to some other state.
    fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
        let id = self.add_range(start, end)?;
        Ok(ThompsonRef { start: id, end: id })
    }

    /// Compile an "empty" state with one unconditional epsilon transition.
    ///
    /// Both the `start` and `end` locations point to the state created.
    /// Callers will likely want to keep the `start`, but patch the `end` to
    /// point to some other state.
    fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
        let id = self.add_empty()?;
        Ok(ThompsonRef { start: id, end: id })
    }

    /// Compile a "fail" state that can never have any outgoing transitions.
    fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
        let id = self.add_fail()?;
        Ok(ThompsonRef { start: id, end: id })
    }

    // The below helpers are meant to be simple wrappers around the
    // corresponding Builder methods. For the most part, they let us write
    // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
    // the latter is a mouthful. Some of the methods do inject a little bit
    // of extra logic. e.g., Flipping look-around operators when compiling in
    // reverse mode.

    /// Set the transition target of the state `from` to point to `to`.
    fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
        self.builder.borrow_mut().patch(from, to)
    }

    /// Tell the underlying builder that compilation of a new pattern is
    /// beginning, and return the ID assigned to that pattern.
    fn start_pattern(&self) -> Result<PatternID, BuildError> {
        self.builder.borrow_mut().start_pattern()
    }

    /// Tell the underlying builder that compilation of the current pattern
    /// is complete, with `start_id` as the pattern's starting state.
    fn finish_pattern(
        &self,
        start_id: StateID,
    ) -> Result<PatternID, BuildError> {
        self.builder.borrow_mut().finish_pattern(start_id)
    }

    /// Add an "empty" state (a single unconditional epsilon transition).
    fn add_empty(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_empty()
    }

    /// Add a state with a single byte-range transition. The transition's
    /// `next` is set to `StateID::ZERO` as a placeholder; callers are
    /// expected to `patch` it to the real target afterwards.
    fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_range(Transition {
            start,
            end,
            next: StateID::ZERO,
        })
    }

    /// Add a "sparse" state with the given set of byte-range transitions.
    fn add_sparse(
        &self,
        ranges: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_sparse(ranges)
    }

    /// Add a look-around assertion state. When compiling a reverse NFA, the
    /// assertion is flipped (e.g., `^` becomes `$`) before being added.
    fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
        if self.is_reverse() {
            look = look.reversed();
        }
        self.builder.borrow_mut().add_look(StateID::ZERO, look)
    }

    /// Add an empty "union" state. Alternates are pushed in later.
    fn add_union(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_union(vec![])
    }

    /// Add an empty "reverse union" state (alternates are tried in reverse
    /// of the order in which they are added).
    fn add_union_reverse(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_union_reverse(vec![])
    }

    /// Add a state recording the start of capture group `capture_index`,
    /// optionally with the group's name.
    fn add_capture_start(
        &self,
        capture_index: u32,
        name: Option<&str>,
    ) -> Result<StateID, BuildError> {
        let name = name.map(|n| Arc::from(n));
        self.builder.borrow_mut().add_capture_start(
            StateID::ZERO,
            capture_index,
            name,
        )
    }

    /// Add a state recording the end of capture group `capture_index`.
    fn add_capture_end(
        &self,
        capture_index: u32,
    ) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
    }

    /// Add a "fail" state, which has no outgoing transitions.
    fn add_fail(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_fail()
    }

    /// Add a "match" state for the pattern currently being compiled.
    fn add_match(&self) -> Result<StateID, BuildError> {
        self.builder.borrow_mut().add_match()
    }

    /// Returns true when this compiler is configured to build a reverse NFA.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
}

/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    pub(crate) start: StateID,
    pub(crate) end: StateID,
}

/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
/// from a lexicographically sorted sequence of strings in linear time.
///
/// The trick here is that any Unicode codepoint range can be converted to
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
/// together via an alternation is trivial, and indeed, it works. However,
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
/// minimal because we use a bounded cache of previously built DFA states.)
///
/// The drawback is that this sadly doesn't work for reverse automata, since
/// the ranges are no longer in lexicographic order. For that, we invented the
/// range trie (which gets its own module). Once a range trie is built, we then
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
///
/// The high level idea is described here:
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
///
/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Scratch state (cache of compiled states plus the stack of uncompiled
    /// nodes) that is cleared and reused across compilations.
    state: &'a mut Utf8State,
    /// The single final state that every compiled UTF-8 sequence flows into.
    target: StateID,
}

/// Reusable scratch space for `Utf8Compiler`.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache mapping previously compiled sparse states to their
    /// NFA state IDs, so identical suffix states are shared.
    compiled: Utf8BoundedMap,
    /// A stack of not-yet-compiled nodes, one per byte position of the
    /// UTF-8 sequence currently being added.
    uncompiled: Vec<Utf8Node>,
}

/// A single uncompiled node: its finished transitions plus the transition
/// currently being built (whose target is not yet known).
#[derive(Clone, Debug)]
struct Utf8Node {
    trans: Vec<Transition>,
    last: Option<Utf8LastTransition>,
}

/// A pending transition: the byte range is known, but the `next` state ID
/// is filled in later by `set_last_transition`.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    start: u8,
    end: u8,
}

impl Utf8State {
    fn new() -> Utf8State {
        Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
    }

    fn clear(&mut self) {
        self.compiled.clear();
        self.uncompiled.clear();
    }
}

impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler. This allocates the shared final state
    /// and resets the scratch state, pushing an empty root node.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Flush any remaining uncompiled nodes and return the sub-graph for
    /// everything added so far. `end` is always the shared `target` state.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add one UTF-8 byte-range sequence. Sequences must be added in
    /// lexicographic order: the longest prefix shared with the previous
    /// sequence is kept, everything past it is compiled, and the new
    /// suffix is pushed as uncompiled nodes.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                // A node only extends the shared prefix when its pending
                // transition matches this sequence's range exactly.
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        // A sequence identical to the previous one is never added, so the
        // shared prefix must be strictly shorter than the whole sequence.
        assert!(prefix_len < ranges.len());
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile all uncompiled nodes deeper than `from`, bottom-up. Each
    /// popped node is frozen with its successor's ID, and the node left at
    /// depth `from` gets its pending transition pointed at the result.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Turn a finished node into a sparse NFA state, reusing a previously
    /// compiled identical state from the bounded cache when possible.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the (non-empty) suffix of a new sequence: the first range
    /// becomes the pending transition of the current top node, and each
    /// remaining range becomes a fresh uncompiled node.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        // The top node's pending transition must have been flushed by
        // `compile_from` before a new suffix is attached.
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions (the root node).
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top node, resolving its pending transition to `next`, and
    /// return its finished transition list.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node. Only valid once everything else has been
    /// compiled and the root has no pending transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Resolve the pending transition of the current top node to `next`
    /// without popping it.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}

impl Utf8Node {
    /// Move this node's pending transition (if any) into its finished
    /// transition list with `next` as its target.
    fn set_last_transition(&mut self, next: StateID) {
        if let Some(last) = self.last.take() {
            self.trans.push(Transition {
                start: last.start,
                end: last.end,
                next,
            });
        }
    }
}

#[cfg(test)]
mod tests {
    use alloc::vec;

    use crate::{
        nfa::thompson::{SparseTransitions, State},
        util::primitives::SmallIndex,
    };

    use super::*;

    /// Build an NFA with captures and the unanchored prefix disabled, so
    /// tests can assert on the exact compiled states.
    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    // Shorthand constructors for IDs and expected NFA states, used by the
    // state-table assertions below.

    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { transitions })
    }

    fn s_look(look: Look, next: usize) -> State {
        let next = sid(next);
        State::Look { look, next }
    }

    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }

    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }

    fn s_fail() -> State {
        State::Fail
    }

    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }

    // Test that building an
    // unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_no_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
        );
    }

    #[test]
    fn compile_yes_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_look(Look::End, 4),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                // Anchors get flipped in a reverse automaton.
                s_look(Look::End, 4),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // Anchors get flipped in a reverse automaton.
                s_look(Look::Start, 1),
                s_byte(b'a', 2),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }

    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        // A multi-byte codepoint compiles to one state per UTF-8 byte.
        assert_eq!(
            build("☃").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }

    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        assert_eq!(
            build(r"[a-z☃]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        // A lazy repetition tries the "skip" alternate first.
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }

    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }

    #[test]
    fn compile_alternation() {
        // Adjacent single-byte alternates collapse into one range state.
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }

    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }

    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
}