use core::{
    borrow::Borrow,
    panic::{RefUnwindSafe, UnwindSafe},
};

use alloc::{boxed::Box, sync::Arc, vec, vec::Vec};

use regex_syntax::{
    ast,
    hir::{self, Hir},
};

use crate::{
    meta::{
        error::BuildError,
        strategy::{self, Strategy},
        wrappers,
    },
    nfa::thompson::WhichCaptures,
    util::{
        captures::{Captures, GroupInfo},
        iter,
        pool::{Pool, PoolGuard},
        prefilter::Prefilter,
        primitives::{NonMaxUsize, PatternID},
        search::{HalfMatch, Input, Match, MatchKind, PatternSet, Span},
    },
};

/// A type alias for our pool of `meta::Cache` values that fixes the type
/// parameters to what we use for the meta regex below.
type CachePool = Pool<Cache, CachePoolFn>;

/// The guard type handed out by a `CachePool`, with the same type parameters
/// fixed.
type CachePoolGuard<'a> = PoolGuard<'a, Cache, CachePoolFn>;

/// The type of the closure we use to create new caches.
///
/// We need to spell out all of the marker traits or else we risk leaking
/// !MARKER impls. That is, a bare `dyn Fn() -> Cache` is not `Send`, `Sync`,
/// `UnwindSafe` or `RefUnwindSafe`, and neither is anything that contains it.
type CachePoolFn =
    Box<dyn Fn() -> Cache + Send + Sync + UnwindSafe + RefUnwindSafe>;

/// A regex matcher that works by composing several other regex matchers
/// automatically.
///
/// In effect, a meta regex papers over a lot of the quirks or performance
/// problems in each of the regex engines in this crate. Its goal is to provide
/// an infallible and simple API that "just does the right thing" in the common
/// case.
///
/// A meta regex is the implementation of a `Regex` in the `regex` crate.
/// Indeed, the `regex` crate API is essentially just a light wrapper over
/// this type. This includes the `regex` crate's `RegexSet` API!
///
/// # Composition
///
/// This is called a "meta" matcher precisely because it uses other regex
/// matchers to provide a convenient high level regex API.
/// Here are some
/// examples of how other regex matchers are composed:
///
/// * When calling [`Regex::captures`], instead of immediately
/// running a slower but more capable regex engine like the
/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM), the meta regex engine
/// will usually first look for the bounds of a match with a higher throughput
/// regex engine like a [lazy DFA](crate::hybrid). Only when a match is found
/// is a slower engine like `PikeVM` used to find the matching span for each
/// capture group.
/// * While higher throughput engines like the lazy DFA cannot handle
/// Unicode word boundaries in general, they can still be used on pure ASCII
/// haystacks by pretending that Unicode word boundaries are just plain ASCII
/// word boundaries. However, if a haystack is not ASCII, the meta regex engine
/// will automatically switch to a (possibly slower) regex engine that supports
/// Unicode word boundaries in general.
/// * In some cases where a regex pattern is just a simple literal or a small
/// set of literals, an actual regex engine won't be used at all. Instead,
/// substring or multi-substring search algorithms will be employed.
///
/// There are many other forms of composition happening too, but the above
/// should give a general idea. In particular, it may perhaps be surprising
/// that *multiple* regex engines might get executed for a single search. That
/// is, the decision of what regex engine to use is not _just_ based on the
/// pattern, but also based on the dynamic execution of the search itself.
///
/// The primary reason for this composition is performance. The fundamental
/// tension is that the faster engines tend to be less capable, and the more
/// capable engines tend to be slower.
///
/// Note that the forms of composition that are allowed are determined by
/// compile time crate features and configuration.
For example, if the `hybrid` 89 /// feature isn't enabled, or if [`Config::hybrid`] has been disabled, then the 90 /// meta regex engine will never use a lazy DFA. 91 /// 92 /// # Synchronization and cloning 93 /// 94 /// Most of the regex engines in this crate require some kind of mutable 95 /// "scratch" space to read and write from while performing a search. Since 96 /// a meta regex composes these regex engines, a meta regex also requires 97 /// mutable scratch space. This scratch space is called a [`Cache`]. 98 /// 99 /// Most regex engines _also_ usually have a read-only component, typically 100 /// a [Thompson `NFA`](crate::nfa::thompson::NFA). 101 /// 102 /// In order to make the `Regex` API convenient, most of the routines hide 103 /// the fact that a `Cache` is needed at all. To achieve this, a [memory 104 /// pool](crate::util::pool::Pool) is used internally to retrieve `Cache` 105 /// values in a thread safe way that also permits reuse. This in turn implies 106 /// that every such search call requires some form of synchronization. Usually 107 /// this synchronization is fast enough to not notice, but in some cases, it 108 /// can be a bottleneck. This typically occurs when all of the following are 109 /// true: 110 /// 111 /// * The same `Regex` is shared across multiple threads simultaneously, 112 /// usually via a [`util::lazy::Lazy`](crate::util::lazy::Lazy) or something 113 /// similar from the `once_cell` or `lazy_static` crates. 114 /// * The primary unit of work in each thread is a regex search. 115 /// * Searches are run on very short haystacks. 116 /// 117 /// This particular case can lead to high contention on the pool used by a 118 /// `Regex` internally, which can in turn increase latency to a noticeable 119 /// effect. This cost can be mitigated in one of the following ways: 120 /// 121 /// * Use a distinct copy of a `Regex` in each thread, usually by cloning it. 
122 /// Cloning a `Regex` _does not_ do a deep copy of its read-only component. 123 /// But it does lead to each `Regex` having its own memory pool, which in 124 /// turn eliminates the problem of contention. In general, this technique should 125 /// not result in any additional memory usage when compared to sharing the same 126 /// `Regex` across multiple threads simultaneously. 127 /// * Use lower level APIs, like [`Regex::search_with`], which permit passing 128 /// a `Cache` explicitly. In this case, it is up to you to determine how best 129 /// to provide a `Cache`. For example, you might put a `Cache` in thread-local 130 /// storage if your use case allows for it. 131 /// 132 /// Overall, this is an issue that happens rarely in practice, but it can 133 /// happen. 134 /// 135 /// # Warning: spin-locks may be used in alloc-only mode 136 /// 137 /// When this crate is built without the `std` feature and the high level APIs 138 /// on a `Regex` are used, then a spin-lock will be used to synchronize access 139 /// to an internal pool of `Cache` values. This may be undesirable because 140 /// a spin-lock is [effectively impossible to implement correctly in user 141 /// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could 142 /// result in a deadlock. 143 /// 144 /// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html 145 /// 146 /// If one wants to avoid the use of spin-locks when the `std` feature is 147 /// disabled, then you must use APIs that accept a `Cache` value explicitly. 148 /// For example, [`Regex::search_with`]. 
149 /// 150 /// # Example 151 /// 152 /// ``` 153 /// use regex_automata::meta::Regex; 154 /// 155 /// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?; 156 /// assert!(re.is_match("2010-03-14")); 157 /// 158 /// # Ok::<(), Box<dyn std::error::Error>>(()) 159 /// ``` 160 /// 161 /// # Example: anchored search 162 /// 163 /// This example shows how to use [`Input::anchored`] to run an anchored 164 /// search, even when the regex pattern itself isn't anchored. An anchored 165 /// search guarantees that if a match is found, then the start offset of the 166 /// match corresponds to the offset at which the search was started. 167 /// 168 /// ``` 169 /// use regex_automata::{meta::Regex, Anchored, Input, Match}; 170 /// 171 /// let re = Regex::new(r"\bfoo\b")?; 172 /// let input = Input::new("xx foo xx").range(3..).anchored(Anchored::Yes); 173 /// // The offsets are in terms of the original haystack. 174 /// assert_eq!(Some(Match::must(0, 3..6)), re.find(input)); 175 /// 176 /// // Notice that no match occurs here, because \b still takes the 177 /// // surrounding context into account, even if it means looking back 178 /// // before the start of your search. 179 /// let hay = "xxfoo xx"; 180 /// let input = Input::new(hay).range(2..).anchored(Anchored::Yes); 181 /// assert_eq!(None, re.find(input)); 182 /// // Indeed, you cannot achieve the above by simply slicing the 183 /// // haystack itself, since the regex engine can't see the 184 /// // surrounding context. This is why 'Input' permits setting 185 /// // the bounds of a search! 186 /// let input = Input::new(&hay[2..]).anchored(Anchored::Yes); 187 /// // WRONG! 188 /// assert_eq!(Some(Match::must(0, 0..3)), re.find(input)); 189 /// 190 /// # Ok::<(), Box<dyn std::error::Error>>(()) 191 /// ``` 192 /// 193 /// # Example: earliest search 194 /// 195 /// This example shows how to use [`Input::earliest`] to run a search that 196 /// might stop before finding the typical leftmost match. 
///
/// ```
/// use regex_automata::{meta::Regex, Input, Match};
///
/// let re = Regex::new(r"[a-z]{3}|b")?;
/// let input = Input::new("abc").earliest(true);
/// assert_eq!(Some(Match::must(0, 1..2)), re.find(input));
///
/// // Note that "earliest" isn't really a match semantic unto itself.
/// // Instead, it is merely an instruction to whatever regex engine
/// // gets used internally to quit as soon as it can. For example,
/// // this regex uses a different search technique, and winds up
/// // producing a different (but valid) match!
/// let re = Regex::new(r"abc|b")?;
/// let input = Input::new("abc").earliest(true);
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(input));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: change the line terminator
///
/// This example shows how to enable multi-line mode by default and change
/// the line terminator to the NUL byte:
///
/// ```
/// use regex_automata::{meta::Regex, util::syntax, Match};
///
/// let re = Regex::builder()
///     .syntax(syntax::Config::new().multi_line(true))
///     .configure(Regex::config().line_terminator(b'\x00'))
///     .build(r"^foo$")?;
/// let hay = "\x00foo\x00";
/// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug)]
pub struct Regex {
    /// The actual regex implementation.
    ///
    /// It is kept behind an `Arc` (see `RegexI`) so that it can be shared
    /// instead of duplicated.
    imp: Arc<RegexI>,
    /// A thread safe pool of caches.
    ///
    /// For the higher level search APIs, a `Cache` is automatically plucked
    /// from this pool before running a search. The lower level `with` methods
    /// permit the caller to provide their own cache, thereby bypassing
    /// accesses to this pool.
    ///
    /// Note that we put this outside the `Arc` so that cloning a `Regex`
    /// results in creating a fresh `CachePool`. This in turn permits callers
    /// to clone regexes into separate threads where each such regex gets
    /// the pool's "thread owner" optimization. Otherwise, if one shares the
    /// `Regex` directly, then the pool will go through a slower mutex path for
    /// all threads except for the "owner."
    pool: CachePool,
}

/// The internal implementation of `Regex`, split out so that it can be wrapped
/// in an `Arc`.
#[derive(Debug)]
struct RegexI {
    /// The core matching engine.
    ///
    /// Why is this reference counted when RegexI is already wrapped in an Arc?
    /// Well, we need to capture this in a closure to our `Pool` below in order
    /// to create new `Cache` values when needed. So since it needs to be in
    /// two places, we make it reference counted.
    ///
    /// We make `RegexI` itself reference counted too so that `Regex` itself
    /// stays extremely small and very cheap to clone.
    strat: Arc<dyn Strategy>,
    /// Metadata about the regexes driving the strategy. The metadata is also
    /// usually stored inside the strategy too, but we put it here as well
    /// so that we can get quick access to it (without virtual calls) before
    /// executing the regex engine. For example, we use this metadata to
    /// detect a subset of cases where we know a match is impossible, and can
    /// thus avoid calling into the strategy at all.
    ///
    /// Since `RegexInfo` is stored in multiple places, it is also reference
    /// counted. (No `Arc` appears in this field's type, so the counting is
    /// presumably internal to `RegexInfo` -- confirm against its definition.)
    info: RegexInfo,
}

/// Convenience constructors for a `Regex` using the default configuration.
impl Regex {
    /// Builds a `Regex` from a single pattern string using the default
    /// configuration.
284 /// 285 /// If there was a problem parsing the pattern or a problem turning it into 286 /// a regex matcher, then an error is returned. 287 /// 288 /// If you want to change the configuration of a `Regex`, use a [`Builder`] 289 /// with a [`Config`]. 290 /// 291 /// # Example 292 /// 293 /// ``` 294 /// use regex_automata::{meta::Regex, Match}; 295 /// 296 /// let re = Regex::new(r"(?Rm)^foo$")?; 297 /// let hay = "\r\nfoo\r\n"; 298 /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); 299 /// 300 /// # Ok::<(), Box<dyn std::error::Error>>(()) 301 /// ``` new(pattern: &str) -> Result<Regex, BuildError>302 pub fn new(pattern: &str) -> Result<Regex, BuildError> { 303 Regex::builder().build(pattern) 304 } 305 306 /// Builds a `Regex` from many pattern strings using the default 307 /// configuration. 308 /// 309 /// If there was a problem parsing any of the patterns or a problem turning 310 /// them into a regex matcher, then an error is returned. 311 /// 312 /// If you want to change the configuration of a `Regex`, use a [`Builder`] 313 /// with a [`Config`]. 314 /// 315 /// # Example: simple lexer 316 /// 317 /// This simplistic example leverages the multi-pattern support to build a 318 /// simple little lexer. The pattern ID in the match tells you which regex 319 /// matched, which in turn might be used to map back to the "type" of the 320 /// token returned by the lexer. 
///
    /// ```
    /// use regex_automata::{meta::Regex, Match};
    ///
    /// let re = Regex::new_many(&[
    ///     r"[[:space:]]",
    ///     r"[A-Za-z0-9][A-Za-z0-9_]+",
    ///     r"->",
    ///     r".",
    /// ])?;
    /// let haystack = "fn is_boss(bruce: i32, springsteen: String) -> bool;";
    /// let matches: Vec<Match> = re.find_iter(haystack).collect();
    /// assert_eq!(matches, vec![
    ///     Match::must(1, 0..2),   // 'fn'
    ///     Match::must(0, 2..3),   // ' '
    ///     Match::must(1, 3..10),  // 'is_boss'
    ///     Match::must(3, 10..11), // '('
    ///     Match::must(1, 11..16), // 'bruce'
    ///     Match::must(3, 16..17), // ':'
    ///     Match::must(0, 17..18), // ' '
    ///     Match::must(1, 18..21), // 'i32'
    ///     Match::must(3, 21..22), // ','
    ///     Match::must(0, 22..23), // ' '
    ///     Match::must(1, 23..34), // 'springsteen'
    ///     Match::must(3, 34..35), // ':'
    ///     Match::must(0, 35..36), // ' '
    ///     Match::must(1, 36..42), // 'String'
    ///     Match::must(3, 42..43), // ')'
    ///     Match::must(0, 43..44), // ' '
    ///     Match::must(2, 44..46), // '->'
    ///     Match::must(0, 46..47), // ' '
    ///     Match::must(1, 47..51), // 'bool'
    ///     Match::must(3, 51..52), // ';'
    /// ]);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// One can write a lexer like the above using a regex like
    /// `(?P<space>[[:space:]])|(?P<ident>[A-Za-z0-9][A-Za-z0-9_]+)|...`,
    /// but then you need to ask which capture group matched to determine
    /// which branch in the regex matched, and thus, which token the match
    /// corresponds to. In contrast, the above example includes the pattern ID
    /// in the match. There's no need to use capture groups at all.
    ///
    /// # Example: finding the pattern that caused an error
    ///
    /// When a syntax error occurs, it is possible to ask which pattern
    /// caused the syntax error.
370 /// 371 /// ``` 372 /// use regex_automata::{meta::Regex, PatternID}; 373 /// 374 /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err(); 375 /// assert_eq!(Some(PatternID::must(2)), err.pattern()); 376 /// ``` 377 /// 378 /// # Example: zero patterns is valid 379 /// 380 /// Building a regex with zero patterns results in a regex that never 381 /// matches anything. Because this routine is generic, passing an empty 382 /// slice usually requires a turbo-fish (or something else to help type 383 /// inference). 384 /// 385 /// ``` 386 /// use regex_automata::{meta::Regex, util::syntax, Match}; 387 /// 388 /// let re = Regex::new_many::<&str>(&[])?; 389 /// assert_eq!(None, re.find("")); 390 /// 391 /// # Ok::<(), Box<dyn std::error::Error>>(()) 392 /// ``` new_many<P: AsRef<str>>( patterns: &[P], ) -> Result<Regex, BuildError>393 pub fn new_many<P: AsRef<str>>( 394 patterns: &[P], 395 ) -> Result<Regex, BuildError> { 396 Regex::builder().build_many(patterns) 397 } 398 399 /// Return a default configuration for a `Regex`. 400 /// 401 /// This is a convenience routine to avoid needing to import the [`Config`] 402 /// type when customizing the construction of a `Regex`. 403 /// 404 /// # Example: lower the NFA size limit 405 /// 406 /// In some cases, the default size limit might be too big. The size limit 407 /// can be lowered, which will prevent large regex patterns from compiling. 408 /// 409 /// ``` 410 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 411 /// use regex_automata::meta::Regex; 412 /// 413 /// let result = Regex::builder() 414 /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) 415 /// // Not even 20KB is enough to build a single large Unicode class! 
///     .build(r"\pL");
    /// assert!(result.is_err());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn config() -> Config {
        // Nothing more than `Config::new()`; it exists so that callers can
        // write `Regex::config()` without importing `Config`.
        Config::new()
    }

    /// Return a builder for configuring the construction of a `Regex`.
    ///
    /// This is a convenience routine to avoid needing to import the
    /// [`Builder`] type in common cases. It simply returns `Builder::new()`.
    ///
    /// # Example: change the line terminator
    ///
    /// This example shows how to enable multi-line mode by default and change
    /// the line terminator to the NUL byte:
    ///
    /// ```
    /// use regex_automata::{meta::Regex, util::syntax, Match};
    ///
    /// let re = Regex::builder()
    ///     .syntax(syntax::Config::new().multi_line(true))
    ///     .configure(Regex::config().line_terminator(b'\x00'))
    ///     .build(r"^foo$")?;
    /// let hay = "\x00foo\x00";
    /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn builder() -> Builder {
        Builder::new()
    }
}

/// High level convenience routines for using a regex to search a haystack.
impl Regex {
    /// Returns true if and only if this regex matches the given haystack.
    ///
    /// This routine may short circuit if it knows that scanning future input
    /// will never lead to a different result. (Consider how this might make
    /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`.
    /// This routine _may_ stop after it sees the first `a`, but routines like
    /// `find` need to continue searching because `+` is greedy by default.)
461 /// 462 /// # Example 463 /// 464 /// ``` 465 /// use regex_automata::meta::Regex; 466 /// 467 /// let re = Regex::new("foo[0-9]+bar")?; 468 /// 469 /// assert!(re.is_match("foo12345bar")); 470 /// assert!(!re.is_match("foobar")); 471 /// 472 /// # Ok::<(), Box<dyn std::error::Error>>(()) 473 /// ``` 474 /// 475 /// # Example: consistency with search APIs 476 /// 477 /// `is_match` is guaranteed to return `true` whenever `find` returns a 478 /// match. This includes searches that are executed entirely within a 479 /// codepoint: 480 /// 481 /// ``` 482 /// use regex_automata::{meta::Regex, Input}; 483 /// 484 /// let re = Regex::new("a*")?; 485 /// 486 /// // This doesn't match because the default configuration bans empty 487 /// // matches from splitting a codepoint. 488 /// assert!(!re.is_match(Input::new("☃").span(1..2))); 489 /// assert_eq!(None, re.find(Input::new("☃").span(1..2))); 490 /// 491 /// # Ok::<(), Box<dyn std::error::Error>>(()) 492 /// ``` 493 /// 494 /// Notice that when UTF-8 mode is disabled, then the above reports a 495 /// match because the restriction against zero-width matches that split a 496 /// codepoint has been lifted: 497 /// 498 /// ``` 499 /// use regex_automata::{meta::Regex, Input, Match}; 500 /// 501 /// let re = Regex::builder() 502 /// .configure(Regex::config().utf8_empty(false)) 503 /// .build("a*")?; 504 /// 505 /// assert!(re.is_match(Input::new("☃").span(1..2))); 506 /// assert_eq!( 507 /// Some(Match::must(0, 1..1)), 508 /// re.find(Input::new("☃").span(1..2)), 509 /// ); 510 /// 511 /// # Ok::<(), Box<dyn std::error::Error>>(()) 512 /// ``` 513 /// 514 /// A similar idea applies when using line anchors with CRLF mode enabled, 515 /// which prevents them from matching between a `\r` and a `\n`. 
516 /// 517 /// ``` 518 /// use regex_automata::{meta::Regex, Input, Match}; 519 /// 520 /// let re = Regex::new(r"(?Rm:$)")?; 521 /// assert!(!re.is_match(Input::new("\r\n").span(1..1))); 522 /// // A regular line anchor, which only considers \n as a 523 /// // line terminator, will match. 524 /// let re = Regex::new(r"(?m:$)")?; 525 /// assert!(re.is_match(Input::new("\r\n").span(1..1))); 526 /// 527 /// # Ok::<(), Box<dyn std::error::Error>>(()) 528 /// ``` 529 #[inline] is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool530 pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool { 531 let input = input.into().earliest(true); 532 if self.imp.info.is_impossible(&input) { 533 return false; 534 } 535 let mut guard = self.pool.get(); 536 let result = self.imp.strat.is_match(&mut guard, &input); 537 // See 'Regex::search' for why we put the guard back explicitly. 538 PoolGuard::put(guard); 539 result 540 } 541 542 /// Executes a leftmost search and returns the first match that is found, 543 /// if one exists. 544 /// 545 /// # Example 546 /// 547 /// ``` 548 /// use regex_automata::{meta::Regex, Match}; 549 /// 550 /// let re = Regex::new("foo[0-9]+")?; 551 /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345")); 552 /// 553 /// # Ok::<(), Box<dyn std::error::Error>>(()) 554 /// ``` 555 #[inline] find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match>556 pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> { 557 self.search(&input.into()) 558 } 559 560 /// Executes a leftmost forward search and writes the spans of capturing 561 /// groups that participated in a match into the provided [`Captures`] 562 /// value. If no match was found, then [`Captures::is_match`] is guaranteed 563 /// to return `false`. 
564 /// 565 /// # Example 566 /// 567 /// ``` 568 /// use regex_automata::{meta::Regex, Span}; 569 /// 570 /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; 571 /// let mut caps = re.create_captures(); 572 /// 573 /// re.captures("2010-03-14", &mut caps); 574 /// assert!(caps.is_match()); 575 /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); 576 /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); 577 /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); 578 /// 579 /// # Ok::<(), Box<dyn std::error::Error>>(()) 580 /// ``` 581 #[inline] captures<'h, I: Into<Input<'h>>>( &self, input: I, caps: &mut Captures, )582 pub fn captures<'h, I: Into<Input<'h>>>( 583 &self, 584 input: I, 585 caps: &mut Captures, 586 ) { 587 self.search_captures(&input.into(), caps) 588 } 589 590 /// Returns an iterator over all non-overlapping leftmost matches in 591 /// the given haystack. If no match exists, then the iterator yields no 592 /// elements. 593 /// 594 /// # Example 595 /// 596 /// ``` 597 /// use regex_automata::{meta::Regex, Match}; 598 /// 599 /// let re = Regex::new("foo[0-9]+")?; 600 /// let haystack = "foo1 foo12 foo123"; 601 /// let matches: Vec<Match> = re.find_iter(haystack).collect(); 602 /// assert_eq!(matches, vec![ 603 /// Match::must(0, 0..4), 604 /// Match::must(0, 5..10), 605 /// Match::must(0, 11..17), 606 /// ]); 607 /// # Ok::<(), Box<dyn std::error::Error>>(()) 608 /// ``` 609 #[inline] find_iter<'r, 'h, I: Into<Input<'h>>>( &'r self, input: I, ) -> FindMatches<'r, 'h>610 pub fn find_iter<'r, 'h, I: Into<Input<'h>>>( 611 &'r self, 612 input: I, 613 ) -> FindMatches<'r, 'h> { 614 let cache = self.pool.get(); 615 let it = iter::Searcher::new(input.into()); 616 FindMatches { re: self, cache, it } 617 } 618 619 /// Returns an iterator over all non-overlapping `Captures` values. If no 620 /// match exists, then the iterator yields no elements. 
621 /// 622 /// This yields the same matches as [`Regex::find_iter`], but it includes 623 /// the spans of all capturing groups that participate in each match. 624 /// 625 /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for 626 /// how to correctly iterate over all matches in a haystack while avoiding 627 /// the creation of a new `Captures` value for every match. (Which you are 628 /// forced to do with an `Iterator`.) 629 /// 630 /// # Example 631 /// 632 /// ``` 633 /// use regex_automata::{meta::Regex, Span}; 634 /// 635 /// let re = Regex::new("foo(?P<numbers>[0-9]+)")?; 636 /// 637 /// let haystack = "foo1 foo12 foo123"; 638 /// let matches: Vec<Span> = re 639 /// .captures_iter(haystack) 640 /// // The unwrap is OK since 'numbers' matches if the pattern matches. 641 /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) 642 /// .collect(); 643 /// assert_eq!(matches, vec![ 644 /// Span::from(3..4), 645 /// Span::from(8..10), 646 /// Span::from(14..17), 647 /// ]); 648 /// # Ok::<(), Box<dyn std::error::Error>>(()) 649 /// ``` 650 #[inline] captures_iter<'r, 'h, I: Into<Input<'h>>>( &'r self, input: I, ) -> CapturesMatches<'r, 'h>651 pub fn captures_iter<'r, 'h, I: Into<Input<'h>>>( 652 &'r self, 653 input: I, 654 ) -> CapturesMatches<'r, 'h> { 655 let cache = self.pool.get(); 656 let caps = self.create_captures(); 657 let it = iter::Searcher::new(input.into()); 658 CapturesMatches { re: self, cache, caps, it } 659 } 660 661 /// Returns an iterator of spans of the haystack given, delimited by a 662 /// match of the regex. Namely, each element of the iterator corresponds to 663 /// a part of the haystack that *isn't* matched by the regular expression. 
664 /// 665 /// # Example 666 /// 667 /// To split a string delimited by arbitrary amounts of spaces or tabs: 668 /// 669 /// ``` 670 /// use regex_automata::meta::Regex; 671 /// 672 /// let re = Regex::new(r"[ \t]+")?; 673 /// let hay = "a b \t c\td e"; 674 /// let fields: Vec<&str> = re.split(hay).map(|span| &hay[span]).collect(); 675 /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); 676 /// 677 /// # Ok::<(), Box<dyn std::error::Error>>(()) 678 /// ``` 679 /// 680 /// # Example: more cases 681 /// 682 /// Basic usage: 683 /// 684 /// ``` 685 /// use regex_automata::meta::Regex; 686 /// 687 /// let re = Regex::new(r" ")?; 688 /// let hay = "Mary had a little lamb"; 689 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 690 /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]); 691 /// 692 /// let re = Regex::new(r"X")?; 693 /// let hay = ""; 694 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 695 /// assert_eq!(got, vec![""]); 696 /// 697 /// let re = Regex::new(r"X")?; 698 /// let hay = "lionXXtigerXleopard"; 699 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 700 /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]); 701 /// 702 /// let re = Regex::new(r"::")?; 703 /// let hay = "lion::tiger::leopard"; 704 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 705 /// assert_eq!(got, vec!["lion", "tiger", "leopard"]); 706 /// 707 /// # Ok::<(), Box<dyn std::error::Error>>(()) 708 /// ``` 709 /// 710 /// If a haystack contains multiple contiguous matches, you will end up 711 /// with empty spans yielded by the iterator: 712 /// 713 /// ``` 714 /// use regex_automata::meta::Regex; 715 /// 716 /// let re = Regex::new(r"X")?; 717 /// let hay = "XXXXaXXbXc"; 718 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 719 /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); 720 /// 721 /// let re = Regex::new(r"/")?; 722 /// let hay = "(///)"; 723 /// 
let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 724 /// assert_eq!(got, vec!["(", "", "", ")"]); 725 /// 726 /// # Ok::<(), Box<dyn std::error::Error>>(()) 727 /// ``` 728 /// 729 /// Separators at the start or end of a haystack are neighbored by empty 730 /// spans. 731 /// 732 /// ``` 733 /// use regex_automata::meta::Regex; 734 /// 735 /// let re = Regex::new(r"0")?; 736 /// let hay = "010"; 737 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 738 /// assert_eq!(got, vec!["", "1", ""]); 739 /// 740 /// # Ok::<(), Box<dyn std::error::Error>>(()) 741 /// ``` 742 /// 743 /// When the empty string is used as a regex, it splits at every valid 744 /// UTF-8 boundary by default (which includes the beginning and end of the 745 /// haystack): 746 /// 747 /// ``` 748 /// use regex_automata::meta::Regex; 749 /// 750 /// let re = Regex::new(r"")?; 751 /// let hay = "rust"; 752 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 753 /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]); 754 /// 755 /// // Splitting by an empty string is UTF-8 aware by default! 756 /// let re = Regex::new(r"")?; 757 /// let hay = "☃"; 758 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 759 /// assert_eq!(got, vec!["", "☃", ""]); 760 /// 761 /// # Ok::<(), Box<dyn std::error::Error>>(()) 762 /// ``` 763 /// 764 /// But note that UTF-8 mode for empty strings can be disabled, which will 765 /// then result in a match at every byte offset in the haystack, 766 /// including between every UTF-8 code unit. 767 /// 768 /// ``` 769 /// use regex_automata::meta::Regex; 770 /// 771 /// let re = Regex::builder() 772 /// .configure(Regex::config().utf8_empty(false)) 773 /// .build(r"")?; 774 /// let hay = "☃".as_bytes(); 775 /// let got: Vec<&[u8]> = re.split(hay).map(|sp| &hay[sp]).collect(); 776 /// assert_eq!(got, vec![ 777 /// // Writing byte string slices is just brutal. 
The problem is that 778 /// // b"foo" has type &[u8; 3] instead of &[u8]. 779 /// &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..], 780 /// ]); 781 /// 782 /// # Ok::<(), Box<dyn std::error::Error>>(()) 783 /// ``` 784 /// 785 /// Contiguous separators (commonly shows up with whitespace), can lead to 786 /// possibly surprising behavior. For example, this code is correct: 787 /// 788 /// ``` 789 /// use regex_automata::meta::Regex; 790 /// 791 /// let re = Regex::new(r" ")?; 792 /// let hay = " a b c"; 793 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 794 /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); 795 /// 796 /// # Ok::<(), Box<dyn std::error::Error>>(()) 797 /// ``` 798 /// 799 /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want 800 /// to match contiguous space characters: 801 /// 802 /// ``` 803 /// use regex_automata::meta::Regex; 804 /// 805 /// let re = Regex::new(r" +")?; 806 /// let hay = " a b c"; 807 /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); 808 /// // N.B. This does still include a leading empty span because ' +' 809 /// // matches at the beginning of the haystack. 810 /// assert_eq!(got, vec!["", "a", "b", "c"]); 811 /// 812 /// # Ok::<(), Box<dyn std::error::Error>>(()) 813 /// ``` 814 #[inline] split<'r, 'h, I: Into<Input<'h>>>( &'r self, input: I, ) -> Split<'r, 'h>815 pub fn split<'r, 'h, I: Into<Input<'h>>>( 816 &'r self, 817 input: I, 818 ) -> Split<'r, 'h> { 819 Split { finder: self.find_iter(input), last: 0 } 820 } 821 822 /// Returns an iterator of at most `limit` spans of the haystack given, 823 /// delimited by a match of the regex. (A `limit` of `0` will return no 824 /// spans.) Namely, each element of the iterator corresponds to a part 825 /// of the haystack that *isn't* matched by the regular expression. The 826 /// remainder of the haystack that is not split will be the last element in 827 /// the iterator. 
828 /// 829 /// # Example 830 /// 831 /// Get the first two words in some haystack: 832 /// 833 /// ``` 834 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 835 /// use regex_automata::meta::Regex; 836 /// 837 /// let re = Regex::new(r"\W+").unwrap(); 838 /// let hay = "Hey! How are you?"; 839 /// let fields: Vec<&str> = 840 /// re.splitn(hay, 3).map(|span| &hay[span]).collect(); 841 /// assert_eq!(fields, vec!["Hey", "How", "are you?"]); 842 /// 843 /// # Ok::<(), Box<dyn std::error::Error>>(()) 844 /// ``` 845 /// 846 /// # Examples: more cases 847 /// 848 /// ``` 849 /// use regex_automata::meta::Regex; 850 /// 851 /// let re = Regex::new(r" ")?; 852 /// let hay = "Mary had a little lamb"; 853 /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); 854 /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]); 855 /// 856 /// let re = Regex::new(r"X")?; 857 /// let hay = ""; 858 /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); 859 /// assert_eq!(got, vec![""]); 860 /// 861 /// let re = Regex::new(r"X")?; 862 /// let hay = "lionXXtigerXleopard"; 863 /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); 864 /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]); 865 /// 866 /// let re = Regex::new(r"::")?; 867 /// let hay = "lion::tiger::leopard"; 868 /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect(); 869 /// assert_eq!(got, vec!["lion", "tiger::leopard"]); 870 /// 871 /// let re = Regex::new(r"X")?; 872 /// let hay = "abcXdef"; 873 /// let got: Vec<&str> = re.splitn(hay, 1).map(|sp| &hay[sp]).collect(); 874 /// assert_eq!(got, vec!["abcXdef"]); 875 /// 876 /// let re = Regex::new(r"X")?; 877 /// let hay = "abcdef"; 878 /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect(); 879 /// assert_eq!(got, vec!["abcdef"]); 880 /// 881 /// let re = Regex::new(r"X")?; 882 /// let hay = "abcXdef"; 883 /// let got: Vec<&str> = re.splitn(hay, 0).map(|sp| 
&hay[sp]).collect(); 884 /// assert!(got.is_empty()); 885 /// 886 /// # Ok::<(), Box<dyn std::error::Error>>(()) 887 /// ``` splitn<'r, 'h, I: Into<Input<'h>>>( &'r self, input: I, limit: usize, ) -> SplitN<'r, 'h>888 pub fn splitn<'r, 'h, I: Into<Input<'h>>>( 889 &'r self, 890 input: I, 891 limit: usize, 892 ) -> SplitN<'r, 'h> { 893 SplitN { splits: self.split(input), limit } 894 } 895 } 896 897 /// Lower level search routines that give more control. 898 impl Regex { 899 /// Returns the start and end offset of the leftmost match. If no match 900 /// exists, then `None` is returned. 901 /// 902 /// This is like [`Regex::find`] but, but it accepts a concrete `&Input` 903 /// instead of an `Into<Input>`. 904 /// 905 /// # Example 906 /// 907 /// ``` 908 /// use regex_automata::{meta::Regex, Input, Match}; 909 /// 910 /// let re = Regex::new(r"Samwise|Sam")?; 911 /// let input = Input::new( 912 /// "one of the chief characters, Samwise the Brave", 913 /// ); 914 /// assert_eq!(Some(Match::must(0, 29..36)), re.search(&input)); 915 /// 916 /// # Ok::<(), Box<dyn std::error::Error>>(()) 917 /// ``` 918 #[inline] search(&self, input: &Input<'_>) -> Option<Match>919 pub fn search(&self, input: &Input<'_>) -> Option<Match> { 920 if self.imp.info.is_impossible(input) { 921 return None; 922 } 923 let mut guard = self.pool.get(); 924 let result = self.imp.strat.search(&mut guard, input); 925 // We do this dance with the guard and explicitly put it back in the 926 // pool because it seems to result in better codegen. If we let the 927 // guard's Drop impl put it back in the pool, then functions like 928 // ptr::drop_in_place get called and they *don't* get inlined. This 929 // isn't usually a big deal, but in latency sensitive benchmarks the 930 // extra function call can matter. 931 // 932 // I used `rebar measure -f '^grep/every-line$' -e meta` to measure 933 // the effects here. 934 // 935 // Note that this doesn't eliminate the latency effects of using the 936 // pool. 
There is still some (minor) cost for the "thread owner" of the 937 // pool. (i.e., The thread that first calls a regex search routine.) 938 // However, for other threads using the regex, the pool access can be 939 // quite expensive as it goes through a mutex. Callers can avoid this 940 // by either cloning the Regex (which creates a distinct copy of the 941 // pool), or callers can use the lower level APIs that accept a 'Cache' 942 // directly and do their own handling. 943 PoolGuard::put(guard); 944 result 945 } 946 947 /// Returns the end offset of the leftmost match. If no match exists, then 948 /// `None` is returned. 949 /// 950 /// This is distinct from [`Regex::search`] in that it only returns the end 951 /// of a match and not the start of the match. Depending on a variety of 952 /// implementation details, this _may_ permit the regex engine to do less 953 /// overall work. For example, if a DFA is being used to execute a search, 954 /// then the start of a match usually requires running a separate DFA in 955 /// reverse to the find the start of a match. If one only needs the end of 956 /// a match, then the separate reverse scan to find the start of a match 957 /// can be skipped. (Note that the reverse scan is avoided even when using 958 /// `Regex::search` when possible, for example, in the case of an anchored 959 /// search.) 
960 /// 961 /// # Example 962 /// 963 /// ``` 964 /// use regex_automata::{meta::Regex, Input, HalfMatch}; 965 /// 966 /// let re = Regex::new(r"Samwise|Sam")?; 967 /// let input = Input::new( 968 /// "one of the chief characters, Samwise the Brave", 969 /// ); 970 /// assert_eq!(Some(HalfMatch::must(0, 36)), re.search_half(&input)); 971 /// 972 /// # Ok::<(), Box<dyn std::error::Error>>(()) 973 /// ``` 974 #[inline] search_half(&self, input: &Input<'_>) -> Option<HalfMatch>975 pub fn search_half(&self, input: &Input<'_>) -> Option<HalfMatch> { 976 if self.imp.info.is_impossible(input) { 977 return None; 978 } 979 let mut guard = self.pool.get(); 980 let result = self.imp.strat.search_half(&mut guard, input); 981 // See 'Regex::search' for why we put the guard back explicitly. 982 PoolGuard::put(guard); 983 result 984 } 985 986 /// Executes a leftmost forward search and writes the spans of capturing 987 /// groups that participated in a match into the provided [`Captures`] 988 /// value. If no match was found, then [`Captures::is_match`] is guaranteed 989 /// to return `false`. 990 /// 991 /// This is like [`Regex::captures`], but it accepts a concrete `&Input` 992 /// instead of an `Into<Input>`. 993 /// 994 /// # Example: specific pattern search 995 /// 996 /// This example shows how to build a multi-pattern `Regex` that permits 997 /// searching for specific patterns. 998 /// 999 /// ``` 1000 /// use regex_automata::{ 1001 /// meta::Regex, 1002 /// Anchored, Match, PatternID, Input, 1003 /// }; 1004 /// 1005 /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; 1006 /// let mut caps = re.create_captures(); 1007 /// let haystack = "foo123"; 1008 /// 1009 /// // Since we are using the default leftmost-first match and both 1010 /// // patterns match at the same starting position, only the first pattern 1011 /// // will be returned in this case when doing a search for any of the 1012 /// // patterns. 
1013 /// let expected = Some(Match::must(0, 0..6)); 1014 /// re.search_captures(&Input::new(haystack), &mut caps); 1015 /// assert_eq!(expected, caps.get_match()); 1016 /// 1017 /// // But if we want to check whether some other pattern matches, then we 1018 /// // can provide its pattern ID. 1019 /// let expected = Some(Match::must(1, 0..6)); 1020 /// let input = Input::new(haystack) 1021 /// .anchored(Anchored::Pattern(PatternID::must(1))); 1022 /// re.search_captures(&input, &mut caps); 1023 /// assert_eq!(expected, caps.get_match()); 1024 /// 1025 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1026 /// ``` 1027 /// 1028 /// # Example: specifying the bounds of a search 1029 /// 1030 /// This example shows how providing the bounds of a search can produce 1031 /// different results than simply sub-slicing the haystack. 1032 /// 1033 /// ``` 1034 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1035 /// use regex_automata::{meta::Regex, Match, Input}; 1036 /// 1037 /// let re = Regex::new(r"\b[0-9]{3}\b")?; 1038 /// let mut caps = re.create_captures(); 1039 /// let haystack = "foo123bar"; 1040 /// 1041 /// // Since we sub-slice the haystack, the search doesn't know about 1042 /// // the larger context and assumes that `123` is surrounded by word 1043 /// // boundaries. And of course, the match position is reported relative 1044 /// // to the sub-slice as well, which means we get `0..3` instead of 1045 /// // `3..6`. 1046 /// let expected = Some(Match::must(0, 0..3)); 1047 /// let input = Input::new(&haystack[3..6]); 1048 /// re.search_captures(&input, &mut caps); 1049 /// assert_eq!(expected, caps.get_match()); 1050 /// 1051 /// // But if we provide the bounds of the search within the context of the 1052 /// // entire haystack, then the search can take the surrounding context 1053 /// // into account. (And if we did find a match, it would be reported 1054 /// // as a valid offset into `haystack` instead of its sub-slice.) 
1055 /// let expected = None; 1056 /// let input = Input::new(haystack).range(3..6); 1057 /// re.search_captures(&input, &mut caps); 1058 /// assert_eq!(expected, caps.get_match()); 1059 /// 1060 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1061 /// ``` 1062 #[inline] search_captures(&self, input: &Input<'_>, caps: &mut Captures)1063 pub fn search_captures(&self, input: &Input<'_>, caps: &mut Captures) { 1064 caps.set_pattern(None); 1065 let pid = self.search_slots(input, caps.slots_mut()); 1066 caps.set_pattern(pid); 1067 } 1068 1069 /// Executes a leftmost forward search and writes the spans of capturing 1070 /// groups that participated in a match into the provided `slots`, and 1071 /// returns the matching pattern ID. The contents of the slots for patterns 1072 /// other than the matching pattern are unspecified. If no match was found, 1073 /// then `None` is returned and the contents of `slots` is unspecified. 1074 /// 1075 /// This is like [`Regex::search`], but it accepts a raw slots slice 1076 /// instead of a `Captures` value. This is useful in contexts where you 1077 /// don't want or need to allocate a `Captures`. 1078 /// 1079 /// It is legal to pass _any_ number of slots to this routine. If the regex 1080 /// engine would otherwise write a slot offset that doesn't fit in the 1081 /// provided slice, then it is simply skipped. In general though, there are 1082 /// usually three slice lengths you might want to use: 1083 /// 1084 /// * An empty slice, if you only care about which pattern matched. 1085 /// * A slice with [`pattern_len() * 2`](Regex::pattern_len) slots, if you 1086 /// only care about the overall match spans for each matching pattern. 1087 /// * A slice with 1088 /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which 1089 /// permits recording match offsets for every capturing group in every 1090 /// pattern. 
1091 /// 1092 /// # Example 1093 /// 1094 /// This example shows how to find the overall match offsets in a 1095 /// multi-pattern search without allocating a `Captures` value. Indeed, we 1096 /// can put our slots right on the stack. 1097 /// 1098 /// ``` 1099 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1100 /// use regex_automata::{meta::Regex, PatternID, Input}; 1101 /// 1102 /// let re = Regex::new_many(&[ 1103 /// r"\pL+", 1104 /// r"\d+", 1105 /// ])?; 1106 /// let input = Input::new("!@#123"); 1107 /// 1108 /// // We only care about the overall match offsets here, so we just 1109 /// // allocate two slots for each pattern. Each slot records the start 1110 /// // and end of the match. 1111 /// let mut slots = [None; 4]; 1112 /// let pid = re.search_slots(&input, &mut slots); 1113 /// assert_eq!(Some(PatternID::must(1)), pid); 1114 /// 1115 /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. 1116 /// // See 'GroupInfo' for more details on the mapping between groups and 1117 /// // slot indices. 1118 /// let slot_start = pid.unwrap().as_usize() * 2; 1119 /// let slot_end = slot_start + 1; 1120 /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); 1121 /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); 1122 /// 1123 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1124 /// ``` 1125 #[inline] search_slots( &self, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<PatternID>1126 pub fn search_slots( 1127 &self, 1128 input: &Input<'_>, 1129 slots: &mut [Option<NonMaxUsize>], 1130 ) -> Option<PatternID> { 1131 if self.imp.info.is_impossible(input) { 1132 return None; 1133 } 1134 let mut guard = self.pool.get(); 1135 let result = self.imp.strat.search_slots(&mut guard, input, slots); 1136 // See 'Regex::search' for why we put the guard back explicitly. 
1137 PoolGuard::put(guard); 1138 result 1139 } 1140 1141 /// Writes the set of patterns that match anywhere in the given search 1142 /// configuration to `patset`. If multiple patterns match at the same 1143 /// position and this `Regex` was configured with [`MatchKind::All`] 1144 /// semantics, then all matching patterns are written to the given set. 1145 /// 1146 /// Unless all of the patterns in this `Regex` are anchored, then generally 1147 /// speaking, this will scan the entire haystack. 1148 /// 1149 /// This search routine *does not* clear the pattern set. This gives some 1150 /// flexibility to the caller (e.g., running multiple searches with the 1151 /// same pattern set), but does make the API bug-prone if you're reusing 1152 /// the same pattern set for multiple searches but intended them to be 1153 /// independent. 1154 /// 1155 /// If a pattern ID matched but the given `PatternSet` does not have 1156 /// sufficient capacity to store it, then it is not inserted and silently 1157 /// dropped. 1158 /// 1159 /// # Example 1160 /// 1161 /// This example shows how to find all matching patterns in a haystack, 1162 /// even when some patterns match at the same position as other patterns. 1163 /// It is important that we configure the `Regex` with [`MatchKind::All`] 1164 /// semantics here, or else overlapping matches will not be reported. 
1165 /// 1166 /// ``` 1167 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1168 /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet}; 1169 /// 1170 /// let patterns = &[ 1171 /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", 1172 /// ]; 1173 /// let re = Regex::builder() 1174 /// .configure(Regex::config().match_kind(MatchKind::All)) 1175 /// .build_many(patterns)?; 1176 /// 1177 /// let input = Input::new("foobar"); 1178 /// let mut patset = PatternSet::new(re.pattern_len()); 1179 /// re.which_overlapping_matches(&input, &mut patset); 1180 /// let expected = vec![0, 2, 3, 4, 6]; 1181 /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); 1182 /// assert_eq!(expected, got); 1183 /// 1184 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1185 /// ``` 1186 #[inline] which_overlapping_matches( &self, input: &Input<'_>, patset: &mut PatternSet, )1187 pub fn which_overlapping_matches( 1188 &self, 1189 input: &Input<'_>, 1190 patset: &mut PatternSet, 1191 ) { 1192 if self.imp.info.is_impossible(input) { 1193 return; 1194 } 1195 let mut guard = self.pool.get(); 1196 let result = self 1197 .imp 1198 .strat 1199 .which_overlapping_matches(&mut guard, input, patset); 1200 // See 'Regex::search' for why we put the guard back explicitly. 1201 PoolGuard::put(guard); 1202 result 1203 } 1204 } 1205 1206 /// Lower level search routines that give more control, and require the caller 1207 /// to provide an explicit [`Cache`] parameter. 1208 impl Regex { 1209 /// This is like [`Regex::search`], but requires the caller to 1210 /// explicitly pass a [`Cache`]. 1211 /// 1212 /// # Why pass a `Cache` explicitly? 1213 /// 1214 /// Passing a `Cache` explicitly will bypass the use of an internal memory 1215 /// pool used by `Regex` to get a `Cache` for a search. The use of this 1216 /// pool can be slower in some cases when a `Regex` is used from multiple 1217 /// threads simultaneously. 
Typically, performance only becomes an issue 1218 /// when there is heavy contention, which in turn usually only occurs 1219 /// when each thread's primary unit of work is a regex search on a small 1220 /// haystack. 1221 /// 1222 /// # Example 1223 /// 1224 /// ``` 1225 /// use regex_automata::{meta::Regex, Input, Match}; 1226 /// 1227 /// let re = Regex::new(r"Samwise|Sam")?; 1228 /// let mut cache = re.create_cache(); 1229 /// let input = Input::new( 1230 /// "one of the chief characters, Samwise the Brave", 1231 /// ); 1232 /// assert_eq!( 1233 /// Some(Match::must(0, 29..36)), 1234 /// re.search_with(&mut cache, &input), 1235 /// ); 1236 /// 1237 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1238 /// ``` 1239 #[inline] search_with( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option<Match>1240 pub fn search_with( 1241 &self, 1242 cache: &mut Cache, 1243 input: &Input<'_>, 1244 ) -> Option<Match> { 1245 if self.imp.info.is_impossible(input) { 1246 return None; 1247 } 1248 self.imp.strat.search(cache, input) 1249 } 1250 1251 /// This is like [`Regex::search_half`], but requires the caller to 1252 /// explicitly pass a [`Cache`]. 1253 /// 1254 /// # Why pass a `Cache` explicitly? 1255 /// 1256 /// Passing a `Cache` explicitly will bypass the use of an internal memory 1257 /// pool used by `Regex` to get a `Cache` for a search. The use of this 1258 /// pool can be slower in some cases when a `Regex` is used from multiple 1259 /// threads simultaneously. Typically, performance only becomes an issue 1260 /// when there is heavy contention, which in turn usually only occurs 1261 /// when each thread's primary unit of work is a regex search on a small 1262 /// haystack. 
1263 /// 1264 /// # Example 1265 /// 1266 /// ``` 1267 /// use regex_automata::{meta::Regex, Input, HalfMatch}; 1268 /// 1269 /// let re = Regex::new(r"Samwise|Sam")?; 1270 /// let mut cache = re.create_cache(); 1271 /// let input = Input::new( 1272 /// "one of the chief characters, Samwise the Brave", 1273 /// ); 1274 /// assert_eq!( 1275 /// Some(HalfMatch::must(0, 36)), 1276 /// re.search_half_with(&mut cache, &input), 1277 /// ); 1278 /// 1279 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1280 /// ``` 1281 #[inline] search_half_with( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option<HalfMatch>1282 pub fn search_half_with( 1283 &self, 1284 cache: &mut Cache, 1285 input: &Input<'_>, 1286 ) -> Option<HalfMatch> { 1287 if self.imp.info.is_impossible(input) { 1288 return None; 1289 } 1290 self.imp.strat.search_half(cache, input) 1291 } 1292 1293 /// This is like [`Regex::search_captures`], but requires the caller to 1294 /// explicitly pass a [`Cache`]. 1295 /// 1296 /// # Why pass a `Cache` explicitly? 1297 /// 1298 /// Passing a `Cache` explicitly will bypass the use of an internal memory 1299 /// pool used by `Regex` to get a `Cache` for a search. The use of this 1300 /// pool can be slower in some cases when a `Regex` is used from multiple 1301 /// threads simultaneously. Typically, performance only becomes an issue 1302 /// when there is heavy contention, which in turn usually only occurs 1303 /// when each thread's primary unit of work is a regex search on a small 1304 /// haystack. 1305 /// 1306 /// # Example: specific pattern search 1307 /// 1308 /// This example shows how to build a multi-pattern `Regex` that permits 1309 /// searching for specific patterns. 
1310 /// 1311 /// ``` 1312 /// use regex_automata::{ 1313 /// meta::Regex, 1314 /// Anchored, Match, PatternID, Input, 1315 /// }; 1316 /// 1317 /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; 1318 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 1319 /// let haystack = "foo123"; 1320 /// 1321 /// // Since we are using the default leftmost-first match and both 1322 /// // patterns match at the same starting position, only the first pattern 1323 /// // will be returned in this case when doing a search for any of the 1324 /// // patterns. 1325 /// let expected = Some(Match::must(0, 0..6)); 1326 /// re.search_captures_with(&mut cache, &Input::new(haystack), &mut caps); 1327 /// assert_eq!(expected, caps.get_match()); 1328 /// 1329 /// // But if we want to check whether some other pattern matches, then we 1330 /// // can provide its pattern ID. 1331 /// let expected = Some(Match::must(1, 0..6)); 1332 /// let input = Input::new(haystack) 1333 /// .anchored(Anchored::Pattern(PatternID::must(1))); 1334 /// re.search_captures_with(&mut cache, &input, &mut caps); 1335 /// assert_eq!(expected, caps.get_match()); 1336 /// 1337 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1338 /// ``` 1339 /// 1340 /// # Example: specifying the bounds of a search 1341 /// 1342 /// This example shows how providing the bounds of a search can produce 1343 /// different results than simply sub-slicing the haystack. 1344 /// 1345 /// ``` 1346 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1347 /// use regex_automata::{meta::Regex, Match, Input}; 1348 /// 1349 /// let re = Regex::new(r"\b[0-9]{3}\b")?; 1350 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 1351 /// let haystack = "foo123bar"; 1352 /// 1353 /// // Since we sub-slice the haystack, the search doesn't know about 1354 /// // the larger context and assumes that `123` is surrounded by word 1355 /// // boundaries. 
And of course, the match position is reported relative 1356 /// // to the sub-slice as well, which means we get `0..3` instead of 1357 /// // `3..6`. 1358 /// let expected = Some(Match::must(0, 0..3)); 1359 /// let input = Input::new(&haystack[3..6]); 1360 /// re.search_captures_with(&mut cache, &input, &mut caps); 1361 /// assert_eq!(expected, caps.get_match()); 1362 /// 1363 /// // But if we provide the bounds of the search within the context of the 1364 /// // entire haystack, then the search can take the surrounding context 1365 /// // into account. (And if we did find a match, it would be reported 1366 /// // as a valid offset into `haystack` instead of its sub-slice.) 1367 /// let expected = None; 1368 /// let input = Input::new(haystack).range(3..6); 1369 /// re.search_captures_with(&mut cache, &input, &mut caps); 1370 /// assert_eq!(expected, caps.get_match()); 1371 /// 1372 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1373 /// ``` 1374 #[inline] search_captures_with( &self, cache: &mut Cache, input: &Input<'_>, caps: &mut Captures, )1375 pub fn search_captures_with( 1376 &self, 1377 cache: &mut Cache, 1378 input: &Input<'_>, 1379 caps: &mut Captures, 1380 ) { 1381 caps.set_pattern(None); 1382 let pid = self.search_slots_with(cache, input, caps.slots_mut()); 1383 caps.set_pattern(pid); 1384 } 1385 1386 /// This is like [`Regex::search_slots`], but requires the caller to 1387 /// explicitly pass a [`Cache`]. 1388 /// 1389 /// # Why pass a `Cache` explicitly? 1390 /// 1391 /// Passing a `Cache` explicitly will bypass the use of an internal memory 1392 /// pool used by `Regex` to get a `Cache` for a search. The use of this 1393 /// pool can be slower in some cases when a `Regex` is used from multiple 1394 /// threads simultaneously. Typically, performance only becomes an issue 1395 /// when there is heavy contention, which in turn usually only occurs 1396 /// when each thread's primary unit of work is a regex search on a small 1397 /// haystack. 
1398 /// 1399 /// # Example 1400 /// 1401 /// This example shows how to find the overall match offsets in a 1402 /// multi-pattern search without allocating a `Captures` value. Indeed, we 1403 /// can put our slots right on the stack. 1404 /// 1405 /// ``` 1406 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1407 /// use regex_automata::{meta::Regex, PatternID, Input}; 1408 /// 1409 /// let re = Regex::new_many(&[ 1410 /// r"\pL+", 1411 /// r"\d+", 1412 /// ])?; 1413 /// let mut cache = re.create_cache(); 1414 /// let input = Input::new("!@#123"); 1415 /// 1416 /// // We only care about the overall match offsets here, so we just 1417 /// // allocate two slots for each pattern. Each slot records the start 1418 /// // and end of the match. 1419 /// let mut slots = [None; 4]; 1420 /// let pid = re.search_slots_with(&mut cache, &input, &mut slots); 1421 /// assert_eq!(Some(PatternID::must(1)), pid); 1422 /// 1423 /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. 1424 /// // See 'GroupInfo' for more details on the mapping between groups and 1425 /// // slot indices. 1426 /// let slot_start = pid.unwrap().as_usize() * 2; 1427 /// let slot_end = slot_start + 1; 1428 /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); 1429 /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); 1430 /// 1431 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1432 /// ``` 1433 #[inline] search_slots_with( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option<NonMaxUsize>], ) -> Option<PatternID>1434 pub fn search_slots_with( 1435 &self, 1436 cache: &mut Cache, 1437 input: &Input<'_>, 1438 slots: &mut [Option<NonMaxUsize>], 1439 ) -> Option<PatternID> { 1440 if self.imp.info.is_impossible(input) { 1441 return None; 1442 } 1443 self.imp.strat.search_slots(cache, input, slots) 1444 } 1445 1446 /// This is like [`Regex::which_overlapping_matches`], but requires the 1447 /// caller to explicitly pass a [`Cache`]. 
1448 /// 1449 /// Passing a `Cache` explicitly will bypass the use of an internal memory 1450 /// pool used by `Regex` to get a `Cache` for a search. The use of this 1451 /// pool can be slower in some cases when a `Regex` is used from multiple 1452 /// threads simultaneously. Typically, performance only becomes an issue 1453 /// when there is heavy contention, which in turn usually only occurs 1454 /// when each thread's primary unit of work is a regex search on a small 1455 /// haystack. 1456 /// 1457 /// # Why pass a `Cache` explicitly? 1458 /// 1459 /// # Example 1460 /// 1461 /// ``` 1462 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1463 /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet}; 1464 /// 1465 /// let patterns = &[ 1466 /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", 1467 /// ]; 1468 /// let re = Regex::builder() 1469 /// .configure(Regex::config().match_kind(MatchKind::All)) 1470 /// .build_many(patterns)?; 1471 /// let mut cache = re.create_cache(); 1472 /// 1473 /// let input = Input::new("foobar"); 1474 /// let mut patset = PatternSet::new(re.pattern_len()); 1475 /// re.which_overlapping_matches_with(&mut cache, &input, &mut patset); 1476 /// let expected = vec![0, 2, 3, 4, 6]; 1477 /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); 1478 /// assert_eq!(expected, got); 1479 /// 1480 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1481 /// ``` 1482 #[inline] which_overlapping_matches_with( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, )1483 pub fn which_overlapping_matches_with( 1484 &self, 1485 cache: &mut Cache, 1486 input: &Input<'_>, 1487 patset: &mut PatternSet, 1488 ) { 1489 if self.imp.info.is_impossible(input) { 1490 return; 1491 } 1492 self.imp.strat.which_overlapping_matches(cache, input, patset) 1493 } 1494 } 1495 1496 /// Various non-search routines for querying properties of a `Regex` and 1497 /// convenience routines for creating 
[`Captures`] and [`Cache`] values. 1498 impl Regex { 1499 /// Creates a new object for recording capture group offsets. This is used 1500 /// in search APIs like [`Regex::captures`] and [`Regex::search_captures`]. 1501 /// 1502 /// This is a convenience routine for 1503 /// `Captures::all(re.group_info().clone())`. Callers may build other types 1504 /// of `Captures` values that record less information (and thus require 1505 /// less work from the regex engine) using [`Captures::matches`] and 1506 /// [`Captures::empty`]. 1507 /// 1508 /// # Example 1509 /// 1510 /// This shows some alternatives to [`Regex::create_captures`]: 1511 /// 1512 /// ``` 1513 /// use regex_automata::{ 1514 /// meta::Regex, 1515 /// util::captures::Captures, 1516 /// Match, PatternID, Span, 1517 /// }; 1518 /// 1519 /// let re = Regex::new(r"(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)")?; 1520 /// 1521 /// // This is equivalent to Regex::create_captures. It stores matching 1522 /// // offsets for all groups in the regex. 1523 /// let mut all = Captures::all(re.group_info().clone()); 1524 /// re.captures("Bruce Springsteen", &mut all); 1525 /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match()); 1526 /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first")); 1527 /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last")); 1528 /// 1529 /// // In this version, we only care about the implicit groups, which 1530 /// // means offsets for the explicit groups will be unavailable. It can 1531 /// // sometimes be faster to ask for fewer groups, since the underlying 1532 /// // regex engine needs to do less work to keep track of them. 1533 /// let mut matches = Captures::matches(re.group_info().clone()); 1534 /// re.captures("Bruce Springsteen", &mut matches); 1535 /// // We still get the overall match info. 1536 /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match()); 1537 /// // But now the explicit groups are unavailable. 
1538 /// assert_eq!(None, matches.get_group_by_name("first")); 1539 /// assert_eq!(None, matches.get_group_by_name("last")); 1540 /// 1541 /// // Finally, in this version, we don't ask to keep track of offsets for 1542 /// // *any* groups. All we get back is whether a match occurred, and if 1543 /// // so, the ID of the pattern that matched. 1544 /// let mut empty = Captures::empty(re.group_info().clone()); 1545 /// re.captures("Bruce Springsteen", &mut empty); 1546 /// // it's a match! 1547 /// assert!(empty.is_match()); 1548 /// // for pattern ID 0 1549 /// assert_eq!(Some(PatternID::ZERO), empty.pattern()); 1550 /// // Match offsets are unavailable. 1551 /// assert_eq!(None, empty.get_match()); 1552 /// // And of course, explicit groups are unavailable too. 1553 /// assert_eq!(None, empty.get_group_by_name("first")); 1554 /// assert_eq!(None, empty.get_group_by_name("last")); 1555 /// 1556 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1557 /// ``` create_captures(&self) -> Captures1558 pub fn create_captures(&self) -> Captures { 1559 Captures::all(self.group_info().clone()) 1560 } 1561 1562 /// Creates a new cache for use with lower level search APIs like 1563 /// [`Regex::search_with`]. 1564 /// 1565 /// The cache returned should only be used for searches for this `Regex`. 1566 /// If you want to reuse the cache for another `Regex`, then you must call 1567 /// [`Cache::reset`] with that `Regex`. 1568 /// 1569 /// This is a convenience routine for [`Cache::new`]. 
1570 /// 1571 /// # Example 1572 /// 1573 /// ``` 1574 /// use regex_automata::{meta::Regex, Input, Match}; 1575 /// 1576 /// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?; 1577 /// let mut cache = re.create_cache(); 1578 /// let input = Input::new("crazy janey and her mission man"); 1579 /// assert_eq!( 1580 /// Some(Match::must(0, 20..31)), 1581 /// re.search_with(&mut cache, &input), 1582 /// ); 1583 /// 1584 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1585 /// ``` create_cache(&self) -> Cache1586 pub fn create_cache(&self) -> Cache { 1587 self.imp.strat.create_cache() 1588 } 1589 1590 /// Returns the total number of patterns in this regex. 1591 /// 1592 /// The standard [`Regex::new`] constructor always results in a `Regex` 1593 /// with a single pattern, but [`Regex::new_many`] permits building a 1594 /// multi-pattern regex. 1595 /// 1596 /// A `Regex` guarantees that the maximum possible `PatternID` returned in 1597 /// any match is `Regex::pattern_len() - 1`. In the case where the number 1598 /// of patterns is `0`, a match is impossible. 1599 /// 1600 /// # Example 1601 /// 1602 /// ``` 1603 /// use regex_automata::meta::Regex; 1604 /// 1605 /// let re = Regex::new(r"(?m)^[a-z]$")?; 1606 /// assert_eq!(1, re.pattern_len()); 1607 /// 1608 /// let re = Regex::new_many::<&str>(&[])?; 1609 /// assert_eq!(0, re.pattern_len()); 1610 /// 1611 /// let re = Regex::new_many(&["a", "b", "c"])?; 1612 /// assert_eq!(3, re.pattern_len()); 1613 /// 1614 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1615 /// ``` pattern_len(&self) -> usize1616 pub fn pattern_len(&self) -> usize { 1617 self.imp.info.pattern_len() 1618 } 1619 1620 /// Returns the total number of capturing groups. 1621 /// 1622 /// This includes the implicit capturing group corresponding to the 1623 /// entire match. Therefore, the minimum value returned is `1`. 1624 /// 1625 /// # Example 1626 /// 1627 /// This shows a few patterns and how many capture groups they have. 
1628 /// 1629 /// ``` 1630 /// use regex_automata::meta::Regex; 1631 /// 1632 /// let len = |pattern| { 1633 /// Regex::new(pattern).map(|re| re.captures_len()) 1634 /// }; 1635 /// 1636 /// assert_eq!(1, len("a")?); 1637 /// assert_eq!(2, len("(a)")?); 1638 /// assert_eq!(3, len("(a)|(b)")?); 1639 /// assert_eq!(5, len("(a)(b)|(c)(d)")?); 1640 /// assert_eq!(2, len("(a)|b")?); 1641 /// assert_eq!(2, len("a|(b)")?); 1642 /// assert_eq!(2, len("(b)*")?); 1643 /// assert_eq!(2, len("(b)+")?); 1644 /// 1645 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1646 /// ``` 1647 /// 1648 /// # Example: multiple patterns 1649 /// 1650 /// This routine also works for multiple patterns. The total number is 1651 /// the sum of the capture groups of each pattern. 1652 /// 1653 /// ``` 1654 /// use regex_automata::meta::Regex; 1655 /// 1656 /// let len = |patterns| { 1657 /// Regex::new_many(patterns).map(|re| re.captures_len()) 1658 /// }; 1659 /// 1660 /// assert_eq!(2, len(&["a", "b"])?); 1661 /// assert_eq!(4, len(&["(a)", "(b)"])?); 1662 /// assert_eq!(6, len(&["(a)|(b)", "(c)|(d)"])?); 1663 /// assert_eq!(8, len(&["(a)(b)|(c)(d)", "(x)(y)"])?); 1664 /// assert_eq!(3, len(&["(a)", "b"])?); 1665 /// assert_eq!(3, len(&["a", "(b)"])?); 1666 /// assert_eq!(4, len(&["(a)", "(b)*"])?); 1667 /// assert_eq!(4, len(&["(a)+", "(b)+"])?); 1668 /// 1669 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1670 /// ``` captures_len(&self) -> usize1671 pub fn captures_len(&self) -> usize { 1672 self.imp 1673 .info 1674 .props_union() 1675 .explicit_captures_len() 1676 .saturating_add(self.pattern_len()) 1677 } 1678 1679 /// Returns the total number of capturing groups that appear in every 1680 /// possible match. 1681 /// 1682 /// If the number of capture groups can vary depending on the match, then 1683 /// this returns `None`. That is, a value is only returned when the number 1684 /// of matching groups is invariant or "static." 
1685 /// 1686 /// Note that like [`Regex::captures_len`], this **does** include the 1687 /// implicit capturing group corresponding to the entire match. Therefore, 1688 /// when a non-None value is returned, it is guaranteed to be at least `1`. 1689 /// Stated differently, a return value of `Some(0)` is impossible. 1690 /// 1691 /// # Example 1692 /// 1693 /// This shows a few cases where a static number of capture groups is 1694 /// available and a few cases where it is not. 1695 /// 1696 /// ``` 1697 /// use regex_automata::meta::Regex; 1698 /// 1699 /// let len = |pattern| { 1700 /// Regex::new(pattern).map(|re| re.static_captures_len()) 1701 /// }; 1702 /// 1703 /// assert_eq!(Some(1), len("a")?); 1704 /// assert_eq!(Some(2), len("(a)")?); 1705 /// assert_eq!(Some(2), len("(a)|(b)")?); 1706 /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); 1707 /// assert_eq!(None, len("(a)|b")?); 1708 /// assert_eq!(None, len("a|(b)")?); 1709 /// assert_eq!(None, len("(b)*")?); 1710 /// assert_eq!(Some(2), len("(b)+")?); 1711 /// 1712 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1713 /// ``` 1714 /// 1715 /// # Example: multiple patterns 1716 /// 1717 /// This property extends to regexes with multiple patterns as well. In 1718 /// order for their to be a static number of capture groups in this case, 1719 /// every pattern must have the same static number. 
1720 /// 1721 /// ``` 1722 /// use regex_automata::meta::Regex; 1723 /// 1724 /// let len = |patterns| { 1725 /// Regex::new_many(patterns).map(|re| re.static_captures_len()) 1726 /// }; 1727 /// 1728 /// assert_eq!(Some(1), len(&["a", "b"])?); 1729 /// assert_eq!(Some(2), len(&["(a)", "(b)"])?); 1730 /// assert_eq!(Some(2), len(&["(a)|(b)", "(c)|(d)"])?); 1731 /// assert_eq!(Some(3), len(&["(a)(b)|(c)(d)", "(x)(y)"])?); 1732 /// assert_eq!(None, len(&["(a)", "b"])?); 1733 /// assert_eq!(None, len(&["a", "(b)"])?); 1734 /// assert_eq!(None, len(&["(a)", "(b)*"])?); 1735 /// assert_eq!(Some(2), len(&["(a)+", "(b)+"])?); 1736 /// 1737 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1738 /// ``` 1739 #[inline] static_captures_len(&self) -> Option<usize>1740 pub fn static_captures_len(&self) -> Option<usize> { 1741 self.imp 1742 .info 1743 .props_union() 1744 .static_explicit_captures_len() 1745 .map(|len| len.saturating_add(1)) 1746 } 1747 1748 /// Return information about the capture groups in this `Regex`. 1749 /// 1750 /// A `GroupInfo` is an immutable object that can be cheaply cloned. It 1751 /// is responsible for maintaining a mapping between the capture groups 1752 /// in the concrete syntax of zero or more regex patterns and their 1753 /// internal representation used by some of the regex matchers. It is also 1754 /// responsible for maintaining a mapping between the name of each group 1755 /// (if one exists) and its corresponding group index. 1756 /// 1757 /// A `GroupInfo` is ultimately what is used to build a [`Captures`] value, 1758 /// which is some mutable space where group offsets are stored as a result 1759 /// of a search. 
1760 /// 1761 /// # Example 1762 /// 1763 /// This shows some alternatives to [`Regex::create_captures`]: 1764 /// 1765 /// ``` 1766 /// use regex_automata::{ 1767 /// meta::Regex, 1768 /// util::captures::Captures, 1769 /// Match, PatternID, Span, 1770 /// }; 1771 /// 1772 /// let re = Regex::new(r"(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)")?; 1773 /// 1774 /// // This is equivalent to Regex::create_captures. It stores matching 1775 /// // offsets for all groups in the regex. 1776 /// let mut all = Captures::all(re.group_info().clone()); 1777 /// re.captures("Bruce Springsteen", &mut all); 1778 /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match()); 1779 /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first")); 1780 /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last")); 1781 /// 1782 /// // In this version, we only care about the implicit groups, which 1783 /// // means offsets for the explicit groups will be unavailable. It can 1784 /// // sometimes be faster to ask for fewer groups, since the underlying 1785 /// // regex engine needs to do less work to keep track of them. 1786 /// let mut matches = Captures::matches(re.group_info().clone()); 1787 /// re.captures("Bruce Springsteen", &mut matches); 1788 /// // We still get the overall match info. 1789 /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match()); 1790 /// // But now the explicit groups are unavailable. 1791 /// assert_eq!(None, matches.get_group_by_name("first")); 1792 /// assert_eq!(None, matches.get_group_by_name("last")); 1793 /// 1794 /// // Finally, in this version, we don't ask to keep track of offsets for 1795 /// // *any* groups. All we get back is whether a match occurred, and if 1796 /// // so, the ID of the pattern that matched. 1797 /// let mut empty = Captures::empty(re.group_info().clone()); 1798 /// re.captures("Bruce Springsteen", &mut empty); 1799 /// // it's a match! 
1800 /// assert!(empty.is_match()); 1801 /// // for pattern ID 0 1802 /// assert_eq!(Some(PatternID::ZERO), empty.pattern()); 1803 /// // Match offsets are unavailable. 1804 /// assert_eq!(None, empty.get_match()); 1805 /// // And of course, explicit groups are unavailable too. 1806 /// assert_eq!(None, empty.get_group_by_name("first")); 1807 /// assert_eq!(None, empty.get_group_by_name("last")); 1808 /// 1809 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1810 /// ``` 1811 #[inline] group_info(&self) -> &GroupInfo1812 pub fn group_info(&self) -> &GroupInfo { 1813 self.imp.strat.group_info() 1814 } 1815 1816 /// Returns the configuration object used to build this `Regex`. 1817 /// 1818 /// If no configuration object was explicitly passed, then the 1819 /// configuration returned represents the default. 1820 #[inline] get_config(&self) -> &Config1821 pub fn get_config(&self) -> &Config { 1822 self.imp.info.config() 1823 } 1824 1825 /// Returns true if this regex has a high chance of being "accelerated." 1826 /// 1827 /// The precise meaning of "accelerated" is specifically left unspecified, 1828 /// but the general meaning is that the search is a high likelihood of 1829 /// running faster than than a character-at-a-time loop inside a standard 1830 /// regex engine. 1831 /// 1832 /// When a regex is accelerated, it is only a *probabilistic* claim. That 1833 /// is, just because the regex is believed to be accelerated, that doesn't 1834 /// mean it will definitely execute searches very fast. Similarly, if a 1835 /// regex is *not* accelerated, that is also a probabilistic claim. That 1836 /// is, a regex for which `is_accelerated` returns `false` could still run 1837 /// searches more quickly than a regex for which `is_accelerated` returns 1838 /// `true`. 1839 /// 1840 /// Whether a regex is marked as accelerated or not is dependent on 1841 /// implementations details that may change in a semver compatible release. 
1842 /// That is, a regex that is accelerated in a `x.y.1` release might not be 1843 /// accelerated in a `x.y.2` release. 1844 /// 1845 /// Basically, the value of acceleration boils down to a hedge: a hodge 1846 /// podge of internal heuristics combine to make a probabilistic guess 1847 /// that this regex search may run "fast." The value in knowing this from 1848 /// a caller's perspective is that it may act as a signal that no further 1849 /// work should be done to accelerate a search. For example, a grep-like 1850 /// tool might try to do some extra work extracting literals from a regex 1851 /// to create its own heuristic acceleration strategies. But it might 1852 /// choose to defer to this crate's acceleration strategy if one exists. 1853 /// This routine permits querying whether such a strategy is active for a 1854 /// particular regex. 1855 /// 1856 /// # Example 1857 /// 1858 /// ``` 1859 /// use regex_automata::meta::Regex; 1860 /// 1861 /// // A simple literal is very likely to be accelerated. 1862 /// let re = Regex::new(r"foo")?; 1863 /// assert!(re.is_accelerated()); 1864 /// 1865 /// // A regex with no literals is likely to not be accelerated. 1866 /// let re = Regex::new(r"\w")?; 1867 /// assert!(!re.is_accelerated()); 1868 /// 1869 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1870 /// ``` 1871 #[inline] is_accelerated(&self) -> bool1872 pub fn is_accelerated(&self) -> bool { 1873 self.imp.strat.is_accelerated() 1874 } 1875 1876 /// Return the total approximate heap memory, in bytes, used by this `Regex`. 1877 /// 1878 /// Note that currently, there is no high level configuration for setting 1879 /// a limit on the specific value returned by this routine. Instead, the 1880 /// following routines can be used to control heap memory at a bit of a 1881 /// lower level: 1882 /// 1883 /// * [`Config::nfa_size_limit`] controls how big _any_ of the NFAs are 1884 /// allowed to be. 
/// * [`Config::onepass_size_limit`] controls how big the one-pass DFA is
    /// allowed to be.
    /// * [`Config::hybrid_cache_capacity`] controls how much memory the lazy
    /// DFA is permitted to allocate to store its transition table.
    /// * [`Config::dfa_size_limit`] controls how big a fully compiled DFA is
    /// allowed to be.
    /// * [`Config::dfa_state_limit`] controls the conditions under which the
    /// meta regex engine will even attempt to build a fully compiled DFA.
    #[inline]
    pub fn memory_usage(&self) -> usize {
        let strategy = &self.imp.strat;
        strategy.memory_usage()
    }
}

impl Clone for Regex {
    fn clone(&self) -> Regex {
        // The compiled regex is shared; only the cache pool is created anew
        // for the clone.
        let imp = Arc::clone(&self.imp);
        let strat = Arc::clone(&imp.strat);
        let create: CachePoolFn = Box::new(move || strat.create_cache());
        let pool = Pool::new(create);
        Regex { imp, pool }
    }
}

#[derive(Clone, Debug)]
pub(crate) struct RegexInfo(Arc<RegexInfoI>);

#[derive(Clone, Debug)]
struct RegexInfoI {
    config: Config,
    props: Vec<hir::Properties>,
    props_union: hir::Properties,
}

impl RegexInfo {
    fn new(config: Config, hirs: &[&Hir]) -> RegexInfo {
        // Gather the properties of every HIR, and also union them into a
        // single set of properties that describes all of the HIRs as if they
        // were joined in one big alternation.
        let props: Vec<hir::Properties> =
            hirs.iter().map(|h| h.properties().clone()).collect();
        let props_union = hir::Properties::union(&props);

        let inner = RegexInfoI { config, props, props_union };
        RegexInfo(Arc::new(inner))
    }

    pub(crate) fn config(&self) -> &Config {
        let RegexInfo(inner) = self;
        &inner.config
    }

    pub(crate) fn props(&self) -> &[hir::Properties] {
        let RegexInfo(inner) = self;
        &inner.props
    }

    pub(crate) fn props_union(&self) -> &hir::Properties {
        let RegexInfo(inner) = self;
        &inner.props_union
    }

    pub(crate) fn pattern_len(&self) -> usize {
        // There is exactly one set of properties per pattern.
        let props = self.props();
        props.len()
    }

    pub(crate) fn memory_usage(&self) -> usize {
        let per_pattern: usize =
            self.props().iter().map(|p| p.memory_usage()).sum();
        per_pattern + self.props_union().memory_usage()
    }

    /// Returns true when the search is guaranteed to be anchored. That is,
    /// when a match is reported, its offset is guaranteed to correspond to
    /// the start of the search.
    ///
    /// This includes returning true when `input` _isn't_ anchored but the
    /// underlying regex is.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn is_anchored_start(&self, input: &Input<'_>) -> bool {
        if input.get_anchored().is_anchored() {
            return true;
        }
        self.is_always_anchored_start()
    }

    /// Returns true when this regex is always anchored to the start of a
    /// search. And in particular, that regardless of an `Input` configuration,
    /// if any match is reported it must start at `0`.
1970 #[cfg_attr(feature = "perf-inline", inline(always))] is_always_anchored_start(&self) -> bool1971 pub(crate) fn is_always_anchored_start(&self) -> bool { 1972 use regex_syntax::hir::Look; 1973 self.props_union().look_set_prefix().contains(Look::Start) 1974 } 1975 1976 /// Returns true when this regex is always anchored to the end of a 1977 /// search. And in particular, that regardless of an `Input` configuration, 1978 /// if any match is reported it must end at the end of the haystack. 1979 #[cfg_attr(feature = "perf-inline", inline(always))] is_always_anchored_end(&self) -> bool1980 pub(crate) fn is_always_anchored_end(&self) -> bool { 1981 use regex_syntax::hir::Look; 1982 self.props_union().look_set_suffix().contains(Look::End) 1983 } 1984 1985 /// Returns true if and only if it is known that a match is impossible 1986 /// for the given input. This is useful for short-circuiting and avoiding 1987 /// running the regex engine if it's known no match can be reported. 1988 /// 1989 /// Note that this doesn't necessarily detect every possible case. For 1990 /// example, when `pattern_len() == 0`, a match is impossible, but that 1991 /// case is so rare that it's fine to be handled by the regex engine 1992 /// itself. That is, it's not worth the cost of adding it here in order to 1993 /// make it a little faster. The reason is that this is called for every 1994 /// search. so there is some cost to adding checks here. Arguably, some of 1995 /// the checks that are here already probably shouldn't be here... 1996 #[cfg_attr(feature = "perf-inline", inline(always))] is_impossible(&self, input: &Input<'_>) -> bool1997 fn is_impossible(&self, input: &Input<'_>) -> bool { 1998 // The underlying regex is anchored, so if we don't start the search 1999 // at position 0, a match is impossible, because the anchor can only 2000 // match at position 0. 
2001 if input.start() > 0 && self.is_always_anchored_start() { 2002 return true; 2003 } 2004 // Same idea, but for the end anchor. 2005 if input.end() < input.haystack().len() 2006 && self.is_always_anchored_end() 2007 { 2008 return true; 2009 } 2010 // If the haystack is smaller than the minimum length required, then 2011 // we know there can be no match. 2012 let minlen = match self.props_union().minimum_len() { 2013 None => return false, 2014 Some(minlen) => minlen, 2015 }; 2016 if input.get_span().len() < minlen { 2017 return true; 2018 } 2019 // Same idea as minimum, but for maximum. This is trickier. We can 2020 // only apply the maximum when we know the entire span that we're 2021 // searching *has* to match according to the regex (and possibly the 2022 // input configuration). If we know there is too much for the regex 2023 // to match, we can bail early. 2024 // 2025 // I don't think we can apply the maximum otherwise unfortunately. 2026 if self.is_anchored_start(input) && self.is_always_anchored_end() { 2027 let maxlen = match self.props_union().maximum_len() { 2028 None => return false, 2029 Some(maxlen) => maxlen, 2030 }; 2031 if input.get_span().len() > maxlen { 2032 return true; 2033 } 2034 } 2035 false 2036 } 2037 } 2038 2039 /// An iterator over all non-overlapping matches. 2040 /// 2041 /// The iterator yields a [`Match`] value until no more matches could be found. 2042 /// 2043 /// The lifetime parameters are as follows: 2044 /// 2045 /// * `'r` represents the lifetime of the `Regex` that produced this iterator. 2046 /// * `'h` represents the lifetime of the haystack being searched. 2047 /// 2048 /// This iterator can be created with the [`Regex::find_iter`] method. 2049 #[derive(Debug)] 2050 pub struct FindMatches<'r, 'h> { 2051 re: &'r Regex, 2052 cache: CachePoolGuard<'r>, 2053 it: iter::Searcher<'h>, 2054 } 2055 2056 impl<'r, 'h> FindMatches<'r, 'h> { 2057 /// Returns the `Regex` value that created this iterator. 
2058 #[inline] regex(&self) -> &'r Regex2059 pub fn regex(&self) -> &'r Regex { 2060 self.re 2061 } 2062 2063 /// Returns the current `Input` associated with this iterator. 2064 /// 2065 /// The `start` position on the given `Input` may change during iteration, 2066 /// but all other values are guaranteed to remain invariant. 2067 #[inline] input<'s>(&'s self) -> &'s Input<'h>2068 pub fn input<'s>(&'s self) -> &'s Input<'h> { 2069 self.it.input() 2070 } 2071 } 2072 2073 impl<'r, 'h> Iterator for FindMatches<'r, 'h> { 2074 type Item = Match; 2075 2076 #[inline] next(&mut self) -> Option<Match>2077 fn next(&mut self) -> Option<Match> { 2078 let FindMatches { re, ref mut cache, ref mut it } = *self; 2079 it.advance(|input| Ok(re.search_with(cache, input))) 2080 } 2081 2082 #[inline] count(self) -> usize2083 fn count(self) -> usize { 2084 // If all we care about is a count of matches, then we only need to 2085 // find the end position of each match. This can give us a 2x perf 2086 // boost in some cases, because it avoids needing to do a reverse scan 2087 // to find the start of a match. 2088 let FindMatches { re, mut cache, it } = self; 2089 // This does the deref for PoolGuard once instead of every iter. 2090 let cache = &mut *cache; 2091 it.into_half_matches_iter( 2092 |input| Ok(re.search_half_with(cache, input)), 2093 ) 2094 .count() 2095 } 2096 } 2097 2098 impl<'r, 'h> core::iter::FusedIterator for FindMatches<'r, 'h> {} 2099 2100 /// An iterator over all non-overlapping leftmost matches with their capturing 2101 /// groups. 2102 /// 2103 /// The iterator yields a [`Captures`] value until no more matches could be 2104 /// found. 2105 /// 2106 /// The lifetime parameters are as follows: 2107 /// 2108 /// * `'r` represents the lifetime of the `Regex` that produced this iterator. 2109 /// * `'h` represents the lifetime of the haystack being searched. 2110 /// 2111 /// This iterator can be created with the [`Regex::captures_iter`] method. 
2112 #[derive(Debug)] 2113 pub struct CapturesMatches<'r, 'h> { 2114 re: &'r Regex, 2115 cache: CachePoolGuard<'r>, 2116 caps: Captures, 2117 it: iter::Searcher<'h>, 2118 } 2119 2120 impl<'r, 'h> CapturesMatches<'r, 'h> { 2121 /// Returns the `Regex` value that created this iterator. 2122 #[inline] regex(&self) -> &'r Regex2123 pub fn regex(&self) -> &'r Regex { 2124 self.re 2125 } 2126 2127 /// Returns the current `Input` associated with this iterator. 2128 /// 2129 /// The `start` position on the given `Input` may change during iteration, 2130 /// but all other values are guaranteed to remain invariant. 2131 #[inline] input<'s>(&'s self) -> &'s Input<'h>2132 pub fn input<'s>(&'s self) -> &'s Input<'h> { 2133 self.it.input() 2134 } 2135 } 2136 2137 impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { 2138 type Item = Captures; 2139 2140 #[inline] next(&mut self) -> Option<Captures>2141 fn next(&mut self) -> Option<Captures> { 2142 // Splitting 'self' apart seems necessary to appease borrowck. 2143 let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = 2144 *self; 2145 let _ = it.advance(|input| { 2146 re.search_captures_with(cache, input, caps); 2147 Ok(caps.get_match()) 2148 }); 2149 if caps.is_match() { 2150 Some(caps.clone()) 2151 } else { 2152 None 2153 } 2154 } 2155 2156 #[inline] count(self) -> usize2157 fn count(self) -> usize { 2158 let CapturesMatches { re, mut cache, it, .. } = self; 2159 // This does the deref for PoolGuard once instead of every iter. 2160 let cache = &mut *cache; 2161 it.into_half_matches_iter( 2162 |input| Ok(re.search_half_with(cache, input)), 2163 ) 2164 .count() 2165 } 2166 } 2167 2168 impl<'r, 'h> core::iter::FusedIterator for CapturesMatches<'r, 'h> {} 2169 2170 /// Yields all substrings delimited by a regular expression match. 2171 /// 2172 /// The spans correspond to the offsets between matches. 
2173 /// 2174 /// The lifetime parameters are as follows: 2175 /// 2176 /// * `'r` represents the lifetime of the `Regex` that produced this iterator. 2177 /// * `'h` represents the lifetime of the haystack being searched. 2178 /// 2179 /// This iterator can be created with the [`Regex::split`] method. 2180 #[derive(Debug)] 2181 pub struct Split<'r, 'h> { 2182 finder: FindMatches<'r, 'h>, 2183 last: usize, 2184 } 2185 2186 impl<'r, 'h> Split<'r, 'h> { 2187 /// Returns the current `Input` associated with this iterator. 2188 /// 2189 /// The `start` position on the given `Input` may change during iteration, 2190 /// but all other values are guaranteed to remain invariant. 2191 #[inline] input<'s>(&'s self) -> &'s Input<'h>2192 pub fn input<'s>(&'s self) -> &'s Input<'h> { 2193 self.finder.input() 2194 } 2195 } 2196 2197 impl<'r, 'h> Iterator for Split<'r, 'h> { 2198 type Item = Span; 2199 next(&mut self) -> Option<Span>2200 fn next(&mut self) -> Option<Span> { 2201 match self.finder.next() { 2202 None => { 2203 let len = self.finder.it.input().haystack().len(); 2204 if self.last > len { 2205 None 2206 } else { 2207 let span = Span::from(self.last..len); 2208 self.last = len + 1; // Next call will return None 2209 Some(span) 2210 } 2211 } 2212 Some(m) => { 2213 let span = Span::from(self.last..m.start()); 2214 self.last = m.end(); 2215 Some(span) 2216 } 2217 } 2218 } 2219 } 2220 2221 impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {} 2222 2223 /// Yields at most `N` spans delimited by a regular expression match. 2224 /// 2225 /// The spans correspond to the offsets between matches. The last span will be 2226 /// whatever remains after splitting. 2227 /// 2228 /// The lifetime parameters are as follows: 2229 /// 2230 /// * `'r` represents the lifetime of the `Regex` that produced this iterator. 2231 /// * `'h` represents the lifetime of the haystack being searched. 2232 /// 2233 /// This iterator can be created with the [`Regex::splitn`] method. 
2234 #[derive(Debug)] 2235 pub struct SplitN<'r, 'h> { 2236 splits: Split<'r, 'h>, 2237 limit: usize, 2238 } 2239 2240 impl<'r, 'h> SplitN<'r, 'h> { 2241 /// Returns the current `Input` associated with this iterator. 2242 /// 2243 /// The `start` position on the given `Input` may change during iteration, 2244 /// but all other values are guaranteed to remain invariant. 2245 #[inline] input<'s>(&'s self) -> &'s Input<'h>2246 pub fn input<'s>(&'s self) -> &'s Input<'h> { 2247 self.splits.input() 2248 } 2249 } 2250 2251 impl<'r, 'h> Iterator for SplitN<'r, 'h> { 2252 type Item = Span; 2253 next(&mut self) -> Option<Span>2254 fn next(&mut self) -> Option<Span> { 2255 if self.limit == 0 { 2256 return None; 2257 } 2258 2259 self.limit -= 1; 2260 if self.limit > 0 { 2261 return self.splits.next(); 2262 } 2263 2264 let len = self.splits.finder.it.input().haystack().len(); 2265 if self.splits.last > len { 2266 // We've already returned all substrings. 2267 None 2268 } else { 2269 // self.n == 0, so future calls will return None immediately 2270 Some(Span::from(self.splits.last..len)) 2271 } 2272 } 2273 size_hint(&self) -> (usize, Option<usize>)2274 fn size_hint(&self) -> (usize, Option<usize>) { 2275 (0, Some(self.limit)) 2276 } 2277 } 2278 2279 impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {} 2280 2281 /// Represents mutable scratch space used by regex engines during a search. 2282 /// 2283 /// Most of the regex engines in this crate require some kind of 2284 /// mutable state in order to execute a search. This mutable state is 2285 /// explicitly separated from the the core regex object (such as a 2286 /// [`thompson::NFA`](crate::nfa::thompson::NFA)) so that the read-only regex 2287 /// object can be shared across multiple threads simultaneously without any 2288 /// synchronization. 
Conversely, a `Cache` must either be duplicated if using 2289 /// the same `Regex` from multiple threads, or else there must be some kind of 2290 /// synchronization that guarantees exclusive access while it's in use by one 2291 /// thread. 2292 /// 2293 /// A `Regex` attempts to do this synchronization for you by using a thread 2294 /// pool internally. Its size scales roughly with the number of simultaneous 2295 /// regex searches. 2296 /// 2297 /// For cases where one does not want to rely on a `Regex`'s internal thread 2298 /// pool, lower level routines such as [`Regex::search_with`] are provided 2299 /// that permit callers to pass a `Cache` into the search routine explicitly. 2300 /// 2301 /// General advice is that the thread pool is often more than good enough. 2302 /// However, it may be possible to observe the effects of its latency, 2303 /// especially when searching many small haystacks from many threads 2304 /// simultaneously. 2305 /// 2306 /// Caches can be created from their corresponding `Regex` via 2307 /// [`Regex::create_cache`]. A cache can only be used with either the `Regex` 2308 /// that created it, or the `Regex` that was most recently used to reset it 2309 /// with [`Cache::reset`]. Using a cache with any other `Regex` may result in 2310 /// panics or incorrect results. 
2311 /// 2312 /// # Example 2313 /// 2314 /// ``` 2315 /// use regex_automata::{meta::Regex, Input, Match}; 2316 /// 2317 /// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?; 2318 /// let mut cache = re.create_cache(); 2319 /// let input = Input::new("crazy janey and her mission man"); 2320 /// assert_eq!( 2321 /// Some(Match::must(0, 20..31)), 2322 /// re.search_with(&mut cache, &input), 2323 /// ); 2324 /// 2325 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2326 /// ``` 2327 #[derive(Debug, Clone)] 2328 pub struct Cache { 2329 pub(crate) capmatches: Captures, 2330 pub(crate) pikevm: wrappers::PikeVMCache, 2331 pub(crate) backtrack: wrappers::BoundedBacktrackerCache, 2332 pub(crate) onepass: wrappers::OnePassCache, 2333 pub(crate) hybrid: wrappers::HybridCache, 2334 pub(crate) revhybrid: wrappers::ReverseHybridCache, 2335 } 2336 2337 impl Cache { 2338 /// Creates a new `Cache` for use with this regex. 2339 /// 2340 /// The cache returned should only be used for searches for the given 2341 /// `Regex`. If you want to reuse the cache for another `Regex`, then you 2342 /// must call [`Cache::reset`] with that `Regex`. new(re: &Regex) -> Cache2343 pub fn new(re: &Regex) -> Cache { 2344 re.create_cache() 2345 } 2346 2347 /// Reset this cache such that it can be used for searching with the given 2348 /// `Regex` (and only that `Regex`). 2349 /// 2350 /// A cache reset permits potentially reusing memory already allocated in 2351 /// this cache with a different `Regex`. 2352 /// 2353 /// # Example 2354 /// 2355 /// This shows how to re-purpose a cache for use with a different `Regex`. 
2356 /// 2357 /// ``` 2358 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2359 /// use regex_automata::{meta::Regex, Match, Input}; 2360 /// 2361 /// let re1 = Regex::new(r"\w")?; 2362 /// let re2 = Regex::new(r"\W")?; 2363 /// 2364 /// let mut cache = re1.create_cache(); 2365 /// assert_eq!( 2366 /// Some(Match::must(0, 0..2)), 2367 /// re1.search_with(&mut cache, &Input::new("Δ")), 2368 /// ); 2369 /// 2370 /// // Using 'cache' with re2 is not allowed. It may result in panics or 2371 /// // incorrect results. In order to re-purpose the cache, we must reset 2372 /// // it with the Regex we'd like to use it with. 2373 /// // 2374 /// // Similarly, after this reset, using the cache with 're1' is also not 2375 /// // allowed. 2376 /// cache.reset(&re2); 2377 /// assert_eq!( 2378 /// Some(Match::must(0, 0..3)), 2379 /// re2.search_with(&mut cache, &Input::new("☃")), 2380 /// ); 2381 /// 2382 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2383 /// ``` reset(&mut self, re: &Regex)2384 pub fn reset(&mut self, re: &Regex) { 2385 re.imp.strat.reset_cache(self) 2386 } 2387 2388 /// Returns the heap memory usage, in bytes, of this cache. 2389 /// 2390 /// This does **not** include the stack size used up by this cache. To 2391 /// compute that, use `std::mem::size_of::<Cache>()`. memory_usage(&self) -> usize2392 pub fn memory_usage(&self) -> usize { 2393 let mut bytes = 0; 2394 bytes += self.pikevm.memory_usage(); 2395 bytes += self.backtrack.memory_usage(); 2396 bytes += self.onepass.memory_usage(); 2397 bytes += self.hybrid.memory_usage(); 2398 bytes += self.revhybrid.memory_usage(); 2399 bytes 2400 } 2401 } 2402 2403 /// An object describing the configuration of a `Regex`. 2404 /// 2405 /// This configuration only includes options for the 2406 /// non-syntax behavior of a `Regex`, and can be applied via the 2407 /// [`Builder::configure`] method. For configuring the syntax options, see 2408 /// [`util::syntax::Config`](crate::util::syntax::Config). 
2409 /// 2410 /// # Example: lower the NFA size limit 2411 /// 2412 /// In some cases, the default size limit might be too big. The size limit can 2413 /// be lowered, which will prevent large regex patterns from compiling. 2414 /// 2415 /// ``` 2416 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2417 /// use regex_automata::meta::Regex; 2418 /// 2419 /// let result = Regex::builder() 2420 /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) 2421 /// // Not even 20KB is enough to build a single large Unicode class! 2422 /// .build(r"\pL"); 2423 /// assert!(result.is_err()); 2424 /// 2425 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2426 /// ``` 2427 #[derive(Clone, Debug, Default)] 2428 pub struct Config { 2429 // As with other configuration types in this crate, we put all our knobs 2430 // in options so that we can distinguish between "default" and "not set." 2431 // This makes it possible to easily combine multiple configurations 2432 // without default values overwriting explicitly specified values. See the 2433 // 'overwrite' method. 2434 // 2435 // For docs on the fields below, see the corresponding method setters. 2436 match_kind: Option<MatchKind>, 2437 utf8_empty: Option<bool>, 2438 autopre: Option<bool>, 2439 pre: Option<Option<Prefilter>>, 2440 which_captures: Option<WhichCaptures>, 2441 nfa_size_limit: Option<Option<usize>>, 2442 onepass_size_limit: Option<Option<usize>>, 2443 hybrid_cache_capacity: Option<usize>, 2444 hybrid: Option<bool>, 2445 dfa: Option<bool>, 2446 dfa_size_limit: Option<Option<usize>>, 2447 dfa_state_limit: Option<Option<usize>>, 2448 onepass: Option<bool>, 2449 backtrack: Option<bool>, 2450 byte_classes: Option<bool>, 2451 line_terminator: Option<u8>, 2452 } 2453 2454 impl Config { 2455 /// Create a new configuration object for a `Regex`. new() -> Config2456 pub fn new() -> Config { 2457 Config::default() 2458 } 2459 2460 /// Set the match semantics for a `Regex`. 
2461 /// 2462 /// The default value is [`MatchKind::LeftmostFirst`]. 2463 /// 2464 /// # Example 2465 /// 2466 /// ``` 2467 /// use regex_automata::{meta::Regex, Match, MatchKind}; 2468 /// 2469 /// // By default, leftmost-first semantics are used, which 2470 /// // disambiguates matches at the same position by selecting 2471 /// // the one that corresponds earlier in the pattern. 2472 /// let re = Regex::new("sam|samwise")?; 2473 /// assert_eq!(Some(Match::must(0, 0..3)), re.find("samwise")); 2474 /// 2475 /// // But with 'all' semantics, match priority is ignored 2476 /// // and all match states are included. When coupled with 2477 /// // a leftmost search, the search will report the last 2478 /// // possible match. 2479 /// let re = Regex::builder() 2480 /// .configure(Regex::config().match_kind(MatchKind::All)) 2481 /// .build("sam|samwise")?; 2482 /// assert_eq!(Some(Match::must(0, 0..7)), re.find("samwise")); 2483 /// // Beware that this can lead to skipping matches! 2484 /// // Usually 'all' is used for anchored reverse searches 2485 /// // only, or for overlapping searches. 2486 /// assert_eq!(Some(Match::must(0, 4..11)), re.find("sam samwise")); 2487 /// 2488 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2489 /// ``` match_kind(self, kind: MatchKind) -> Config2490 pub fn match_kind(self, kind: MatchKind) -> Config { 2491 Config { match_kind: Some(kind), ..self } 2492 } 2493 2494 /// Toggles whether empty matches are permitted to occur between the code 2495 /// units of a UTF-8 encoded codepoint. 2496 /// 2497 /// This should generally be enabled when search a `&str` or anything that 2498 /// you otherwise know is valid UTF-8. It should be disabled in all other 2499 /// cases. Namely, if the haystack is not valid UTF-8 and this is enabled, 2500 /// then behavior is unspecified. 2501 /// 2502 /// By default, this is enabled. 
2503 /// 2504 /// # Example 2505 /// 2506 /// ``` 2507 /// use regex_automata::{meta::Regex, Match}; 2508 /// 2509 /// let re = Regex::new("")?; 2510 /// let got: Vec<Match> = re.find_iter("☃").collect(); 2511 /// // Matches only occur at the beginning and end of the snowman. 2512 /// assert_eq!(got, vec![ 2513 /// Match::must(0, 0..0), 2514 /// Match::must(0, 3..3), 2515 /// ]); 2516 /// 2517 /// let re = Regex::builder() 2518 /// .configure(Regex::config().utf8_empty(false)) 2519 /// .build("")?; 2520 /// let got: Vec<Match> = re.find_iter("☃").collect(); 2521 /// // Matches now occur at every position! 2522 /// assert_eq!(got, vec![ 2523 /// Match::must(0, 0..0), 2524 /// Match::must(0, 1..1), 2525 /// Match::must(0, 2..2), 2526 /// Match::must(0, 3..3), 2527 /// ]); 2528 /// 2529 /// Ok::<(), Box<dyn std::error::Error>>(()) 2530 /// ``` utf8_empty(self, yes: bool) -> Config2531 pub fn utf8_empty(self, yes: bool) -> Config { 2532 Config { utf8_empty: Some(yes), ..self } 2533 } 2534 2535 /// Toggles whether automatic prefilter support is enabled. 2536 /// 2537 /// If this is disabled and [`Config::prefilter`] is not set, then the 2538 /// meta regex engine will not use any prefilters. This can sometimes 2539 /// be beneficial in cases where you know (or have measured) that the 2540 /// prefilter leads to overall worse search performance. 2541 /// 2542 /// By default, this is enabled. 
2543 /// 2544 /// # Example 2545 /// 2546 /// ``` 2547 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2548 /// use regex_automata::{meta::Regex, Match}; 2549 /// 2550 /// let re = Regex::builder() 2551 /// .configure(Regex::config().auto_prefilter(false)) 2552 /// .build(r"Bruce \w+")?; 2553 /// let hay = "Hello Bruce Springsteen!"; 2554 /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay)); 2555 /// 2556 /// Ok::<(), Box<dyn std::error::Error>>(()) 2557 /// ``` auto_prefilter(self, yes: bool) -> Config2558 pub fn auto_prefilter(self, yes: bool) -> Config { 2559 Config { autopre: Some(yes), ..self } 2560 } 2561 2562 /// Overrides and sets the prefilter to use inside a `Regex`. 2563 /// 2564 /// This permits one to forcefully set a prefilter in cases where the 2565 /// caller knows better than whatever the automatic prefilter logic is 2566 /// capable of. 2567 /// 2568 /// By default, this is set to `None` and an automatic prefilter will be 2569 /// used if one could be built. (Assuming [`Config::auto_prefilter`] is 2570 /// enabled, which it is by default.) 2571 /// 2572 /// # Example 2573 /// 2574 /// This example shows how to set your own prefilter. In the case of a 2575 /// pattern like `Bruce \w+`, the automatic prefilter is likely to be 2576 /// constructed in a way that it will look for occurrences of `Bruce `. 2577 /// In most cases, this is the best choice. But in some cases, it may be 2578 /// the case that running `memchr` on `B` is the best choice. One can 2579 /// achieve that behavior by overriding the automatic prefilter logic 2580 /// and providing a prefilter that just matches `B`. 
2581 /// 2582 /// ``` 2583 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2584 /// use regex_automata::{ 2585 /// meta::Regex, 2586 /// util::prefilter::Prefilter, 2587 /// Match, MatchKind, 2588 /// }; 2589 /// 2590 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["B"]) 2591 /// .expect("a prefilter"); 2592 /// let re = Regex::builder() 2593 /// .configure(Regex::config().prefilter(Some(pre))) 2594 /// .build(r"Bruce \w+")?; 2595 /// let hay = "Hello Bruce Springsteen!"; 2596 /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay)); 2597 /// 2598 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2599 /// ``` 2600 /// 2601 /// # Example: incorrect prefilters can lead to incorrect results! 2602 /// 2603 /// Be warned that setting an incorrect prefilter can lead to missed 2604 /// matches. So if you use this option, ensure your prefilter can _never_ 2605 /// report false negatives. (A false positive is, on the other hand, quite 2606 /// okay and generally unavoidable.) 2607 /// 2608 /// ``` 2609 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2610 /// use regex_automata::{ 2611 /// meta::Regex, 2612 /// util::prefilter::Prefilter, 2613 /// Match, MatchKind, 2614 /// }; 2615 /// 2616 /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Z"]) 2617 /// .expect("a prefilter"); 2618 /// let re = Regex::builder() 2619 /// .configure(Regex::config().prefilter(Some(pre))) 2620 /// .build(r"Bruce \w+")?; 2621 /// let hay = "Hello Bruce Springsteen!"; 2622 /// // Oops! No match found, but there should be one! 2623 /// assert_eq!(None, re.find(hay)); 2624 /// 2625 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2626 /// ``` prefilter(self, pre: Option<Prefilter>) -> Config2627 pub fn prefilter(self, pre: Option<Prefilter>) -> Config { 2628 Config { pre: Some(pre), ..self } 2629 } 2630 2631 /// Configures what kinds of groups are compiled as "capturing" in the 2632 /// underlying regex engine. 
2633 /// 2634 /// This is set to [`WhichCaptures::All`] by default. Callers may wish to 2635 /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the 2636 /// overhead of capture states for explicit groups. 2637 /// 2638 /// Note that another approach to avoiding the overhead of capture groups 2639 /// is by using non-capturing groups in the regex pattern. That is, 2640 /// `(?:a)` instead of `(a)`. This option is useful when you can't control 2641 /// the concrete syntax but know that you don't need the underlying capture 2642 /// states. For example, using `WhichCaptures::Implicit` will behave as if 2643 /// all explicit capturing groups in the pattern were non-capturing. 2644 /// 2645 /// Setting this to `WhichCaptures::None` is usually not the right thing to 2646 /// do. When no capture states are compiled, some regex engines (such as 2647 /// the `PikeVM`) won't be able to report match offsets. This will manifest 2648 /// as no match being found. 2649 /// 2650 /// # Example 2651 /// 2652 /// This example demonstrates how the results of capture groups can change 2653 /// based on this option. First we show the default (all capture groups in 2654 /// the pattern are capturing): 2655 /// 2656 /// ``` 2657 /// use regex_automata::{meta::Regex, Match, Span}; 2658 /// 2659 /// let re = Regex::new(r"foo([0-9]+)bar")?; 2660 /// let hay = "foo123bar"; 2661 /// 2662 /// let mut caps = re.create_captures(); 2663 /// re.captures(hay, &mut caps); 2664 /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); 2665 /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1)); 2666 /// 2667 /// Ok::<(), Box<dyn std::error::Error>>(()) 2668 /// ``` 2669 /// 2670 /// And now we show the behavior when we only include implicit capture 2671 /// groups. In this case, we can only find the overall match span, but the 2672 /// spans of any other explicit group don't exist because they are treated 2673 /// as non-capturing. 
(In effect, when `WhichCaptures::Implicit` is used, 2674 /// there is no real point in using [`Regex::captures`] since it will never 2675 /// be able to report more information than [`Regex::find`].) 2676 /// 2677 /// ``` 2678 /// use regex_automata::{ 2679 /// meta::Regex, 2680 /// nfa::thompson::WhichCaptures, 2681 /// Match, 2682 /// Span, 2683 /// }; 2684 /// 2685 /// let re = Regex::builder() 2686 /// .configure(Regex::config().which_captures(WhichCaptures::Implicit)) 2687 /// .build(r"foo([0-9]+)bar")?; 2688 /// let hay = "foo123bar"; 2689 /// 2690 /// let mut caps = re.create_captures(); 2691 /// re.captures(hay, &mut caps); 2692 /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); 2693 /// assert_eq!(None, caps.get_group(1)); 2694 /// 2695 /// Ok::<(), Box<dyn std::error::Error>>(()) 2696 /// ``` which_captures(mut self, which_captures: WhichCaptures) -> Config2697 pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { 2698 self.which_captures = Some(which_captures); 2699 self 2700 } 2701 2702 /// Sets the size limit, in bytes, to enforce on the construction of every 2703 /// NFA build by the meta regex engine. 2704 /// 2705 /// Setting it to `None` disables the limit. This is not recommended if 2706 /// you're compiling untrusted patterns. 2707 /// 2708 /// Note that this limit is applied to _each_ NFA built, and if any of 2709 /// them exceed the limit, then construction will fail. This limit does 2710 /// _not_ correspond to the total memory used by all NFAs in the meta regex 2711 /// engine. 2712 /// 2713 /// This defaults to some reasonable number that permits most reasonable 2714 /// patterns. 
2715 /// 2716 /// # Example 2717 /// 2718 /// ``` 2719 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2720 /// use regex_automata::meta::Regex; 2721 /// 2722 /// let result = Regex::builder() 2723 /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) 2724 /// // Not even 20KB is enough to build a single large Unicode class! 2725 /// .build(r"\pL"); 2726 /// assert!(result.is_err()); 2727 /// 2728 /// // But notice that building such a regex with the exact same limit 2729 /// // can succeed depending on other aspects of the configuration. For 2730 /// // example, a single *forward* NFA will (at time of writing) fit into 2731 /// // the 20KB limit, but a *reverse* NFA of the same pattern will not. 2732 /// // So if one configures a meta regex such that a reverse NFA is never 2733 /// // needed and thus never built, then the 20KB limit will be enough for 2734 /// // a pattern like \pL! 2735 /// let result = Regex::builder() 2736 /// .configure(Regex::config() 2737 /// .nfa_size_limit(Some(20 * (1<<10))) 2738 /// // The DFAs are the only thing that (currently) need a reverse 2739 /// // NFA. So if both are disabled, the meta regex engine will 2740 /// // skip building the reverse NFA. Note that this isn't an API 2741 /// // guarantee. A future semver compatible version may introduce 2742 /// // new use cases for a reverse NFA. 2743 /// .hybrid(false) 2744 /// .dfa(false) 2745 /// ) 2746 /// // Not even 20KB is enough to build a single large Unicode class! 2747 /// .build(r"\pL"); 2748 /// assert!(result.is_ok()); 2749 /// 2750 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2751 /// ``` nfa_size_limit(self, limit: Option<usize>) -> Config2752 pub fn nfa_size_limit(self, limit: Option<usize>) -> Config { 2753 Config { nfa_size_limit: Some(limit), ..self } 2754 } 2755 2756 /// Sets the size limit, in bytes, for the one-pass DFA. 2757 /// 2758 /// Setting it to `None` disables the limit. 
Disabling the limit is 2759 /// strongly discouraged when compiling untrusted patterns. Even if the 2760 /// patterns are trusted, it still may not be a good idea, since a one-pass 2761 /// DFA can use a lot of memory. With that said, as the size of a regex 2762 /// increases, the likelihood of it being one-pass likely decreases. 2763 /// 2764 /// This defaults to some reasonable number that permits most reasonable 2765 /// one-pass patterns. 2766 /// 2767 /// # Example 2768 /// 2769 /// This shows how to set the one-pass DFA size limit. Note that since 2770 /// a one-pass DFA is an optional component of the meta regex engine, 2771 /// this size limit only impacts what is built internally and will never 2772 /// determine whether a `Regex` itself fails to build. 2773 /// 2774 /// ``` 2775 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2776 /// use regex_automata::meta::Regex; 2777 /// 2778 /// let result = Regex::builder() 2779 /// .configure(Regex::config().onepass_size_limit(Some(2 * (1<<20)))) 2780 /// .build(r"\pL{5}"); 2781 /// assert!(result.is_ok()); 2782 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2783 /// ``` onepass_size_limit(self, limit: Option<usize>) -> Config2784 pub fn onepass_size_limit(self, limit: Option<usize>) -> Config { 2785 Config { onepass_size_limit: Some(limit), ..self } 2786 } 2787 2788 /// Set the cache capacity, in bytes, for the lazy DFA. 2789 /// 2790 /// The cache capacity of the lazy DFA determines approximately how much 2791 /// heap memory it is allowed to use to store its state transitions. The 2792 /// state transitions are computed at search time, and if the cache fills 2793 /// up it, it is cleared. At this point, any previously generated state 2794 /// transitions are lost and are re-generated if they're needed again. 2795 /// 2796 /// This sort of cache filling and clearing works quite well _so long as 2797 /// cache clearing happens infrequently_. 
If it happens too often, then the 2798 /// meta regex engine will stop using the lazy DFA and switch over to a 2799 /// different regex engine. 2800 /// 2801 /// In cases where the cache is cleared too often, it may be possible to 2802 /// give the cache more space and reduce (or eliminate) how often it is 2803 /// cleared. Similarly, sometimes a regex is so big that the lazy DFA isn't 2804 /// used at all if its cache capacity isn't big enough. 2805 /// 2806 /// The capacity set here is a _limit_ on how much memory is used. The 2807 /// actual memory used is only allocated as it's needed. 2808 /// 2809 /// Determining the right value for this is a little tricky and will likely 2810 /// required some profiling. Enabling the `logging` feature and setting the 2811 /// log level to `trace` will also tell you how often the cache is being 2812 /// cleared. 2813 /// 2814 /// # Example 2815 /// 2816 /// ``` 2817 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2818 /// use regex_automata::meta::Regex; 2819 /// 2820 /// let result = Regex::builder() 2821 /// .configure(Regex::config().hybrid_cache_capacity(20 * (1<<20))) 2822 /// .build(r"\pL{5}"); 2823 /// assert!(result.is_ok()); 2824 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2825 /// ``` hybrid_cache_capacity(self, limit: usize) -> Config2826 pub fn hybrid_cache_capacity(self, limit: usize) -> Config { 2827 Config { hybrid_cache_capacity: Some(limit), ..self } 2828 } 2829 2830 /// Sets the size limit, in bytes, for heap memory used for a fully 2831 /// compiled DFA. 2832 /// 2833 /// **NOTE:** If you increase this, you'll likely also need to increase 2834 /// [`Config::dfa_state_limit`]. 2835 /// 2836 /// In contrast to the lazy DFA, building a full DFA requires computing 2837 /// all of its state transitions up front. This can be a very expensive 2838 /// process, and runs in worst case `2^n` time and space (where `n` is 2839 /// proportional to the size of the regex). 
However, a full DFA unlocks 2840 /// some additional optimization opportunities. 2841 /// 2842 /// Because full DFAs can be so expensive, the default limits for them are 2843 /// incredibly small. Generally speaking, if your regex is moderately big 2844 /// or if you're using Unicode features (`\w` is Unicode-aware by default 2845 /// for example), then you can expect that the meta regex engine won't even 2846 /// attempt to build a DFA for it. 2847 /// 2848 /// If this and [`Config::dfa_state_limit`] are set to `None`, then the 2849 /// meta regex will not use any sort of limits when deciding whether to 2850 /// build a DFA. This in turn makes construction of a `Regex` take 2851 /// worst case exponential time and space. Even short patterns can result 2852 /// in huge space blow ups. So it is strongly recommended to keep some kind 2853 /// of limit set! 2854 /// 2855 /// The default is set to a small number that permits some simple regexes 2856 /// to get compiled into DFAs in reasonable time. 2857 /// 2858 /// # Example 2859 /// 2860 /// ``` 2861 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2862 /// use regex_automata::meta::Regex; 2863 /// 2864 /// let result = Regex::builder() 2865 /// // 100MB is much bigger than the default. 2866 /// .configure(Regex::config() 2867 /// .dfa_size_limit(Some(100 * (1<<20))) 2868 /// // We don't care about size too much here, so just 2869 /// // remove the NFA state limit altogether. 2870 /// .dfa_state_limit(None)) 2871 /// .build(r"\pL{5}"); 2872 /// assert!(result.is_ok()); 2873 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2874 /// ``` dfa_size_limit(self, limit: Option<usize>) -> Config2875 pub fn dfa_size_limit(self, limit: Option<usize>) -> Config { 2876 Config { dfa_size_limit: Some(limit), ..self } 2877 } 2878 2879 /// Sets a limit on the total number of NFA states, beyond which, a full 2880 /// DFA is not attempted to be compiled. 
2881 /// 2882 /// This limit works in concert with [`Config::dfa_size_limit`]. Namely, 2883 /// where as `Config::dfa_size_limit` is applied by attempting to construct 2884 /// a DFA, this limit is used to avoid the attempt in the first place. This 2885 /// is useful to avoid hefty initialization costs associated with building 2886 /// a DFA for cases where it is obvious the DFA will ultimately be too big. 2887 /// 2888 /// By default, this is set to a very small number. 2889 /// 2890 /// # Example 2891 /// 2892 /// ``` 2893 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 2894 /// use regex_automata::meta::Regex; 2895 /// 2896 /// let result = Regex::builder() 2897 /// .configure(Regex::config() 2898 /// // Sometimes the default state limit rejects DFAs even 2899 /// // if they would fit in the size limit. Here, we disable 2900 /// // the check on the number of NFA states and just rely on 2901 /// // the size limit. 2902 /// .dfa_state_limit(None)) 2903 /// .build(r"(?-u)\w{30}"); 2904 /// assert!(result.is_ok()); 2905 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2906 /// ``` dfa_state_limit(self, limit: Option<usize>) -> Config2907 pub fn dfa_state_limit(self, limit: Option<usize>) -> Config { 2908 Config { dfa_state_limit: Some(limit), ..self } 2909 } 2910 2911 /// Whether to attempt to shrink the size of the alphabet for the regex 2912 /// pattern or not. When enabled, the alphabet is shrunk into a set of 2913 /// equivalence classes, where every byte in the same equivalence class 2914 /// cannot discriminate between a match or non-match. 2915 /// 2916 /// **WARNING:** This is only useful for debugging DFAs. Disabling this 2917 /// does not yield any speed advantages. Indeed, disabling it can result 2918 /// in much higher memory usage. Disabling byte classes is useful for 2919 /// debugging the actual generated transitions because it lets one see the 2920 /// transitions defined on actual bytes instead of the equivalence classes. 
2921 /// 2922 /// This option is enabled by default and should never be disabled unless 2923 /// one is debugging the meta regex engine's internals. 2924 /// 2925 /// # Example 2926 /// 2927 /// ``` 2928 /// use regex_automata::{meta::Regex, Match}; 2929 /// 2930 /// let re = Regex::builder() 2931 /// .configure(Regex::config().byte_classes(false)) 2932 /// .build(r"[a-z]+")?; 2933 /// let hay = "!!quux!!"; 2934 /// assert_eq!(Some(Match::must(0, 2..6)), re.find(hay)); 2935 /// 2936 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2937 /// ``` byte_classes(self, yes: bool) -> Config2938 pub fn byte_classes(self, yes: bool) -> Config { 2939 Config { byte_classes: Some(yes), ..self } 2940 } 2941 2942 /// Set the line terminator to be used by the `^` and `$` anchors in 2943 /// multi-line mode. 2944 /// 2945 /// This option has no effect when CRLF mode is enabled. That is, 2946 /// regardless of this setting, `(?Rm:^)` and `(?Rm:$)` will always treat 2947 /// `\r` and `\n` as line terminators (and will never match between a `\r` 2948 /// and a `\n`). 2949 /// 2950 /// By default, `\n` is the line terminator. 2951 /// 2952 /// **Warning**: This does not change the behavior of `.`. To do that, 2953 /// you'll need to configure the syntax option 2954 /// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator) 2955 /// in addition to this. Otherwise, `.` will continue to match any 2956 /// character other than `\n`. 
2957 /// 2958 /// # Example 2959 /// 2960 /// ``` 2961 /// use regex_automata::{meta::Regex, util::syntax, Match}; 2962 /// 2963 /// let re = Regex::builder() 2964 /// .syntax(syntax::Config::new().multi_line(true)) 2965 /// .configure(Regex::config().line_terminator(b'\x00')) 2966 /// .build(r"^foo$")?; 2967 /// let hay = "\x00foo\x00"; 2968 /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); 2969 /// 2970 /// # Ok::<(), Box<dyn std::error::Error>>(()) 2971 /// ``` line_terminator(self, byte: u8) -> Config2972 pub fn line_terminator(self, byte: u8) -> Config { 2973 Config { line_terminator: Some(byte), ..self } 2974 } 2975 2976 /// Toggle whether the hybrid NFA/DFA (also known as the "lazy DFA") should 2977 /// be available for use by the meta regex engine. 2978 /// 2979 /// Enabling this does not necessarily mean that the lazy DFA will 2980 /// definitely be used. It just means that it will be _available_ for use 2981 /// if the meta regex engine thinks it will be useful. 2982 /// 2983 /// When the `hybrid` crate feature is enabled, then this is enabled by 2984 /// default. Otherwise, if the crate feature is disabled, then this is 2985 /// always disabled, regardless of its setting by the caller. hybrid(self, yes: bool) -> Config2986 pub fn hybrid(self, yes: bool) -> Config { 2987 Config { hybrid: Some(yes), ..self } 2988 } 2989 2990 /// Toggle whether a fully compiled DFA should be available for use by the 2991 /// meta regex engine. 2992 /// 2993 /// Enabling this does not necessarily mean that a DFA will definitely be 2994 /// used. It just means that it will be _available_ for use if the meta 2995 /// regex engine thinks it will be useful. 2996 /// 2997 /// When the `dfa-build` crate feature is enabled, then this is enabled by 2998 /// default. Otherwise, if the crate feature is disabled, then this is 2999 /// always disabled, regardless of its setting by the caller. 
dfa(self, yes: bool) -> Config3000 pub fn dfa(self, yes: bool) -> Config { 3001 Config { dfa: Some(yes), ..self } 3002 } 3003 3004 /// Toggle whether a one-pass DFA should be available for use by the meta 3005 /// regex engine. 3006 /// 3007 /// Enabling this does not necessarily mean that a one-pass DFA will 3008 /// definitely be used. It just means that it will be _available_ for 3009 /// use if the meta regex engine thinks it will be useful. (Indeed, a 3010 /// one-pass DFA can only be used when the regex is one-pass. See the 3011 /// [`dfa::onepass`](crate::dfa::onepass) module for more details.) 3012 /// 3013 /// When the `dfa-onepass` crate feature is enabled, then this is enabled 3014 /// by default. Otherwise, if the crate feature is disabled, then this is 3015 /// always disabled, regardless of its setting by the caller. onepass(self, yes: bool) -> Config3016 pub fn onepass(self, yes: bool) -> Config { 3017 Config { onepass: Some(yes), ..self } 3018 } 3019 3020 /// Toggle whether a bounded backtracking regex engine should be available 3021 /// for use by the meta regex engine. 3022 /// 3023 /// Enabling this does not necessarily mean that a bounded backtracker will 3024 /// definitely be used. It just means that it will be _available_ for use 3025 /// if the meta regex engine thinks it will be useful. 3026 /// 3027 /// When the `nfa-backtrack` crate feature is enabled, then this is enabled 3028 /// by default. Otherwise, if the crate feature is disabled, then this is 3029 /// always disabled, regardless of its setting by the caller. backtrack(self, yes: bool) -> Config3030 pub fn backtrack(self, yes: bool) -> Config { 3031 Config { backtrack: Some(yes), ..self } 3032 } 3033 3034 /// Returns the match kind on this configuration, as set by 3035 /// [`Config::match_kind`]. 3036 /// 3037 /// If it was not explicitly set, then a default value is returned. 
get_match_kind(&self) -> MatchKind3038 pub fn get_match_kind(&self) -> MatchKind { 3039 self.match_kind.unwrap_or(MatchKind::LeftmostFirst) 3040 } 3041 3042 /// Returns whether empty matches must fall on valid UTF-8 boundaries, as 3043 /// set by [`Config::utf8_empty`]. 3044 /// 3045 /// If it was not explicitly set, then a default value is returned. get_utf8_empty(&self) -> bool3046 pub fn get_utf8_empty(&self) -> bool { 3047 self.utf8_empty.unwrap_or(true) 3048 } 3049 3050 /// Returns whether automatic prefilters are enabled, as set by 3051 /// [`Config::auto_prefilter`]. 3052 /// 3053 /// If it was not explicitly set, then a default value is returned. get_auto_prefilter(&self) -> bool3054 pub fn get_auto_prefilter(&self) -> bool { 3055 self.autopre.unwrap_or(true) 3056 } 3057 3058 /// Returns a manually set prefilter, if one was set by 3059 /// [`Config::prefilter`]. 3060 /// 3061 /// If it was not explicitly set, then a default value is returned. get_prefilter(&self) -> Option<&Prefilter>3062 pub fn get_prefilter(&self) -> Option<&Prefilter> { 3063 self.pre.as_ref().unwrap_or(&None).as_ref() 3064 } 3065 3066 /// Returns the capture configuration, as set by 3067 /// [`Config::which_captures`]. 3068 /// 3069 /// If it was not explicitly set, then a default value is returned. get_which_captures(&self) -> WhichCaptures3070 pub fn get_which_captures(&self) -> WhichCaptures { 3071 self.which_captures.unwrap_or(WhichCaptures::All) 3072 } 3073 3074 /// Returns NFA size limit, as set by [`Config::nfa_size_limit`]. 3075 /// 3076 /// If it was not explicitly set, then a default value is returned. get_nfa_size_limit(&self) -> Option<usize>3077 pub fn get_nfa_size_limit(&self) -> Option<usize> { 3078 self.nfa_size_limit.unwrap_or(Some(10 * (1 << 20))) 3079 } 3080 3081 /// Returns one-pass DFA size limit, as set by 3082 /// [`Config::onepass_size_limit`]. 3083 /// 3084 /// If it was not explicitly set, then a default value is returned. 
get_onepass_size_limit(&self) -> Option<usize>3085 pub fn get_onepass_size_limit(&self) -> Option<usize> { 3086 self.onepass_size_limit.unwrap_or(Some(1 * (1 << 20))) 3087 } 3088 3089 /// Returns hybrid NFA/DFA cache capacity, as set by 3090 /// [`Config::hybrid_cache_capacity`]. 3091 /// 3092 /// If it was not explicitly set, then a default value is returned. get_hybrid_cache_capacity(&self) -> usize3093 pub fn get_hybrid_cache_capacity(&self) -> usize { 3094 self.hybrid_cache_capacity.unwrap_or(2 * (1 << 20)) 3095 } 3096 3097 /// Returns DFA size limit, as set by [`Config::dfa_size_limit`]. 3098 /// 3099 /// If it was not explicitly set, then a default value is returned. get_dfa_size_limit(&self) -> Option<usize>3100 pub fn get_dfa_size_limit(&self) -> Option<usize> { 3101 // The default for this is VERY small because building a full DFA is 3102 // ridiculously costly. But for regexes that are very small, it can be 3103 // beneficial to use a full DFA. In particular, a full DFA can enable 3104 // additional optimizations via something called "accelerated" states. 3105 // Namely, when there's a state with only a few outgoing transitions, 3106 // we can temporary suspend walking the transition table and use memchr 3107 // for just those outgoing transitions to skip ahead very quickly. 3108 // 3109 // Generally speaking, if Unicode is enabled in your regex and you're 3110 // using some kind of Unicode feature, then it's going to blow this 3111 // size limit. Moreover, Unicode tends to defeat the "accelerated" 3112 // state optimization too, so it's a double whammy. 3113 // 3114 // We also use a limit on the number of NFA states to avoid even 3115 // starting the DFA construction process. Namely, DFA construction 3116 // itself could make lots of initial allocs proportional to the size 3117 // of the NFA, and if the NFA is large, it doesn't make sense to pay 3118 // that cost if we know it's likely to be blown by a large margin. 
3119 self.dfa_size_limit.unwrap_or(Some(40 * (1 << 10))) 3120 } 3121 3122 /// Returns DFA size limit in terms of the number of states in the NFA, as 3123 /// set by [`Config::dfa_state_limit`]. 3124 /// 3125 /// If it was not explicitly set, then a default value is returned. get_dfa_state_limit(&self) -> Option<usize>3126 pub fn get_dfa_state_limit(&self) -> Option<usize> { 3127 // Again, as with the size limit, we keep this very small. 3128 self.dfa_state_limit.unwrap_or(Some(30)) 3129 } 3130 3131 /// Returns whether byte classes are enabled, as set by 3132 /// [`Config::byte_classes`]. 3133 /// 3134 /// If it was not explicitly set, then a default value is returned. get_byte_classes(&self) -> bool3135 pub fn get_byte_classes(&self) -> bool { 3136 self.byte_classes.unwrap_or(true) 3137 } 3138 3139 /// Returns the line terminator for this configuration, as set by 3140 /// [`Config::line_terminator`]. 3141 /// 3142 /// If it was not explicitly set, then a default value is returned. get_line_terminator(&self) -> u83143 pub fn get_line_terminator(&self) -> u8 { 3144 self.line_terminator.unwrap_or(b'\n') 3145 } 3146 3147 /// Returns whether the hybrid NFA/DFA regex engine may be used, as set by 3148 /// [`Config::hybrid`]. 3149 /// 3150 /// If it was not explicitly set, then a default value is returned. get_hybrid(&self) -> bool3151 pub fn get_hybrid(&self) -> bool { 3152 #[cfg(feature = "hybrid")] 3153 { 3154 self.hybrid.unwrap_or(true) 3155 } 3156 #[cfg(not(feature = "hybrid"))] 3157 { 3158 false 3159 } 3160 } 3161 3162 /// Returns whether the DFA regex engine may be used, as set by 3163 /// [`Config::dfa`]. 3164 /// 3165 /// If it was not explicitly set, then a default value is returned. 
get_dfa(&self) -> bool3166 pub fn get_dfa(&self) -> bool { 3167 #[cfg(feature = "dfa-build")] 3168 { 3169 self.dfa.unwrap_or(true) 3170 } 3171 #[cfg(not(feature = "dfa-build"))] 3172 { 3173 false 3174 } 3175 } 3176 3177 /// Returns whether the one-pass DFA regex engine may be used, as set by 3178 /// [`Config::onepass`]. 3179 /// 3180 /// If it was not explicitly set, then a default value is returned. get_onepass(&self) -> bool3181 pub fn get_onepass(&self) -> bool { 3182 #[cfg(feature = "dfa-onepass")] 3183 { 3184 self.onepass.unwrap_or(true) 3185 } 3186 #[cfg(not(feature = "dfa-onepass"))] 3187 { 3188 false 3189 } 3190 } 3191 3192 /// Returns whether the bounded backtracking regex engine may be used, as 3193 /// set by [`Config::backtrack`]. 3194 /// 3195 /// If it was not explicitly set, then a default value is returned. get_backtrack(&self) -> bool3196 pub fn get_backtrack(&self) -> bool { 3197 #[cfg(feature = "nfa-backtrack")] 3198 { 3199 self.backtrack.unwrap_or(true) 3200 } 3201 #[cfg(not(feature = "nfa-backtrack"))] 3202 { 3203 false 3204 } 3205 } 3206 3207 /// Overwrite the default configuration such that the options in `o` are 3208 /// always used. If an option in `o` is not set, then the corresponding 3209 /// option in `self` is used. If it's not set in `self` either, then it 3210 /// remains not set. 
pub(crate) fn overwrite(&self, o: Config) -> Config {
    // Take `o` apart once so that each option can be moved out by value.
    let Config {
        match_kind,
        utf8_empty,
        autopre,
        pre,
        which_captures,
        nfa_size_limit,
        onepass_size_limit,
        hybrid_cache_capacity,
        hybrid,
        dfa,
        dfa_size_limit,
        dfa_state_limit,
        onepass,
        backtrack,
        byte_classes,
        line_terminator,
    } = o;
    // For every option: prefer the value from `o`, otherwise use ours.
    Config {
        match_kind: match_kind.or(self.match_kind),
        utf8_empty: utf8_empty.or(self.utf8_empty),
        autopre: autopre.or(self.autopre),
        // The prefilter is only cloned when `o` did not provide one.
        pre: pre.or_else(|| self.pre.clone()),
        which_captures: which_captures.or(self.which_captures),
        nfa_size_limit: nfa_size_limit.or(self.nfa_size_limit),
        onepass_size_limit: onepass_size_limit.or(self.onepass_size_limit),
        hybrid_cache_capacity: hybrid_cache_capacity
            .or(self.hybrid_cache_capacity),
        hybrid: hybrid.or(self.hybrid),
        dfa: dfa.or(self.dfa),
        dfa_size_limit: dfa_size_limit.or(self.dfa_size_limit),
        dfa_state_limit: dfa_state_limit.or(self.dfa_state_limit),
        onepass: onepass.or(self.onepass),
        backtrack: backtrack.or(self.backtrack),
        byte_classes: byte_classes.or(self.byte_classes),
        line_terminator: line_terminator.or(self.line_terminator),
    }
}
}

/// A builder for configuring and constructing a `Regex`.
///
/// The builder permits configuring two different aspects of a `Regex`:
///
/// * [`Builder::configure`] will set high-level configuration options as
///   described by a [`Config`].
/// * [`Builder::syntax`] will set the syntax level configuration options
///   as described by a [`util::syntax::Config`](crate::util::syntax::Config).
///   This only applies when building a `Regex` from pattern strings.
///
/// Once configured, the builder can then be used to construct a `Regex` from
/// one of 4 different inputs:
///
/// * [`Builder::build`] creates a regex from a single pattern string.
/// * [`Builder::build_many`] creates a regex from many pattern strings.
/// * [`Builder::build_from_hir`] creates a regex from a
///   [`regex-syntax::Hir`](Hir) expression.
/// * [`Builder::build_many_from_hir`] creates a regex from many
///   [`regex-syntax::Hir`](Hir) expressions.
///
/// The latter two methods in particular provide a way to construct a fully
/// featured regular expression matcher directly from an `Hir` expression
/// without having to first convert it to a string. (This is in contrast to the
/// top-level `regex` crate which intentionally provides no such API in order
/// to avoid making `regex-syntax` a public dependency.)
///
/// As a convenience, this builder may be created via [`Regex::builder`], which
/// may help avoid an extra import.
///
/// # Example: change the line terminator
///
/// This example shows how to enable multi-line mode by default and change the
/// line terminator to the NUL byte:
///
/// ```
/// use regex_automata::{meta::Regex, util::syntax, Match};
///
/// let re = Regex::builder()
///     .syntax(syntax::Config::new().multi_line(true))
///     .configure(Regex::config().line_terminator(b'\x00'))
///     .build(r"^foo$")?;
/// let hay = "\x00foo\x00";
/// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: disable UTF-8 requirement
///
/// By default, regex patterns are required to match UTF-8. This includes
/// regex patterns that can produce matches of length zero. In the case of an
/// empty match, by default, matches will not appear between the code units of
/// a UTF-8 encoded codepoint.
///
/// However, it can be useful to disable this requirement, particularly if
/// you're searching things like `&[u8]` that are not known to be valid UTF-8.
3293 /// 3294 /// ``` 3295 /// use regex_automata::{meta::Regex, util::syntax, Match}; 3296 /// 3297 /// let mut builder = Regex::builder(); 3298 /// // Disables the requirement that non-empty matches match UTF-8. 3299 /// builder.syntax(syntax::Config::new().utf8(false)); 3300 /// // Disables the requirement that empty matches match UTF-8 boundaries. 3301 /// builder.configure(Regex::config().utf8_empty(false)); 3302 /// 3303 /// // We can match raw bytes via \xZZ syntax, but we need to disable 3304 /// // Unicode mode to do that. We could disable it everywhere, or just 3305 /// // selectively, as shown here. 3306 /// let re = builder.build(r"(?-u:\xFF)foo(?-u:\xFF)")?; 3307 /// let hay = b"\xFFfoo\xFF"; 3308 /// assert_eq!(Some(Match::must(0, 0..5)), re.find(hay)); 3309 /// 3310 /// // We can also match between code units. 3311 /// let re = builder.build(r"")?; 3312 /// let hay = "☃"; 3313 /// assert_eq!(re.find_iter(hay).collect::<Vec<Match>>(), vec![ 3314 /// Match::must(0, 0..0), 3315 /// Match::must(0, 1..1), 3316 /// Match::must(0, 2..2), 3317 /// Match::must(0, 3..3), 3318 /// ]); 3319 /// 3320 /// # Ok::<(), Box<dyn std::error::Error>>(()) 3321 /// ``` 3322 #[derive(Clone, Debug)] 3323 pub struct Builder { 3324 config: Config, 3325 ast: ast::parse::ParserBuilder, 3326 hir: hir::translate::TranslatorBuilder, 3327 } 3328 3329 impl Builder { 3330 /// Creates a new builder for configuring and constructing a [`Regex`]. new() -> Builder3331 pub fn new() -> Builder { 3332 Builder { 3333 config: Config::default(), 3334 ast: ast::parse::ParserBuilder::new(), 3335 hir: hir::translate::TranslatorBuilder::new(), 3336 } 3337 } 3338 3339 /// Builds a `Regex` from a single pattern string. 3340 /// 3341 /// If there was a problem parsing the pattern or a problem turning it into 3342 /// a regex matcher, then an error is returned. 3343 /// 3344 /// # Example 3345 /// 3346 /// This example shows how to configure syntax options. 
3347 /// 3348 /// ``` 3349 /// use regex_automata::{meta::Regex, util::syntax, Match}; 3350 /// 3351 /// let re = Regex::builder() 3352 /// .syntax(syntax::Config::new().crlf(true).multi_line(true)) 3353 /// .build(r"^foo$")?; 3354 /// let hay = "\r\nfoo\r\n"; 3355 /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); 3356 /// 3357 /// # Ok::<(), Box<dyn std::error::Error>>(()) 3358 /// ``` build(&self, pattern: &str) -> Result<Regex, BuildError>3359 pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { 3360 self.build_many(&[pattern]) 3361 } 3362 3363 /// Builds a `Regex` from many pattern strings. 3364 /// 3365 /// If there was a problem parsing any of the patterns or a problem turning 3366 /// them into a regex matcher, then an error is returned. 3367 /// 3368 /// # Example: finding the pattern that caused an error 3369 /// 3370 /// When a syntax error occurs, it is possible to ask which pattern 3371 /// caused the syntax error. 3372 /// 3373 /// ``` 3374 /// use regex_automata::{meta::Regex, PatternID}; 3375 /// 3376 /// let err = Regex::builder() 3377 /// .build_many(&["a", "b", r"\p{Foo}", "c"]) 3378 /// .unwrap_err(); 3379 /// assert_eq!(Some(PatternID::must(2)), err.pattern()); 3380 /// ``` 3381 /// 3382 /// # Example: zero patterns is valid 3383 /// 3384 /// Building a regex with zero patterns results in a regex that never 3385 /// matches anything. Because this routine is generic, passing an empty 3386 /// slice usually requires a turbo-fish (or something else to help type 3387 /// inference). 
3388 /// 3389 /// ``` 3390 /// use regex_automata::{meta::Regex, util::syntax, Match}; 3391 /// 3392 /// let re = Regex::builder() 3393 /// .build_many::<&str>(&[])?; 3394 /// assert_eq!(None, re.find("")); 3395 /// 3396 /// # Ok::<(), Box<dyn std::error::Error>>(()) 3397 /// ``` build_many<P: AsRef<str>>( &self, patterns: &[P], ) -> Result<Regex, BuildError>3398 pub fn build_many<P: AsRef<str>>( 3399 &self, 3400 patterns: &[P], 3401 ) -> Result<Regex, BuildError> { 3402 use crate::util::primitives::IteratorIndexExt; 3403 log! { 3404 debug!("building meta regex with {} patterns:", patterns.len()); 3405 for (pid, p) in patterns.iter().with_pattern_ids() { 3406 let p = p.as_ref(); 3407 // We might split a grapheme with this truncation logic, but 3408 // that's fine. We at least avoid splitting a codepoint. 3409 let maxoff = p 3410 .char_indices() 3411 .map(|(i, ch)| i + ch.len_utf8()) 3412 .take(1000) 3413 .last() 3414 .unwrap_or(0); 3415 if maxoff < p.len() { 3416 debug!("{:?}: {}[... snip ...]", pid, &p[..maxoff]); 3417 } else { 3418 debug!("{:?}: {}", pid, p); 3419 } 3420 } 3421 } 3422 let (mut asts, mut hirs) = (vec![], vec![]); 3423 for (pid, p) in patterns.iter().with_pattern_ids() { 3424 let ast = self 3425 .ast 3426 .build() 3427 .parse(p.as_ref()) 3428 .map_err(|err| BuildError::ast(pid, err))?; 3429 asts.push(ast); 3430 } 3431 for ((pid, p), ast) in 3432 patterns.iter().with_pattern_ids().zip(asts.iter()) 3433 { 3434 let hir = self 3435 .hir 3436 .build() 3437 .translate(p.as_ref(), ast) 3438 .map_err(|err| BuildError::hir(pid, err))?; 3439 hirs.push(hir); 3440 } 3441 self.build_many_from_hir(&hirs) 3442 } 3443 3444 /// Builds a `Regex` directly from an `Hir` expression. 3445 /// 3446 /// This is useful if you needed to parse a pattern string into an `Hir` 3447 /// for other reasons (such as analysis or transformations). 
This routine 3448 /// permits building a `Regex` directly from the `Hir` expression instead 3449 /// of first converting the `Hir` back to a pattern string. 3450 /// 3451 /// When using this method, any options set via [`Builder::syntax`] are 3452 /// ignored. Namely, the syntax options only apply when parsing a pattern 3453 /// string, which isn't relevant here. 3454 /// 3455 /// If there was a problem building the underlying regex matcher for the 3456 /// given `Hir`, then an error is returned. 3457 /// 3458 /// # Example 3459 /// 3460 /// This example shows how one can hand-construct an `Hir` expression and 3461 /// build a regex from it without doing any parsing at all. 3462 /// 3463 /// ``` 3464 /// use { 3465 /// regex_automata::{meta::Regex, Match}, 3466 /// regex_syntax::hir::{Hir, Look}, 3467 /// }; 3468 /// 3469 /// // (?Rm)^foo$ 3470 /// let hir = Hir::concat(vec![ 3471 /// Hir::look(Look::StartCRLF), 3472 /// Hir::literal("foo".as_bytes()), 3473 /// Hir::look(Look::EndCRLF), 3474 /// ]); 3475 /// let re = Regex::builder() 3476 /// .build_from_hir(&hir)?; 3477 /// let hay = "\r\nfoo\r\n"; 3478 /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); 3479 /// 3480 /// Ok::<(), Box<dyn std::error::Error>>(()) 3481 /// ``` build_from_hir(&self, hir: &Hir) -> Result<Regex, BuildError>3482 pub fn build_from_hir(&self, hir: &Hir) -> Result<Regex, BuildError> { 3483 self.build_many_from_hir(&[hir]) 3484 } 3485 3486 /// Builds a `Regex` directly from many `Hir` expressions. 3487 /// 3488 /// This is useful if you needed to parse pattern strings into `Hir` 3489 /// expressions for other reasons (such as analysis or transformations). 3490 /// This routine permits building a `Regex` directly from the `Hir` 3491 /// expressions instead of first converting the `Hir` expressions back to 3492 /// pattern strings. 3493 /// 3494 /// When using this method, any options set via [`Builder::syntax`] are 3495 /// ignored. 
Namely, the syntax options only apply when parsing a pattern 3496 /// string, which isn't relevant here. 3497 /// 3498 /// If there was a problem building the underlying regex matcher for the 3499 /// given `Hir` expressions, then an error is returned. 3500 /// 3501 /// Note that unlike [`Builder::build_many`], this can only fail as a 3502 /// result of building the underlying matcher. In that case, there is 3503 /// no single `Hir` expression that can be isolated as a reason for the 3504 /// failure. So if this routine fails, it's not possible to determine which 3505 /// `Hir` expression caused the failure. 3506 /// 3507 /// # Example 3508 /// 3509 /// This example shows how one can hand-construct multiple `Hir` 3510 /// expressions and build a single regex from them without doing any 3511 /// parsing at all. 3512 /// 3513 /// ``` 3514 /// use { 3515 /// regex_automata::{meta::Regex, Match}, 3516 /// regex_syntax::hir::{Hir, Look}, 3517 /// }; 3518 /// 3519 /// // (?Rm)^foo$ 3520 /// let hir1 = Hir::concat(vec![ 3521 /// Hir::look(Look::StartCRLF), 3522 /// Hir::literal("foo".as_bytes()), 3523 /// Hir::look(Look::EndCRLF), 3524 /// ]); 3525 /// // (?Rm)^bar$ 3526 /// let hir2 = Hir::concat(vec![ 3527 /// Hir::look(Look::StartCRLF), 3528 /// Hir::literal("bar".as_bytes()), 3529 /// Hir::look(Look::EndCRLF), 3530 /// ]); 3531 /// let re = Regex::builder() 3532 /// .build_many_from_hir(&[&hir1, &hir2])?; 3533 /// let hay = "\r\nfoo\r\nbar"; 3534 /// let got: Vec<Match> = re.find_iter(hay).collect(); 3535 /// let expected = vec![ 3536 /// Match::must(0, 2..5), 3537 /// Match::must(1, 7..10), 3538 /// ]; 3539 /// assert_eq!(expected, got); 3540 /// 3541 /// Ok::<(), Box<dyn std::error::Error>>(()) 3542 /// ``` build_many_from_hir<H: Borrow<Hir>>( &self, hirs: &[H], ) -> Result<Regex, BuildError>3543 pub fn build_many_from_hir<H: Borrow<Hir>>( 3544 &self, 3545 hirs: &[H], 3546 ) -> Result<Regex, BuildError> { 3547 let config = self.config.clone(); 3548 // We collect the 
HIRs into a vec so we can write internal routines 3549 // with '&[&Hir]'. i.e., Don't use generics everywhere to keep code 3550 // bloat down.. 3551 let hirs: Vec<&Hir> = hirs.iter().map(|hir| hir.borrow()).collect(); 3552 let info = RegexInfo::new(config, &hirs); 3553 let strat = strategy::new(&info, &hirs)?; 3554 let pool = { 3555 let strat = Arc::clone(&strat); 3556 let create: CachePoolFn = Box::new(move || strat.create_cache()); 3557 Pool::new(create) 3558 }; 3559 Ok(Regex { imp: Arc::new(RegexI { strat, info }), pool }) 3560 } 3561 3562 /// Configure the behavior of a `Regex`. 3563 /// 3564 /// This configuration controls non-syntax options related to the behavior 3565 /// of a `Regex`. This includes things like whether empty matches can split 3566 /// a codepoint, prefilters, line terminators and a long list of options 3567 /// for configuring which regex engines the meta regex engine will be able 3568 /// to use internally. 3569 /// 3570 /// # Example 3571 /// 3572 /// This example shows how to disable UTF-8 empty mode. This will permit 3573 /// empty matches to occur between the UTF-8 encoding of a codepoint. 3574 /// 3575 /// ``` 3576 /// use regex_automata::{meta::Regex, Match}; 3577 /// 3578 /// let re = Regex::new("")?; 3579 /// let got: Vec<Match> = re.find_iter("☃").collect(); 3580 /// // Matches only occur at the beginning and end of the snowman. 3581 /// assert_eq!(got, vec![ 3582 /// Match::must(0, 0..0), 3583 /// Match::must(0, 3..3), 3584 /// ]); 3585 /// 3586 /// let re = Regex::builder() 3587 /// .configure(Regex::config().utf8_empty(false)) 3588 /// .build("")?; 3589 /// let got: Vec<Match> = re.find_iter("☃").collect(); 3590 /// // Matches now occur at every position! 
3591 /// assert_eq!(got, vec![ 3592 /// Match::must(0, 0..0), 3593 /// Match::must(0, 1..1), 3594 /// Match::must(0, 2..2), 3595 /// Match::must(0, 3..3), 3596 /// ]); 3597 /// 3598 /// Ok::<(), Box<dyn std::error::Error>>(()) 3599 /// ``` configure(&mut self, config: Config) -> &mut Builder3600 pub fn configure(&mut self, config: Config) -> &mut Builder { 3601 self.config = self.config.overwrite(config); 3602 self 3603 } 3604 3605 /// Configure the syntax options when parsing a pattern string while 3606 /// building a `Regex`. 3607 /// 3608 /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`] 3609 /// are used. The other build methods accept `Hir` values, which have 3610 /// already been parsed. 3611 /// 3612 /// # Example 3613 /// 3614 /// This example shows how to enable case insensitive mode. 3615 /// 3616 /// ``` 3617 /// use regex_automata::{meta::Regex, util::syntax, Match}; 3618 /// 3619 /// let re = Regex::builder() 3620 /// .syntax(syntax::Config::new().case_insensitive(true)) 3621 /// .build(r"δ")?; 3622 /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ")); 3623 /// 3624 /// Ok::<(), Box<dyn std::error::Error>>(()) 3625 /// ``` syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder3626 pub fn syntax( 3627 &mut self, 3628 config: crate::util::syntax::Config, 3629 ) -> &mut Builder { 3630 config.apply_ast(&mut self.ast); 3631 config.apply_hir(&mut self.hir); 3632 self 3633 } 3634 } 3635 3636 #[cfg(test)] 3637 mod tests { 3638 use super::*; 3639 3640 // I found this in the course of building out the benchmark suite for 3641 // rebar. 3642 #[test] regression_suffix_literal_count()3643 fn regression_suffix_literal_count() { 3644 let _ = env_logger::try_init(); 3645 3646 let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); 3647 assert_eq!(1, re.find_iter("tingling").count()); 3648 } 3649 } 3650