1 #[cfg(feature = "std")] 2 use dense::{self, DenseDFA}; 3 use dfa::DFA; 4 #[cfg(feature = "std")] 5 use error::Result; 6 #[cfg(feature = "std")] 7 use sparse::SparseDFA; 8 #[cfg(feature = "std")] 9 use state_id::StateID; 10 11 /// A regular expression that uses deterministic finite automata for fast 12 /// searching. 13 /// 14 /// A regular expression is comprised of two DFAs, a "forward" DFA and a 15 /// "reverse" DFA. The forward DFA is responsible for detecting the end of a 16 /// match while the reverse DFA is responsible for detecting the start of a 17 /// match. Thus, in order to find the bounds of any given match, a forward 18 /// search must first be run followed by a reverse search. A match found by 19 /// the forward DFA guarantees that the reverse DFA will also find a match. 20 /// 21 /// The type of the DFA used by a `Regex` corresponds to the `D` type 22 /// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, 23 /// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a 24 /// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but 25 /// search faster, while sparse DFAs use less memory but search more slowly. 26 /// 27 /// By default, a regex's DFA type parameter is set to 28 /// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the 29 /// most convenient type that gives the best search performance. 30 /// 31 /// # Sparse DFAs 32 /// 33 /// Since a `Regex` is generic over the `DFA` trait, it can be used with any 34 /// kind of DFA. While this crate constructs dense DFAs by default, it is easy 35 /// enough to build corresponding sparse DFAs, and then build a regex from 36 /// them: 37 /// 38 /// ``` 39 /// use regex_automata::Regex; 40 /// 41 /// # fn example() -> Result<(), regex_automata::Error> { 42 /// // First, build a regex that uses dense DFAs. 43 /// let dense_re = Regex::new("foo[0-9]+")?; 44 /// 45 /// // Second, build sparse DFAs from the forward and reverse dense DFAs. 46 /// let fwd = dense_re.forward().to_sparse()?; 47 /// let rev = dense_re.reverse().to_sparse()?; 48 /// 49 /// // Third, build a new regex from the constituent sparse DFAs. 50 /// let sparse_re = Regex::from_dfas(fwd, rev); 51 /// 52 /// // A regex that uses sparse DFAs can be used just like with dense DFAs. 53 /// assert_eq!(true, sparse_re.is_match(b"foo123")); 54 /// # Ok(()) }; example().unwrap() 55 /// ``` 56 #[cfg(feature = "std")] 57 #[derive(Clone, Debug)] 58 pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> { 59 forward: D, 60 reverse: D, 61 } 62 63 /// A regular expression that uses deterministic finite automata for fast 64 /// searching. 65 /// 66 /// A regular expression is comprised of two DFAs, a "forward" DFA and a 67 /// "reverse" DFA. The forward DFA is responsible for detecting the end of a 68 /// match while the reverse DFA is responsible for detecting the start of a 69 /// match. Thus, in order to find the bounds of any given match, a forward 70 /// search must first be run followed by a reverse search. A match found by 71 /// the forward DFA guarantees that the reverse DFA will also find a match. 72 /// 73 /// The type of the DFA used by a `Regex` corresponds to the `D` type 74 /// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, 75 /// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a 76 /// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but 77 /// search faster, while sparse DFAs use less memory but search more slowly. 78 /// 79 /// When using this crate without the standard library, the `Regex` type has 80 /// no default type parameter. 81 /// 82 /// # Sparse DFAs 83 /// 84 /// Since a `Regex` is generic over the `DFA` trait, it can be used with any 85 /// kind of DFA. While this crate constructs dense DFAs by default, it is easy 86 /// enough to build corresponding sparse DFAs, and then build a regex from 87 /// them: 88 /// 89 /// ``` 90 /// use regex_automata::Regex; 91 /// 92 /// # fn example() -> Result<(), regex_automata::Error> { 93 /// // First, build a regex that uses dense DFAs. 94 /// let dense_re = Regex::new("foo[0-9]+")?; 95 /// 96 /// // Second, build sparse DFAs from the forward and reverse dense DFAs. 97 /// let fwd = dense_re.forward().to_sparse()?; 98 /// let rev = dense_re.reverse().to_sparse()?; 99 /// 100 /// // Third, build a new regex from the constituent sparse DFAs. 101 /// let sparse_re = Regex::from_dfas(fwd, rev); 102 /// 103 /// // A regex that uses sparse DFAs can be used just like with dense DFAs. 104 /// assert_eq!(true, sparse_re.is_match(b"foo123")); 105 /// # Ok(()) }; example().unwrap() 106 /// ``` 107 #[cfg(not(feature = "std"))] 108 #[derive(Clone, Debug)] 109 pub struct Regex<D> { 110 forward: D, 111 reverse: D, 112 } 113 114 #[cfg(feature = "std")] 115 impl Regex { 116 /// Parse the given regular expression using a default configuration and 117 /// return the corresponding regex. 118 /// 119 /// The default configuration uses `usize` for state IDs, premultiplies 120 /// them and reduces the alphabet size by splitting bytes into equivalence 121 /// classes. The underlying DFAs are *not* minimized. 122 /// 123 /// If you want a non-default configuration, then use the 124 /// [`RegexBuilder`](struct.RegexBuilder.html) 125 /// to set your own configuration. 126 /// 127 /// # Example 128 /// 129 /// ``` 130 /// use regex_automata::Regex; 131 /// 132 /// # fn example() -> Result<(), regex_automata::Error> { 133 /// let re = Regex::new("foo[0-9]+bar")?; 134 /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz")); 135 /// # Ok(()) }; example().unwrap() 136 /// ``` new(pattern: &str) -> Result<Regex>137 pub fn new(pattern: &str) -> Result<Regex> { 138 RegexBuilder::new().build(pattern) 139 } 140 } 141 142 #[cfg(feature = "std")] 143 impl Regex<SparseDFA<Vec<u8>, usize>> { 144 /// Parse the given regular expression using a default configuration and 145 /// return the corresponding regex using sparse DFAs. 146 /// 147 /// The default configuration uses `usize` for state IDs, reduces the 148 /// alphabet size by splitting bytes into equivalence classes. The 149 /// underlying DFAs are *not* minimized. 150 /// 151 /// If you want a non-default configuration, then use the 152 /// [`RegexBuilder`](struct.RegexBuilder.html) 153 /// to set your own configuration. 154 /// 155 /// # Example 156 /// 157 /// ``` 158 /// use regex_automata::Regex; 159 /// 160 /// # fn example() -> Result<(), regex_automata::Error> { 161 /// let re = Regex::new_sparse("foo[0-9]+bar")?; 162 /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz")); 163 /// # Ok(()) }; example().unwrap() 164 /// ``` new_sparse( pattern: &str, ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>>165 pub fn new_sparse( 166 pattern: &str, 167 ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> { 168 RegexBuilder::new().build_sparse(pattern) 169 } 170 } 171 172 impl<D: DFA> Regex<D> { 173 /// Returns true if and only if the given bytes match. 174 /// 175 /// This routine may short circuit if it knows that scanning future input 176 /// will never lead to a different result. In particular, if the underlying 177 /// DFA enters a match state or a dead state, then this routine will return 178 /// `true` or `false`, respectively, without inspecting any future input. 179 /// 180 /// # Example 181 /// 182 /// ``` 183 /// use regex_automata::Regex; 184 /// 185 /// # fn example() -> Result<(), regex_automata::Error> { 186 /// let re = Regex::new("foo[0-9]+bar")?; 187 /// assert_eq!(true, re.is_match(b"foo12345bar")); 188 /// assert_eq!(false, re.is_match(b"foobar")); 189 /// # Ok(()) }; example().unwrap() 190 /// ``` is_match(&self, input: &[u8]) -> bool191 pub fn is_match(&self, input: &[u8]) -> bool { 192 self.is_match_at(input, 0) 193 } 194 195 /// Returns the first position at which a match is found. 196 /// 197 /// This routine stops scanning input in precisely the same circumstances 198 /// as `is_match`. The key difference is that this routine returns the 199 /// position at which it stopped scanning input if and only if a match 200 /// was found. If no match is found, then `None` is returned. 201 /// 202 /// # Example 203 /// 204 /// ``` 205 /// use regex_automata::Regex; 206 /// 207 /// # fn example() -> Result<(), regex_automata::Error> { 208 /// let re = Regex::new("foo[0-9]+")?; 209 /// assert_eq!(Some(4), re.shortest_match(b"foo12345")); 210 /// 211 /// // Normally, the end of the leftmost first match here would be 3, 212 /// // but the shortest match semantics detect a match earlier. 213 /// let re = Regex::new("abc|a")?; 214 /// assert_eq!(Some(1), re.shortest_match(b"abc")); 215 /// # Ok(()) }; example().unwrap() 216 /// ``` shortest_match(&self, input: &[u8]) -> Option<usize>217 pub fn shortest_match(&self, input: &[u8]) -> Option<usize> { 218 self.shortest_match_at(input, 0) 219 } 220 221 /// Returns the start and end offset of the leftmost first match. If no 222 /// match exists, then `None` is returned. 223 /// 224 /// The "leftmost first" match corresponds to the match with the smallest 225 /// starting offset, but where the end offset is determined by preferring 226 /// earlier branches in the original regular expression. For example, 227 /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will 228 /// match `Samwise` in `Samwise`. 229 /// 230 /// Generally speaking, the "leftmost first" match is how most backtracking 231 /// regular expressions tend to work. This is in contrast to POSIX-style 232 /// regular expressions that yield "leftmost longest" matches. Namely, 233 /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using 234 /// leftmost longest semantics. 235 /// 236 /// # Example 237 /// 238 /// ``` 239 /// use regex_automata::Regex; 240 /// 241 /// # fn example() -> Result<(), regex_automata::Error> { 242 /// let re = Regex::new("foo[0-9]+")?; 243 /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz")); 244 /// 245 /// // Even though a match is found after reading the first byte (`a`), 246 /// // the leftmost first match semantics demand that we find the earliest 247 /// // match that prefers earlier parts of the pattern over latter parts. 248 /// let re = Regex::new("abc|a")?; 249 /// assert_eq!(Some((0, 3)), re.find(b"abc")); 250 /// # Ok(()) }; example().unwrap() 251 /// ``` find(&self, input: &[u8]) -> Option<(usize, usize)>252 pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> { 253 self.find_at(input, 0) 254 } 255 256 /// Returns the same as `is_match`, but starts the search at the given 257 /// offset. 258 /// 259 /// The significance of the starting point is that it takes the surrounding 260 /// context into consideration. For example, if the DFA is anchored, then 261 /// a match can only occur when `start == 0`. is_match_at(&self, input: &[u8], start: usize) -> bool262 pub fn is_match_at(&self, input: &[u8], start: usize) -> bool { 263 self.forward().is_match_at(input, start) 264 } 265 266 /// Returns the same as `shortest_match`, but starts the search at the 267 /// given offset. 268 /// 269 /// The significance of the starting point is that it takes the surrounding 270 /// context into consideration. For example, if the DFA is anchored, then 271 /// a match can only occur when `start == 0`. shortest_match_at( &self, input: &[u8], start: usize, ) -> Option<usize>272 pub fn shortest_match_at( 273 &self, 274 input: &[u8], 275 start: usize, 276 ) -> Option<usize> { 277 self.forward().shortest_match_at(input, start) 278 } 279 280 /// Returns the same as `find`, but starts the search at the given 281 /// offset. 282 /// 283 /// The significance of the starting point is that it takes the surrounding 284 /// context into consideration. For example, if the DFA is anchored, then 285 /// a match can only occur when `start == 0`. find_at( &self, input: &[u8], start: usize, ) -> Option<(usize, usize)>286 pub fn find_at( 287 &self, 288 input: &[u8], 289 start: usize, 290 ) -> Option<(usize, usize)> { 291 let end = match self.forward().find_at(input, start) { 292 None => return None, 293 Some(end) => end, 294 }; 295 let start = self 296 .reverse() 297 .rfind(&input[start..end]) 298 .map(|i| start + i) 299 .expect("reverse search must match if forward search does"); 300 Some((start, end)) 301 } 302 303 /// Returns an iterator over all non-overlapping leftmost first matches 304 /// in the given bytes. If no match exists, then the iterator yields no 305 /// elements. 306 /// 307 /// Note that if the regex can match the empty string, then it is 308 /// possible for the iterator to yield a zero-width match at a location 309 /// that is not a valid UTF-8 boundary (for example, between the code units 310 /// of a UTF-8 encoded codepoint). This can happen regardless of whether 311 /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8) 312 /// was enabled or not. 313 /// 314 /// # Example 315 /// 316 /// ``` 317 /// use regex_automata::Regex; 318 /// 319 /// # fn example() -> Result<(), regex_automata::Error> { 320 /// let re = Regex::new("foo[0-9]+")?; 321 /// let text = b"foo1 foo12 foo123"; 322 /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); 323 /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]); 324 /// # Ok(()) }; example().unwrap() 325 /// ``` find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D>326 pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> { 327 Matches::new(self, input) 328 } 329 330 /// Build a new regex from its constituent forward and reverse DFAs. 331 /// 332 /// This is useful when deserializing a regex from some arbitrary 333 /// memory region. This is also useful for building regexes from other 334 /// types of DFAs. 335 /// 336 /// # Example 337 /// 338 /// This example is a bit a contrived. The usual use of these methods 339 /// would involve serializing `initial_re` somewhere and then deserializing 340 /// it later to build a regex. 341 /// 342 /// ``` 343 /// use regex_automata::Regex; 344 /// 345 /// # fn example() -> Result<(), regex_automata::Error> { 346 /// let initial_re = Regex::new("foo[0-9]+")?; 347 /// assert_eq!(true, initial_re.is_match(b"foo123")); 348 /// 349 /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); 350 /// let re = Regex::from_dfas(fwd, rev); 351 /// assert_eq!(true, re.is_match(b"foo123")); 352 /// # Ok(()) }; example().unwrap() 353 /// ``` 354 /// 355 /// This example shows how you might build smaller DFAs, and then use those 356 /// smaller DFAs to build a new regex. 357 /// 358 /// ``` 359 /// use regex_automata::Regex; 360 /// 361 /// # fn example() -> Result<(), regex_automata::Error> { 362 /// let initial_re = Regex::new("foo[0-9]+")?; 363 /// assert_eq!(true, initial_re.is_match(b"foo123")); 364 /// 365 /// let fwd = initial_re.forward().to_u16()?; 366 /// let rev = initial_re.reverse().to_u16()?; 367 /// let re = Regex::from_dfas(fwd, rev); 368 /// assert_eq!(true, re.is_match(b"foo123")); 369 /// # Ok(()) }; example().unwrap() 370 /// ``` 371 /// 372 /// This example shows how to build a `Regex` that uses sparse DFAs instead 373 /// of dense DFAs: 374 /// 375 /// ``` 376 /// use regex_automata::Regex; 377 /// 378 /// # fn example() -> Result<(), regex_automata::Error> { 379 /// let initial_re = Regex::new("foo[0-9]+")?; 380 /// assert_eq!(true, initial_re.is_match(b"foo123")); 381 /// 382 /// let fwd = initial_re.forward().to_sparse()?; 383 /// let rev = initial_re.reverse().to_sparse()?; 384 /// let re = Regex::from_dfas(fwd, rev); 385 /// assert_eq!(true, re.is_match(b"foo123")); 386 /// # Ok(()) }; example().unwrap() 387 /// ``` from_dfas(forward: D, reverse: D) -> Regex<D>388 pub fn from_dfas(forward: D, reverse: D) -> Regex<D> { 389 Regex { forward, reverse } 390 } 391 392 /// Return the underlying DFA responsible for forward matching. forward(&self) -> &D393 pub fn forward(&self) -> &D { 394 &self.forward 395 } 396 397 /// Return the underlying DFA responsible for reverse matching. reverse(&self) -> &D398 pub fn reverse(&self) -> &D { 399 &self.reverse 400 } 401 } 402 403 /// An iterator over all non-overlapping matches for a particular search. 404 /// 405 /// The iterator yields a `(usize, usize)` value until no more matches could be 406 /// found. The first `usize` is the start of the match (inclusive) while the 407 /// second `usize` is the end of the match (exclusive). 408 /// 409 /// `S` is the type used to represent state identifiers in the underlying 410 /// regex. The lifetime variables are as follows: 411 /// 412 /// * `'r` is the lifetime of the regular expression value itself. 413 /// * `'t` is the lifetime of the text being searched. 414 #[derive(Clone, Debug)] 415 pub struct Matches<'r, 't, D: DFA + 'r> { 416 re: &'r Regex<D>, 417 text: &'t [u8], 418 last_end: usize, 419 last_match: Option<usize>, 420 } 421 422 impl<'r, 't, D: DFA> Matches<'r, 't, D> { new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D>423 fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> { 424 Matches { re, text, last_end: 0, last_match: None } 425 } 426 } 427 428 impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> { 429 type Item = (usize, usize); 430 next(&mut self) -> Option<(usize, usize)>431 fn next(&mut self) -> Option<(usize, usize)> { 432 if self.last_end > self.text.len() { 433 return None; 434 } 435 let (s, e) = match self.re.find_at(self.text, self.last_end) { 436 None => return None, 437 Some((s, e)) => (s, e), 438 }; 439 if s == e { 440 // This is an empty match. To ensure we make progress, start 441 // the next search at the smallest possible starting position 442 // of the next match following this one. 443 self.last_end = e + 1; 444 // Don't accept empty matches immediately following a match. 445 // Just move on to the next match. 446 if Some(e) == self.last_match { 447 return self.next(); 448 } 449 } else { 450 self.last_end = e; 451 } 452 self.last_match = Some(e); 453 Some((s, e)) 454 } 455 } 456 457 /// A builder for a regex based on deterministic finite automatons. 458 /// 459 /// This builder permits configuring several aspects of the construction 460 /// process such as case insensitivity, Unicode support and various options 461 /// that impact the size of the underlying DFAs. In some cases, options (like 462 /// performing DFA minimization) can come with a substantial additional cost. 463 /// 464 /// This builder generally constructs two DFAs, where one is responsible for 465 /// finding the end of a match and the other is responsible for finding the 466 /// start of a match. If you only need to detect whether something matched, 467 /// or only the end of a match, then you should use a 468 /// [`dense::Builder`](dense/struct.Builder.html) 469 /// to construct a single DFA, which is cheaper than building two DFAs. 470 #[cfg(feature = "std")] 471 #[derive(Clone, Debug)] 472 pub struct RegexBuilder { 473 dfa: dense::Builder, 474 } 475 476 #[cfg(feature = "std")] 477 impl RegexBuilder { 478 /// Create a new regex builder with the default configuration. new() -> RegexBuilder479 pub fn new() -> RegexBuilder { 480 RegexBuilder { dfa: dense::Builder::new() } 481 } 482 483 /// Build a regex from the given pattern. 484 /// 485 /// If there was a problem parsing or compiling the pattern, then an error 486 /// is returned. build(&self, pattern: &str) -> Result<Regex>487 pub fn build(&self, pattern: &str) -> Result<Regex> { 488 self.build_with_size::<usize>(pattern) 489 } 490 491 /// Build a regex from the given pattern using sparse DFAs. 492 /// 493 /// If there was a problem parsing or compiling the pattern, then an error 494 /// is returned. build_sparse( &self, pattern: &str, ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>>495 pub fn build_sparse( 496 &self, 497 pattern: &str, 498 ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> { 499 self.build_with_size_sparse::<usize>(pattern) 500 } 501 502 /// Build a regex from the given pattern using a specific representation 503 /// for the underlying DFA state IDs. 504 /// 505 /// If there was a problem parsing or compiling the pattern, then an error 506 /// is returned. 507 /// 508 /// The representation of state IDs is determined by the `S` type 509 /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64` 510 /// or `usize`, where `usize` is the default used for `build`. The purpose 511 /// of specifying a representation for state IDs is to reduce the memory 512 /// footprint of the underlying DFAs. 513 /// 514 /// When using this routine, the chosen state ID representation will be 515 /// used throughout determinization and minimization, if minimization was 516 /// requested. Even if the minimized DFAs can fit into the chosen state ID 517 /// representation but the initial determinized DFA cannot, then this will 518 /// still return an error. To get a minimized DFA with a smaller state ID 519 /// representation, first build it with a bigger state ID representation, 520 /// and then shrink the sizes of the DFAs using one of its conversion 521 /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). 522 /// Finally, reconstitute the regex via 523 /// [`Regex::from_dfa`](struct.Regex.html#method.from_dfa). build_with_size<S: StateID>( &self, pattern: &str, ) -> Result<Regex<DenseDFA<Vec<S>, S>>>524 pub fn build_with_size<S: StateID>( 525 &self, 526 pattern: &str, 527 ) -> Result<Regex<DenseDFA<Vec<S>, S>>> { 528 let forward = self.dfa.build_with_size(pattern)?; 529 let reverse = self 530 .dfa 531 .clone() 532 .anchored(true) 533 .reverse(true) 534 .longest_match(true) 535 .build_with_size(pattern)?; 536 Ok(Regex::from_dfas(forward, reverse)) 537 } 538 539 /// Build a regex from the given pattern using a specific representation 540 /// for the underlying DFA state IDs using sparse DFAs. build_with_size_sparse<S: StateID>( &self, pattern: &str, ) -> Result<Regex<SparseDFA<Vec<u8>, S>>>541 pub fn build_with_size_sparse<S: StateID>( 542 &self, 543 pattern: &str, 544 ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> { 545 let re = self.build_with_size(pattern)?; 546 let fwd = re.forward().to_sparse()?; 547 let rev = re.reverse().to_sparse()?; 548 Ok(Regex::from_dfas(fwd, rev)) 549 } 550 551 /// Set whether matching must be anchored at the beginning of the input. 552 /// 553 /// When enabled, a match must begin at the start of the input. When 554 /// disabled, the regex will act as if the pattern started with a `.*?`, 555 /// which enables a match to appear anywhere. 556 /// 557 /// By default this is disabled. anchored(&mut self, yes: bool) -> &mut RegexBuilder558 pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder { 559 self.dfa.anchored(yes); 560 self 561 } 562 563 /// Enable or disable the case insensitive flag by default. 564 /// 565 /// By default this is disabled. It may alternatively be selectively 566 /// enabled in the regular expression itself via the `i` flag. case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder567 pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { 568 self.dfa.case_insensitive(yes); 569 self 570 } 571 572 /// Enable verbose mode in the regular expression. 573 /// 574 /// When enabled, verbose mode permits insigificant whitespace in many 575 /// places in the regular expression, as well as comments. Comments are 576 /// started using `#` and continue until the end of the line. 577 /// 578 /// By default, this is disabled. It may be selectively enabled in the 579 /// regular expression by using the `x` flag regardless of this setting. ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder580 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { 581 self.dfa.ignore_whitespace(yes); 582 self 583 } 584 585 /// Enable or disable the "dot matches any character" flag by default. 586 /// 587 /// By default this is disabled. It may alternatively be selectively 588 /// enabled in the regular expression itself via the `s` flag. dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder589 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { 590 self.dfa.dot_matches_new_line(yes); 591 self 592 } 593 594 /// Enable or disable the "swap greed" flag by default. 595 /// 596 /// By default this is disabled. It may alternatively be selectively 597 /// enabled in the regular expression itself via the `U` flag. swap_greed(&mut self, yes: bool) -> &mut RegexBuilder598 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { 599 self.dfa.swap_greed(yes); 600 self 601 } 602 603 /// Enable or disable the Unicode flag (`u`) by default. 604 /// 605 /// By default this is **enabled**. It may alternatively be selectively 606 /// disabled in the regular expression itself via the `u` flag. 607 /// 608 /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by 609 /// default), a regular expression will fail to parse if Unicode mode is 610 /// disabled and a sub-expression could possibly match invalid UTF-8. unicode(&mut self, yes: bool) -> &mut RegexBuilder611 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { 612 self.dfa.unicode(yes); 613 self 614 } 615 616 /// When enabled, the builder will permit the construction of a regular 617 /// expression that may match invalid UTF-8. 618 /// 619 /// When disabled (the default), the builder is guaranteed to produce a 620 /// regex that will only ever match valid UTF-8 (otherwise, the builder 621 /// will return an error). allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder622 pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder { 623 self.dfa.allow_invalid_utf8(yes); 624 self 625 } 626 627 /// Set the nesting limit used for the regular expression parser. 628 /// 629 /// The nesting limit controls how deep the abstract syntax tree is allowed 630 /// to be. If the AST exceeds the given limit (e.g., with too many nested 631 /// groups), then an error is returned by the parser. 632 /// 633 /// The purpose of this limit is to act as a heuristic to prevent stack 634 /// overflow when building a finite automaton from a regular expression's 635 /// abstract syntax tree. In particular, construction currently uses 636 /// recursion. In the future, the implementation may stop using recursion 637 /// and this option will no longer be necessary. 638 /// 639 /// This limit is not checked until the entire AST is parsed. Therefore, 640 /// if callers want to put a limit on the amount of heap space used, then 641 /// they should impose a limit on the length, in bytes, of the concrete 642 /// pattern string. In particular, this is viable since the parser will 643 /// limit itself to heap space proportional to the lenth of the pattern 644 /// string. 645 /// 646 /// Note that a nest limit of `0` will return a nest limit error for most 647 /// patterns but not all. For example, a nest limit of `0` permits `a` but 648 /// not `ab`, since `ab` requires a concatenation AST item, which results 649 /// in a nest depth of `1`. In general, a nest limit is not something that 650 /// manifests in an obvious way in the concrete syntax, therefore, it 651 /// should not be used in a granular way. nest_limit(&mut self, limit: u32) -> &mut RegexBuilder652 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { 653 self.dfa.nest_limit(limit); 654 self 655 } 656 657 /// Minimize the underlying DFAs. 658 /// 659 /// When enabled, the DFAs powering the resulting regex will be minimized 660 /// such that it is as small as possible. 661 /// 662 /// Whether one enables minimization or not depends on the types of costs 663 /// you're willing to pay and how much you care about its benefits. In 664 /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)` 665 /// space, where `n` is the number of DFA states and `k` is the alphabet 666 /// size. In practice, minimization can be quite costly in terms of both 667 /// space and time, so it should only be done if you're willing to wait 668 /// longer to produce a DFA. In general, you might want a minimal DFA in 669 /// the following circumstances: 670 /// 671 /// 1. You would like to optimize for the size of the automaton. This can 672 /// manifest in one of two ways. Firstly, if you're converting the 673 /// DFA into Rust code (or a table embedded in the code), then a minimal 674 /// DFA will translate into a corresponding reduction in code size, and 675 /// thus, also the final compiled binary size. Secondly, if you are 676 /// building many DFAs and putting them on the heap, you'll be able to 677 /// fit more if they are smaller. Note though that building a minimal 678 /// DFA itself requires additional space; you only realize the space 679 /// savings once the minimal DFA is constructed (at which point, the 680 /// space used for minimization is freed). 681 /// 2. You've observed that a smaller DFA results in faster match 682 /// performance. Naively, this isn't guaranteed since there is no 683 /// inherent difference between matching with a bigger-than-minimal 684 /// DFA and a minimal DFA. However, a smaller DFA may make use of your 685 /// CPU's cache more efficiently. 686 /// 3. You are trying to establish an equivalence between regular 687 /// languages. The standard method for this is to build a minimal DFA 688 /// for each language and then compare them. If the DFAs are equivalent 689 /// (up to state renaming), then the languages are equivalent. 690 /// 691 /// This option is disabled by default. minimize(&mut self, yes: bool) -> &mut RegexBuilder692 pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder { 693 self.dfa.minimize(yes); 694 self 695 } 696 697 /// Premultiply state identifiers in the underlying DFA transition tables. 698 /// 699 /// When enabled, state identifiers are premultiplied to point to their 700 /// corresponding row in the DFA's transition table. That is, given the 701 /// `i`th state, its corresponding premultiplied identifier is `i * k` 702 /// where `k` is the alphabet size of the DFA. (The alphabet size is at 703 /// most 256, but is in practice smaller if byte classes is enabled.) 704 /// 705 /// When state identifiers are not premultiplied, then the identifier of 706 /// the `i`th state is `i`. 707 /// 708 /// The advantage of premultiplying state identifiers is that is saves 709 /// a multiplication instruction per byte when searching with the DFA. 710 /// This has been observed to lead to a 20% performance benefit in 711 /// micro-benchmarks. 712 /// 713 /// The primary disadvantage of premultiplying state identifiers is 714 /// that they require a larger integer size to represent. For example, 715 /// if your DFA has 200 states, then its premultiplied form requires 716 /// 16 bits to represent every possible state identifier, where as its 717 /// non-premultiplied form only requires 8 bits. 718 /// 719 /// This option is enabled by default. premultiply(&mut self, yes: bool) -> &mut RegexBuilder720 pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder { 721 self.dfa.premultiply(yes); 722 self 723 } 724 725 /// Shrink the size of the underlying DFA alphabet by mapping bytes to 726 /// their equivalence classes. 727 /// 728 /// When enabled, each DFA will use a map from all possible bytes to their 729 /// corresponding equivalence class. Each equivalence class represents a 730 /// set of bytes that does not discriminate between a match and a non-match 731 /// in the DFA. For example, the pattern `[ab]+` has at least two 732 /// equivalence classes: a set containing `a` and `b` and a set containing 733 /// every byte except for `a` and `b`. `a` and `b` are in the same 734 /// equivalence classes because they never discriminate between a match 735 /// and a non-match. 736 /// 737 /// The advantage of this map is that the size of the transition table can 738 /// be reduced drastically from `#states * 256 * sizeof(id)` to 739 /// `#states * k * sizeof(id)` where `k` is the number of equivalence 740 /// classes. As a result, total space usage can decrease substantially. 741 /// Moreover, since a smaller alphabet is used, compilation becomes faster 742 /// as well. 743 /// 744 /// The disadvantage of this map is that every byte searched must be 745 /// passed through this map before it can be used to determine the next 746 /// transition. This has a small match time performance cost. 747 /// 748 /// This option is enabled by default. byte_classes(&mut self, yes: bool) -> &mut RegexBuilder749 pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder { 750 self.dfa.byte_classes(yes); 751 self 752 } 753 754 /// Apply best effort heuristics to shrink the NFA at the expense of more 755 /// time/memory. 756 /// 757 /// This may be exposed in the future, but for now is exported for use in 758 /// the `regex-automata-debug` tool. 759 #[doc(hidden)] shrink(&mut self, yes: bool) -> &mut RegexBuilder760 pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder { 761 self.dfa.shrink(yes); 762 self 763 } 764 } 765 766 #[cfg(feature = "std")] 767 impl Default for RegexBuilder { default() -> RegexBuilder768 fn default() -> RegexBuilder { 769 RegexBuilder::new() 770 } 771 } 772