1 /*! 2 Types and routines that support the search APIs of most regex engines. 3 4 This sub-module isn't exposed directly, but rather, its contents are exported 5 at the crate root due to the universality of most of the types and routines in 6 this module. 7 */ 8 9 use core::ops::{Range, RangeBounds}; 10 11 use crate::util::{escape::DebugByte, primitives::PatternID, utf8}; 12 13 /// The parameters for a regex search including the haystack to search. 14 /// 15 /// It turns out that regex searches have a few parameters, and in most cases, 16 /// those parameters have defaults that work in the vast majority of cases. 17 /// This `Input` type exists to make that common case seamnless while also 18 /// providing an avenue for changing the parameters of a search. In particular, 19 /// this type enables doing so without a combinatorial explosion of different 20 /// methods and/or superfluous parameters in the common cases. 21 /// 22 /// An `Input` permits configuring the following things: 23 /// 24 /// * Search only a substring of a haystack, while taking the broader context 25 /// into account for resolving look-around assertions. 26 /// * Indicating whether to search for all patterns in a regex, or to 27 /// only search for one pattern in particular. 28 /// * Whether to perform an anchored on unanchored search. 29 /// * Whether to report a match as early as possible. 30 /// 31 /// All of these parameters, except for the haystack, have sensible default 32 /// values. This means that the minimal search configuration is simply a call 33 /// to [`Input::new`] with your haystack. Setting any other parameter is 34 /// optional. 35 /// 36 /// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a 37 /// `From<H> for Input` implementation. This is useful because many of the 38 /// search APIs in this crate accept an `Into<Input>`. This means you can 39 /// provide string or byte strings to these routines directly, and they'll 40 /// automatically get converted into an `Input` for you. 41 /// 42 /// The lifetime parameter `'h` refers to the lifetime of the haystack. 43 /// 44 /// # Organization 45 /// 46 /// The API of `Input` is split into a few different parts: 47 /// 48 /// * A builder-like API that transforms a `Input` by value. Examples: 49 /// [`Input::span`] and [`Input::anchored`]. 50 /// * A setter API that permits mutating parameters in place. Examples: 51 /// [`Input::set_span`] and [`Input::set_anchored`]. 52 /// * A getter API that permits retrieving any of the search parameters. 53 /// Examples: [`Input::get_span`] and [`Input::get_anchored`]. 54 /// * A few convenience getter routines that don't conform to the above naming 55 /// pattern due to how common they are. Examples: [`Input::haystack`], 56 /// [`Input::start`] and [`Input::end`]. 57 /// * Miscellaneous predicates and other helper routines that are useful 58 /// in some contexts. Examples: [`Input::is_char_boundary`]. 59 /// 60 /// A `Input` exposes so much because it is meant to be used by both callers of 61 /// regex engines _and_ implementors of regex engines. A constraining factor is 62 /// that regex engines should accept a `&Input` as its lowest level API, which 63 /// means that implementors should only use the "getter" APIs of a `Input`. 64 /// 65 /// # Valid bounds and search termination 66 /// 67 /// An `Input` permits setting the bounds of a search via either 68 /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or 69 /// else a panic will occur. Bounds are valid if and only if: 70 /// 71 /// * The bounds represent a valid range into the input's haystack. 72 /// * **or** the end bound is a valid ending bound for the haystack *and* 73 /// the start bound is exactly one greater than the start bound. 74 /// 75 /// In the latter case, [`Input::is_done`] will return true and indicates any 76 /// search receiving such an input should immediately return with no match. 77 /// 78 /// Note that while `Input` is used for reverse searches in this crate, the 79 /// `Input::is_done` predicate assumes a forward search. Because unsigned 80 /// offsets are used internally, there is no way to tell from only the offsets 81 /// whether a reverse search is done or not. 82 /// 83 /// # Regex engine support 84 /// 85 /// Any regex engine accepting an `Input` must support at least the following 86 /// things: 87 /// 88 /// * Searching a `&[u8]` for matches. 89 /// * Searching a substring of `&[u8]` for a match, such that any match 90 /// reported must appear entirely within that substring. 91 /// * For a forwards search, a match should never be reported when 92 /// [`Input::is_done`] returns true. (For reverse searches, termination should 93 /// be handled outside of `Input`.) 94 /// 95 /// Supporting other aspects of an `Input` are optional, but regex engines 96 /// should handle aspects they don't support gracefully. How this is done is 97 /// generally up to the regex engine. This crate generally treats unsupported 98 /// anchored modes as an error to report for example, but for simplicity, in 99 /// the meta regex engine, trying to search with an invalid pattern ID just 100 /// results in no match being reported. 101 #[derive(Clone)] 102 pub struct Input<'h> { 103 haystack: &'h [u8], 104 span: Span, 105 anchored: Anchored, 106 earliest: bool, 107 } 108 109 impl<'h> Input<'h> { 110 /// Create a new search configuration for the given haystack. 111 #[inline] new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h>112 pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> { 113 // Perform only one call to `haystack.as_ref()` to protect from incorrect 114 // implementations that return different values from multiple calls. 115 // This is important because there's code that relies on `span` not being 116 // out of bounds with respect to the stored `haystack`. 117 let haystack = haystack.as_ref(); 118 Input { 119 haystack, 120 span: Span { start: 0, end: haystack.len() }, 121 anchored: Anchored::No, 122 earliest: false, 123 } 124 } 125 126 /// Set the span for this search. 127 /// 128 /// This routine does not panic if the span given is not a valid range for 129 /// this search's haystack. If this search is run with an invalid range, 130 /// then the most likely outcome is that the actual search execution will 131 /// panic. 132 /// 133 /// This routine is generic over how a span is provided. While 134 /// a [`Span`] may be given directly, one may also provide a 135 /// `std::ops::Range<usize>`. To provide anything supported by range 136 /// syntax, use the [`Input::range`] method. 137 /// 138 /// The default span is the entire haystack. 139 /// 140 /// Note that [`Input::range`] overrides this method and vice versa. 141 /// 142 /// # Panics 143 /// 144 /// This panics if the given span does not correspond to valid bounds in 145 /// the haystack or the termination of a search. 146 /// 147 /// # Example 148 /// 149 /// This example shows how the span of the search can impact whether a 150 /// match is reported or not. This is particularly relevant for look-around 151 /// operators, which might take things outside of the span into account 152 /// when determining whether they match. 153 /// 154 /// ``` 155 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 156 /// use regex_automata::{ 157 /// nfa::thompson::pikevm::PikeVM, 158 /// Match, Input, 159 /// }; 160 /// 161 /// // Look for 'at', but as a distinct word. 162 /// let re = PikeVM::new(r"\bat\b")?; 163 /// let mut cache = re.create_cache(); 164 /// let mut caps = re.create_captures(); 165 /// 166 /// // Our haystack contains 'at', but not as a distinct word. 167 /// let haystack = "batter"; 168 /// 169 /// // A standard search finds nothing, as expected. 170 /// let input = Input::new(haystack); 171 /// re.search(&mut cache, &input, &mut caps); 172 /// assert_eq!(None, caps.get_match()); 173 /// 174 /// // But if we wanted to search starting at position '1', we might 175 /// // slice the haystack. If we do this, it's impossible for the \b 176 /// // anchors to take the surrounding context into account! And thus, 177 /// // a match is produced. 178 /// let input = Input::new(&haystack[1..3]); 179 /// re.search(&mut cache, &input, &mut caps); 180 /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); 181 /// 182 /// // But if we specify the span of the search instead of slicing the 183 /// // haystack, then the regex engine can "see" outside of the span 184 /// // and resolve the anchors correctly. 185 /// let input = Input::new(haystack).span(1..3); 186 /// re.search(&mut cache, &input, &mut caps); 187 /// assert_eq!(None, caps.get_match()); 188 /// 189 /// # Ok::<(), Box<dyn std::error::Error>>(()) 190 /// ``` 191 /// 192 /// This may seem a little ham-fisted, but this scenario tends to come up 193 /// if some other regex engine found the match span and now you need to 194 /// re-process that span to look for capturing groups. (e.g., Run a faster 195 /// DFA first, find a match, then run the PikeVM on just the match span to 196 /// resolve capturing groups.) In order to implement that sort of logic 197 /// correctly, you need to set the span on the search instead of slicing 198 /// the haystack directly. 199 /// 200 /// The other advantage of using this routine to specify the bounds of the 201 /// search is that the match offsets are still reported in terms of the 202 /// original haystack. For example, the second search in the example above 203 /// reported a match at position `0`, even though `at` starts at offset 204 /// `1` because we sliced the haystack. 205 #[inline] span<S: Into<Span>>(mut self, span: S) -> Input<'h>206 pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> { 207 self.set_span(span); 208 self 209 } 210 211 /// Like `Input::span`, but accepts any range instead. 212 /// 213 /// This routine does not panic if the range given is not a valid range for 214 /// this search's haystack. If this search is run with an invalid range, 215 /// then the most likely outcome is that the actual search execution will 216 /// panic. 217 /// 218 /// The default range is the entire haystack. 219 /// 220 /// Note that [`Input::span`] overrides this method and vice versa. 221 /// 222 /// # Panics 223 /// 224 /// This routine will panic if the given range could not be converted 225 /// to a valid [`Range`]. For example, this would panic when given 226 /// `0..=usize::MAX` since it cannot be represented using a half-open 227 /// interval in terms of `usize`. 228 /// 229 /// This also panics if the given range does not correspond to valid bounds 230 /// in the haystack or the termination of a search. 231 /// 232 /// # Example 233 /// 234 /// ``` 235 /// use regex_automata::Input; 236 /// 237 /// let input = Input::new("foobar"); 238 /// assert_eq!(0..6, input.get_range()); 239 /// 240 /// let input = Input::new("foobar").range(2..=4); 241 /// assert_eq!(2..5, input.get_range()); 242 /// ``` 243 #[inline] range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h>244 pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> { 245 self.set_range(range); 246 self 247 } 248 249 /// Sets the anchor mode of a search. 250 /// 251 /// When a search is anchored (so that's [`Anchored::Yes`] or 252 /// [`Anchored::Pattern`]), a match must begin at the start of a search. 253 /// When a search is not anchored (that's [`Anchored::No`]), regex engines 254 /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix 255 /// permits a match to appear anywhere. 256 /// 257 /// By default, the anchored mode is [`Anchored::No`]. 258 /// 259 /// **WARNING:** this is subtly different than using a `^` at the start of 260 /// your regex. A `^` forces a regex to match exclusively at the start of 261 /// a haystack, regardless of where you begin your search. In contrast, 262 /// anchoring a search will allow your regex to match anywhere in your 263 /// haystack, but the match must start at the beginning of a search. 264 /// 265 /// For example, consider the haystack `aba` and the following searches: 266 /// 267 /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba` 268 /// starting at position `2`. Since `^` requires the match to start at 269 /// the beginning of the haystack and `2 > 0`, no match is found. 270 /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba` 271 /// starting at position `2`. This reports a match at `[2, 3]` since 272 /// the match starts where the search started. Since there is no `^`, 273 /// there is no requirement for the match to start at the beginning of 274 /// the haystack. 275 /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` 276 /// starting at position `1`. Since `b` corresponds to position `1` and 277 /// since the search is anchored, it finds no match. While the regex 278 /// matches at other positions, configuring the search to be anchored 279 /// requires that it only report a match that begins at the same offset 280 /// as the beginning of the search. 281 /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` 282 /// starting at position `1`. Since the search is not anchored and 283 /// the regex does not start with `^`, the search executes as if there 284 /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it 285 /// reports a match at `[2, 3]`. 286 /// 287 /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, 288 /// except it only reports matches for a particular pattern. 289 /// 290 /// # Example 291 /// 292 /// This demonstrates the differences between an anchored search and 293 /// a pattern that begins with `^` (as described in the above warning 294 /// message). 295 /// 296 /// ``` 297 /// use regex_automata::{ 298 /// nfa::thompson::pikevm::PikeVM, 299 /// Anchored, Match, Input, 300 /// }; 301 /// 302 /// let haystack = "aba"; 303 /// 304 /// let re = PikeVM::new(r"^a")?; 305 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 306 /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); 307 /// re.search(&mut cache, &input, &mut caps); 308 /// // No match is found because 2 is not the beginning of the haystack, 309 /// // which is what ^ requires. 310 /// assert_eq!(None, caps.get_match()); 311 /// 312 /// let re = PikeVM::new(r"a")?; 313 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 314 /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); 315 /// re.search(&mut cache, &input, &mut caps); 316 /// // An anchored search can still match anywhere in the haystack, it just 317 /// // must begin at the start of the search which is '2' in this case. 318 /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); 319 /// 320 /// let re = PikeVM::new(r"a")?; 321 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 322 /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); 323 /// re.search(&mut cache, &input, &mut caps); 324 /// // No match is found since we start searching at offset 1 which 325 /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match 326 /// // is found. 327 /// assert_eq!(None, caps.get_match()); 328 /// 329 /// let re = PikeVM::new(r"a")?; 330 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 331 /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); 332 /// re.search(&mut cache, &input, &mut caps); 333 /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the 334 /// // pattern. Even though the search starts at 'b', the 'match anything' 335 /// // prefix allows the search to match 'a'. 336 /// let expected = Some(Match::must(0, 2..3)); 337 /// assert_eq!(expected, caps.get_match()); 338 /// 339 /// # Ok::<(), Box<dyn std::error::Error>>(()) 340 /// ``` 341 #[inline] anchored(mut self, mode: Anchored) -> Input<'h>342 pub fn anchored(mut self, mode: Anchored) -> Input<'h> { 343 self.set_anchored(mode); 344 self 345 } 346 347 /// Whether to execute an "earliest" search or not. 348 /// 349 /// When running a non-overlapping search, an "earliest" search will return 350 /// the match location as early as possible. For example, given a pattern 351 /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search 352 /// will return `foo12345` as a match. But an "earliest" search for regex 353 /// engines that support "earliest" semantics will return `foo1` as a 354 /// match, since as soon as the first digit following `foo` is seen, it is 355 /// known to have found a match. 356 /// 357 /// Note that "earliest" semantics generally depend on the regex engine. 358 /// Different regex engines may determine there is a match at different 359 /// points. So there is no guarantee that "earliest" matches will always 360 /// return the same offsets for all regex engines. The "earliest" notion 361 /// is really about when the particular regex engine determines there is 362 /// a match rather than a consistent semantic unto itself. This is often 363 /// useful for implementing "did a match occur or not" predicates, but 364 /// sometimes the offset is useful as well. 365 /// 366 /// This is disabled by default. 367 /// 368 /// # Example 369 /// 370 /// This example shows the difference between "earliest" searching and 371 /// normal searching. 372 /// 373 /// ``` 374 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; 375 /// 376 /// let re = PikeVM::new(r"foo[0-9]+")?; 377 /// let mut cache = re.create_cache(); 378 /// let mut caps = re.create_captures(); 379 /// 380 /// // A normal search implements greediness like you expect. 381 /// let input = Input::new("foo12345"); 382 /// re.search(&mut cache, &input, &mut caps); 383 /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); 384 /// 385 /// // When 'earliest' is enabled and the regex engine supports 386 /// // it, the search will bail once it knows a match has been 387 /// // found. 388 /// let input = Input::new("foo12345").earliest(true); 389 /// re.search(&mut cache, &input, &mut caps); 390 /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); 391 /// # Ok::<(), Box<dyn std::error::Error>>(()) 392 /// ``` 393 #[inline] earliest(mut self, yes: bool) -> Input<'h>394 pub fn earliest(mut self, yes: bool) -> Input<'h> { 395 self.set_earliest(yes); 396 self 397 } 398 399 /// Set the span for this search configuration. 400 /// 401 /// This is like the [`Input::span`] method, except this mutates the 402 /// span in place. 403 /// 404 /// This routine is generic over how a span is provided. While 405 /// a [`Span`] may be given directly, one may also provide a 406 /// `std::ops::Range<usize>`. 407 /// 408 /// # Panics 409 /// 410 /// This panics if the given span does not correspond to valid bounds in 411 /// the haystack or the termination of a search. 412 /// 413 /// # Example 414 /// 415 /// ``` 416 /// use regex_automata::Input; 417 /// 418 /// let mut input = Input::new("foobar"); 419 /// assert_eq!(0..6, input.get_range()); 420 /// input.set_span(2..4); 421 /// assert_eq!(2..4, input.get_range()); 422 /// ``` 423 #[inline] set_span<S: Into<Span>>(&mut self, span: S)424 pub fn set_span<S: Into<Span>>(&mut self, span: S) { 425 let span = span.into(); 426 assert!( 427 span.end <= self.haystack.len() 428 && span.start <= span.end.wrapping_add(1), 429 "invalid span {:?} for haystack of length {}", 430 span, 431 self.haystack.len(), 432 ); 433 self.span = span; 434 } 435 436 /// Set the span for this search configuration given any range. 437 /// 438 /// This is like the [`Input::range`] method, except this mutates the 439 /// span in place. 440 /// 441 /// This routine does not panic if the range given is not a valid range for 442 /// this search's haystack. If this search is run with an invalid range, 443 /// then the most likely outcome is that the actual search execution will 444 /// panic. 445 /// 446 /// # Panics 447 /// 448 /// This routine will panic if the given range could not be converted 449 /// to a valid [`Range`]. For example, this would panic when given 450 /// `0..=usize::MAX` since it cannot be represented using a half-open 451 /// interval in terms of `usize`. 452 /// 453 /// This also panics if the given span does not correspond to valid bounds 454 /// in the haystack or the termination of a search. 455 /// 456 /// # Example 457 /// 458 /// ``` 459 /// use regex_automata::Input; 460 /// 461 /// let mut input = Input::new("foobar"); 462 /// assert_eq!(0..6, input.get_range()); 463 /// input.set_range(2..=4); 464 /// assert_eq!(2..5, input.get_range()); 465 /// ``` 466 #[inline] set_range<R: RangeBounds<usize>>(&mut self, range: R)467 pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { 468 use core::ops::Bound; 469 470 // It's a little weird to convert ranges into spans, and then spans 471 // back into ranges when we actually slice the haystack. Because 472 // of that process, we always represent everything as a half-open 473 // internal. Therefore, handling things like m..=n is a little awkward. 474 let start = match range.start_bound() { 475 Bound::Included(&i) => i, 476 // Can this case ever happen? Range syntax doesn't support it... 477 Bound::Excluded(&i) => i.checked_add(1).unwrap(), 478 Bound::Unbounded => 0, 479 }; 480 let end = match range.end_bound() { 481 Bound::Included(&i) => i.checked_add(1).unwrap(), 482 Bound::Excluded(&i) => i, 483 Bound::Unbounded => self.haystack().len(), 484 }; 485 self.set_span(Span { start, end }); 486 } 487 488 /// Set the starting offset for the span for this search configuration. 489 /// 490 /// This is a convenience routine for only mutating the start of a span 491 /// without having to set the entire span. 492 /// 493 /// # Panics 494 /// 495 /// This panics if the span resulting from the new start position does not 496 /// correspond to valid bounds in the haystack or the termination of a 497 /// search. 498 /// 499 /// # Example 500 /// 501 /// ``` 502 /// use regex_automata::Input; 503 /// 504 /// let mut input = Input::new("foobar"); 505 /// assert_eq!(0..6, input.get_range()); 506 /// input.set_start(5); 507 /// assert_eq!(5..6, input.get_range()); 508 /// ``` 509 #[inline] set_start(&mut self, start: usize)510 pub fn set_start(&mut self, start: usize) { 511 self.set_span(Span { start, ..self.get_span() }); 512 } 513 514 /// Set the ending offset for the span for this search configuration. 515 /// 516 /// This is a convenience routine for only mutating the end of a span 517 /// without having to set the entire span. 518 /// 519 /// # Panics 520 /// 521 /// This panics if the span resulting from the new end position does not 522 /// correspond to valid bounds in the haystack or the termination of a 523 /// search. 524 /// 525 /// # Example 526 /// 527 /// ``` 528 /// use regex_automata::Input; 529 /// 530 /// let mut input = Input::new("foobar"); 531 /// assert_eq!(0..6, input.get_range()); 532 /// input.set_end(5); 533 /// assert_eq!(0..5, input.get_range()); 534 /// ``` 535 #[inline] set_end(&mut self, end: usize)536 pub fn set_end(&mut self, end: usize) { 537 self.set_span(Span { end, ..self.get_span() }); 538 } 539 540 /// Set the anchor mode of a search. 541 /// 542 /// This is like [`Input::anchored`], except it mutates the search 543 /// configuration in place. 544 /// 545 /// # Example 546 /// 547 /// ``` 548 /// use regex_automata::{Anchored, Input, PatternID}; 549 /// 550 /// let mut input = Input::new("foobar"); 551 /// assert_eq!(Anchored::No, input.get_anchored()); 552 /// 553 /// let pid = PatternID::must(5); 554 /// input.set_anchored(Anchored::Pattern(pid)); 555 /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); 556 /// ``` 557 #[inline] set_anchored(&mut self, mode: Anchored)558 pub fn set_anchored(&mut self, mode: Anchored) { 559 self.anchored = mode; 560 } 561 562 /// Set whether the search should execute in "earliest" mode or not. 563 /// 564 /// This is like [`Input::earliest`], except it mutates the search 565 /// configuration in place. 566 /// 567 /// # Example 568 /// 569 /// ``` 570 /// use regex_automata::Input; 571 /// 572 /// let mut input = Input::new("foobar"); 573 /// assert!(!input.get_earliest()); 574 /// input.set_earliest(true); 575 /// assert!(input.get_earliest()); 576 /// ``` 577 #[inline] set_earliest(&mut self, yes: bool)578 pub fn set_earliest(&mut self, yes: bool) { 579 self.earliest = yes; 580 } 581 582 /// Return a borrow of the underlying haystack as a slice of bytes. 583 /// 584 /// # Example 585 /// 586 /// ``` 587 /// use regex_automata::Input; 588 /// 589 /// let input = Input::new("foobar"); 590 /// assert_eq!(b"foobar", input.haystack()); 591 /// ``` 592 #[inline] haystack(&self) -> &[u8]593 pub fn haystack(&self) -> &[u8] { 594 self.haystack 595 } 596 597 /// Return the start position of this search. 598 /// 599 /// This is a convenience routine for `search.get_span().start()`. 600 /// 601 /// When [`Input::is_done`] is `false`, this is guaranteed to return 602 /// an offset that is less than or equal to [`Input::end`]. Otherwise, 603 /// the offset is one greater than [`Input::end`]. 604 /// 605 /// # Example 606 /// 607 /// ``` 608 /// use regex_automata::Input; 609 /// 610 /// let input = Input::new("foobar"); 611 /// assert_eq!(0, input.start()); 612 /// 613 /// let input = Input::new("foobar").span(2..4); 614 /// assert_eq!(2, input.start()); 615 /// ``` 616 #[inline] start(&self) -> usize617 pub fn start(&self) -> usize { 618 self.get_span().start 619 } 620 621 /// Return the end position of this search. 622 /// 623 /// This is a convenience routine for `search.get_span().end()`. 624 /// 625 /// This is guaranteed to return an offset that is a valid exclusive end 626 /// bound for this input's haystack. 627 /// 628 /// # Example 629 /// 630 /// ``` 631 /// use regex_automata::Input; 632 /// 633 /// let input = Input::new("foobar"); 634 /// assert_eq!(6, input.end()); 635 /// 636 /// let input = Input::new("foobar").span(2..4); 637 /// assert_eq!(4, input.end()); 638 /// ``` 639 #[inline] end(&self) -> usize640 pub fn end(&self) -> usize { 641 self.get_span().end 642 } 643 644 /// Return the span for this search configuration. 645 /// 646 /// If one was not explicitly set, then the span corresponds to the entire 647 /// range of the haystack. 648 /// 649 /// When [`Input::is_done`] is `false`, the span returned is guaranteed 650 /// to correspond to valid bounds for this input's haystack. 651 /// 652 /// # Example 653 /// 654 /// ``` 655 /// use regex_automata::{Input, Span}; 656 /// 657 /// let input = Input::new("foobar"); 658 /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); 659 /// ``` 660 #[inline] get_span(&self) -> Span661 pub fn get_span(&self) -> Span { 662 self.span 663 } 664 665 /// Return the span as a range for this search configuration. 666 /// 667 /// If one was not explicitly set, then the span corresponds to the entire 668 /// range of the haystack. 669 /// 670 /// When [`Input::is_done`] is `false`, the range returned is guaranteed 671 /// to correspond to valid bounds for this input's haystack. 672 /// 673 /// # Example 674 /// 675 /// ``` 676 /// use regex_automata::Input; 677 /// 678 /// let input = Input::new("foobar"); 679 /// assert_eq!(0..6, input.get_range()); 680 /// ``` 681 #[inline] get_range(&self) -> Range<usize>682 pub fn get_range(&self) -> Range<usize> { 683 self.get_span().range() 684 } 685 686 /// Return the anchored mode for this search configuration. 687 /// 688 /// If no anchored mode was set, then it defaults to [`Anchored::No`]. 689 /// 690 /// # Example 691 /// 692 /// ``` 693 /// use regex_automata::{Anchored, Input, PatternID}; 694 /// 695 /// let mut input = Input::new("foobar"); 696 /// assert_eq!(Anchored::No, input.get_anchored()); 697 /// 698 /// let pid = PatternID::must(5); 699 /// input.set_anchored(Anchored::Pattern(pid)); 700 /// assert_eq!(Anchored::Pattern(pid), input.get_anchored()); 701 /// ``` 702 #[inline] get_anchored(&self) -> Anchored703 pub fn get_anchored(&self) -> Anchored { 704 self.anchored 705 } 706 707 /// Return whether this search should execute in "earliest" mode. 708 /// 709 /// # Example 710 /// 711 /// ``` 712 /// use regex_automata::Input; 713 /// 714 /// let input = Input::new("foobar"); 715 /// assert!(!input.get_earliest()); 716 /// ``` 717 #[inline] get_earliest(&self) -> bool718 pub fn get_earliest(&self) -> bool { 719 self.earliest 720 } 721 722 /// Return true if and only if this search can never return any other 723 /// matches. 724 /// 725 /// This occurs when the start position of this search is greater than the 726 /// end position of the search. 727 /// 728 /// # Example 729 /// 730 /// ``` 731 /// use regex_automata::Input; 732 /// 733 /// let mut input = Input::new("foobar"); 734 /// assert!(!input.is_done()); 735 /// input.set_start(6); 736 /// assert!(!input.is_done()); 737 /// input.set_start(7); 738 /// assert!(input.is_done()); 739 /// ``` 740 #[inline] is_done(&self) -> bool741 pub fn is_done(&self) -> bool { 742 self.get_span().start > self.get_span().end 743 } 744 745 /// Returns true if and only if the given offset in this search's haystack 746 /// falls on a valid UTF-8 encoded codepoint boundary. 747 /// 748 /// If the haystack is not valid UTF-8, then the behavior of this routine 749 /// is unspecified. 750 /// 751 /// # Example 752 /// 753 /// This shows where codepoint boundaries do and don't exist in valid 754 /// UTF-8. 755 /// 756 /// ``` 757 /// use regex_automata::Input; 758 /// 759 /// let input = Input::new("☃"); 760 /// assert!(input.is_char_boundary(0)); 761 /// assert!(!input.is_char_boundary(1)); 762 /// assert!(!input.is_char_boundary(2)); 763 /// assert!(input.is_char_boundary(3)); 764 /// assert!(!input.is_char_boundary(4)); 765 /// ``` 766 #[inline] is_char_boundary(&self, offset: usize) -> bool767 pub fn is_char_boundary(&self, offset: usize) -> bool { 768 utf8::is_boundary(self.haystack(), offset) 769 } 770 } 771 772 impl<'h> core::fmt::Debug for Input<'h> { fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result773 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 774 use crate::util::escape::DebugHaystack; 775 776 f.debug_struct("Input") 777 .field("haystack", &DebugHaystack(self.haystack())) 778 .field("span", &self.span) 779 .field("anchored", &self.anchored) 780 .field("earliest", &self.earliest) 781 .finish() 782 } 783 } 784 785 impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { from(haystack: &'h H) -> Input<'h>786 fn from(haystack: &'h H) -> Input<'h> { 787 Input::new(haystack) 788 } 789 } 790 791 /// A representation of a span reported by a regex engine. 792 /// 793 /// A span corresponds to the starting and ending _byte offsets_ of a 794 /// contiguous region of bytes. The starting offset is inclusive while the 795 /// ending offset is exclusive. That is, a span is a half-open interval. 796 /// 797 /// A span is used to report the offsets of a match, but it is also used to 798 /// convey which region of a haystack should be searched via routines like 799 /// [`Input::span`]. 800 /// 801 /// This is basically equivalent to a `std::ops::Range<usize>`, except this 802 /// type implements `Copy` which makes it more ergonomic to use in the context 803 /// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, 804 /// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`, 805 /// which means things like `Span::from(5..10)` work. 806 #[derive(Clone, Copy, Eq, Hash, PartialEq)] 807 pub struct Span { 808 /// The start offset of the span, inclusive. 809 pub start: usize, 810 /// The end offset of the span, exclusive. 811 pub end: usize, 812 } 813 814 impl Span { 815 /// Returns this span as a range. 816 #[inline] range(&self) -> Range<usize>817 pub fn range(&self) -> Range<usize> { 818 Range::from(*self) 819 } 820 821 /// Returns true when this span is empty. That is, when `start >= end`. 822 #[inline] is_empty(&self) -> bool823 pub fn is_empty(&self) -> bool { 824 self.start >= self.end 825 } 826 827 /// Returns the length of this span. 828 /// 829 /// This returns `0` in precisely the cases that `is_empty` returns `true`. 830 #[inline] len(&self) -> usize831 pub fn len(&self) -> usize { 832 self.end.saturating_sub(self.start) 833 } 834 835 /// Returns true when the given offset is contained within this span. 836 /// 837 /// Note that an empty span contains no offsets and will always return 838 /// false. 839 #[inline] contains(&self, offset: usize) -> bool840 pub fn contains(&self, offset: usize) -> bool { 841 !self.is_empty() && self.start <= offset && offset <= self.end 842 } 843 844 /// Returns a new span with `offset` added to this span's `start` and `end` 845 /// values. 846 #[inline] offset(&self, offset: usize) -> Span847 pub fn offset(&self, offset: usize) -> Span { 848 Span { start: self.start + offset, end: self.end + offset } 849 } 850 } 851 852 impl core::fmt::Debug for Span { fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result853 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 854 write!(f, "{}..{}", self.start, self.end) 855 } 856 } 857 858 impl core::ops::Index<Span> for [u8] { 859 type Output = [u8]; 860 861 #[inline] index(&self, index: Span) -> &[u8]862 fn index(&self, index: Span) -> &[u8] { 863 &self[index.range()] 864 } 865 } 866 867 impl core::ops::IndexMut<Span> for [u8] { 868 #[inline] index_mut(&mut self, index: Span) -> &mut [u8]869 fn index_mut(&mut self, index: Span) -> &mut [u8] { 870 &mut self[index.range()] 871 } 872 } 873 874 impl core::ops::Index<Span> for str { 875 type Output = str; 876 877 #[inline] index(&self, index: Span) -> &str878 fn index(&self, index: Span) -> &str { 879 &self[index.range()] 880 } 881 } 882 883 impl From<Range<usize>> for Span { 884 #[inline] from(range: Range<usize>) -> Span885 fn from(range: Range<usize>) -> Span { 886 Span { start: range.start, end: range.end } 887 } 888 } 889 890 impl From<Span> for Range<usize> { 891 #[inline] from(span: Span) -> Range<usize>892 fn from(span: Span) -> Range<usize> { 893 Range { start: span.start, end: span.end } 894 } 895 } 896 897 impl PartialEq<Range<usize>> for Span { 898 #[inline] eq(&self, range: &Range<usize>) -> bool899 fn eq(&self, range: &Range<usize>) -> bool { 900 self.start == range.start && self.end == range.end 901 } 902 } 903 904 impl PartialEq<Span> for Range<usize> { 905 #[inline] eq(&self, span: &Span) -> bool906 fn eq(&self, span: &Span) -> bool { 907 self.start == span.start && self.end == span.end 908 } 909 } 910 911 /// A representation of "half" of a match reported by a DFA. 912 /// 913 /// This is called a "half" match because it only includes the end location (or 914 /// start location for a reverse search) of a match. This corresponds to the 915 /// information that a single DFA scan can report. Getting the other half of 916 /// the match requires a second scan with a reversed DFA. 917 /// 918 /// A half match also includes the pattern that matched. The pattern is 919 /// identified by an ID, which corresponds to its position (starting from `0`) 920 /// relative to other patterns used to construct the corresponding DFA. If only 921 /// a single pattern is provided to the DFA, then all matches are guaranteed to 922 /// have a pattern ID of `0`. 923 #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 924 pub struct HalfMatch { 925 /// The pattern ID. 926 pattern: PatternID, 927 /// The offset of the match. 928 /// 929 /// For forward searches, the offset is exclusive. For reverse searches, 930 /// the offset is inclusive. 931 offset: usize, 932 } 933 934 impl HalfMatch { 935 /// Create a new half match from a pattern ID and a byte offset. 936 #[inline] new(pattern: PatternID, offset: usize) -> HalfMatch937 pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { 938 HalfMatch { pattern, offset } 939 } 940 941 /// Create a new half match from a pattern ID and a byte offset. 942 /// 943 /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a 944 /// [`PatternID`]. This panics if the given `usize` is not representable 945 /// as a `PatternID`. 946 #[inline] must(pattern: usize, offset: usize) -> HalfMatch947 pub fn must(pattern: usize, offset: usize) -> HalfMatch { 948 HalfMatch::new(PatternID::new(pattern).unwrap(), offset) 949 } 950 951 /// Returns the ID of the pattern that matched. 952 /// 953 /// The ID of a pattern is derived from the position in which it was 954 /// originally inserted into the corresponding DFA. The first pattern has 955 /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. 956 #[inline] pattern(&self) -> PatternID957 pub fn pattern(&self) -> PatternID { 958 self.pattern 959 } 960 961 /// The position of the match. 962 /// 963 /// If this match was produced by a forward search, then the offset is 964 /// exclusive. If this match was produced by a reverse search, then the 965 /// offset is inclusive. 966 #[inline] offset(&self) -> usize967 pub fn offset(&self) -> usize { 968 self.offset 969 } 970 } 971 972 /// A representation of a match reported by a regex engine. 973 /// 974 /// A match has two essential pieces of information: the [`PatternID`] that 975 /// matches, and the [`Span`] of the match in a haystack. 976 /// 977 /// The pattern is identified by an ID, which corresponds to its position 978 /// (starting from `0`) relative to other patterns used to construct the 979 /// corresponding regex engine. If only a single pattern is provided, then all 980 /// matches are guaranteed to have a pattern ID of `0`. 981 /// 982 /// Every match reported by a regex engine guarantees that its span has its 983 /// start offset as less than or equal to its end offset. 984 #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 985 pub struct Match { 986 /// The pattern ID. 987 pattern: PatternID, 988 /// The underlying match span. 989 span: Span, 990 } 991 992 impl Match { 993 /// Create a new match from a pattern ID and a span. 994 /// 995 /// This constructor is generic over how a span is provided. While 996 /// a [`Span`] may be given directly, one may also provide a 997 /// `std::ops::Range<usize>`. 998 /// 999 /// # Panics 1000 /// 1001 /// This panics if `end < start`. 1002 /// 1003 /// # Example 1004 /// 1005 /// This shows how to create a match for the first pattern in a regex 1006 /// object using convenient range syntax. 1007 /// 1008 /// ``` 1009 /// use regex_automata::{Match, PatternID}; 1010 /// 1011 /// let m = Match::new(PatternID::ZERO, 5..10); 1012 /// assert_eq!(0, m.pattern().as_usize()); 1013 /// assert_eq!(5, m.start()); 1014 /// assert_eq!(10, m.end()); 1015 /// ``` 1016 #[inline] new<S: Into<Span>>(pattern: PatternID, span: S) -> Match1017 pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { 1018 let span: Span = span.into(); 1019 assert!(span.start <= span.end, "invalid match span"); 1020 Match { pattern, span } 1021 } 1022 1023 /// Create a new match from a pattern ID and a byte offset span. 1024 /// 1025 /// This constructor is generic over how a span is provided. While 1026 /// a [`Span`] may be given directly, one may also provide a 1027 /// `std::ops::Range<usize>`. 1028 /// 1029 /// This is like [`Match::new`], but accepts a `usize` instead of a 1030 /// [`PatternID`]. This panics if the given `usize` is not representable 1031 /// as a `PatternID`. 1032 /// 1033 /// # Panics 1034 /// 1035 /// This panics if `end < start` or if `pattern > PatternID::MAX`. 1036 /// 1037 /// # Example 1038 /// 1039 /// This shows how to create a match for the third pattern in a regex 1040 /// object using convenient range syntax. 1041 /// 1042 /// ``` 1043 /// use regex_automata::Match; 1044 /// 1045 /// let m = Match::must(3, 5..10); 1046 /// assert_eq!(3, m.pattern().as_usize()); 1047 /// assert_eq!(5, m.start()); 1048 /// assert_eq!(10, m.end()); 1049 /// ``` 1050 #[inline] must<S: Into<Span>>(pattern: usize, span: S) -> Match1051 pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { 1052 Match::new(PatternID::must(pattern), span) 1053 } 1054 1055 /// Returns the ID of the pattern that matched. 1056 /// 1057 /// The ID of a pattern is derived from the position in which it was 1058 /// originally inserted into the corresponding regex engine. The first 1059 /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and 1060 /// so on. 1061 #[inline] pattern(&self) -> PatternID1062 pub fn pattern(&self) -> PatternID { 1063 self.pattern 1064 } 1065 1066 /// The starting position of the match. 1067 /// 1068 /// This is a convenience routine for `Match::span().start`. 1069 #[inline] start(&self) -> usize1070 pub fn start(&self) -> usize { 1071 self.span().start 1072 } 1073 1074 /// The ending position of the match. 1075 /// 1076 /// This is a convenience routine for `Match::span().end`. 1077 #[inline] end(&self) -> usize1078 pub fn end(&self) -> usize { 1079 self.span().end 1080 } 1081 1082 /// Returns the match span as a range. 1083 /// 1084 /// This is a convenience routine for `Match::span().range()`. 1085 #[inline] range(&self) -> core::ops::Range<usize>1086 pub fn range(&self) -> core::ops::Range<usize> { 1087 self.span().range() 1088 } 1089 1090 /// Returns the span for this match. 1091 #[inline] span(&self) -> Span1092 pub fn span(&self) -> Span { 1093 self.span 1094 } 1095 1096 /// Returns true when the span in this match is empty. 1097 /// 1098 /// An empty match can only be returned when the regex itself can match 1099 /// the empty string. 1100 #[inline] is_empty(&self) -> bool1101 pub fn is_empty(&self) -> bool { 1102 self.span().is_empty() 1103 } 1104 1105 /// Returns the length of this match. 1106 /// 1107 /// This returns `0` in precisely the cases that `is_empty` returns `true`. 1108 #[inline] len(&self) -> usize1109 pub fn len(&self) -> usize { 1110 self.span().len() 1111 } 1112 } 1113 1114 /// A set of `PatternID`s. 1115 /// 1116 /// A set of pattern identifiers is useful for recording which patterns have 1117 /// matched a particular haystack. A pattern set _only_ includes pattern 1118 /// identifiers. It does not include offset information. 1119 /// 1120 /// # Example 1121 /// 1122 /// This shows basic usage of a set. 1123 /// 1124 /// ``` 1125 /// use regex_automata::{PatternID, PatternSet}; 1126 /// 1127 /// let pid1 = PatternID::must(5); 1128 /// let pid2 = PatternID::must(8); 1129 /// // Create a new empty set. 1130 /// let mut set = PatternSet::new(10); 1131 /// // Insert pattern IDs. 1132 /// set.insert(pid1); 1133 /// set.insert(pid2); 1134 /// // Test membership. 1135 /// assert!(set.contains(pid1)); 1136 /// assert!(set.contains(pid2)); 1137 /// // Get all members. 1138 /// assert_eq!( 1139 /// vec![5, 8], 1140 /// set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(), 1141 /// ); 1142 /// // Clear the set. 1143 /// set.clear(); 1144 /// // Test that it is indeed empty. 1145 /// assert!(set.is_empty()); 1146 /// ``` 1147 #[cfg(feature = "alloc")] 1148 #[derive(Clone, Debug, Eq, PartialEq)] 1149 pub struct PatternSet { 1150 /// The number of patterns set to 'true' in this set. 1151 len: usize, 1152 /// A map from PatternID to boolean of whether a pattern matches or not. 1153 /// 1154 /// This should probably be a bitset, but it's probably unlikely to matter 1155 /// much in practice. 1156 /// 1157 /// The main downside of this representation (and similarly for a bitset) 1158 /// is that iteration scales with the capacity of the set instead of 1159 /// the length of the set. This doesn't seem likely to be a problem in 1160 /// practice. 1161 /// 1162 /// Another alternative is to just use a 'SparseSet' for this. It does use 1163 /// more memory (quite a bit more), but that seems fine I think compared 1164 /// to the memory being used by the regex engine. The real hiccup with 1165 /// it is that it yields pattern IDs in the order they were inserted. 1166 /// Which is actually kind of nice, but at the time of writing, pattern 1167 /// IDs are yielded in ascending order in the regex crate RegexSet API. 1168 /// If we did change to 'SparseSet', we could provide an additional 1169 /// 'iter_match_order' iterator, but keep the ascending order one for 1170 /// compatibility. 1171 which: alloc::boxed::Box<[bool]>, 1172 } 1173 1174 #[cfg(feature = "alloc")] 1175 impl PatternSet { 1176 /// Create a new set of pattern identifiers with the given capacity. 1177 /// 1178 /// The given capacity typically corresponds to (at least) the number of 1179 /// patterns in a compiled regex object. 1180 /// 1181 /// # Panics 1182 /// 1183 /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is 1184 /// impossible if you use the `pattern_len()` method as defined on any of 1185 /// the regex engines in this crate. Namely, a regex will fail to build by 1186 /// returning an error if the number of patterns given to it exceeds the 1187 /// limit. Therefore, the number of patterns in a valid regex is always 1188 /// a correct capacity to provide here. new(capacity: usize) -> PatternSet1189 pub fn new(capacity: usize) -> PatternSet { 1190 assert!( 1191 capacity <= PatternID::LIMIT, 1192 "pattern set capacity exceeds limit of {}", 1193 PatternID::LIMIT, 1194 ); 1195 PatternSet { 1196 len: 0, 1197 which: alloc::vec![false; capacity].into_boxed_slice(), 1198 } 1199 } 1200 1201 /// Clear this set such that it contains no pattern IDs. clear(&mut self)1202 pub fn clear(&mut self) { 1203 self.len = 0; 1204 for matched in self.which.iter_mut() { 1205 *matched = false; 1206 } 1207 } 1208 1209 /// Return true if and only if the given pattern identifier is in this set. contains(&self, pid: PatternID) -> bool1210 pub fn contains(&self, pid: PatternID) -> bool { 1211 pid.as_usize() < self.capacity() && self.which[pid] 1212 } 1213 1214 /// Insert the given pattern identifier into this set and return `true` if 1215 /// the given pattern ID was not previously in this set. 1216 /// 1217 /// If the pattern identifier is already in this set, then this is a no-op. 1218 /// 1219 /// Use [`PatternSet::try_insert`] for a fallible version of this routine. 1220 /// 1221 /// # Panics 1222 /// 1223 /// This panics if this pattern set has insufficient capacity to 1224 /// store the given pattern ID. insert(&mut self, pid: PatternID) -> bool1225 pub fn insert(&mut self, pid: PatternID) -> bool { 1226 self.try_insert(pid) 1227 .expect("PatternSet should have sufficient capacity") 1228 } 1229 1230 /// Insert the given pattern identifier into this set and return `true` if 1231 /// the given pattern ID was not previously in this set. 1232 /// 1233 /// If the pattern identifier is already in this set, then this is a no-op. 1234 /// 1235 /// # Errors 1236 /// 1237 /// This returns an error if this pattern set has insufficient capacity to 1238 /// store the given pattern ID. try_insert( &mut self, pid: PatternID, ) -> Result<bool, PatternSetInsertError>1239 pub fn try_insert( 1240 &mut self, 1241 pid: PatternID, 1242 ) -> Result<bool, PatternSetInsertError> { 1243 if pid.as_usize() >= self.capacity() { 1244 return Err(PatternSetInsertError { 1245 attempted: pid, 1246 capacity: self.capacity(), 1247 }); 1248 } 1249 if self.which[pid] { 1250 return Ok(false); 1251 } 1252 self.len += 1; 1253 self.which[pid] = true; 1254 Ok(true) 1255 } 1256 1257 /* 1258 // This is currently commented out because it is unused and it is unclear 1259 // whether it's useful or not. What's the harm in having it? When, if 1260 // we ever wanted to change our representation to a 'SparseSet', then 1261 // supporting this method would be a bit tricky. So in order to keep some 1262 // API evolution flexibility, we leave it out for now. 1263 1264 /// Remove the given pattern identifier from this set. 1265 /// 1266 /// If the pattern identifier was not previously in this set, then this 1267 /// does not change the set and returns `false`. 1268 /// 1269 /// # Panics 1270 /// 1271 /// This panics if `pid` exceeds the capacity of this set. 1272 pub fn remove(&mut self, pid: PatternID) -> bool { 1273 if !self.which[pid] { 1274 return false; 1275 } 1276 self.len -= 1; 1277 self.which[pid] = false; 1278 true 1279 } 1280 */ 1281 1282 /// Return true if and only if this set has no pattern identifiers in it. is_empty(&self) -> bool1283 pub fn is_empty(&self) -> bool { 1284 self.len() == 0 1285 } 1286 1287 /// Return true if and only if this set has the maximum number of pattern 1288 /// identifiers in the set. This occurs precisely when `PatternSet::len() 1289 /// == PatternSet::capacity()`. 1290 /// 1291 /// This particular property is useful to test because it may allow one to 1292 /// stop a search earlier than you might otherwise. Namely, if a search is 1293 /// only reporting which patterns match a haystack and if you know all of 1294 /// the patterns match at a given point, then there's no new information 1295 /// that can be learned by continuing the search. (Because a pattern set 1296 /// does not keep track of offset information.) is_full(&self) -> bool1297 pub fn is_full(&self) -> bool { 1298 self.len() == self.capacity() 1299 } 1300 1301 /// Returns the total number of pattern identifiers in this set. len(&self) -> usize1302 pub fn len(&self) -> usize { 1303 self.len 1304 } 1305 1306 /// Returns the total number of pattern identifiers that may be stored 1307 /// in this set. 1308 /// 1309 /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`]. 1310 /// 1311 /// Typically, the capacity of a pattern set matches the number of patterns 1312 /// in a regex object with which you are searching. capacity(&self) -> usize1313 pub fn capacity(&self) -> usize { 1314 self.which.len() 1315 } 1316 1317 /// Returns an iterator over all pattern identifiers in this set. 1318 /// 1319 /// The iterator yields pattern identifiers in ascending order, starting 1320 /// at zero. iter(&self) -> PatternSetIter<'_>1321 pub fn iter(&self) -> PatternSetIter<'_> { 1322 PatternSetIter { it: self.which.iter().enumerate() } 1323 } 1324 } 1325 1326 /// An error that occurs when a `PatternID` failed to insert into a 1327 /// `PatternSet`. 1328 /// 1329 /// An insert fails when the given `PatternID` exceeds the configured capacity 1330 /// of the `PatternSet`. 1331 /// 1332 /// This error is created by the [`PatternSet::try_insert`] routine. 1333 #[cfg(feature = "alloc")] 1334 #[derive(Clone, Debug)] 1335 pub struct PatternSetInsertError { 1336 attempted: PatternID, 1337 capacity: usize, 1338 } 1339 1340 #[cfg(feature = "std")] 1341 impl std::error::Error for PatternSetInsertError {} 1342 1343 #[cfg(feature = "alloc")] 1344 impl core::fmt::Display for PatternSetInsertError { fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result1345 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 1346 write!( 1347 f, 1348 "failed to insert pattern ID {} into pattern set \ 1349 with insufficiet capacity of {}", 1350 self.attempted.as_usize(), 1351 self.capacity, 1352 ) 1353 } 1354 } 1355 1356 /// An iterator over all pattern identifiers in a [`PatternSet`]. 1357 /// 1358 /// The lifetime parameter `'a` refers to the lifetime of the pattern set being 1359 /// iterated over. 1360 /// 1361 /// This iterator is created by the [`PatternSet::iter`] method. 1362 #[cfg(feature = "alloc")] 1363 #[derive(Clone, Debug)] 1364 pub struct PatternSetIter<'a> { 1365 it: core::iter::Enumerate<core::slice::Iter<'a, bool>>, 1366 } 1367 1368 #[cfg(feature = "alloc")] 1369 impl<'a> Iterator for PatternSetIter<'a> { 1370 type Item = PatternID; 1371 next(&mut self) -> Option<PatternID>1372 fn next(&mut self) -> Option<PatternID> { 1373 while let Some((index, &yes)) = self.it.next() { 1374 if yes { 1375 // Only valid 'PatternID' values can be inserted into the set 1376 // and construction of the set panics if the capacity would 1377 // permit storing invalid pattern IDs. Thus, 'yes' is only true 1378 // precisely when 'index' corresponds to a valid 'PatternID'. 1379 return Some(PatternID::new_unchecked(index)); 1380 } 1381 } 1382 None 1383 } 1384 size_hint(&self) -> (usize, Option<usize>)1385 fn size_hint(&self) -> (usize, Option<usize>) { 1386 self.it.size_hint() 1387 } 1388 } 1389 1390 #[cfg(feature = "alloc")] 1391 impl<'a> DoubleEndedIterator for PatternSetIter<'a> { next_back(&mut self) -> Option<PatternID>1392 fn next_back(&mut self) -> Option<PatternID> { 1393 while let Some((index, &yes)) = self.it.next_back() { 1394 if yes { 1395 // Only valid 'PatternID' values can be inserted into the set 1396 // and construction of the set panics if the capacity would 1397 // permit storing invalid pattern IDs. Thus, 'yes' is only true 1398 // precisely when 'index' corresponds to a valid 'PatternID'. 1399 return Some(PatternID::new_unchecked(index)); 1400 } 1401 } 1402 None 1403 } 1404 } 1405 1406 /// The type of anchored search to perform. 1407 /// 1408 /// This is *almost* a boolean option. That is, you can either do an unanchored 1409 /// search for any pattern in a regex, or you can do an anchored search for any 1410 /// pattern in a regex. 1411 /// 1412 /// A third option exists that, assuming the regex engine supports it, permits 1413 /// you to do an anchored search for a specific pattern. 1414 /// 1415 /// Note that there is no way to run an unanchored search for a specific 1416 /// pattern. If you need that, you'll need to build separate regexes for each 1417 /// pattern. 1418 /// 1419 /// # Errors 1420 /// 1421 /// If a regex engine does not support the anchored mode selected, then the 1422 /// regex engine will return an error. While any non-trivial regex engine 1423 /// should support at least one of the available anchored modes, there is no 1424 /// singular mode that is guaranteed to be universally supported. Some regex 1425 /// engines might only support unanchored searches (DFAs compiled without 1426 /// anchored starting states) and some regex engines might only support 1427 /// anchored searches (like the one-pass DFA). 1428 /// 1429 /// The specific error returned is a [`MatchError`] with a 1430 /// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the 1431 /// `Anchored` value given that is unsupported. 1432 /// 1433 /// Note that regex engines should report "no match" if, for example, an 1434 /// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where 1435 /// anchored searches for a specific pattern are supported. This is smooths out 1436 /// behavior such that it's possible to guarantee that an error never occurs 1437 /// based on how the regex engine is configured. All regex engines in this 1438 /// crate report "no match" when searching for an invalid pattern ID, but where 1439 /// searching for a valid pattern ID is otherwise supported. 1440 /// 1441 /// # Example 1442 /// 1443 /// This example shows how to use the various `Anchored` modes to run a 1444 /// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) 1445 /// because it supports all modes unconditionally. Some regex engines, like 1446 /// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored 1447 /// searches. 1448 /// 1449 /// ``` 1450 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1451 /// use regex_automata::{ 1452 /// nfa::thompson::pikevm::PikeVM, 1453 /// Anchored, Input, Match, PatternID, 1454 /// }; 1455 /// 1456 /// let re = PikeVM::new_many(&[ 1457 /// r"Mrs. \w+", 1458 /// r"Miss \w+", 1459 /// r"Mr. \w+", 1460 /// r"Ms. \w+", 1461 /// ])?; 1462 /// let mut cache = re.create_cache(); 1463 /// let hay = "Hello Mr. Springsteen!"; 1464 /// 1465 /// // The default is to do an unanchored search. 1466 /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); 1467 /// // Explicitly ask for an unanchored search. Same as above. 1468 /// let input = Input::new(hay).anchored(Anchored::No); 1469 /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); 1470 /// 1471 /// // Now try an anchored search. Since the match doesn't start at the 1472 /// // beginning of the haystack, no match is found! 1473 /// let input = Input::new(hay).anchored(Anchored::Yes); 1474 /// assert_eq!(None, re.find(&mut cache, input)); 1475 /// 1476 /// // We can try an anchored search again, but move the location of where 1477 /// // we start the search. Note that the offsets reported are still in 1478 /// // terms of the overall haystack and not relative to where we started 1479 /// // the search. 1480 /// let input = Input::new(hay).anchored(Anchored::Yes).range(6..); 1481 /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); 1482 /// 1483 /// // Now try an anchored search for a specific pattern. We specifically 1484 /// // choose a pattern that we know doesn't match to prove that the search 1485 /// // only looks for the pattern we provide. 1486 /// let input = Input::new(hay) 1487 /// .anchored(Anchored::Pattern(PatternID::must(1))) 1488 /// .range(6..); 1489 /// assert_eq!(None, re.find(&mut cache, input)); 1490 /// 1491 /// // But if we switch it to the pattern that we know matches, then we find 1492 /// // the match. 1493 /// let input = Input::new(hay) 1494 /// .anchored(Anchored::Pattern(PatternID::must(2))) 1495 /// .range(6..); 1496 /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input)); 1497 /// 1498 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1499 /// ``` 1500 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 1501 pub enum Anchored { 1502 /// Run an unanchored search. This means a match may occur anywhere at or 1503 /// after the start position of the search. 1504 /// 1505 /// This search can return a match for any pattern in the regex. 1506 No, 1507 /// Run an anchored search. This means that a match must begin at the 1508 /// start position of the search. 1509 /// 1510 /// This search can return a match for any pattern in the regex. 1511 Yes, 1512 /// Run an anchored search for a specific pattern. This means that a match 1513 /// must be for the given pattern and must begin at the start position of 1514 /// the search. 1515 Pattern(PatternID), 1516 } 1517 1518 impl Anchored { 1519 /// Returns true if and only if this anchor mode corresponds to any kind of 1520 /// anchored search. 1521 /// 1522 /// # Example 1523 /// 1524 /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern` 1525 /// are considered anchored searches. 1526 /// 1527 /// ``` 1528 /// use regex_automata::{Anchored, PatternID}; 1529 /// 1530 /// assert!(!Anchored::No.is_anchored()); 1531 /// assert!(Anchored::Yes.is_anchored()); 1532 /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored()); 1533 /// ``` 1534 #[inline] is_anchored(&self) -> bool1535 pub fn is_anchored(&self) -> bool { 1536 matches!(*self, Anchored::Yes | Anchored::Pattern(_)) 1537 } 1538 1539 /// Returns the pattern ID associated with this configuration if it is an 1540 /// anchored search for a specific pattern. Otherwise `None` is returned. 1541 /// 1542 /// # Example 1543 /// 1544 /// ``` 1545 /// use regex_automata::{Anchored, PatternID}; 1546 /// 1547 /// assert_eq!(None, Anchored::No.pattern()); 1548 /// assert_eq!(None, Anchored::Yes.pattern()); 1549 /// 1550 /// let pid = PatternID::must(5); 1551 /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern()); 1552 /// ``` 1553 #[inline] pattern(&self) -> Option<PatternID>1554 pub fn pattern(&self) -> Option<PatternID> { 1555 match *self { 1556 Anchored::Pattern(pid) => Some(pid), 1557 _ => None, 1558 } 1559 } 1560 } 1561 1562 /// The kind of match semantics to use for a regex pattern. 1563 /// 1564 /// The default match kind is `LeftmostFirst`, and this corresponds to the 1565 /// match semantics used by most backtracking engines, such as Perl. 1566 /// 1567 /// # Leftmost first or "preference order" match semantics 1568 /// 1569 /// Leftmost-first semantics determine which match to report when there are 1570 /// multiple paths through a regex that match at the same position. The tie is 1571 /// essentially broken by how a backtracker would behave. For example, consider 1572 /// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this 1573 /// case, both the `foofoo` and `foo` branches match at position `0`. So should 1574 /// the end of the match be `3` or `6`? 1575 /// 1576 /// A backtracker will conceptually work by trying `foofoofoo` and failing. 1577 /// Then it will try `foofoo`, find the match and stop there. Thus, the 1578 /// leftmost-first match position is `6`. This is called "leftmost-first" or 1579 /// "preference order" because the order of the branches as written in the 1580 /// regex pattern is what determines how to break the tie. 1581 /// 1582 /// (Note that leftmost-longest match semantics, which break ties by always 1583 /// taking the longest matching string, are not currently supported by this 1584 /// crate. These match semantics tend to be found in POSIX regex engines.) 1585 /// 1586 /// This example shows how leftmost-first semantics work, and how it even 1587 /// applies to multi-pattern regexes: 1588 /// 1589 /// ``` 1590 /// use regex_automata::{ 1591 /// nfa::thompson::pikevm::PikeVM, 1592 /// Match, 1593 /// }; 1594 /// 1595 /// let re = PikeVM::new_many(&[ 1596 /// r"foofoofoo", 1597 /// r"foofoo", 1598 /// r"foo", 1599 /// ])?; 1600 /// let mut cache = re.create_cache(); 1601 /// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect(); 1602 /// let expected = vec![Match::must(1, 0..6)]; 1603 /// assert_eq!(expected, got); 1604 /// 1605 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1606 /// ``` 1607 /// 1608 /// # All matches 1609 /// 1610 /// The `All` match semantics report any and all matches, and generally will 1611 /// attempt to match as much as possible. It doesn't respect any sort of match 1612 /// priority at all, so things like non-greedy matching don't work in this 1613 /// mode. 1614 /// 1615 /// The fact that non-greedy matching doesn't work generally makes most forms 1616 /// of unanchored non-overlapping searches have unintuitive behavior. Namely, 1617 /// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the 1618 /// beginning of the pattern, which is specifically non-greedy. Since it will 1619 /// be treated as greedy in `All` match semantics, this generally means that 1620 /// it will first attempt to consume all of the haystack and is likely to wind 1621 /// up skipping matches. 1622 /// 1623 /// Generally speaking, `All` should only be used in two circumstances: 1624 /// 1625 /// * When running an anchored search and there is a desire to match as much as 1626 /// possible. For example, when building a reverse regex matcher to find the 1627 /// start of a match after finding the end. In this case, the reverse search 1628 /// is anchored to the end of the match found by the forward search. 1629 /// * When running overlapping searches. Since `All` encodes all possible 1630 /// matches, this is generally what you want for an overlapping search. If you 1631 /// try to use leftmost-first in an overlapping search, it is likely to produce 1632 /// counter-intuitive results since leftmost-first specifically excludes some 1633 /// matches from its underlying finite state machine. 1634 /// 1635 /// This example demonstrates the counter-intuitive behavior of `All` semantics 1636 /// when using a standard leftmost unanchored search: 1637 /// 1638 /// ``` 1639 /// use regex_automata::{ 1640 /// nfa::thompson::pikevm::PikeVM, 1641 /// Match, MatchKind, 1642 /// }; 1643 /// 1644 /// let re = PikeVM::builder() 1645 /// .configure(PikeVM::config().match_kind(MatchKind::All)) 1646 /// .build("foo")?; 1647 /// let hay = "first foo second foo wat"; 1648 /// let mut cache = re.create_cache(); 1649 /// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); 1650 /// // Notice that it completely skips the first 'foo'! 1651 /// let expected = vec![Match::must(0, 17..20)]; 1652 /// assert_eq!(expected, got); 1653 /// 1654 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1655 /// ``` 1656 /// 1657 /// This second example shows how `All` semantics are useful for an overlapping 1658 /// search. Note that we use lower level lazy DFA APIs here since the NFA 1659 /// engines only currently support a very limited form of overlapping search. 1660 /// 1661 /// ``` 1662 /// use regex_automata::{ 1663 /// hybrid::dfa::{DFA, OverlappingState}, 1664 /// HalfMatch, Input, MatchKind, 1665 /// }; 1666 /// 1667 /// let re = DFA::builder() 1668 /// // If we didn't set 'All' semantics here, then the regex would only 1669 /// // match 'foo' at offset 3 and nothing else. Why? Because the state 1670 /// // machine implements preference order and knows that the 'foofoo' and 1671 /// // 'foofoofoo' branches can never match since 'foo' will always match 1672 /// // when they match and take priority. 1673 /// .configure(DFA::config().match_kind(MatchKind::All)) 1674 /// .build(r"foo|foofoo|foofoofoo")?; 1675 /// let mut cache = re.create_cache(); 1676 /// let mut state = OverlappingState::start(); 1677 /// let input = Input::new("foofoofoo"); 1678 /// let mut got = vec![]; 1679 /// loop { 1680 /// re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?; 1681 /// let m = match state.get_match() { 1682 /// None => break, 1683 /// Some(m) => m, 1684 /// }; 1685 /// got.push(m); 1686 /// } 1687 /// let expected = vec![ 1688 /// HalfMatch::must(0, 3), 1689 /// HalfMatch::must(0, 6), 1690 /// HalfMatch::must(0, 9), 1691 /// ]; 1692 /// assert_eq!(expected, got); 1693 /// 1694 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1695 /// ``` 1696 #[non_exhaustive] 1697 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 1698 pub enum MatchKind { 1699 /// Report all possible matches. 1700 All, 1701 /// Report only the leftmost matches. When multiple leftmost matches exist, 1702 /// report the match corresponding to the part of the regex that appears 1703 /// first in the syntax. 1704 LeftmostFirst, 1705 // There is prior art in RE2 that shows that we should be able to add 1706 // LeftmostLongest too. The tricky part of it is supporting ungreedy 1707 // repetitions. Instead of treating all NFA states as having equivalent 1708 // priority (as in 'All') or treating all NFA states as having distinct 1709 // priority based on order (as in 'LeftmostFirst'), we instead group NFA 1710 // states into sets, and treat members of each set as having equivalent 1711 // priority, but having greater priority than all following members 1712 // of different sets. 1713 // 1714 // However, it's not clear whether it's really worth adding this. After 1715 // all, leftmost-longest can be emulated when using literals by using 1716 // leftmost-first and sorting the literals by length in descending order. 1717 // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will 1718 // always match `a` in `ab` when using leftmost-first, but leftmost-longest 1719 // would match `ab`. 1720 } 1721 1722 impl MatchKind { 1723 #[cfg(feature = "alloc")] continue_past_first_match(&self) -> bool1724 pub(crate) fn continue_past_first_match(&self) -> bool { 1725 *self == MatchKind::All 1726 } 1727 } 1728 1729 impl Default for MatchKind { default() -> MatchKind1730 fn default() -> MatchKind { 1731 MatchKind::LeftmostFirst 1732 } 1733 } 1734 1735 /// An error indicating that a search stopped before reporting whether a 1736 /// match exists or not. 1737 /// 1738 /// To be very clear, this error type implies that one cannot assume that no 1739 /// matches occur, since the search stopped before completing. That is, if 1740 /// you're looking for information about where a search determined that no 1741 /// match can occur, then this error type does *not* give you that. (Indeed, at 1742 /// the time of writing, if you need such a thing, you have to write your own 1743 /// search routine.) 1744 /// 1745 /// Normally, when one searches for something, the response is either an 1746 /// affirmative "it was found at this location" or a negative "not found at 1747 /// all." However, in some cases, a regex engine can be configured to stop its 1748 /// search before concluding whether a match exists or not. When this happens, 1749 /// it may be important for the caller to know why the regex engine gave up and 1750 /// where in the input it gave up at. This error type exposes the 'why' and the 1751 /// 'where.' 1752 /// 1753 /// For example, the DFAs provided by this library generally cannot correctly 1754 /// implement Unicode word boundaries. Instead, they provide an option to 1755 /// eagerly support them on ASCII text (since Unicode word boundaries are 1756 /// equivalent to ASCII word boundaries when searching ASCII text), but will 1757 /// "give up" if a non-ASCII byte is seen. In such cases, one is usually 1758 /// required to either report the failure to the caller (unergonomic) or 1759 /// otherwise fall back to some other regex engine (ergonomic, but potentially 1760 /// costly). 1761 /// 1762 /// More generally, some regex engines offer the ability for callers to specify 1763 /// certain bytes that will trigger the regex engine to automatically quit if 1764 /// they are seen. 1765 /// 1766 /// Still yet, there may be other reasons for a failed match. For example, 1767 /// the hybrid DFA provided by this crate can be configured to give up if it 1768 /// believes that it is not efficient. This in turn permits callers to choose a 1769 /// different regex engine. 1770 /// 1771 /// (Note that DFAs are configured by default to never quit or give up in this 1772 /// fashion. For example, by default, a DFA will fail to build if the regex 1773 /// pattern contains a Unicode word boundary. One needs to opt into the "quit" 1774 /// behavior via options, like 1775 /// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) 1776 /// 1777 /// There are a couple other ways a search 1778 /// can fail. For example, when using the 1779 /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) 1780 /// with a haystack that is too long, or trying to run an unanchored search 1781 /// with a [one-pass DFA](crate::dfa::onepass). 1782 #[derive(Clone, Debug, Eq, PartialEq)] 1783 pub struct MatchError( 1784 #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>, 1785 #[cfg(not(feature = "alloc"))] MatchErrorKind, 1786 ); 1787 1788 impl MatchError { 1789 /// Create a new error value with the given kind. 1790 /// 1791 /// This is a more verbose version of the kind-specific constructors, 1792 /// e.g., `MatchError::quit`. new(kind: MatchErrorKind) -> MatchError1793 pub fn new(kind: MatchErrorKind) -> MatchError { 1794 #[cfg(feature = "alloc")] 1795 { 1796 MatchError(alloc::boxed::Box::new(kind)) 1797 } 1798 #[cfg(not(feature = "alloc"))] 1799 { 1800 MatchError(kind) 1801 } 1802 } 1803 1804 /// Returns a reference to the underlying error kind. kind(&self) -> &MatchErrorKind1805 pub fn kind(&self) -> &MatchErrorKind { 1806 &self.0 1807 } 1808 1809 /// Create a new "quit" error. The given `byte` corresponds to the value 1810 /// that tripped a search's quit condition, and `offset` corresponds to the 1811 /// location in the haystack at which the search quit. 1812 /// 1813 /// This is the same as calling `MatchError::new` with a 1814 /// [`MatchErrorKind::Quit`] kind. quit(byte: u8, offset: usize) -> MatchError1815 pub fn quit(byte: u8, offset: usize) -> MatchError { 1816 MatchError::new(MatchErrorKind::Quit { byte, offset }) 1817 } 1818 1819 /// Create a new "gave up" error. The given `offset` corresponds to the 1820 /// location in the haystack at which the search gave up. 1821 /// 1822 /// This is the same as calling `MatchError::new` with a 1823 /// [`MatchErrorKind::GaveUp`] kind. gave_up(offset: usize) -> MatchError1824 pub fn gave_up(offset: usize) -> MatchError { 1825 MatchError::new(MatchErrorKind::GaveUp { offset }) 1826 } 1827 1828 /// Create a new "haystack too long" error. The given `len` corresponds to 1829 /// the length of the haystack that was problematic. 1830 /// 1831 /// This is the same as calling `MatchError::new` with a 1832 /// [`MatchErrorKind::HaystackTooLong`] kind. haystack_too_long(len: usize) -> MatchError1833 pub fn haystack_too_long(len: usize) -> MatchError { 1834 MatchError::new(MatchErrorKind::HaystackTooLong { len }) 1835 } 1836 1837 /// Create a new "unsupported anchored" error. This occurs when the caller 1838 /// requests a search with an anchor mode that is not supported by the 1839 /// regex engine. 1840 /// 1841 /// This is the same as calling `MatchError::new` with a 1842 /// [`MatchErrorKind::UnsupportedAnchored`] kind. unsupported_anchored(mode: Anchored) -> MatchError1843 pub fn unsupported_anchored(mode: Anchored) -> MatchError { 1844 MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) 1845 } 1846 } 1847 1848 /// The underlying kind of a [`MatchError`]. 1849 /// 1850 /// This is a **non-exhaustive** enum. That means new variants may be added in 1851 /// a semver-compatible release. 1852 #[non_exhaustive] 1853 #[derive(Clone, Debug, Eq, PartialEq)] 1854 pub enum MatchErrorKind { 1855 /// The search saw a "quit" byte at which it was instructed to stop 1856 /// searching. 1857 Quit { 1858 /// The "quit" byte that was observed that caused the search to stop. 1859 byte: u8, 1860 /// The offset at which the quit byte was observed. 1861 offset: usize, 1862 }, 1863 /// The search, based on heuristics, determined that it would be better 1864 /// to stop, typically to provide the caller an opportunity to use an 1865 /// alternative regex engine. 1866 /// 1867 /// Currently, the only way for this to occur is via the lazy DFA and 1868 /// only when it is configured to do so (it will not return this error by 1869 /// default). 1870 GaveUp { 1871 /// The offset at which the search stopped. This corresponds to the 1872 /// position immediately following the last byte scanned. 1873 offset: usize, 1874 }, 1875 /// This error occurs if the haystack given to the regex engine was too 1876 /// long to be searched. This occurs, for example, with regex engines 1877 /// like the bounded backtracker that have a configurable fixed amount of 1878 /// capacity that is tied to the length of the haystack. Anything beyond 1879 /// that configured limit will result in an error at search time. 1880 HaystackTooLong { 1881 /// The length of the haystack that exceeded the limit. 1882 len: usize, 1883 }, 1884 /// An error indicating that a particular type of anchored search was 1885 /// requested, but that the regex engine does not support it. 1886 /// 1887 /// Note that this error should not be returned by a regex engine simply 1888 /// because the pattern ID is invalid (i.e., equal to or exceeds the number 1889 /// of patterns in the regex). In that case, the regex engine should report 1890 /// a non-match. 1891 UnsupportedAnchored { 1892 /// The anchored mode given that is unsupported. 1893 mode: Anchored, 1894 }, 1895 } 1896 1897 #[cfg(feature = "std")] 1898 impl std::error::Error for MatchError {} 1899 1900 impl core::fmt::Display for MatchError { fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result1901 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 1902 match *self.kind() { 1903 MatchErrorKind::Quit { byte, offset } => write!( 1904 f, 1905 "quit search after observing byte {:?} at offset {}", 1906 DebugByte(byte), 1907 offset, 1908 ), 1909 MatchErrorKind::GaveUp { offset } => { 1910 write!(f, "gave up searching at offset {}", offset) 1911 } 1912 MatchErrorKind::HaystackTooLong { len } => { 1913 write!(f, "haystack of length {} is too long", len) 1914 } 1915 MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => { 1916 write!(f, "anchored searches are not supported or enabled") 1917 } 1918 MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => { 1919 write!(f, "unanchored searches are not supported or enabled") 1920 } 1921 MatchErrorKind::UnsupportedAnchored { 1922 mode: Anchored::Pattern(pid), 1923 } => { 1924 write!( 1925 f, 1926 "anchored searches for a specific pattern ({}) are \ 1927 not supported or enabled", 1928 pid.as_usize(), 1929 ) 1930 } 1931 } 1932 } 1933 } 1934 1935 #[cfg(test)] 1936 mod tests { 1937 use super::*; 1938 1939 // We test that our 'MatchError' type is the size we expect. This isn't an 1940 // API guarantee, but if the size increases, we really want to make sure we 1941 // decide to do that intentionally. So this should be a speed bump. And in 1942 // general, we should not increase the size without a very good reason. 1943 // 1944 // Why? Because low level search APIs return Result<.., MatchError>. When 1945 // MatchError gets bigger, so to does the Result type. 1946 // 1947 // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes 1948 // the importance of keeping a small error type. But without 'alloc', we 1949 // still want things to be small. 1950 #[test] match_error_size()1951 fn match_error_size() { 1952 let expected_size = if cfg!(feature = "alloc") { 1953 core::mem::size_of::<usize>() 1954 } else { 1955 2 * core::mem::size_of::<usize>() 1956 }; 1957 assert_eq!(expected_size, core::mem::size_of::<MatchError>()); 1958 } 1959 1960 // Same as above, but for the underlying match error kind. 1961 #[cfg(target_pointer_width = "64")] 1962 #[test] match_error_kind_size()1963 fn match_error_kind_size() { 1964 let expected_size = 2 * core::mem::size_of::<usize>(); 1965 assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); 1966 } 1967 1968 #[cfg(target_pointer_width = "32")] 1969 #[test] match_error_kind_size()1970 fn match_error_kind_size() { 1971 let expected_size = 3 * core::mem::size_of::<usize>(); 1972 assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>()); 1973 } 1974 1975 #[test] incorrect_asref_guard()1976 fn incorrect_asref_guard() { 1977 struct Bad(std::cell::Cell<bool>); 1978 1979 impl AsRef<[u8]> for Bad { 1980 fn as_ref(&self) -> &[u8] { 1981 if self.0.replace(false) { 1982 &[] 1983 } else { 1984 &[0; 1000] 1985 } 1986 } 1987 } 1988 1989 let bad = Bad(std::cell::Cell::new(true)); 1990 let input = Input::new(&bad); 1991 assert!(input.end() <= input.haystack().len()); 1992 } 1993 } 1994