1 /*!
2 Types and routines for working with look-around assertions.
3 
4 This module principally defines three types:
5 
6 * [`Look`] enumerates all of the assertions supported by this crate.
7 * [`LookSet`] provides a way to efficiently store a set of [`Look`] values.
8 * [`LookMatcher`] provides routines for checking whether a `Look` or a
9 `LookSet` matches at a particular position in a haystack.
10 */
11 
12 // LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` was basically
13 // copied verbatim from the regex-syntax crate. I would have no problems using
14 // the regex-syntax types and defining the matching routines (only found
15 // in this crate) as free functions, except the `Look` and `LookSet` types
16 // are used in lots of places. Including in places we expect to work when
17 // regex-syntax is *not* enabled, such as in the definition of the NFA itself.
18 //
19 // Thankfully the code we copy is pretty simple and there isn't much of it.
20 // Otherwise, the rest of this module deals with *matching* the assertions,
21 // which is not something that regex-syntax handles.
22 
23 use crate::util::{escape::DebugByte, utf8};
24 
25 /// A look-around assertion.
26 ///
27 /// An assertion matches at a position between characters in a haystack.
28 /// Namely, it does not actually "consume" any input as most parts of a regular
29 /// expression do. Assertions are a way of stating that some property must be
30 /// true at a particular point during matching.
31 ///
32 /// For example, `(?m)^[a-z]+$` is a pattern that:
33 ///
34 /// * Scans the haystack for a position at which `(?m:^)` is satisfied. That
35 /// occurs at either the beginning of the haystack, or immediately following
36 /// a `\n` character.
37 /// * Looks for one or more occurrences of `[a-z]`.
38 /// * Once `[a-z]+` has matched as much as it can, an overall match is only
39 /// reported when `[a-z]+` stops just before a `\n`.
40 ///
41 /// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.
42 ///
43 /// Assertions are also called "look-around," "look-behind" and "look-ahead."
44 /// Specifically, some assertions are look-behind (like `^`), other assertions
45 /// are look-ahead (like `$`) and yet other assertions are both look-ahead and
46 /// look-behind (like `\b`).
47 ///
48 /// # Assertions in an NFA
49 ///
50 /// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be
51 /// thought of as a conditional epsilon transition. That is, a matching engine
52 /// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits
53 /// moving through conditional epsilon transitions when their condition
54 /// is satisfied at whatever position the `PikeVM` is currently at in the
55 /// haystack.
56 ///
57 /// How assertions are handled in a `DFA` is trickier, since a DFA does not
58 /// have epsilon transitions at all. In this case, they are compiled into the
59 /// automaton itself, at the expense of more states than what would be required
60 /// without an assertion.
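///
/// # Example
///
/// A brief illustrative sketch, using the `LookMatcher` defined later in this
/// module to check a couple of assertions at specific positions:
///
/// ```
/// use regex_automata::util::look::{Look, LookMatcher};
///
/// let matcher = LookMatcher::new();
/// // `(?m:^)` matches at the start of the haystack and right after a `\n`.
/// assert!(matcher.matches(Look::StartLF, b"ab\ncd", 3));
/// // `\b` (ASCII) matches between a word byte and a non-word byte.
/// assert!(matcher.matches(Look::WordAscii, b"ab cd", 2));
/// ```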
61 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
62 pub enum Look {
63     /// Match the beginning of text. Specifically, this matches at the starting
64     /// position of the input.
65     Start = 1 << 0,
66     /// Match the end of text. Specifically, this matches at the ending
67     /// position of the input.
68     End = 1 << 1,
69     /// Match the beginning of a line or the beginning of text. Specifically,
70     /// this matches at the starting position of the input, or at the position
71     /// immediately following a `\n` character.
72     StartLF = 1 << 2,
73     /// Match the end of a line or the end of text. Specifically, this matches
74     /// at the end position of the input, or at the position immediately
75     /// preceding a `\n` character.
76     EndLF = 1 << 3,
77     /// Match the beginning of a line or the beginning of text. Specifically,
78     /// this matches at the starting position of the input, or at the position
79     /// immediately following either a `\r` or `\n` character, but never after
80     /// a `\r` when a `\n` follows.
81     StartCRLF = 1 << 4,
82     /// Match the end of a line or the end of text. Specifically, this matches
83     /// at the end position of the input, or at the position immediately
84     /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
85     /// precedes it.
86     EndCRLF = 1 << 5,
87     /// Match an ASCII-only word boundary. That is, this matches a position
88     /// where the left adjacent character and right adjacent character
89     /// correspond to a word and non-word or a non-word and word character.
90     WordAscii = 1 << 6,
91     /// Match an ASCII-only negation of a word boundary.
92     WordAsciiNegate = 1 << 7,
93     /// Match a Unicode-aware word boundary. That is, this matches a position
94     /// where the left adjacent character and right adjacent character
95     /// correspond to a word and non-word or a non-word and word character.
96     WordUnicode = 1 << 8,
97     /// Match a Unicode-aware negation of a word boundary.
98     WordUnicodeNegate = 1 << 9,
99     /// Match the start of an ASCII-only word boundary. That is, this matches a
100     /// position at either the beginning of the haystack or where the previous
101     /// character is not a word character and the following character is a word
102     /// character.
103     WordStartAscii = 1 << 10,
104     /// Match the end of an ASCII-only word boundary. That is, this matches
105     /// a position at either the end of the haystack or where the previous
106     /// character is a word character and the following character is not a word
107     /// character.
108     WordEndAscii = 1 << 11,
109     /// Match the start of a Unicode word boundary. That is, this matches a
110     /// position at either the beginning of the haystack or where the previous
111     /// character is not a word character and the following character is a word
112     /// character.
113     WordStartUnicode = 1 << 12,
114     /// Match the end of a Unicode word boundary. That is, this matches a
115     /// position at either the end of the haystack or where the previous
116     /// character is a word character and the following character is not a word
117     /// character.
118     WordEndUnicode = 1 << 13,
119     /// Match the start half of an ASCII-only word boundary. That is, this
120     /// matches a position at either the beginning of the haystack or where the
121     /// previous character is not a word character.
122     WordStartHalfAscii = 1 << 14,
123     /// Match the end half of an ASCII-only word boundary. That is, this
124     /// matches a position at either the end of the haystack or where the
125     /// following character is not a word character.
126     WordEndHalfAscii = 1 << 15,
127     /// Match the start half of a Unicode word boundary. That is, this matches
128     /// a position at either the beginning of the haystack or where the
129     /// previous character is not a word character.
130     WordStartHalfUnicode = 1 << 16,
131     /// Match the end half of a Unicode word boundary. That is, this matches
132     /// a position at either the end of the haystack or where the following
133     /// character is not a word character.
134     WordEndHalfUnicode = 1 << 17,
135 }
136 
137 impl Look {
138     /// Flip the look-around assertion to its equivalent for reverse searches.
139     /// For example, `StartLF` gets translated to `EndLF`.
140     ///
141     /// Some assertions, such as `WordUnicode`, remain the same since they
142     /// match the same positions regardless of the direction of the search.
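    ///
    /// # Example
    ///
    /// A brief sketch of the mapping implemented by the match arms below:
    ///
    /// ```
    /// use regex_automata::util::look::Look;
    ///
    /// assert_eq!(Look::End, Look::Start.reversed());
    /// assert_eq!(Look::EndLF, Look::StartLF.reversed());
    /// assert_eq!(Look::WordAscii, Look::WordAscii.reversed());
    /// ```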
143     #[inline]
144     pub const fn reversed(self) -> Look {
145         match self {
146             Look::Start => Look::End,
147             Look::End => Look::Start,
148             Look::StartLF => Look::EndLF,
149             Look::EndLF => Look::StartLF,
150             Look::StartCRLF => Look::EndCRLF,
151             Look::EndCRLF => Look::StartCRLF,
152             Look::WordAscii => Look::WordAscii,
153             Look::WordAsciiNegate => Look::WordAsciiNegate,
154             Look::WordUnicode => Look::WordUnicode,
155             Look::WordUnicodeNegate => Look::WordUnicodeNegate,
156             Look::WordStartAscii => Look::WordEndAscii,
157             Look::WordEndAscii => Look::WordStartAscii,
158             Look::WordStartUnicode => Look::WordEndUnicode,
159             Look::WordEndUnicode => Look::WordStartUnicode,
160             Look::WordStartHalfAscii => Look::WordEndHalfAscii,
161             Look::WordEndHalfAscii => Look::WordStartHalfAscii,
162             Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
163             Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
164         }
165     }
166 
167     /// Return the underlying representation of this look-around enumeration
168     /// as an integer. Giving the return value to the [`Look::from_repr`]
169     /// constructor is guaranteed to return the same look-around variant that
170     /// one started with within a semver compatible release of this crate.
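    ///
    /// # Example
    ///
    /// A small round-trip sketch using `from_repr`, defined just below:
    ///
    /// ```
    /// use regex_automata::util::look::Look;
    ///
    /// let repr = Look::StartLF.as_repr();
    /// assert_eq!(Some(Look::StartLF), Look::from_repr(repr));
    /// ```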
171     #[inline]
172     pub const fn as_repr(self) -> u32 {
173         // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
174         // actual int.
175         self as u32
176     }
177 
178     /// Given the underlying representation of a `Look` value, return the
179     /// corresponding `Look` value if the representation is valid. Otherwise
180     /// `None` is returned.
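    ///
    /// # Example
    ///
    /// A short sketch of both a valid and an invalid representation:
    ///
    /// ```
    /// use regex_automata::util::look::Look;
    ///
    /// assert_eq!(Some(Look::Start), Look::from_repr(1 << 0));
    /// // A value with more than one bit set is not a valid representation.
    /// assert_eq!(None, Look::from_repr(0b11));
    /// ```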
181     #[inline]
182     pub const fn from_repr(repr: u32) -> Option<Look> {
183         match repr {
184             0b00_0000_0000_0000_0001 => Some(Look::Start),
185             0b00_0000_0000_0000_0010 => Some(Look::End),
186             0b00_0000_0000_0000_0100 => Some(Look::StartLF),
187             0b00_0000_0000_0000_1000 => Some(Look::EndLF),
188             0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
189             0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
190             0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
191             0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
192             0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
193             0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
194             0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
195             0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
196             0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
197             0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
198             0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
199             0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
200             0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
201             0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
202             _ => None,
203         }
204     }
205 
206     /// Returns a convenient single codepoint representation of this
207     /// look-around assertion. Each assertion is guaranteed to be represented
208     /// by a distinct character.
209     ///
210     /// This is useful for representing a look-around assertion in human
211     /// friendly but succinct output intended for a programmer working on
212     /// regex internals.
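    ///
    /// # Example
    ///
    /// A brief sketch of a couple of the mappings defined below:
    ///
    /// ```
    /// use regex_automata::util::look::Look;
    ///
    /// assert_eq!('b', Look::WordAscii.as_char());
    /// assert_eq!('B', Look::WordAsciiNegate.as_char());
    /// ```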
213     #[inline]
214     pub const fn as_char(self) -> char {
215         match self {
216             Look::Start => 'A',
217             Look::End => 'z',
218             Look::StartLF => '^',
219             Look::EndLF => '$',
220             Look::StartCRLF => 'r',
221             Look::EndCRLF => 'R',
222             Look::WordAscii => 'b',
223             Look::WordAsciiNegate => 'B',
224             Look::WordUnicode => '𝛃',
225             Look::WordUnicodeNegate => '𝚩',
226             Look::WordStartAscii => '<',
227             Look::WordEndAscii => '>',
228             Look::WordStartUnicode => '〈',
229             Look::WordEndUnicode => '〉',
230             Look::WordStartHalfAscii => '◁',
231             Look::WordEndHalfAscii => '▷',
232             Look::WordStartHalfUnicode => '◀',
233             Look::WordEndHalfUnicode => '▶',
234         }
235     }
236 }
237 
238 /// LookSet is a memory-efficient set of look-around assertions.
239 ///
240 /// This is useful for efficiently tracking look-around assertions. For
241 /// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties
242 /// that return `LookSet`s.
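///
/// # Example
///
/// A brief sketch of basic set construction and membership queries:
///
/// ```
/// use regex_automata::util::look::{Look, LookSet};
///
/// let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordAscii);
/// assert!(set.contains(Look::StartLF));
/// assert!(!set.contains(Look::EndLF));
/// assert_eq!(2, set.len());
/// ```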
243 #[derive(Clone, Copy, Default, Eq, PartialEq)]
244 pub struct LookSet {
245     /// The underlying representation of this set is exposed to make it
246     /// possible to store it somewhere efficiently. The representation is that
247     /// of a bitset, where each assertion occupies bit `i` where
248     /// `i = Look::as_repr()`.
249     ///
250     /// Note that users of this internal representation must permit the full
251     /// range of `u32` values to be represented. For example, even if the
252     /// current implementation only makes use of the 18 least significant bits,
253     /// it may use more bits in a future semver compatible release.
254     pub bits: u32,
255 }
256 
257 impl LookSet {
258     /// Create an empty set of look-around assertions.
259     #[inline]
260     pub fn empty() -> LookSet {
261         LookSet { bits: 0 }
262     }
263 
264     /// Create a full set of look-around assertions.
265     ///
266     /// This set contains all possible look-around assertions.
267     #[inline]
268     pub fn full() -> LookSet {
269         LookSet { bits: !0 }
270     }
271 
272     /// Create a look-around set containing the look-around assertion given.
273     ///
274     /// This is a convenience routine for creating an empty set and inserting
275     /// one look-around assertion.
276     #[inline]
277     pub fn singleton(look: Look) -> LookSet {
278         LookSet::empty().insert(look)
279     }
280 
281     /// Returns the total number of look-around assertions in this set.
282     #[inline]
283     pub fn len(self) -> usize {
284         // OK because max value always fits in a u8, which in turn always
285         // fits in a usize, regardless of target.
286         usize::try_from(self.bits.count_ones()).unwrap()
287     }
288 
289     /// Returns true if and only if this set is empty.
290     #[inline]
291     pub fn is_empty(self) -> bool {
292         self.len() == 0
293     }
294 
295     /// Returns true if and only if the given look-around assertion is in this
296     /// set.
297     #[inline]
298     pub fn contains(self, look: Look) -> bool {
299         self.bits & look.as_repr() != 0
300     }
301 
302     /// Returns true if and only if this set contains any anchor assertions.
303     /// This includes both "start/end of haystack" and "start/end of line."
304     #[inline]
305     pub fn contains_anchor(&self) -> bool {
306         self.contains_anchor_haystack() || self.contains_anchor_line()
307     }
308 
309     /// Returns true if and only if this set contains any "start/end of
310     /// haystack" anchors. This doesn't include "start/end of line" anchors.
311     #[inline]
312     pub fn contains_anchor_haystack(&self) -> bool {
313         self.contains(Look::Start) || self.contains(Look::End)
314     }
315 
316     /// Returns true if and only if this set contains any "start/end of line"
317     /// anchors. This doesn't include "start/end of haystack" anchors. This
318     /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
319     #[inline]
320     pub fn contains_anchor_line(&self) -> bool {
321         self.contains(Look::StartLF)
322             || self.contains(Look::EndLF)
323             || self.contains(Look::StartCRLF)
324             || self.contains(Look::EndCRLF)
325     }
326 
327     /// Returns true if and only if this set contains any "start/end of line"
328     /// anchors that only treat `\n` as line terminators. This does not include
329     /// haystack anchors or CRLF aware line anchors.
330     #[inline]
331     pub fn contains_anchor_lf(&self) -> bool {
332         self.contains(Look::StartLF) || self.contains(Look::EndLF)
333     }
334 
335     /// Returns true if and only if this set contains any "start/end of line"
336     /// anchors that are CRLF-aware. This doesn't include "start/end of
337     /// haystack" or "start/end of line-feed" anchors.
338     #[inline]
339     pub fn contains_anchor_crlf(&self) -> bool {
340         self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
341     }
342 
343     /// Returns true if and only if this set contains any word boundary or
344     /// negated word boundary assertions. This includes both Unicode and ASCII
345     /// word boundaries.
346     #[inline]
347     pub fn contains_word(self) -> bool {
348         self.contains_word_unicode() || self.contains_word_ascii()
349     }
350 
351     /// Returns true if and only if this set contains any Unicode word boundary
352     /// or negated Unicode word boundary assertions.
353     #[inline]
354     pub fn contains_word_unicode(self) -> bool {
355         self.contains(Look::WordUnicode)
356             || self.contains(Look::WordUnicodeNegate)
357             || self.contains(Look::WordStartUnicode)
358             || self.contains(Look::WordEndUnicode)
359             || self.contains(Look::WordStartHalfUnicode)
360             || self.contains(Look::WordEndHalfUnicode)
361     }
362 
363     /// Returns true if and only if this set contains any ASCII word boundary
364     /// or negated ASCII word boundary assertions.
365     #[inline]
366     pub fn contains_word_ascii(self) -> bool {
367         self.contains(Look::WordAscii)
368             || self.contains(Look::WordAsciiNegate)
369             || self.contains(Look::WordStartAscii)
370             || self.contains(Look::WordEndAscii)
371             || self.contains(Look::WordStartHalfAscii)
372             || self.contains(Look::WordEndHalfAscii)
373     }
374 
375     /// Returns an iterator over all of the look-around assertions in this set.
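    ///
    /// # Example
    ///
    /// A brief sketch; assertions are yielded in order of their bit position:
    ///
    /// ```
    /// use regex_automata::util::look::{Look, LookSet};
    ///
    /// let set = LookSet::singleton(Look::Start).insert(Look::End);
    /// let looks: Vec<Look> = set.iter().collect();
    /// assert_eq!(vec![Look::Start, Look::End], looks);
    /// ```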
376     #[inline]
377     pub fn iter(self) -> LookSetIter {
378         LookSetIter { set: self }
379     }
380 
381     /// Return a new set that is equivalent to the original, but with the given
382     /// assertion added to it. If the assertion is already in the set, then the
383     /// returned set is equivalent to the original.
384     #[inline]
385     pub fn insert(self, look: Look) -> LookSet {
386         LookSet { bits: self.bits | look.as_repr() }
387     }
388 
389     /// Updates this set in place with the result of inserting the given
390     /// assertion into this set.
391     #[inline]
392     pub fn set_insert(&mut self, look: Look) {
393         *self = self.insert(look);
394     }
395 
396     /// Return a new set that is equivalent to the original, but with the given
397     /// assertion removed from it. If the assertion is not in the set, then the
398     /// returned set is equivalent to the original.
399     #[inline]
400     pub fn remove(self, look: Look) -> LookSet {
401         LookSet { bits: self.bits & !look.as_repr() }
402     }
403 
404     /// Updates this set in place with the result of removing the given
405     /// assertion from this set.
406     #[inline]
407     pub fn set_remove(&mut self, look: Look) {
408         *self = self.remove(look);
409     }
410 
411     /// Returns a new set that is the result of subtracting the given set from
412     /// this set.
413     #[inline]
414     pub fn subtract(self, other: LookSet) -> LookSet {
415         LookSet { bits: self.bits & !other.bits }
416     }
417 
418     /// Updates this set in place with the result of subtracting the given set
419     /// from this set.
420     #[inline]
421     pub fn set_subtract(&mut self, other: LookSet) {
422         *self = self.subtract(other);
423     }
424 
425     /// Returns a new set that is the union of this and the one given.
426     #[inline]
427     pub fn union(self, other: LookSet) -> LookSet {
428         LookSet { bits: self.bits | other.bits }
429     }
430 
431     /// Updates this set in place with the result of unioning it with the one
432     /// given.
433     #[inline]
434     pub fn set_union(&mut self, other: LookSet) {
435         *self = self.union(other);
436     }
437 
438     /// Returns a new set that is the intersection of this and the one given.
439     #[inline]
440     pub fn intersect(self, other: LookSet) -> LookSet {
441         LookSet { bits: self.bits & other.bits }
442     }
443 
444     /// Updates this set in place with the result of intersecting it with the
445     /// one given.
446     #[inline]
447     pub fn set_intersect(&mut self, other: LookSet) {
448         *self = self.intersect(other);
449     }
450 
451     /// Return a `LookSet` from the slice given as a native endian 32-bit
452     /// integer.
453     ///
454     /// # Panics
455     ///
456     /// This panics if `slice.len() < 4`.
457     #[inline]
458     pub fn read_repr(slice: &[u8]) -> LookSet {
459         let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
460         LookSet { bits }
461     }
462 
463     /// Write a `LookSet` as a native endian 32-bit integer to the beginning
464     /// of the slice given.
465     ///
466     /// # Panics
467     ///
468     /// This panics if `slice.len() < 4`.
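    ///
    /// # Example
    ///
    /// A small round-trip sketch with `read_repr`:
    ///
    /// ```
    /// use regex_automata::util::look::{Look, LookSet};
    ///
    /// let set = LookSet::singleton(Look::WordUnicode);
    /// let mut buf = [0u8; 4];
    /// set.write_repr(&mut buf);
    /// assert_eq!(set, LookSet::read_repr(&buf));
    /// ```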
469     #[inline]
470     pub fn write_repr(self, slice: &mut [u8]) {
471         let raw = self.bits.to_ne_bytes();
472         slice[0] = raw[0];
473         slice[1] = raw[1];
474         slice[2] = raw[2];
475         slice[3] = raw[3];
476     }
477 
478     /// Checks that all assertions in this set can be matched.
479     ///
480     /// Some assertions, such as Unicode word boundaries, require optional (but
481     /// enabled by default) tables that may not be available. If there are
482     /// assertions in this set that require tables that are not available, then
483     /// this will return an error.
484     ///
485     /// Specifically, this returns an error when the
486     /// `unicode-word-boundary` feature is _not_ enabled _and_ this set
487     /// contains a Unicode word boundary assertion.
488     ///
489     /// It can be useful to use this on the result of
490     /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)
491     /// when building a matcher engine to ensure methods like
492     /// [`LookMatcher::matches_set`] do not panic at search time.
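    ///
    /// # Example
    ///
    /// A brief sketch. ASCII-only word boundaries never require the optional
    /// Unicode tables, so this is always `Ok` for them:
    ///
    /// ```
    /// use regex_automata::util::look::{Look, LookSet};
    ///
    /// let set = LookSet::singleton(Look::WordAscii).insert(Look::StartLF);
    /// assert!(set.available().is_ok());
    /// ```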
493     pub fn available(self) -> Result<(), UnicodeWordBoundaryError> {
494         if self.contains_word_unicode() {
495             UnicodeWordBoundaryError::check()?;
496         }
497         Ok(())
498     }
499 }
500 
501 impl core::fmt::Debug for LookSet {
502     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
503         if self.is_empty() {
504             return write!(f, "∅");
505         }
506         for look in self.iter() {
507             write!(f, "{}", look.as_char())?;
508         }
509         Ok(())
510     }
511 }
512 
513 /// An iterator over all look-around assertions in a [`LookSet`].
514 ///
515 /// This iterator is created by [`LookSet::iter`].
516 #[derive(Clone, Debug)]
517 pub struct LookSetIter {
518     set: LookSet,
519 }
520 
521 impl Iterator for LookSetIter {
522     type Item = Look;
523 
524     #[inline]
525     fn next(&mut self) -> Option<Look> {
526         if self.set.is_empty() {
527             return None;
528         }
529         // We'll never have more than u8::MAX distinct look-around assertions,
530         // so 'bit' will always fit into a u16.
531         let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
532         let look = Look::from_repr(1 << bit)?;
533         self.set = self.set.remove(look);
534         Some(look)
535     }
536 }
537 
538 /// A matcher for look-around assertions.
539 ///
540 /// This matcher permits configuring aspects of how look-around assertions are
541 /// matched.
542 ///
543 /// # Example
544 ///
545 /// A `LookMatcher` can change the line terminator used for matching multi-line
546 /// anchors such as `(?m:^)` and `(?m:$)`.
547 ///
548 /// ```
549 /// use regex_automata::{
550 ///     nfa::thompson::{self, pikevm::PikeVM},
551 ///     util::look::LookMatcher,
552 ///     Match, Input,
553 /// };
554 ///
555 /// let mut lookm = LookMatcher::new();
556 /// lookm.set_line_terminator(b'\x00');
557 ///
558 /// let re = PikeVM::builder()
559 ///     .thompson(thompson::Config::new().look_matcher(lookm))
560 ///     .build(r"(?m)^[a-z]+$")?;
561 /// let mut cache = re.create_cache();
562 ///
563 /// // Multi-line assertions now use NUL as a terminator.
564 /// assert_eq!(
565 ///     Some(Match::must(0, 1..4)),
566 ///     re.find(&mut cache, b"\x00abc\x00"),
567 /// );
568 /// // ... and \n is no longer recognized as a terminator.
569 /// assert_eq!(
570 ///     None,
571 ///     re.find(&mut cache, b"\nabc\n"),
572 /// );
573 ///
574 /// # Ok::<(), Box<dyn std::error::Error>>(())
575 /// ```
576 #[derive(Clone, Debug)]
577 pub struct LookMatcher {
578     lineterm: DebugByte,
579 }
580 
581 impl LookMatcher {
582     /// Creates a new default matcher for look-around assertions.
583     pub fn new() -> LookMatcher {
584         LookMatcher { lineterm: DebugByte(b'\n') }
585     }
586 
587     /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.
588     ///
589     /// Namely, instead of `^` matching after `\n` and `$` matching immediately
590     /// before a `\n`, this will cause it to match after and before the byte
591     /// given.
592     ///
593     /// It can occasionally be useful to use this to configure the line
594     /// terminator to the NUL byte when searching binary data.
595     ///
596     /// Note that this does not apply to CRLF-aware line anchors such as
597     /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to
598     /// use `\r` and `\n`.
599     pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher {
600         self.lineterm.0 = byte;
601         self
602     }
603 
604     /// Returns the line terminator that was configured for this matcher.
605     ///
606     /// If no line terminator was configured, then this returns `\n`.
607     ///
608     /// Note that the line terminator should only be used for matching `(?m:^)`
609     /// and `(?m:$)` assertions. It specifically should _not_ be used for
610     /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`.
611     pub fn get_line_terminator(&self) -> u8 {
612         self.lineterm.0
613     }
614 
615     /// Returns true when the position `at` in `haystack` satisfies the given
616     /// look-around assertion.
617     ///
618     /// # Panics
619     ///
620     /// This panics when testing any Unicode word boundary assertion in this
621     /// set and when the Unicode word data is not available. Specifically, this
622     /// only occurs when the `unicode-word-boundary` feature is not enabled.
623     ///
624     /// Since it's generally expected that this routine is called inside of
625     /// a matching engine, callers should check the error condition when
626     /// building the matching engine. If there is a Unicode word boundary
627     /// in the matcher and the data isn't available, then the matcher should
628     /// fail to build.
629     ///
630     /// Callers can check the error condition with [`LookSet::available`].
631     ///
632     /// This also may panic when `at > haystack.len()`. Note that `at ==
633     /// haystack.len()` is legal and guaranteed not to panic.
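    ///
    /// # Example
    ///
    /// A brief sketch with assertions that never require Unicode tables:
    ///
    /// ```
    /// use regex_automata::util::look::{Look, LookMatcher};
    ///
    /// let matcher = LookMatcher::new();
    /// assert!(matcher.matches(Look::Start, b"abc", 0));
    /// assert!(!matcher.matches(Look::Start, b"abc", 1));
    /// assert!(matcher.matches(Look::End, b"abc", 3));
    /// ```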
634     #[inline]
635     pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool {
636         self.matches_inline(look, haystack, at)
637     }
638 
639     /// Like `matches`, but forcefully inlined.
640     ///
641     /// # Panics
642     ///
643     /// This panics when testing any Unicode word boundary assertion in this
644     /// set and when the Unicode word data is not available. Specifically, this
645     /// only occurs when the `unicode-word-boundary` feature is not enabled.
646     ///
647     /// Since it's generally expected that this routine is called inside of
648     /// a matching engine, callers should check the error condition when
649     /// building the matching engine. If there is a Unicode word boundary
650     /// in the matcher and the data isn't available, then the matcher should
651     /// fail to build.
652     ///
653     /// Callers can check the error condition with [`LookSet::available`].
654     ///
655     /// This also may panic when `at > haystack.len()`. Note that `at ==
656     /// haystack.len()` is legal and guaranteed not to panic.
657     #[cfg_attr(feature = "perf-inline", inline(always))]
658     pub(crate) fn matches_inline(
659         &self,
660         look: Look,
661         haystack: &[u8],
662         at: usize,
663     ) -> bool {
664         match look {
665             Look::Start => self.is_start(haystack, at),
666             Look::End => self.is_end(haystack, at),
667             Look::StartLF => self.is_start_lf(haystack, at),
668             Look::EndLF => self.is_end_lf(haystack, at),
669             Look::StartCRLF => self.is_start_crlf(haystack, at),
670             Look::EndCRLF => self.is_end_crlf(haystack, at),
671             Look::WordAscii => self.is_word_ascii(haystack, at),
672             Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at),
673             Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(),
674             Look::WordUnicodeNegate => {
675                 self.is_word_unicode_negate(haystack, at).unwrap()
676             }
677             Look::WordStartAscii => self.is_word_start_ascii(haystack, at),
678             Look::WordEndAscii => self.is_word_end_ascii(haystack, at),
679             Look::WordStartUnicode => {
680                 self.is_word_start_unicode(haystack, at).unwrap()
681             }
682             Look::WordEndUnicode => {
683                 self.is_word_end_unicode(haystack, at).unwrap()
684             }
685             Look::WordStartHalfAscii => {
686                 self.is_word_start_half_ascii(haystack, at)
687             }
688             Look::WordEndHalfAscii => {
689                 self.is_word_end_half_ascii(haystack, at)
690             }
691             Look::WordStartHalfUnicode => {
692                 self.is_word_start_half_unicode(haystack, at).unwrap()
693             }
694             Look::WordEndHalfUnicode => {
695                 self.is_word_end_half_unicode(haystack, at).unwrap()
696             }
697         }
698     }
699 
700     /// Returns true when _all_ of the assertions in the given set match at the
701     /// given position in the haystack.
702     ///
703     /// # Panics
704     ///
705     /// This panics when testing any Unicode word boundary assertion in this
706     /// set and when the Unicode word data is not available. Specifically, this
707     /// only occurs when the `unicode-word-boundary` feature is not enabled.
708     ///
709     /// Since it's generally expected that this routine is called inside of
710     /// a matching engine, callers should check the error condition when
711     /// building the matching engine. If there is a Unicode word boundary
712     /// in the matcher and the data isn't available, then the matcher should
713     /// fail to build.
714     ///
715     /// Callers can check the error condition with [`LookSet::available`].
716     ///
717     /// This also may panic when `at > haystack.len()`. Note that `at ==
718     /// haystack.len()` is legal and guaranteed not to panic.
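    ///
    /// # Example
    ///
    /// A brief sketch; every assertion in the set must hold at `at`:
    ///
    /// ```
    /// use regex_automata::util::look::{Look, LookMatcher, LookSet};
    ///
    /// let matcher = LookMatcher::new();
    /// let set = LookSet::singleton(Look::StartLF).insert(Look::WordAscii);
    /// // Both hold immediately after the `\n` and just before `a`.
    /// assert!(matcher.matches_set(set, b"x\nabc", 2));
    /// // `(?m:^)` does not hold in the middle of `abc`.
    /// assert!(!matcher.matches_set(set, b"x\nabc", 3));
    /// ```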
719     #[inline]
720     pub fn matches_set(
721         &self,
722         set: LookSet,
723         haystack: &[u8],
724         at: usize,
725     ) -> bool {
726         self.matches_set_inline(set, haystack, at)
727     }
728 
729     /// Like `matches_set`, but forcefully inlined for perf.
730     #[cfg_attr(feature = "perf-inline", inline(always))]
731     pub(crate) fn matches_set_inline(
732         &self,
733         set: LookSet,
734         haystack: &[u8],
735         at: usize,
736     ) -> bool {
737         // This used to use LookSet::iter with Look::matches on each element,
738         // but that proved to be quite disastrous for perf. The manual "if
739         // the set has this assertion, check it" turns out to be quite a bit
740         // faster.
741         if set.contains(Look::Start) {
742             if !self.is_start(haystack, at) {
743                 return false;
744             }
745         }
746         if set.contains(Look::End) {
747             if !self.is_end(haystack, at) {
748                 return false;
749             }
750         }
751         if set.contains(Look::StartLF) {
752             if !self.is_start_lf(haystack, at) {
753                 return false;
754             }
755         }
756         if set.contains(Look::EndLF) {
757             if !self.is_end_lf(haystack, at) {
758                 return false;
759             }
760         }
761         if set.contains(Look::StartCRLF) {
762             if !self.is_start_crlf(haystack, at) {
763                 return false;
764             }
765         }
766         if set.contains(Look::EndCRLF) {
767             if !self.is_end_crlf(haystack, at) {
768                 return false;
769             }
770         }
771         if set.contains(Look::WordAscii) {
772             if !self.is_word_ascii(haystack, at) {
773                 return false;
774             }
775         }
776         if set.contains(Look::WordAsciiNegate) {
777             if !self.is_word_ascii_negate(haystack, at) {
778                 return false;
779             }
780         }
781         if set.contains(Look::WordUnicode) {
782             if !self.is_word_unicode(haystack, at).unwrap() {
783                 return false;
784             }
785         }
786         if set.contains(Look::WordUnicodeNegate) {
787             if !self.is_word_unicode_negate(haystack, at).unwrap() {
788                 return false;
789             }
790         }
791         if set.contains(Look::WordStartAscii) {
792             if !self.is_word_start_ascii(haystack, at) {
793                 return false;
794             }
795         }
796         if set.contains(Look::WordEndAscii) {
797             if !self.is_word_end_ascii(haystack, at) {
798                 return false;
799             }
800         }
801         if set.contains(Look::WordStartUnicode) {
802             if !self.is_word_start_unicode(haystack, at).unwrap() {
803                 return false;
804             }
805         }
806         if set.contains(Look::WordEndUnicode) {
807             if !self.is_word_end_unicode(haystack, at).unwrap() {
808                 return false;
809             }
810         }
811         if set.contains(Look::WordStartHalfAscii) {
812             if !self.is_word_start_half_ascii(haystack, at) {
813                 return false;
814             }
815         }
816         if set.contains(Look::WordEndHalfAscii) {
817             if !self.is_word_end_half_ascii(haystack, at) {
818                 return false;
819             }
820         }
821         if set.contains(Look::WordStartHalfUnicode) {
822             if !self.is_word_start_half_unicode(haystack, at).unwrap() {
823                 return false;
824             }
825         }
826         if set.contains(Look::WordEndHalfUnicode) {
827             if !self.is_word_end_half_unicode(haystack, at).unwrap() {
828                 return false;
829             }
830         }
831         true
832     }
833 
834     /// Split up the given byte classes into equivalence classes in a way that
835     /// is consistent with this look-around assertion.
836     #[cfg(feature = "alloc")]
837     pub(crate) fn add_to_byteset(
838         &self,
839         look: Look,
840         set: &mut crate::util::alphabet::ByteClassSet,
841     ) {
842         match look {
843             Look::Start | Look::End => {}
844             Look::StartLF | Look::EndLF => {
845                 set.set_range(self.lineterm.0, self.lineterm.0);
846             }
847             Look::StartCRLF | Look::EndCRLF => {
848                 set.set_range(b'\r', b'\r');
849                 set.set_range(b'\n', b'\n');
850             }
851             Look::WordAscii
852             | Look::WordAsciiNegate
853             | Look::WordUnicode
854             | Look::WordUnicodeNegate
855             | Look::WordStartAscii
856             | Look::WordEndAscii
857             | Look::WordStartUnicode
858             | Look::WordEndUnicode
859             | Look::WordStartHalfAscii
860             | Look::WordEndHalfAscii
861             | Look::WordStartHalfUnicode
862             | Look::WordEndHalfUnicode => {
863                 // We need to mark all ranges of bytes whose pairs result in
864                 // evaluating \b differently. This isn't technically correct
865                 // for Unicode word boundaries, but DFAs can't handle those
866                 // anyway, and thus, the byte classes don't need to either
867                 // since they are themselves only used in DFAs.
868                 //
869                 // FIXME: It seems like the calls to 'set_range' here are
870                 // completely invariant, which means we could just hard-code
871                 // them here without needing to write a loop. And we only need
872                 // to do this dance at most once per regex.
873                 //
874                 // FIXME: Is this correct for \B?
875                 let iswb = utf8::is_word_byte;
876                 // This unwrap is OK because we guard every use of 'asu8' with
877                 // a check that the input is <= 255.
878                 let asu8 = |b: u16| u8::try_from(b).unwrap();
879                 let mut b1: u16 = 0;
880                 let mut b2: u16;
881                 while b1 <= 255 {
882                     b2 = b1 + 1;
883                     while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) {
884                         b2 += 1;
885                     }
886                     // The guards above guarantee that b2 can never get any
887                     // bigger.
888                     assert!(b2 <= 256);
889                     // Subtracting 1 from b2 is always OK because it is always
890                     // at least 1 greater than b1, and the assert above
891                     // guarantees that the asu8 conversion will succeed.
892                     set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap()));
893                     b1 = b2;
894                 }
895             }
896         }
897     }
898 
899     /// Returns true when [`Look::Start`] is satisfied `at` the given position
900     /// in `haystack`.
901     ///
902     /// # Panics
903     ///
904     /// This may panic when `at > haystack.len()`. Note that `at ==
905     /// haystack.len()` is legal and guaranteed not to panic.
906     #[inline]
907     pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool {
908         at == 0
909     }
910 
911     /// Returns true when [`Look::End`] is satisfied `at` the given position in
912     /// `haystack`.
913     ///
914     /// # Panics
915     ///
916     /// This may panic when `at > haystack.len()`. Note that `at ==
917     /// haystack.len()` is legal and guaranteed not to panic.
918     #[inline]
919     pub fn is_end(&self, haystack: &[u8], at: usize) -> bool {
920         at == haystack.len()
921     }
922 
923     /// Returns true when [`Look::StartLF`] is satisfied `at` the given
924     /// position in `haystack`.
925     ///
926     /// # Panics
927     ///
928     /// This may panic when `at > haystack.len()`. Note that `at ==
929     /// haystack.len()` is legal and guaranteed not to panic.
930     #[inline]
931     pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool {
932         self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0
933     }
934 
935     /// Returns true when [`Look::EndLF`] is satisfied `at` the given position
936     /// in `haystack`.
937     ///
938     /// # Panics
939     ///
940     /// This may panic when `at > haystack.len()`. Note that `at ==
941     /// haystack.len()` is legal and guaranteed not to panic.
942     #[inline]
943     pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool {
944         self.is_end(haystack, at) || haystack[at] == self.lineterm.0
945     }
946 
947     /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given
948     /// position in `haystack`.
949     ///
950     /// # Panics
951     ///
952     /// This may panic when `at > haystack.len()`. Note that `at ==
953     /// haystack.len()` is legal and guaranteed not to panic.
954     #[inline]
955     pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool {
956         self.is_start(haystack, at)
957             || haystack[at - 1] == b'\n'
958             || (haystack[at - 1] == b'\r'
959                 && (at >= haystack.len() || haystack[at] != b'\n'))
960     }
961 
962     /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given
963     /// position in `haystack`.
964     ///
965     /// # Panics
966     ///
967     /// This may panic when `at > haystack.len()`. Note that `at ==
968     /// haystack.len()` is legal and guaranteed not to panic.
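    ///
    /// # Example
    ///
    /// A brief sketch; `\r\n` is treated as a single line terminator, so the
    /// position between `\r` and `\n` does not match:
    ///
    /// ```
    /// use regex_automata::util::look::LookMatcher;
    ///
    /// let matcher = LookMatcher::new();
    /// assert!(matcher.is_end_crlf(b"ab\r\n", 2));
    /// assert!(!matcher.is_end_crlf(b"ab\r\n", 3));
    /// ```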
969     #[inline]
970     pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool {
971         self.is_end(haystack, at)
972             || haystack[at] == b'\r'
973             || (haystack[at] == b'\n'
974                 && (at == 0 || haystack[at - 1] != b'\r'))
975     }
976 
977     /// Returns true when [`Look::WordAscii`] is satisfied `at` the given
978     /// position in `haystack`.
979     ///
980     /// # Panics
981     ///
982     /// This may panic when `at > haystack.len()`. Note that `at ==
983     /// haystack.len()` is legal and guaranteed not to panic.
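    ///
    /// # Example
    ///
    /// A brief sketch:
    ///
    /// ```
    /// use regex_automata::util::look::LookMatcher;
    ///
    /// let matcher = LookMatcher::new();
    /// // Between `b` and the space: a word/non-word transition.
    /// assert!(matcher.is_word_ascii(b"ab cd", 2));
    /// // Between `a` and `b`: both are word bytes, so no boundary.
    /// assert!(!matcher.is_word_ascii(b"ab cd", 1));
    /// ```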
984     #[inline]
985     pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool {
986         let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
987         let word_after =
988             at < haystack.len() && utf8::is_word_byte(haystack[at]);
989         word_before != word_after
990     }
991 
992     /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given
993     /// position in `haystack`.
994     ///
995     /// # Panics
996     ///
997     /// This may panic when `at > haystack.len()`. Note that `at ==
998     /// haystack.len()` is legal and guaranteed not to panic.
999     #[inline]
1000     pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool {
1001         !self.is_word_ascii(haystack, at)
1002     }
1003 
1004     /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given
1005     /// position in `haystack`.
1006     ///
1007     /// # Panics
1008     ///
1009     /// This may panic when `at > haystack.len()`. Note that `at ==
1010     /// haystack.len()` is legal and guaranteed not to panic.
1011     ///
1012     /// # Errors
1013     ///
1014     /// This returns an error when Unicode word boundary tables
1015     /// are not available. Specifically, this only occurs when the
1016     /// `unicode-word-boundary` feature is not enabled.
1017     #[inline]
1018     pub fn is_word_unicode(
1019         &self,
1020         haystack: &[u8],
1021         at: usize,
1022     ) -> Result<bool, UnicodeWordBoundaryError> {
1023         let word_before = is_word_char::rev(haystack, at)?;
1024         let word_after = is_word_char::fwd(haystack, at)?;
1025         Ok(word_before != word_after)
1026     }
1027 
1028     /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the
1029     /// given position in `haystack`.
1030     ///
1031     /// # Panics
1032     ///
1033     /// This may panic when `at > haystack.len()`. Note that `at ==
1034     /// haystack.len()` is legal and guaranteed not to panic.
1035     ///
1036     /// # Errors
1037     ///
1038     /// This returns an error when Unicode word boundary tables
1039     /// are not available. Specifically, this only occurs when the
1040     /// `unicode-word-boundary` feature is not enabled.
1041     #[inline]
1042     pub fn is_word_unicode_negate(
1043         &self,
1044         haystack: &[u8],
1045         at: usize,
1046     ) -> Result<bool, UnicodeWordBoundaryError> {
1047         // This is pretty subtle. Why do we need to do UTF-8 decoding here?
1048         // Well... at time of writing, the is_word_char_{fwd,rev} routines will
1049         // only return true if there is a valid UTF-8 encoding of a "word"
1050         // codepoint, and false in every other case (including invalid UTF-8).
1051         // This means that in regions of invalid UTF-8 (which might be a
1052         // subset of valid UTF-8!), it would result in \B matching. While this
1053         // would be questionable in the context of truly invalid UTF-8, it is
1054         // *certainly* wrong to report match boundaries that split the encoding
1055         // of a codepoint. So to work around this, we ensure that we can decode
1056         // a codepoint on either side of `at`. If either direction fails, then
1057         // we don't permit \B to match at all.
1058         //
1059         // Now, this isn't exactly optimal from a perf perspective. We could
1060         // try and detect this in is_word_char::{fwd,rev}, but it's not clear
1061         // if it's worth it. \B is, after all, rarely used. Even worse,
1062         // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this
1063         // will wind up doing UTF-8 decoding twice. Owch. We could fix this
1064         // with more code complexity, but it just doesn't feel worth it for \B.
1065         //
1066         // And in particular, we do *not* have to do this with \b, because \b
1067         // *requires* that at least one side of `at` be a "word" codepoint,
1068         // which in turn implies one side of `at` must be valid UTF-8. This in
1069         // turn implies that \b can never split a valid UTF-8 encoding of a
1070         // codepoint. In the case where one side of `at` is truly invalid UTF-8
1071         // and the other side IS a word codepoint, then we want \b to match
1072         // since it represents a valid UTF-8 boundary. It also makes sense. For
1073         // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
1074         //
1075         // Note also that this is not just '!is_word_unicode(..)' like it is
1076         // for the ASCII case. For example, neither \b nor \B is satisfied
1077         // within invalid UTF-8 sequences.
1078         let word_before = at > 0
1079             && match utf8::decode_last(&haystack[..at]) {
1080                 None | Some(Err(_)) => return Ok(false),
1081                 Some(Ok(_)) => is_word_char::rev(haystack, at)?,
1082             };
1083         let word_after = at < haystack.len()
1084             && match utf8::decode(&haystack[at..]) {
1085                 None | Some(Err(_)) => return Ok(false),
1086                 Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
1087             };
1088         Ok(word_before == word_after)
1089     }
1090 
1091     /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
1092     /// position in `haystack`.
1093     ///
1094     /// # Panics
1095     ///
1096     /// This may panic when `at > haystack.len()`. Note that `at ==
1097     /// haystack.len()` is legal and guaranteed not to panic.
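    ///
    /// # Example
    ///
    /// A brief sketch:
    ///
    /// ```
    /// use regex_automata::util::look::LookMatcher;
    ///
    /// let matcher = LookMatcher::new();
    /// // The position just before `c` starts a word.
    /// assert!(matcher.is_word_start_ascii(b"ab cd", 3));
    /// // The position between `a` and `b` does not.
    /// assert!(!matcher.is_word_start_ascii(b"ab cd", 1));
    /// ```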
1098     #[inline]
1099     pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
1100         let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1101         let word_after =
1102             at < haystack.len() && utf8::is_word_byte(haystack[at]);
1103         !word_before && word_after
1104     }
1105 
1106     /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
1107     /// position in `haystack`.
1108     ///
1109     /// # Panics
1110     ///
1111     /// This may panic when `at > haystack.len()`. Note that `at ==
1112     /// haystack.len()` is legal and guaranteed not to panic.
1113     #[inline]
1114     pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
1115         let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1116         let word_after =
1117             at < haystack.len() && utf8::is_word_byte(haystack[at]);
1118         word_before && !word_after
1119     }
1120 
1121     /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
1122     /// given position in `haystack`.
1123     ///
1124     /// # Panics
1125     ///
1126     /// This may panic when `at > haystack.len()`. Note that `at ==
1127     /// haystack.len()` is legal and guaranteed not to panic.
1128     ///
1129     /// # Errors
1130     ///
1131     /// This returns an error when Unicode word boundary tables
1132     /// are not available. Specifically, this only occurs when the
1133     /// `unicode-word-boundary` feature is not enabled.
1134     #[inline]
1135     pub fn is_word_start_unicode(
1136         &self,
1137         haystack: &[u8],
1138         at: usize,
1139     ) -> Result<bool, UnicodeWordBoundaryError> {
1140         let word_before = is_word_char::rev(haystack, at)?;
1141         let word_after = is_word_char::fwd(haystack, at)?;
1142         Ok(!word_before && word_after)
1143     }
1144 
1145     /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
1146     /// given position in `haystack`.
1147     ///
1148     /// # Panics
1149     ///
1150     /// This may panic when `at > haystack.len()`. Note that `at ==
1151     /// haystack.len()` is legal and guaranteed not to panic.
1152     ///
1153     /// # Errors
1154     ///
1155     /// This returns an error when Unicode word boundary tables
1156     /// are not available. Specifically, this only occurs when the
1157     /// `unicode-word-boundary` feature is not enabled.
1158     #[inline]
1159     pub fn is_word_end_unicode(
1160         &self,
1161         haystack: &[u8],
1162         at: usize,
1163     ) -> Result<bool, UnicodeWordBoundaryError> {
1164         let word_before = is_word_char::rev(haystack, at)?;
1165         let word_after = is_word_char::fwd(haystack, at)?;
1166         Ok(word_before && !word_after)
1167     }
1168 
1169     /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
1170     /// given position in `haystack`.
1171     ///
1172     /// # Panics
1173     ///
1174     /// This may panic when `at > haystack.len()`. Note that `at ==
1175     /// haystack.len()` is legal and guaranteed not to panic.
1176     #[inline]
1177     pub fn is_word_start_half_ascii(
1178         &self,
1179         haystack: &[u8],
1180         at: usize,
1181     ) -> bool {
1182         let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1183         !word_before
1184     }
1185 
1186     /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
1187     /// given position in `haystack`.
1188     ///
1189     /// # Panics
1190     ///
1191     /// This may panic when `at > haystack.len()`. Note that `at ==
1192     /// haystack.len()` is legal and guaranteed not to panic.
1193     #[inline]
1194     pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
1195         let word_after =
1196             at < haystack.len() && utf8::is_word_byte(haystack[at]);
1197         !word_after
1198     }
1199 
1200     /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
1201     /// given position in `haystack`.
1202     ///
1203     /// # Panics
1204     ///
1205     /// This may panic when `at > haystack.len()`. Note that `at ==
1206     /// haystack.len()` is legal and guaranteed not to panic.
1207     ///
1208     /// # Errors
1209     ///
1210     /// This returns an error when Unicode word boundary tables
1211     /// are not available. Specifically, this only occurs when the
1212     /// `unicode-word-boundary` feature is not enabled.
1213     #[inline]
1214     pub fn is_word_start_half_unicode(
1215         &self,
1216         haystack: &[u8],
1217         at: usize,
1218     ) -> Result<bool, UnicodeWordBoundaryError> {
1219         // See `is_word_unicode_negate` for why we need to do this. We don't
1220         // need to do it for `is_word_start_unicode` because that guarantees
1221         // that the position matched falls on a valid UTF-8 boundary given
1222         // that the right side must be in \w.
1223         let word_before = at > 0
1224             && match utf8::decode_last(&haystack[..at]) {
1225                 None | Some(Err(_)) => return Ok(false),
1226                 Some(Ok(_)) => is_word_char::rev(haystack, at)?,
1227             };
1228         Ok(!word_before)
1229     }
1230 
1231     /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
1232     /// given position in `haystack`.
1233     ///
1234     /// # Panics
1235     ///
1236     /// This may panic when `at > haystack.len()`. Note that `at ==
1237     /// haystack.len()` is legal and guaranteed not to panic.
1238     ///
1239     /// # Errors
1240     ///
1241     /// This returns an error when Unicode word boundary tables
1242     /// are not available. Specifically, this only occurs when the
1243     /// `unicode-word-boundary` feature is not enabled.
1244     #[inline]
1245     pub fn is_word_end_half_unicode(
1246         &self,
1247         haystack: &[u8],
1248         at: usize,
1249     ) -> Result<bool, UnicodeWordBoundaryError> {
1250         // See `is_word_unicode_negate` for why we need to do this. We don't
1251         // need to do it for `is_word_end_unicode` because that guarantees
1252         // that the position matched falls on a valid UTF-8 boundary given
1253         // that the left side must be in \w.
1254         let word_after = at < haystack.len()
1255             && match utf8::decode(&haystack[at..]) {
1256                 None | Some(Err(_)) => return Ok(false),
1257                 Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
1258             };
1259         Ok(!word_after)
1260     }
1261 }
1262 
1263 impl Default for LookMatcher {
1264     fn default() -> LookMatcher {
1265         LookMatcher::new()
1266     }
1267 }
1268 
1269 /// An error that occurs when the Unicode-aware `\w` class is unavailable.
1270 ///
1271 /// This error can occur when the data tables necessary for the Unicode-aware
1272 /// Perl character class `\w` are unavailable. The `\w` class is used to
1273 /// determine whether a codepoint is considered a word character or not when
1274 /// determining whether a Unicode-aware `\b` (or `\B`) matches at a particular
1275 /// position.
1276 ///
1277 /// This error can only occur when the `unicode-word-boundary` feature is
1278 /// disabled.
1279 #[derive(Clone, Debug)]
1280 pub struct UnicodeWordBoundaryError(());
1281 
1282 impl UnicodeWordBoundaryError {
1283     #[cfg(not(feature = "unicode-word-boundary"))]
1284     pub(crate) fn new() -> UnicodeWordBoundaryError {
1285         UnicodeWordBoundaryError(())
1286     }
1287 
1288     /// Returns an error if and only if Unicode word boundary data is
1289     /// unavailable.
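    ///
    /// # Example
    ///
    /// An illustrative sketch of probing for availability up front; the
    /// fallback strategy shown in the comments is only a suggestion:
    ///
    /// ```
    /// use regex_automata::util::look::UnicodeWordBoundaryError;
    ///
    /// match UnicodeWordBoundaryError::check() {
    ///     // Unicode-aware \b and \B can be used.
    ///     Ok(()) => {}
    ///     // Fall back to ASCII word boundaries, e.g., (?-u:\b).
    ///     Err(_err) => {}
    /// }
    /// ```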
1290     pub fn check() -> Result<(), UnicodeWordBoundaryError> {
1291         is_word_char::check()
1292     }
1293 }
1294 
1295 #[cfg(feature = "std")]
1296 impl std::error::Error for UnicodeWordBoundaryError {}
1297 
1298 impl core::fmt::Display for UnicodeWordBoundaryError {
1299     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1300         write!(
1301             f,
1302             "Unicode-aware \\b and \\B are unavailable because the \
1303              requisite data tables are missing, please enable the \
1304              unicode-word-boundary feature"
1305         )
1306     }
1307 }
1308 
1309 // Below are FOUR different ways for checking whether a "word"
1310 // codepoint exists at a particular position in the haystack. The four
1311 // different approaches are, in order of preference:
1312 //
1313 // 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the
1314 // first call, and then use that DFA for all subsequent calls.
1315 // 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available.
1316 // 3. Do UTF-8 decoding and use our own 'perl_word' table.
1317 // 4. Return an error.
1318 //
1319 // The reason for all of these approaches is a combination of perf and
1320 // permitting one to build regex-automata without the Unicode data necessary
1321 // for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would
1322 // still work.)
1323 //
1324 // The DFA approach is the fastest, but it requires the regex parser, the
1325 // NFA compiler, the DFA builder and the DFA search runtime. That's a lot to
1326 // bring in, but if it's available, it's (probably) the best we can do.
1327 //
1328 // Approaches (2) and (3) are effectively equivalent, but (2) reuses the
1329 // data in regex-syntax and avoids duplicating it in regex-automata.
1330 //
1331 // Finally, (4) unconditionally returns an error since the requisite data isn't
1332 // available anywhere.
1333 //
1334 // There are actually more approaches possible that we didn't implement. For
1335 // example, if the DFA builder is available but the syntax parser is not, we
1336 // could technically hand construct our own NFA from the 'perl_word' data
1337 // table. But to avoid some pretty hairy code duplication, we would in turn
1338 // need to pull the UTF-8 compiler out of the NFA compiler. Yikes.
1339 //
1340 // A possibly more sensible alternative is to use a lazy DFA when the full
1341 // DFA builder isn't available...
1342 //
1343 // Yet another choice would be to build the full DFA and then embed it into the
1344 // source. Then we'd only need to bring in the DFA search runtime, which is
1345 // considerably smaller than the DFA builder code. The problem here is that the
1346 // Debian people have spooked me[1] into avoiding cyclic dependencies. Namely,
1347 // we'd need to build regex-cli, which depends on regex-automata in order to
1348 // build some part of regex-automata. But to be honest, something like this has
1349 // to be allowed somehow? I just don't know what the right process is.
1350 //
1351 // There are perhaps other choices as well. Why did I stop at these 4? Because
1352 // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
1353 // approach eventually, as the benefits of the DFA approach are somewhat
1354 // compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
1355 // the commands below no longer work. If necessary, we should recreate the
1356 // benchmark from whole cloth in rebar.)
1357 //
1358 //   $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
1359 //
1360 // Then I changed the code below so that the util/unicode_data/perl_word table
1361 // was used and re-ran the benchmark:
1362 //
1363 //   $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv
1364 //
1365 // And compared them:
1366 //
1367 //   $ regex-cli bench diff dfa.csv table.csv
1368 //   benchmark                             engine                 dfa        table
1369 //   ---------                             ------                 ---        -----
1370 //   internal/count/boundary-words-holmes  regex/automata/pikevm  18.6 MB/s  12.9 MB/s
1371 //
1372 // Which is a nice improvement.
1373 //
1374 // UPDATE: It turns out that it takes approximately 22ms to build the reverse
1375 // DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in
1376 // the grand scheme of things, but that is a significant latency cost. So I'm not
1377 // sure that's a good idea. I then tried using a lazy DFA instead, and that
1378 // eliminated the overhead, but since the lazy DFA requires mutable working
1379 // memory, that requires introducing a 'Cache' for every simultaneous call.
1380 //
1381 // I ended up deciding for now to just keep the "UTF-8 decode and check the
1382 // table" approach. The DFA and lazy DFA approaches remain below, commented out.
1383 //
1384 // [1]: https://github.com/BurntSushi/ucd-generate/issues/11
1385 
1386 /*
1387 /// A module that looks for word codepoints using lazy DFAs.
1388 #[cfg(all(
1389     feature = "unicode-word-boundary",
1390     feature = "syntax",
1391     feature = "unicode-perl",
1392     feature = "hybrid"
1393 ))]
1394 mod is_word_char {
1395     use alloc::vec::Vec;
1396 
1397     use crate::{
1398         hybrid::dfa::{Cache, DFA},
1399         nfa::thompson::NFA,
1400         util::{lazy::Lazy, pool::Pool, primitives::StateID},
1401         Anchored, Input,
1402     };
1403 
1404     pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1405         Ok(())
1406     }
1407 
1408     #[cfg_attr(feature = "perf-inline", inline(always))]
1409     pub(super) fn fwd(
1410         haystack: &[u8],
1411         mut at: usize,
1412     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1413         static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap());
1414         static CACHE: Lazy<Pool<Cache>> =
1415             Lazy::new(|| Pool::new(|| WORD.create_cache()));
1416         let dfa = Lazy::get(&WORD);
1417         let mut cache = Lazy::get(&CACHE).get();
1418         let mut sid = dfa
1419             .start_state_forward(
1420                 &mut cache,
1421                 &Input::new("").anchored(Anchored::Yes),
1422             )
1423             .unwrap();
1424         while at < haystack.len() {
1425             let byte = haystack[at];
1426             sid = dfa.next_state(&mut cache, sid, byte).unwrap();
1427             at += 1;
1428             if sid.is_tagged() {
1429                 if sid.is_match() {
1430                     return Ok(true);
1431                 } else if sid.is_dead() {
1432                     return Ok(false);
1433                 }
1434             }
1435         }
1436         Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
1437     }
1438 
1439     #[cfg_attr(feature = "perf-inline", inline(always))]
1440     pub(super) fn rev(
1441         haystack: &[u8],
1442         mut at: usize,
1443     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1444         static WORD: Lazy<DFA> = Lazy::new(|| {
1445             DFA::builder()
1446                 .thompson(NFA::config().reverse(true))
1447                 .build(r"\w")
1448                 .unwrap()
1449         });
1450         static CACHE: Lazy<Pool<Cache>> =
1451             Lazy::new(|| Pool::new(|| WORD.create_cache()));
1452         let dfa = Lazy::get(&WORD);
1453         let mut cache = Lazy::get(&CACHE).get();
1454         let mut sid = dfa
1455             .start_state_reverse(
1456                 &mut cache,
1457                 &Input::new("").anchored(Anchored::Yes),
1458             )
1459             .unwrap();
1460         while at > 0 {
1461             at -= 1;
1462             let byte = haystack[at];
1463             sid = dfa.next_state(&mut cache, sid, byte).unwrap();
1464             if sid.is_tagged() {
1465                 if sid.is_match() {
1466                     return Ok(true);
1467                 } else if sid.is_dead() {
1468                     return Ok(false);
1469                 }
1470             }
1471         }
1472         Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
1473     }
1474 }
1475 */
1476 
1477 /*
1478 /// A module that looks for word codepoints using fully compiled DFAs.
1479 #[cfg(all(
1480     feature = "unicode-word-boundary",
1481     feature = "syntax",
1482     feature = "unicode-perl",
1483     feature = "dfa-build"
1484 ))]
1485 mod is_word_char {
1486     use alloc::vec::Vec;
1487 
1488     use crate::{
1489         dfa::{dense::DFA, Automaton, StartKind},
1490         nfa::thompson::NFA,
1491         util::{lazy::Lazy, primitives::StateID},
1492         Anchored, Input,
1493     };
1494 
1495     pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1496         Ok(())
1497     }
1498 
1499     #[cfg_attr(feature = "perf-inline", inline(always))]
1500     pub(super) fn fwd(
1501         haystack: &[u8],
1502         mut at: usize,
1503     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1504         static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1505             let dfa = DFA::builder()
1506                 .configure(DFA::config().start_kind(StartKind::Anchored))
1507                 .build(r"\w")
1508                 .unwrap();
1509             // OK because our regex has no look-around.
1510             let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1511             (dfa, start_id)
1512         });
1513         let &(ref dfa, mut sid) = Lazy::get(&WORD);
1514         while at < haystack.len() {
1515             let byte = haystack[at];
1516             sid = dfa.next_state(sid, byte);
1517             at += 1;
1518             if dfa.is_special_state(sid) {
1519                 if dfa.is_match_state(sid) {
1520                     return Ok(true);
1521                 } else if dfa.is_dead_state(sid) {
1522                     return Ok(false);
1523                 }
1524             }
1525         }
1526         Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1527     }
1528 
1529     #[cfg_attr(feature = "perf-inline", inline(always))]
1530     pub(super) fn rev(
1531         haystack: &[u8],
1532         mut at: usize,
1533     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1534         static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1535             let dfa = DFA::builder()
1536                 .configure(DFA::config().start_kind(StartKind::Anchored))
1537                 // From ad hoc measurements, it looks like setting
1538                 // shrink==false is slightly faster than shrink==true. I kind
1539                 // of feel like this indicates that shrinking is probably a
1540                 // failure, although it can help in some cases. Sigh.
1541                 .thompson(NFA::config().reverse(true).shrink(false))
1542                 .build(r"\w")
1543                 .unwrap();
1544             // OK because our regex has no look-around.
1545             let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1546             (dfa, start_id)
1547         });
1548         let &(ref dfa, mut sid) = Lazy::get(&WORD);
1549         while at > 0 {
1550             at -= 1;
1551             let byte = haystack[at];
1552             sid = dfa.next_state(sid, byte);
1553             if dfa.is_special_state(sid) {
1554                 if dfa.is_match_state(sid) {
1555                     return Ok(true);
1556                 } else if dfa.is_dead_state(sid) {
1557                     return Ok(false);
1558                 }
1559             }
1560         }
1561         Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1562     }
1563 }
1564 */
1565 
1566 /// A module that looks for word codepoints using regex-syntax's data tables.
1567 #[cfg(all(
1568     feature = "unicode-word-boundary",
1569     feature = "syntax",
1570     feature = "unicode-perl",
1571 ))]
1572 mod is_word_char {
1573     use regex_syntax::try_is_word_character;
1574 
1575     use crate::util::utf8;
1576 
1577     pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1578         Ok(())
1579     }
1580 
1581     #[cfg_attr(feature = "perf-inline", inline(always))]
1582     pub(super) fn fwd(
1583         haystack: &[u8],
1584         at: usize,
1585     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1586         Ok(match utf8::decode(&haystack[at..]) {
1587             None | Some(Err(_)) => false,
1588             Some(Ok(ch)) => try_is_word_character(ch).expect(
1589                 "since unicode-word-boundary, syntax and unicode-perl \
1590                  are all enabled, it is expected that \
1591                  try_is_word_character succeeds",
1592             ),
1593         })
1594     }
1595 
1596     #[cfg_attr(feature = "perf-inline", inline(always))]
1597     pub(super) fn rev(
1598         haystack: &[u8],
1599         at: usize,
1600     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1601         Ok(match utf8::decode_last(&haystack[..at]) {
1602             None | Some(Err(_)) => false,
1603             Some(Ok(ch)) => try_is_word_character(ch).expect(
1604                 "since unicode-word-boundary, syntax and unicode-perl \
1605                  are all enabled, it is expected that \
1606                  try_is_word_character succeeds",
1607             ),
1608         })
1609     }
1610 }
1611 
1612 /// A module that looks for word codepoints using regex-automata's data tables
1613 /// (which are only compiled when regex-syntax's tables aren't available).
1614 ///
1615 /// Note that the cfg should match the one in src/util/unicode_data/mod.rs for
1616 /// perl_word.
1617 #[cfg(all(
1618     feature = "unicode-word-boundary",
1619     not(all(feature = "syntax", feature = "unicode-perl")),
1620 ))]
1621 mod is_word_char {
1622     use crate::util::utf8;
1623 
1624     pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1625         Ok(())
1626     }
1627 
1628     #[cfg_attr(feature = "perf-inline", inline(always))]
1629     pub(super) fn fwd(
1630         haystack: &[u8],
1631         at: usize,
1632     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1633         Ok(match utf8::decode(&haystack[at..]) {
1634             None | Some(Err(_)) => false,
1635             Some(Ok(ch)) => is_word_character(ch),
1636         })
1637     }
1638 
1639     #[cfg_attr(feature = "perf-inline", inline(always))]
1640     pub(super) fn rev(
1641         haystack: &[u8],
1642         at: usize,
1643     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1644         Ok(match utf8::decode_last(&haystack[..at]) {
1645             None | Some(Err(_)) => false,
1646             Some(Ok(ch)) => is_word_character(ch),
1647         })
1648     }
1649 
1650     #[cfg_attr(feature = "perf-inline", inline(always))]
1651     fn is_word_character(c: char) -> bool {
1652         use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
1653 
1654         if u8::try_from(c).map_or(false, utf8::is_word_byte) {
1655             return true;
1656         }
1657         PERL_WORD
1658             .binary_search_by(|&(start, end)| {
1659                 use core::cmp::Ordering;
1660 
1661                 if start <= c && c <= end {
1662                     Ordering::Equal
1663                 } else if start > c {
1664                     Ordering::Greater
1665                 } else {
1666                     Ordering::Less
1667                 }
1668             })
1669             .is_ok()
1670     }
1671 }
1672 
1673 /// A module that always returns an error if Unicode word boundaries are
1674 /// disabled. When this feature is disabled, regex-automata will not include
1675 /// its own data tables even if regex-syntax is disabled.
1676 #[cfg(not(feature = "unicode-word-boundary"))]
1677 mod is_word_char {
1678     pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1679         Err(super::UnicodeWordBoundaryError::new())
1680     }
1681 
1682     #[cfg_attr(feature = "perf-inline", inline(always))]
1683     pub(super) fn fwd(
1684         _bytes: &[u8],
1685         _at: usize,
1686     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1687         Err(super::UnicodeWordBoundaryError::new())
1688     }
1689 
1690     #[cfg_attr(feature = "perf-inline", inline(always))]
1691     pub(super) fn rev(
1692         _bytes: &[u8],
1693         _at: usize,
1694     ) -> Result<bool, super::UnicodeWordBoundaryError> {
1695         Err(super::UnicodeWordBoundaryError::new())
1696     }
1697 }
1698 
1699 #[cfg(test)]
1700 mod tests {
1701     use super::*;
1702 
1703     macro_rules! testlook {
1704         ($look:expr, $haystack:expr, $at:expr) => {
1705             LookMatcher::default().matches($look, $haystack.as_bytes(), $at)
1706         };
1707     }
1708 
1709     #[test]
1710     fn look_matches_start_line() {
1711         let look = Look::StartLF;
1712 
1713         assert!(testlook!(look, "", 0));
1714         assert!(testlook!(look, "\n", 0));
1715         assert!(testlook!(look, "\n", 1));
1716         assert!(testlook!(look, "a", 0));
1717         assert!(testlook!(look, "\na", 1));
1718 
1719         assert!(!testlook!(look, "a", 1));
1720         assert!(!testlook!(look, "a\na", 1));
1721     }
1722 
1723     #[test]
1724     fn look_matches_end_line() {
1725         let look = Look::EndLF;
1726 
1727         assert!(testlook!(look, "", 0));
1728         assert!(testlook!(look, "\n", 1));
1729         assert!(testlook!(look, "\na", 0));
1730         assert!(testlook!(look, "\na", 2));
1731         assert!(testlook!(look, "a\na", 1));
1732 
1733         assert!(!testlook!(look, "a", 0));
1734         assert!(!testlook!(look, "\na", 1));
1735         assert!(!testlook!(look, "a\na", 0));
1736         assert!(!testlook!(look, "a\na", 2));
1737     }
1738 
1739     #[test]
1740     fn look_matches_start_text() {
1741         let look = Look::Start;
1742 
1743         assert!(testlook!(look, "", 0));
1744         assert!(testlook!(look, "\n", 0));
1745         assert!(testlook!(look, "a", 0));
1746 
1747         assert!(!testlook!(look, "\n", 1));
1748         assert!(!testlook!(look, "\na", 1));
1749         assert!(!testlook!(look, "a", 1));
1750         assert!(!testlook!(look, "a\na", 1));
1751     }
1752 
1753     #[test]
1754     fn look_matches_end_text() {
1755         let look = Look::End;
1756 
1757         assert!(testlook!(look, "", 0));
1758         assert!(testlook!(look, "\n", 1));
1759         assert!(testlook!(look, "\na", 2));
1760 
1761         assert!(!testlook!(look, "\na", 0));
1762         assert!(!testlook!(look, "a\na", 1));
1763         assert!(!testlook!(look, "a", 0));
1764         assert!(!testlook!(look, "\na", 1));
1765         assert!(!testlook!(look, "a\na", 0));
1766         assert!(!testlook!(look, "a\na", 2));
1767     }
1768 
1769     #[test]
1770     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1771     fn look_matches_word_unicode() {
1772         let look = Look::WordUnicode;
1773 
1774         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1775         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1776 
1777         // Simple ASCII word boundaries.
1778         assert!(testlook!(look, "a", 0));
1779         assert!(testlook!(look, "a", 1));
1780         assert!(testlook!(look, "a ", 1));
1781         assert!(testlook!(look, " a ", 1));
1782         assert!(testlook!(look, " a ", 2));
1783 
1784         // Unicode word boundaries with a non-ASCII codepoint.
1785         assert!(testlook!(look, "𝛃", 0));
1786         assert!(testlook!(look, "𝛃", 4));
1787         assert!(testlook!(look, "𝛃 ", 4));
1788         assert!(testlook!(look, " 𝛃 ", 1));
1789         assert!(testlook!(look, " 𝛃 ", 5));
1790 
1791         // Unicode word boundaries between non-ASCII codepoints.
1792         assert!(testlook!(look, "𝛃𐆀", 0));
1793         assert!(testlook!(look, "𝛃𐆀", 4));
1794 
1795         // Non word boundaries for ASCII.
1796         assert!(!testlook!(look, "", 0));
1797         assert!(!testlook!(look, "ab", 1));
1798         assert!(!testlook!(look, "a ", 2));
1799         assert!(!testlook!(look, " a ", 0));
1800         assert!(!testlook!(look, " a ", 3));
1801 
1802         // Non word boundaries with a non-ASCII codepoint.
1803         assert!(!testlook!(look, "𝛃b", 4));
1804         assert!(!testlook!(look, "𝛃 ", 5));
1805         assert!(!testlook!(look, " 𝛃 ", 0));
1806         assert!(!testlook!(look, " 𝛃 ", 6));
1807         assert!(!testlook!(look, "𝛃", 1));
1808         assert!(!testlook!(look, "𝛃", 2));
1809         assert!(!testlook!(look, "𝛃", 3));
1810 
1811         // Non word boundaries with non-ASCII codepoints.
1812         assert!(!testlook!(look, "𝛃𐆀", 1));
1813         assert!(!testlook!(look, "𝛃𐆀", 2));
1814         assert!(!testlook!(look, "𝛃𐆀", 3));
1815         assert!(!testlook!(look, "𝛃𐆀", 5));
1816         assert!(!testlook!(look, "𝛃𐆀", 6));
1817         assert!(!testlook!(look, "𝛃𐆀", 7));
1818         assert!(!testlook!(look, "𝛃𐆀", 8));
1819     }
1820 
1821     #[test]
1822     fn look_matches_word_ascii() {
1823         let look = Look::WordAscii;
1824 
1825         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1826         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1827 
1828         // Simple ASCII word boundaries.
1829         assert!(testlook!(look, "a", 0));
1830         assert!(testlook!(look, "a", 1));
1831         assert!(testlook!(look, "a ", 1));
1832         assert!(testlook!(look, " a ", 1));
1833         assert!(testlook!(look, " a ", 2));
1834 
1835         // Unicode word boundaries with a non-ASCII codepoint. Since this is
1836         // an ASCII word boundary, none of these match.
1837         assert!(!testlook!(look, "𝛃", 0));
1838         assert!(!testlook!(look, "𝛃", 4));
1839         assert!(!testlook!(look, "𝛃 ", 4));
1840         assert!(!testlook!(look, " 𝛃 ", 1));
1841         assert!(!testlook!(look, " 𝛃 ", 5));
1842 
1843         // Unicode word boundaries between non-ASCII codepoints. Again, since
1844         // this is an ASCII word boundary, none of these match.
1845         assert!(!testlook!(look, "𝛃𐆀", 0));
1846         assert!(!testlook!(look, "𝛃𐆀", 4));
1847 
1848         // Non word boundaries for ASCII.
1849         assert!(!testlook!(look, "", 0));
1850         assert!(!testlook!(look, "ab", 1));
1851         assert!(!testlook!(look, "a ", 2));
1852         assert!(!testlook!(look, " a ", 0));
1853         assert!(!testlook!(look, " a ", 3));
1854 
1855         // Non word boundaries with a non-ASCII codepoint.
1856         assert!(testlook!(look, "𝛃b", 4));
1857         assert!(!testlook!(look, "𝛃 ", 5));
1858         assert!(!testlook!(look, " 𝛃 ", 0));
1859         assert!(!testlook!(look, " 𝛃 ", 6));
1860         assert!(!testlook!(look, "𝛃", 1));
1861         assert!(!testlook!(look, "𝛃", 2));
1862         assert!(!testlook!(look, "𝛃", 3));
1863 
1864         // Non word boundaries with non-ASCII codepoints.
1865         assert!(!testlook!(look, "𝛃𐆀", 1));
1866         assert!(!testlook!(look, "𝛃𐆀", 2));
1867         assert!(!testlook!(look, "𝛃𐆀", 3));
1868         assert!(!testlook!(look, "𝛃𐆀", 5));
1869         assert!(!testlook!(look, "𝛃𐆀", 6));
1870         assert!(!testlook!(look, "𝛃𐆀", 7));
1871         assert!(!testlook!(look, "𝛃𐆀", 8));
1872     }
1873 
1874     #[test]
1875     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1876     fn look_matches_word_unicode_negate() {
1877         let look = Look::WordUnicodeNegate;
1878 
1879         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1880         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1881 
1882         // Simple ASCII word boundaries.
1883         assert!(!testlook!(look, "a", 0));
1884         assert!(!testlook!(look, "a", 1));
1885         assert!(!testlook!(look, "a ", 1));
1886         assert!(!testlook!(look, " a ", 1));
1887         assert!(!testlook!(look, " a ", 2));
1888 
1889         // Unicode word boundaries with a non-ASCII codepoint.
1890         assert!(!testlook!(look, "𝛃", 0));
1891         assert!(!testlook!(look, "𝛃", 4));
1892         assert!(!testlook!(look, "𝛃 ", 4));
1893         assert!(!testlook!(look, " 𝛃 ", 1));
1894         assert!(!testlook!(look, " 𝛃 ", 5));
1895 
1896         // Unicode word boundaries between non-ASCII codepoints.
1897         assert!(!testlook!(look, "𝛃𐆀", 0));
1898         assert!(!testlook!(look, "𝛃𐆀", 4));
1899 
1900         // Non word boundaries for ASCII.
1901         assert!(testlook!(look, "", 0));
1902         assert!(testlook!(look, "ab", 1));
1903         assert!(testlook!(look, "a ", 2));
1904         assert!(testlook!(look, " a ", 0));
1905         assert!(testlook!(look, " a ", 3));
1906 
1907         // Non word boundaries with a non-ASCII codepoint.
1908         assert!(testlook!(look, "𝛃b", 4));
1909         assert!(testlook!(look, "𝛃 ", 5));
1910         assert!(testlook!(look, " 𝛃 ", 0));
1911         assert!(testlook!(look, " 𝛃 ", 6));
1912         // These don't match because they could otherwise return an offset that
1913         // splits the UTF-8 encoding of a codepoint.
1914         assert!(!testlook!(look, "𝛃", 1));
1915         assert!(!testlook!(look, "𝛃", 2));
1916         assert!(!testlook!(look, "𝛃", 3));
1917 
1918         // Non word boundaries with non-ASCII codepoints. These also don't
1919         // match because they could otherwise return an offset that splits the
1920         // UTF-8 encoding of a codepoint.
1921         assert!(!testlook!(look, "𝛃𐆀", 1));
1922         assert!(!testlook!(look, "𝛃𐆀", 2));
1923         assert!(!testlook!(look, "𝛃𐆀", 3));
1924         assert!(!testlook!(look, "𝛃𐆀", 5));
1925         assert!(!testlook!(look, "𝛃𐆀", 6));
1926         assert!(!testlook!(look, "𝛃𐆀", 7));
1927         // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
1928         // of the haystack. So the "end" of the haystack isn't a word and 𐆀
1929         // isn't a word, thus, \B matches.
1930         assert!(testlook!(look, "𝛃𐆀", 8));
1931     }
1932 
1933     #[test]
1934     fn look_matches_word_ascii_negate() {
1935         let look = Look::WordAsciiNegate;
1936 
1937         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1938         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1939 
1940         // Simple ASCII word boundaries.
1941         assert!(!testlook!(look, "a", 0));
1942         assert!(!testlook!(look, "a", 1));
1943         assert!(!testlook!(look, "a ", 1));
1944         assert!(!testlook!(look, " a ", 1));
1945         assert!(!testlook!(look, " a ", 2));
1946 
1947         // Unicode word boundaries with a non-ASCII codepoint. Since this is
1948         // an ASCII word boundary, none of these match.
1949         assert!(testlook!(look, "𝛃", 0));
1950         assert!(testlook!(look, "𝛃", 4));
1951         assert!(testlook!(look, "𝛃 ", 4));
1952         assert!(testlook!(look, " 𝛃 ", 1));
1953         assert!(testlook!(look, " 𝛃 ", 5));
1954 
1955         // Unicode word boundaries between non-ASCII codepoints. Again, since
1956         // this is an ASCII word boundary, none of these match.
1957         assert!(testlook!(look, "𝛃𐆀", 0));
1958         assert!(testlook!(look, "𝛃𐆀", 4));
1959 
1960         // Non word boundaries for ASCII.
1961         assert!(testlook!(look, "", 0));
1962         assert!(testlook!(look, "ab", 1));
1963         assert!(testlook!(look, "a ", 2));
1964         assert!(testlook!(look, " a ", 0));
1965         assert!(testlook!(look, " a ", 3));
1966 
1967         // Non word boundaries with a non-ASCII codepoint.
1968         assert!(!testlook!(look, "𝛃b", 4));
1969         assert!(testlook!(look, "𝛃 ", 5));
1970         assert!(testlook!(look, " 𝛃 ", 0));
1971         assert!(testlook!(look, " 𝛃 ", 6));
1972         assert!(testlook!(look, "𝛃", 1));
1973         assert!(testlook!(look, "𝛃", 2));
1974         assert!(testlook!(look, "𝛃", 3));
1975 
1976         // Non word boundaries with non-ASCII codepoints.
1977         assert!(testlook!(look, "𝛃𐆀", 1));
1978         assert!(testlook!(look, "𝛃𐆀", 2));
1979         assert!(testlook!(look, "𝛃𐆀", 3));
1980         assert!(testlook!(look, "𝛃𐆀", 5));
1981         assert!(testlook!(look, "𝛃𐆀", 6));
1982         assert!(testlook!(look, "𝛃𐆀", 7));
1983         assert!(testlook!(look, "𝛃𐆀", 8));
1984     }
1985 
1986     #[test]
1987     fn look_matches_word_start_ascii() {
1988         let look = Look::WordStartAscii;
1989 
1990         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1991         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1992 
1993         // Simple ASCII word boundaries.
1994         assert!(testlook!(look, "a", 0));
1995         assert!(!testlook!(look, "a", 1));
1996         assert!(!testlook!(look, "a ", 1));
1997         assert!(testlook!(look, " a ", 1));
1998         assert!(!testlook!(look, " a ", 2));
1999 
2000         // Unicode word boundaries with a non-ASCII codepoint. Since this is
2001         // an ASCII word boundary, none of these match.
2002         assert!(!testlook!(look, "𝛃", 0));
2003         assert!(!testlook!(look, "𝛃", 4));
2004         assert!(!testlook!(look, "𝛃 ", 4));
2005         assert!(!testlook!(look, " 𝛃 ", 1));
2006         assert!(!testlook!(look, " 𝛃 ", 5));
2007 
2008         // Unicode word boundaries between non-ASCII codepoints. Again, since
2009         // this is an ASCII word boundary, none of these match.
2010         assert!(!testlook!(look, "𝛃𐆀", 0));
2011         assert!(!testlook!(look, "𝛃𐆀", 4));
2012 
2013         // Non word boundaries for ASCII.
2014         assert!(!testlook!(look, "", 0));
2015         assert!(!testlook!(look, "ab", 1));
2016         assert!(!testlook!(look, "a ", 2));
2017         assert!(!testlook!(look, " a ", 0));
2018         assert!(!testlook!(look, " a ", 3));
2019 
2020         // Non word boundaries with a non-ASCII codepoint.
2021         assert!(testlook!(look, "𝛃b", 4));
2022         assert!(!testlook!(look, "b𝛃", 1));
2023         assert!(!testlook!(look, "𝛃 ", 5));
2024         assert!(!testlook!(look, " 𝛃 ", 0));
2025         assert!(!testlook!(look, " 𝛃 ", 6));
2026         assert!(!testlook!(look, "𝛃", 1));
2027         assert!(!testlook!(look, "𝛃", 2));
2028         assert!(!testlook!(look, "𝛃", 3));
2029 
2030         // Non word boundaries with non-ASCII codepoints.
2031         assert!(!testlook!(look, "𝛃𐆀", 1));
2032         assert!(!testlook!(look, "𝛃𐆀", 2));
2033         assert!(!testlook!(look, "𝛃𐆀", 3));
2034         assert!(!testlook!(look, "𝛃𐆀", 5));
2035         assert!(!testlook!(look, "𝛃𐆀", 6));
2036         assert!(!testlook!(look, "𝛃𐆀", 7));
2037         assert!(!testlook!(look, "𝛃𐆀", 8));
2038     }
2039 
2040     #[test]
2041     fn look_matches_word_end_ascii() {
2042         let look = Look::WordEndAscii;
2043 
2044         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2045         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2046 
2047         // Simple ASCII word boundaries.
2048         assert!(!testlook!(look, "a", 0));
2049         assert!(testlook!(look, "a", 1));
2050         assert!(testlook!(look, "a ", 1));
2051         assert!(!testlook!(look, " a ", 1));
2052         assert!(testlook!(look, " a ", 2));
2053 
2054         // Unicode word boundaries with a non-ASCII codepoint. Since this is
2055         // an ASCII word boundary, none of these match.
2056         assert!(!testlook!(look, "𝛃", 0));
2057         assert!(!testlook!(look, "𝛃", 4));
2058         assert!(!testlook!(look, "𝛃 ", 4));
2059         assert!(!testlook!(look, " 𝛃 ", 1));
2060         assert!(!testlook!(look, " 𝛃 ", 5));
2061 
2062         // Unicode word boundaries between non-ASCII codepoints. Again, since
2063         // this is an ASCII word boundary, none of these match.
2064         assert!(!testlook!(look, "𝛃𐆀", 0));
2065         assert!(!testlook!(look, "𝛃𐆀", 4));
2066 
2067         // Non word boundaries for ASCII.
2068         assert!(!testlook!(look, "", 0));
2069         assert!(!testlook!(look, "ab", 1));
2070         assert!(!testlook!(look, "a ", 2));
2071         assert!(!testlook!(look, " a ", 0));
2072         assert!(!testlook!(look, " a ", 3));
2073 
2074         // Non word boundaries with a non-ASCII codepoint.
2075         assert!(!testlook!(look, "𝛃b", 4));
2076         assert!(testlook!(look, "b𝛃", 1));
2077         assert!(!testlook!(look, "𝛃 ", 5));
2078         assert!(!testlook!(look, " 𝛃 ", 0));
2079         assert!(!testlook!(look, " 𝛃 ", 6));
2080         assert!(!testlook!(look, "𝛃", 1));
2081         assert!(!testlook!(look, "𝛃", 2));
2082         assert!(!testlook!(look, "𝛃", 3));
2083 
2084         // Non word boundaries with non-ASCII codepoints.
2085         assert!(!testlook!(look, "𝛃𐆀", 1));
2086         assert!(!testlook!(look, "𝛃𐆀", 2));
2087         assert!(!testlook!(look, "𝛃𐆀", 3));
2088         assert!(!testlook!(look, "𝛃𐆀", 5));
2089         assert!(!testlook!(look, "𝛃𐆀", 6));
2090         assert!(!testlook!(look, "𝛃𐆀", 7));
2091         assert!(!testlook!(look, "𝛃𐆀", 8));
2092     }
2093 
2094     #[test]
2095     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2096     fn look_matches_word_start_unicode() {
2097         let look = Look::WordStartUnicode;
2098 
2099         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2100         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2101 
2102         // Simple ASCII word boundaries.
2103         assert!(testlook!(look, "a", 0));
2104         assert!(!testlook!(look, "a", 1));
2105         assert!(!testlook!(look, "a ", 1));
2106         assert!(testlook!(look, " a ", 1));
2107         assert!(!testlook!(look, " a ", 2));
2108 
2109         // Unicode word boundaries with a non-ASCII codepoint.
2110         assert!(testlook!(look, "𝛃", 0));
2111         assert!(!testlook!(look, "𝛃", 4));
2112         assert!(!testlook!(look, "𝛃 ", 4));
2113         assert!(testlook!(look, " 𝛃 ", 1));
2114         assert!(!testlook!(look, " 𝛃 ", 5));
2115 
2116         // Unicode word boundaries between non-ASCII codepoints.
2117         assert!(testlook!(look, "𝛃𐆀", 0));
2118         assert!(!testlook!(look, "𝛃𐆀", 4));
2119 
2120         // Non word boundaries for ASCII.
2121         assert!(!testlook!(look, "", 0));
2122         assert!(!testlook!(look, "ab", 1));
2123         assert!(!testlook!(look, "a ", 2));
2124         assert!(!testlook!(look, " a ", 0));
2125         assert!(!testlook!(look, " a ", 3));
2126 
2127         // Non word boundaries with a non-ASCII codepoint.
2128         assert!(!testlook!(look, "𝛃b", 4));
2129         assert!(!testlook!(look, "b𝛃", 1));
2130         assert!(!testlook!(look, "𝛃 ", 5));
2131         assert!(!testlook!(look, " 𝛃 ", 0));
2132         assert!(!testlook!(look, " 𝛃 ", 6));
2133         assert!(!testlook!(look, "𝛃", 1));
2134         assert!(!testlook!(look, "𝛃", 2));
2135         assert!(!testlook!(look, "𝛃", 3));
2136 
2137         // Non word boundaries with non-ASCII codepoints.
2138         assert!(!testlook!(look, "𝛃𐆀", 1));
2139         assert!(!testlook!(look, "𝛃𐆀", 2));
2140         assert!(!testlook!(look, "𝛃𐆀", 3));
2141         assert!(!testlook!(look, "𝛃𐆀", 5));
2142         assert!(!testlook!(look, "𝛃𐆀", 6));
2143         assert!(!testlook!(look, "𝛃𐆀", 7));
2144         assert!(!testlook!(look, "𝛃𐆀", 8));
2145     }
2146 
2147     #[test]
2148     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2149     fn look_matches_word_end_unicode() {
2150         let look = Look::WordEndUnicode;
2151 
2152         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2153         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2154 
2155         // Simple ASCII word boundaries.
2156         assert!(!testlook!(look, "a", 0));
2157         assert!(testlook!(look, "a", 1));
2158         assert!(testlook!(look, "a ", 1));
2159         assert!(!testlook!(look, " a ", 1));
2160         assert!(testlook!(look, " a ", 2));
2161 
2162         // Unicode word boundaries with a non-ASCII codepoint.
2163         assert!(!testlook!(look, "𝛃", 0));
2164         assert!(testlook!(look, "𝛃", 4));
2165         assert!(testlook!(look, "𝛃 ", 4));
2166         assert!(!testlook!(look, " 𝛃 ", 1));
2167         assert!(testlook!(look, " 𝛃 ", 5));
2168 
2169         // Unicode word boundaries between non-ASCII codepoints.
2170         assert!(!testlook!(look, "𝛃𐆀", 0));
2171         assert!(testlook!(look, "𝛃𐆀", 4));
2172 
2173         // Non word boundaries for ASCII.
2174         assert!(!testlook!(look, "", 0));
2175         assert!(!testlook!(look, "ab", 1));
2176         assert!(!testlook!(look, "a ", 2));
2177         assert!(!testlook!(look, " a ", 0));
2178         assert!(!testlook!(look, " a ", 3));
2179 
2180         // Non word boundaries with a non-ASCII codepoint.
2181         assert!(!testlook!(look, "𝛃b", 4));
2182         assert!(!testlook!(look, "b𝛃", 1));
2183         assert!(!testlook!(look, "𝛃 ", 5));
2184         assert!(!testlook!(look, " 𝛃 ", 0));
2185         assert!(!testlook!(look, " 𝛃 ", 6));
2186         assert!(!testlook!(look, "𝛃", 1));
2187         assert!(!testlook!(look, "𝛃", 2));
2188         assert!(!testlook!(look, "𝛃", 3));
2189 
2190         // Non word boundaries with non-ASCII codepoints.
2191         assert!(!testlook!(look, "𝛃𐆀", 1));
2192         assert!(!testlook!(look, "𝛃𐆀", 2));
2193         assert!(!testlook!(look, "𝛃𐆀", 3));
2194         assert!(!testlook!(look, "𝛃𐆀", 5));
2195         assert!(!testlook!(look, "𝛃𐆀", 6));
2196         assert!(!testlook!(look, "𝛃𐆀", 7));
2197         assert!(!testlook!(look, "𝛃𐆀", 8));
2198     }
2199 
2200     #[test]
2201     fn look_matches_word_start_half_ascii() {
2202         let look = Look::WordStartHalfAscii;
2203 
2204         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2205         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2206 
2207         // Simple ASCII word boundaries.
2208         assert!(testlook!(look, "a", 0));
2209         assert!(!testlook!(look, "a", 1));
2210         assert!(!testlook!(look, "a ", 1));
2211         assert!(testlook!(look, " a ", 1));
2212         assert!(!testlook!(look, " a ", 2));
2213 
2214         // Unicode word boundaries with a non-ASCII codepoint. These all
2215         // match since a half boundary only looks at the preceding byte.
2216         assert!(testlook!(look, "𝛃", 0));
2217         assert!(testlook!(look, "𝛃", 4));
2218         assert!(testlook!(look, "𝛃 ", 4));
2219         assert!(testlook!(look, " 𝛃 ", 1));
2220         assert!(testlook!(look, " 𝛃 ", 5));
2221 
2222         // Unicode word boundaries between non-ASCII codepoints. Again,
2223         // these match since no ASCII word byte precedes either position.
2224         assert!(testlook!(look, "𝛃𐆀", 0));
2225         assert!(testlook!(look, "𝛃𐆀", 4));
2226 
2227         // Non word boundaries for ASCII.
2228         assert!(testlook!(look, "", 0));
2229         assert!(!testlook!(look, "ab", 1));
2230         assert!(testlook!(look, "a ", 2));
2231         assert!(testlook!(look, " a ", 0));
2232         assert!(testlook!(look, " a ", 3));
2233 
2234         // Non word boundaries with a non-ASCII codepoint.
2235         assert!(testlook!(look, "𝛃b", 4));
2236         assert!(!testlook!(look, "b𝛃", 1));
2237         assert!(testlook!(look, "𝛃 ", 5));
2238         assert!(testlook!(look, " 𝛃 ", 0));
2239         assert!(testlook!(look, " 𝛃 ", 6));
2240         assert!(testlook!(look, "𝛃", 1));
2241         assert!(testlook!(look, "𝛃", 2));
2242         assert!(testlook!(look, "𝛃", 3));
2243 
2244         // Non word boundaries with non-ASCII codepoints.
2245         assert!(testlook!(look, "𝛃𐆀", 1));
2246         assert!(testlook!(look, "𝛃𐆀", 2));
2247         assert!(testlook!(look, "𝛃𐆀", 3));
2248         assert!(testlook!(look, "𝛃𐆀", 5));
2249         assert!(testlook!(look, "𝛃𐆀", 6));
2250         assert!(testlook!(look, "𝛃𐆀", 7));
2251         assert!(testlook!(look, "𝛃𐆀", 8));
2252     }
2253 
2254     #[test]
2255     fn look_matches_word_end_half_ascii() {
2256         let look = Look::WordEndHalfAscii;
2257 
2258         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2259         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2260 
2261         // Simple ASCII word boundaries.
2262         assert!(!testlook!(look, "a", 0));
2263         assert!(testlook!(look, "a", 1));
2264         assert!(testlook!(look, "a ", 1));
2265         assert!(!testlook!(look, " a ", 1));
2266         assert!(testlook!(look, " a ", 2));
2267 
2268         // Unicode word boundaries with a non-ASCII codepoint. These all
2269         // match since a half boundary only looks at the following byte.
2270         assert!(testlook!(look, "𝛃", 0));
2271         assert!(testlook!(look, "𝛃", 4));
2272         assert!(testlook!(look, "𝛃 ", 4));
2273         assert!(testlook!(look, " 𝛃 ", 1));
2274         assert!(testlook!(look, " 𝛃 ", 5));
2275 
2276         // Unicode word boundaries between non-ASCII codepoints. Again,
2277         // these match since no ASCII word byte follows either position.
2278         assert!(testlook!(look, "𝛃𐆀", 0));
2279         assert!(testlook!(look, "𝛃𐆀", 4));
2280 
2281         // Non word boundaries for ASCII.
2282         assert!(testlook!(look, "", 0));
2283         assert!(!testlook!(look, "ab", 1));
2284         assert!(testlook!(look, "a ", 2));
2285         assert!(testlook!(look, " a ", 0));
2286         assert!(testlook!(look, " a ", 3));
2287 
2288         // Non word boundaries with a non-ASCII codepoint.
2289         assert!(!testlook!(look, "𝛃b", 4));
2290         assert!(testlook!(look, "b𝛃", 1));
2291         assert!(testlook!(look, "𝛃 ", 5));
2292         assert!(testlook!(look, " 𝛃 ", 0));
2293         assert!(testlook!(look, " 𝛃 ", 6));
2294         assert!(testlook!(look, "𝛃", 1));
2295         assert!(testlook!(look, "𝛃", 2));
2296         assert!(testlook!(look, "𝛃", 3));
2297 
2298         // Non word boundaries with non-ASCII codepoints.
2299         assert!(testlook!(look, "𝛃𐆀", 1));
2300         assert!(testlook!(look, "𝛃𐆀", 2));
2301         assert!(testlook!(look, "𝛃𐆀", 3));
2302         assert!(testlook!(look, "𝛃𐆀", 5));
2303         assert!(testlook!(look, "𝛃𐆀", 6));
2304         assert!(testlook!(look, "𝛃𐆀", 7));
2305         assert!(testlook!(look, "𝛃𐆀", 8));
2306     }
2307 
2308     #[test]
2309     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2310     fn look_matches_word_start_half_unicode() {
2311         let look = Look::WordStartHalfUnicode;
2312 
2313         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2314         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2315 
2316         // Simple ASCII word boundaries.
2317         assert!(testlook!(look, "a", 0));
2318         assert!(!testlook!(look, "a", 1));
2319         assert!(!testlook!(look, "a ", 1));
2320         assert!(testlook!(look, " a ", 1));
2321         assert!(!testlook!(look, " a ", 2));
2322 
2323         // Unicode word boundaries with a non-ASCII codepoint.
2324         assert!(testlook!(look, "𝛃", 0));
2325         assert!(!testlook!(look, "𝛃", 4));
2326         assert!(!testlook!(look, "𝛃 ", 4));
2327         assert!(testlook!(look, " 𝛃 ", 1));
2328         assert!(!testlook!(look, " 𝛃 ", 5));
2329 
2330         // Unicode word boundaries between non-ASCII codepoints.
2331         assert!(testlook!(look, "𝛃𐆀", 0));
2332         assert!(!testlook!(look, "𝛃𐆀", 4));
2333 
2334         // Non word boundaries for ASCII.
2335         assert!(testlook!(look, "", 0));
2336         assert!(!testlook!(look, "ab", 1));
2337         assert!(testlook!(look, "a ", 2));
2338         assert!(testlook!(look, " a ", 0));
2339         assert!(testlook!(look, " a ", 3));
2340 
2341         // Non word boundaries with a non-ASCII codepoint.
2342         assert!(!testlook!(look, "𝛃b", 4));
2343         assert!(!testlook!(look, "b𝛃", 1));
2344         assert!(testlook!(look, "𝛃 ", 5));
2345         assert!(testlook!(look, " 𝛃 ", 0));
2346         assert!(testlook!(look, " 𝛃 ", 6));
2347         assert!(!testlook!(look, "𝛃", 1));
2348         assert!(!testlook!(look, "𝛃", 2));
2349         assert!(!testlook!(look, "𝛃", 3));
2350 
2351         // Non word boundaries with non-ASCII codepoints.
2352         assert!(!testlook!(look, "𝛃𐆀", 1));
2353         assert!(!testlook!(look, "𝛃𐆀", 2));
2354         assert!(!testlook!(look, "𝛃𐆀", 3));
2355         assert!(!testlook!(look, "𝛃𐆀", 5));
2356         assert!(!testlook!(look, "𝛃𐆀", 6));
2357         assert!(!testlook!(look, "𝛃𐆀", 7));
2358         assert!(testlook!(look, "𝛃𐆀", 8));
2359     }
2360 
2361     #[test]
2362     #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2363     fn look_matches_word_end_half_unicode() {
2364         let look = Look::WordEndHalfUnicode;
2365 
2366         // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2367         // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2368 
2369         // Simple ASCII word boundaries.
2370         assert!(!testlook!(look, "a", 0));
2371         assert!(testlook!(look, "a", 1));
2372         assert!(testlook!(look, "a ", 1));
2373         assert!(!testlook!(look, " a ", 1));
2374         assert!(testlook!(look, " a ", 2));
2375 
2376         // Unicode word boundaries with a non-ASCII codepoint.
2377         assert!(!testlook!(look, "𝛃", 0));
2378         assert!(testlook!(look, "𝛃", 4));
2379         assert!(testlook!(look, "𝛃 ", 4));
2380         assert!(!testlook!(look, " 𝛃 ", 1));
2381         assert!(testlook!(look, " 𝛃 ", 5));
2382 
2383         // Unicode word boundaries between non-ASCII codepoints.
2384         assert!(!testlook!(look, "𝛃𐆀", 0));
2385         assert!(testlook!(look, "𝛃𐆀", 4));
2386 
2387         // Non word boundaries for ASCII.
2388         assert!(testlook!(look, "", 0));
2389         assert!(!testlook!(look, "ab", 1));
2390         assert!(testlook!(look, "a ", 2));
2391         assert!(testlook!(look, " a ", 0));
2392         assert!(testlook!(look, " a ", 3));
2393 
2394         // Non word boundaries with a non-ASCII codepoint.
2395         assert!(!testlook!(look, "𝛃b", 4));
2396         assert!(!testlook!(look, "b𝛃", 1));
2397         assert!(testlook!(look, "𝛃 ", 5));
2398         assert!(testlook!(look, " 𝛃 ", 0));
2399         assert!(testlook!(look, " 𝛃 ", 6));
2400         assert!(!testlook!(look, "𝛃", 1));
2401         assert!(!testlook!(look, "𝛃", 2));
2402         assert!(!testlook!(look, "𝛃", 3));
2403 
2404         // Non word boundaries with non-ASCII codepoints.
2405         assert!(!testlook!(look, "𝛃𐆀", 1));
2406         assert!(!testlook!(look, "𝛃𐆀", 2));
2407         assert!(!testlook!(look, "𝛃𐆀", 3));
2408         assert!(!testlook!(look, "𝛃𐆀", 5));
2409         assert!(!testlook!(look, "𝛃𐆀", 6));
2410         assert!(!testlook!(look, "𝛃𐆀", 7));
2411         assert!(testlook!(look, "𝛃𐆀", 8));
2412     }
2413 
2414     #[test]
2415     fn look_set() {
2416         let mut f = LookSet::default();
2417         assert!(!f.contains(Look::Start));
2418         assert!(!f.contains(Look::End));
2419         assert!(!f.contains(Look::StartLF));
2420         assert!(!f.contains(Look::EndLF));
2421         assert!(!f.contains(Look::WordUnicode));
2422         assert!(!f.contains(Look::WordUnicodeNegate));
2423         assert!(!f.contains(Look::WordAscii));
2424         assert!(!f.contains(Look::WordAsciiNegate));
2425 
2426         f = f.insert(Look::Start);
2427         assert!(f.contains(Look::Start));
2428         f = f.remove(Look::Start);
2429         assert!(!f.contains(Look::Start));
2430 
2431         f = f.insert(Look::End);
2432         assert!(f.contains(Look::End));
2433         f = f.remove(Look::End);
2434         assert!(!f.contains(Look::End));
2435 
2436         f = f.insert(Look::StartLF);
2437         assert!(f.contains(Look::StartLF));
2438         f = f.remove(Look::StartLF);
2439         assert!(!f.contains(Look::StartLF));
2440 
2441         f = f.insert(Look::EndLF);
2442         assert!(f.contains(Look::EndLF));
2443         f = f.remove(Look::EndLF);
2444         assert!(!f.contains(Look::EndLF));
2445 
2446         f = f.insert(Look::StartCRLF);
2447         assert!(f.contains(Look::StartCRLF));
2448         f = f.remove(Look::StartCRLF);
2449         assert!(!f.contains(Look::StartCRLF));
2450 
2451         f = f.insert(Look::EndCRLF);
2452         assert!(f.contains(Look::EndCRLF));
2453         f = f.remove(Look::EndCRLF);
2454         assert!(!f.contains(Look::EndCRLF));
2455 
2456         f = f.insert(Look::WordUnicode);
2457         assert!(f.contains(Look::WordUnicode));
2458         f = f.remove(Look::WordUnicode);
2459         assert!(!f.contains(Look::WordUnicode));
2460 
2461         f = f.insert(Look::WordUnicodeNegate);
2462         assert!(f.contains(Look::WordUnicodeNegate));
2463         f = f.remove(Look::WordUnicodeNegate);
2464         assert!(!f.contains(Look::WordUnicodeNegate));
2465 
2466         f = f.insert(Look::WordAscii);
2467         assert!(f.contains(Look::WordAscii));
2468         f = f.remove(Look::WordAscii);
2469         assert!(!f.contains(Look::WordAscii));
2470 
2471         f = f.insert(Look::WordAsciiNegate);
2472         assert!(f.contains(Look::WordAsciiNegate));
2473         f = f.remove(Look::WordAsciiNegate);
2474         assert!(!f.contains(Look::WordAsciiNegate));
2475 
2476         f = f.insert(Look::WordStartAscii);
2477         assert!(f.contains(Look::WordStartAscii));
2478         f = f.remove(Look::WordStartAscii);
2479         assert!(!f.contains(Look::WordStartAscii));
2480 
2481         f = f.insert(Look::WordEndAscii);
2482         assert!(f.contains(Look::WordEndAscii));
2483         f = f.remove(Look::WordEndAscii);
2484         assert!(!f.contains(Look::WordEndAscii));
2485 
2486         f = f.insert(Look::WordStartUnicode);
2487         assert!(f.contains(Look::WordStartUnicode));
2488         f = f.remove(Look::WordStartUnicode);
2489         assert!(!f.contains(Look::WordStartUnicode));
2490 
2491         f = f.insert(Look::WordEndUnicode);
2492         assert!(f.contains(Look::WordEndUnicode));
2493         f = f.remove(Look::WordEndUnicode);
2494         assert!(!f.contains(Look::WordEndUnicode));
2495 
2496         f = f.insert(Look::WordStartHalfAscii);
2497         assert!(f.contains(Look::WordStartHalfAscii));
2498         f = f.remove(Look::WordStartHalfAscii);
2499         assert!(!f.contains(Look::WordStartHalfAscii));
2500 
2501         f = f.insert(Look::WordEndHalfAscii);
2502         assert!(f.contains(Look::WordEndHalfAscii));
2503         f = f.remove(Look::WordEndHalfAscii);
2504         assert!(!f.contains(Look::WordEndHalfAscii));
2505 
2506         f = f.insert(Look::WordStartHalfUnicode);
2507         assert!(f.contains(Look::WordStartHalfUnicode));
2508         f = f.remove(Look::WordStartHalfUnicode);
2509         assert!(!f.contains(Look::WordStartHalfUnicode));
2510 
2511         f = f.insert(Look::WordEndHalfUnicode);
2512         assert!(f.contains(Look::WordEndHalfUnicode));
2513         f = f.remove(Look::WordEndHalfUnicode);
2514         assert!(!f.contains(Look::WordEndHalfUnicode));
2515     }
2516 
2517     #[test]
2518     fn look_set_iter() {
2519         let set = LookSet::empty();
2520         assert_eq!(0, set.iter().count());
2521 
2522         let set = LookSet::full();
2523         assert_eq!(18, set.iter().count());
2524 
2525         let set =
2526             LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
2527         assert_eq!(2, set.iter().count());
2528 
2529         let set = LookSet::empty().insert(Look::StartLF);
2530         assert_eq!(1, set.iter().count());
2531 
2532         let set = LookSet::empty().insert(Look::WordAsciiNegate);
2533         assert_eq!(1, set.iter().count());
2534 
2535         let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
2536         assert_eq!(1, set.iter().count());
2537     }
2538 
2539     #[test]
2540     #[cfg(feature = "alloc")]
2541     fn look_set_debug() {
2542         let res = alloc::format!("{:?}", LookSet::empty());
2543         assert_eq!("∅", res);
2544         let res = alloc::format!("{:?}", LookSet::full());
2545         assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
2546     }
2547 }
2548