1 /// A state identifier specifically tailored for lazy DFAs.
2 ///
3 /// A lazy state ID logically represents a pointer to a DFA state. In practice,
4 /// by limiting the number of DFA states it can address, it reserves some
5 /// bits of its representation to encode some additional information. That
6 /// additional information is called a "tag." That tag is used to record
7 /// whether the state it points to is an unknown, dead, quit, start or match
8 /// state.
9 ///
10 /// When implementing a low level search routine with a lazy DFA, it is
11 /// necessary to query the type of the current state to know what to do:
12 ///
13 /// * **Unknown** - The state has not yet been computed. The
14 /// parameters used to get this state ID must be re-passed to
15 /// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never
16 /// return an unknown state ID.
17 /// * **Dead** - A dead state only has transitions to itself. It indicates that
18 /// the search cannot do anything else and should stop with whatever result it
19 /// has.
20 /// * **Quit** - A quit state indicates that the automaton could not answer
21 /// whether a match exists or not. Correct search implementations must return a
22 /// [`MatchError::quit`](crate::MatchError::quit) when a DFA enters a quit
23 /// state.
24 /// * **Start** - A start state is a state in which a search can begin.
25 /// Lazy DFAs usually have more than one start state. Branching on
26 /// this isn't required for correctness, but a common optimization is
27 /// to run a prefilter when a search enters a start state. Note that
28 /// start states are *not* tagged automatically, and one must enable the
29 /// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states)
30 /// setting for start states to be tagged. The reason for this is
31 /// that a DFA search loop is usually written to execute a prefilter once it
32 /// enters a start state. But if there is no prefilter, this handling can be
33 /// quite diastrous as the DFA may ping-pong between the special handling code
34 /// and a possible optimized hot path for handling untagged states. When start
35 /// states aren't specialized, then they are untagged and remain in the hot
36 /// path.
37 /// * **Match** - A match state indicates that a match has been found.
38 /// Depending on the semantics of your search implementation, it may either
39 /// continue until the end of the haystack or a dead state, or it might quit
40 /// and return the match immediately.
41 ///
42 /// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
43 /// can be used to determine if a tag exists at all. This is useful to avoid
44 /// branching on all of the above types for every byte searched.
45 ///
46 /// # Example
47 ///
48 /// This example shows how `LazyStateID` can be used to implement a correct
49 /// search routine with minimal branching. In particular, this search routine
50 /// implements "leftmost" matching, which means that it doesn't immediately
51 /// stop once a match is found. Instead, it continues until it reaches a dead
52 /// state.
53 ///
54 /// Notice also how a correct search implementation deals with
55 /// [`CacheError`](crate::hybrid::CacheError)s returned by some of
56 /// the lazy DFA routines. When a `CacheError` occurs, it returns
57 /// [`MatchError::gave_up`](crate::MatchError::gave_up).
58 ///
59 /// ```
60 /// use regex_automata::{
61 ///     hybrid::dfa::{Cache, DFA},
62 ///     HalfMatch, MatchError, Input,
63 /// };
64 ///
65 /// fn find_leftmost_first(
66 ///     dfa: &DFA,
67 ///     cache: &mut Cache,
68 ///     haystack: &[u8],
69 /// ) -> Result<Option<HalfMatch>, MatchError> {
70 ///     // The start state is determined by inspecting the position and the
71 ///     // initial bytes of the haystack. Note that start states can never
72 ///     // be match states (since DFAs in this crate delay matches by 1
73 ///     // byte), so we don't need to check if the start state is a match.
74 ///     let mut sid = dfa.start_state_forward(
75 ///         cache,
76 ///         &Input::new(haystack),
77 ///     )?;
78 ///     let mut last_match = None;
79 ///     // Walk all the bytes in the haystack. We can quit early if we see
80 ///     // a dead or a quit state. The former means the automaton will
81 ///     // never transition to any other state. The latter means that the
82 ///     // automaton entered a condition in which its search failed.
83 ///     for (i, &b) in haystack.iter().enumerate() {
84 ///         sid = dfa
85 ///             .next_state(cache, sid, b)
86 ///             .map_err(|_| MatchError::gave_up(i))?;
87 ///         if sid.is_tagged() {
88 ///             if sid.is_match() {
89 ///                 last_match = Some(HalfMatch::new(
90 ///                     dfa.match_pattern(cache, sid, 0),
91 ///                     i,
92 ///                 ));
93 ///             } else if sid.is_dead() {
94 ///                 return Ok(last_match);
95 ///             } else if sid.is_quit() {
96 ///                 // It is possible to enter into a quit state after
97 ///                 // observing a match has occurred. In that case, we
98 ///                 // should return the match instead of an error.
99 ///                 if last_match.is_some() {
100 ///                     return Ok(last_match);
101 ///                 }
102 ///                 return Err(MatchError::quit(b, i));
103 ///             }
104 ///             // Implementors may also want to check for start states and
105 ///             // handle them differently for performance reasons. But it is
106 ///             // not necessary for correctness. Note that in order to check
107 ///             // for start states, you'll need to enable the
108 ///             // 'specialize_start_states' config knob, otherwise start
109 ///             // states will not be tagged.
110 ///         }
111 ///     }
112 ///     // Matches are always delayed by 1 byte, so we must explicitly walk
113 ///     // the special "EOI" transition at the end of the search.
114 ///     sid = dfa
115 ///         .next_eoi_state(cache, sid)
116 ///         .map_err(|_| MatchError::gave_up(haystack.len()))?;
117 ///     if sid.is_match() {
118 ///         last_match = Some(HalfMatch::new(
119 ///             dfa.match_pattern(cache, sid, 0),
120 ///             haystack.len(),
121 ///         ));
122 ///     }
123 ///     Ok(last_match)
124 /// }
125 ///
126 /// // We use a greedy '+' operator to show how the search doesn't just stop
127 /// // once a match is detected. It continues extending the match. Using
128 /// // '[a-z]+?' would also work as expected and stop the search early.
129 /// // Greediness is built into the automaton.
130 /// let dfa = DFA::new(r"[a-z]+")?;
131 /// let mut cache = dfa.create_cache();
132 /// let haystack = "123 foobar 4567".as_bytes();
133 /// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
134 /// assert_eq!(mat.pattern().as_usize(), 0);
135 /// assert_eq!(mat.offset(), 10);
136 ///
137 /// // Here's another example that tests our handling of the special
138 /// // EOI transition. This will fail to find a match if we don't call
139 /// // 'next_eoi_state' at the end of the search since the match isn't found
140 /// // until the final byte in the haystack.
141 /// let dfa = DFA::new(r"[0-9]{4}")?;
142 /// let mut cache = dfa.create_cache();
143 /// let haystack = "123 foobar 4567".as_bytes();
144 /// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
145 /// assert_eq!(mat.pattern().as_usize(), 0);
146 /// assert_eq!(mat.offset(), 15);
147 ///
148 /// // And note that our search implementation above automatically works
149 /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
150 /// // the appropriate pattern ID for us.
151 /// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
152 /// let mut cache = dfa.create_cache();
153 /// let haystack = "123 foobar 4567".as_bytes();
154 /// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
155 /// assert_eq!(mat.pattern().as_usize(), 1);
156 /// assert_eq!(mat.offset(), 3);
157 /// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
158 /// assert_eq!(mat.pattern().as_usize(), 0);
159 /// assert_eq!(mat.offset(), 7);
160 /// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
161 /// assert_eq!(mat.pattern().as_usize(), 1);
162 /// assert_eq!(mat.offset(), 5);
163 ///
164 /// # Ok::<(), Box<dyn std::error::Error>>(())
165 /// ```
166 #[derive(
167     Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
168 )]
169 pub struct LazyStateID(u32);
170 
171 impl LazyStateID {
172     #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
173     const MAX_BIT: usize = 31;
174 
175     #[cfg(target_pointer_width = "16")]
176     const MAX_BIT: usize = 15;
177 
178     const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
179     const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
180     const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
181     const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
182     const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
183     const MAX: usize = LazyStateID::MASK_MATCH - 1;
184 
185     /// Create a new lazy state ID.
186     ///
187     /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
188     /// an error.
189     #[inline]
new(id: usize) -> Result<LazyStateID, LazyStateIDError>190     pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
191         if id > LazyStateID::MAX {
192             let attempted = u64::try_from(id).unwrap();
193             return Err(LazyStateIDError { attempted });
194         }
195         Ok(LazyStateID::new_unchecked(id))
196     }
197 
198     /// Create a new lazy state ID without checking whether the given value
199     /// exceeds [`LazyStateID::MAX`].
200     ///
201     /// While this is unchecked, providing an incorrect value must never
202     /// sacrifice memory safety.
203     #[inline]
new_unchecked(id: usize) -> LazyStateID204     const fn new_unchecked(id: usize) -> LazyStateID {
205         // FIXME: Use as_u32() once const functions in traits are stable.
206         LazyStateID(id as u32)
207     }
208 
209     /// Return this lazy state ID as an untagged `usize`.
210     ///
211     /// If this lazy state ID is tagged, then the usize returned is the state
212     /// ID without the tag. If the ID was not tagged, then the usize returned
213     /// is equivalent to the state ID.
214     #[inline]
as_usize_untagged(&self) -> usize215     pub(crate) fn as_usize_untagged(&self) -> usize {
216         self.as_usize_unchecked() & LazyStateID::MAX
217     }
218 
219     /// Return this lazy state ID as its raw internal `usize` value, which may
220     /// be tagged (and thus greater than LazyStateID::MAX).
221     #[inline]
as_usize_unchecked(&self) -> usize222     pub(crate) const fn as_usize_unchecked(&self) -> usize {
223         // FIXME: Use as_usize() once const functions in traits are stable.
224         self.0 as usize
225     }
226 
227     #[inline]
to_unknown(&self) -> LazyStateID228     pub(crate) const fn to_unknown(&self) -> LazyStateID {
229         LazyStateID::new_unchecked(
230             self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
231         )
232     }
233 
234     #[inline]
to_dead(&self) -> LazyStateID235     pub(crate) const fn to_dead(&self) -> LazyStateID {
236         LazyStateID::new_unchecked(
237             self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
238         )
239     }
240 
241     #[inline]
to_quit(&self) -> LazyStateID242     pub(crate) const fn to_quit(&self) -> LazyStateID {
243         LazyStateID::new_unchecked(
244             self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
245         )
246     }
247 
248     /// Return this lazy state ID as a state ID that is tagged as a start
249     /// state.
250     #[inline]
to_start(&self) -> LazyStateID251     pub(crate) const fn to_start(&self) -> LazyStateID {
252         LazyStateID::new_unchecked(
253             self.as_usize_unchecked() | LazyStateID::MASK_START,
254         )
255     }
256 
257     /// Return this lazy state ID as a lazy state ID that is tagged as a match
258     /// state.
259     #[inline]
to_match(&self) -> LazyStateID260     pub(crate) const fn to_match(&self) -> LazyStateID {
261         LazyStateID::new_unchecked(
262             self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
263         )
264     }
265 
266     /// Return true if and only if this lazy state ID is tagged.
267     ///
268     /// When a lazy state ID is tagged, then one can conclude that it is one
269     /// of a match, start, dead, quit or unknown state.
270     #[inline]
is_tagged(&self) -> bool271     pub const fn is_tagged(&self) -> bool {
272         self.as_usize_unchecked() > LazyStateID::MAX
273     }
274 
275     /// Return true if and only if this represents a lazy state ID that is
276     /// "unknown." That is, the state has not yet been created. When a caller
277     /// sees this state ID, it generally means that a state has to be computed
278     /// in order to proceed.
279     #[inline]
is_unknown(&self) -> bool280     pub const fn is_unknown(&self) -> bool {
281         self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
282     }
283 
284     /// Return true if and only if this represents a dead state. A dead state
285     /// is a state that can never transition to any other state except the
286     /// dead state. When a dead state is seen, it generally indicates that a
287     /// search should stop.
288     #[inline]
is_dead(&self) -> bool289     pub const fn is_dead(&self) -> bool {
290         self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
291     }
292 
293     /// Return true if and only if this represents a quit state. A quit state
294     /// is a state that is representationally equivalent to a dead state,
295     /// except it indicates the automaton has reached a point at which it can
296     /// no longer determine whether a match exists or not. In general, this
297     /// indicates an error during search and the caller must either pass this
298     /// error up or use a different search technique.
299     #[inline]
is_quit(&self) -> bool300     pub const fn is_quit(&self) -> bool {
301         self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
302     }
303 
304     /// Return true if and only if this lazy state ID has been tagged as a
305     /// start state.
306     ///
307     /// Note that if
308     /// [`Config::specialize_start_states`](crate::hybrid::dfa::Config) is
309     /// disabled (which is the default), then this will always return false
310     /// since start states won't be tagged.
311     #[inline]
is_start(&self) -> bool312     pub const fn is_start(&self) -> bool {
313         self.as_usize_unchecked() & LazyStateID::MASK_START > 0
314     }
315 
316     /// Return true if and only if this lazy state ID has been tagged as a
317     /// match state.
318     #[inline]
is_match(&self) -> bool319     pub const fn is_match(&self) -> bool {
320         self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
321     }
322 }
323 
324 /// This error occurs when a lazy state ID could not be constructed.
325 ///
326 /// This occurs when given an integer exceeding the maximum lazy state ID
327 /// value.
328 ///
329 /// When the `std` feature is enabled, this implements the `Error` trait.
330 #[derive(Clone, Debug, Eq, PartialEq)]
331 pub(crate) struct LazyStateIDError {
332     attempted: u64,
333 }
334 
335 impl LazyStateIDError {
336     /// Returns the value that failed to constructed a lazy state ID.
attempted(&self) -> u64337     pub(crate) fn attempted(&self) -> u64 {
338         self.attempted
339     }
340 }
341 
342 #[cfg(feature = "std")]
343 impl std::error::Error for LazyStateIDError {}
344 
345 impl core::fmt::Display for LazyStateIDError {
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result346     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
347         write!(
348             f,
349             "failed to create LazyStateID from {:?}, which exceeds {:?}",
350             self.attempted(),
351             LazyStateID::MAX,
352         )
353     }
354 }
355