1 /*!
2 Provides helpers for dealing with start state configurations in DFAs.
3 */
4 
5 use crate::util::{
6     look::LookMatcher,
7     search::{Anchored, Input},
8     wire::{self, DeserializeError, SerializeError},
9 };
10 
11 /// The configuration used to determine a DFA's start state for a search.
12 ///
13 /// A DFA has a single starting state in the typical textbook description. That
14 /// is, it corresponds to the set of all starting states for the NFA that built
15 /// it, along with their espsilon closures. In this crate, however, DFAs have
16 /// many possible start states due to a few factors:
17 ///
18 /// * DFAs support the ability to run either anchored or unanchored searches.
19 /// Each type of search needs its own start state. For example, an unanchored
20 /// search requires starting at a state corresponding to a regex with a
21 /// `(?s-u:.)*?` prefix, which will match through anything.
22 /// * DFAs also optionally support starting an anchored search for any one
23 /// specific pattern. Each such pattern requires its own start state.
24 /// * If a look-behind assertion like `^` or `\b` is used in the regex, then
25 /// the DFA will need to inspect a single byte immediately before the start of
26 /// the search to choose the correct start state.
27 ///
28 /// Indeed, this configuration precisely encapsulates all of the above factors.
29 /// The [`Config::anchored`] method sets which kind of anchored search to
30 /// perform while the [`Config::look_behind`] method provides a way to set
31 /// the byte that occurs immediately before the start of the search.
32 ///
33 /// Generally speaking, this type is only useful when you want to run searches
34 /// without using an [`Input`]. In particular, an `Input` wants a haystack
35 /// slice, but callers may not have a contiguous sequence of bytes as a
36 /// haystack in all cases. This type provides a lower level of control such
37 /// that callers can provide their own anchored configuration and look-behind
38 /// byte explicitly.
39 ///
40 /// # Example
41 ///
42 /// This shows basic usage that permits running a search with a DFA without
43 /// using the `Input` abstraction.
44 ///
45 /// ```
46 /// use regex_automata::{
47 ///     dfa::{Automaton, dense},
48 ///     util::start,
49 ///     Anchored,
50 /// };
51 ///
52 /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
53 /// let haystack = "quartz";
54 ///
55 /// let config = start::Config::new().anchored(Anchored::Yes);
56 /// let mut state = dfa.start_state(&config)?;
57 /// for &b in haystack.as_bytes().iter() {
58 ///     state = dfa.next_state(state, b);
59 /// }
60 /// state = dfa.next_eoi_state(state);
61 /// assert!(dfa.is_match_state(state));
62 ///
63 /// # Ok::<(), Box<dyn std::error::Error>>(())
64 /// ```
65 ///
66 /// This example shows how to correctly run a search that doesn't begin at
67 /// the start of a haystack. Notice how we set the look-behind byte, and as
68 /// a result, the `\b` assertion does not match.
69 ///
70 /// ```
71 /// use regex_automata::{
72 ///     dfa::{Automaton, dense},
73 ///     util::start,
74 ///     Anchored,
75 /// };
76 ///
77 /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
78 /// let haystack = "quartz";
79 ///
80 /// let config = start::Config::new()
81 ///     .anchored(Anchored::Yes)
82 ///     .look_behind(Some(b'q'));
83 /// let mut state = dfa.start_state(&config)?;
84 /// for &b in haystack.as_bytes().iter().skip(1) {
85 ///     state = dfa.next_state(state, b);
86 /// }
87 /// state = dfa.next_eoi_state(state);
88 /// // No match!
89 /// assert!(!dfa.is_match_state(state));
90 ///
91 /// # Ok::<(), Box<dyn std::error::Error>>(())
92 /// ```
93 ///
94 /// If we had instead not set a look-behind byte, then the DFA would assume
95 /// that it was starting at the beginning of the haystack, and thus `\b` should
96 /// match. This in turn would result in erroneously reporting a match:
97 ///
98 /// ```
99 /// use regex_automata::{
100 ///     dfa::{Automaton, dense},
101 ///     util::start,
102 ///     Anchored,
103 /// };
104 ///
105 /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
106 /// let haystack = "quartz";
107 ///
108 /// // Whoops, forgot the look-behind byte...
109 /// let config = start::Config::new().anchored(Anchored::Yes);
110 /// let mut state = dfa.start_state(&config)?;
111 /// for &b in haystack.as_bytes().iter().skip(1) {
112 ///     state = dfa.next_state(state, b);
113 /// }
114 /// state = dfa.next_eoi_state(state);
115 /// // And now we get a match unexpectedly.
116 /// assert!(dfa.is_match_state(state));
117 ///
118 /// # Ok::<(), Box<dyn std::error::Error>>(())
119 /// ```
120 #[derive(Clone, Debug)]
121 pub struct Config {
122     look_behind: Option<u8>,
123     anchored: Anchored,
124 }
125 
126 impl Config {
127     /// Create a new default start configuration.
128     ///
129     /// The default is an unanchored search that starts at the beginning of the
130     /// haystack.
new() -> Config131     pub fn new() -> Config {
132         Config { anchored: Anchored::No, look_behind: None }
133     }
134 
135     /// A convenience routine for building a start configuration from an
136     /// [`Input`] for a forward search.
137     ///
138     /// This automatically sets the look-behind byte to the byte immediately
139     /// preceding the start of the search. If the start of the search is at
140     /// offset `0`, then no look-behind byte is set.
from_input_forward(input: &Input<'_>) -> Config141     pub fn from_input_forward(input: &Input<'_>) -> Config {
142         let look_behind = input
143             .start()
144             .checked_sub(1)
145             .and_then(|i| input.haystack().get(i).copied());
146         Config { look_behind, anchored: input.get_anchored() }
147     }
148 
149     /// A convenience routine for building a start configuration from an
150     /// [`Input`] for a reverse search.
151     ///
152     /// This automatically sets the look-behind byte to the byte immediately
153     /// following the end of the search. If the end of the search is at
154     /// offset `haystack.len()`, then no look-behind byte is set.
from_input_reverse(input: &Input<'_>) -> Config155     pub fn from_input_reverse(input: &Input<'_>) -> Config {
156         let look_behind = input.haystack().get(input.end()).copied();
157         Config { look_behind, anchored: input.get_anchored() }
158     }
159 
160     /// Set the look-behind byte at the start of a search.
161     ///
162     /// Unless the search is intended to logically start at the beginning of a
163     /// haystack, this should _always_ be set to the byte immediately preceding
164     /// the start of the search. If no look-behind byte is set, then the start
165     /// configuration will assume it is at the beginning of the haystack. For
166     /// example, the anchor `^` will match.
167     ///
168     /// The default is that no look-behind byte is set.
look_behind(mut self, byte: Option<u8>) -> Config169     pub fn look_behind(mut self, byte: Option<u8>) -> Config {
170         self.look_behind = byte;
171         self
172     }
173 
174     /// Set the anchored mode of a search.
175     ///
176     /// The default is an unanchored search.
anchored(mut self, mode: Anchored) -> Config177     pub fn anchored(mut self, mode: Anchored) -> Config {
178         self.anchored = mode;
179         self
180     }
181 
182     /// Return the look-behind byte in this configuration, if one exists.
get_look_behind(&self) -> Option<u8>183     pub fn get_look_behind(&self) -> Option<u8> {
184         self.look_behind
185     }
186 
187     /// Return the anchored mode in this configuration.
get_anchored(&self) -> Anchored188     pub fn get_anchored(&self) -> Anchored {
189         self.anchored
190     }
191 }
192 
193 /// A map from every possible byte value to its corresponding starting
194 /// configuration.
195 ///
196 /// This map is used in order to lookup the start configuration for a particular
197 /// position in a haystack. This start configuration is then used in
198 /// combination with things like the anchored mode and pattern ID to fully
199 /// determine the start state.
200 ///
201 /// Generally speaking, this map is only used for fully compiled DFAs and lazy
202 /// DFAs. For NFAs (including the one-pass DFA), the start state is generally
203 /// selected by virtue of traversing the NFA state graph. DFAs do the same
204 /// thing, but at build time and not search time. (Well, technically the lazy
205 /// DFA does it at search time, but it does enough work to cache the full
206 /// result of the epsilon closure that the NFA engines tend to need to do.)
207 #[derive(Clone)]
208 pub(crate) struct StartByteMap {
209     map: [Start; 256],
210 }
211 
212 impl StartByteMap {
213     /// Create a new map from byte values to their corresponding starting
214     /// configurations. The map is determined, in part, by how look-around
215     /// assertions are matched via the matcher given.
new(lookm: &LookMatcher) -> StartByteMap216     pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap {
217         let mut map = [Start::NonWordByte; 256];
218         map[usize::from(b'\n')] = Start::LineLF;
219         map[usize::from(b'\r')] = Start::LineCR;
220         map[usize::from(b'_')] = Start::WordByte;
221 
222         let mut byte = b'0';
223         while byte <= b'9' {
224             map[usize::from(byte)] = Start::WordByte;
225             byte += 1;
226         }
227         byte = b'A';
228         while byte <= b'Z' {
229             map[usize::from(byte)] = Start::WordByte;
230             byte += 1;
231         }
232         byte = b'a';
233         while byte <= b'z' {
234             map[usize::from(byte)] = Start::WordByte;
235             byte += 1;
236         }
237 
238         let lineterm = lookm.get_line_terminator();
239         // If our line terminator is normal, then it is already handled by
240         // the LineLF and LineCR configurations. But if it's weird, then we
241         // overwrite whatever was there before for that terminator with a
242         // special configuration. The trick here is that if the terminator
243         // is, say, a word byte like `a`, then callers seeing this start
244         // configuration need to account for that and build their DFA state as
245         // if it *also* came from a word byte.
246         if lineterm != b'\r' && lineterm != b'\n' {
247             map[usize::from(lineterm)] = Start::CustomLineTerminator;
248         }
249         StartByteMap { map }
250     }
251 
252     /// Return the starting configuration for the given look-behind byte.
253     ///
254     /// If no look-behind exists, callers should use `Start::Text`.
255     #[cfg_attr(feature = "perf-inline", inline(always))]
get(&self, byte: u8) -> Start256     pub(crate) fn get(&self, byte: u8) -> Start {
257         self.map[usize::from(byte)]
258     }
259 
260     /// Deserializes a byte class map from the given slice. If the slice is of
261     /// insufficient length or otherwise contains an impossible mapping, then
262     /// an error is returned. Upon success, the number of bytes read along with
263     /// the map are returned. The number of bytes read is always a multiple of
264     /// 8.
from_bytes( slice: &[u8], ) -> Result<(StartByteMap, usize), DeserializeError>265     pub(crate) fn from_bytes(
266         slice: &[u8],
267     ) -> Result<(StartByteMap, usize), DeserializeError> {
268         wire::check_slice_len(slice, 256, "start byte map")?;
269         let mut map = [Start::NonWordByte; 256];
270         for (i, &repr) in slice[..256].iter().enumerate() {
271             map[i] = match Start::from_usize(usize::from(repr)) {
272                 Some(start) => start,
273                 None => {
274                     return Err(DeserializeError::generic(
275                         "found invalid starting configuration",
276                     ))
277                 }
278             };
279         }
280         Ok((StartByteMap { map }, 256))
281     }
282 
283     /// Writes this map to the given byte buffer. if the given buffer is too
284     /// small, then an error is returned. Upon success, the total number of
285     /// bytes written is returned. The number of bytes written is guaranteed to
286     /// be a multiple of 8.
write_to( &self, dst: &mut [u8], ) -> Result<usize, SerializeError>287     pub(crate) fn write_to(
288         &self,
289         dst: &mut [u8],
290     ) -> Result<usize, SerializeError> {
291         let nwrite = self.write_to_len();
292         if dst.len() < nwrite {
293             return Err(SerializeError::buffer_too_small("start byte map"));
294         }
295         for (i, &start) in self.map.iter().enumerate() {
296             dst[i] = start.as_u8();
297         }
298         Ok(nwrite)
299     }
300 
301     /// Returns the total number of bytes written by `write_to`.
write_to_len(&self) -> usize302     pub(crate) fn write_to_len(&self) -> usize {
303         256
304     }
305 }
306 
307 impl core::fmt::Debug for StartByteMap {
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result308     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
309         use crate::util::escape::DebugByte;
310 
311         write!(f, "StartByteMap{{")?;
312         for byte in 0..=255 {
313             if byte > 0 {
314                 write!(f, ", ")?;
315             }
316             let start = self.map[usize::from(byte)];
317             write!(f, "{:?} => {:?}", DebugByte(byte), start)?;
318         }
319         write!(f, "}}")?;
320         Ok(())
321     }
322 }
323 
324 /// Represents the six possible starting configurations of a DFA search.
325 ///
326 /// The starting configuration is determined by inspecting the the beginning
327 /// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID
328 /// (if specified) and the type of search (anchored or not) is what selects the
329 /// start state to use in a DFA.
330 ///
331 /// As one example, if a DFA only supports unanchored searches and does not
332 /// support anchored searches for each pattern, then it will have at most 6
333 /// distinct start states. (Some start states may be reused if determinization
334 /// can determine that they will be equivalent.) If the DFA supports both
335 /// anchored and unanchored searches, then it will have a maximum of 12
336 /// distinct start states. Finally, if the DFA also supports anchored searches
337 /// for each pattern, then it can have up to `12 + (N * 6)` start states, where
338 /// `N` is the number of patterns.
339 ///
340 /// Handling each of these starting configurations in the context of DFA
341 /// determinization can be *quite* tricky and subtle. But the code is small
342 /// and can be found at `crate::util::determinize::set_lookbehind_from_start`.
343 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
344 pub(crate) enum Start {
345     /// This occurs when the starting position is not any of the ones below.
346     NonWordByte = 0,
347     /// This occurs when the byte immediately preceding the start of the search
348     /// is an ASCII word byte.
349     WordByte = 1,
350     /// This occurs when the starting position of the search corresponds to the
351     /// beginning of the haystack.
352     Text = 2,
353     /// This occurs when the byte immediately preceding the start of the search
354     /// is a line terminator. Specifically, `\n`.
355     LineLF = 3,
356     /// This occurs when the byte immediately preceding the start of the search
357     /// is a line terminator. Specifically, `\r`.
358     LineCR = 4,
359     /// This occurs when a custom line terminator has been set via a
360     /// `LookMatcher`, and when that line terminator is neither a `\r` or a
361     /// `\n`.
362     ///
363     /// If the custom line terminator is a word byte, then this start
364     /// configuration is still selected. DFAs that implement word boundary
365     /// assertions will likely need to check whether the custom line terminator
366     /// is a word byte, in which case, it should behave as if the byte
367     /// satisfies `\b` in addition to multi-line anchors.
368     CustomLineTerminator = 5,
369 }
370 
371 impl Start {
372     /// Return the starting state corresponding to the given integer. If no
373     /// starting state exists for the given integer, then None is returned.
from_usize(n: usize) -> Option<Start>374     pub(crate) fn from_usize(n: usize) -> Option<Start> {
375         match n {
376             0 => Some(Start::NonWordByte),
377             1 => Some(Start::WordByte),
378             2 => Some(Start::Text),
379             3 => Some(Start::LineLF),
380             4 => Some(Start::LineCR),
381             5 => Some(Start::CustomLineTerminator),
382             _ => None,
383         }
384     }
385 
386     /// Returns the total number of starting state configurations.
len() -> usize387     pub(crate) fn len() -> usize {
388         6
389     }
390 
391     /// Return this starting configuration as `u8` integer. It is guaranteed to
392     /// be less than `Start::len()`.
393     #[cfg_attr(feature = "perf-inline", inline(always))]
as_u8(&self) -> u8394     pub(crate) fn as_u8(&self) -> u8 {
395         // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
396         // actual int.
397         *self as u8
398     }
399 
400     /// Return this starting configuration as a `usize` integer. It is
401     /// guaranteed to be less than `Start::len()`.
402     #[cfg_attr(feature = "perf-inline", inline(always))]
as_usize(&self) -> usize403     pub(crate) fn as_usize(&self) -> usize {
404         usize::from(self.as_u8())
405     }
406 }
407 
408 #[cfg(test)]
409 mod tests {
410     use super::*;
411 
412     #[test]
start_fwd_done_range()413     fn start_fwd_done_range() {
414         let smap = StartByteMap::new(&LookMatcher::default());
415         let input = Input::new("").range(1..0);
416         let config = Config::from_input_forward(&input);
417         let start =
418             config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
419         assert_eq!(Start::Text, start);
420     }
421 
422     #[test]
start_rev_done_range()423     fn start_rev_done_range() {
424         let smap = StartByteMap::new(&LookMatcher::default());
425         let input = Input::new("").range(1..0);
426         let config = Config::from_input_reverse(&input);
427         let start =
428             config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
429         assert_eq!(Start::Text, start);
430     }
431 
432     #[test]
start_fwd()433     fn start_fwd() {
434         let f = |haystack, start, end| {
435             let smap = StartByteMap::new(&LookMatcher::default());
436             let input = Input::new(haystack).range(start..end);
437             let config = Config::from_input_forward(&input);
438             let start =
439                 config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
440             start
441         };
442 
443         assert_eq!(Start::Text, f("", 0, 0));
444         assert_eq!(Start::Text, f("abc", 0, 3));
445         assert_eq!(Start::Text, f("\nabc", 0, 3));
446 
447         assert_eq!(Start::LineLF, f("\nabc", 1, 3));
448 
449         assert_eq!(Start::LineCR, f("\rabc", 1, 3));
450 
451         assert_eq!(Start::WordByte, f("abc", 1, 3));
452 
453         assert_eq!(Start::NonWordByte, f(" abc", 1, 3));
454     }
455 
456     #[test]
start_rev()457     fn start_rev() {
458         let f = |haystack, start, end| {
459             let smap = StartByteMap::new(&LookMatcher::default());
460             let input = Input::new(haystack).range(start..end);
461             let config = Config::from_input_reverse(&input);
462             let start =
463                 config.get_look_behind().map_or(Start::Text, |b| smap.get(b));
464             start
465         };
466 
467         assert_eq!(Start::Text, f("", 0, 0));
468         assert_eq!(Start::Text, f("abc", 0, 3));
469         assert_eq!(Start::Text, f("abc\n", 0, 4));
470 
471         assert_eq!(Start::LineLF, f("abc\nz", 0, 3));
472 
473         assert_eq!(Start::LineCR, f("abc\rz", 0, 3));
474 
475         assert_eq!(Start::WordByte, f("abc", 0, 2));
476 
477         assert_eq!(Start::NonWordByte, f("abc ", 0, 3));
478     }
479 }
480