1 use crate::{
2     dfa::DEAD,
3     util::{
4         primitives::StateID,
5         wire::{self, DeserializeError, Endian, SerializeError},
6     },
7 };
8 
9 macro_rules! err {
10     ($msg:expr) => {
11         return Err(DeserializeError::generic($msg));
12     };
13 }
14 
15 // Special represents the identifiers in a DFA that correspond to "special"
16 // states. If a state is one or more of the following, then it is considered
17 // special:
18 //
19 // * dead - A non-matching state where all outgoing transitions lead back to
20 //   itself. There is only one of these, regardless of whether minimization
21 //   has run. The dead state always has an ID of 0. i.e., It is always the
22 //   first state in a DFA.
23 // * quit - A state that is entered whenever a byte is seen that should cause
24 //   a DFA to give up and stop searching. This results in a MatchError::quit
25 //   error being returned at search time. The default configuration for a DFA
26 //   has no quit bytes, which means this state is unreachable by default,
27 //   although it is always present for reasons of implementation simplicity.
28 //   This state is only reachable when the caller configures the DFA to quit
29 //   on certain bytes. There is always exactly one of these states and it
30 //   is always the second state. (Its actual ID depends on the size of the
31 //   alphabet in dense DFAs, since state IDs are premultiplied in order to
32 //   allow them to be used directly as indices into the transition table.)
33 // * match - An accepting state, i.e., indicative of a match. There may be
34 //   zero or more of these states.
35 // * accelerated - A state where all of its outgoing transitions, except a
36 //   few, loop back to itself. These states are candidates for acceleration
37 //   via memchr during search. There may be zero or more of these states.
38 // * start - A non-matching state that indicates where the automaton should
39 //   start during a search. There is always at least one starting state and
40 //   all are guaranteed to be non-match states. (A start state cannot be a
41 //   match state because the DFAs in this crate delay all matches by one byte.
42 //   So every search that finds a match must move through one transition to
43 //   some other match state, even when searching an empty string.)
44 //
45 // These are not mutually exclusive categories. Namely, the following
46 // overlappings can occur:
47 //
48 // * {dead, start} - If a DFA can never lead to a match and it is minimized,
49 //   then it will typically compile to something where all starting IDs point
50 //   to the DFA's dead state.
51 // * {match, accelerated} - It is possible for a match state to have the
52 //   majority of its transitions loop back to itself, which means it's
53 //   possible for a match state to be accelerated.
54 // * {start, accelerated} - Similarly, it is possible for a start state to be
55 //   accelerated. Note that it is possible for an accelerated state to be
56 //   neither a match or a start state. Also note that just because both match
57 //   and start states overlap with accelerated states does not mean that
58 //   match and start states overlap with each other. In fact, they are
59 //   guaranteed not to overlap.
60 //
61 // As a special mention, every DFA always has a dead and a quit state, even
62 // though from the perspective of the DFA, they are equivalent. (Indeed,
63 // minimization special cases them to ensure they don't get merged.) The
64 // purpose of keeping them distinct is to use the quit state as a sentinel to
65 // distguish between whether a search finished successfully without finding
66 // anything or whether it gave up before finishing.
67 //
68 // So the main problem we want to solve here is the *fast* detection of whether
69 // a state is special or not. And we also want to do this while storing as
70 // little extra data as possible. AND we want to be able to quickly determine
71 // which categories a state falls into above if it is special.
72 //
73 // We achieve this by essentially shuffling all special states to the beginning
74 // of a DFA. That is, all special states appear before every other non-special
75 // state. By representing special states this way, we can determine whether a
76 // state is special or not by a single comparison, where special.max is the
77 // identifier of the last special state in the DFA:
78 //
79 //     if current_state <= special.max:
80 //         ... do something with special state
81 //
82 // The only thing left to do is to determine what kind of special state
83 // it is. Because what we do next depends on that. Since special states
84 // are typically rare, we can afford to do a bit more extra work, but we'd
85 // still like this to be as fast as possible. The trick we employ here is to
86 // continue shuffling states even within the special state range. Such that
87 // one contiguous region corresponds to match states, another for start states
88 // and then an overlapping range for accelerated states. At a high level, our
89 // special state detection might look like this (for leftmost searching, where
90 // we continue searching even after seeing a match):
91 //
92 //     byte = input[offset]
93 //     current_state = next_state(current_state, byte)
94 //     offset += 1
95 //     if current_state <= special.max:
96 //         if current_state == 0:
97 //             # We can never leave a dead state, so this always marks the
98 //             # end of our search.
99 //             return last_match
100 //         if current_state == special.quit_id:
101 //             # A quit state means we give up. If he DFA has no quit state,
102 //             # then special.quit_id == 0 == dead, which is handled by the
103 //             # conditional above.
104 //             return Err(MatchError::quit { byte, offset: offset - 1 })
105 //         if special.min_match <= current_state <= special.max_match:
106 //             last_match = Some(offset)
107 //             if special.min_accel <= current_state <= special.max_accel:
108 //                 offset = accelerate(input, offset)
109 //                 last_match = Some(offset)
110 //         elif special.min_start <= current_state <= special.max_start:
111 //             offset = prefilter.find(input, offset)
112 //             if special.min_accel <= current_state <= special.max_accel:
113 //                 offset = accelerate(input, offset)
114 //         elif special.min_accel <= current_state <= special.max_accel:
115 //             offset = accelerate(input, offset)
116 //
117 // There are some small details left out of the logic above. For example,
118 // in order to accelerate a state, we need to know which bytes to search for.
119 // This in turn implies some extra data we need to store in the DFA. To keep
120 // things compact, we would ideally only store
121 //
122 //     N = special.max_accel - special.min_accel + 1
123 //
124 // items. But state IDs are premultiplied, which means they are not contiguous.
125 // So in order to take a state ID and index an array of accelerated structures,
126 // we need to do:
127 //
128 //     i = (state_id - special.min_accel) / stride
129 //
130 // (N.B. 'stride' is always a power of 2, so the above can be implemented via
131 // '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
132 // 2^x=stride.)
133 //
134 // Moreover, some of these specialty categories may be empty. For example,
135 // DFAs are not required to have any match states or any accelerated states.
136 // In that case, the lower and upper bounds are both set to 0 (the dead state
137 // ID) and the first `current_state == 0` check subsumes cases where the
138 // ranges are empty.
139 //
140 // Loop unrolling, if applicable, has also been left out of the logic above.
141 //
142 // Graphically, the ranges look like this, where asterisks indicate ranges
143 // that can be empty. Each 'x' is a state.
144 //
145 //      quit
146 //  dead|
147 //     ||
148 //     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
149 //     | |             |    | start |                       |
150 //     | |-------------|    |-------|                       |
151 //     |   match*   |          |    |                       |
152 //     |            |          |    |                       |
153 //     |            |----------|    |                       |
154 //     |                accel*      |                       |
155 //     |                            |                       |
156 //     |                            |                       |
157 //     |----------------------------|------------------------
158 //              special                   non-special*
159 #[derive(Clone, Copy, Debug)]
160 pub(crate) struct Special {
161     /// The identifier of the last special state in a DFA. A state is special
162     /// if and only if its identifier is less than or equal to `max`.
163     pub(crate) max: StateID,
164     /// The identifier of the quit state in a DFA. (There is no analogous field
165     /// for the dead state since the dead state's ID is always zero, regardless
166     /// of state ID size.)
167     pub(crate) quit_id: StateID,
168     /// The identifier of the first match state.
169     pub(crate) min_match: StateID,
170     /// The identifier of the last match state.
171     pub(crate) max_match: StateID,
172     /// The identifier of the first accelerated state.
173     pub(crate) min_accel: StateID,
174     /// The identifier of the last accelerated state.
175     pub(crate) max_accel: StateID,
176     /// The identifier of the first start state.
177     pub(crate) min_start: StateID,
178     /// The identifier of the last start state.
179     pub(crate) max_start: StateID,
180 }
181 
182 impl Special {
183     /// Creates a new set of special ranges for a DFA. All ranges are initially
184     /// set to only contain the dead state. This is interpreted as an empty
185     /// range.
186     #[cfg(feature = "dfa-build")]
new() -> Special187     pub(crate) fn new() -> Special {
188         Special {
189             max: DEAD,
190             quit_id: DEAD,
191             min_match: DEAD,
192             max_match: DEAD,
193             min_accel: DEAD,
194             max_accel: DEAD,
195             min_start: DEAD,
196             max_start: DEAD,
197         }
198     }
199 
200     /// Remaps all of the special state identifiers using the function given.
201     #[cfg(feature = "dfa-build")]
remap(&self, map: impl Fn(StateID) -> StateID) -> Special202     pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
203         Special {
204             max: map(self.max),
205             quit_id: map(self.quit_id),
206             min_match: map(self.min_match),
207             max_match: map(self.max_match),
208             min_accel: map(self.min_accel),
209             max_accel: map(self.max_accel),
210             min_start: map(self.min_start),
211             max_start: map(self.max_start),
212         }
213     }
214 
215     /// Deserialize the given bytes into special state ranges. If the slice
216     /// given is not big enough, then this returns an error. Similarly, if
217     /// any of the expected invariants around special state ranges aren't
218     /// upheld, an error is returned. Note that this does not guarantee that
219     /// the information returned is correct.
220     ///
221     /// Upon success, this returns the number of bytes read in addition to the
222     /// special state IDs themselves.
from_bytes( mut slice: &[u8], ) -> Result<(Special, usize), DeserializeError>223     pub(crate) fn from_bytes(
224         mut slice: &[u8],
225     ) -> Result<(Special, usize), DeserializeError> {
226         wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
227 
228         let mut nread = 0;
229         let mut read_id = |what| -> Result<StateID, DeserializeError> {
230             let (id, nr) = wire::try_read_state_id(slice, what)?;
231             nread += nr;
232             slice = &slice[StateID::SIZE..];
233             Ok(id)
234         };
235 
236         let max = read_id("special max id")?;
237         let quit_id = read_id("special quit id")?;
238         let min_match = read_id("special min match id")?;
239         let max_match = read_id("special max match id")?;
240         let min_accel = read_id("special min accel id")?;
241         let max_accel = read_id("special max accel id")?;
242         let min_start = read_id("special min start id")?;
243         let max_start = read_id("special max start id")?;
244 
245         let special = Special {
246             max,
247             quit_id,
248             min_match,
249             max_match,
250             min_accel,
251             max_accel,
252             min_start,
253             max_start,
254         };
255         special.validate()?;
256         assert_eq!(nread, special.write_to_len());
257         Ok((special, nread))
258     }
259 
260     /// Validate that the information describing special states satisfies
261     /// all known invariants.
validate(&self) -> Result<(), DeserializeError>262     pub(crate) fn validate(&self) -> Result<(), DeserializeError> {
263         // Check that both ends of the range are DEAD or neither are.
264         if self.min_match == DEAD && self.max_match != DEAD {
265             err!("min_match is DEAD, but max_match is not");
266         }
267         if self.min_match != DEAD && self.max_match == DEAD {
268             err!("max_match is DEAD, but min_match is not");
269         }
270         if self.min_accel == DEAD && self.max_accel != DEAD {
271             err!("min_accel is DEAD, but max_accel is not");
272         }
273         if self.min_accel != DEAD && self.max_accel == DEAD {
274             err!("max_accel is DEAD, but min_accel is not");
275         }
276         if self.min_start == DEAD && self.max_start != DEAD {
277             err!("min_start is DEAD, but max_start is not");
278         }
279         if self.min_start != DEAD && self.max_start == DEAD {
280             err!("max_start is DEAD, but min_start is not");
281         }
282 
283         // Check that ranges are well formed.
284         if self.min_match > self.max_match {
285             err!("min_match should not be greater than max_match");
286         }
287         if self.min_accel > self.max_accel {
288             err!("min_accel should not be greater than max_accel");
289         }
290         if self.min_start > self.max_start {
291             err!("min_start should not be greater than max_start");
292         }
293 
294         // Check that ranges are ordered with respect to one another.
295         if self.matches() && self.quit_id >= self.min_match {
296             err!("quit_id should not be greater than min_match");
297         }
298         if self.accels() && self.quit_id >= self.min_accel {
299             err!("quit_id should not be greater than min_accel");
300         }
301         if self.starts() && self.quit_id >= self.min_start {
302             err!("quit_id should not be greater than min_start");
303         }
304         if self.matches() && self.accels() && self.min_accel < self.min_match {
305             err!("min_match should not be greater than min_accel");
306         }
307         if self.matches() && self.starts() && self.min_start < self.min_match {
308             err!("min_match should not be greater than min_start");
309         }
310         if self.accels() && self.starts() && self.min_start < self.min_accel {
311             err!("min_accel should not be greater than min_start");
312         }
313 
314         // Check that max is at least as big as everything else.
315         if self.max < self.quit_id {
316             err!("quit_id should not be greater than max");
317         }
318         if self.max < self.max_match {
319             err!("max_match should not be greater than max");
320         }
321         if self.max < self.max_accel {
322             err!("max_accel should not be greater than max");
323         }
324         if self.max < self.max_start {
325             err!("max_start should not be greater than max");
326         }
327 
328         Ok(())
329     }
330 
331     /// Validate that the special state information is compatible with the
332     /// given state len.
validate_state_len( &self, len: usize, stride2: usize, ) -> Result<(), DeserializeError>333     pub(crate) fn validate_state_len(
334         &self,
335         len: usize,
336         stride2: usize,
337     ) -> Result<(), DeserializeError> {
338         // We assume that 'validate' has already passed, so we know that 'max'
339         // is truly the max. So all we need to check is that the max state ID
340         // is less than the state ID len. The max legal value here is len-1,
341         // which occurs when there are no non-special states.
342         if (self.max.as_usize() >> stride2) >= len {
343             err!("max should not be greater than or equal to state length");
344         }
345         Ok(())
346     }
347 
348     /// Write the IDs and ranges for special states to the given byte buffer.
349     /// The buffer given must have enough room to store all data, otherwise
350     /// this will return an error. The number of bytes written is returned
351     /// on success. The number of bytes written is guaranteed to be a multiple
352     /// of 8.
write_to<E: Endian>( &self, dst: &mut [u8], ) -> Result<usize, SerializeError>353     pub(crate) fn write_to<E: Endian>(
354         &self,
355         dst: &mut [u8],
356     ) -> Result<usize, SerializeError> {
357         use crate::util::wire::write_state_id as write;
358 
359         if dst.len() < self.write_to_len() {
360             return Err(SerializeError::buffer_too_small("special state ids"));
361         }
362 
363         let mut nwrite = 0;
364         nwrite += write::<E>(self.max, &mut dst[nwrite..]);
365         nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
366         nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
367         nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
368         nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
369         nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
370         nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
371         nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);
372 
373         assert_eq!(
374             self.write_to_len(),
375             nwrite,
376             "expected to write certain number of bytes",
377         );
378         assert_eq!(
379             nwrite % 8,
380             0,
381             "expected to write multiple of 8 bytes for special states",
382         );
383         Ok(nwrite)
384     }
385 
386     /// Returns the total number of bytes written by `write_to`.
write_to_len(&self) -> usize387     pub(crate) fn write_to_len(&self) -> usize {
388         8 * StateID::SIZE
389     }
390 
391     /// Sets the maximum special state ID based on the current values. This
392     /// should be used once all possible state IDs are set.
393     #[cfg(feature = "dfa-build")]
set_max(&mut self)394     pub(crate) fn set_max(&mut self) {
395         use core::cmp::max;
396         self.max = max(
397             self.quit_id,
398             max(self.max_match, max(self.max_accel, self.max_start)),
399         );
400     }
401 
402     /// Sets the maximum special state ID such that starting states are not
403     /// considered "special." This also marks the min/max starting states as
404     /// DEAD such that 'is_start_state' always returns false, even if the state
405     /// is actually a starting state.
406     ///
407     /// This is useful when there is no prefilter set. It will avoid
408     /// ping-ponging between the hot path in the DFA search code and the start
409     /// state handling code, which is typically only useful for executing a
410     /// prefilter.
411     #[cfg(feature = "dfa-build")]
set_no_special_start_states(&mut self)412     pub(crate) fn set_no_special_start_states(&mut self) {
413         use core::cmp::max;
414         self.max = max(self.quit_id, max(self.max_match, self.max_accel));
415         self.min_start = DEAD;
416         self.max_start = DEAD;
417     }
418 
419     /// Returns true if and only if the given state ID is a special state.
420     #[inline]
is_special_state(&self, id: StateID) -> bool421     pub(crate) fn is_special_state(&self, id: StateID) -> bool {
422         id <= self.max
423     }
424 
425     /// Returns true if and only if the given state ID is a dead state.
426     #[inline]
is_dead_state(&self, id: StateID) -> bool427     pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
428         id == DEAD
429     }
430 
431     /// Returns true if and only if the given state ID is a quit state.
432     #[inline]
is_quit_state(&self, id: StateID) -> bool433     pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
434         !self.is_dead_state(id) && self.quit_id == id
435     }
436 
437     /// Returns true if and only if the given state ID is a match state.
438     #[inline]
is_match_state(&self, id: StateID) -> bool439     pub(crate) fn is_match_state(&self, id: StateID) -> bool {
440         !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
441     }
442 
443     /// Returns true if and only if the given state ID is an accel state.
444     #[inline]
is_accel_state(&self, id: StateID) -> bool445     pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
446         !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
447     }
448 
449     /// Returns true if and only if the given state ID is a start state.
450     #[inline]
is_start_state(&self, id: StateID) -> bool451     pub(crate) fn is_start_state(&self, id: StateID) -> bool {
452         !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
453     }
454 
455     /// Returns the total number of match states for a dense table based DFA.
456     #[inline]
match_len(&self, stride: usize) -> usize457     pub(crate) fn match_len(&self, stride: usize) -> usize {
458         if self.matches() {
459             (self.max_match.as_usize() - self.min_match.as_usize() + stride)
460                 / stride
461         } else {
462             0
463         }
464     }
465 
466     /// Returns true if and only if there is at least one match state.
467     #[inline]
matches(&self) -> bool468     pub(crate) fn matches(&self) -> bool {
469         self.min_match != DEAD
470     }
471 
472     /// Returns the total number of accel states.
473     #[cfg(feature = "dfa-build")]
accel_len(&self, stride: usize) -> usize474     pub(crate) fn accel_len(&self, stride: usize) -> usize {
475         if self.accels() {
476             (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
477                 / stride
478         } else {
479             0
480         }
481     }
482 
483     /// Returns true if and only if there is at least one accel state.
484     #[inline]
accels(&self) -> bool485     pub(crate) fn accels(&self) -> bool {
486         self.min_accel != DEAD
487     }
488 
489     /// Returns true if and only if there is at least one start state.
490     #[inline]
starts(&self) -> bool491     pub(crate) fn starts(&self) -> bool {
492         self.min_start != DEAD
493     }
494 }
495