1 // This module defines some core types for dealing with accelerated DFA states.
2 // Briefly, a DFA state can be "accelerated" if all of its transitions except
3 // for a few loop back to itself. This directly implies that the only way out
4 // of such a state is if a byte corresponding to one of those non-loopback
5 // transitions is found. Such states are often found in simple repetitions in
6 // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
7 // DFA with regex-cli:
8 //
9 //     $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
10 //     D 000000:
11 //     Q 000001:
12 //      *000002:
13 //     A 000003: \x00-` => 3, a => 8, b-\xFF => 3
14 //     A 000004: \x00-` => 4, a => 7, b-\xFF => 4
15 //       000005: \x00-` => 4, b-\xFF => 4
16 //       000006: \x00-` => 3, a => 6, b-\xFF => 3
17 //       000007: \x00-\xFF => 2, EOI => 2
18 //       000008: \x00-\xFF => 2, EOI => 2
19 //
20 // In particular, state 3 is accelerated (shown via the 'A' indicator) since
21 // the only way to leave that state once entered is to see an 'a' byte. If
22 // there is a long run of non-'a' bytes, then using something like 'memchr'
23 // to find the next 'a' byte can be significantly faster than just using the
24 // standard byte-at-a-time state machine.
25 //
26 // Unfortunately, this optimization rarely applies when Unicode is enabled.
27 // For example, patterns like '[^a]' don't actually match any byte that isn't
28 // 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
29 // 'a'. This makes the state machine much more complex---far beyond a single
30 // state---and removes the ability to easily accelerate it. (Because if the
31 // machine sees a non-UTF-8 sequence, then the machine won't match through it.)
32 //
33 // In practice, we only consider accelerating states that have 3 or fewer
34 // non-loop transitions. At a certain point, you get diminishing returns, but
35 // also because that's what the memchr crate supports. The structures below
36 // hard-code this assumption and provide (de)serialization APIs for use inside
37 // a DFA.
38 //
39 // And finally, note that there is some trickery involved in making it very
40 // fast to not only check whether a state is accelerated at search time, but
41 // also to access the bytes to search for to implement the acceleration itself.
42 // dfa/special.rs provides more detail, but the short story is that all
43 // accelerated states appear contiguously in a DFA. This means we can represent
44 // the ID space of all accelerated DFA states with a single range. So given
45 // a state ID, we can determine whether it's accelerated via
46 //
47 //     min_accel_id <= id <= max_accel_id
48 //
49 // And find its corresponding accelerator with:
50 //
51 //     accels.get((id - min_accel_id) / dfa_stride)
52 
53 #[cfg(feature = "dfa-build")]
54 use alloc::{vec, vec::Vec};
55 
56 use crate::util::{
57     int::Pointer,
58     memchr,
59     wire::{self, DeserializeError, Endian, SerializeError},
60 };
61 
62 /// The base type used to represent a collection of accelerators.
63 ///
64 /// While an `Accel` is represented as a fixed size array of bytes, a
65 /// *collection* of `Accel`s (called `Accels`) is represented internally as a
66 /// slice of u32. While it's a bit unnatural to do this and costs us a bit of
67 /// fairly low-risk not-safe code, it lets us remove the need for a second type
68 /// parameter in the definition of dense::DFA. (Which really wants everything
69 /// to be a slice of u32.)
70 type AccelTy = u32;
71 
72 /// The size of the unit of representation for accelerators.
73 ///
74 /// ACCEL_CAP *must* be a multiple of this size.
75 const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
76 
77 /// The maximum length in bytes that a single Accel can be. This is distinct
78 /// from the capacity of an accelerator in that the length represents only the
79 /// bytes that should be read.
80 const ACCEL_LEN: usize = 4;
81 
82 /// The capacity of each accelerator, in bytes. We set this to 8 since it's a
83 /// multiple of 4 (our ID size) and because it gives us a little wiggle room
84 /// if we want to support more accel bytes in the future without a breaking
85 /// change.
86 ///
87 /// This MUST be a multiple of ACCEL_TY_SIZE.
88 const ACCEL_CAP: usize = 8;
89 
90 /// Search for between 1 and 3 needle bytes in the given haystack, starting the
91 /// search at the given position. If `needles` has a length other than 1-3,
92 /// then this panics.
93 #[cfg_attr(feature = "perf-inline", inline(always))]
find_fwd( needles: &[u8], haystack: &[u8], at: usize, ) -> Option<usize>94 pub(crate) fn find_fwd(
95     needles: &[u8],
96     haystack: &[u8],
97     at: usize,
98 ) -> Option<usize> {
99     let bs = needles;
100     let i = match needles.len() {
101         1 => memchr::memchr(bs[0], &haystack[at..])?,
102         2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
103         3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
104         0 => panic!("cannot find with empty needles"),
105         n => panic!("invalid needles length: {}", n),
106     };
107     Some(at + i)
108 }
109 
110 /// Search for between 1 and 3 needle bytes in the given haystack in reverse,
111 /// starting the search at the given position. If `needles` has a length other
112 /// than 1-3, then this panics.
113 #[cfg_attr(feature = "perf-inline", inline(always))]
find_rev( needles: &[u8], haystack: &[u8], at: usize, ) -> Option<usize>114 pub(crate) fn find_rev(
115     needles: &[u8],
116     haystack: &[u8],
117     at: usize,
118 ) -> Option<usize> {
119     let bs = needles;
120     match needles.len() {
121         1 => memchr::memrchr(bs[0], &haystack[..at]),
122         2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
123         3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
124         0 => panic!("cannot find with empty needles"),
125         n => panic!("invalid needles length: {}", n),
126     }
127 }
128 
129 /// Represents the accelerators for all accelerated states in a dense DFA.
130 ///
131 /// The `A` type parameter represents the type of the underlying bytes.
132 /// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
133 #[derive(Clone)]
134 pub(crate) struct Accels<A> {
135     /// A length prefixed slice of contiguous accelerators. See the top comment
136     /// in this module for more details on how we can jump from a DFA's state
137     /// ID to an accelerator in this list.
138     ///
139     /// The first 4 bytes always correspond to the number of accelerators
140     /// that follow.
141     accels: A,
142 }
143 
144 #[cfg(feature = "dfa-build")]
145 impl Accels<Vec<AccelTy>> {
146     /// Create an empty sequence of accelerators for a DFA.
empty() -> Accels<Vec<AccelTy>>147     pub fn empty() -> Accels<Vec<AccelTy>> {
148         Accels { accels: vec![0] }
149     }
150 
151     /// Add an accelerator to this sequence.
152     ///
153     /// This adds to the accelerator to the end of the sequence and therefore
154     /// should be done in correspondence with its state in the DFA.
155     ///
156     /// This panics if this results in more accelerators than AccelTy::MAX.
add(&mut self, accel: Accel)157     pub fn add(&mut self, accel: Accel) {
158         self.accels.extend_from_slice(&accel.as_accel_tys());
159         let len = self.len();
160         self.set_len(len + 1);
161     }
162 
163     /// Set the number of accelerators in this sequence, which is encoded in
164     /// the first 4 bytes of the underlying bytes.
set_len(&mut self, new_len: usize)165     fn set_len(&mut self, new_len: usize) {
166         // The only way an accelerator gets added is if a state exists for
167         // it, and if a state exists, then its index is guaranteed to be
168         // representable by a AccelTy by virtue of the guarantees provided by
169         // StateID.
170         let new_len = AccelTy::try_from(new_len).unwrap();
171         self.accels[0] = new_len;
172     }
173 }
174 
175 impl<'a> Accels<&'a [AccelTy]> {
176     /// Deserialize a sequence of accelerators from the given bytes. If there
177     /// was a problem deserializing, then an error is returned.
178     ///
179     /// This is guaranteed to run in constant time. This does not guarantee
180     /// that every accelerator in the returned collection is valid. Thus,
181     /// accessing one may panic, or not-safe code that relies on accelerators
182     /// being correct my result in UB.
183     ///
184     /// Callers may check the validity of every accelerator with the `validate`
185     /// method.
from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError>186     pub fn from_bytes_unchecked(
187         mut slice: &'a [u8],
188     ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
189         let slice_start = slice.as_ptr().as_usize();
190 
191         let (accel_len, _) =
192             wire::try_read_u32_as_usize(slice, "accelerators length")?;
193         // The accelerator length is part of the accel_tys slice that
194         // we deserialize. This is perhaps a bit idiosyncratic. It would
195         // probably be better to split out the length into a real field.
196 
197         let accel_tys_len = wire::add(
198             wire::mul(accel_len, 2, "total number of accelerator accel_tys")?,
199             1,
200             "total number of accel_tys",
201         )?;
202         let accel_tys_bytes_len = wire::mul(
203             ACCEL_TY_SIZE,
204             accel_tys_len,
205             "total number of bytes in accelerators",
206         )?;
207         wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
208         wire::check_alignment::<AccelTy>(slice)?;
209         let accel_tys = &slice[..accel_tys_bytes_len];
210         slice = &slice[accel_tys_bytes_len..];
211         // SAFETY: We've checked the length and alignment above, and since
212         // slice is just bytes and AccelTy is just a u32, we can safely cast to
213         // a slice of &[AccelTy].
214         let accels = unsafe {
215             core::slice::from_raw_parts(
216                 accel_tys.as_ptr().cast::<AccelTy>(),
217                 accel_tys_len,
218             )
219         };
220         Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
221     }
222 }
223 
224 impl<A: AsRef<[AccelTy]>> Accels<A> {
225     /// Return an owned version of the accelerators.
226     #[cfg(feature = "alloc")]
to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>>227     pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
228         Accels { accels: self.accels.as_ref().to_vec() }
229     }
230 
231     /// Return a borrowed version of the accelerators.
as_ref(&self) -> Accels<&[AccelTy]>232     pub fn as_ref(&self) -> Accels<&[AccelTy]> {
233         Accels { accels: self.accels.as_ref() }
234     }
235 
236     /// Return the bytes representing the serialization of the accelerators.
as_bytes(&self) -> &[u8]237     pub fn as_bytes(&self) -> &[u8] {
238         let accels = self.accels.as_ref();
239         // SAFETY: This is safe because accels is a just a slice of AccelTy,
240         // and u8 always has a smaller alignment.
241         unsafe {
242             core::slice::from_raw_parts(
243                 accels.as_ptr().cast::<u8>(),
244                 accels.len() * ACCEL_TY_SIZE,
245             )
246         }
247     }
248 
249     /// Returns the memory usage, in bytes, of these accelerators.
250     ///
251     /// The memory usage is computed based on the number of bytes used to
252     /// represent all of the accelerators.
253     ///
254     /// This does **not** include the stack size used by this value.
memory_usage(&self) -> usize255     pub fn memory_usage(&self) -> usize {
256         self.as_bytes().len()
257     }
258 
259     /// Return the bytes to search for corresponding to the accelerator in this
260     /// sequence at index `i`. If no such accelerator exists, then this panics.
261     ///
262     /// The significance of the index is that it should be in correspondence
263     /// with the index of the corresponding DFA. That is, accelerated DFA
264     /// states are stored contiguously in the DFA and have an ordering implied
265     /// by their respective state IDs. The state's index in that sequence
266     /// corresponds to the index of its corresponding accelerator.
267     #[cfg_attr(feature = "perf-inline", inline(always))]
needles(&self, i: usize) -> &[u8]268     pub fn needles(&self, i: usize) -> &[u8] {
269         if i >= self.len() {
270             panic!("invalid accelerator index {}", i);
271         }
272         let bytes = self.as_bytes();
273         let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
274         let len = usize::from(bytes[offset]);
275         &bytes[offset + 1..offset + 1 + len]
276     }
277 
278     /// Return the total number of accelerators in this sequence.
len(&self) -> usize279     pub fn len(&self) -> usize {
280         // This should never panic since deserialization checks that the
281         // length can fit into a usize.
282         usize::try_from(self.accels.as_ref()[0]).unwrap()
283     }
284 
285     /// Return the accelerator in this sequence at index `i`. If no such
286     /// accelerator exists, then this returns None.
287     ///
288     /// See the docs for `needles` on the significance of the index.
get(&self, i: usize) -> Option<Accel>289     fn get(&self, i: usize) -> Option<Accel> {
290         if i >= self.len() {
291             return None;
292         }
293         let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
294         let accel = Accel::from_slice(&self.as_bytes()[offset..])
295             .expect("Accels must contain valid accelerators");
296         Some(accel)
297     }
298 
299     /// Returns an iterator of accelerators in this sequence.
iter(&self) -> IterAccels<'_, A>300     fn iter(&self) -> IterAccels<'_, A> {
301         IterAccels { accels: self, i: 0 }
302     }
303 
304     /// Writes these accelerators to the given byte buffer using the indicated
305     /// endianness. If the given buffer is too small, then an error is
306     /// returned. Upon success, the total number of bytes written is returned.
307     /// The number of bytes written is guaranteed to be a multiple of 8.
write_to<E: Endian>( &self, dst: &mut [u8], ) -> Result<usize, SerializeError>308     pub fn write_to<E: Endian>(
309         &self,
310         dst: &mut [u8],
311     ) -> Result<usize, SerializeError> {
312         let nwrite = self.write_to_len();
313         assert_eq!(
314             nwrite % ACCEL_TY_SIZE,
315             0,
316             "expected accelerator bytes written to be a multiple of {}",
317             ACCEL_TY_SIZE,
318         );
319         if dst.len() < nwrite {
320             return Err(SerializeError::buffer_too_small("accelerators"));
321         }
322 
323         // The number of accelerators can never exceed AccelTy::MAX.
324         E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
325         // The actual accelerators are just raw bytes and thus their endianness
326         // is irrelevant. So we can copy them as bytes.
327         dst[ACCEL_TY_SIZE..nwrite]
328             .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
329         Ok(nwrite)
330     }
331 
332     /// Validates that every accelerator in this collection can be successfully
333     /// deserialized as a valid accelerator.
validate(&self) -> Result<(), DeserializeError>334     pub fn validate(&self) -> Result<(), DeserializeError> {
335         for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
336             let _ = Accel::from_slice(chunk)?;
337         }
338         Ok(())
339     }
340 
341     /// Returns the total number of bytes written by `write_to`.
write_to_len(&self) -> usize342     pub fn write_to_len(&self) -> usize {
343         self.as_bytes().len()
344     }
345 }
346 
347 impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result348     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
349         write!(f, "Accels(")?;
350         let mut list = f.debug_list();
351         for a in self.iter() {
352             list.entry(&a);
353         }
354         list.finish()?;
355         write!(f, ")")
356     }
357 }
358 
359 #[derive(Debug)]
360 struct IterAccels<'a, A: AsRef<[AccelTy]>> {
361     accels: &'a Accels<A>,
362     i: usize,
363 }
364 
365 impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
366     type Item = Accel;
367 
next(&mut self) -> Option<Accel>368     fn next(&mut self) -> Option<Accel> {
369         let accel = self.accels.get(self.i)?;
370         self.i += 1;
371         Some(accel)
372     }
373 }
374 
375 /// Accel represents a structure for determining how to "accelerate" a DFA
376 /// state.
377 ///
378 /// Namely, it contains zero or more bytes that must be seen in order for the
379 /// DFA to leave the state it is associated with. In practice, the actual range
380 /// is 1 to 3 bytes.
381 ///
382 /// The purpose of acceleration is to identify states whose vast majority
383 /// of transitions are just loops back to the same state. For example,
384 /// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
385 /// (corresponding to `[^a]+`) where all transitions *except* for `a` and
386 /// `b` loop back to itself. Thus, this state can be "accelerated" by simply
387 /// looking for the next occurrence of either `a` or `b` instead of explicitly
388 /// following transitions. (In this case, `b` transitions to the next state
389 /// where as `a` would transition to the dead state.)
390 #[derive(Clone)]
391 pub(crate) struct Accel {
392     /// The first byte is the length. Subsequent bytes are the accelerated
393     /// bytes.
394     ///
395     /// Note that we make every accelerator 8 bytes as a slightly wasteful
396     /// way of making sure alignment is always correct for state ID sizes of
397     /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
398     /// particularly common, especially when Unicode is enabled.
399     bytes: [u8; ACCEL_CAP],
400 }
401 
402 impl Accel {
403     /// Returns an empty accel, where no bytes are accelerated.
404     #[cfg(feature = "dfa-build")]
new() -> Accel405     pub fn new() -> Accel {
406         Accel { bytes: [0; ACCEL_CAP] }
407     }
408 
409     /// Returns a verified accelerator derived from the beginning of the given
410     /// slice.
411     ///
412     /// If the slice is not long enough or contains invalid bytes for an
413     /// accelerator, then this returns an error.
from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError>414     pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
415         slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
416         let bytes = slice
417             .try_into()
418             .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
419         Accel::from_bytes(bytes)
420     }
421 
422     /// Returns a verified accelerator derived from raw bytes.
423     ///
424     /// If the given bytes are invalid, then this returns an error.
from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError>425     fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
426         if usize::from(bytes[0]) >= ACCEL_LEN {
427             return Err(DeserializeError::generic(
428                 "accelerator bytes cannot have length more than 3",
429             ));
430         }
431         Ok(Accel::from_bytes_unchecked(bytes))
432     }
433 
434     /// Returns an accelerator derived from raw bytes.
435     ///
436     /// This does not check whether the given bytes are valid. Invalid bytes
437     /// cannot sacrifice memory safety, but may result in panics or silent
438     /// logic bugs.
from_bytes_unchecked(bytes: [u8; 4]) -> Accel439     fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
440         Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
441     }
442 
443     /// Attempts to add the given byte to this accelerator. If the accelerator
444     /// is already full or thinks the byte is a poor accelerator, then this
445     /// returns false. Otherwise, returns true.
446     ///
447     /// If the given byte is already in this accelerator, then it panics.
448     #[cfg(feature = "dfa-build")]
add(&mut self, byte: u8) -> bool449     pub fn add(&mut self, byte: u8) -> bool {
450         if self.len() >= 3 {
451             return false;
452         }
453         // As a special case, we totally reject trying to accelerate a state
454         // with an ASCII space. In most cases, it occurs very frequently, and
455         // tends to result in worse overall performance.
456         if byte == b' ' {
457             return false;
458         }
459         assert!(
460             !self.contains(byte),
461             "accelerator already contains {:?}",
462             crate::util::escape::DebugByte(byte)
463         );
464         self.bytes[self.len() + 1] = byte;
465         self.bytes[0] += 1;
466         true
467     }
468 
469     /// Return the number of bytes in this accelerator.
len(&self) -> usize470     pub fn len(&self) -> usize {
471         usize::from(self.bytes[0])
472     }
473 
474     /// Returns true if and only if there are no bytes in this accelerator.
475     #[cfg(feature = "dfa-build")]
is_empty(&self) -> bool476     pub fn is_empty(&self) -> bool {
477         self.len() == 0
478     }
479 
480     /// Returns the slice of bytes to accelerate.
481     ///
482     /// If this accelerator is empty, then this returns an empty slice.
needles(&self) -> &[u8]483     fn needles(&self) -> &[u8] {
484         &self.bytes[1..1 + self.len()]
485     }
486 
487     /// Returns true if and only if this accelerator will accelerate the given
488     /// byte.
489     #[cfg(feature = "dfa-build")]
contains(&self, byte: u8) -> bool490     fn contains(&self, byte: u8) -> bool {
491         self.needles().iter().position(|&b| b == byte).is_some()
492     }
493 
494     /// Returns the accelerator bytes as an array of AccelTys.
495     #[cfg(feature = "dfa-build")]
as_accel_tys(&self) -> [AccelTy; 2]496     fn as_accel_tys(&self) -> [AccelTy; 2] {
497         assert_eq!(ACCEL_CAP, 8);
498         // These unwraps are OK since ACCEL_CAP is set to 8.
499         let first =
500             AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
501         let second =
502             AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
503         [first, second]
504     }
505 }
506 
507 impl core::fmt::Debug for Accel {
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result508     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
509         write!(f, "Accel(")?;
510         let mut set = f.debug_set();
511         for &b in self.needles() {
512             set.entry(&crate::util::escape::DebugByte(b));
513         }
514         set.finish()?;
515         write!(f, ")")
516     }
517 }
518