1 use crate::{ 2 dfa::DEAD, 3 util::{ 4 primitives::StateID, 5 wire::{self, DeserializeError, Endian, SerializeError}, 6 }, 7 }; 8 9 macro_rules! err { 10 ($msg:expr) => { 11 return Err(DeserializeError::generic($msg)); 12 }; 13 } 14 15 // Special represents the identifiers in a DFA that correspond to "special" 16 // states. If a state is one or more of the following, then it is considered 17 // special: 18 // 19 // * dead - A non-matching state where all outgoing transitions lead back to 20 // itself. There is only one of these, regardless of whether minimization 21 // has run. The dead state always has an ID of 0. i.e., It is always the 22 // first state in a DFA. 23 // * quit - A state that is entered whenever a byte is seen that should cause 24 // a DFA to give up and stop searching. This results in a MatchError::quit 25 // error being returned at search time. The default configuration for a DFA 26 // has no quit bytes, which means this state is unreachable by default, 27 // although it is always present for reasons of implementation simplicity. 28 // This state is only reachable when the caller configures the DFA to quit 29 // on certain bytes. There is always exactly one of these states and it 30 // is always the second state. (Its actual ID depends on the size of the 31 // alphabet in dense DFAs, since state IDs are premultiplied in order to 32 // allow them to be used directly as indices into the transition table.) 33 // * match - An accepting state, i.e., indicative of a match. There may be 34 // zero or more of these states. 35 // * accelerated - A state where all of its outgoing transitions, except a 36 // few, loop back to itself. These states are candidates for acceleration 37 // via memchr during search. There may be zero or more of these states. 38 // * start - A non-matching state that indicates where the automaton should 39 // start during a search. There is always at least one starting state and 40 // all are guaranteed to be non-match states. (A start state cannot be a 41 // match state because the DFAs in this crate delay all matches by one byte. 42 // So every search that finds a match must move through one transition to 43 // some other match state, even when searching an empty string.) 44 // 45 // These are not mutually exclusive categories. Namely, the following 46 // overlappings can occur: 47 // 48 // * {dead, start} - If a DFA can never lead to a match and it is minimized, 49 // then it will typically compile to something where all starting IDs point 50 // to the DFA's dead state. 51 // * {match, accelerated} - It is possible for a match state to have the 52 // majority of its transitions loop back to itself, which means it's 53 // possible for a match state to be accelerated. 54 // * {start, accelerated} - Similarly, it is possible for a start state to be 55 // accelerated. Note that it is possible for an accelerated state to be 56 // neither a match or a start state. Also note that just because both match 57 // and start states overlap with accelerated states does not mean that 58 // match and start states overlap with each other. In fact, they are 59 // guaranteed not to overlap. 60 // 61 // As a special mention, every DFA always has a dead and a quit state, even 62 // though from the perspective of the DFA, they are equivalent. (Indeed, 63 // minimization special cases them to ensure they don't get merged.) The 64 // purpose of keeping them distinct is to use the quit state as a sentinel to 65 // distguish between whether a search finished successfully without finding 66 // anything or whether it gave up before finishing. 67 // 68 // So the main problem we want to solve here is the *fast* detection of whether 69 // a state is special or not. And we also want to do this while storing as 70 // little extra data as possible. AND we want to be able to quickly determine 71 // which categories a state falls into above if it is special. 72 // 73 // We achieve this by essentially shuffling all special states to the beginning 74 // of a DFA. That is, all special states appear before every other non-special 75 // state. By representing special states this way, we can determine whether a 76 // state is special or not by a single comparison, where special.max is the 77 // identifier of the last special state in the DFA: 78 // 79 // if current_state <= special.max: 80 // ... do something with special state 81 // 82 // The only thing left to do is to determine what kind of special state 83 // it is. Because what we do next depends on that. Since special states 84 // are typically rare, we can afford to do a bit more extra work, but we'd 85 // still like this to be as fast as possible. The trick we employ here is to 86 // continue shuffling states even within the special state range. Such that 87 // one contiguous region corresponds to match states, another for start states 88 // and then an overlapping range for accelerated states. At a high level, our 89 // special state detection might look like this (for leftmost searching, where 90 // we continue searching even after seeing a match): 91 // 92 // byte = input[offset] 93 // current_state = next_state(current_state, byte) 94 // offset += 1 95 // if current_state <= special.max: 96 // if current_state == 0: 97 // # We can never leave a dead state, so this always marks the 98 // # end of our search. 99 // return last_match 100 // if current_state == special.quit_id: 101 // # A quit state means we give up. If he DFA has no quit state, 102 // # then special.quit_id == 0 == dead, which is handled by the 103 // # conditional above. 104 // return Err(MatchError::quit { byte, offset: offset - 1 }) 105 // if special.min_match <= current_state <= special.max_match: 106 // last_match = Some(offset) 107 // if special.min_accel <= current_state <= special.max_accel: 108 // offset = accelerate(input, offset) 109 // last_match = Some(offset) 110 // elif special.min_start <= current_state <= special.max_start: 111 // offset = prefilter.find(input, offset) 112 // if special.min_accel <= current_state <= special.max_accel: 113 // offset = accelerate(input, offset) 114 // elif special.min_accel <= current_state <= special.max_accel: 115 // offset = accelerate(input, offset) 116 // 117 // There are some small details left out of the logic above. For example, 118 // in order to accelerate a state, we need to know which bytes to search for. 119 // This in turn implies some extra data we need to store in the DFA. To keep 120 // things compact, we would ideally only store 121 // 122 // N = special.max_accel - special.min_accel + 1 123 // 124 // items. But state IDs are premultiplied, which means they are not contiguous. 125 // So in order to take a state ID and index an array of accelerated structures, 126 // we need to do: 127 // 128 // i = (state_id - special.min_accel) / stride 129 // 130 // (N.B. 'stride' is always a power of 2, so the above can be implemented via 131 // '(state_id - special.min_accel) >> stride2', where 'stride2' is x in 132 // 2^x=stride.) 133 // 134 // Moreover, some of these specialty categories may be empty. For example, 135 // DFAs are not required to have any match states or any accelerated states. 136 // In that case, the lower and upper bounds are both set to 0 (the dead state 137 // ID) and the first `current_state == 0` check subsumes cases where the 138 // ranges are empty. 139 // 140 // Loop unrolling, if applicable, has also been left out of the logic above. 141 // 142 // Graphically, the ranges look like this, where asterisks indicate ranges 143 // that can be empty. Each 'x' is a state. 144 // 145 // quit 146 // dead| 147 // || 148 // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 149 // | | | | start | | 150 // | |-------------| |-------| | 151 // | match* | | | | 152 // | | | | | 153 // | |----------| | | 154 // | accel* | | 155 // | | | 156 // | | | 157 // |----------------------------|------------------------ 158 // special non-special* 159 #[derive(Clone, Copy, Debug)] 160 pub(crate) struct Special { 161 /// The identifier of the last special state in a DFA. A state is special 162 /// if and only if its identifier is less than or equal to `max`. 163 pub(crate) max: StateID, 164 /// The identifier of the quit state in a DFA. (There is no analogous field 165 /// for the dead state since the dead state's ID is always zero, regardless 166 /// of state ID size.) 167 pub(crate) quit_id: StateID, 168 /// The identifier of the first match state. 169 pub(crate) min_match: StateID, 170 /// The identifier of the last match state. 171 pub(crate) max_match: StateID, 172 /// The identifier of the first accelerated state. 173 pub(crate) min_accel: StateID, 174 /// The identifier of the last accelerated state. 175 pub(crate) max_accel: StateID, 176 /// The identifier of the first start state. 177 pub(crate) min_start: StateID, 178 /// The identifier of the last start state. 179 pub(crate) max_start: StateID, 180 } 181 182 impl Special { 183 /// Creates a new set of special ranges for a DFA. All ranges are initially 184 /// set to only contain the dead state. This is interpreted as an empty 185 /// range. 186 #[cfg(feature = "dfa-build")] new() -> Special187 pub(crate) fn new() -> Special { 188 Special { 189 max: DEAD, 190 quit_id: DEAD, 191 min_match: DEAD, 192 max_match: DEAD, 193 min_accel: DEAD, 194 max_accel: DEAD, 195 min_start: DEAD, 196 max_start: DEAD, 197 } 198 } 199 200 /// Remaps all of the special state identifiers using the function given. 201 #[cfg(feature = "dfa-build")] remap(&self, map: impl Fn(StateID) -> StateID) -> Special202 pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special { 203 Special { 204 max: map(self.max), 205 quit_id: map(self.quit_id), 206 min_match: map(self.min_match), 207 max_match: map(self.max_match), 208 min_accel: map(self.min_accel), 209 max_accel: map(self.max_accel), 210 min_start: map(self.min_start), 211 max_start: map(self.max_start), 212 } 213 } 214 215 /// Deserialize the given bytes into special state ranges. If the slice 216 /// given is not big enough, then this returns an error. Similarly, if 217 /// any of the expected invariants around special state ranges aren't 218 /// upheld, an error is returned. Note that this does not guarantee that 219 /// the information returned is correct. 220 /// 221 /// Upon success, this returns the number of bytes read in addition to the 222 /// special state IDs themselves. from_bytes( mut slice: &[u8], ) -> Result<(Special, usize), DeserializeError>223 pub(crate) fn from_bytes( 224 mut slice: &[u8], 225 ) -> Result<(Special, usize), DeserializeError> { 226 wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?; 227 228 let mut nread = 0; 229 let mut read_id = |what| -> Result<StateID, DeserializeError> { 230 let (id, nr) = wire::try_read_state_id(slice, what)?; 231 nread += nr; 232 slice = &slice[StateID::SIZE..]; 233 Ok(id) 234 }; 235 236 let max = read_id("special max id")?; 237 let quit_id = read_id("special quit id")?; 238 let min_match = read_id("special min match id")?; 239 let max_match = read_id("special max match id")?; 240 let min_accel = read_id("special min accel id")?; 241 let max_accel = read_id("special max accel id")?; 242 let min_start = read_id("special min start id")?; 243 let max_start = read_id("special max start id")?; 244 245 let special = Special { 246 max, 247 quit_id, 248 min_match, 249 max_match, 250 min_accel, 251 max_accel, 252 min_start, 253 max_start, 254 }; 255 special.validate()?; 256 assert_eq!(nread, special.write_to_len()); 257 Ok((special, nread)) 258 } 259 260 /// Validate that the information describing special states satisfies 261 /// all known invariants. validate(&self) -> Result<(), DeserializeError>262 pub(crate) fn validate(&self) -> Result<(), DeserializeError> { 263 // Check that both ends of the range are DEAD or neither are. 264 if self.min_match == DEAD && self.max_match != DEAD { 265 err!("min_match is DEAD, but max_match is not"); 266 } 267 if self.min_match != DEAD && self.max_match == DEAD { 268 err!("max_match is DEAD, but min_match is not"); 269 } 270 if self.min_accel == DEAD && self.max_accel != DEAD { 271 err!("min_accel is DEAD, but max_accel is not"); 272 } 273 if self.min_accel != DEAD && self.max_accel == DEAD { 274 err!("max_accel is DEAD, but min_accel is not"); 275 } 276 if self.min_start == DEAD && self.max_start != DEAD { 277 err!("min_start is DEAD, but max_start is not"); 278 } 279 if self.min_start != DEAD && self.max_start == DEAD { 280 err!("max_start is DEAD, but min_start is not"); 281 } 282 283 // Check that ranges are well formed. 284 if self.min_match > self.max_match { 285 err!("min_match should not be greater than max_match"); 286 } 287 if self.min_accel > self.max_accel { 288 err!("min_accel should not be greater than max_accel"); 289 } 290 if self.min_start > self.max_start { 291 err!("min_start should not be greater than max_start"); 292 } 293 294 // Check that ranges are ordered with respect to one another. 295 if self.matches() && self.quit_id >= self.min_match { 296 err!("quit_id should not be greater than min_match"); 297 } 298 if self.accels() && self.quit_id >= self.min_accel { 299 err!("quit_id should not be greater than min_accel"); 300 } 301 if self.starts() && self.quit_id >= self.min_start { 302 err!("quit_id should not be greater than min_start"); 303 } 304 if self.matches() && self.accels() && self.min_accel < self.min_match { 305 err!("min_match should not be greater than min_accel"); 306 } 307 if self.matches() && self.starts() && self.min_start < self.min_match { 308 err!("min_match should not be greater than min_start"); 309 } 310 if self.accels() && self.starts() && self.min_start < self.min_accel { 311 err!("min_accel should not be greater than min_start"); 312 } 313 314 // Check that max is at least as big as everything else. 315 if self.max < self.quit_id { 316 err!("quit_id should not be greater than max"); 317 } 318 if self.max < self.max_match { 319 err!("max_match should not be greater than max"); 320 } 321 if self.max < self.max_accel { 322 err!("max_accel should not be greater than max"); 323 } 324 if self.max < self.max_start { 325 err!("max_start should not be greater than max"); 326 } 327 328 Ok(()) 329 } 330 331 /// Validate that the special state information is compatible with the 332 /// given state len. validate_state_len( &self, len: usize, stride2: usize, ) -> Result<(), DeserializeError>333 pub(crate) fn validate_state_len( 334 &self, 335 len: usize, 336 stride2: usize, 337 ) -> Result<(), DeserializeError> { 338 // We assume that 'validate' has already passed, so we know that 'max' 339 // is truly the max. So all we need to check is that the max state ID 340 // is less than the state ID len. The max legal value here is len-1, 341 // which occurs when there are no non-special states. 342 if (self.max.as_usize() >> stride2) >= len { 343 err!("max should not be greater than or equal to state length"); 344 } 345 Ok(()) 346 } 347 348 /// Write the IDs and ranges for special states to the given byte buffer. 349 /// The buffer given must have enough room to store all data, otherwise 350 /// this will return an error. The number of bytes written is returned 351 /// on success. The number of bytes written is guaranteed to be a multiple 352 /// of 8. write_to<E: Endian>( &self, dst: &mut [u8], ) -> Result<usize, SerializeError>353 pub(crate) fn write_to<E: Endian>( 354 &self, 355 dst: &mut [u8], 356 ) -> Result<usize, SerializeError> { 357 use crate::util::wire::write_state_id as write; 358 359 if dst.len() < self.write_to_len() { 360 return Err(SerializeError::buffer_too_small("special state ids")); 361 } 362 363 let mut nwrite = 0; 364 nwrite += write::<E>(self.max, &mut dst[nwrite..]); 365 nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]); 366 nwrite += write::<E>(self.min_match, &mut dst[nwrite..]); 367 nwrite += write::<E>(self.max_match, &mut dst[nwrite..]); 368 nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]); 369 nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]); 370 nwrite += write::<E>(self.min_start, &mut dst[nwrite..]); 371 nwrite += write::<E>(self.max_start, &mut dst[nwrite..]); 372 373 assert_eq!( 374 self.write_to_len(), 375 nwrite, 376 "expected to write certain number of bytes", 377 ); 378 assert_eq!( 379 nwrite % 8, 380 0, 381 "expected to write multiple of 8 bytes for special states", 382 ); 383 Ok(nwrite) 384 } 385 386 /// Returns the total number of bytes written by `write_to`. write_to_len(&self) -> usize387 pub(crate) fn write_to_len(&self) -> usize { 388 8 * StateID::SIZE 389 } 390 391 /// Sets the maximum special state ID based on the current values. This 392 /// should be used once all possible state IDs are set. 393 #[cfg(feature = "dfa-build")] set_max(&mut self)394 pub(crate) fn set_max(&mut self) { 395 use core::cmp::max; 396 self.max = max( 397 self.quit_id, 398 max(self.max_match, max(self.max_accel, self.max_start)), 399 ); 400 } 401 402 /// Sets the maximum special state ID such that starting states are not 403 /// considered "special." This also marks the min/max starting states as 404 /// DEAD such that 'is_start_state' always returns false, even if the state 405 /// is actually a starting state. 406 /// 407 /// This is useful when there is no prefilter set. It will avoid 408 /// ping-ponging between the hot path in the DFA search code and the start 409 /// state handling code, which is typically only useful for executing a 410 /// prefilter. 411 #[cfg(feature = "dfa-build")] set_no_special_start_states(&mut self)412 pub(crate) fn set_no_special_start_states(&mut self) { 413 use core::cmp::max; 414 self.max = max(self.quit_id, max(self.max_match, self.max_accel)); 415 self.min_start = DEAD; 416 self.max_start = DEAD; 417 } 418 419 /// Returns true if and only if the given state ID is a special state. 420 #[inline] is_special_state(&self, id: StateID) -> bool421 pub(crate) fn is_special_state(&self, id: StateID) -> bool { 422 id <= self.max 423 } 424 425 /// Returns true if and only if the given state ID is a dead state. 426 #[inline] is_dead_state(&self, id: StateID) -> bool427 pub(crate) fn is_dead_state(&self, id: StateID) -> bool { 428 id == DEAD 429 } 430 431 /// Returns true if and only if the given state ID is a quit state. 432 #[inline] is_quit_state(&self, id: StateID) -> bool433 pub(crate) fn is_quit_state(&self, id: StateID) -> bool { 434 !self.is_dead_state(id) && self.quit_id == id 435 } 436 437 /// Returns true if and only if the given state ID is a match state. 438 #[inline] is_match_state(&self, id: StateID) -> bool439 pub(crate) fn is_match_state(&self, id: StateID) -> bool { 440 !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match 441 } 442 443 /// Returns true if and only if the given state ID is an accel state. 444 #[inline] is_accel_state(&self, id: StateID) -> bool445 pub(crate) fn is_accel_state(&self, id: StateID) -> bool { 446 !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel 447 } 448 449 /// Returns true if and only if the given state ID is a start state. 450 #[inline] is_start_state(&self, id: StateID) -> bool451 pub(crate) fn is_start_state(&self, id: StateID) -> bool { 452 !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start 453 } 454 455 /// Returns the total number of match states for a dense table based DFA. 456 #[inline] match_len(&self, stride: usize) -> usize457 pub(crate) fn match_len(&self, stride: usize) -> usize { 458 if self.matches() { 459 (self.max_match.as_usize() - self.min_match.as_usize() + stride) 460 / stride 461 } else { 462 0 463 } 464 } 465 466 /// Returns true if and only if there is at least one match state. 467 #[inline] matches(&self) -> bool468 pub(crate) fn matches(&self) -> bool { 469 self.min_match != DEAD 470 } 471 472 /// Returns the total number of accel states. 473 #[cfg(feature = "dfa-build")] accel_len(&self, stride: usize) -> usize474 pub(crate) fn accel_len(&self, stride: usize) -> usize { 475 if self.accels() { 476 (self.max_accel.as_usize() - self.min_accel.as_usize() + stride) 477 / stride 478 } else { 479 0 480 } 481 } 482 483 /// Returns true if and only if there is at least one accel state. 484 #[inline] accels(&self) -> bool485 pub(crate) fn accels(&self) -> bool { 486 self.min_accel != DEAD 487 } 488 489 /// Returns true if and only if there is at least one start state. 490 #[inline] starts(&self) -> bool491 pub(crate) fn starts(&self) -> bool { 492 self.min_start != DEAD 493 } 494 } 495