/*!
Types and routines specific to sparse DFAs.

This module is the home of [`sparse::DFA`](DFA).

Unlike the [`dense`] module, this module does not contain a builder or
configuration specific to sparse DFAs. Instead, the intended way to build a
sparse DFA is either by using a default configuration with its constructor
[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a
dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`].
For example, this configures a sparse DFA to do an overlapping search:

```
use regex_automata::{
    dfa::{Automaton, OverlappingState, dense},
    HalfMatch, Input, MatchKind,
};

let dense_re = dense::Builder::new()
    .configure(dense::Config::new().match_kind(MatchKind::All))
    .build(r"Samwise|Sam")?;
let sparse_re = dense_re.to_sparse()?;

// Setup our haystack and initial start state.
let input = Input::new("Samwise");
let mut state = OverlappingState::start();

// First, 'Sam' will match.
sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match());

// And now 'Samwise' will match.
sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match());
# Ok::<(), Box<dyn std::error::Error>>(())
```
*/

#[cfg(feature = "dfa-build")]
use core::iter;
use core::{fmt, mem::size_of};

#[cfg(feature = "dfa-build")]
use alloc::{vec, vec::Vec};

#[cfg(feature = "dfa-build")]
use crate::dfa::dense::{self, BuildError};
use crate::{
    dfa::{
        automaton::{fmt_state_indicator, Automaton, StartError},
        dense::Flags,
        special::Special,
        StartKind, DEAD,
    },
    util::{
        alphabet::{ByteClasses, ByteSet},
        escape::DebugByte,
        int::{Pointer, Usize, U16, U32},
        prefilter::Prefilter,
        primitives::{PatternID, StateID},
        search::Anchored,
        start::{self, Start, StartByteMap},
        wire::{self, DeserializeError, Endian, SerializeError},
    },
};

const LABEL: &str = "rust-regex-automata-dfa-sparse";
const VERSION: u32 = 2;

/// A sparse deterministic finite automaton (DFA) with variable sized states.
///
/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient
/// representation for its transitions. Consequently, sparse DFAs may use much
/// less memory than dense DFAs, but this comes at a price. In particular,
/// reading the more space efficient transitions takes more work, and
/// consequently, searching using a sparse DFA is typically slower than a
/// dense DFA.
///
/// A sparse DFA can be built using the default configuration via the
/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of
/// a dense DFA via [`dense::Builder`], and then convert a dense DFA to a
/// sparse DFA using [`dense::DFA::to_sparse`].
///
/// In general, a sparse DFA supports all the same search operations as a
/// dense DFA.
///
/// Making the choice between a dense and sparse DFA depends on your specific
/// work load. If you can sacrifice a bit of search time performance, then a
/// sparse DFA might be the best choice. In particular, while sparse DFAs are
/// probably always slower than dense DFAs, you may find that they are easily
/// fast enough for your purposes!
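///
/// For example, here is a small, illustrative comparison of the heap memory
/// used by the dense and sparse representations of the same pattern, using
/// [`dense::DFA::new`] from the dense module and the `memory_usage` routines
/// of both DFA types. The exact numbers are not guaranteed and depend on the
/// pattern and configuration:
///
/// ```
/// use regex_automata::dfa::dense;
///
/// let dense_dfa = dense::DFA::new("foo[0-9]+")?;
/// let sparse_dfa = dense_dfa.to_sparse()?;
/// // Sparse DFAs are typically, but not necessarily, the smaller of the two.
/// println!("dense: {} bytes", dense_dfa.memory_usage());
/// println!("sparse: {} bytes", sparse_dfa.memory_usage());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```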
///
/// # Type parameters
///
/// A `DFA` has one type parameter, `T`, which is used to represent the parts
/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
///
/// # The `Automaton` trait
///
/// This type implements the [`Automaton`] trait, which means it can be used
/// for searching. For example:
///
/// ```
/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone)]
pub struct DFA<T> {
    // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
    // representation-wise. In reality, it is perhaps more complicated.
    // Namely, in a dense DFA, all information needs to be very cheaply
    // accessible using only state IDs. In a sparse DFA however, each state
    // uses a variable amount of space because each state encodes more
    // information than just its transitions. Each state also includes an
    // accelerator if one exists, along with the matching pattern IDs if the
    // state is a match state.
    //
    // That is, a lot of the complexity is pushed down into how each state
    // itself is represented.
    tt: Transitions<T>,
    st: StartTable<T>,
    special: Special,
    pre: Option<Prefilter>,
    quitset: ByteSet,
    flags: Flags,
}

#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Parse the given regular expression using a default configuration and
    /// return the corresponding sparse DFA.
    ///
    /// If you want a non-default configuration, then use the
    /// [`dense::Builder`] to set your own configuration, and then call
    /// [`dense::DFA::to_sparse`] to create a sparse DFA.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
    ///
    /// let expected = Some(HalfMatch::must(0, 11));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> {
        dense::Builder::new()
            .build(pattern)
            .and_then(|dense| dense.to_sparse())
    }

    /// Parse the given regular expressions using a default configuration and
    /// return the corresponding multi-DFA.
    ///
    /// If you want a non-default configuration, then use the
    /// [`dense::Builder`] to set your own configuration, and then call
    /// [`dense::DFA::to_sparse`] to create a sparse DFA.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let expected = Some(HalfMatch::must(1, 3));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        dense::Builder::new()
            .build_many(patterns)
            .and_then(|dense| dense.to_sparse())
    }
}

#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Create a new DFA that matches every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse},
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = sparse::DFA::always_match()?;
    ///
    /// let expected = Some(HalfMatch::must(0, 0));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> {
        dense::DFA::always_match()?.to_sparse()
    }

    /// Create a new sparse DFA that never matches any input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, Input};
    ///
    /// let dfa = sparse::DFA::never_match()?;
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> {
        dense::DFA::never_match()?.to_sparse()
    }

    /// The implementation for constructing a sparse DFA from a dense DFA.
    pub(crate) fn from_dense<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        // In order to build the transition table, we need to be able to
        // write state identifiers for each of the "next" transitions in
        // each state. Our state identifiers correspond to the byte offset
        // in the transition table at which the state is encoded. Therefore,
        // we do not actually know what the state identifiers are until
        // we've allocated exactly as much space as we need for each state.
        // Thus, construction of the transition table happens in two passes.
        //
        // In the first pass, we fill out the shell of each state, which
        // includes the transition length, the input byte ranges and
        // zero-filled space for the transitions and accelerators, if
        // present. In this first pass, we also build up a map from the
        // state identifier index of the dense DFA to the state identifier
        // in this sparse DFA.
        //
        // In the second pass, we fill in the transitions based on the map
        // built in the first pass.

        // The capacity given here reflects a minimum. (Well, the true
        // minimum is likely even bigger, but hopefully this saves a few
        // reallocs.)
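        //
        // As a rough sketch (derived from the encoding steps below rather
        // than a separate format spec), each state is laid out in `sparse`
        // as:
        //
        //   * a u16 transition count, with the most significant bit set
        //     when the state is a match state,
        //   * that many inclusive byte ranges, two bytes per range
        //     (including a dummy EOI range),
        //   * that many next state IDs, `StateID::SIZE` bytes each,
        //   * for match states only, a u32 count followed by that many
        //     pattern IDs,
        //   * a one byte accelerator length followed by the accelerator
        //     bytes, if any.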
        let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len());
        // This maps state indices from the dense DFA to StateIDs in the
        // sparse DFA. We build out this map on the first pass, and then use
        // it in the second pass to back-fill our transitions.
        let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()];
        for state in dfa.states() {
            let pos = sparse.len();

            remap[dfa.to_index(state.id())] = StateID::new(pos)
                .map_err(|_| BuildError::too_many_states())?;
            // zero-filled space for the transition length
            sparse.push(0);
            sparse.push(0);

            let mut transition_len = 0;
            for (unit1, unit2, _) in state.sparse_transitions() {
                match (unit1.as_u8(), unit2.as_u8()) {
                    (Some(b1), Some(b2)) => {
                        transition_len += 1;
                        sparse.push(b1);
                        sparse.push(b2);
                    }
                    (None, None) => {}
                    (Some(_), None) | (None, Some(_)) => {
                        // can never occur because sparse_transitions never
                        // groups EOI with any other transition.
                        unreachable!()
                    }
                }
            }
            // Add dummy EOI transition. This is never actually read while
            // searching, but having space equivalent to the total number
            // of transitions is convenient. Otherwise, we'd need to track
            // a different number of transitions for the byte ranges as for
            // the 'next' states.
            //
            // N.B. The loop above is not guaranteed to yield the EOI
            // transition, since it may point to a DEAD state. By putting
            // it here, we always write the EOI transition, and thus
            // guarantee that our transition length is >0. Why do we always
            // need the EOI transition? Because in order to implement
            // Automaton::next_eoi_state, this lets us just ask for the last
            // transition. There are probably other/better ways to do this.
            transition_len += 1;
            sparse.push(0);
            sparse.push(0);

            // Check some assumptions about transition length.
            assert_ne!(
                transition_len, 0,
                "transition length should be non-zero",
            );
            assert!(
                transition_len <= 257,
                "expected transition length {} to be <= 257",
                transition_len,
            );

            // Fill in the transition length.
            // Since transition length is always <= 257, we use the most
            // significant bit to indicate whether this is a match state or
            // not.
            let ntrans = if dfa.is_match_state(state.id()) {
                transition_len | (1 << 15)
            } else {
                transition_len
            };
            wire::NE::write_u16(ntrans, &mut sparse[pos..]);

            // zero-fill the actual transitions.
            // Unwraps are OK since transition_len <= 257 and our minimum
            // supported usize size is 16 bits.
            let zeros = usize::try_from(transition_len)
                .unwrap()
                .checked_mul(StateID::SIZE)
                .unwrap();
            sparse.extend(iter::repeat(0).take(zeros));

            // If this is a match state, write the pattern IDs matched by
            // this state.
            if dfa.is_match_state(state.id()) {
                let plen = dfa.match_pattern_len(state.id());
                // Write the actual pattern IDs with a u32 length prefix.
                // First, zero-fill space.
                let mut pos = sparse.len();
                // Unwraps are OK since it's guaranteed that plen <=
                // PatternID::LIMIT, which is in turn guaranteed to fit into
                // a u32.
                let zeros = size_of::<u32>()
                    .checked_mul(plen)
                    .unwrap()
                    .checked_add(size_of::<u32>())
                    .unwrap();
                sparse.extend(iter::repeat(0).take(zeros));

                // Now write the length prefix.
                wire::NE::write_u32(
                    // Will never fail since u32::MAX is invalid pattern ID.
                    // Thus, the number of pattern IDs is representable by a
                    // u32.
                    plen.try_into().expect("pattern ID length fits in u32"),
                    &mut sparse[pos..],
                );
                pos += size_of::<u32>();

                // Now write the pattern IDs.
                for &pid in dfa.pattern_id_slice(state.id()) {
                    pos += wire::write_pattern_id::<wire::NE>(
                        pid,
                        &mut sparse[pos..],
                    );
                }
            }

            // And now add the accelerator, if one exists. An accelerator is
            // at most 4 bytes and at least 1 byte. The first byte is the
            // length, N. N bytes follow the length. The set of bytes that
            // follow correspond (exhaustively) to the bytes that must be
            // seen to leave this state.
            let accel = dfa.accelerator(state.id());
            sparse.push(accel.len().try_into().unwrap());
            sparse.extend_from_slice(accel);
        }

        let mut new = DFA {
            tt: Transitions {
                sparse,
                classes: dfa.byte_classes().clone(),
                state_len: dfa.state_len(),
                pattern_len: dfa.pattern_len(),
            },
            st: StartTable::from_dense_dfa(dfa, &remap)?,
            special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
            pre: dfa.get_prefilter().map(|p| p.clone()),
            quitset: dfa.quitset().clone(),
            flags: dfa.flags().clone(),
        };
        // And here's our second pass. Iterate over all of the dense states
        // again, and update the transitions in each of the states in the
        // sparse DFA.
        for old_state in dfa.states() {
            let new_id = remap[dfa.to_index(old_state.id())];
            let mut new_state = new.tt.state_mut(new_id);
            let sparse = old_state.sparse_transitions();
            for (i, (_, _, next)) in sparse.enumerate() {
                let next = remap[dfa.to_index(next)];
                new_state.set_next_at(i, next);
            }
        }
        debug!(
            "created sparse DFA, memory usage: {} (dense memory usage: {})",
            new.memory_usage(),
            dfa.memory_usage(),
        );
        Ok(new)
    }
}

impl<T: AsRef<[u8]>> DFA<T> {
    /// Cheaply return a borrowed version of this sparse DFA. Specifically,
    /// the DFA returned always uses `&[u8]` for its transitions.
    pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
        DFA {
            tt: self.tt.as_ref(),
            st: self.st.as_ref(),
            special: self.special,
            pre: self.pre.clone(),
            quitset: self.quitset,
            flags: self.flags,
        }
    }

    /// Return an owned version of this sparse DFA. Specifically, the DFA
    /// returned always uses `Vec<u8>` for its transitions.
    ///
    /// Effectively, this returns a sparse DFA whose transitions live on the
    /// heap.
    #[cfg(feature = "alloc")]
    pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> {
        DFA {
            tt: self.tt.to_owned(),
            st: self.st.to_owned(),
            special: self.special,
            pre: self.pre.clone(),
            quitset: self.quitset,
            flags: self.flags,
        }
    }

    /// Returns the starting state configuration for this DFA.
    ///
    /// The default is [`StartKind::Both`], which means the DFA supports both
    /// unanchored and anchored searches. However, this can generally lead to
    /// bigger DFAs. Therefore, a DFA might be compiled with support for just
    /// unanchored or anchored searches. In that case, running a search with
    /// an unsupported configuration will panic.
    pub fn start_kind(&self) -> StartKind {
        self.st.kind
    }

    /// Returns true only if this DFA has starting states for each pattern.
    ///
    /// When a DFA has starting states for each pattern, then a search with
    /// the DFA can be configured to only look for anchored matches of a
    /// specific pattern. Specifically, APIs like
    /// [`Automaton::try_search_fwd`] can accept an [`Anchored::Pattern`] if
    /// and only if this method returns true. Otherwise, an error will be
    /// returned.
    ///
    /// Note that if the DFA is empty, this always returns false.
    pub fn starts_for_each_pattern(&self) -> bool {
        self.st.pattern_len.is_some()
    }

    /// Returns the equivalence classes that make up the alphabet for this
    /// DFA.
    ///
    /// Unless [`dense::Config::byte_classes`] was disabled, it is possible
    /// that multiple distinct bytes are grouped into the same equivalence
    /// class if it is impossible for them to discriminate between a match
    /// and a non-match. This has the effect of reducing the overall alphabet
    /// size and in turn potentially substantially reducing the size of the
    /// DFA's transition table.
    ///
    /// The downside of using equivalence classes like this is that every
    /// state transition will automatically use this map to convert an
    /// arbitrary byte to its corresponding equivalence class. In practice
    /// this has a negligible impact on performance.
    pub fn byte_classes(&self) -> &ByteClasses {
        &self.tt.classes
    }

    /// Returns the memory usage, in bytes, of this DFA.
    ///
    /// The memory usage is computed based on the number of bytes used to
    /// represent this DFA.
    ///
    /// This does **not** include the stack size used up by this DFA. To
    /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
    pub fn memory_usage(&self) -> usize {
        self.tt.memory_usage() + self.st.memory_usage()
    }
}

/// Routines for converting a sparse DFA to other representations, such as
/// raw bytes suitable for persistent storage.
impl<T: AsRef<[u8]>> DFA<T> {
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_little_endian would work on a little endian target.
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Even if buf has initial padding, DFA::from_bytes will
    /// // automatically ignore it.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_little_endian(&self) -> Vec<u8> {
        self.to_bytes::<wire::LE>()
    }

    /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_big_endian would work on a big endian target.
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Even if buf has initial padding, DFA::from_bytes will
    /// // automatically ignore it.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_big_endian(&self) -> Vec<u8> {
        self.to_bytes::<wire::BE>()
    }

    /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the endianness of the target you're compiling the DFA
    /// for matches the endianness of the target on which you're compiling
    /// the DFA. For example, if serialization and deserialization happen in
    /// the same process or on the same machine. Otherwise, when serializing
    /// a DFA for use in a portable environment, you'll almost certainly want
    /// to serialize _both_ a little endian and a big endian version and then
    /// load the correct one based on the target's configuration.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Even if buf has initial padding, DFA::from_bytes will
    /// // automatically ignore it.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_native_endian(&self) -> Vec<u8> {
        self.to_bytes::<wire::NE>()
    }

    /// The implementation of the public `to_bytes` serialization methods,
    /// which is generic over endianness.
    #[cfg(feature = "dfa-build")]
    fn to_bytes<E: Endian>(&self) -> Vec<u8> {
        let mut buf = vec![0; self.write_to_len()];
        // This should always succeed since the only possible serialization
        // error is providing a buffer that's too small, but we've ensured
        // that `buf` is big enough here.
        self.write_to::<E>(&mut buf).unwrap();
        buf
    }

    /// Serialize this DFA as raw bytes to the given slice, in little endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big
    /// enough to contain the full serialized DFA. If an error occurs, then
    /// nothing is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_little_endian would work on a little endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_little_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        self.write_to::<wire::LE>(dst)
    }

    /// Serialize this DFA as raw bytes to the given slice, in big endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big
    /// enough to contain the full serialized DFA. If an error occurs, then
    /// nothing is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_big_endian would work on a big endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_big_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        self.write_to::<wire::BE>(dst)
    }

    /// Serialize this DFA as raw bytes to the given slice, in native endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the endianness of the target you're compiling the DFA
    /// for matches the endianness of the target on which you're compiling
    /// the DFA. For example, if serialization and deserialization happen in
    /// the same process or on the same machine. Otherwise, when serializing
    /// a DFA for use in a portable environment, you'll almost certainly want
    /// to serialize _both_ a little endian and a big endian version and then
    /// load the correct one based on the target's configuration.
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big
    /// enough to contain the full serialized DFA. If an error occurs, then
    /// nothing is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_native_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        self.write_to::<wire::NE>(dst)
    }

    /// The implementation of the public `write_to` serialization methods,
    /// which is generic over endianness.
    fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let mut nw = 0;
        nw += wire::write_label(LABEL, &mut dst[nw..])?;
        nw += wire::write_endianness_check::<E>(&mut dst[nw..])?;
        nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?;
        nw += {
            // Currently unused, intended for future flexibility
            E::write_u32(0, &mut dst[nw..]);
            size_of::<u32>()
        };
        nw += self.flags.write_to::<E>(&mut dst[nw..])?;
        nw += self.tt.write_to::<E>(&mut dst[nw..])?;
        nw += self.st.write_to::<E>(&mut dst[nw..])?;
        nw += self.special.write_to::<E>(&mut dst[nw..])?;
        nw += self.quitset.write_to::<E>(&mut dst[nw..])?;
        Ok(nw)
    }

    /// Return the total number of bytes required to serialize this DFA.
    ///
    /// This is useful for determining the size of the buffer required to
    /// pass to one of the serialization routines:
    ///
    /// * [`DFA::write_to_little_endian`]
    /// * [`DFA::write_to_big_endian`]
    /// * [`DFA::write_to_native_endian`]
    ///
    /// Passing a buffer smaller than the size returned by this method will
    /// result in a serialization error.
    ///
    /// # Example
    ///
    /// This example shows how to dynamically allocate enough room to
    /// serialize a sparse DFA.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// let mut buf = vec![0; original_dfa.write_to_len()];
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_len(&self) -> usize {
        wire::write_label_len(LABEL)
            + wire::write_endianness_check_len()
            + wire::write_version_len()
            + size_of::<u32>() // unused, intended for future flexibility
            + self.flags.write_to_len()
            + self.tt.write_to_len()
            + self.st.write_to_len()
            + self.special.write_to_len()
            + self.quitset.write_to_len()
    }
}

impl<'a> DFA<&'a [u8]> {
    /// Safely deserialize a sparse DFA with a specific state identifier
    /// representation. Upon success, this returns both the deserialized DFA
    /// and the number of bytes read from the given slice. Namely, the
    /// contents of the slice beyond the DFA are not read.
    ///
    /// Deserializing a DFA using this routine will never allocate heap
    /// memory.
    /// For safety purposes, the DFA's transitions will be verified such that
    /// every transition points to a valid state. If this verification is too
    /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
    /// will always execute in constant time.
    ///
    /// The bytes given must be generated by one of the serialization APIs
    /// of a `DFA` using a semver compatible release of this crate. Those
    /// include:
    ///
    /// * [`DFA::to_bytes_little_endian`]
    /// * [`DFA::to_bytes_big_endian`]
    /// * [`DFA::to_bytes_native_endian`]
    /// * [`DFA::write_to_little_endian`]
    /// * [`DFA::write_to_big_endian`]
    /// * [`DFA::write_to_native_endian`]
    ///
    /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
    /// `write_to` methods do not allocate and write to an existing slice
    /// (which may be on the stack). Since deserialization always uses the
    /// native endianness of the target platform, the serialization API you
    /// use should match the endianness of the target platform. (It's often
    /// a good idea to generate serialized DFAs for both forms of endianness
    /// and then load the correct one based on endianness.)
    ///
    /// # Errors
    ///
    /// Generally speaking, it's easier to state the conditions in which an
    /// error is _not_ returned. All of the following must be true:
    ///
    /// * The bytes given must be produced by one of the serialization APIs
    ///   on this DFA, as mentioned above.
    /// * The endianness of the target platform matches the endianness used
    ///   to serialize the provided DFA.
    ///
    /// If any of the above are not true, then an error will be returned.
    ///
    /// Note that unlike deserializing a [`dense::DFA`], deserializing a
    /// sparse DFA has no alignment requirements. That is, an alignment of
    /// `1` is valid.
    ///
    /// # Panics
    ///
    /// This routine will never panic for any input.
    ///
    /// # Example
    ///
    /// This example shows how to serialize a DFA to raw bytes, deserialize
    /// it and then use it for searching.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// let bytes = initial.to_bytes_native_endian();
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: loading a DFA from static memory
    ///
    /// One use case this library supports is the ability to serialize a
    /// DFA to disk and then use `include_bytes!` to store it in a compiled
    /// Rust program. Those bytes can then be cheaply deserialized into a
    /// `DFA` structure at runtime and used for searching without having to
    /// re-compile the DFA (which can be quite costly).
    ///
    /// We can show this in two parts. The first part is serializing the DFA
    /// to a file:
    ///
    /// ```no_run
    /// use regex_automata::dfa::sparse::DFA;
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Write a big endian serialized version of this DFA to a file.
    /// let bytes = dfa.to_bytes_big_endian();
    /// std::fs::write("foo.bigendian.dfa", &bytes)?;
    ///
    /// // Do it again, but this time for little endian.
    /// let bytes = dfa.to_bytes_little_endian();
    /// std::fs::write("foo.littleendian.dfa", &bytes)?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And now the second part is embedding the DFA into the compiled
    /// program and deserializing it at runtime on first use. We use
    /// conditional compilation to choose the correct endianness. We do not
    /// need to employ any special tricks to ensure a proper alignment, since
    /// a sparse DFA has no alignment requirements.
    ///
    /// ```no_run
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse::DFA},
    ///     util::lazy::Lazy,
    ///     HalfMatch, Input,
    /// };
    ///
    /// // This crate provides its own "lazy" type, kind of like
    /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
    /// // no-std environments and lets us write this using completely
    /// // safe code.
    /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
    ///     # const _: &str = stringify! {
    ///     #[cfg(target_endian = "big")]
    ///     static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
    ///     #[cfg(target_endian = "little")]
    ///     static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
    ///     # };
    ///     # static BYTES: &[u8] = b"";
    ///
    ///     let (dfa, _) = DFA::from_bytes(BYTES)
    ///         .expect("serialized DFA should be valid");
    ///     dfa
    /// });
    ///
    /// let expected = Ok(Some(HalfMatch::must(0, 8)));
    /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
    /// ```
    ///
    /// Alternatively, consider using
    /// [`lazy_static`](https://crates.io/crates/lazy_static)
    /// or
    /// [`once_cell`](https://crates.io/crates/once_cell),
    /// which will guarantee safety for you.
    pub fn from_bytes(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
        // SAFETY: This is safe because we validate both the sparse
        // transitions (by trying to decode every state) and start state ID
        // list below. If either validation fails, then we return an error.
        let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
        let seen = dfa.tt.validate(&dfa.special)?;
        dfa.st.validate(&dfa.special, &seen)?;
        // N.B. dfa.special doesn't have a way to do unchecked
        // deserialization, so it has already been validated.
        Ok((dfa, nread))
    }

    /// Deserialize a DFA with a specific state identifier representation in
    /// constant time by omitting the verification of the validity of the
    /// sparse transitions.
    ///
    /// This is just like [`DFA::from_bytes`], except it can potentially
    /// return a DFA that exhibits undefined behavior if its transitions
    /// contain invalid state identifiers.
    ///
    /// This routine is useful if you need to deserialize a DFA cheaply and
    /// cannot afford the transition validation performed by `from_bytes`.
    ///
    /// # Safety
    ///
    /// This routine is not safe because it permits callers to provide
    /// arbitrary transitions with possibly incorrect state identifiers.
    /// While the various serialization routines will never return an
    /// incorrect DFA, there is no guarantee that the bytes provided here
    /// are correct.
    /// While `from_bytes_unchecked` will still do several forms of basic
    /// validation, this routine does not check that the transitions
    /// themselves are correct. Given an incorrect transition table, it is
    /// possible for the search routines to access out-of-bounds memory
    /// because of explicit bounds check elision.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// let bytes = initial.to_bytes_native_endian();
    /// // SAFETY: This is guaranteed to be safe since the bytes given come
    /// // directly from a compatible serialization routine.
    /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub unsafe fn from_bytes_unchecked(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
        let mut nr = 0;

        nr += wire::read_label(&slice[nr..], LABEL)?;
        nr += wire::read_endianness_check(&slice[nr..])?;
        nr += wire::read_version(&slice[nr..], VERSION)?;

        let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
        nr += size_of::<u32>();

        let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
        nr += nread;

        let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (special, nread) = Special::from_bytes(&slice[nr..])?;
        nr += nread;
        if special.max.as_usize() >= tt.sparse().len() {
            return Err(DeserializeError::generic(
                "max should not be greater than or equal to sparse bytes",
            ));
        }

        let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
        nr += nread;

        // Prefilters don't support serialization, so they're always absent.
        let pre = None;
        Ok((DFA { tt, st, special, pre, quitset, flags }, nr))
    }
}

impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "sparse::DFA(")?;
        for state in self.tt.states() {
            fmt_state_indicator(f, self, state.id())?;
            writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?;
        }
        writeln!(f, "")?;
        for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() {
            if i % self.st.stride == 0 {
                match anchored {
                    Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
                    Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
                    Anchored::Pattern(pid) => writeln!(
                        f,
                        "START-GROUP(pattern: {:?})",
                        pid.as_usize()
                    )?,
                }
            }
            writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?;
        }
        writeln!(f, "state length: {:?}", self.tt.state_len)?;
        writeln!(f, "pattern length: {:?}", self.pattern_len())?;
        writeln!(f, "flags: {:?}", self.flags)?;
        writeln!(f, ")")?;
        Ok(())
    }
}

// SAFETY: We assert that our implementation of each method is correct.
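//
// As a rough, illustrative sketch (not a complete search routine; the real
// search drivers live elsewhere in the crate), the methods implemented below
// are typically used together along these lines:
//
//     let mut sid = dfa.start_state(&config)?;
//     for &byte in haystack {
//         sid = dfa.next_state(sid, byte);
//         if dfa.is_special_state(sid) {
//             // dispatch on match/dead/quit/start/accel states here
//         }
//     }
//     sid = dfa.next_eoi_state(sid);
//     if dfa.is_match_state(sid) {
//         let pid = dfa.match_pattern(sid, 0);
//     }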
unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
    #[inline]
    fn is_special_state(&self, id: StateID) -> bool {
        self.special.is_special_state(id)
    }

    #[inline]
    fn is_dead_state(&self, id: StateID) -> bool {
        self.special.is_dead_state(id)
    }

    #[inline]
    fn is_quit_state(&self, id: StateID) -> bool {
        self.special.is_quit_state(id)
    }

    #[inline]
    fn is_match_state(&self, id: StateID) -> bool {
        self.special.is_match_state(id)
    }

    #[inline]
    fn is_start_state(&self, id: StateID) -> bool {
        self.special.is_start_state(id)
    }

    #[inline]
    fn is_accel_state(&self, id: StateID) -> bool {
        self.special.is_accel_state(id)
    }

    // This is marked as inline to help dramatically boost sparse searching,
    // which decodes each state it enters to follow the next transition.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn next_state(&self, current: StateID, input: u8) -> StateID {
        let input = self.tt.classes.get(input);
        self.tt.state(current).next(input)
    }

    #[inline]
    unsafe fn next_state_unchecked(
        &self,
        current: StateID,
        input: u8,
    ) -> StateID {
        self.next_state(current, input)
    }

    #[inline]
    fn next_eoi_state(&self, current: StateID) -> StateID {
        self.tt.state(current).next_eoi()
    }

    #[inline]
    fn pattern_len(&self) -> usize {
        self.tt.pattern_len
    }

    #[inline]
    fn match_len(&self, id: StateID) -> usize {
        self.tt.state(id).pattern_len()
    }

    #[inline]
    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
        // This is an optimization for the very common case of a DFA with a
        // single pattern. This conditional avoids a somewhat more costly
        // path that finds the pattern ID from the state machine, which
        // requires a bit of slicing/pointer-chasing. This optimization tends
        // to only matter when matches are frequent.
        if self.tt.pattern_len == 1 {
            return PatternID::ZERO;
        }
        self.tt.state(id).pattern_id(match_index)
    }

    #[inline]
    fn has_empty(&self) -> bool {
        self.flags.has_empty
    }

    #[inline]
    fn is_utf8(&self) -> bool {
        self.flags.is_utf8
    }

    #[inline]
    fn is_always_start_anchored(&self) -> bool {
        self.flags.is_always_start_anchored
    }

    #[inline]
    fn start_state(
        &self,
        config: &start::Config,
    ) -> Result<StateID, StartError> {
        let anchored = config.get_anchored();
        let start = match config.get_look_behind() {
            None => Start::Text,
            Some(byte) => {
                if !self.quitset.is_empty() && self.quitset.contains(byte) {
                    return Err(StartError::quit(byte));
                }
                self.st.start_map.get(byte)
            }
        };
        self.st.start(anchored, start)
    }

    #[inline]
    fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
        match mode {
            Anchored::No => self.st.universal_start_unanchored,
            Anchored::Yes => self.st.universal_start_anchored,
            Anchored::Pattern(_) => None,
        }
    }

    #[inline]
    fn accelerator(&self, id: StateID) -> &[u8] {
        self.tt.state(id).accelerator()
    }

    #[inline]
    fn get_prefilter(&self) -> Option<&Prefilter> {
        self.pre.as_ref()
    }
}

/// The transition table portion of a sparse DFA.
///
/// The transition table is the core part of the DFA in that it describes how
/// to move from one state to another based on the input sequence observed.
///
/// Unlike a typical dense table based DFA, states in a sparse transition
/// table have variable size. That is, states with more transitions use more
/// space than states with fewer transitions. This means that finding the next
/// transition takes more work than with a dense DFA, but also typically uses
/// much less space.
#[derive(Clone)]
struct Transitions<T> {
    /// The raw encoding of each state in this DFA.
    ///
    /// Each state has the following information:
    ///
    /// * A set of transitions to subsequent states. Transitions to the dead
    ///   state are omitted.
    /// * If the state can be accelerated, then any additional accelerator
    ///   information.
    /// * If the state is a match state, then the state contains all pattern
    ///   IDs that match when in that state.
    ///
    /// To decode a state, use Transitions::state.
    ///
    /// In practice, T is either Vec<u8> or &[u8].
    sparse: T,
    /// A set of equivalence classes, where a single equivalence class
    /// represents a set of bytes that never discriminate between a match
    /// and a non-match in the DFA. Each equivalence class corresponds to a
    /// single character in this DFA's alphabet, where the maximum number of
    /// characters is 257 (each possible value of a byte plus the special
    /// EOI transition). Consequently, the number of equivalence classes
    /// corresponds to the number of transitions for each DFA state. Note
    /// though that the *space* used by each DFA state in the transition
    /// table may be larger.
    /// Unlike a dense DFA, though, a sparse DFA has no fixed stride: the
    /// total space used by each state varies with its number of transitions,
    /// whether it is a match state and whether it is accelerated.
    ///
    /// The only time the number of equivalence classes is fewer than 257 is
    /// if the DFA's kind uses byte classes, which is the default.
    /// Equivalence classes should generally only be disabled when debugging,
    /// so that the transitions themselves aren't obscured. Disabling them
    /// has no other benefit, since the equivalence class map is always used
    /// while searching. In the vast majority of cases, the number of
    /// equivalence classes is substantially smaller than 257, particularly
    /// when large Unicode classes aren't used.
    ///
    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
    /// in the current implementation, since equivalence classes generally
    /// tend to correspond to continuous ranges of bytes that map to the same
    /// transition. So in a sparse DFA, equivalence classes don't really lead
    /// to a space savings. In the future, it would be good to try and remove
    /// them from sparse DFAs entirely, but it requires a bit of work since
    /// sparse DFAs are built from dense DFAs, which are in turn built on top
    /// of equivalence classes.
    classes: ByteClasses,
    /// The total number of states in this DFA. Note that a DFA always has at
    /// least one state---the dead state---even for the empty DFA. In
    /// particular, the dead state always has ID 0 and is correspondingly
    /// always the first state. The dead state is never a match state.
    state_len: usize,
    /// The total number of unique patterns represented by these match
    /// states.
    pattern_len: usize,
}

impl<'a> Transitions<&'a [u8]> {
    unsafe fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
        let slice_start = slice.as_ptr().as_usize();

        let (state_len, nr) =
            wire::try_read_u32_as_usize(&slice, "state length")?;
        slice = &slice[nr..];

        let (pattern_len, nr) =
            wire::try_read_u32_as_usize(&slice, "pattern length")?;
        slice = &slice[nr..];

        let (classes, nr) = ByteClasses::from_bytes(&slice)?;
        slice = &slice[nr..];

        let (len, nr) =
            wire::try_read_u32_as_usize(&slice, "sparse transitions length")?;
        slice = &slice[nr..];

        wire::check_slice_len(slice, len, "sparse states byte length")?;
        let sparse = &slice[..len];
        slice = &slice[len..];

        let trans = Transitions { sparse, classes, state_len, pattern_len };
        Ok((trans, slice.as_ptr().as_usize() - slice_start))
    }
}

impl<T: AsRef<[u8]>> Transitions<T> {
    /// Writes a serialized form of this transition table to the buffer
    /// given. If the buffer is too small, then an error is returned. To
    /// determine how big the buffer must be, use `write_to_len`.
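    ///
    /// As a rough sketch (mirroring the writes below and the reads in
    /// `from_bytes_unchecked` above, not a separate format spec), the
    /// serialized form is: the state length as a u32, the pattern length as
    /// a u32, the byte class map, the byte length of the sparse transitions
    /// as a u32, and then the raw encoded states themselves.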
    fn write_to<E: Endian>(
        &self,
        mut dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small(
                "sparse transition table",
            ));
        }
        dst = &mut dst[..nwrite];

        // write state length
        E::write_u32(u32::try_from(self.state_len).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write pattern length
        E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write byte class map
        let n = self.classes.write_to(dst)?;
        dst = &mut dst[n..];

        // write number of bytes in sparse transitions
        E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write actual transitions
        let mut id = DEAD;
        while id.as_usize() < self.sparse().len() {
            let state = self.state(id);
            let n = state.write_to::<E>(&mut dst)?;
            dst = &mut dst[n..];
            // The next ID is the offset immediately following `state`.
            id = StateID::new(id.as_usize() + state.write_to_len()).unwrap();
        }
        Ok(nwrite)
    }

    /// Returns the number of bytes the serialized form of this transition
    /// table will use.
    fn write_to_len(&self) -> usize {
        size_of::<u32>() // state length
            + size_of::<u32>() // pattern length
            + self.classes.write_to_len()
            + size_of::<u32>() // sparse transitions length
            + self.sparse().len()
    }

    /// Validates that every state ID in this transition table is valid.
    ///
    /// That is, every state ID can be used to correctly index a state in
    /// this table.
    fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
        let mut verified = Seen::new();
        // We need to make sure that we decode the correct number of states.
        // Otherwise, an empty set of transitions would validate even if the
        // recorded state length is non-empty.
        let mut len = 0;
        // We can't use the self.states() iterator because it assumes the
        // state encodings are valid. It could panic if they aren't.
        let mut id = DEAD;
        while id.as_usize() < self.sparse().len() {
            // Before we even decode the state, we check that the ID itself
            // is well formed. That is, if it's a special state then it must
            // actually be a quit, dead, accel, match or start state.
            if sp.is_special_state(id) {
                let is_actually_special = sp.is_dead_state(id)
                    || sp.is_quit_state(id)
                    || sp.is_match_state(id)
                    || sp.is_start_state(id)
                    || sp.is_accel_state(id);
                if !is_actually_special {
                    // This is kind of a cryptic error message...
                    return Err(DeserializeError::generic(
                        "found sparse state tagged as special but \
                         wasn't actually special",
                    ));
                }
            }
            let state = self.try_state(sp, id)?;
            verified.insert(id);
            // The next ID should be the offset immediately following
            // `state`.
            id = StateID::new(wire::add(
                id.as_usize(),
                state.write_to_len(),
                "next state ID offset",
            )?)
1422 .map_err(|err| { 1423 DeserializeError::state_id_error(err, "next state ID offset") 1424 })?; 1425 len += 1; 1426 } 1427 // Now that we've checked that all top-level states are correct and 1428 // importantly, collected a set of valid state IDs, we have all the 1429 // information we need to check that all transitions are correct too. 1430 // 1431 // Note that we can't use `valid_ids` to iterate because it will 1432 // be empty in no-std no-alloc contexts. (And yes, that means our 1433 // verification isn't quite as good.) We can use `self.states()` 1434 // though at least, since we know that all states can at least be 1435 // decoded and traversed correctly. 1436 for state in self.states() { 1437 // Check that all transitions in this state are correct. 1438 for i in 0..state.ntrans { 1439 let to = state.next_at(i); 1440 // For no-alloc, we just check that the state can decode. It is 1441 // technically possible that the state ID could still point to 1442 // a non-existent state even if it decodes (fuzzing proved this 1443 // to be true), but it shouldn't result in any memory unsafety 1444 // or panics in non-debug mode. 1445 #[cfg(not(feature = "alloc"))] 1446 { 1447 let _ = self.try_state(sp, to)?; 1448 } 1449 #[cfg(feature = "alloc")] 1450 { 1451 if !verified.contains(&to) { 1452 return Err(DeserializeError::generic( 1453 "found transition that points to a \ 1454 non-existent state", 1455 )); 1456 } 1457 } 1458 } 1459 } 1460 if len != self.state_len { 1461 return Err(DeserializeError::generic( 1462 "mismatching sparse state length", 1463 )); 1464 } 1465 Ok(verified) 1466 } 1467 1468 /// Converts these transitions to a borrowed value. as_ref(&self) -> Transitions<&'_ [u8]>1469 fn as_ref(&self) -> Transitions<&'_ [u8]> { 1470 Transitions { 1471 sparse: self.sparse(), 1472 classes: self.classes.clone(), 1473 state_len: self.state_len, 1474 pattern_len: self.pattern_len, 1475 } 1476 } 1477 1478 /// Converts these transitions to an owned value. 1479 #[cfg(feature = "alloc")] to_owned(&self) -> Transitions<alloc::vec::Vec<u8>>1480 fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> { 1481 Transitions { 1482 sparse: self.sparse().to_vec(), 1483 classes: self.classes.clone(), 1484 state_len: self.state_len, 1485 pattern_len: self.pattern_len, 1486 } 1487 } 1488 1489 /// Return a convenient representation of the given state. 1490 /// 1491 /// This panics if the state is invalid. 1492 /// 1493 /// This is marked as inline to help dramatically boost sparse searching, 1494 /// which decodes each state it enters to follow the next transition. Other 1495 /// functions involved are also inlined, which should hopefully eliminate 1496 /// a lot of the extraneous decoding that is never needed just to follow 1497 /// the next transition. 
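// Editorial note (not part of the original source): in a sparse DFA, a
// `StateID` is a byte offset into the raw `sparse` slice rather than a dense
// index. That is why `validate` above walks states by adding each state's
// encoded length to the current ID, and why `state` below can slice directly
// at `id.as_usize()`. A sketch of that walk, with `encoded_len(off)` standing
// in for `State::write_to_len` (a hypothetical helper):
//
//     let mut off = 0; // the dead state is always first, at offset 0
//     while off < sparse.len() {
//         // decode the state at `off`, verify it, record `off` as valid...
//         off += encoded_len(off);
//     }
//
// For example, a non-match state with 2 transitions and no accelerator
// occupies 2 + 2*2 + 2*4 + 1 = 15 bytes, so the state that follows it begins
// at offset `id + 15`.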
1498 #[cfg_attr(feature = "perf-inline", inline(always))] state(&self, id: StateID) -> State<'_>1499 fn state(&self, id: StateID) -> State<'_> { 1500 let mut state = &self.sparse()[id.as_usize()..]; 1501 let mut ntrans = wire::read_u16(&state).as_usize(); 1502 let is_match = (1 << 15) & ntrans != 0; 1503 ntrans &= !(1 << 15); 1504 state = &state[2..]; 1505 1506 let (input_ranges, state) = state.split_at(ntrans * 2); 1507 let (next, state) = state.split_at(ntrans * StateID::SIZE); 1508 let (pattern_ids, state) = if is_match { 1509 let npats = wire::read_u32(&state).as_usize(); 1510 state[4..].split_at(npats * 4) 1511 } else { 1512 (&[][..], state) 1513 }; 1514 1515 let accel_len = usize::from(state[0]); 1516 let accel = &state[1..accel_len + 1]; 1517 State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel } 1518 } 1519 1520 /// Like `state`, but will return an error if the state encoding is 1521 /// invalid. This is useful for verifying states after deserialization, 1522 /// which is required for a safe deserialization API. 1523 /// 1524 /// Note that this only verifies that this state is decodable and that 1525 /// all of its data is consistent. It does not verify that its state ID 1526 /// transitions point to valid states themselves, nor does it verify that 1527 /// every pattern ID is valid. try_state( &self, sp: &Special, id: StateID, ) -> Result<State<'_>, DeserializeError>1528 fn try_state( 1529 &self, 1530 sp: &Special, 1531 id: StateID, 1532 ) -> Result<State<'_>, DeserializeError> { 1533 if id.as_usize() > self.sparse().len() { 1534 return Err(DeserializeError::generic( 1535 "invalid caller provided sparse state ID", 1536 )); 1537 } 1538 let mut state = &self.sparse()[id.as_usize()..]; 1539 // Encoding format starts with a u16 that stores the total number of 1540 // transitions in this state. 1541 let (mut ntrans, _) = 1542 wire::try_read_u16_as_usize(state, "state transition length")?; 1543 let is_match = ((1 << 15) & ntrans) != 0; 1544 ntrans &= !(1 << 15); 1545 state = &state[2..]; 1546 if ntrans > 257 || ntrans == 0 { 1547 return Err(DeserializeError::generic( 1548 "invalid transition length", 1549 )); 1550 } 1551 if is_match && !sp.is_match_state(id) { 1552 return Err(DeserializeError::generic( 1553 "state marked as match but not in match ID range", 1554 )); 1555 } else if !is_match && sp.is_match_state(id) { 1556 return Err(DeserializeError::generic( 1557 "state in match ID range but not marked as match state", 1558 )); 1559 } 1560 1561 // Each transition has two pieces: an inclusive range of bytes on which 1562 // it is defined, and the state ID that those bytes transition to. The 1563 // pairs come first, followed by a corresponding sequence of state IDs. 1564 let input_ranges_len = ntrans.checked_mul(2).unwrap(); 1565 wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?; 1566 let (input_ranges, state) = state.split_at(input_ranges_len); 1567 // Every range should be of the form A-B, where A<=B. 1568 for pair in input_ranges.chunks(2) { 1569 let (start, end) = (pair[0], pair[1]); 1570 if start > end { 1571 return Err(DeserializeError::generic("invalid input range")); 1572 } 1573 } 1574 1575 // And now extract the corresponding sequence of state IDs. We leave 1576 // this sequence as a &[u8] instead of a &[S] because sparse DFAs do 1577 // not have any alignment requirements. 
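        // Editorial recap (not part of the original source): the remaining
        // state bytes follow the per-state encoding that `state` above
        // decodes unconditionally:
        //
        //   u16 header (bit 15 = match flag, low 15 bits = ntrans)
        //   ntrans * 2 bytes of inclusive input ranges
        //   ntrans * 4 bytes of next state IDs (StateID::SIZE == 4)
        //   if match: u32 pattern count, then count * 4 bytes of pattern IDs
        //   1 byte of accelerator length, then that many accelerator bytes
        //
        // A condensed, standalone sketch of the unconditional decode (names
        // are illustrative only):
        //
        //     fn split_state(bytes: &[u8]) -> (bool, usize, &[u8], &[u8]) {
        //         let mut ntrans =
        //             u16::from_ne_bytes([bytes[0], bytes[1]]) as usize;
        //         let is_match = ntrans & (1 << 15) != 0;
        //         ntrans &= !(1 << 15);
        //         let (ranges, rest) = bytes[2..].split_at(ntrans * 2);
        //         let (next_ids, _rest) = rest.split_at(ntrans * 4);
        //         (is_match, ntrans, ranges, next_ids)
        //     }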
1578 let next_len = ntrans 1579 .checked_mul(self.id_len()) 1580 .expect("state size * #trans should always fit in a usize"); 1581 wire::check_slice_len(state, next_len, "sparse trans state IDs")?; 1582 let (next, state) = state.split_at(next_len); 1583 // We can at least verify that every state ID is in bounds. 1584 for idbytes in next.chunks(self.id_len()) { 1585 let (id, _) = 1586 wire::read_state_id(idbytes, "sparse state ID in try_state")?; 1587 wire::check_slice_len( 1588 self.sparse(), 1589 id.as_usize(), 1590 "invalid sparse state ID", 1591 )?; 1592 } 1593 1594 // If this is a match state, then read the pattern IDs for this state. 1595 // Pattern IDs is a u32-length prefixed sequence of native endian 1596 // encoded 32-bit integers. 1597 let (pattern_ids, state) = if is_match { 1598 let (npats, nr) = 1599 wire::try_read_u32_as_usize(state, "pattern ID length")?; 1600 let state = &state[nr..]; 1601 if npats == 0 { 1602 return Err(DeserializeError::generic( 1603 "state marked as a match, but pattern length is zero", 1604 )); 1605 } 1606 1607 let pattern_ids_len = 1608 wire::mul(npats, 4, "sparse pattern ID byte length")?; 1609 wire::check_slice_len( 1610 state, 1611 pattern_ids_len, 1612 "sparse pattern IDs", 1613 )?; 1614 let (pattern_ids, state) = state.split_at(pattern_ids_len); 1615 for patbytes in pattern_ids.chunks(PatternID::SIZE) { 1616 wire::read_pattern_id( 1617 patbytes, 1618 "sparse pattern ID in try_state", 1619 )?; 1620 } 1621 (pattern_ids, state) 1622 } else { 1623 (&[][..], state) 1624 }; 1625 if is_match && pattern_ids.is_empty() { 1626 return Err(DeserializeError::generic( 1627 "state marked as a match, but has no pattern IDs", 1628 )); 1629 } 1630 if sp.is_match_state(id) && pattern_ids.is_empty() { 1631 return Err(DeserializeError::generic( 1632 "state marked special as a match, but has no pattern IDs", 1633 )); 1634 } 1635 if sp.is_match_state(id) != is_match { 1636 return Err(DeserializeError::generic( 1637 "whether state is a match or not is inconsistent", 1638 )); 1639 } 1640 1641 // Now read this state's accelerator info. The first byte is the length 1642 // of the accelerator, which is typically 0 (for no acceleration) but 1643 // is no bigger than 3. The length indicates the number of bytes that 1644 // follow, where each byte corresponds to a transition out of this 1645 // state. 
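        // Editorial example (not part of the original source): the
        // accelerator block is a single length byte in 0..=3 followed by that
        // many bytes. For instance, a state that can only be exited by seeing
        // the byte b'"' might encode its accelerator as [1, b'"'], while an
        // unaccelerated state encodes just [0]. The checks below mirror this:
        //
        //     let accel_len = usize::from(rest[0]);  // 0..=3
        //     let accel = &rest[1..1 + accel_len];   // possibly empty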
1646 if state.is_empty() { 1647 return Err(DeserializeError::generic("no accelerator length")); 1648 } 1649 let (accel_len, state) = (usize::from(state[0]), &state[1..]); 1650 1651 if accel_len > 3 { 1652 return Err(DeserializeError::generic( 1653 "sparse invalid accelerator length", 1654 )); 1655 } else if accel_len == 0 && sp.is_accel_state(id) { 1656 return Err(DeserializeError::generic( 1657 "got no accelerators in state, but in accelerator ID range", 1658 )); 1659 } else if accel_len > 0 && !sp.is_accel_state(id) { 1660 return Err(DeserializeError::generic( 1661 "state in accelerator ID range, but has no accelerators", 1662 )); 1663 } 1664 1665 wire::check_slice_len( 1666 state, 1667 accel_len, 1668 "sparse corrupt accelerator length", 1669 )?; 1670 let (accel, _) = (&state[..accel_len], &state[accel_len..]); 1671 1672 let state = State { 1673 id, 1674 is_match, 1675 ntrans, 1676 input_ranges, 1677 next, 1678 pattern_ids, 1679 accel, 1680 }; 1681 if sp.is_quit_state(state.next_at(state.ntrans - 1)) { 1682 return Err(DeserializeError::generic( 1683 "state with EOI transition to quit state is illegal", 1684 )); 1685 } 1686 Ok(state) 1687 } 1688 1689 /// Return an iterator over all of the states in this DFA. 1690 /// 1691 /// The iterator returned yields tuples, where the first element is the 1692 /// state ID and the second element is the state itself. states(&self) -> StateIter<'_, T>1693 fn states(&self) -> StateIter<'_, T> { 1694 StateIter { trans: self, id: DEAD.as_usize() } 1695 } 1696 1697 /// Returns the sparse transitions as raw bytes. sparse(&self) -> &[u8]1698 fn sparse(&self) -> &[u8] { 1699 self.sparse.as_ref() 1700 } 1701 1702 /// Returns the number of bytes represented by a single state ID. id_len(&self) -> usize1703 fn id_len(&self) -> usize { 1704 StateID::SIZE 1705 } 1706 1707 /// Return the memory usage, in bytes, of these transitions. 1708 /// 1709 /// This does not include the size of a `Transitions` value itself. memory_usage(&self) -> usize1710 fn memory_usage(&self) -> usize { 1711 self.sparse().len() 1712 } 1713 } 1714 1715 #[cfg(feature = "dfa-build")] 1716 impl<T: AsMut<[u8]>> Transitions<T> { 1717 /// Return a convenient mutable representation of the given state. 1718 /// This panics if the state is invalid. state_mut(&mut self, id: StateID) -> StateMut<'_>1719 fn state_mut(&mut self, id: StateID) -> StateMut<'_> { 1720 let mut state = &mut self.sparse_mut()[id.as_usize()..]; 1721 let mut ntrans = wire::read_u16(&state).as_usize(); 1722 let is_match = (1 << 15) & ntrans != 0; 1723 ntrans &= !(1 << 15); 1724 state = &mut state[2..]; 1725 1726 let (input_ranges, state) = state.split_at_mut(ntrans * 2); 1727 let (next, state) = state.split_at_mut(ntrans * StateID::SIZE); 1728 let (pattern_ids, state) = if is_match { 1729 let npats = wire::read_u32(&state).as_usize(); 1730 state[4..].split_at_mut(npats * 4) 1731 } else { 1732 (&mut [][..], state) 1733 }; 1734 1735 let accel_len = usize::from(state[0]); 1736 let accel = &mut state[1..accel_len + 1]; 1737 StateMut { 1738 id, 1739 is_match, 1740 ntrans, 1741 input_ranges, 1742 next, 1743 pattern_ids, 1744 accel, 1745 } 1746 } 1747 1748 /// Returns the sparse transitions as raw mutable bytes. sparse_mut(&mut self) -> &mut [u8]1749 fn sparse_mut(&mut self) -> &mut [u8] { 1750 self.sparse.as_mut() 1751 } 1752 } 1753 1754 /// The set of all possible starting states in a DFA. 1755 /// 1756 /// See the eponymous type in the `dense` module for more details. 
This type 1757 /// is very similar to `dense::StartTable`, except that its underlying 1758 /// representation is `&[u8]` instead of `&[S]`. (The latter would require 1759 /// sparse DFAs to be aligned, which is explicitly something we do not require 1760 /// because we don't really need it.) 1761 #[derive(Clone)] 1762 struct StartTable<T> { 1763 /// The initial start state IDs as a contiguous table of native endian 1764 /// encoded integers, represented by `S`. 1765 /// 1766 /// In practice, T is either Vec<u8> or &[u8] and has no alignment 1767 /// requirements. 1768 /// 1769 /// The first `2 * stride` (currently always 8) entries always correspond 1770 /// to the starts states for the entire DFA, with the first 4 entries being 1771 /// for unanchored searches and the second 4 entries being for anchored 1772 /// searches. To keep things simple, we always use 8 entries even if the 1773 /// `StartKind` is not both. 1774 /// 1775 /// After that, there are `stride * patterns` state IDs, where `patterns` 1776 /// may be zero in the case of a DFA with no patterns or in the case where 1777 /// the DFA was built without enabling starting states for each pattern. 1778 table: T, 1779 /// The starting state configuration supported. When 'both', both 1780 /// unanchored and anchored searches work. When 'unanchored', anchored 1781 /// searches panic. When 'anchored', unanchored searches panic. 1782 kind: StartKind, 1783 /// The start state configuration for every possible byte. 1784 start_map: StartByteMap, 1785 /// The number of starting state IDs per pattern. 1786 stride: usize, 1787 /// The total number of patterns for which starting states are encoded. 1788 /// This is `None` for DFAs that were built without start states for each 1789 /// pattern. Thus, one cannot use this field to say how many patterns 1790 /// are in the DFA in all cases. It is specific to how many patterns are 1791 /// represented in this start table. 1792 pattern_len: Option<usize>, 1793 /// The universal starting state for unanchored searches. This is only 1794 /// present when the DFA supports unanchored searches and when all starting 1795 /// state IDs for an unanchored search are equivalent. 1796 universal_start_unanchored: Option<StateID>, 1797 /// The universal starting state for anchored searches. This is only 1798 /// present when the DFA supports anchored searches and when all starting 1799 /// state IDs for an anchored search are equivalent. 1800 universal_start_anchored: Option<StateID>, 1801 } 1802 1803 #[cfg(feature = "dfa-build")] 1804 impl StartTable<Vec<u8>> { new<T: AsRef<[u32]>>( dfa: &dense::DFA<T>, pattern_len: Option<usize>, ) -> StartTable<Vec<u8>>1805 fn new<T: AsRef<[u32]>>( 1806 dfa: &dense::DFA<T>, 1807 pattern_len: Option<usize>, 1808 ) -> StartTable<Vec<u8>> { 1809 let stride = Start::len(); 1810 // This is OK since the only way we're here is if a dense DFA could be 1811 // constructed successfully, which uses the same space. 
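        // Editorial note (not part of the original source): the start table
        // is a flat array of native-endian u32 state IDs. The first
        // `2 * stride` entries are the whole-DFA starts (unanchored then
        // anchored), followed by `stride` entries per pattern when start
        // states for each pattern were compiled in. With stride ==
        // Start::len() (currently 4) and, say, 3 patterns:
        //
        //     let (stride, patterns) = (4usize, 3usize);
        //     let entries = 2 * stride + stride * patterns; // 8 + 12 = 20
        //     let bytes = entries * 4; // StateID::SIZE == 4, so 80 bytes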
1812 let len = stride 1813 .checked_mul(pattern_len.unwrap_or(0)) 1814 .unwrap() 1815 .checked_add(stride.checked_mul(2).unwrap()) 1816 .unwrap() 1817 .checked_mul(StateID::SIZE) 1818 .unwrap(); 1819 StartTable { 1820 table: vec![0; len], 1821 kind: dfa.start_kind(), 1822 start_map: dfa.start_map().clone(), 1823 stride, 1824 pattern_len, 1825 universal_start_unanchored: dfa 1826 .universal_start_state(Anchored::No), 1827 universal_start_anchored: dfa.universal_start_state(Anchored::Yes), 1828 } 1829 } 1830 from_dense_dfa<T: AsRef<[u32]>>( dfa: &dense::DFA<T>, remap: &[StateID], ) -> Result<StartTable<Vec<u8>>, BuildError>1831 fn from_dense_dfa<T: AsRef<[u32]>>( 1832 dfa: &dense::DFA<T>, 1833 remap: &[StateID], 1834 ) -> Result<StartTable<Vec<u8>>, BuildError> { 1835 // Unless the DFA has start states compiled for each pattern, then 1836 // as far as the starting state table is concerned, there are zero 1837 // patterns to account for. It will instead only store starting states 1838 // for the entire DFA. 1839 let start_pattern_len = if dfa.starts_for_each_pattern() { 1840 Some(dfa.pattern_len()) 1841 } else { 1842 None 1843 }; 1844 let mut sl = StartTable::new(dfa, start_pattern_len); 1845 for (old_start_id, anchored, sty) in dfa.starts() { 1846 let new_start_id = remap[dfa.to_index(old_start_id)]; 1847 sl.set_start(anchored, sty, new_start_id); 1848 } 1849 Ok(sl) 1850 } 1851 } 1852 1853 impl<'a> StartTable<&'a [u8]> { from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError>1854 unsafe fn from_bytes_unchecked( 1855 mut slice: &'a [u8], 1856 ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> { 1857 let slice_start = slice.as_ptr().as_usize(); 1858 1859 let (kind, nr) = StartKind::from_bytes(slice)?; 1860 slice = &slice[nr..]; 1861 1862 let (start_map, nr) = StartByteMap::from_bytes(slice)?; 1863 slice = &slice[nr..]; 1864 1865 let (stride, nr) = 1866 wire::try_read_u32_as_usize(slice, "sparse start table stride")?; 1867 slice = &slice[nr..]; 1868 if stride != Start::len() { 1869 return Err(DeserializeError::generic( 1870 "invalid sparse starting table stride", 1871 )); 1872 } 1873 1874 let (maybe_pattern_len, nr) = 1875 wire::try_read_u32_as_usize(slice, "sparse start table patterns")?; 1876 slice = &slice[nr..]; 1877 let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { 1878 None 1879 } else { 1880 Some(maybe_pattern_len) 1881 }; 1882 if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { 1883 return Err(DeserializeError::generic( 1884 "sparse invalid number of patterns", 1885 )); 1886 } 1887 1888 let (universal_unanchored, nr) = 1889 wire::try_read_u32(slice, "universal unanchored start")?; 1890 slice = &slice[nr..]; 1891 let universal_start_unanchored = if universal_unanchored == u32::MAX { 1892 None 1893 } else { 1894 Some(StateID::try_from(universal_unanchored).map_err(|e| { 1895 DeserializeError::state_id_error( 1896 e, 1897 "universal unanchored start", 1898 ) 1899 })?) 1900 }; 1901 1902 let (universal_anchored, nr) = 1903 wire::try_read_u32(slice, "universal anchored start")?; 1904 slice = &slice[nr..]; 1905 let universal_start_anchored = if universal_anchored == u32::MAX { 1906 None 1907 } else { 1908 Some(StateID::try_from(universal_anchored).map_err(|e| { 1909 DeserializeError::state_id_error(e, "universal anchored start") 1910 })?) 
1911 }; 1912 1913 let pattern_table_size = wire::mul( 1914 stride, 1915 pattern_len.unwrap_or(0), 1916 "sparse invalid pattern length", 1917 )?; 1918 // Our start states always start with a single stride of start states 1919 // for the entire automaton which permit it to match any pattern. What 1920 // follows it are an optional set of start states for each pattern. 1921 let start_state_len = wire::add( 1922 wire::mul(2, stride, "start state stride too big")?, 1923 pattern_table_size, 1924 "sparse invalid 'any' pattern starts size", 1925 )?; 1926 let table_bytes_len = wire::mul( 1927 start_state_len, 1928 StateID::SIZE, 1929 "sparse pattern table bytes length", 1930 )?; 1931 wire::check_slice_len( 1932 slice, 1933 table_bytes_len, 1934 "sparse start ID table", 1935 )?; 1936 let table = &slice[..table_bytes_len]; 1937 slice = &slice[table_bytes_len..]; 1938 1939 let sl = StartTable { 1940 table, 1941 kind, 1942 start_map, 1943 stride, 1944 pattern_len, 1945 universal_start_unanchored, 1946 universal_start_anchored, 1947 }; 1948 Ok((sl, slice.as_ptr().as_usize() - slice_start)) 1949 } 1950 } 1951 1952 impl<T: AsRef<[u8]>> StartTable<T> { write_to<E: Endian>( &self, mut dst: &mut [u8], ) -> Result<usize, SerializeError>1953 fn write_to<E: Endian>( 1954 &self, 1955 mut dst: &mut [u8], 1956 ) -> Result<usize, SerializeError> { 1957 let nwrite = self.write_to_len(); 1958 if dst.len() < nwrite { 1959 return Err(SerializeError::buffer_too_small( 1960 "sparse starting table ids", 1961 )); 1962 } 1963 dst = &mut dst[..nwrite]; 1964 1965 // write start kind 1966 let nw = self.kind.write_to::<E>(dst)?; 1967 dst = &mut dst[nw..]; 1968 // write start byte map 1969 let nw = self.start_map.write_to(dst)?; 1970 dst = &mut dst[nw..]; 1971 // write stride 1972 E::write_u32(u32::try_from(self.stride).unwrap(), dst); 1973 dst = &mut dst[size_of::<u32>()..]; 1974 // write pattern length 1975 E::write_u32( 1976 u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), 1977 dst, 1978 ); 1979 dst = &mut dst[size_of::<u32>()..]; 1980 // write universal start unanchored state id, u32::MAX if absent 1981 E::write_u32( 1982 self.universal_start_unanchored 1983 .map_or(u32::MAX, |sid| sid.as_u32()), 1984 dst, 1985 ); 1986 dst = &mut dst[size_of::<u32>()..]; 1987 // write universal start anchored state id, u32::MAX if absent 1988 E::write_u32( 1989 self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), 1990 dst, 1991 ); 1992 dst = &mut dst[size_of::<u32>()..]; 1993 // write start IDs 1994 for (sid, _, _) in self.iter() { 1995 E::write_u32(sid.as_u32(), dst); 1996 dst = &mut dst[StateID::SIZE..]; 1997 } 1998 Ok(nwrite) 1999 } 2000 2001 /// Returns the number of bytes the serialized form of this transition 2002 /// table will use. write_to_len(&self) -> usize2003 fn write_to_len(&self) -> usize { 2004 self.kind.write_to_len() 2005 + self.start_map.write_to_len() 2006 + size_of::<u32>() // stride 2007 + size_of::<u32>() // # patterns 2008 + size_of::<u32>() // universal unanchored start 2009 + size_of::<u32>() // universal anchored start 2010 + self.table().len() 2011 } 2012 2013 /// Validates that every starting state ID in this table is valid. 2014 /// 2015 /// That is, every starting state ID can be used to correctly decode a 2016 /// state in the DFA's sparse transitions. 
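    // Editorial recap (not part of the original source): `write_to` and
    // `write_to_len` above serialize the start table as:
    //
    //   StartKind | StartByteMap | u32: stride | u32: pattern count (u32::MAX
    //   when per-pattern starts are absent) | u32: universal unanchored start
    //   (u32::MAX if absent) | u32: universal anchored start (u32::MAX if
    //   absent) | the raw table of u32 start state IDs
    //
    // A sketch of decoding one of the u32::MAX-sentinel fields:
    //
    //     let raw = u32::from_ne_bytes(buf[..4].try_into().unwrap());
    //     let pattern_len =
    //         if raw == u32::MAX { None } else { Some(raw as usize) };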
validate( &self, sp: &Special, seen: &Seen, ) -> Result<(), DeserializeError>2017 fn validate( 2018 &self, 2019 sp: &Special, 2020 seen: &Seen, 2021 ) -> Result<(), DeserializeError> { 2022 for (id, _, _) in self.iter() { 2023 if !seen.contains(&id) { 2024 return Err(DeserializeError::generic( 2025 "found invalid start state ID", 2026 )); 2027 } 2028 if sp.is_match_state(id) { 2029 return Err(DeserializeError::generic( 2030 "start states cannot be match states", 2031 )); 2032 } 2033 } 2034 Ok(()) 2035 } 2036 2037 /// Converts this start list to a borrowed value. as_ref(&self) -> StartTable<&'_ [u8]>2038 fn as_ref(&self) -> StartTable<&'_ [u8]> { 2039 StartTable { 2040 table: self.table(), 2041 kind: self.kind, 2042 start_map: self.start_map.clone(), 2043 stride: self.stride, 2044 pattern_len: self.pattern_len, 2045 universal_start_unanchored: self.universal_start_unanchored, 2046 universal_start_anchored: self.universal_start_anchored, 2047 } 2048 } 2049 2050 /// Converts this start list to an owned value. 2051 #[cfg(feature = "alloc")] to_owned(&self) -> StartTable<alloc::vec::Vec<u8>>2052 fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> { 2053 StartTable { 2054 table: self.table().to_vec(), 2055 kind: self.kind, 2056 start_map: self.start_map.clone(), 2057 stride: self.stride, 2058 pattern_len: self.pattern_len, 2059 universal_start_unanchored: self.universal_start_unanchored, 2060 universal_start_anchored: self.universal_start_anchored, 2061 } 2062 } 2063 2064 /// Return the start state for the given index and pattern ID. If the 2065 /// pattern ID is None, then the corresponding start state for the entire 2066 /// DFA is returned. If the pattern ID is not None, then the corresponding 2067 /// starting state for the given pattern is returned. If this start table 2068 /// does not have individual starting states for each pattern, then this 2069 /// panics. start( &self, anchored: Anchored, start: Start, ) -> Result<StateID, StartError>2070 fn start( 2071 &self, 2072 anchored: Anchored, 2073 start: Start, 2074 ) -> Result<StateID, StartError> { 2075 let start_index = start.as_usize(); 2076 let index = match anchored { 2077 Anchored::No => { 2078 if !self.kind.has_unanchored() { 2079 return Err(StartError::unsupported_anchored(anchored)); 2080 } 2081 start_index 2082 } 2083 Anchored::Yes => { 2084 if !self.kind.has_anchored() { 2085 return Err(StartError::unsupported_anchored(anchored)); 2086 } 2087 self.stride + start_index 2088 } 2089 Anchored::Pattern(pid) => { 2090 let len = match self.pattern_len { 2091 None => { 2092 return Err(StartError::unsupported_anchored(anchored)) 2093 } 2094 Some(len) => len, 2095 }; 2096 if pid.as_usize() >= len { 2097 return Ok(DEAD); 2098 } 2099 (2 * self.stride) 2100 + (self.stride * pid.as_usize()) 2101 + start_index 2102 } 2103 }; 2104 let start = index * StateID::SIZE; 2105 // This OK since we're allowed to assume that the start table contains 2106 // valid StateIDs. 2107 Ok(wire::read_state_id_unchecked(&self.table()[start..]).0) 2108 } 2109 2110 /// Return an iterator over all start IDs in this table. iter(&self) -> StartStateIter<'_, T>2111 fn iter(&self) -> StartStateIter<'_, T> { 2112 StartStateIter { st: self, i: 0 } 2113 } 2114 2115 /// Returns the total number of start state IDs in this table. len(&self) -> usize2116 fn len(&self) -> usize { 2117 self.table().len() / StateID::SIZE 2118 } 2119 2120 /// Returns the table as a raw slice of bytes. 
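    // Editorial note (not part of the original source): `start` above turns
    // an (anchor mode, start configuration) pair into a flat index and then a
    // byte offset:
    //
    //     // Unanchored search:           index = start
    //     // Anchored search (whole DFA): index = stride + start
    //     // Anchored to pattern `pid`:   index = 2*stride + stride*pid + start
    //     let (stride, start, pid) = (4usize, 2usize, 1usize);
    //     let index = 2 * stride + stride * pid + start; // 14
    //     let byte_offset = index * 4; // StateID::SIZE == 4, so offset 56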
table(&self) -> &[u8]2121 fn table(&self) -> &[u8] { 2122 self.table.as_ref() 2123 } 2124 2125 /// Return the memory usage, in bytes, of this start list. 2126 /// 2127 /// This does not include the size of a `StartTable` value itself. memory_usage(&self) -> usize2128 fn memory_usage(&self) -> usize { 2129 self.table().len() 2130 } 2131 } 2132 2133 #[cfg(feature = "dfa-build")] 2134 impl<T: AsMut<[u8]>> StartTable<T> { 2135 /// Set the start state for the given index and pattern. 2136 /// 2137 /// If the pattern ID or state ID are not valid, then this will panic. set_start(&mut self, anchored: Anchored, start: Start, id: StateID)2138 fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) { 2139 let start_index = start.as_usize(); 2140 let index = match anchored { 2141 Anchored::No => start_index, 2142 Anchored::Yes => self.stride + start_index, 2143 Anchored::Pattern(pid) => { 2144 let pid = pid.as_usize(); 2145 let len = self 2146 .pattern_len 2147 .expect("start states for each pattern enabled"); 2148 assert!(pid < len, "invalid pattern ID {:?}", pid); 2149 self.stride 2150 .checked_mul(pid) 2151 .unwrap() 2152 .checked_add(self.stride.checked_mul(2).unwrap()) 2153 .unwrap() 2154 .checked_add(start_index) 2155 .unwrap() 2156 } 2157 }; 2158 let start = index * StateID::SIZE; 2159 let end = start + StateID::SIZE; 2160 wire::write_state_id::<wire::NE>( 2161 id, 2162 &mut self.table.as_mut()[start..end], 2163 ); 2164 } 2165 } 2166 2167 /// An iterator over all state state IDs in a sparse DFA. 2168 struct StartStateIter<'a, T> { 2169 st: &'a StartTable<T>, 2170 i: usize, 2171 } 2172 2173 impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { 2174 type Item = (StateID, Anchored, Start); 2175 next(&mut self) -> Option<(StateID, Anchored, Start)>2176 fn next(&mut self) -> Option<(StateID, Anchored, Start)> { 2177 let i = self.i; 2178 if i >= self.st.len() { 2179 return None; 2180 } 2181 self.i += 1; 2182 2183 // This unwrap is okay since the stride of any DFA must always match 2184 // the number of start state types. 2185 let start_type = Start::from_usize(i % self.st.stride).unwrap(); 2186 let anchored = if i < self.st.stride { 2187 Anchored::No 2188 } else if i < (2 * self.st.stride) { 2189 Anchored::Yes 2190 } else { 2191 let pid = (i - (2 * self.st.stride)) / self.st.stride; 2192 Anchored::Pattern(PatternID::new(pid).unwrap()) 2193 }; 2194 let start = i * StateID::SIZE; 2195 let end = start + StateID::SIZE; 2196 let bytes = self.st.table()[start..end].try_into().unwrap(); 2197 // This is OK since we're allowed to assume that any IDs in this start 2198 // table are correct and valid for this DFA. 2199 let id = StateID::from_ne_bytes_unchecked(bytes); 2200 Some((id, anchored, start_type)) 2201 } 2202 } 2203 2204 impl<'a, T> fmt::Debug for StartStateIter<'a, T> { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result2205 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 2206 f.debug_struct("StartStateIter").field("i", &self.i).finish() 2207 } 2208 } 2209 2210 /// An iterator over all states in a sparse DFA. 2211 /// 2212 /// This iterator yields tuples, where the first element is the state ID and 2213 /// the second element is the state itself. 
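// Editorial note (not part of the original source): the iterator above
// inverts that mapping, recovering the start configuration and anchor mode
// from the flat index alone. For example, with stride == 4 and i == 13:
//
//     let (stride, i) = (4usize, 13usize);
//     let start_type = i % stride;         // configuration 1 within its block
//     assert!(i >= 2 * stride);            // so this is a per-pattern entry
//     let pid = (i - 2 * stride) / stride; // pattern 1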
2214 struct StateIter<'a, T> { 2215 trans: &'a Transitions<T>, 2216 id: usize, 2217 } 2218 2219 impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> { 2220 type Item = State<'a>; 2221 next(&mut self) -> Option<State<'a>>2222 fn next(&mut self) -> Option<State<'a>> { 2223 if self.id >= self.trans.sparse().len() { 2224 return None; 2225 } 2226 let state = self.trans.state(StateID::new_unchecked(self.id)); 2227 self.id = self.id + state.write_to_len(); 2228 Some(state) 2229 } 2230 } 2231 2232 impl<'a, T> fmt::Debug for StateIter<'a, T> { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result2233 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 2234 f.debug_struct("StateIter").field("id", &self.id).finish() 2235 } 2236 } 2237 2238 /// A representation of a sparse DFA state that can be cheaply materialized 2239 /// from a state identifier. 2240 #[derive(Clone)] 2241 struct State<'a> { 2242 /// The identifier of this state. 2243 id: StateID, 2244 /// Whether this is a match state or not. 2245 is_match: bool, 2246 /// The number of transitions in this state. 2247 ntrans: usize, 2248 /// Pairs of input ranges, where there is one pair for each transition. 2249 /// Each pair specifies an inclusive start and end byte range for the 2250 /// corresponding transition. 2251 input_ranges: &'a [u8], 2252 /// Transitions to the next state. This slice contains native endian 2253 /// encoded state identifiers, with `S` as the representation. Thus, there 2254 /// are `ntrans * size_of::<S>()` bytes in this slice. 2255 next: &'a [u8], 2256 /// If this is a match state, then this contains the pattern IDs that match 2257 /// when the DFA is in this state. 2258 /// 2259 /// This is a contiguous sequence of 32-bit native endian encoded integers. 2260 pattern_ids: &'a [u8], 2261 /// An accelerator for this state, if present. If this state has no 2262 /// accelerator, then this is an empty slice. When non-empty, this slice 2263 /// has length at most 3 and corresponds to the exhaustive set of bytes 2264 /// that must be seen in order to transition out of this state. 2265 accel: &'a [u8], 2266 } 2267 2268 impl<'a> State<'a> { 2269 /// Searches for the next transition given an input byte. If no such 2270 /// transition could be found, then a dead state is returned. 2271 /// 2272 /// This is marked as inline to help dramatically boost sparse searching, 2273 /// which decodes each state it enters to follow the next transition. 2274 #[cfg_attr(feature = "perf-inline", inline(always))] next(&self, input: u8) -> StateID2275 fn next(&self, input: u8) -> StateID { 2276 // This straight linear search was observed to be much better than 2277 // binary search on ASCII haystacks, likely because a binary search 2278 // visits the ASCII case last but a linear search sees it first. A 2279 // binary search does do a little better on non-ASCII haystacks, but 2280 // not by much. There might be a better trade off lurking here. 2281 for i in 0..(self.ntrans - 1) { 2282 let (start, end) = self.range(i); 2283 if start <= input && input <= end { 2284 return self.next_at(i); 2285 } 2286 // We could bail early with an extra branch: if input < b1, then 2287 // we know we'll never find a matching transition. Interestingly, 2288 // this extra branch seems to not help performance, or will even 2289 // hurt it. It's likely very dependent on the DFA itself and what 2290 // is being searched. 2291 } 2292 DEAD 2293 } 2294 2295 /// Returns the next state ID for the special EOI transition. 
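    // Editorial sketch (not part of the original source) of the linear range
    // scan performed by `next` above. The last transition (index ntrans - 1)
    // is the special EOI transition and is never taken on a byte of input:
    //
    //     fn find_transition(
    //         ranges: &[u8],
    //         ntrans: usize,
    //         input: u8,
    //     ) -> Option<usize> {
    //         for i in 0..ntrans - 1 {
    //             let (start, end) = (ranges[i * 2], ranges[i * 2 + 1]);
    //             if start <= input && input <= end {
    //                 return Some(i);
    //             }
    //         }
    //         None // the caller maps "no transition" to DEAD
    //     }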
next_eoi(&self) -> StateID2296 fn next_eoi(&self) -> StateID { 2297 self.next_at(self.ntrans - 1) 2298 } 2299 2300 /// Returns the identifier for this state. id(&self) -> StateID2301 fn id(&self) -> StateID { 2302 self.id 2303 } 2304 2305 /// Returns the inclusive input byte range for the ith transition in this 2306 /// state. range(&self, i: usize) -> (u8, u8)2307 fn range(&self, i: usize) -> (u8, u8) { 2308 (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) 2309 } 2310 2311 /// Returns the next state for the ith transition in this state. next_at(&self, i: usize) -> StateID2312 fn next_at(&self, i: usize) -> StateID { 2313 let start = i * StateID::SIZE; 2314 let end = start + StateID::SIZE; 2315 let bytes = self.next[start..end].try_into().unwrap(); 2316 StateID::from_ne_bytes_unchecked(bytes) 2317 } 2318 2319 /// Returns the pattern ID for the given match index. If the match index 2320 /// is invalid, then this panics. pattern_id(&self, match_index: usize) -> PatternID2321 fn pattern_id(&self, match_index: usize) -> PatternID { 2322 let start = match_index * PatternID::SIZE; 2323 wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 2324 } 2325 2326 /// Returns the total number of pattern IDs for this state. This is always 2327 /// zero when `is_match` is false. pattern_len(&self) -> usize2328 fn pattern_len(&self) -> usize { 2329 assert_eq!(0, self.pattern_ids.len() % 4); 2330 self.pattern_ids.len() / 4 2331 } 2332 2333 /// Return an accelerator for this state. accelerator(&self) -> &'a [u8]2334 fn accelerator(&self) -> &'a [u8] { 2335 self.accel 2336 } 2337 2338 /// Write the raw representation of this state to the given buffer using 2339 /// the given endianness. write_to<E: Endian>( &self, mut dst: &mut [u8], ) -> Result<usize, SerializeError>2340 fn write_to<E: Endian>( 2341 &self, 2342 mut dst: &mut [u8], 2343 ) -> Result<usize, SerializeError> { 2344 let nwrite = self.write_to_len(); 2345 if dst.len() < nwrite { 2346 return Err(SerializeError::buffer_too_small( 2347 "sparse state transitions", 2348 )); 2349 } 2350 2351 let ntrans = 2352 if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans }; 2353 E::write_u16(u16::try_from(ntrans).unwrap(), dst); 2354 dst = &mut dst[size_of::<u16>()..]; 2355 2356 dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges); 2357 dst = &mut dst[self.input_ranges.len()..]; 2358 2359 for i in 0..self.ntrans { 2360 E::write_u32(self.next_at(i).as_u32(), dst); 2361 dst = &mut dst[StateID::SIZE..]; 2362 } 2363 2364 if self.is_match { 2365 E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst); 2366 dst = &mut dst[size_of::<u32>()..]; 2367 for i in 0..self.pattern_len() { 2368 let pid = self.pattern_id(i); 2369 E::write_u32(pid.as_u32(), dst); 2370 dst = &mut dst[PatternID::SIZE..]; 2371 } 2372 } 2373 2374 dst[0] = u8::try_from(self.accel.len()).unwrap(); 2375 dst[1..][..self.accel.len()].copy_from_slice(self.accel); 2376 2377 Ok(nwrite) 2378 } 2379 2380 /// Return the total number of bytes that this state consumes in its 2381 /// encoded form. 
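    // Editorial example (not part of the original source): the size computed
    // below follows directly from the encoding. A match state with 3
    // transitions, 2 pattern IDs and a 1-byte accelerator takes
    //
    //     2            // u16 header
    //     + 3 * 2      // input ranges
    //     + 3 * 4      // next state IDs (StateID::SIZE == 4)
    //     + 4 + 2 * 4  // pattern count + pattern IDs
    //     + 1 + 1      // accelerator length byte + accelerator bytes
    //     = 34 bytes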
write_to_len(&self) -> usize2382 fn write_to_len(&self) -> usize { 2383 let mut len = 2 2384 + (self.ntrans * 2) 2385 + (self.ntrans * StateID::SIZE) 2386 + (1 + self.accel.len()); 2387 if self.is_match { 2388 len += size_of::<u32>() + self.pattern_ids.len(); 2389 } 2390 len 2391 } 2392 } 2393 2394 impl<'a> fmt::Debug for State<'a> { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result2395 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 2396 let mut printed = false; 2397 for i in 0..(self.ntrans - 1) { 2398 let next = self.next_at(i); 2399 if next == DEAD { 2400 continue; 2401 } 2402 2403 if printed { 2404 write!(f, ", ")?; 2405 } 2406 let (start, end) = self.range(i); 2407 if start == end { 2408 write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?; 2409 } else { 2410 write!( 2411 f, 2412 "{:?}-{:?} => {:?}", 2413 DebugByte(start), 2414 DebugByte(end), 2415 next.as_usize(), 2416 )?; 2417 } 2418 printed = true; 2419 } 2420 let eoi = self.next_at(self.ntrans - 1); 2421 if eoi != DEAD { 2422 if printed { 2423 write!(f, ", ")?; 2424 } 2425 write!(f, "EOI => {:?}", eoi.as_usize())?; 2426 } 2427 Ok(()) 2428 } 2429 } 2430 2431 /// A representation of a mutable sparse DFA state that can be cheaply 2432 /// materialized from a state identifier. 2433 #[cfg(feature = "dfa-build")] 2434 struct StateMut<'a> { 2435 /// The identifier of this state. 2436 id: StateID, 2437 /// Whether this is a match state or not. 2438 is_match: bool, 2439 /// The number of transitions in this state. 2440 ntrans: usize, 2441 /// Pairs of input ranges, where there is one pair for each transition. 2442 /// Each pair specifies an inclusive start and end byte range for the 2443 /// corresponding transition. 2444 input_ranges: &'a mut [u8], 2445 /// Transitions to the next state. This slice contains native endian 2446 /// encoded state identifiers, with `S` as the representation. Thus, there 2447 /// are `ntrans * size_of::<S>()` bytes in this slice. 2448 next: &'a mut [u8], 2449 /// If this is a match state, then this contains the pattern IDs that match 2450 /// when the DFA is in this state. 2451 /// 2452 /// This is a contiguous sequence of 32-bit native endian encoded integers. 2453 pattern_ids: &'a [u8], 2454 /// An accelerator for this state, if present. If this state has no 2455 /// accelerator, then this is an empty slice. When non-empty, this slice 2456 /// has length at most 3 and corresponds to the exhaustive set of bytes 2457 /// that must be seen in order to transition out of this state. 2458 accel: &'a mut [u8], 2459 } 2460 2461 #[cfg(feature = "dfa-build")] 2462 impl<'a> StateMut<'a> { 2463 /// Sets the ith transition to the given state. 
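    // Editorial sketch (not part of the original source): patching the ith
    // transition writes a native-endian state ID over 4 bytes in place, which
    // is what the real code does via `wire::write_state_id::<wire::NE>`:
    //
    //     fn patch_next(next: &mut [u8], i: usize, id: u32) {
    //         let start = i * 4; // StateID::SIZE
    //         next[start..start + 4].copy_from_slice(&id.to_ne_bytes());
    //     }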
set_next_at(&mut self, i: usize, next: StateID)2464 fn set_next_at(&mut self, i: usize, next: StateID) { 2465 let start = i * StateID::SIZE; 2466 let end = start + StateID::SIZE; 2467 wire::write_state_id::<wire::NE>(next, &mut self.next[start..end]); 2468 } 2469 } 2470 2471 #[cfg(feature = "dfa-build")] 2472 impl<'a> fmt::Debug for StateMut<'a> { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result2473 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 2474 let state = State { 2475 id: self.id, 2476 is_match: self.is_match, 2477 ntrans: self.ntrans, 2478 input_ranges: self.input_ranges, 2479 next: self.next, 2480 pattern_ids: self.pattern_ids, 2481 accel: self.accel, 2482 }; 2483 fmt::Debug::fmt(&state, f) 2484 } 2485 } 2486 2487 // In order to validate everything, we not only need to make sure we 2488 // can decode every state, but that every transition in every state 2489 // points to a valid state. There are many duplicative transitions, so 2490 // we record state IDs that we've verified so that we don't redo the 2491 // decoding work. 2492 // 2493 // Except, when in no_std mode, we don't have dynamic memory allocation 2494 // available to us, so we skip this optimization. It's not clear 2495 // whether doing something more clever is worth it just yet. If you're 2496 // profiling this code and need it to run faster, please file an issue. 2497 // 2498 // OK, so we also use this to record the set of valid state IDs. Since 2499 // it is possible for a transition to point to an invalid state ID that 2500 // still (somehow) deserializes to a valid state. So we need to make 2501 // sure our transitions are limited to actually correct state IDs. 2502 // The problem is, I'm not sure how to do this verification step in 2503 // no-std no-alloc mode. I think we'd *have* to store the set of valid 2504 // state IDs in the DFA itself. For now, we don't do this verification 2505 // in no-std no-alloc mode. The worst thing that can happen is an 2506 // incorrect result. But no panics or memory safety problems should 2507 // result. Because we still do validate that the state itself is 2508 // "valid" in the sense that everything it points to actually exists. 2509 // 2510 // ---AG 2511 #[derive(Debug)] 2512 struct Seen { 2513 #[cfg(feature = "alloc")] 2514 set: alloc::collections::BTreeSet<StateID>, 2515 #[cfg(not(feature = "alloc"))] 2516 set: core::marker::PhantomData<StateID>, 2517 } 2518 2519 #[cfg(feature = "alloc")] 2520 impl Seen { new() -> Seen2521 fn new() -> Seen { 2522 Seen { set: alloc::collections::BTreeSet::new() } 2523 } insert(&mut self, id: StateID)2524 fn insert(&mut self, id: StateID) { 2525 self.set.insert(id); 2526 } contains(&self, id: &StateID) -> bool2527 fn contains(&self, id: &StateID) -> bool { 2528 self.set.contains(id) 2529 } 2530 } 2531 2532 #[cfg(not(feature = "alloc"))] 2533 impl Seen { new() -> Seen2534 fn new() -> Seen { 2535 Seen { set: core::marker::PhantomData } 2536 } insert(&mut self, _id: StateID)2537 fn insert(&mut self, _id: StateID) {} contains(&self, _id: &StateID) -> bool2538 fn contains(&self, _id: &StateID) -> bool { 2539 true 2540 } 2541 } 2542 2543 /* 2544 /// A binary search routine specialized specifically to a sparse DFA state's 2545 /// transitions. Specifically, the transitions are defined as a set of pairs 2546 /// of input bytes that delineate an inclusive range of bytes. If the input 2547 /// byte is in the range, then the corresponding transition is a match. 
2548 /// 2549 /// This binary search accepts a slice of these pairs and returns the position 2550 /// of the matching pair (the ith transition), or None if no matching pair 2551 /// could be found. 2552 /// 2553 /// Note that this routine is not currently used since it was observed to 2554 /// either decrease performance when searching ASCII, or did not provide enough 2555 /// of a boost on non-ASCII haystacks to be worth it. However, we leave it here 2556 /// for posterity in case we can find a way to use it. 2557 /// 2558 /// In theory, we could use the standard library's search routine if we could 2559 /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently 2560 /// guaranteed to be safe and is thus UB (since I don't think the in-memory 2561 /// representation of `(u8, u8)` has been nailed down). One could define a 2562 /// repr(C) type, but the casting doesn't seem justified. 2563 #[cfg_attr(feature = "perf-inline", inline(always))] 2564 fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { 2565 debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); 2566 debug_assert!(ranges.len() <= 512, "ranges should be short"); 2567 2568 let (mut left, mut right) = (0, ranges.len() / 2); 2569 while left < right { 2570 let mid = (left + right) / 2; 2571 let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); 2572 if needle < b1 { 2573 right = mid; 2574 } else if needle > b2 { 2575 left = mid + 1; 2576 } else { 2577 return Some(mid); 2578 } 2579 } 2580 None 2581 } 2582 */ 2583 2584 #[cfg(all(test, feature = "syntax", feature = "dfa-build"))] 2585 mod tests { 2586 use crate::{ 2587 dfa::{dense::DFA, Automaton}, 2588 nfa::thompson, 2589 Input, MatchError, 2590 }; 2591 2592 // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs. 2593 #[test] heuristic_unicode_forward()2594 fn heuristic_unicode_forward() { 2595 let dfa = DFA::builder() 2596 .configure(DFA::config().unicode_word_boundary(true)) 2597 .thompson(thompson::Config::new().reverse(true)) 2598 .build(r"\b[0-9]+\b") 2599 .unwrap() 2600 .to_sparse() 2601 .unwrap(); 2602 2603 let input = Input::new("β123").range(2..); 2604 let expected = MatchError::quit(0xB2, 1); 2605 let got = dfa.try_search_fwd(&input); 2606 assert_eq!(Err(expected), got); 2607 2608 let input = Input::new("123β").range(..3); 2609 let expected = MatchError::quit(0xCE, 3); 2610 let got = dfa.try_search_fwd(&input); 2611 assert_eq!(Err(expected), got); 2612 } 2613 2614 // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs. 2615 #[test] heuristic_unicode_reverse()2616 fn heuristic_unicode_reverse() { 2617 let dfa = DFA::builder() 2618 .configure(DFA::config().unicode_word_boundary(true)) 2619 .thompson(thompson::Config::new().reverse(true)) 2620 .build(r"\b[0-9]+\b") 2621 .unwrap() 2622 .to_sparse() 2623 .unwrap(); 2624 2625 let input = Input::new("β123").range(2..); 2626 let expected = MatchError::quit(0xB2, 1); 2627 let got = dfa.try_search_rev(&input); 2628 assert_eq!(Err(expected), got); 2629 2630 let input = Input::new("123β").range(..3); 2631 let expected = MatchError::quit(0xCE, 3); 2632 let got = dfa.try_search_rev(&input); 2633 assert_eq!(Err(expected), got); 2634 } 2635 } 2636
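// Editorial usage sketch (not part of the original source) for the
// commented-out `binary_search_ranges` routine above. The routine relies on a
// state's range pairs being non-overlapping and sorted by byte value, so a
// search over two transitions covering '0'-'9' and 'a'-'z' would behave like
// this:
//
//     let ranges = &[b'0', b'9', b'a', b'z'];
//     assert_eq!(Some(0), binary_search_ranges(ranges, b'5'));
//     assert_eq!(Some(1), binary_search_ranges(ranges, b'q'));
//     assert_eq!(None, binary_search_ranges(ranges, b'!'));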