1 /*! 2 Provides helpers for dealing with start state configurations in DFAs. 3 */ 4 5 use crate::util::{ 6 look::LookMatcher, 7 search::{Anchored, Input}, 8 wire::{self, DeserializeError, SerializeError}, 9 }; 10 11 /// The configuration used to determine a DFA's start state for a search. 12 /// 13 /// A DFA has a single starting state in the typical textbook description. That 14 /// is, it corresponds to the set of all starting states for the NFA that built 15 /// it, along with their espsilon closures. In this crate, however, DFAs have 16 /// many possible start states due to a few factors: 17 /// 18 /// * DFAs support the ability to run either anchored or unanchored searches. 19 /// Each type of search needs its own start state. For example, an unanchored 20 /// search requires starting at a state corresponding to a regex with a 21 /// `(?s-u:.)*?` prefix, which will match through anything. 22 /// * DFAs also optionally support starting an anchored search for any one 23 /// specific pattern. Each such pattern requires its own start state. 24 /// * If a look-behind assertion like `^` or `\b` is used in the regex, then 25 /// the DFA will need to inspect a single byte immediately before the start of 26 /// the search to choose the correct start state. 27 /// 28 /// Indeed, this configuration precisely encapsulates all of the above factors. 29 /// The [`Config::anchored`] method sets which kind of anchored search to 30 /// perform while the [`Config::look_behind`] method provides a way to set 31 /// the byte that occurs immediately before the start of the search. 32 /// 33 /// Generally speaking, this type is only useful when you want to run searches 34 /// without using an [`Input`]. In particular, an `Input` wants a haystack 35 /// slice, but callers may not have a contiguous sequence of bytes as a 36 /// haystack in all cases. This type provides a lower level of control such 37 /// that callers can provide their own anchored configuration and look-behind 38 /// byte explicitly. 39 /// 40 /// # Example 41 /// 42 /// This shows basic usage that permits running a search with a DFA without 43 /// using the `Input` abstraction. 44 /// 45 /// ``` 46 /// use regex_automata::{ 47 /// dfa::{Automaton, dense}, 48 /// util::start, 49 /// Anchored, 50 /// }; 51 /// 52 /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; 53 /// let haystack = "quartz"; 54 /// 55 /// let config = start::Config::new().anchored(Anchored::Yes); 56 /// let mut state = dfa.start_state(&config)?; 57 /// for &b in haystack.as_bytes().iter() { 58 /// state = dfa.next_state(state, b); 59 /// } 60 /// state = dfa.next_eoi_state(state); 61 /// assert!(dfa.is_match_state(state)); 62 /// 63 /// # Ok::<(), Box<dyn std::error::Error>>(()) 64 /// ``` 65 /// 66 /// This example shows how to correctly run a search that doesn't begin at 67 /// the start of a haystack. Notice how we set the look-behind byte, and as 68 /// a result, the `\b` assertion does not match. 69 /// 70 /// ``` 71 /// use regex_automata::{ 72 /// dfa::{Automaton, dense}, 73 /// util::start, 74 /// Anchored, 75 /// }; 76 /// 77 /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; 78 /// let haystack = "quartz"; 79 /// 80 /// let config = start::Config::new() 81 /// .anchored(Anchored::Yes) 82 /// .look_behind(Some(b'q')); 83 /// let mut state = dfa.start_state(&config)?; 84 /// for &b in haystack.as_bytes().iter().skip(1) { 85 /// state = dfa.next_state(state, b); 86 /// } 87 /// state = dfa.next_eoi_state(state); 88 /// // No match! 89 /// assert!(!dfa.is_match_state(state)); 90 /// 91 /// # Ok::<(), Box<dyn std::error::Error>>(()) 92 /// ``` 93 /// 94 /// If we had instead not set a look-behind byte, then the DFA would assume 95 /// that it was starting at the beginning of the haystack, and thus `\b` should 96 /// match. This in turn would result in erroneously reporting a match: 97 /// 98 /// ``` 99 /// use regex_automata::{ 100 /// dfa::{Automaton, dense}, 101 /// util::start, 102 /// Anchored, 103 /// }; 104 /// 105 /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; 106 /// let haystack = "quartz"; 107 /// 108 /// // Whoops, forgot the look-behind byte... 109 /// let config = start::Config::new().anchored(Anchored::Yes); 110 /// let mut state = dfa.start_state(&config)?; 111 /// for &b in haystack.as_bytes().iter().skip(1) { 112 /// state = dfa.next_state(state, b); 113 /// } 114 /// state = dfa.next_eoi_state(state); 115 /// // And now we get a match unexpectedly. 116 /// assert!(dfa.is_match_state(state)); 117 /// 118 /// # Ok::<(), Box<dyn std::error::Error>>(()) 119 /// ``` 120 #[derive(Clone, Debug)] 121 pub struct Config { 122 look_behind: Option<u8>, 123 anchored: Anchored, 124 } 125 126 impl Config { 127 /// Create a new default start configuration. 128 /// 129 /// The default is an unanchored search that starts at the beginning of the 130 /// haystack. new() -> Config131 pub fn new() -> Config { 132 Config { anchored: Anchored::No, look_behind: None } 133 } 134 135 /// A convenience routine for building a start configuration from an 136 /// [`Input`] for a forward search. 137 /// 138 /// This automatically sets the look-behind byte to the byte immediately 139 /// preceding the start of the search. If the start of the search is at 140 /// offset `0`, then no look-behind byte is set. from_input_forward(input: &Input<'_>) -> Config141 pub fn from_input_forward(input: &Input<'_>) -> Config { 142 let look_behind = input 143 .start() 144 .checked_sub(1) 145 .and_then(|i| input.haystack().get(i).copied()); 146 Config { look_behind, anchored: input.get_anchored() } 147 } 148 149 /// A convenience routine for building a start configuration from an 150 /// [`Input`] for a reverse search. 151 /// 152 /// This automatically sets the look-behind byte to the byte immediately 153 /// following the end of the search. If the end of the search is at 154 /// offset `haystack.len()`, then no look-behind byte is set. from_input_reverse(input: &Input<'_>) -> Config155 pub fn from_input_reverse(input: &Input<'_>) -> Config { 156 let look_behind = input.haystack().get(input.end()).copied(); 157 Config { look_behind, anchored: input.get_anchored() } 158 } 159 160 /// Set the look-behind byte at the start of a search. 161 /// 162 /// Unless the search is intended to logically start at the beginning of a 163 /// haystack, this should _always_ be set to the byte immediately preceding 164 /// the start of the search. If no look-behind byte is set, then the start 165 /// configuration will assume it is at the beginning of the haystack. For 166 /// example, the anchor `^` will match. 167 /// 168 /// The default is that no look-behind byte is set. look_behind(mut self, byte: Option<u8>) -> Config169 pub fn look_behind(mut self, byte: Option<u8>) -> Config { 170 self.look_behind = byte; 171 self 172 } 173 174 /// Set the anchored mode of a search. 175 /// 176 /// The default is an unanchored search. anchored(mut self, mode: Anchored) -> Config177 pub fn anchored(mut self, mode: Anchored) -> Config { 178 self.anchored = mode; 179 self 180 } 181 182 /// Return the look-behind byte in this configuration, if one exists. get_look_behind(&self) -> Option<u8>183 pub fn get_look_behind(&self) -> Option<u8> { 184 self.look_behind 185 } 186 187 /// Return the anchored mode in this configuration. get_anchored(&self) -> Anchored188 pub fn get_anchored(&self) -> Anchored { 189 self.anchored 190 } 191 } 192 193 /// A map from every possible byte value to its corresponding starting 194 /// configuration. 195 /// 196 /// This map is used in order to lookup the start configuration for a particular 197 /// position in a haystack. This start configuration is then used in 198 /// combination with things like the anchored mode and pattern ID to fully 199 /// determine the start state. 200 /// 201 /// Generally speaking, this map is only used for fully compiled DFAs and lazy 202 /// DFAs. For NFAs (including the one-pass DFA), the start state is generally 203 /// selected by virtue of traversing the NFA state graph. DFAs do the same 204 /// thing, but at build time and not search time. (Well, technically the lazy 205 /// DFA does it at search time, but it does enough work to cache the full 206 /// result of the epsilon closure that the NFA engines tend to need to do.) 207 #[derive(Clone)] 208 pub(crate) struct StartByteMap { 209 map: [Start; 256], 210 } 211 212 impl StartByteMap { 213 /// Create a new map from byte values to their corresponding starting 214 /// configurations. The map is determined, in part, by how look-around 215 /// assertions are matched via the matcher given. new(lookm: &LookMatcher) -> StartByteMap216 pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap { 217 let mut map = [Start::NonWordByte; 256]; 218 map[usize::from(b'\n')] = Start::LineLF; 219 map[usize::from(b'\r')] = Start::LineCR; 220 map[usize::from(b'_')] = Start::WordByte; 221 222 let mut byte = b'0'; 223 while byte <= b'9' { 224 map[usize::from(byte)] = Start::WordByte; 225 byte += 1; 226 } 227 byte = b'A'; 228 while byte <= b'Z' { 229 map[usize::from(byte)] = Start::WordByte; 230 byte += 1; 231 } 232 byte = b'a'; 233 while byte <= b'z' { 234 map[usize::from(byte)] = Start::WordByte; 235 byte += 1; 236 } 237 238 let lineterm = lookm.get_line_terminator(); 239 // If our line terminator is normal, then it is already handled by 240 // the LineLF and LineCR configurations. But if it's weird, then we 241 // overwrite whatever was there before for that terminator with a 242 // special configuration. The trick here is that if the terminator 243 // is, say, a word byte like `a`, then callers seeing this start 244 // configuration need to account for that and build their DFA state as 245 // if it *also* came from a word byte. 246 if lineterm != b'\r' && lineterm != b'\n' { 247 map[usize::from(lineterm)] = Start::CustomLineTerminator; 248 } 249 StartByteMap { map } 250 } 251 252 /// Return the starting configuration for the given look-behind byte. 253 /// 254 /// If no look-behind exists, callers should use `Start::Text`. 255 #[cfg_attr(feature = "perf-inline", inline(always))] get(&self, byte: u8) -> Start256 pub(crate) fn get(&self, byte: u8) -> Start { 257 self.map[usize::from(byte)] 258 } 259 260 /// Deserializes a byte class map from the given slice. If the slice is of 261 /// insufficient length or otherwise contains an impossible mapping, then 262 /// an error is returned. Upon success, the number of bytes read along with 263 /// the map are returned. The number of bytes read is always a multiple of 264 /// 8. from_bytes( slice: &[u8], ) -> Result<(StartByteMap, usize), DeserializeError>265 pub(crate) fn from_bytes( 266 slice: &[u8], 267 ) -> Result<(StartByteMap, usize), DeserializeError> { 268 wire::check_slice_len(slice, 256, "start byte map")?; 269 let mut map = [Start::NonWordByte; 256]; 270 for (i, &repr) in slice[..256].iter().enumerate() { 271 map[i] = match Start::from_usize(usize::from(repr)) { 272 Some(start) => start, 273 None => { 274 return Err(DeserializeError::generic( 275 "found invalid starting configuration", 276 )) 277 } 278 }; 279 } 280 Ok((StartByteMap { map }, 256)) 281 } 282 283 /// Writes this map to the given byte buffer. if the given buffer is too 284 /// small, then an error is returned. Upon success, the total number of 285 /// bytes written is returned. The number of bytes written is guaranteed to 286 /// be a multiple of 8. write_to( &self, dst: &mut [u8], ) -> Result<usize, SerializeError>287 pub(crate) fn write_to( 288 &self, 289 dst: &mut [u8], 290 ) -> Result<usize, SerializeError> { 291 let nwrite = self.write_to_len(); 292 if dst.len() < nwrite { 293 return Err(SerializeError::buffer_too_small("start byte map")); 294 } 295 for (i, &start) in self.map.iter().enumerate() { 296 dst[i] = start.as_u8(); 297 } 298 Ok(nwrite) 299 } 300 301 /// Returns the total number of bytes written by `write_to`. write_to_len(&self) -> usize302 pub(crate) fn write_to_len(&self) -> usize { 303 256 304 } 305 } 306 307 impl core::fmt::Debug for StartByteMap { fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result308 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 309 use crate::util::escape::DebugByte; 310 311 write!(f, "StartByteMap{{")?; 312 for byte in 0..=255 { 313 if byte > 0 { 314 write!(f, ", ")?; 315 } 316 let start = self.map[usize::from(byte)]; 317 write!(f, "{:?} => {:?}", DebugByte(byte), start)?; 318 } 319 write!(f, "}}")?; 320 Ok(()) 321 } 322 } 323 324 /// Represents the six possible starting configurations of a DFA search. 325 /// 326 /// The starting configuration is determined by inspecting the the beginning 327 /// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID 328 /// (if specified) and the type of search (anchored or not) is what selects the 329 /// start state to use in a DFA. 330 /// 331 /// As one example, if a DFA only supports unanchored searches and does not 332 /// support anchored searches for each pattern, then it will have at most 6 333 /// distinct start states. (Some start states may be reused if determinization 334 /// can determine that they will be equivalent.) If the DFA supports both 335 /// anchored and unanchored searches, then it will have a maximum of 12 336 /// distinct start states. Finally, if the DFA also supports anchored searches 337 /// for each pattern, then it can have up to `12 + (N * 6)` start states, where 338 /// `N` is the number of patterns. 339 /// 340 /// Handling each of these starting configurations in the context of DFA 341 /// determinization can be *quite* tricky and subtle. But the code is small 342 /// and can be found at `crate::util::determinize::set_lookbehind_from_start`. 343 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 344 pub(crate) enum Start { 345 /// This occurs when the starting position is not any of the ones below. 346 NonWordByte = 0, 347 /// This occurs when the byte immediately preceding the start of the search 348 /// is an ASCII word byte. 349 WordByte = 1, 350 /// This occurs when the starting position of the search corresponds to the 351 /// beginning of the haystack. 352 Text = 2, 353 /// This occurs when the byte immediately preceding the start of the search 354 /// is a line terminator. Specifically, `\n`. 355 LineLF = 3, 356 /// This occurs when the byte immediately preceding the start of the search 357 /// is a line terminator. Specifically, `\r`. 358 LineCR = 4, 359 /// This occurs when a custom line terminator has been set via a 360 /// `LookMatcher`, and when that line terminator is neither a `\r` or a 361 /// `\n`. 362 /// 363 /// If the custom line terminator is a word byte, then this start 364 /// configuration is still selected. DFAs that implement word boundary 365 /// assertions will likely need to check whether the custom line terminator 366 /// is a word byte, in which case, it should behave as if the byte 367 /// satisfies `\b` in addition to multi-line anchors. 368 CustomLineTerminator = 5, 369 } 370 371 impl Start { 372 /// Return the starting state corresponding to the given integer. If no 373 /// starting state exists for the given integer, then None is returned. from_usize(n: usize) -> Option<Start>374 pub(crate) fn from_usize(n: usize) -> Option<Start> { 375 match n { 376 0 => Some(Start::NonWordByte), 377 1 => Some(Start::WordByte), 378 2 => Some(Start::Text), 379 3 => Some(Start::LineLF), 380 4 => Some(Start::LineCR), 381 5 => Some(Start::CustomLineTerminator), 382 _ => None, 383 } 384 } 385 386 /// Returns the total number of starting state configurations. len() -> usize387 pub(crate) fn len() -> usize { 388 6 389 } 390 391 /// Return this starting configuration as `u8` integer. It is guaranteed to 392 /// be less than `Start::len()`. 393 #[cfg_attr(feature = "perf-inline", inline(always))] as_u8(&self) -> u8394 pub(crate) fn as_u8(&self) -> u8 { 395 // AFAIK, 'as' is the only way to zero-cost convert an int enum to an 396 // actual int. 397 *self as u8 398 } 399 400 /// Return this starting configuration as a `usize` integer. It is 401 /// guaranteed to be less than `Start::len()`. 402 #[cfg_attr(feature = "perf-inline", inline(always))] as_usize(&self) -> usize403 pub(crate) fn as_usize(&self) -> usize { 404 usize::from(self.as_u8()) 405 } 406 } 407 408 #[cfg(test)] 409 mod tests { 410 use super::*; 411 412 #[test] start_fwd_done_range()413 fn start_fwd_done_range() { 414 let smap = StartByteMap::new(&LookMatcher::default()); 415 let input = Input::new("").range(1..0); 416 let config = Config::from_input_forward(&input); 417 let start = 418 config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); 419 assert_eq!(Start::Text, start); 420 } 421 422 #[test] start_rev_done_range()423 fn start_rev_done_range() { 424 let smap = StartByteMap::new(&LookMatcher::default()); 425 let input = Input::new("").range(1..0); 426 let config = Config::from_input_reverse(&input); 427 let start = 428 config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); 429 assert_eq!(Start::Text, start); 430 } 431 432 #[test] start_fwd()433 fn start_fwd() { 434 let f = |haystack, start, end| { 435 let smap = StartByteMap::new(&LookMatcher::default()); 436 let input = Input::new(haystack).range(start..end); 437 let config = Config::from_input_forward(&input); 438 let start = 439 config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); 440 start 441 }; 442 443 assert_eq!(Start::Text, f("", 0, 0)); 444 assert_eq!(Start::Text, f("abc", 0, 3)); 445 assert_eq!(Start::Text, f("\nabc", 0, 3)); 446 447 assert_eq!(Start::LineLF, f("\nabc", 1, 3)); 448 449 assert_eq!(Start::LineCR, f("\rabc", 1, 3)); 450 451 assert_eq!(Start::WordByte, f("abc", 1, 3)); 452 453 assert_eq!(Start::NonWordByte, f(" abc", 1, 3)); 454 } 455 456 #[test] start_rev()457 fn start_rev() { 458 let f = |haystack, start, end| { 459 let smap = StartByteMap::new(&LookMatcher::default()); 460 let input = Input::new(haystack).range(start..end); 461 let config = Config::from_input_reverse(&input); 462 let start = 463 config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); 464 start 465 }; 466 467 assert_eq!(Start::Text, f("", 0, 0)); 468 assert_eq!(Start::Text, f("abc", 0, 3)); 469 assert_eq!(Start::Text, f("abc\n", 0, 4)); 470 471 assert_eq!(Start::LineLF, f("abc\nz", 0, 3)); 472 473 assert_eq!(Start::LineCR, f("abc\rz", 0, 3)); 474 475 assert_eq!(Start::WordByte, f("abc", 0, 2)); 476 477 assert_eq!(Start::NonWordByte, f("abc ", 0, 3)); 478 } 479 } 480