1 use std::collections::HashMap; 2 use std::mem; 3 use std::rc::Rc; 4 5 use dense; 6 use error::Result; 7 use nfa::{self, NFA}; 8 use sparse_set::SparseSet; 9 use state_id::{dead_id, StateID}; 10 11 type DFARepr<S> = dense::Repr<Vec<S>, S>; 12 13 /// A determinizer converts an NFA to a DFA. 14 /// 15 /// This determinizer follows the typical powerset construction, where each 16 /// DFA state is comprised of one or more NFA states. In the worst case, there 17 /// is one DFA state for every possible combination of NFA states. In practice, 18 /// this only happens in certain conditions, typically when there are bounded 19 /// repetitions. 20 /// 21 /// The type variable `S` refers to the chosen state identifier representation 22 /// used for the DFA. 23 /// 24 /// The lifetime variable `'a` refers to the lifetime of the NFA being 25 /// converted to a DFA. 26 #[derive(Debug)] 27 pub(crate) struct Determinizer<'a, S: StateID> { 28 /// The NFA we're converting into a DFA. 29 nfa: &'a NFA, 30 /// The DFA we're building. 31 dfa: DFARepr<S>, 32 /// Each DFA state being built is defined as an *ordered* set of NFA 33 /// states, along with a flag indicating whether the state is a match 34 /// state or not. 35 /// 36 /// This is never empty. The first state is always a dummy state such that 37 /// a state id == 0 corresponds to a dead state. 38 builder_states: Vec<Rc<State>>, 39 /// A cache of DFA states that already exist and can be easily looked up 40 /// via ordered sets of NFA states. 41 cache: HashMap<Rc<State>, S>, 42 /// Scratch space for a stack of NFA states to visit, for depth first 43 /// visiting without recursion. 44 stack: Vec<nfa::StateID>, 45 /// Scratch space for storing an ordered sequence of NFA states, for 46 /// amortizing allocation. 47 scratch_nfa_states: Vec<nfa::StateID>, 48 /// Whether to build a DFA that finds the longest possible match. 49 longest_match: bool, 50 } 51 52 /// An intermediate representation for a DFA state during determinization. 53 #[derive(Debug, Eq, Hash, PartialEq)] 54 struct State { 55 /// Whether this state is a match state or not. 56 is_match: bool, 57 /// An ordered sequence of NFA states that make up this DFA state. 58 nfa_states: Vec<nfa::StateID>, 59 } 60 61 impl<'a, S: StateID> Determinizer<'a, S> { 62 /// Create a new determinizer for converting the given NFA to a DFA. new(nfa: &'a NFA) -> Determinizer<'a, S>63 pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { 64 let dead = Rc::new(State::dead()); 65 let mut cache = HashMap::default(); 66 cache.insert(dead.clone(), dead_id()); 67 68 Determinizer { 69 nfa, 70 dfa: DFARepr::empty().anchored(nfa.is_anchored()), 71 builder_states: vec![dead], 72 cache, 73 stack: vec![], 74 scratch_nfa_states: vec![], 75 longest_match: false, 76 } 77 } 78 79 /// Instruct the determinizer to use equivalence classes as the transition 80 /// alphabet instead of all possible byte values. with_byte_classes(mut self) -> Determinizer<'a, S>81 pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { 82 let byte_classes = self.nfa.byte_classes().clone(); 83 self.dfa = DFARepr::empty_with_byte_classes(byte_classes) 84 .anchored(self.nfa.is_anchored()); 85 self 86 } 87 88 /// Instruct the determinizer to build a DFA that recognizes the longest 89 /// possible match instead of the leftmost first match. This is useful when 90 /// constructing reverse DFAs for finding the start of a match. longest_match(mut self, yes: bool) -> Determinizer<'a, S>91 pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { 92 self.longest_match = yes; 93 self 94 } 95 96 /// Build the DFA. If there was a problem constructing the DFA (e.g., if 97 /// the chosen state identifier representation is too small), then an error 98 /// is returned. build(mut self) -> Result<DFARepr<S>>99 pub fn build(mut self) -> Result<DFARepr<S>> { 100 let representative_bytes: Vec<u8> = 101 self.dfa.byte_classes().representatives().collect(); 102 let mut sparse = self.new_sparse_set(); 103 let mut uncompiled = vec![self.add_start(&mut sparse)?]; 104 while let Some(dfa_id) = uncompiled.pop() { 105 for &b in &representative_bytes { 106 let (next_dfa_id, is_new) = 107 self.cached_state(dfa_id, b, &mut sparse)?; 108 self.dfa.add_transition(dfa_id, b, next_dfa_id); 109 if is_new { 110 uncompiled.push(next_dfa_id); 111 } 112 } 113 } 114 115 // At this point, we shuffle the matching states in the final DFA to 116 // the beginning. This permits a DFA's match loop to detect a match 117 // condition by merely inspecting the current state's identifier, and 118 // avoids the need for any additional auxiliary storage. 119 let is_match: Vec<bool> = 120 self.builder_states.iter().map(|s| s.is_match).collect(); 121 self.dfa.shuffle_match_states(&is_match); 122 Ok(self.dfa) 123 } 124 125 /// Return the identifier for the next DFA state given an existing DFA 126 /// state and an input byte. If the next DFA state already exists, then 127 /// return its identifier from the cache. Otherwise, build the state, cache 128 /// it and return its identifier. 129 /// 130 /// The given sparse set is used for scratch space. It must have a capacity 131 /// equivalent to the total number of NFA states, but its contents are 132 /// otherwise unspecified. 133 /// 134 /// This routine returns a boolean indicating whether a new state was 135 /// built. If a new state is built, then the caller needs to add it to its 136 /// frontier of uncompiled DFA states to compute transitions for. cached_state( &mut self, dfa_id: S, b: u8, sparse: &mut SparseSet, ) -> Result<(S, bool)>137 fn cached_state( 138 &mut self, 139 dfa_id: S, 140 b: u8, 141 sparse: &mut SparseSet, 142 ) -> Result<(S, bool)> { 143 sparse.clear(); 144 // Compute the set of all reachable NFA states, including epsilons. 145 self.next(dfa_id, b, sparse); 146 // Build a candidate state and check if it has already been built. 147 let state = self.new_state(sparse); 148 if let Some(&cached_id) = self.cache.get(&state) { 149 // Since we have a cached state, put the constructed state's 150 // memory back into our scratch space, so that it can be reused. 151 let _ = 152 mem::replace(&mut self.scratch_nfa_states, state.nfa_states); 153 return Ok((cached_id, false)); 154 } 155 // Nothing was in the cache, so add this state to the cache. 156 self.add_state(state).map(|s| (s, true)) 157 } 158 159 /// Compute the set of all eachable NFA states, including the full epsilon 160 /// closure, from a DFA state for a single byte of input. next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet)161 fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) { 162 next_nfa_states.clear(); 163 for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { 164 let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; 165 match *self.nfa.state(nfa_id) { 166 nfa::State::Union { .. } 167 | nfa::State::Fail 168 | nfa::State::Match => {} 169 nfa::State::Range { range: ref r } => { 170 if r.start <= b && b <= r.end { 171 self.epsilon_closure(r.next, next_nfa_states); 172 } 173 } 174 nfa::State::Sparse { ref ranges } => { 175 for r in ranges.iter() { 176 if r.start > b { 177 break; 178 } else if r.start <= b && b <= r.end { 179 self.epsilon_closure(r.next, next_nfa_states); 180 break; 181 } 182 } 183 } 184 } 185 } 186 } 187 188 /// Compute the epsilon closure for the given NFA state. epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet)189 fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { 190 if !self.nfa.state(start).is_epsilon() { 191 set.insert(start); 192 return; 193 } 194 195 self.stack.push(start); 196 while let Some(mut id) = self.stack.pop() { 197 loop { 198 if set.contains(id) { 199 break; 200 } 201 set.insert(id); 202 match *self.nfa.state(id) { 203 nfa::State::Range { .. } 204 | nfa::State::Sparse { .. } 205 | nfa::State::Fail 206 | nfa::State::Match => break, 207 nfa::State::Union { ref alternates } => { 208 id = match alternates.get(0) { 209 None => break, 210 Some(&id) => id, 211 }; 212 self.stack.extend(alternates[1..].iter().rev()); 213 } 214 } 215 } 216 } 217 } 218 219 /// Compute the initial DFA state and return its identifier. 220 /// 221 /// The sparse set given is used for scratch space, and must have capacity 222 /// equal to the total number of NFA states. Its contents are unspecified. add_start(&mut self, sparse: &mut SparseSet) -> Result<S>223 fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> { 224 sparse.clear(); 225 self.epsilon_closure(self.nfa.start(), sparse); 226 let state = self.new_state(&sparse); 227 let id = self.add_state(state)?; 228 self.dfa.set_start_state(id); 229 Ok(id) 230 } 231 232 /// Add the given state to the DFA and make it available in the cache. 233 /// 234 /// The state initially has no transitions. That is, it transitions to the 235 /// dead state for all possible inputs. add_state(&mut self, state: State) -> Result<S>236 fn add_state(&mut self, state: State) -> Result<S> { 237 let id = self.dfa.add_empty_state()?; 238 let rstate = Rc::new(state); 239 self.builder_states.push(rstate.clone()); 240 self.cache.insert(rstate, id); 241 Ok(id) 242 } 243 244 /// Convert the given set of ordered NFA states to a DFA state. new_state(&mut self, set: &SparseSet) -> State245 fn new_state(&mut self, set: &SparseSet) -> State { 246 let mut state = State { 247 is_match: false, 248 nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), 249 }; 250 state.nfa_states.clear(); 251 252 for &id in set { 253 match *self.nfa.state(id) { 254 nfa::State::Range { .. } => { 255 state.nfa_states.push(id); 256 } 257 nfa::State::Sparse { .. } => { 258 state.nfa_states.push(id); 259 } 260 nfa::State::Fail => { 261 break; 262 } 263 nfa::State::Match => { 264 state.is_match = true; 265 if !self.longest_match { 266 break; 267 } 268 } 269 nfa::State::Union { .. } => {} 270 } 271 } 272 state 273 } 274 275 /// Create a new sparse set with enough capacity to hold all NFA states. new_sparse_set(&self) -> SparseSet276 fn new_sparse_set(&self) -> SparseSet { 277 SparseSet::new(self.nfa.len()) 278 } 279 } 280 281 impl State { 282 /// Create a new empty dead state. dead() -> State283 fn dead() -> State { 284 State { nfa_states: vec![], is_match: false } 285 } 286 } 287