1 use std::collections::HashMap;
2 use std::mem;
3 use std::rc::Rc;
4 
5 use dense;
6 use error::Result;
7 use nfa::{self, NFA};
8 use sparse_set::SparseSet;
9 use state_id::{dead_id, StateID};
10 
11 type DFARepr<S> = dense::Repr<Vec<S>, S>;
12 
13 /// A determinizer converts an NFA to a DFA.
14 ///
15 /// This determinizer follows the typical powerset construction, where each
16 /// DFA state is comprised of one or more NFA states. In the worst case, there
17 /// is one DFA state for every possible combination of NFA states. In practice,
18 /// this only happens in certain conditions, typically when there are bounded
19 /// repetitions.
20 ///
21 /// The type variable `S` refers to the chosen state identifier representation
22 /// used for the DFA.
23 ///
24 /// The lifetime variable `'a` refers to the lifetime of the NFA being
25 /// converted to a DFA.
26 #[derive(Debug)]
27 pub(crate) struct Determinizer<'a, S: StateID> {
28     /// The NFA we're converting into a DFA.
29     nfa: &'a NFA,
30     /// The DFA we're building.
31     dfa: DFARepr<S>,
32     /// Each DFA state being built is defined as an *ordered* set of NFA
33     /// states, along with a flag indicating whether the state is a match
34     /// state or not.
35     ///
36     /// This is never empty. The first state is always a dummy state such that
37     /// a state id == 0 corresponds to a dead state.
38     builder_states: Vec<Rc<State>>,
39     /// A cache of DFA states that already exist and can be easily looked up
40     /// via ordered sets of NFA states.
41     cache: HashMap<Rc<State>, S>,
42     /// Scratch space for a stack of NFA states to visit, for depth first
43     /// visiting without recursion.
44     stack: Vec<nfa::StateID>,
45     /// Scratch space for storing an ordered sequence of NFA states, for
46     /// amortizing allocation.
47     scratch_nfa_states: Vec<nfa::StateID>,
48     /// Whether to build a DFA that finds the longest possible match.
49     longest_match: bool,
50 }
51 
52 /// An intermediate representation for a DFA state during determinization.
53 #[derive(Debug, Eq, Hash, PartialEq)]
54 struct State {
55     /// Whether this state is a match state or not.
56     is_match: bool,
57     /// An ordered sequence of NFA states that make up this DFA state.
58     nfa_states: Vec<nfa::StateID>,
59 }
60 
61 impl<'a, S: StateID> Determinizer<'a, S> {
62     /// Create a new determinizer for converting the given NFA to a DFA.
new(nfa: &'a NFA) -> Determinizer<'a, S>63     pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
64         let dead = Rc::new(State::dead());
65         let mut cache = HashMap::default();
66         cache.insert(dead.clone(), dead_id());
67 
68         Determinizer {
69             nfa,
70             dfa: DFARepr::empty().anchored(nfa.is_anchored()),
71             builder_states: vec![dead],
72             cache,
73             stack: vec![],
74             scratch_nfa_states: vec![],
75             longest_match: false,
76         }
77     }
78 
79     /// Instruct the determinizer to use equivalence classes as the transition
80     /// alphabet instead of all possible byte values.
with_byte_classes(mut self) -> Determinizer<'a, S>81     pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
82         let byte_classes = self.nfa.byte_classes().clone();
83         self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
84             .anchored(self.nfa.is_anchored());
85         self
86     }
87 
88     /// Instruct the determinizer to build a DFA that recognizes the longest
89     /// possible match instead of the leftmost first match. This is useful when
90     /// constructing reverse DFAs for finding the start of a match.
longest_match(mut self, yes: bool) -> Determinizer<'a, S>91     pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
92         self.longest_match = yes;
93         self
94     }
95 
96     /// Build the DFA. If there was a problem constructing the DFA (e.g., if
97     /// the chosen state identifier representation is too small), then an error
98     /// is returned.
build(mut self) -> Result<DFARepr<S>>99     pub fn build(mut self) -> Result<DFARepr<S>> {
100         let representative_bytes: Vec<u8> =
101             self.dfa.byte_classes().representatives().collect();
102         let mut sparse = self.new_sparse_set();
103         let mut uncompiled = vec![self.add_start(&mut sparse)?];
104         while let Some(dfa_id) = uncompiled.pop() {
105             for &b in &representative_bytes {
106                 let (next_dfa_id, is_new) =
107                     self.cached_state(dfa_id, b, &mut sparse)?;
108                 self.dfa.add_transition(dfa_id, b, next_dfa_id);
109                 if is_new {
110                     uncompiled.push(next_dfa_id);
111                 }
112             }
113         }
114 
115         // At this point, we shuffle the matching states in the final DFA to
116         // the beginning. This permits a DFA's match loop to detect a match
117         // condition by merely inspecting the current state's identifier, and
118         // avoids the need for any additional auxiliary storage.
119         let is_match: Vec<bool> =
120             self.builder_states.iter().map(|s| s.is_match).collect();
121         self.dfa.shuffle_match_states(&is_match);
122         Ok(self.dfa)
123     }
124 
125     /// Return the identifier for the next DFA state given an existing DFA
126     /// state and an input byte. If the next DFA state already exists, then
127     /// return its identifier from the cache. Otherwise, build the state, cache
128     /// it and return its identifier.
129     ///
130     /// The given sparse set is used for scratch space. It must have a capacity
131     /// equivalent to the total number of NFA states, but its contents are
132     /// otherwise unspecified.
133     ///
134     /// This routine returns a boolean indicating whether a new state was
135     /// built. If a new state is built, then the caller needs to add it to its
136     /// frontier of uncompiled DFA states to compute transitions for.
cached_state( &mut self, dfa_id: S, b: u8, sparse: &mut SparseSet, ) -> Result<(S, bool)>137     fn cached_state(
138         &mut self,
139         dfa_id: S,
140         b: u8,
141         sparse: &mut SparseSet,
142     ) -> Result<(S, bool)> {
143         sparse.clear();
144         // Compute the set of all reachable NFA states, including epsilons.
145         self.next(dfa_id, b, sparse);
146         // Build a candidate state and check if it has already been built.
147         let state = self.new_state(sparse);
148         if let Some(&cached_id) = self.cache.get(&state) {
149             // Since we have a cached state, put the constructed state's
150             // memory back into our scratch space, so that it can be reused.
151             let _ =
152                 mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
153             return Ok((cached_id, false));
154         }
155         // Nothing was in the cache, so add this state to the cache.
156         self.add_state(state).map(|s| (s, true))
157     }
158 
159     /// Compute the set of all eachable NFA states, including the full epsilon
160     /// closure, from a DFA state for a single byte of input.
next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet)161     fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
162         next_nfa_states.clear();
163         for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
164             let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
165             match *self.nfa.state(nfa_id) {
166                 nfa::State::Union { .. }
167                 | nfa::State::Fail
168                 | nfa::State::Match => {}
169                 nfa::State::Range { range: ref r } => {
170                     if r.start <= b && b <= r.end {
171                         self.epsilon_closure(r.next, next_nfa_states);
172                     }
173                 }
174                 nfa::State::Sparse { ref ranges } => {
175                     for r in ranges.iter() {
176                         if r.start > b {
177                             break;
178                         } else if r.start <= b && b <= r.end {
179                             self.epsilon_closure(r.next, next_nfa_states);
180                             break;
181                         }
182                     }
183                 }
184             }
185         }
186     }
187 
188     /// Compute the epsilon closure for the given NFA state.
epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet)189     fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
190         if !self.nfa.state(start).is_epsilon() {
191             set.insert(start);
192             return;
193         }
194 
195         self.stack.push(start);
196         while let Some(mut id) = self.stack.pop() {
197             loop {
198                 if set.contains(id) {
199                     break;
200                 }
201                 set.insert(id);
202                 match *self.nfa.state(id) {
203                     nfa::State::Range { .. }
204                     | nfa::State::Sparse { .. }
205                     | nfa::State::Fail
206                     | nfa::State::Match => break,
207                     nfa::State::Union { ref alternates } => {
208                         id = match alternates.get(0) {
209                             None => break,
210                             Some(&id) => id,
211                         };
212                         self.stack.extend(alternates[1..].iter().rev());
213                     }
214                 }
215             }
216         }
217     }
218 
219     /// Compute the initial DFA state and return its identifier.
220     ///
221     /// The sparse set given is used for scratch space, and must have capacity
222     /// equal to the total number of NFA states. Its contents are unspecified.
add_start(&mut self, sparse: &mut SparseSet) -> Result<S>223     fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
224         sparse.clear();
225         self.epsilon_closure(self.nfa.start(), sparse);
226         let state = self.new_state(&sparse);
227         let id = self.add_state(state)?;
228         self.dfa.set_start_state(id);
229         Ok(id)
230     }
231 
232     /// Add the given state to the DFA and make it available in the cache.
233     ///
234     /// The state initially has no transitions. That is, it transitions to the
235     /// dead state for all possible inputs.
add_state(&mut self, state: State) -> Result<S>236     fn add_state(&mut self, state: State) -> Result<S> {
237         let id = self.dfa.add_empty_state()?;
238         let rstate = Rc::new(state);
239         self.builder_states.push(rstate.clone());
240         self.cache.insert(rstate, id);
241         Ok(id)
242     }
243 
244     /// Convert the given set of ordered NFA states to a DFA state.
new_state(&mut self, set: &SparseSet) -> State245     fn new_state(&mut self, set: &SparseSet) -> State {
246         let mut state = State {
247             is_match: false,
248             nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
249         };
250         state.nfa_states.clear();
251 
252         for &id in set {
253             match *self.nfa.state(id) {
254                 nfa::State::Range { .. } => {
255                     state.nfa_states.push(id);
256                 }
257                 nfa::State::Sparse { .. } => {
258                     state.nfa_states.push(id);
259                 }
260                 nfa::State::Fail => {
261                     break;
262                 }
263                 nfa::State::Match => {
264                     state.is_match = true;
265                     if !self.longest_match {
266                         break;
267                     }
268                 }
269                 nfa::State::Union { .. } => {}
270             }
271         }
272         state
273     }
274 
275     /// Create a new sparse set with enough capacity to hold all NFA states.
new_sparse_set(&self) -> SparseSet276     fn new_sparse_set(&self) -> SparseSet {
277         SparseSet::new(self.nfa.len())
278     }
279 }
280 
281 impl State {
282     /// Create a new empty dead state.
dead() -> State283     fn dead() -> State {
284         State { nfa_states: vec![], is_match: false }
285     }
286 }
287