1 use core::{cell::RefCell, fmt, mem};
2 
3 use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
4 
5 use crate::{
6     dfa::{automaton::Automaton, dense, DEAD},
7     util::{
8         alphabet,
9         primitives::{PatternID, StateID},
10     },
11 };
12 
13 /// An implementation of Hopcroft's algorithm for minimizing DFAs.
14 ///
15 /// The algorithm implemented here is mostly taken from Wikipedia:
16 /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
17 ///
18 /// This code has had some light optimization attention paid to it,
19 /// particularly in the form of reducing allocation as much as possible.
20 /// However, it is still generally slow. Future optimization work should
21 /// probably focus on the bigger picture rather than micro-optimizations. For
22 /// example:
23 ///
24 /// 1. Figure out how to more intelligently create initial partitions. That is,
25 ///    Hopcroft's algorithm starts by creating two partitions of DFA states
26 ///    that are known to NOT be equivalent: match states and non-match states.
27 ///    The algorithm proceeds by progressively refining these partitions into
28 ///    smaller partitions. If we could start with more partitions, then we
29 ///    could reduce the amount of work that Hopcroft's algorithm needs to do.
30 /// 2. For every partition that we visit, we find all incoming transitions to
31 ///    every state in the partition for *every* element in the alphabet. (This
32 ///    is why using byte classes can significantly decrease minimization times,
33 ///    since byte classes shrink the alphabet.) This is quite costly and there
34 ///    is perhaps some redundant work being performed depending on the specific
35 ///    states in the set. For example, we might be able to only visit some
36 ///    elements of the alphabet based on the transitions.
37 /// 3. Move parts of minimization into determinization. If minimization has
38 ///    fewer states to deal with, then it should run faster. A prime example
39 ///    of this might be large Unicode classes, which are generated in way that
40 ///    can create a lot of redundant states. (Some work has been done on this
41 ///    point during NFA compilation via the algorithm described in the
42 ///    "Incremental Construction of MinimalAcyclic Finite-State Automata"
43 ///    paper.)
44 pub(crate) struct Minimizer<'a> {
45     dfa: &'a mut dense::OwnedDFA,
46     in_transitions: Vec<Vec<Vec<StateID>>>,
47     partitions: Vec<StateSet>,
48     waiting: Vec<StateSet>,
49 }
50 
51 impl<'a> fmt::Debug for Minimizer<'a> {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result52     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
53         f.debug_struct("Minimizer")
54             .field("dfa", &self.dfa)
55             .field("in_transitions", &self.in_transitions)
56             .field("partitions", &self.partitions)
57             .field("waiting", &self.waiting)
58             .finish()
59     }
60 }
61 
62 /// A set of states. A state set makes up a single partition in Hopcroft's
63 /// algorithm.
64 ///
65 /// It is represented by an ordered set of state identifiers. We use shared
66 /// ownership so that a single state set can be in both the set of partitions
67 /// and in the set of waiting sets simultaneously without an additional
68 /// allocation. Generally, once a state set is built, it becomes immutable.
69 ///
70 /// We use this representation because it avoids the overhead of more
71 /// traditional set data structures (HashSet/BTreeSet), and also because
72 /// computing intersection/subtraction on this representation is especially
73 /// fast.
74 #[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
75 struct StateSet {
76     ids: Rc<RefCell<Vec<StateID>>>,
77 }
78 
79 impl<'a> Minimizer<'a> {
new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a>80     pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
81         let in_transitions = Minimizer::incoming_transitions(dfa);
82         let partitions = Minimizer::initial_partitions(dfa);
83         let waiting = partitions.clone();
84         Minimizer { dfa, in_transitions, partitions, waiting }
85     }
86 
run(mut self)87     pub fn run(mut self) {
88         let stride2 = self.dfa.stride2();
89         let as_state_id = |index: usize| -> StateID {
90             StateID::new(index << stride2).unwrap()
91         };
92         let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
93 
94         let mut incoming = StateSet::empty();
95         let mut scratch1 = StateSet::empty();
96         let mut scratch2 = StateSet::empty();
97         let mut newparts = vec![];
98 
99         // This loop is basically Hopcroft's algorithm. Everything else is just
100         // shuffling data around to fit our representation.
101         while let Some(set) = self.waiting.pop() {
102             for b in self.dfa.byte_classes().iter() {
103                 self.find_incoming_to(b, &set, &mut incoming);
104                 // If incoming is empty, then the intersection with any other
105                 // set must also be empty. So 'newparts' just ends up being
106                 // 'self.partitions'. So there's no need to go through the loop
107                 // below.
108                 //
109                 // This actually turns out to be rather large optimization. On
110                 // the order of making minimization 4-5x faster. It's likely
111                 // that the vast majority of all states have very few incoming
112                 // transitions.
113                 if incoming.is_empty() {
114                     continue;
115                 }
116 
117                 for p in 0..self.partitions.len() {
118                     self.partitions[p].intersection(&incoming, &mut scratch1);
119                     if scratch1.is_empty() {
120                         newparts.push(self.partitions[p].clone());
121                         continue;
122                     }
123 
124                     self.partitions[p].subtract(&incoming, &mut scratch2);
125                     if scratch2.is_empty() {
126                         newparts.push(self.partitions[p].clone());
127                         continue;
128                     }
129 
130                     let (x, y) =
131                         (scratch1.deep_clone(), scratch2.deep_clone());
132                     newparts.push(x.clone());
133                     newparts.push(y.clone());
134                     match self.find_waiting(&self.partitions[p]) {
135                         Some(i) => {
136                             self.waiting[i] = x;
137                             self.waiting.push(y);
138                         }
139                         None => {
140                             if x.len() <= y.len() {
141                                 self.waiting.push(x);
142                             } else {
143                                 self.waiting.push(y);
144                             }
145                         }
146                     }
147                 }
148                 newparts = mem::replace(&mut self.partitions, newparts);
149                 newparts.clear();
150             }
151         }
152 
153         // At this point, we now have a minimal partitioning of states, where
154         // each partition is an equivalence class of DFA states. Now we need to
155         // use this partitioning to update the DFA to only contain one state for
156         // each partition.
157 
158         // Create a map from DFA state ID to the representative ID of the
159         // equivalence class to which it belongs. The representative ID of an
160         // equivalence class of states is the minimum ID in that class.
161         let mut state_to_part = vec![DEAD; self.dfa.state_len()];
162         for p in &self.partitions {
163             p.iter(|id| state_to_part[as_index(id)] = p.min());
164         }
165 
166         // Generate a new contiguous sequence of IDs for minimal states, and
167         // create a map from equivalence IDs to the new IDs. Thus, the new
168         // minimal ID of *any* state in the unminimized DFA can be obtained
169         // with minimals_ids[state_to_part[old_id]].
170         let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
171         let mut new_index = 0;
172         for state in self.dfa.states() {
173             if state_to_part[as_index(state.id())] == state.id() {
174                 minimal_ids[as_index(state.id())] = as_state_id(new_index);
175                 new_index += 1;
176             }
177         }
178         // The total number of states in the minimal DFA.
179         let minimal_count = new_index;
180         // Convenience function for remapping state IDs. This takes an old ID,
181         // looks up its Hopcroft partition and then maps that to the new ID
182         // range.
183         let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];
184 
185         // Re-map this DFA in place such that the only states remaining
186         // correspond to the representative states of every equivalence class.
187         for id in (0..self.dfa.state_len()).map(as_state_id) {
188             // If this state isn't a representative for an equivalence class,
189             // then we skip it since it won't appear in the minimal DFA.
190             if state_to_part[as_index(id)] != id {
191                 continue;
192             }
193             self.dfa.remap_state(id, remap);
194             self.dfa.swap_states(id, minimal_ids[as_index(id)]);
195         }
196         // Trim off all unused states from the pre-minimized DFA. This
197         // represents all states that were merged into a non-singleton
198         // equivalence class of states, and appeared after the first state
199         // in each such class. (Because the state with the smallest ID in each
200         // equivalence class is its representative ID.)
201         self.dfa.truncate_states(minimal_count);
202 
203         // Update the new start states, which is now just the minimal ID of
204         // whatever state the old start state was collapsed into. Also, we
205         // collect everything before-hand to work around the borrow checker.
206         // We're already allocating so much that this is probably fine. If this
207         // turns out to be costly, then I guess add a `starts_mut` iterator.
208         let starts: Vec<_> = self.dfa.starts().collect();
209         for (old_start_id, anchored, start_type) in starts {
210             self.dfa.set_start_state(
211                 anchored,
212                 start_type,
213                 remap(old_start_id),
214             );
215         }
216 
217         // Update the match state pattern ID list for multi-regexes. All we
218         // need to do is remap the match state IDs. The pattern ID lists are
219         // always the same as they were since match states with distinct
220         // pattern ID lists are always considered distinct states.
221         let mut pmap = BTreeMap::new();
222         for (match_id, pattern_ids) in self.dfa.pattern_map() {
223             let new_id = remap(match_id);
224             pmap.insert(new_id, pattern_ids);
225         }
226         // This unwrap is OK because minimization never increases the number of
227         // match states or patterns in those match states. Since minimization
228         // runs after the pattern map has already been set at least once, we
229         // know that our match states cannot error.
230         self.dfa.set_pattern_map(&pmap).unwrap();
231 
232         // In order to update the ID of the maximum match state, we need to
233         // find the maximum ID among all of the match states in the minimized
234         // DFA. This is not necessarily the new ID of the unminimized maximum
235         // match state, since that could have been collapsed with a much
236         // earlier match state. Therefore, to find the new max match state,
237         // we iterate over all previous match states, find their corresponding
238         // new minimal ID, and take the maximum of those.
239         let old = self.dfa.special().clone();
240         let new = self.dfa.special_mut();
241         // ... but only remap if we had match states.
242         if old.matches() {
243             new.min_match = StateID::MAX;
244             new.max_match = StateID::ZERO;
245             for i in as_index(old.min_match)..=as_index(old.max_match) {
246                 let new_id = remap(as_state_id(i));
247                 if new_id < new.min_match {
248                     new.min_match = new_id;
249                 }
250                 if new_id > new.max_match {
251                     new.max_match = new_id;
252                 }
253             }
254         }
255         // ... same, but for start states.
256         if old.starts() {
257             new.min_start = StateID::MAX;
258             new.max_start = StateID::ZERO;
259             for i in as_index(old.min_start)..=as_index(old.max_start) {
260                 let new_id = remap(as_state_id(i));
261                 if new_id == DEAD {
262                     continue;
263                 }
264                 if new_id < new.min_start {
265                     new.min_start = new_id;
266                 }
267                 if new_id > new.max_start {
268                     new.max_start = new_id;
269                 }
270             }
271             if new.max_start == DEAD {
272                 new.min_start = DEAD;
273             }
274         }
275         new.quit_id = remap(new.quit_id);
276         new.set_max();
277     }
278 
find_waiting(&self, set: &StateSet) -> Option<usize>279     fn find_waiting(&self, set: &StateSet) -> Option<usize> {
280         self.waiting.iter().position(|s| s == set)
281     }
282 
find_incoming_to( &self, b: alphabet::Unit, set: &StateSet, incoming: &mut StateSet, )283     fn find_incoming_to(
284         &self,
285         b: alphabet::Unit,
286         set: &StateSet,
287         incoming: &mut StateSet,
288     ) {
289         incoming.clear();
290         set.iter(|id| {
291             for &inid in
292                 &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
293             {
294                 incoming.add(inid);
295             }
296         });
297         incoming.canonicalize();
298     }
299 
initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet>300     fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
301         // For match states, we know that two match states with different
302         // pattern ID lists will *always* be distinct, so we can partition them
303         // initially based on that.
304         let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
305         let mut is_quit = StateSet::empty();
306         let mut no_match = StateSet::empty();
307         for state in dfa.states() {
308             if dfa.is_match_state(state.id()) {
309                 let mut pids = vec![];
310                 for i in 0..dfa.match_len(state.id()) {
311                     pids.push(dfa.match_pattern(state.id(), i));
312                 }
313                 matching
314                     .entry(pids)
315                     .or_insert(StateSet::empty())
316                     .add(state.id());
317             } else if dfa.is_quit_state(state.id()) {
318                 is_quit.add(state.id());
319             } else {
320                 no_match.add(state.id());
321             }
322         }
323 
324         let mut sets: Vec<StateSet> =
325             matching.into_iter().map(|(_, set)| set).collect();
326         sets.push(no_match);
327         sets.push(is_quit);
328         sets
329     }
330 
incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>>331     fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
332         let mut incoming = vec![];
333         for _ in dfa.states() {
334             incoming.push(vec![vec![]; dfa.alphabet_len()]);
335         }
336         for state in dfa.states() {
337             for (b, next) in state.transitions() {
338                 incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
339             }
340         }
341         incoming
342     }
343 }
344 
345 impl StateSet {
empty() -> StateSet346     fn empty() -> StateSet {
347         StateSet { ids: Rc::new(RefCell::new(vec![])) }
348     }
349 
add(&mut self, id: StateID)350     fn add(&mut self, id: StateID) {
351         self.ids.borrow_mut().push(id);
352     }
353 
min(&self) -> StateID354     fn min(&self) -> StateID {
355         self.ids.borrow()[0]
356     }
357 
canonicalize(&mut self)358     fn canonicalize(&mut self) {
359         self.ids.borrow_mut().sort();
360         self.ids.borrow_mut().dedup();
361     }
362 
clear(&mut self)363     fn clear(&mut self) {
364         self.ids.borrow_mut().clear();
365     }
366 
len(&self) -> usize367     fn len(&self) -> usize {
368         self.ids.borrow().len()
369     }
370 
is_empty(&self) -> bool371     fn is_empty(&self) -> bool {
372         self.len() == 0
373     }
374 
deep_clone(&self) -> StateSet375     fn deep_clone(&self) -> StateSet {
376         let ids = self.ids.borrow().iter().cloned().collect();
377         StateSet { ids: Rc::new(RefCell::new(ids)) }
378     }
379 
iter<F: FnMut(StateID)>(&self, mut f: F)380     fn iter<F: FnMut(StateID)>(&self, mut f: F) {
381         for &id in self.ids.borrow().iter() {
382             f(id);
383         }
384     }
385 
intersection(&self, other: &StateSet, dest: &mut StateSet)386     fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
387         dest.clear();
388         if self.is_empty() || other.is_empty() {
389             return;
390         }
391 
392         let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
393         let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
394         let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
395         loop {
396             if a == b {
397                 dest.add(a);
398                 a = match ita.next() {
399                     None => break,
400                     Some(a) => a,
401                 };
402                 b = match itb.next() {
403                     None => break,
404                     Some(b) => b,
405                 };
406             } else if a < b {
407                 a = match ita.next() {
408                     None => break,
409                     Some(a) => a,
410                 };
411             } else {
412                 b = match itb.next() {
413                     None => break,
414                     Some(b) => b,
415                 };
416             }
417         }
418     }
419 
subtract(&self, other: &StateSet, dest: &mut StateSet)420     fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
421         dest.clear();
422         if self.is_empty() || other.is_empty() {
423             self.iter(|s| dest.add(s));
424             return;
425         }
426 
427         let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
428         let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
429         let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
430         loop {
431             if a == b {
432                 a = match ita.next() {
433                     None => break,
434                     Some(a) => a,
435                 };
436                 b = match itb.next() {
437                     None => {
438                         dest.add(a);
439                         break;
440                     }
441                     Some(b) => b,
442                 };
443             } else if a < b {
444                 dest.add(a);
445                 a = match ita.next() {
446                     None => break,
447                     Some(a) => a,
448                 };
449             } else {
450                 b = match itb.next() {
451                     None => {
452                         dest.add(a);
453                         break;
454                     }
455                     Some(b) => b,
456                 };
457             }
458         }
459         for a in ita {
460             dest.add(a);
461         }
462     }
463 }
464