use core::{cell::RefCell, fmt, mem};

use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};

use crate::{
    dfa::{automaton::Automaton, dense, DEAD},
    util::{
        alphabet,
        primitives::{PatternID, StateID},
    },
};

/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in a way
///    that can create a lot of redundant states. (Some work has been done on
///    this point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
pub(crate) struct Minimizer<'a> {
    /// The DFA being minimized. It is mutated in place by `run`.
    dfa: &'a mut dense::OwnedDFA,
    /// A reversed transition table, built once up front by
    /// `incoming_transitions`: for each state (by state *index*) and each
    /// alphabet unit, the list of states that transition into it on that
    /// unit.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    /// The current partitioning of states. When the refinement loop
    /// finishes, each set is an equivalence class of DFA states.
    partitions: Vec<StateSet>,
    /// The work list ("waiting" sets) of Hopcroft's algorithm. Starts as a
    /// copy of the initial partitions.
    waiting: Vec<StateSet>,
}

impl<'a> fmt::Debug for Minimizer<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Minimizer")
            .field("dfa", &self.dfa)
            .field("in_transitions", &self.in_transitions)
            .field("partitions", &self.partitions)
            .field("waiting", &self.waiting)
            .finish()
    }
}

/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
    // Ordered (after `canonicalize`) list of state IDs; shared so one set
    // can live in both `partitions` and `waiting` without copying.
    ids: Rc<RefCell<Vec<StateID>>>,
}

impl<'a> Minimizer<'a> {
    /// Create a new minimizer for the given DFA. This pre-computes the
    /// reverse transition table and the initial match/non-match/quit
    /// partitions; the waiting work list starts as a (shallow) copy of the
    /// partitions.
    pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
        let in_transitions = Minimizer::incoming_transitions(dfa);
        let partitions = Minimizer::initial_partitions(dfa);
        // Shallow clone: the Rc'd id vectors are shared, not copied.
        let waiting = partitions.clone();
        Minimizer { dfa, in_transitions, partitions, waiting }
    }

    /// Run minimization, consuming this minimizer and mutating the DFA in
    /// place. This refines the partitions until they are equivalence
    /// classes, then rewrites the DFA so that each class is represented by
    /// a single state.
    pub fn run(mut self) {
        // State IDs in this DFA are premultiplied by the stride (the
        // `<< stride2` below), so these two closures convert between a
        // dense state index and its premultiplied StateID.
        let stride2 = self.dfa.stride2();
        let as_state_id = |index: usize| -> StateID {
            StateID::new(index << stride2).unwrap()
        };
        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };

        // Scratch sets reused across iterations to avoid reallocating.
        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // This loop is basically Hopcroft's algorithm. Everything else is just
        // shuffling data around to fit our representation.
        while let Some(set) = self.waiting.pop() {
            for b in self.dfa.byte_classes().iter() {
                self.find_incoming_to(b, &set, &mut incoming);
                // If incoming is empty, then the intersection with any other
                // set must also be empty. So 'newparts' just ends up being
                // 'self.partitions'. So there's no need to go through the loop
                // below.
                //
                // This actually turns out to be a rather large optimization. On
                // the order of making minimization 4-5x faster. It's likely
                // that the vast majority of all states have very few incoming
                // transitions.
                if incoming.is_empty() {
                    continue;
                }

                for p in 0..self.partitions.len() {
                    // scratch1 = partition ∩ incoming
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // scratch2 = partition \ incoming
                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // The partition is split by 'incoming': replace it with
                    // the two halves, and update the waiting list per
                    // Hopcroft's rule (if the old partition is waiting,
                    // replace it with both halves; otherwise queue only the
                    // smaller half).
                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    match self.find_waiting(&self.partitions[p]) {
                        Some(i) => {
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        None => {
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                // Swap the refined partition list into place and reuse the
                // old Vec's allocation for the next round.
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need to
        // use this partitioning to update the DFA to only contain one state for
        // each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![DEAD; self.dfa.state_len()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[as_index(id)] = p.min());
        }

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimal_ids[state_to_part[old_id]].
        let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
        let mut new_index = 0;
        for state in self.dfa.states() {
            if state_to_part[as_index(state.id())] == state.id() {
                minimal_ids[as_index(state.id())] = as_state_id(new_index);
                new_index += 1;
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_index;
        // Convenience function for remapping state IDs. This takes an old ID,
        // looks up its Hopcroft partition and then maps that to the new ID
        // range.
        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_len()).map(as_state_id) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[as_index(id)] != id {
                continue;
            }
            self.dfa.remap_state(id, remap);
            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
        }
        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
        self.dfa.truncate_states(minimal_count);

        // Update the new start states, which is now just the minimal ID of
        // whatever state the old start state was collapsed into. Also, we
        // collect everything before-hand to work around the borrow checker.
        // We're already allocating so much that this is probably fine. If this
        // turns out to be costly, then I guess add a `starts_mut` iterator.
        let starts: Vec<_> = self.dfa.starts().collect();
        for (old_start_id, anchored, start_type) in starts {
            self.dfa.set_start_state(
                anchored,
                start_type,
                remap(old_start_id),
            );
        }

        // Update the match state pattern ID list for multi-regexes. All we
        // need to do is remap the match state IDs. The pattern ID lists are
        // always the same as they were since match states with distinct
        // pattern ID lists are always considered distinct states.
        let mut pmap = BTreeMap::new();
        for (match_id, pattern_ids) in self.dfa.pattern_map() {
            let new_id = remap(match_id);
            pmap.insert(new_id, pattern_ids);
        }
        // This unwrap is OK because minimization never increases the number of
        // match states or patterns in those match states. Since minimization
        // runs after the pattern map has already been set at least once, we
        // know that our match states cannot error.
        self.dfa.set_pattern_map(&pmap).unwrap();

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
        let old = self.dfa.special().clone();
        let new = self.dfa.special_mut();
        // ... but only remap if we had match states.
        if old.matches() {
            new.min_match = StateID::MAX;
            new.max_match = StateID::ZERO;
            for i in as_index(old.min_match)..=as_index(old.max_match) {
                let new_id = remap(as_state_id(i));
                if new_id < new.min_match {
                    new.min_match = new_id;
                }
                if new_id > new.max_match {
                    new.max_match = new_id;
                }
            }
        }
        // ... same, but for start states.
        if old.starts() {
            new.min_start = StateID::MAX;
            new.max_start = StateID::ZERO;
            for i in as_index(old.min_start)..=as_index(old.max_start) {
                let new_id = remap(as_state_id(i));
                if new_id == DEAD {
                    continue;
                }
                if new_id < new.min_start {
                    new.min_start = new_id;
                }
                if new_id > new.max_start {
                    new.max_start = new_id;
                }
            }
            // If every start state remapped to DEAD, record that there are
            // no start states at all.
            if new.max_start == DEAD {
                new.min_start = DEAD;
            }
        }
        new.quit_id = remap(new.quit_id);
        new.set_max();
    }

    /// Return the position of `set` in the waiting list, if present.
    /// Equality here is structural (same ordered IDs), via StateSet's
    /// derived PartialEq.
    fn find_waiting(&self, set: &StateSet) -> Option<usize> {
        self.waiting.iter().position(|s| s == set)
    }

    /// Compute, into `incoming`, the set of all states with a transition on
    /// alphabet unit `b` into any state in `set`. The result is sorted and
    /// deduplicated so the ordered-merge set operations work on it.
    fn find_incoming_to(
        &self,
        b: alphabet::Unit,
        set: &StateSet,
        incoming: &mut StateSet,
    ) {
        incoming.clear();
        set.iter(|id| {
            for &inid in
                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
            {
                incoming.add(inid);
            }
        });
        incoming.canonicalize();
    }

    /// Build the initial partitioning: one set per distinct pattern ID list
    /// among match states, one set for quit states and one for everything
    /// else.
    fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
        // For match states, we know that two match states with different
        // pattern ID lists will *always* be distinct, so we can partition them
        // initially based on that.
        let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
        let mut is_quit = StateSet::empty();
        let mut no_match = StateSet::empty();
        for state in dfa.states() {
            if dfa.is_match_state(state.id()) {
                let mut pids = vec![];
                for i in 0..dfa.match_len(state.id()) {
                    pids.push(dfa.match_pattern(state.id(), i));
                }
                matching
                    .entry(pids)
                    .or_insert(StateSet::empty())
                    .add(state.id());
            } else if dfa.is_quit_state(state.id()) {
                is_quit.add(state.id());
            } else {
                no_match.add(state.id());
            }
        }

        let mut sets: Vec<StateSet> =
            matching.into_iter().map(|(_, set)| set).collect();
        sets.push(no_match);
        sets.push(is_quit);
        sets
    }

    /// Build the reverse transition table: incoming[to_index][unit] is the
    /// list of states with a transition on `unit` into that state.
    fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
        let mut incoming = vec![];
        for _ in dfa.states() {
            incoming.push(vec![vec![]; dfa.alphabet_len()]);
        }
        for state in dfa.states() {
            for (b, next) in state.transitions() {
                incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
            }
        }
        incoming
    }
}

impl StateSet {
    /// Create an empty set.
    fn empty() -> StateSet {
        StateSet { ids: Rc::new(RefCell::new(vec![])) }
    }

    /// Append an ID. Callers must `canonicalize` before relying on the
    /// ordered-set operations below.
    fn add(&mut self, id: StateID) {
        self.ids.borrow_mut().push(id);
    }

    /// The minimum ID in a canonicalized (sorted) set. Panics on an empty
    /// set; the minimizer only calls this on non-empty partitions.
    fn min(&self) -> StateID {
        self.ids.borrow()[0]
    }

    /// Sort and deduplicate, establishing the ordered-set invariant.
    fn canonicalize(&mut self) {
        self.ids.borrow_mut().sort();
        self.ids.borrow_mut().dedup();
    }

    /// Remove all IDs, keeping the allocation for reuse.
    fn clear(&mut self) {
        self.ids.borrow_mut().clear();
    }

    fn len(&self) -> usize {
        self.ids.borrow().len()
    }

    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Clone the underlying ID vector into a fresh allocation, unlike the
    /// derived `clone` which only bumps the Rc refcount.
    fn deep_clone(&self) -> StateSet {
        let ids = self.ids.borrow().iter().cloned().collect();
        StateSet { ids: Rc::new(RefCell::new(ids)) }
    }

    /// Call `f` for each ID, in order. A callback is used instead of
    /// returning an iterator so the RefCell borrow stays inside this method.
    fn iter<F: FnMut(StateID)>(&self, mut f: F) {
        for &id in self.ids.borrow().iter() {
            f(id);
        }
    }

    /// Write the intersection of `self` and `other` into `dest`, via an
    /// ordered merge of the two sorted ID lists. Both sets must be
    /// canonicalized.
    fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            return;
        }

        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        // Both sets are non-empty, so the initial unwraps cannot fail.
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            } else if a < b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            }
        }
    }

    /// Write the set difference `self \ other` into `dest`, via an ordered
    /// merge of the two sorted ID lists. Both sets must be canonicalized.
    fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            // Subtracting the empty set (or from the empty set) copies
            // `self` unchanged.
            self.iter(|s| dest.add(s));
            return;
        }

        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        // Both sets are non-empty, so the initial unwraps cannot fail.
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => {
                        // `other` is exhausted; `a` survives, and the tail
                        // of `ita` is flushed below.
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            } else if a < b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            }
        }
        // Everything remaining in `self` is not in `other`.
        for a in ita {
            dest.add(a);
        }
    }
}