// Copyright © 2022 Collabora, Ltd.
// SPDX-License-Identifier: MIT

use crate::api::{GetDebugFlags, DEBUG};
use crate::ir::*;

use std::cmp::max;
use std::collections::{HashMap, HashSet};
use std::ops::{Index, IndexMut, Range};
use std::slice;

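/// Tracks one `T` per hardware register which might need scoreboarding:
/// GPRs, uniform GPRs, predicates, uniform predicates, and carry.  Barrier
/// registers are intentionally absent because they have their own HW
/// scoreboard (see the Index impls below).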
struct RegTracker<T> {
    reg: [T; 255],
    ureg: [T; 63],
    pred: [T; 7],
    upred: [T; 7],
    carry: [T; 1],
}

fn new_array_with<T, const N: usize>(f: &impl Fn() -> T) -> [T; N] {
    let mut v = Vec::new();
    for _ in 0..N {
        v.push(f());
    }
    v.try_into()
        .unwrap_or_else(|_| panic!("Array size mismatch"))
}

impl<T> RegTracker<T> {
    pub fn new_with(f: &impl Fn() -> T) -> Self {
        Self {
            reg: new_array_with(f),
            ureg: new_array_with(f),
            pred: new_array_with(f),
            upred: new_array_with(f),
            carry: new_array_with(f),
        }
    }

    pub fn for_each_instr_pred_mut(
        &mut self,
        instr: &Instr,
        mut f: impl FnMut(&mut T),
    ) {
        if let PredRef::Reg(reg) = &instr.pred.pred_ref {
            for i in &mut self[*reg] {
                f(i);
            }
        }
    }

    pub fn for_each_instr_src_mut(
        &mut self,
        instr: &Instr,
        mut f: impl FnMut(usize, &mut T),
    ) {
        for (i, src) in instr.srcs().iter().enumerate() {
            match &src.src_ref {
                SrcRef::Reg(reg) => {
                    for t in &mut self[*reg] {
                        f(i, t);
                    }
                }
                SrcRef::CBuf(CBufRef {
                    buf: CBuf::BindlessUGPR(reg),
                    ..
                }) => {
                    for t in &mut self[*reg] {
                        f(i, t);
                    }
                }
                _ => (),
            }
        }
    }

    pub fn for_each_instr_dst_mut(
        &mut self,
        instr: &Instr,
        mut f: impl FnMut(usize, &mut T),
    ) {
        for (i, dst) in instr.dsts().iter().enumerate() {
            if let Dst::Reg(reg) = dst {
                for t in &mut self[*reg] {
                    f(i, t);
                }
            }
        }
    }
}

impl<T> Index<RegRef> for RegTracker<T> {
    type Output = [T];

    fn index(&self, reg: RegRef) -> &[T] {
        let range = reg.idx_range();
        let range = Range {
            start: usize::try_from(range.start).unwrap(),
            end: usize::try_from(range.end).unwrap(),
        };

        match reg.file() {
            RegFile::GPR => &self.reg[range],
            RegFile::UGPR => &self.ureg[range],
            RegFile::Pred => &self.pred[range],
            RegFile::UPred => &self.upred[range],
            RegFile::Carry => &self.carry[range],
            RegFile::Bar => &[], // Barriers have a HW scoreboard
            RegFile::Mem => panic!("Not a register"),
        }
    }
}

impl<T> IndexMut<RegRef> for RegTracker<T> {
    fn index_mut(&mut self, reg: RegRef) -> &mut [T] {
        let range = reg.idx_range();
        let range = Range {
            start: usize::try_from(range.start).unwrap(),
            end: usize::try_from(range.end).unwrap(),
        };

        match reg.file() {
            RegFile::GPR => &mut self.reg[range],
            RegFile::UGPR => &mut self.ureg[range],
            RegFile::Pred => &mut self.pred[range],
            RegFile::UPred => &mut self.upred[range],
            RegFile::Carry => &mut self.carry[range],
            RegFile::Bar => &mut [], // Barriers have a HW scoreboard
            RegFile::Mem => panic!("Not a register"),
        }
    }
}

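/// The pending use of a register range: nothing, a single outstanding write,
/// or the set of reads issued since the last write.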
#[derive(Clone)]
enum RegUse<T: Clone> {
    None,
    Write(T),
    Reads(Vec<T>),
}

impl<T: Clone> RegUse<T> {
    pub fn deps(&self) -> &[T] {
        match self {
            RegUse::None => &[],
            RegUse::Write(dep) => slice::from_ref(dep),
            RegUse::Reads(deps) => &deps[..],
        }
    }

    pub fn clear(&mut self) -> Self {
        std::mem::replace(self, RegUse::None)
    }

    pub fn clear_write(&mut self) -> Self {
        if matches!(self, RegUse::Write(_)) {
            std::mem::replace(self, RegUse::None)
        } else {
            RegUse::None
        }
    }

    pub fn add_read(&mut self, dep: T) -> Self {
        match self {
            RegUse::None => {
                *self = RegUse::Reads(vec![dep]);
                RegUse::None
            }
            RegUse::Write(_) => {
                std::mem::replace(self, RegUse::Reads(vec![dep]))
            }
            RegUse::Reads(reads) => {
                reads.push(dep);
                RegUse::None
            }
        }
    }

    pub fn set_write(&mut self, dep: T) -> Self {
        std::mem::replace(self, RegUse::Write(dep))
    }
}

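/// A single dependency (scoreboard signal) in the graph.  For a write dep,
/// `read_dep` points back at the read dep of the same instruction;
/// `first_wait` records the (block, ip) of the first instruction to wait on
/// this dep, if any.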
struct DepNode {
    read_dep: Option<usize>,
    first_wait: Option<(usize, usize)>,
}

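/// Dependency graph for variable-latency instructions.  Each such
/// instruction gets a read dep and a write dep (see add_instr()), keyed by
/// (block index, instruction index), and `active` holds the deps which have
/// been signaled but not yet waited on.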
struct DepGraph {
    deps: Vec<DepNode>,
    instr_deps: HashMap<(usize, usize), (usize, usize)>,
    instr_waits: HashMap<(usize, usize), Vec<usize>>,
    active: HashSet<usize>,
}

impl DepGraph {
    pub fn new() -> Self {
        Self {
            deps: Vec::new(),
            instr_deps: HashMap::new(),
            instr_waits: HashMap::new(),
            active: HashSet::new(),
        }
    }

    fn add_new_dep(&mut self, read_dep: Option<usize>) -> usize {
        let dep = self.deps.len();
        self.deps.push(DepNode {
            read_dep,
            first_wait: None,
        });
        dep
    }

    pub fn add_instr(&mut self, block_idx: usize, ip: usize) -> (usize, usize) {
        let rd = self.add_new_dep(None);
        let wr = self.add_new_dep(Some(rd));
        self.instr_deps.insert((block_idx, ip), (rd, wr));
        (rd, wr)
    }

    pub fn add_signal(&mut self, dep: usize) {
        self.active.insert(dep);
    }

    pub fn add_waits(
        &mut self,
        block_idx: usize,
        ip: usize,
        mut waits: Vec<usize>,
    ) {
        for dep in &waits {
            // A wait on a write automatically waits on the read.  By removing
            // it from the active set here we ensure that we don't record any
            // duplicate write/read waits in the retain below.
            if let Some(rd) = &self.deps[*dep].read_dep {
                self.active.remove(rd);
            }
        }

        waits.retain(|dep| {
            let node = &mut self.deps[*dep];
            if let Some(wait) = node.first_wait {
                // Someone has already waited on this dep
                debug_assert!(!self.active.contains(dep));
                debug_assert!((block_idx, ip) >= wait);
                false
            } else if !self.active.contains(dep) {
                // Even if it doesn't have a use, it may still be deactivated.
                // This can happen if we depend on the destination before any
                // of its sources.
                false
            } else {
                self.deps[*dep].first_wait = Some((block_idx, ip));
                self.active.remove(dep);
                true
            }
        });

        // Sort for stability.  The list of waits may come from a HashSet (see
        // add_barrier()) and so it's not guaranteed stable across Rust
        // versions.  This also ensures that everything always waits on oldest
        // dependencies first.
        waits.sort();

        let _old = self.instr_waits.insert((block_idx, ip), waits);
        debug_assert!(_old.is_none());
    }

    pub fn add_barrier(&mut self, block_idx: usize, ip: usize) {
        let waits = self.active.iter().cloned().collect();
        self.add_waits(block_idx, ip, waits);
        debug_assert!(self.active.is_empty());
    }

    pub fn dep_is_waited_after(
        &self,
        dep: usize,
        block_idx: usize,
        ip: usize,
    ) -> bool {
        if let Some(wait) = self.deps[dep].first_wait {
            wait > (block_idx, ip)
        } else {
            false
        }
    }

    pub fn get_instr_deps(
        &self,
        block_idx: usize,
        ip: usize,
    ) -> (usize, usize) {
        *self.instr_deps.get(&(block_idx, ip)).unwrap()
    }

    pub fn get_instr_waits(&self, block_idx: usize, ip: usize) -> &[usize] {
        if let Some(waits) = self.instr_waits.get(&(block_idx, ip)) {
            &waits[..]
        } else {
            &[]
        }
    }
}

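/// Allocator for the six HW scoreboard barriers.  `bar_dep` maps each barrier
/// to the dep it currently tracks, with usize::MAX marking a free barrier.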
struct BarAlloc {
    num_bars: u8,
    bar_dep: [usize; 6],
}

impl BarAlloc {
    pub fn new() -> BarAlloc {
        BarAlloc {
            num_bars: 6,
            bar_dep: [usize::MAX; 6],
        }
    }

    pub fn bar_is_free(&self, bar: u8) -> bool {
        debug_assert!(bar < self.num_bars);
        self.bar_dep[usize::from(bar)] == usize::MAX
    }

    pub fn set_bar_dep(&mut self, bar: u8, dep: usize) {
        debug_assert!(self.bar_is_free(bar));
        self.bar_dep[usize::from(bar)] = dep;
    }

    pub fn free_bar(&mut self, bar: u8) {
        debug_assert!(!self.bar_is_free(bar));
        self.bar_dep[usize::from(bar)] = usize::MAX;
    }

    pub fn try_find_free_bar(&self) -> Option<u8> {
        for bar in 0..self.num_bars {
            if self.bar_is_free(bar) {
                return Some(bar);
            }
        }
        None
    }

    pub fn free_some_bar(&mut self) -> u8 {
        // Get the oldest by looking for the one with the smallest dep
        let mut bar = 0;
        for b in 1..self.num_bars {
            if self.bar_dep[usize::from(b)] < self.bar_dep[usize::from(bar)] {
                bar = b;
            }
        }
        self.free_bar(bar);
        bar
    }

    pub fn get_bar_for_dep(&self, dep: usize) -> Option<u8> {
        for bar in 0..self.num_bars {
            if self.bar_dep[usize::from(bar)] == dep {
                return Some(bar);
            }
        }
        None
    }
}

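/// Assigns scoreboard barriers to all variable-latency instructions in two
/// passes: the first walks the function and builds a DepGraph recording, per
/// register, which deps each instruction must wait on; the second walks it
/// again, turning those waits into wait masks and allocating rd/wr barriers
/// for any dep that is waited on by a later instruction, evicting the oldest
/// barrier when all six are in use.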
fn assign_barriers(f: &mut Function, sm: &dyn ShaderModel) {
    let mut uses = RegTracker::new_with(&|| RegUse::None);
    let mut deps = DepGraph::new();

    for (bi, b) in f.blocks.iter().enumerate() {
        for (ip, instr) in b.instrs.iter().enumerate() {
            if instr.is_branch() {
                deps.add_barrier(bi, ip);
            } else {
                // Execution predicates are handled immediately and we don't
                // need barriers for them, regardless of whether or not it's a
                // fixed-latency instruction.
                let mut waits = Vec::new();
                uses.for_each_instr_pred_mut(instr, |u| {
                    let u = u.clear_write();
                    waits.extend_from_slice(u.deps());
                });

                if instr.has_fixed_latency(sm.sm()) {
                    // Delays will cover us here.  We just need to make sure
                    // that we wait on any uses that we consume.
                    uses.for_each_instr_src_mut(instr, |_, u| {
                        let u = u.clear_write();
                        waits.extend_from_slice(u.deps());
                    });
                    uses.for_each_instr_dst_mut(instr, |_, u| {
                        let u = u.clear();
                        waits.extend_from_slice(u.deps());
                    });
                } else {
                    let (rd, wr) = deps.add_instr(bi, ip);
                    uses.for_each_instr_src_mut(instr, |_, u| {
                        // Only mark a dep as signaled if we actually have
                        // something that shows up in the register file as
                        // needing scoreboarding
                        deps.add_signal(rd);
                        let u = u.add_read(rd);
                        waits.extend_from_slice(u.deps());
                    });
                    uses.for_each_instr_dst_mut(instr, |_, u| {
                        // Only mark a dep as signaled if we actually have
                        // something that shows up in the register file as
                        // needing scoreboarding
                        deps.add_signal(wr);
                        let u = u.set_write(wr);
                        for dep in u.deps() {
                            // Don't wait on ourselves
                            if *dep != rd {
                                waits.push(*dep);
                            }
                        }
                    });
                }
                deps.add_waits(bi, ip, waits);
            }
        }
    }

    let mut bars = BarAlloc::new();

    for (bi, b) in f.blocks.iter_mut().enumerate() {
        for (ip, instr) in b.instrs.iter_mut().enumerate() {
            let mut wait_mask = 0_u8;
            for dep in deps.get_instr_waits(bi, ip) {
                if let Some(bar) = bars.get_bar_for_dep(*dep) {
                    wait_mask |= 1 << bar;
                    bars.free_bar(bar);
                }
            }
            instr.deps.add_wt_bar_mask(wait_mask);

            if instr.needs_yield() {
                instr.deps.set_yield(true);
            }

            if instr.has_fixed_latency(sm.sm()) {
                continue;
            }

            let (rd_dep, wr_dep) = deps.get_instr_deps(bi, ip);
            if deps.dep_is_waited_after(rd_dep, bi, ip) {
                let rd_bar = bars.try_find_free_bar().unwrap_or_else(|| {
                    let bar = bars.free_some_bar();
                    instr.deps.add_wt_bar(bar);
                    bar
                });
                bars.set_bar_dep(rd_bar, rd_dep);
                instr.deps.set_rd_bar(rd_bar);
            }
            if deps.dep_is_waited_after(wr_dep, bi, ip) {
                let wr_bar = bars.try_find_free_bar().unwrap_or_else(|| {
                    let bar = bars.free_some_bar();
                    instr.deps.add_wt_bar(bar);
                    bar
                });
                bars.set_bar_dep(wr_bar, wr_dep);
                instr.deps.set_wr_bar(wr_bar);
            }
        }
    }
}

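/// Cycles which must elapse after issuing `op` before the next instruction
/// can issue, independent of any register dependencies.  Only barriers,
/// cache control, and (pre-Volta) control-flow ops need more than 1 cycle.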
fn exec_latency(sm: u8, op: &Op) -> u32 {
    if sm >= 70 {
        match op {
            Op::Bar(_) | Op::MemBar(_) => {
                if sm >= 80 {
                    6
                } else {
                    5
                }
            }
            Op::CCtl(_op) => {
                // CCTL.C needs 8, CCTL.I needs 11
                11
            }
            // Op::DepBar(_) => 4,
            _ => 1, // TODO: co-issue
        }
    } else {
        match op {
            Op::CCtl(_)
            | Op::MemBar(_)
            | Op::Bra(_)
            | Op::SSy(_)
            | Op::Sync(_)
            | Op::Brk(_)
            | Op::PBk(_)
            | Op::Cont(_)
            | Op::PCnt(_)
            | Op::Exit(_)
            | Op::Bar(_)
            | Op::Kill(_)
            | Op::OutFinal(_) => 13,
            _ => 1,
        }
    }
}

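/// Estimated cycles from issue until the `dst_idx`th destination of `op` is
/// written, based purely on the destination register file.  As the comment
/// below admits, these are rough guesses rather than measured latencies.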
fn instr_latency(op: &Op, dst_idx: usize) -> u32 {
    let file = match op.dsts_as_slice()[dst_idx] {
        Dst::None => return 0,
        Dst::SSA(vec) => vec.file().unwrap(),
        Dst::Reg(reg) => reg.file(),
    };

    // This is BS and we know it
    match file {
        RegFile::GPR => 6,
        RegFile::UGPR => 12,
        RegFile::Pred => 13,
        RegFile::UPred => 11,
        RegFile::Bar => 0, // Barriers have a HW scoreboard
        RegFile::Carry => 6,
        RegFile::Mem => panic!("Not a register"),
    }
}

/// Read-after-write latency
fn raw_latency(
    _sm: u8,
    write: &Op,
    dst_idx: usize,
    _read: &Op,
    _src_idx: usize,
) -> u32 {
    instr_latency(write, dst_idx)
}

/// Write-after-read latency
fn war_latency(
    _sm: u8,
    _read: &Op,
    _src_idx: usize,
    _write: &Op,
    _dst_idx: usize,
) -> u32 {
    // We assume the source gets read in the first 4 cycles.  We don't know
    // how quickly the write will happen.  This is all a guess.
    4
}

/// Write-after-write latency
fn waw_latency(
    _sm: u8,
    a: &Op,
    a_dst_idx: usize,
    _b: &Op,
    _b_dst_idx: usize,
) -> u32 {
    // We know our latencies are wrong, so assume the write could happen
    // anywhere between 0 and instr_latency(a) cycles
    instr_latency(a, a_dst_idx)
}

/// Predicate read-after-write latency
fn paw_latency(_sm: u8, _write: &Op, _dst_idx: usize) -> u32 {
    13
}

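/// Computes the delay field of every instruction by walking each block in
/// reverse and tracking, per register, the cycle at which each later use
/// executes.  An instruction's start cycle is pushed back until it covers
/// its exec latency, any barrier waits, and the RaW/WaR/WaW latencies of
/// every later use of its sources and destinations; the gap between it and
/// the following instruction then becomes its delay, clamped to
/// [MIN_INSTR_DELAY, MAX_INSTR_DELAY].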
fn calc_delays(f: &mut Function, sm: &dyn ShaderModel) {
    for b in f.blocks.iter_mut().rev() {
        let mut cycle = 0_u32;

        // Vector mapping IP to start cycle
        let mut instr_cycle = Vec::new();
        instr_cycle.resize(b.instrs.len(), 0_u32);

        // Maps registers to RegUse<(ip, src_dst_idx)>.  Predicates are
        // represented by src_idx = usize::MAX.
        let mut uses: RegTracker<RegUse<(usize, usize)>> =
            RegTracker::new_with(&|| RegUse::None);

        // Map from barrier to last waited cycle
        let mut bars = [0_u32; 6];

        for ip in (0..b.instrs.len()).rev() {
            let instr = &b.instrs[ip];
            let mut min_start = cycle + exec_latency(sm.sm(), &instr.op);
            if let Some(bar) = instr.deps.rd_bar() {
                min_start = max(min_start, bars[usize::from(bar)] + 2);
            }
            if let Some(bar) = instr.deps.wr_bar() {
                min_start = max(min_start, bars[usize::from(bar)] + 2);
            }
            uses.for_each_instr_dst_mut(instr, |i, u| match u {
                RegUse::None => {
                    // We don't know how it will be used but it may be used in
                    // the next block so we need to at least assume the
                    // maximum destination latency from the end of the block.
                    let s = instr_latency(&instr.op, i);
                    min_start = max(min_start, s);
                }
                RegUse::Write((w_ip, w_dst_idx)) => {
                    let s = instr_cycle[*w_ip]
                        + waw_latency(
                            sm.sm(),
                            &instr.op,
                            i,
                            &b.instrs[*w_ip].op,
                            *w_dst_idx,
                        );
                    min_start = max(min_start, s);
                }
                RegUse::Reads(reads) => {
                    for (r_ip, r_src_idx) in reads {
                        let c = instr_cycle[*r_ip];
                        let s = if *r_src_idx == usize::MAX {
                            c + paw_latency(sm.sm(), &instr.op, i)
                        } else {
                            c + raw_latency(
                                sm.sm(),
                                &instr.op,
                                i,
                                &b.instrs[*r_ip].op,
                                *r_src_idx,
                            )
                        };
                        min_start = max(min_start, s);
                    }
                }
            });
            uses.for_each_instr_src_mut(instr, |i, u| match u {
                RegUse::None => (),
                RegUse::Write((w_ip, w_dst_idx)) => {
                    let s = instr_cycle[*w_ip]
                        + war_latency(
                            sm.sm(),
                            &instr.op,
                            i,
                            &b.instrs[*w_ip].op,
                            *w_dst_idx,
                        );
                    min_start = max(min_start, s);
                }
                RegUse::Reads(_) => (),
            });

            let instr = &mut b.instrs[ip];

            let delay = min_start - cycle;
            let delay = delay
                .clamp(MIN_INSTR_DELAY.into(), MAX_INSTR_DELAY.into())
                .try_into()
                .unwrap();
            instr.deps.set_delay(delay);

            instr_cycle[ip] = min_start;
            uses.for_each_instr_pred_mut(instr, |c| {
                c.add_read((ip, usize::MAX));
            });
            uses.for_each_instr_src_mut(instr, |i, c| {
                c.add_read((ip, i));
            });
            uses.for_each_instr_dst_mut(instr, |i, c| {
                c.set_write((ip, i));
            });
            for (bar, c) in bars.iter_mut().enumerate() {
                if instr.deps.wt_bar_mask & (1 << bar) != 0 {
                    *c = min_start;
                }
            }

            cycle = min_start;
        }
    }

    // It's unclear exactly why, but the blob inserts a Nop with a delay of 2
    // after every instruction which has an exec latency.  Perhaps it has
    // something to do with .yld?  In any case, the extra 2 cycles aren't
    // worth the chance of weird bugs.
    f.map_instrs(|mut instr, _| {
        if matches!(instr.op, Op::SrcBar(_)) {
            instr.op = Op::Nop(OpNop { label: None });
            MappedInstrs::One(instr)
        } else if exec_latency(sm.sm(), &instr.op) > 1 {
            let mut nop = Instr::new_boxed(OpNop { label: None });
            nop.deps.set_delay(2);
            MappedInstrs::Many(vec![instr, nop])
        } else {
            MappedInstrs::One(instr)
        }
    });
}

impl Shader<'_> {
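    /// Debug-only fallback which fully serializes the shader: every
    /// instruction writes through barrier 0, reads through barrier 1, and
    /// waits on whichever of those barriers earlier instructions have used,
    /// so nothing overlaps.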
    pub fn assign_deps_serial(&mut self) {
        for f in &mut self.functions {
            for b in f.blocks.iter_mut().rev() {
                let mut wt = 0_u8;
                for instr in &mut b.instrs {
                    if matches!(&instr.op, Op::Bar(_))
                        || matches!(&instr.op, Op::BClear(_))
                        || matches!(&instr.op, Op::BSSy(_))
                        || matches!(&instr.op, Op::BSync(_))
                    {
                        instr.deps.set_yield(true);
                    } else if instr.is_branch() {
                        instr.deps.add_wt_bar_mask(0x3f);
                    } else {
                        instr.deps.add_wt_bar_mask(wt);
                        if instr.dsts().len() > 0 {
                            instr.deps.set_wr_bar(0);
                            wt |= 1 << 0;
                        }
                        if !instr.pred.pred_ref.is_none()
                            || instr.srcs().len() > 0
                        {
                            instr.deps.set_rd_bar(1);
                            wt |= 1 << 1;
                        }
                    }
                }
            }
        }
    }

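    /// Entry point for the pass: with the serial debug flag set, fall back
    /// to assign_deps_serial(); otherwise run barrier assignment followed by
    /// delay calculation on each function.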
    pub fn calc_instr_deps(&mut self) {
        if DEBUG.serial() {
            self.assign_deps_serial();
        } else {
            for f in &mut self.functions {
                assign_barriers(f, self.sm);
                calc_delays(f, self.sm);
            }
        }
    }
}