1 // Copyright © 2022 Collabora, Ltd.
2 // SPDX-License-Identifier: MIT
3
4 #![allow(non_upper_case_globals)]
5
6 use crate::api::GetDebugFlags;
7 use crate::api::DEBUG;
8 use crate::builder::*;
9 use crate::ir::*;
10 use crate::nir_instr_printer::NirInstrPrinter;
11 use crate::sph::{OutputTopology, PixelImap};
12
13 use nak_bindings::*;
14
15 use compiler::bindings::*;
16 use compiler::cfg::CFGBuilder;
17 use compiler::nir::*;
18 use std::cmp::max;
19 use std::collections::{HashMap, HashSet};
20 use std::ops::Index;
21
22 fn init_info_from_nir(nir: &nir_shader) -> ShaderInfo {
23 ShaderInfo {
24 num_gprs: 0,
25 num_instrs: 0,
26 num_control_barriers: 0,
27 slm_size: nir.scratch_size,
28 max_crs_depth: 0,
29 uses_global_mem: false,
30 writes_global_mem: false,
31 // TODO: handle this.
32 uses_fp64: false,
33 stage: match nir.info.stage() {
34 MESA_SHADER_COMPUTE => {
35 ShaderStageInfo::Compute(ComputeShaderInfo {
36 local_size: [
37 nir.info.workgroup_size[0],
38 nir.info.workgroup_size[1],
39 nir.info.workgroup_size[2],
40 ],
41 smem_size: nir.info.shared_size.try_into().unwrap(),
42 })
43 }
44 MESA_SHADER_VERTEX => ShaderStageInfo::Vertex,
45 MESA_SHADER_FRAGMENT => {
46 let info_fs = unsafe { &nir.info.__bindgen_anon_1.fs };
47 ShaderStageInfo::Fragment(FragmentShaderInfo {
48 uses_kill: false,
49 does_interlock: false,
50 post_depth_coverage: info_fs.post_depth_coverage(),
51 early_fragment_tests: info_fs.early_fragment_tests(),
52 uses_sample_shading: info_fs.uses_sample_shading(),
53 })
54 }
55 MESA_SHADER_GEOMETRY => {
56 let info_gs = unsafe { &nir.info.__bindgen_anon_1.gs };
57 let output_topology = match info_gs.output_primitive {
58 MESA_PRIM_POINTS => OutputTopology::PointList,
59 MESA_PRIM_LINE_STRIP => OutputTopology::LineStrip,
60 MESA_PRIM_TRIANGLE_STRIP => OutputTopology::TriangleStrip,
61 _ => panic!(
62 "Invalid GS output primitive {}",
63 info_gs.output_primitive
64 ),
65 };
66
67 ShaderStageInfo::Geometry(GeometryShaderInfo {
68 // TODO: Should be set if VK_NV_geometry_shader_passthrough is in use.
69 passthrough_enable: false,
70 stream_out_mask: info_gs.active_stream_mask(),
71 threads_per_input_primitive: info_gs.invocations,
72 output_topology: output_topology,
73 max_output_vertex_count: info_gs.vertices_out,
74 })
75 }
76 MESA_SHADER_TESS_CTRL => {
77 let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
78 ShaderStageInfo::TessellationInit(TessellationInitShaderInfo {
79 per_patch_attribute_count: 6,
80 threads_per_patch: info_tess.tcs_vertices_out,
81 })
82 }
83 MESA_SHADER_TESS_EVAL => {
84 let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
85 ShaderStageInfo::Tessellation(TessellationShaderInfo {
86 domain: match info_tess._primitive_mode {
87 TESS_PRIMITIVE_TRIANGLES => {
88 TessellationDomain::Triangle
89 }
90 TESS_PRIMITIVE_QUADS => TessellationDomain::Quad,
91 TESS_PRIMITIVE_ISOLINES => TessellationDomain::Isoline,
92 _ => panic!("Invalid tess_primitive_mode"),
93 },
94 spacing: match info_tess.spacing() {
95 TESS_SPACING_EQUAL => TessellationSpacing::Integer,
96 TESS_SPACING_FRACTIONAL_ODD => {
97 TessellationSpacing::FractionalOdd
98 }
99 TESS_SPACING_FRACTIONAL_EVEN => {
100 TessellationSpacing::FractionalEven
101 }
102 _ => panic!("Invalid gl_tess_spacing"),
103 },
104 primitives: if info_tess.point_mode() {
105 TessellationPrimitives::Points
106 } else if info_tess._primitive_mode
107 == TESS_PRIMITIVE_ISOLINES
108 {
109 TessellationPrimitives::Lines
110 } else if info_tess.ccw() {
111 TessellationPrimitives::TrianglesCCW
112 } else {
113 TessellationPrimitives::TrianglesCW
114 },
115 })
116 }
117 _ => panic!("Unknown shader stage"),
118 },
119 io: match nir.info.stage() {
120 MESA_SHADER_COMPUTE => ShaderIoInfo::None,
121 MESA_SHADER_FRAGMENT => ShaderIoInfo::Fragment(FragmentIoInfo {
122 sysvals_in: SysValInfo {
123 // Required on fragment shaders, otherwise it causes a trap.
124 ab: 1 << 31,
125 c: 0,
126 },
127 sysvals_in_d: [PixelImap::Unused; 8],
128 attr_in: [PixelImap::Unused; 128],
129 barycentric_attr_in: [0; 4],
130 reads_sample_mask: false,
131 writes_color: 0,
132 writes_sample_mask: false,
133 writes_depth: false,
134 }),
135 MESA_SHADER_VERTEX
136 | MESA_SHADER_GEOMETRY
137 | MESA_SHADER_TESS_CTRL
138 | MESA_SHADER_TESS_EVAL => {
139 let num_clip = nir.info.clip_distance_array_size();
140 let num_cull = nir.info.cull_distance_array_size();
141 let clip_enable = (1_u32 << num_clip) - 1;
142 let cull_enable = ((1_u32 << num_cull) - 1) << num_clip;
143
144 ShaderIoInfo::Vtg(VtgIoInfo {
145 sysvals_in: SysValInfo::default(),
146 sysvals_in_d: 0,
147 sysvals_out: SysValInfo::default(),
148 sysvals_out_d: 0,
149 attr_in: [0; 4],
150 attr_out: [0; 4],
151
152 // TODO: figure out how to fill this.
153 store_req_start: u8::MAX,
154 store_req_end: 0,
155
156 clip_enable: clip_enable.try_into().unwrap(),
157 cull_enable: cull_enable.try_into().unwrap(),
158 xfb: if nir.xfb_info.is_null() {
159 None
160 } else {
161 Some(Box::new(unsafe {
162 nak_xfb_from_nir(nir.xfb_info)
163 }))
164 },
165 })
166 }
167 _ => panic!("Unknown shader stage"),
168 },
169 }
170 }
171
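// Allocates the SSA values backing a NIR def: one predicate per component for
// 1-bit values, otherwise one GPR per 32 bits of data, so a 64-bit scalar
// takes two GPRs and packed 8/16-bit vectors share dwords.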
172 fn alloc_ssa_for_nir(b: &mut impl SSABuilder, ssa: &nir_def) -> Vec<SSAValue> {
173 let (file, comps) = if ssa.bit_size == 1 {
174 (RegFile::Pred, ssa.num_components)
175 } else {
176 let bits = ssa.bit_size * ssa.num_components;
177 (RegFile::GPR, bits.div_ceil(32))
178 };
179
180 let mut vec = Vec::new();
181 for _ in 0..comps {
182 vec.push(b.alloc_ssa(file, 1)[0]);
183 }
184 vec
185 }
186
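// Maps a (NIR phi def index, component) pair to a NAK phi id, allocating a
// fresh id from the underlying PhiAllocator on first use.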
187 struct PhiAllocMap<'a> {
188 alloc: &'a mut PhiAllocator,
189 map: HashMap<(u32, u8), u32>,
190 }
191
192 impl<'a> PhiAllocMap<'a> {
193 fn new(alloc: &'a mut PhiAllocator) -> PhiAllocMap<'a> {
194 PhiAllocMap {
195 alloc: alloc,
196 map: HashMap::new(),
197 }
198 }
199
200 fn get_phi_id(&mut self, phi: &nir_phi_instr, comp: u8) -> u32 {
201 *self
202 .map
203 .entry((phi.def.index, comp))
204 .or_insert_with(|| self.alloc.alloc())
205 }
206 }
207
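// Flush-to-zero and rounding-mode controls, tracked separately for each
// floating-point bit size.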
208 struct PerSizeFloatControls {
209 pub ftz: bool,
210 pub rnd_mode: FRndMode,
211 }
212
213 struct ShaderFloatControls {
214 pub fp16: PerSizeFloatControls,
215 pub fp32: PerSizeFloatControls,
216 pub fp64: PerSizeFloatControls,
217 }
218
219 impl Default for ShaderFloatControls {
220 fn default() -> Self {
221 Self {
222 fp16: PerSizeFloatControls {
223 ftz: false,
224 rnd_mode: FRndMode::NearestEven,
225 },
226 fp32: PerSizeFloatControls {
227 ftz: true, // Default FTZ on fp32
228 rnd_mode: FRndMode::NearestEven,
229 },
230 fp64: PerSizeFloatControls {
231 ftz: false,
232 rnd_mode: FRndMode::NearestEven,
233 },
234 }
235 }
236 }
237
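// Decodes nir.info.float_controls_execution_mode into per-size controls in
// from_nir() below; modes that NIR leaves unspecified keep the defaults above
// (FTZ on fp32, round-to-nearest-even everywhere).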
238 impl ShaderFloatControls {
239 fn from_nir(nir: &nir_shader) -> ShaderFloatControls {
240 let nir_fc = nir.info.float_controls_execution_mode;
241 let mut fc: ShaderFloatControls = Default::default();
242
243 if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) != 0 {
244 fc.fp16.ftz = false;
245 } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) != 0 {
246 fc.fp16.ftz = true;
247 }
248 if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16) != 0 {
249 fc.fp16.rnd_mode = FRndMode::NearestEven;
250 } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) != 0 {
251 fc.fp16.rnd_mode = FRndMode::Zero;
252 }
253
254 if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) != 0 {
255 fc.fp32.ftz = false;
256 } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) != 0 {
257 fc.fp32.ftz = true;
258 }
259 if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) != 0 {
260 fc.fp32.rnd_mode = FRndMode::NearestEven;
261 } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) != 0 {
262 fc.fp32.rnd_mode = FRndMode::Zero;
263 }
264
265 if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) != 0 {
266 fc.fp64.ftz = false;
267 } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) != 0 {
268 fc.fp64.ftz = true;
269 }
270 if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) != 0 {
271 fc.fp64.rnd_mode = FRndMode::NearestEven;
272 } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) != 0 {
273 fc.fp64.rnd_mode = FRndMode::Zero;
274 }
275
276 fc
277 }
278 }
279
280 impl Index<FloatType> for ShaderFloatControls {
281 type Output = PerSizeFloatControls;
282
283 fn index(&self, idx: FloatType) -> &PerSizeFloatControls {
284 match idx {
285 FloatType::F16 => &self.fp16,
286 FloatType::F32 => &self.fp32,
287 FloatType::F64 => &self.fp64,
288 }
289 }
290 }
291
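// Kinds of entries pushed onto the control-flow reconvergence stack: Sync for
// if/else reconvergence, Brk for loop breaks, Cont for loop continues.  The
// deepest the stack ever gets is recorded in ShaderInfo::max_crs_depth.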
292 #[derive(Clone, Copy, Eq, Hash, PartialEq)]
293 enum SyncType {
294 Sync,
295 Brk,
296 Cont,
297 }
298
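// Per-shader translation state: the CFG under construction, label and phi
// bookkeeping, the NIR-index-to-SSA map, and the set of defs whose only uses
// are fsat so saturation can be folded into the producing instruction.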
299 struct ShaderFromNir<'a> {
300 nir: &'a nir_shader,
301 sm: &'a dyn ShaderModel,
302 info: ShaderInfo,
303 float_ctl: ShaderFloatControls,
304 cfg: CFGBuilder<u32, BasicBlock>,
305 label_alloc: LabelAllocator,
306 block_label: HashMap<u32, Label>,
307 bar_label: HashMap<u32, Label>,
308 sync_blocks: HashSet<u32>,
309 crs: Vec<(u32, SyncType)>,
310 fs_out_regs: [SSAValue; 34],
311 end_block_id: u32,
312 ssa_map: HashMap<u32, Vec<SSAValue>>,
313 saturated: HashSet<*const nir_def>,
314 nir_instr_printer: NirInstrPrinter,
315 }
316
317 impl<'a> ShaderFromNir<'a> {
318 fn new(nir: &'a nir_shader, sm: &'a dyn ShaderModel) -> Self {
319 Self {
320 nir: nir,
321 sm: sm,
322 info: init_info_from_nir(nir),
323 float_ctl: ShaderFloatControls::from_nir(nir),
324 cfg: CFGBuilder::new(),
325 label_alloc: LabelAllocator::new(),
326 block_label: HashMap::new(),
327 bar_label: HashMap::new(),
328 sync_blocks: HashSet::new(),
329 crs: Vec::new(),
330 fs_out_regs: [SSAValue::NONE; 34],
331 end_block_id: 0,
332 ssa_map: HashMap::new(),
333 saturated: HashSet::new(),
334 nir_instr_printer: NirInstrPrinter::new(),
335 }
336 }
337
338 fn get_block_label(&mut self, block: &nir_block) -> Label {
339 *self
340 .block_label
341 .entry(block.index)
342 .or_insert_with(|| self.label_alloc.alloc())
343 }
344
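// Pushes a reconvergence point for the given block and bumps max_crs_depth,
// which tracks the deepest the stack ever gets.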
345 fn push_crs(&mut self, target: &nir_block, sync_type: SyncType) {
346 self.sync_blocks.insert(target.index);
347 self.crs.push((target.index, sync_type));
348 let crs_depth = u32::try_from(self.crs.len()).unwrap();
349 self.info.max_crs_depth = max(self.info.max_crs_depth, crs_depth);
350 }
351
352 fn pop_crs(&mut self, target: &nir_block, sync_type: SyncType) {
353 if let Some((top_index, top_sync_type)) = self.crs.pop() {
354 assert!(top_index == target.index);
355 assert!(top_sync_type == sync_type);
356 } else {
357 panic!("Tried to pop an empty stack");
358 }
359 }
360
361 fn peek_crs(&self, target: &nir_block) -> Option<SyncType> {
362 for (i, (index, sync_type)) in self.crs.iter().enumerate().rev() {
363 if *index != target.index {
364 continue;
365 }
366
367 match sync_type {
368 SyncType::Sync => {
369 // Sync must always be top-of-stack
370 assert!(i == self.crs.len() - 1);
371 }
372 SyncType::Brk => {
373 // Brk cannot skip over another Brk
374 for (_, inner_sync) in &self.crs[(i + 1)..] {
375 assert!(*inner_sync != SyncType::Brk);
376 }
377 }
378 SyncType::Cont => {
379 // Cont can only skip over Sync
380 for (_, inner_sync) in &self.crs[(i + 1)..] {
381 assert!(*inner_sync == SyncType::Sync);
382 }
383 }
384 }
385
386 return Some(*sync_type);
387 }
388
389 assert!(!self.sync_blocks.contains(&target.index));
390 None
391 }
392
393 fn get_ssa(&mut self, ssa: &nir_def) -> &[SSAValue] {
394 self.ssa_map.get(&ssa.index).unwrap()
395 }
396
397 fn set_ssa(&mut self, def: &nir_def, vec: Vec<SSAValue>) {
398 if def.bit_size == 1 {
399 for s in &vec {
400 assert!(s.is_predicate());
401 }
402 } else {
403 for s in &vec {
404 assert!(!s.is_predicate());
405 }
406 let bits =
407 usize::from(def.bit_size) * usize::from(def.num_components);
408 assert!(vec.len() == bits.div_ceil(32));
409 }
410 self.ssa_map
411 .entry(def.index)
412 .and_modify(|_| panic!("Cannot set an SSA def twice"))
413 .or_insert(vec);
414 }
415
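// Returns the 32-bit SSA value(s) holding component c of a def, along with
// the byte offset of that component within its dword (non-zero only for 8-
// and 16-bit defs).  A 64-bit component comes back as a pair of dwords.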
416 fn get_ssa_comp(&mut self, def: &nir_def, c: u8) -> (SSARef, u8) {
417 let vec = self.get_ssa(def);
418 match def.bit_size {
419 1 => (vec[usize::from(c)].into(), 0),
420 8 => (vec[usize::from(c / 4)].into(), c % 4),
421 16 => (vec[usize::from(c / 2)].into(), (c * 2) % 4),
422 32 => (vec[usize::from(c)].into(), 0),
423 64 => {
424 let comps =
425 [vec[usize::from(c) * 2 + 0], vec[usize::from(c) * 2 + 1]];
426 (comps.into(), 0)
427 }
428 _ => panic!("Unsupported bit size: {}", def.bit_size),
429 }
430 }
431
432 fn get_ssa_ref(&mut self, src: &nir_src) -> SSARef {
433 SSARef::try_from(self.get_ssa(src.as_def())).unwrap()
434 }
435
436 fn get_src(&mut self, src: &nir_src) -> Src {
437 self.get_ssa_ref(src).into()
438 }
439
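// Splits an I/O address into a base source plus an immediate offset that fits
// in imm_bits, using the shared nak_get_io_addr_offset() helper.  If nothing
// remains for the base, the zero register is used.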
440 fn get_io_addr_offset(
441 &mut self,
442 addr: &nir_src,
443 imm_bits: u8,
444 ) -> (Src, i32) {
445 let addr = addr.as_def();
446 let addr_offset = unsafe {
447 nak_get_io_addr_offset(addr as *const _ as *mut _, imm_bits)
448 };
449
450 if let Some(base_def) = std::ptr::NonNull::new(addr_offset.base.def) {
451 let base_def = unsafe { base_def.as_ref() };
452 let base_comp = u8::try_from(addr_offset.base.comp).unwrap();
453 let (base, _) = self.get_ssa_comp(base_def, base_comp);
454 (base.into(), addr_offset.offset)
455 } else {
456 (SrcRef::Zero.into(), addr_offset.offset)
457 }
458 }
459
460 fn get_cbuf_addr_offset(&mut self, addr: &nir_src) -> (Src, u16) {
461 let (off, off_imm) = self.get_io_addr_offset(addr, 16);
462 if let Ok(off_imm_u16) = u16::try_from(off_imm) {
463 (off, off_imm_u16)
464 } else {
465 (self.get_src(addr), 0)
466 }
467 }
468
469 fn set_dst(&mut self, def: &nir_def, ssa: SSARef) {
470 self.set_ssa(def, (*ssa).into());
471 }
472
473 fn try_saturate_alu_dst(&mut self, def: &nir_def) -> bool {
474 if def.all_uses_are_fsat() {
475 self.saturated.insert(def as *const _);
476 true
477 } else {
478 false
479 }
480 }
481
482 fn alu_src_is_saturated(&self, src: &nir_alu_src) -> bool {
483 self.saturated
484 .get(&(src.src.as_def() as *const _))
485 .is_some()
486 }
487
488 fn parse_alu(&mut self, b: &mut impl SSABuilder, alu: &nir_alu_instr) {
489 // Handle vectors and pack ops as a special case since they're the only
490 // ALU ops that can produce more than 16B. They are also the only ALU
491 // ops which we allow to consume small (8 and 16-bit) vector data
492 // scattered across multiple dwords
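// For example, a vec2 of 16-bit sources is packed into a single 32-bit GPR
// with a PRMT, while a vec2 of 32-bit sources just collects the two dwords
// unchanged.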
493 match alu.op {
494 nir_op_mov
495 | nir_op_pack_32_4x8_split
496 | nir_op_pack_32_2x16_split
497 | nir_op_pack_64_2x32_split
498 | nir_op_vec2
499 | nir_op_vec3
500 | nir_op_vec4
501 | nir_op_vec5
502 | nir_op_vec8
503 | nir_op_vec16 => {
504 let src_bit_size = alu.get_src(0).src.bit_size();
505 let bits = usize::from(alu.def.num_components)
506 * usize::from(alu.def.bit_size);
507
508 // Collect the sources into a vec with src_bit_size per SSA
509 // value in the vec. This implicitly makes 64-bit sources look
510 // like two 32-bit values
511 let mut srcs = Vec::new();
512 if alu.op == nir_op_mov {
513 let src = alu.get_src(0);
514 for c in 0..alu.def.num_components {
515 let s = src.swizzle[usize::from(c)];
516 let (src, byte) =
517 self.get_ssa_comp(src.src.as_def(), s);
518 for ssa in src.iter() {
519 srcs.push((*ssa, byte));
520 }
521 }
522 } else {
523 for src in alu.srcs_as_slice().iter() {
524 let s = src.swizzle[0];
525 let (src, byte) =
526 self.get_ssa_comp(src.src.as_def(), s);
527 for ssa in src.iter() {
528 srcs.push((*ssa, byte));
529 }
530 }
531 }
532
533 let mut comps = Vec::new();
534 match src_bit_size {
535 1 | 32 | 64 => {
536 for (ssa, _) in srcs {
537 comps.push(ssa);
538 }
539 }
540 8 => {
541 for dc in 0..bits.div_ceil(32) {
542 let mut psrc = [Src::new_zero(); 4];
543 let mut psel = [0_u8; 4];
544
545 for b in 0..4 {
546 let sc = dc * 4 + b;
547 if sc < srcs.len() {
548 let (ssa, byte) = srcs[sc];
549 for i in 0..4_u8 {
550 let psrc_i = &mut psrc[usize::from(i)];
551 if *psrc_i == Src::new_zero() {
552 *psrc_i = ssa.into();
553 } else if *psrc_i != Src::from(ssa) {
554 continue;
555 }
556 psel[b] = i * 4 + byte;
// Stop searching once this source has been assigned a permute slot.
break;
557 }
558 }
559 }
560 comps.push(b.prmt4(psrc, psel)[0]);
561 }
562 }
563 16 => {
564 for dc in 0..bits.div_ceil(32) {
565 let mut psrc = [Src::new_zero(); 2];
566 let mut psel = [0_u8; 4];
567
568 for w in 0..2 {
569 let sc = dc * 2 + w;
570 if sc < srcs.len() {
571 let (ssa, byte) = srcs[sc];
572 let w_u8 = u8::try_from(w).unwrap();
573 psrc[w] = ssa.into();
574 psel[w * 2 + 0] = (w_u8 * 4) + byte;
575 psel[w * 2 + 1] = (w_u8 * 4) + byte + 1;
576 }
577 }
578 comps.push(b.prmt(psrc[0], psrc[1], psel)[0]);
579 }
580 }
581 _ => panic!("Unknown bit size: {src_bit_size}"),
582 }
583
584 self.set_ssa(&alu.def, comps);
585 return;
586 }
587 _ => (),
588 }
589
590 let nir_srcs = alu.srcs_as_slice();
591 let mut srcs: Vec<Src> = Vec::new();
592 for (i, alu_src) in nir_srcs.iter().enumerate() {
593 let bit_size = alu_src.src.bit_size();
594 let comps = alu.src_components(i.try_into().unwrap());
595 let ssa = self.get_ssa(alu_src.src.as_def());
596
597 match bit_size {
598 1 => {
599 assert!(comps == 1);
600 let s = usize::from(alu_src.swizzle[0]);
601 srcs.push(ssa[s].into());
602 }
603 8 | 16 => {
604 let num_bytes = usize::from(comps * (bit_size / 8));
605 assert!(num_bytes <= 4);
606
607 let mut bytes = [0_u8; 4];
608 for c in 0..usize::from(comps) {
609 let cs = alu_src.swizzle[c];
610 if bit_size == 8 {
611 bytes[c] = cs;
612 } else {
613 bytes[c * 2 + 0] = cs * 2 + 0;
614 bytes[c * 2 + 1] = cs * 2 + 1;
615 }
616 }
617
618 let mut prmt_srcs = [Src::new_zero(); 4];
619 let mut prmt = [0_u8; 4];
620 for b in 0..num_bytes {
621 for (ds, s) in prmt_srcs.iter_mut().enumerate() {
622 let dw = ssa[usize::from(bytes[b] / 4)];
623 if s.is_zero() {
624 *s = dw.into();
625 } else if *s != Src::from(dw) {
626 continue;
627 }
628 prmt[usize::from(b)] =
629 (ds as u8) * 4 + (bytes[b] % 4);
630 break;
631 }
632 }
633
634 srcs.push(b.prmt4(prmt_srcs, prmt).into());
635 }
636 32 => {
637 assert!(comps == 1);
638 let s = usize::from(alu_src.swizzle[0]);
639 srcs.push(ssa[s].into());
640 }
641 64 => {
642 assert!(comps == 1);
643 let s = usize::from(alu_src.swizzle[0]);
644 srcs.push([ssa[s * 2], ssa[s * 2 + 1]].into());
645 }
646 _ => panic!("Invalid bit size: {bit_size}"),
647 }
648 }
649
650 // Restricts an F16v2 source to just x if the ALU op is single-component. This
651 // must only be called for per-component sources (see nir_op_info::output_sizes
652 // for more details).
653 let restrict_f16v2_src = |mut src: Src| {
654 if alu.def.num_components == 1 {
655 src.src_swizzle = SrcSwizzle::Xx;
656 }
657 src
658 };
659
660 let dst: SSARef = match alu.op {
661 nir_op_b2b1 => {
662 assert!(alu.get_src(0).bit_size() == 32);
663 b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], 0.into())
664 }
665 nir_op_b2b32 | nir_op_b2i8 | nir_op_b2i16 | nir_op_b2i32 => {
666 b.sel(srcs[0].bnot(), 0.into(), 1.into())
667 }
668 nir_op_b2i64 => {
669 let lo = b.sel(srcs[0].bnot(), 0.into(), 1.into());
670 let hi = b.copy(0.into());
671 [lo[0], hi[0]].into()
672 }
673 nir_op_b2f16 => b.sel(srcs[0].bnot(), 0.into(), 0x3c00.into()),
674 nir_op_b2f32 => {
675 b.sel(srcs[0].bnot(), 0.0_f32.into(), 1.0_f32.into())
676 }
677 nir_op_b2f64 => {
678 let lo = b.copy(0.into());
679 let hi = b.sel(srcs[0].bnot(), 0.into(), 0x3ff00000.into());
680 [lo[0], hi[0]].into()
681 }
682 nir_op_bcsel => b.sel(srcs[0], srcs[1], srcs[2]),
683 nir_op_bfm => {
684 let dst = b.alloc_ssa(RegFile::GPR, 1);
685 b.push_op(OpBMsk {
686 dst: dst.into(),
687 pos: srcs[1],
688 width: srcs[0],
689 wrap: true,
690 });
691 dst
692 }
693 nir_op_bit_count => {
694 let dst = b.alloc_ssa(RegFile::GPR, 1);
695 b.push_op(OpPopC {
696 dst: dst.into(),
697 src: srcs[0],
698 });
699 dst
700 }
701 nir_op_bitfield_reverse => b.brev(srcs[0]),
702 nir_op_ibitfield_extract | nir_op_ubitfield_extract => {
703 let range = b.alloc_ssa(RegFile::GPR, 1);
704 b.push_op(OpPrmt {
705 dst: range.into(),
706 srcs: [srcs[1], srcs[2]],
707 sel: 0x0040.into(),
708 mode: PrmtMode::Index,
709 });
710
711 let dst = b.alloc_ssa(RegFile::GPR, 1);
712 b.push_op(OpBfe {
713 dst: dst.into(),
714 base: srcs[0],
715 signed: !matches!(alu.op, nir_op_ubitfield_extract),
716 range: range.into(),
717 reverse: false,
718 });
719 dst
720 }
721 nir_op_extract_u8 | nir_op_extract_i8 | nir_op_extract_u16
722 | nir_op_extract_i16 => {
723 let src1 = alu.get_src(1);
724 let elem = src1.src.comp_as_uint(src1.swizzle[0]).unwrap();
725 let elem = u8::try_from(elem).unwrap();
726
727 match alu.op {
728 nir_op_extract_u8 => {
729 assert!(elem < 4);
730 let byte = elem;
731 let zero = 4;
732 b.prmt(srcs[0], 0.into(), [byte, zero, zero, zero])
733 }
734 nir_op_extract_i8 => {
735 assert!(elem < 4);
736 let byte = elem;
737 let sign = byte | 0x8;
738 b.prmt(srcs[0], 0.into(), [byte, sign, sign, sign])
739 }
740 nir_op_extract_u16 => {
741 assert!(elem < 2);
742 let byte = elem * 2;
743 let zero = 4;
744 b.prmt(srcs[0], 0.into(), [byte, byte + 1, zero, zero])
745 }
746 nir_op_extract_i16 => {
747 assert!(elem < 2);
748 let byte = elem * 2;
749 let sign = (byte + 1) | 0x8;
750 b.prmt(srcs[0], 0.into(), [byte, byte + 1, sign, sign])
751 }
752 _ => panic!("Unknown extract op: {}", alu.op),
753 }
754 }
755 nir_op_f2f16 | nir_op_f2f16_rtne | nir_op_f2f16_rtz
756 | nir_op_f2f32 | nir_op_f2f64 => {
757 let src_bits = alu.get_src(0).src.bit_size();
758 let dst_bits = alu.def.bit_size();
759 let src_type = FloatType::from_bits(src_bits.into());
760 let dst_type = FloatType::from_bits(dst_bits.into());
761
762 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
763 b.push_op(OpF2F {
764 dst: dst.into(),
765 src: srcs[0],
766 src_type: FloatType::from_bits(src_bits.into()),
767 dst_type: dst_type,
768 rnd_mode: match alu.op {
769 nir_op_f2f16_rtne => FRndMode::NearestEven,
770 nir_op_f2f16_rtz => FRndMode::Zero,
771 _ => self.float_ctl[dst_type].rnd_mode,
772 },
773 ftz: if src_bits < dst_bits {
774 self.float_ctl[src_type].ftz
775 } else {
776 self.float_ctl[dst_type].ftz
777 },
778 high: false,
779 integer_rnd: false,
780 });
781 dst
782 }
783 nir_op_find_lsb => {
784 let rev = b.brev(srcs[0]);
785 let dst = b.alloc_ssa(RegFile::GPR, 1);
786 b.push_op(OpFlo {
787 dst: dst.into(),
788 src: rev.into(),
789 signed: false,
790 return_shift_amount: true,
791 });
792 dst
793 }
794 nir_op_f2i8 | nir_op_f2i16 | nir_op_f2i32 | nir_op_f2i64
795 | nir_op_f2u8 | nir_op_f2u16 | nir_op_f2u32 | nir_op_f2u64 => {
796 let src_bits = usize::from(alu.get_src(0).bit_size());
797 let dst_bits = alu.def.bit_size();
798 let src_type = FloatType::from_bits(src_bits);
799 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
800 let dst_is_signed = alu.info().output_type & 2 != 0;
801 let dst_type =
802 IntType::from_bits(dst_bits.into(), dst_is_signed);
803 if b.sm() < 70 && dst_bits == 8 {
804 // F2I doesn't support 8-bit destinations pre-Volta
805 let tmp = b.alloc_ssa(RegFile::GPR, 1);
806 let tmp_type = IntType::from_bits(32, dst_is_signed);
807 b.push_op(OpF2I {
808 dst: tmp.into(),
809 src: srcs[0],
810 src_type,
811 dst_type: tmp_type,
812 rnd_mode: FRndMode::Zero,
813 ftz: self.float_ctl[src_type].ftz,
814 });
815 b.push_op(OpI2I {
816 dst: dst.into(),
817 src: tmp.into(),
818 src_type: tmp_type,
819 dst_type,
820 saturate: true,
821 abs: false,
822 neg: false,
823 });
824 } else {
825 b.push_op(OpF2I {
826 dst: dst.into(),
827 src: srcs[0],
828 src_type,
829 dst_type,
830 rnd_mode: FRndMode::Zero,
831 ftz: self.float_ctl[src_type].ftz,
832 });
833 }
834 dst
835 }
836 nir_op_fabs | nir_op_fadd | nir_op_fneg => {
837 let (x, y) = match alu.op {
838 nir_op_fabs => (Src::new_zero().fneg(), srcs[0].fabs()),
839 nir_op_fadd => (srcs[0], srcs[1]),
840 nir_op_fneg => (Src::new_zero().fneg(), srcs[0].fneg()),
841 _ => panic!("Unhandled case"),
842 };
843 let ftype = FloatType::from_bits(alu.def.bit_size().into());
844 let dst;
845 if alu.def.bit_size() == 64 {
846 dst = b.alloc_ssa(RegFile::GPR, 2);
847 b.push_op(OpDAdd {
848 dst: dst.into(),
849 srcs: [x, y],
850 rnd_mode: self.float_ctl[ftype].rnd_mode,
851 });
852 } else if alu.def.bit_size() == 32 {
853 dst = b.alloc_ssa(RegFile::GPR, 1);
854 b.push_op(OpFAdd {
855 dst: dst.into(),
856 srcs: [x, y],
857 saturate: self.try_saturate_alu_dst(&alu.def),
858 rnd_mode: self.float_ctl[ftype].rnd_mode,
859 ftz: self.float_ctl[ftype].ftz,
860 });
861 } else if alu.def.bit_size() == 16 {
862 assert!(
863 self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
864 );
865
866 dst = b.alloc_ssa(RegFile::GPR, 1);
867 b.push_op(OpHAdd2 {
868 dst: dst.into(),
869 srcs: [restrict_f16v2_src(x), restrict_f16v2_src(y)],
870 saturate: self.try_saturate_alu_dst(&alu.def),
871 ftz: self.float_ctl[ftype].ftz,
872 f32: false,
873 });
874 } else {
875 panic!("Unsupported float type: f{}", alu.def.bit_size());
876 }
877 dst
878 }
879 nir_op_fceil | nir_op_ffloor | nir_op_fround_even
880 | nir_op_ftrunc => {
881 let dst = b.alloc_ssa(RegFile::GPR, 1);
882 let ty = FloatType::from_bits(alu.def.bit_size().into());
883 let rnd_mode = match alu.op {
884 nir_op_fceil => FRndMode::PosInf,
885 nir_op_ffloor => FRndMode::NegInf,
886 nir_op_ftrunc => FRndMode::Zero,
887 nir_op_fround_even => FRndMode::NearestEven,
888 _ => unreachable!(),
889 };
890 let ftz = self.float_ctl[ty].ftz;
891 if b.sm() >= 70 {
892 assert!(
893 alu.def.bit_size() == 32 || alu.def.bit_size() == 16
894 );
895 b.push_op(OpFRnd {
896 dst: dst.into(),
897 src: srcs[0],
898 src_type: ty,
899 dst_type: ty,
900 rnd_mode,
901 ftz,
902 });
903 } else {
904 assert!(alu.def.bit_size() == 32);
905 b.push_op(OpF2F {
906 dst: dst.into(),
907 src: srcs[0],
908 src_type: ty,
909 dst_type: ty,
910 rnd_mode,
911 ftz,
912 integer_rnd: true,
913 high: false,
914 });
915 }
916 dst
917 }
918 nir_op_fcos => b.fcos(srcs[0]),
919 nir_op_feq | nir_op_fge | nir_op_flt | nir_op_fneu => {
920 let src_type =
921 FloatType::from_bits(alu.get_src(0).bit_size().into());
922 let cmp_op = match alu.op {
923 nir_op_feq => FloatCmpOp::OrdEq,
924 nir_op_fge => FloatCmpOp::OrdGe,
925 nir_op_flt => FloatCmpOp::OrdLt,
926 nir_op_fneu => FloatCmpOp::UnordNe,
927 _ => panic!("Unsupported float comparison"),
928 };
929
930 let dst = b.alloc_ssa(RegFile::Pred, alu.def.num_components);
931 if alu.get_src(0).bit_size() == 64 {
932 assert!(alu.def.num_components == 1);
933 b.push_op(OpDSetP {
934 dst: dst.into(),
935 set_op: PredSetOp::And,
936 cmp_op: cmp_op,
937 srcs: [srcs[0], srcs[1]],
938 accum: SrcRef::True.into(),
939 });
940 } else if alu.get_src(0).bit_size() == 32 {
941 assert!(alu.def.num_components == 1);
942 b.push_op(OpFSetP {
943 dst: dst.into(),
944 set_op: PredSetOp::And,
945 cmp_op: cmp_op,
946 srcs: [srcs[0], srcs[1]],
947 accum: SrcRef::True.into(),
948 ftz: self.float_ctl[src_type].ftz,
949 });
950 } else if alu.get_src(0).bit_size() == 16 {
951 assert!(
952 alu.def.num_components == 1
953 || alu.def.num_components == 2
954 );
955
956 let dsts = if alu.def.num_components == 2 {
957 [dst[0].into(), dst[1].into()]
958 } else {
959 [dst[0].into(), Dst::None]
960 };
961
962 b.push_op(OpHSetP2 {
963 dsts,
964 set_op: PredSetOp::And,
965 cmp_op: cmp_op,
966 srcs: [
967 restrict_f16v2_src(srcs[0]),
968 restrict_f16v2_src(srcs[1]),
969 ],
970 accum: SrcRef::True.into(),
971 ftz: self.float_ctl[src_type].ftz,
972 horizontal: false,
973 });
974 } else {
975 panic!(
976 "Unsupported float type: f{}",
977 alu.get_src(0).bit_size()
978 );
979 }
980 dst
981 }
982 nir_op_fexp2 => b.fexp2(srcs[0]),
983 nir_op_ffma => {
984 let ftype = FloatType::from_bits(alu.def.bit_size().into());
985 let dst;
986 if alu.def.bit_size() == 64 {
987 debug_assert!(!self.float_ctl[ftype].ftz);
988 dst = b.alloc_ssa(RegFile::GPR, 2);
989 b.push_op(OpDFma {
990 dst: dst.into(),
991 srcs: [srcs[0], srcs[1], srcs[2]],
992 rnd_mode: self.float_ctl[ftype].rnd_mode,
993 });
994 } else if alu.def.bit_size() == 32 {
995 dst = b.alloc_ssa(RegFile::GPR, 1);
996 b.push_op(OpFFma {
997 dst: dst.into(),
998 srcs: [srcs[0], srcs[1], srcs[2]],
999 saturate: self.try_saturate_alu_dst(&alu.def),
1000 rnd_mode: self.float_ctl[ftype].rnd_mode,
1001 // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
1002 // anyway so only set one of the two bits.
1003 ftz: self.float_ctl[ftype].ftz,
1004 dnz: false,
1005 });
1006 } else if alu.def.bit_size() == 16 {
1007 assert!(
1008 self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
1009 );
1010
1011 dst = b.alloc_ssa(RegFile::GPR, 1);
1012 b.push_op(OpHFma2 {
1013 dst: dst.into(),
1014 srcs: [
1015 restrict_f16v2_src(srcs[0]),
1016 restrict_f16v2_src(srcs[1]),
1017 restrict_f16v2_src(srcs[2]),
1018 ],
1019 saturate: self.try_saturate_alu_dst(&alu.def),
1020 ftz: self.float_ctl[ftype].ftz,
1021 dnz: false,
1022 f32: false,
1023 });
1024 } else {
1025 panic!("Unsupported float type: f{}", alu.def.bit_size());
1026 }
1027 dst
1028 }
1029 nir_op_ffmaz => {
1030 assert!(alu.def.bit_size() == 32);
1031 // DNZ implies FTZ so we need FTZ set or this is invalid
1032 assert!(self.float_ctl.fp32.ftz);
1033 let dst = b.alloc_ssa(RegFile::GPR, 1);
1034 b.push_op(OpFFma {
1035 dst: dst.into(),
1036 srcs: [srcs[0], srcs[1], srcs[2]],
1037 saturate: self.try_saturate_alu_dst(&alu.def),
1038 rnd_mode: self.float_ctl.fp32.rnd_mode,
1039 // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
1040 // anyway so only set one of the two bits.
1041 ftz: false,
1042 dnz: true,
1043 });
1044 dst
1045 }
1046 nir_op_flog2 => {
1047 assert!(alu.def.bit_size() == 32);
1048 b.mufu(MuFuOp::Log2, srcs[0])
1049 }
1050 nir_op_fmax | nir_op_fmin => {
1051 let dst;
1052 if alu.def.bit_size() == 64 {
1053 dst = b.alloc_ssa(RegFile::GPR, 2);
1054 b.push_op(OpDMnMx {
1055 dst: dst.into(),
1056 srcs: [srcs[0], srcs[1]],
1057 min: (alu.op == nir_op_fmin).into(),
1058 });
1059 } else if alu.def.bit_size() == 32 {
1060 dst = b.alloc_ssa(RegFile::GPR, 1);
1061 b.push_op(OpFMnMx {
1062 dst: dst.into(),
1063 srcs: [srcs[0], srcs[1]],
1064 min: (alu.op == nir_op_fmin).into(),
1065 ftz: self.float_ctl.fp32.ftz,
1066 });
1067 } else if alu.def.bit_size() == 16 {
1068 dst = b.alloc_ssa(RegFile::GPR, 1);
1069 b.push_op(OpHMnMx2 {
1070 dst: dst.into(),
1071 srcs: [
1072 restrict_f16v2_src(srcs[0]),
1073 restrict_f16v2_src(srcs[1]),
1074 ],
1075 min: (alu.op == nir_op_fmin).into(),
1076 ftz: self.float_ctl.fp16.ftz,
1077 });
1078 } else {
1079 panic!("Unsupported float type: f{}", alu.def.bit_size());
1080 }
1081 dst
1082 }
1083 nir_op_fmul => {
1084 let ftype = FloatType::from_bits(alu.def.bit_size().into());
1085 let dst;
1086 if alu.def.bit_size() == 64 {
1087 debug_assert!(!self.float_ctl[ftype].ftz);
1088 dst = b.alloc_ssa(RegFile::GPR, 2);
1089 b.push_op(OpDMul {
1090 dst: dst.into(),
1091 srcs: [srcs[0], srcs[1]],
1092 rnd_mode: self.float_ctl[ftype].rnd_mode,
1093 });
1094 } else if alu.def.bit_size() == 32 {
1095 dst = b.alloc_ssa(RegFile::GPR, 1);
1096 b.push_op(OpFMul {
1097 dst: dst.into(),
1098 srcs: [srcs[0], srcs[1]],
1099 saturate: self.try_saturate_alu_dst(&alu.def),
1100 rnd_mode: self.float_ctl[ftype].rnd_mode,
1101 ftz: self.float_ctl[ftype].ftz,
1102 dnz: false,
1103 });
1104 } else if alu.def.bit_size() == 16 {
1105 assert!(
1106 self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
1107 );
1108
1109 dst = b.alloc_ssa(RegFile::GPR, 1);
1110 b.push_op(OpHMul2 {
1111 dst: dst.into(),
1112 srcs: [
1113 restrict_f16v2_src(srcs[0]),
1114 restrict_f16v2_src(srcs[1]),
1115 ],
1116 saturate: self.try_saturate_alu_dst(&alu.def),
1117 ftz: self.float_ctl[ftype].ftz,
1118 dnz: false,
1119 });
1120 } else {
1121 panic!("Unsupported float type: f{}", alu.def.bit_size());
1122 }
1123 dst
1124 }
1125 nir_op_fmulz => {
1126 assert!(alu.def.bit_size() == 32);
1127 // DNZ implies FTZ so we need FTZ set or this is invalid
1128 assert!(self.float_ctl.fp32.ftz);
1129 let dst = b.alloc_ssa(RegFile::GPR, 1);
1130 b.push_op(OpFMul {
1131 dst: dst.into(),
1132 srcs: [srcs[0], srcs[1]],
1133 saturate: self.try_saturate_alu_dst(&alu.def),
1134 rnd_mode: self.float_ctl.fp32.rnd_mode,
1135 // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
1136 // anyway so only set one of the two bits.
1137 ftz: false,
1138 dnz: true,
1139 });
1140 dst
1141 }
1142 nir_op_fquantize2f16 => {
1143 let tmp = b.alloc_ssa(RegFile::GPR, 1);
1144 b.push_op(OpF2F {
1145 dst: tmp.into(),
1146 src: srcs[0],
1147 src_type: FloatType::F32,
1148 dst_type: FloatType::F16,
1149 rnd_mode: FRndMode::NearestEven,
1150 ftz: true,
1151 high: false,
1152 integer_rnd: false,
1153 });
1154 assert!(alu.def.bit_size() == 32);
1155 let dst = b.alloc_ssa(RegFile::GPR, 1);
1156 b.push_op(OpF2F {
1157 dst: dst.into(),
1158 src: tmp.into(),
1159 src_type: FloatType::F16,
1160 dst_type: FloatType::F32,
1161 rnd_mode: FRndMode::NearestEven,
1162 ftz: true,
1163 high: false,
1164 integer_rnd: false,
1165 });
1166 if b.sm() < 70 {
1167 // Pre-Volta, F2F.ftz doesn't flush denorms so we need to do
1168 // that manually
1169 let denorm = b.fsetp(
1170 FloatCmpOp::OrdLt,
1171 srcs[0].fabs(),
1172 0x38800000.into(),
1173 );
1174 // Get the correctly signed zero
1175 let zero =
1176 b.lop2(LogicOp2::And, srcs[0], 0x80000000.into());
1177 b.sel(denorm.into(), zero.into(), dst.into())
1178 } else {
1179 dst
1180 }
1181 }
1182 nir_op_frcp => {
1183 assert!(alu.def.bit_size() == 32);
1184 b.mufu(MuFuOp::Rcp, srcs[0])
1185 }
1186 nir_op_frsq => {
1187 assert!(alu.def.bit_size() == 32);
1188 b.mufu(MuFuOp::Rsq, srcs[0])
1189 }
1190 nir_op_fsat => {
1191 let ftype = FloatType::from_bits(alu.def.bit_size().into());
1192
1193 if self.alu_src_is_saturated(&alu.srcs_as_slice()[0]) {
1194 b.copy(srcs[0])
1195 } else if alu.def.bit_size() == 32 {
1196 let dst = b.alloc_ssa(RegFile::GPR, 1);
1197 b.push_op(OpFAdd {
1198 dst: dst.into(),
1199 srcs: [srcs[0], 0.into()],
1200 saturate: true,
1201 rnd_mode: self.float_ctl[ftype].rnd_mode,
1202 ftz: self.float_ctl[ftype].ftz,
1203 });
1204 dst
1205 } else if alu.def.bit_size() == 16 {
1206 assert!(
1207 self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
1208 );
1209
1210 let dst = b.alloc_ssa(RegFile::GPR, 1);
1211 b.push_op(OpHAdd2 {
1212 dst: dst.into(),
1213 srcs: [restrict_f16v2_src(srcs[0]), 0.into()],
1214 saturate: true,
1215 ftz: self.float_ctl[ftype].ftz,
1216 f32: false,
1217 });
1218 dst
1219 } else {
1220 panic!("Unsupported float type: f{}", alu.def.bit_size());
1221 }
1222 }
1223 nir_op_fsign => {
1224 if alu.def.bit_size() == 64 {
1225 let lz = b.dsetp(FloatCmpOp::OrdLt, srcs[0], 0.into());
1226 let gz = b.dsetp(FloatCmpOp::OrdGt, srcs[0], 0.into());
1227 let hi = b.sel(lz.into(), 0xbff00000.into(), 0.into());
1228 let hi = b.sel(gz.into(), 0x3ff00000.into(), hi.into());
1229 let lo = b.copy(0.into());
1230 [lo[0], hi[0]].into()
1231 } else if alu.def.bit_size() == 32 {
1232 let lz = b.fset(FloatCmpOp::OrdLt, srcs[0], 0.into());
1233 let gz = b.fset(FloatCmpOp::OrdGt, srcs[0], 0.into());
1234 b.fadd(gz.into(), Src::from(lz).fneg())
1235 } else if alu.def.bit_size() == 16 {
1236 let x = restrict_f16v2_src(srcs[0]);
1237
1238 let lz = restrict_f16v2_src(
1239 b.hset2(FloatCmpOp::OrdLt, x, 0.into()).into(),
1240 );
1241 let gz = restrict_f16v2_src(
1242 b.hset2(FloatCmpOp::OrdGt, x, 0.into()).into(),
1243 );
1244
1245 b.hadd2(gz, lz.fneg())
1246 } else {
1247 panic!("Unsupported float type: f{}", alu.def.bit_size());
1248 }
1249 }
1250 nir_op_fsin => b.fsin(srcs[0]),
1251 nir_op_fsqrt => b.mufu(MuFuOp::Sqrt, srcs[0]),
1252 nir_op_i2f16 | nir_op_i2f32 | nir_op_i2f64 => {
1253 let src_bits = alu.get_src(0).src.bit_size();
1254 let dst_bits = alu.def.bit_size();
1255 let dst_type = FloatType::from_bits(dst_bits.into());
1256 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1257 b.push_op(OpI2F {
1258 dst: dst.into(),
1259 src: srcs[0],
1260 dst_type: dst_type,
1261 src_type: IntType::from_bits(src_bits.into(), true),
1262 rnd_mode: self.float_ctl[dst_type].rnd_mode,
1263 });
1264 dst
1265 }
1266 nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32 | nir_op_i2i64
1267 | nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32 | nir_op_u2u64 => {
1268 let src_bits = alu.get_src(0).src.bit_size();
1269 let dst_bits = alu.def.bit_size();
1270
1271 let mut prmt = [0_u8; 8];
1272 match alu.op {
1273 nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32
1274 | nir_op_i2i64 => {
1275 let sign = ((src_bits / 8) - 1) | 0x8;
1276 for i in 0..8 {
1277 if i < (src_bits / 8) {
1278 prmt[usize::from(i)] = i;
1279 } else {
1280 prmt[usize::from(i)] = sign;
1281 }
1282 }
1283 }
1284 nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32
1285 | nir_op_u2u64 => {
1286 for i in 0..8 {
1287 if i < (src_bits / 8) {
1288 prmt[usize::from(i)] = i;
1289 } else {
1290 prmt[usize::from(i)] = 4;
1291 }
1292 }
1293 }
1294 _ => panic!("Invalid integer conversion: {}", alu.op),
1295 }
1296 let prmt_lo: [u8; 4] = prmt[0..4].try_into().unwrap();
1297 let prmt_hi: [u8; 4] = prmt[4..8].try_into().unwrap();
1298
1299 let src = srcs[0].as_ssa().unwrap();
1300 if src_bits == 64 {
1301 if dst_bits == 64 {
1302 *src
1303 } else {
1304 b.prmt(src[0].into(), src[1].into(), prmt_lo)
1305 }
1306 } else {
1307 if dst_bits == 64 {
1308 let lo = b.prmt(src[0].into(), 0.into(), prmt_lo);
1309 let hi = b.prmt(src[0].into(), 0.into(), prmt_hi);
1310 [lo[0], hi[0]].into()
1311 } else {
1312 b.prmt(src[0].into(), 0.into(), prmt_lo)
1313 }
1314 }
1315 }
1316 nir_op_iabs => b.iabs(srcs[0]),
1317 nir_op_iadd => match alu.def.bit_size {
1318 32 => b.iadd(srcs[0], srcs[1], 0.into()),
1319 64 => b.iadd64(srcs[0], srcs[1], 0.into()),
1320 x => panic!("unsupported bit size for nir_op_iadd: {x}"),
1321 },
1322 nir_op_iadd3 => match alu.def.bit_size {
1323 32 => b.iadd(srcs[0], srcs[1], srcs[2]),
1324 64 => b.iadd64(srcs[0], srcs[1], srcs[2]),
1325 x => panic!("unsupported bit size for nir_op_iadd3: {x}"),
1326 },
1327 nir_op_iand => b.lop2(LogicOp2::And, srcs[0], srcs[1]),
1328 nir_op_ieq => {
1329 if alu.get_src(0).bit_size() == 1 {
1330 b.lop2(LogicOp2::Xor, srcs[0], srcs[1].bnot())
1331 } else if alu.get_src(0).bit_size() == 64 {
1332 b.isetp64(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
1333 } else {
1334 assert!(alu.get_src(0).bit_size() == 32);
1335 b.isetp(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
1336 }
1337 }
1338 nir_op_ifind_msb | nir_op_ifind_msb_rev | nir_op_ufind_msb
1339 | nir_op_ufind_msb_rev => {
1340 let dst = b.alloc_ssa(RegFile::GPR, 1);
1341 b.push_op(OpFlo {
1342 dst: dst.into(),
1343 src: srcs[0],
1344 signed: match alu.op {
1345 nir_op_ifind_msb | nir_op_ifind_msb_rev => true,
1346 nir_op_ufind_msb | nir_op_ufind_msb_rev => false,
1347 _ => panic!("Not a find_msb op"),
1348 },
1349 return_shift_amount: match alu.op {
1350 nir_op_ifind_msb | nir_op_ufind_msb => false,
1351 nir_op_ifind_msb_rev | nir_op_ufind_msb_rev => true,
1352 _ => panic!("Not a find_msb op"),
1353 },
1354 });
1355 dst
1356 }
1357 nir_op_ige | nir_op_ilt | nir_op_uge | nir_op_ult => {
1358 let x = *srcs[0].as_ssa().unwrap();
1359 let y = *srcs[1].as_ssa().unwrap();
1360 let (cmp_type, cmp_op) = match alu.op {
1361 nir_op_ige => (IntCmpType::I32, IntCmpOp::Ge),
1362 nir_op_ilt => (IntCmpType::I32, IntCmpOp::Lt),
1363 nir_op_uge => (IntCmpType::U32, IntCmpOp::Ge),
1364 nir_op_ult => (IntCmpType::U32, IntCmpOp::Lt),
1365 _ => panic!("Not an integer comparison"),
1366 };
1367 if alu.get_src(0).bit_size() == 64 {
1368 b.isetp64(cmp_type, cmp_op, x.into(), y.into())
1369 } else {
1370 assert!(alu.get_src(0).bit_size() == 32);
1371 b.isetp(cmp_type, cmp_op, x.into(), y.into())
1372 }
1373 }
1374 nir_op_imad => {
1375 assert!(alu.def.bit_size() == 32);
1376 let dst = b.alloc_ssa(RegFile::GPR, 1);
1377 b.push_op(OpIMad {
1378 dst: dst.into(),
1379 srcs: [srcs[0], srcs[1], srcs[2]],
1380 signed: false,
1381 });
1382 dst
1383 }
1384 nir_op_imax | nir_op_imin | nir_op_umax | nir_op_umin => {
1385 let (tp, min) = match alu.op {
1386 nir_op_imax => (IntCmpType::I32, SrcRef::False),
1387 nir_op_imin => (IntCmpType::I32, SrcRef::True),
1388 nir_op_umax => (IntCmpType::U32, SrcRef::False),
1389 nir_op_umin => (IntCmpType::U32, SrcRef::True),
1390 _ => panic!("Not an integer min/max"),
1391 };
1392 assert!(alu.def.bit_size() == 32);
1393 b.imnmx(tp, srcs[0], srcs[1], min.into())
1394 }
1395 nir_op_imul => {
1396 assert!(alu.def.bit_size() == 32);
1397 b.imul(srcs[0], srcs[1])
1398 }
1399 nir_op_imul_2x32_64 | nir_op_umul_2x32_64 => {
1400 let signed = alu.op == nir_op_imul_2x32_64;
1401 b.imul_2x32_64(srcs[0], srcs[1], signed)
1402 }
1403 nir_op_imul_high | nir_op_umul_high => {
1404 let signed = alu.op == nir_op_imul_high;
1405 let dst64 = b.imul_2x32_64(srcs[0], srcs[1], signed);
1406 dst64[1].into()
1407 }
1408 nir_op_ine => {
1409 if alu.get_src(0).bit_size() == 1 {
1410 b.lop2(LogicOp2::Xor, srcs[0], srcs[1])
1411 } else if alu.get_src(0).bit_size() == 64 {
1412 b.isetp64(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
1413 } else {
1414 assert!(alu.get_src(0).bit_size() == 32);
1415 b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
1416 }
1417 }
1418 nir_op_ineg => {
1419 if alu.def.bit_size == 64 {
1420 b.ineg64(srcs[0])
1421 } else {
1422 assert!(alu.def.bit_size() == 32);
1423 b.ineg(srcs[0])
1424 }
1425 }
1426 nir_op_inot => {
1427 if alu.def.bit_size() == 1 {
1428 b.lop2(LogicOp2::PassB, true.into(), srcs[0].bnot())
1429 } else {
1430 assert!(alu.def.bit_size() == 32);
1431 b.lop2(LogicOp2::PassB, 0.into(), srcs[0].bnot())
1432 }
1433 }
1434 nir_op_ior => b.lop2(LogicOp2::Or, srcs[0], srcs[1]),
1435 nir_op_ishl => {
1436 if alu.def.bit_size() == 64 {
1437 let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1438 (s as u32).into()
1439 } else {
1440 srcs[1]
1441 };
1442 b.shl64(srcs[0], shift)
1443 } else {
1444 assert!(alu.def.bit_size() == 32);
1445 b.shl(srcs[0], srcs[1])
1446 }
1447 }
1448 nir_op_ishr => {
1449 if alu.def.bit_size() == 64 {
1450 let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1451 (s as u32).into()
1452 } else {
1453 srcs[1]
1454 };
1455 b.shr64(srcs[0], shift, true)
1456 } else {
1457 assert!(alu.def.bit_size() == 32);
1458 b.shr(srcs[0], srcs[1], true)
1459 }
1460 }
1461 nir_op_isub => match alu.def.bit_size {
1462 32 => b.iadd(srcs[0], srcs[1].ineg(), 0.into()),
1463 64 => b.iadd64(srcs[0], srcs[1].ineg(), 0.into()),
1464 x => panic!("unsupported bit size for nir_op_isub: {x}"),
1465 },
1466 nir_op_ixor => b.lop2(LogicOp2::Xor, srcs[0], srcs[1]),
1467 nir_op_pack_half_2x16_split | nir_op_pack_half_2x16_rtz_split => {
1468 assert!(alu.get_src(0).bit_size() == 32);
1469
1470 let rnd_mode = match alu.op {
1471 nir_op_pack_half_2x16_split => FRndMode::NearestEven,
1472 nir_op_pack_half_2x16_rtz_split => FRndMode::Zero,
1473 _ => panic!("Unhandled fp16 pack op"),
1474 };
1475
1476 if self.sm.sm() >= 86 {
1477 let result: SSARef = b.alloc_ssa(RegFile::GPR, 1);
1478 b.push_op(OpF2FP {
1479 dst: result.into(),
1480 srcs: [srcs[1], srcs[0]],
1481 rnd_mode: rnd_mode,
1482 });
1483
1484 result
1485 } else {
1486 let low = b.alloc_ssa(RegFile::GPR, 1);
1487 let high = b.alloc_ssa(RegFile::GPR, 1);
1488
1489 b.push_op(OpF2F {
1490 dst: low.into(),
1491 src: srcs[0],
1492 src_type: FloatType::F32,
1493 dst_type: FloatType::F16,
1494 rnd_mode: rnd_mode,
1495 ftz: false,
1496 high: false,
1497 integer_rnd: false,
1498 });
1499
1500 let src_bits = usize::from(alu.get_src(1).bit_size());
1501 let src_type = FloatType::from_bits(src_bits);
1502 assert!(matches!(src_type, FloatType::F32));
1503 b.push_op(OpF2F {
1504 dst: high.into(),
1505 src: srcs[1],
1506 src_type: FloatType::F32,
1507 dst_type: FloatType::F16,
1508 rnd_mode: rnd_mode,
1509 ftz: false,
1510 high: false,
1511 integer_rnd: false,
1512 });
1513
1514 b.prmt(low.into(), high.into(), [0, 1, 4, 5])
1515 }
1516 }
1517 nir_op_prmt_nv => {
1518 let dst = b.alloc_ssa(RegFile::GPR, 1);
1519 b.push_op(OpPrmt {
1520 dst: dst.into(),
1521 srcs: [srcs[1], srcs[2]],
1522 sel: srcs[0],
1523 mode: PrmtMode::Index,
1524 });
1525 dst
1526 }
1527 nir_op_sdot_4x8_iadd => {
1528 let dst = b.alloc_ssa(RegFile::GPR, 1);
1529 b.push_op(OpIDp4 {
1530 dst: dst.into(),
1531 src_types: [IntType::I8, IntType::I8],
1532 srcs: [srcs[0], srcs[1], srcs[2]],
1533 });
1534 dst
1535 }
1536 nir_op_sudot_4x8_iadd => {
1537 let dst = b.alloc_ssa(RegFile::GPR, 1);
1538 b.push_op(OpIDp4 {
1539 dst: dst.into(),
1540 src_types: [IntType::I8, IntType::U8],
1541 srcs: [srcs[0], srcs[1], srcs[2]],
1542 });
1543 dst
1544 }
1545 nir_op_udot_4x8_uadd => {
1546 let dst = b.alloc_ssa(RegFile::GPR, 1);
1547 b.push_op(OpIDp4 {
1548 dst: dst.into(),
1549 src_types: [IntType::U8, IntType::U8],
1550 srcs: [srcs[0], srcs[1], srcs[2]],
1551 });
1552 dst
1553 }
1554 nir_op_u2f16 | nir_op_u2f32 | nir_op_u2f64 => {
1555 let src_bits = alu.get_src(0).src.bit_size();
1556 let dst_bits = alu.def.bit_size();
1557 let dst_type = FloatType::from_bits(dst_bits.into());
1558 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1559 b.push_op(OpI2F {
1560 dst: dst.into(),
1561 src: srcs[0],
1562 dst_type: dst_type,
1563 src_type: IntType::from_bits(src_bits.into(), false),
1564 rnd_mode: self.float_ctl[dst_type].rnd_mode,
1565 });
1566 dst
1567 }
1568 nir_op_uadd_sat => {
1569 let x = srcs[0].as_ssa().unwrap();
1570 let y = srcs[1].as_ssa().unwrap();
1571 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1572 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1573 b.push_op(OpIAdd3 {
1574 dst: sum_lo.into(),
1575 overflow: [ovf_lo.into(), Dst::None],
1576 srcs: [0.into(), x[0].into(), y[0].into()],
1577 });
1578 if alu.def.bit_size() == 64 {
1579 let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1580 let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1581 b.push_op(OpIAdd3X {
1582 dst: sum_hi.into(),
1583 overflow: [ovf_hi.into(), Dst::None],
1584 srcs: [0.into(), x[1].into(), y[1].into()],
1585 carry: [ovf_lo.into(), false.into()],
1586 });
1587 let lo =
1588 b.sel(ovf_hi.into(), u32::MAX.into(), sum_lo.into());
1589 let hi =
1590 b.sel(ovf_hi.into(), u32::MAX.into(), sum_hi.into());
1591 [lo[0], hi[0]].into()
1592 } else {
1593 assert!(alu.def.bit_size() == 32);
1594 b.sel(ovf_lo.into(), u32::MAX.into(), sum_lo.into())
1595 }
1596 }
1597 nir_op_usub_sat => {
1598 let x = srcs[0].as_ssa().unwrap();
1599 let y = srcs[1].as_ssa().unwrap();
1600 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1601 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1602 // The result of OpIAdd3X is the 33-bit value
1603 //
1604 // s|o = x + !y + 1
1605 //
1606 // The overflow bit of this result is true if and only if the
1607 // subtract did NOT overflow.
1608 b.push_op(OpIAdd3 {
1609 dst: sum_lo.into(),
1610 overflow: [ovf_lo.into(), Dst::None],
1611 srcs: [0.into(), x[0].into(), Src::from(y[0]).ineg()],
1612 });
1613 if alu.def.bit_size() == 64 {
1614 let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1615 let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1616 b.push_op(OpIAdd3X {
1617 dst: sum_hi.into(),
1618 overflow: [ovf_hi.into(), Dst::None],
1619 srcs: [0.into(), x[1].into(), Src::from(y[1]).bnot()],
1620 carry: [ovf_lo.into(), false.into()],
1621 });
1622 let lo = b.sel(ovf_hi.into(), sum_lo.into(), 0.into());
1623 let hi = b.sel(ovf_hi.into(), sum_hi.into(), 0.into());
1624 [lo[0], hi[0]].into()
1625 } else {
1626 assert!(alu.def.bit_size() == 32);
1627 b.sel(ovf_lo.into(), sum_lo.into(), 0.into())
1628 }
1629 }
1630 nir_op_unpack_32_2x16_split_x => {
1631 b.prmt(srcs[0], 0.into(), [0, 1, 4, 4])
1632 }
1633 nir_op_unpack_32_2x16_split_y => {
1634 b.prmt(srcs[0], 0.into(), [2, 3, 4, 4])
1635 }
1636 nir_op_unpack_64_2x32_split_x => {
1637 let src0_x = srcs[0].as_ssa().unwrap()[0];
1638 b.copy(src0_x.into())
1639 }
1640 nir_op_unpack_64_2x32_split_y => {
1641 let src0_y = srcs[0].as_ssa().unwrap()[1];
1642 b.copy(src0_y.into())
1643 }
1644 nir_op_unpack_half_2x16_split_x
1645 | nir_op_unpack_half_2x16_split_y => {
1646 assert!(alu.def.bit_size() == 32);
1647 let dst = b.alloc_ssa(RegFile::GPR, 1);
1648
1649 b.push_op(OpF2F {
1650 dst: dst[0].into(),
1651 src: srcs[0],
1652 src_type: FloatType::F16,
1653 dst_type: FloatType::F32,
1654 rnd_mode: FRndMode::NearestEven,
1655 ftz: false,
1656 high: alu.op == nir_op_unpack_half_2x16_split_y,
1657 integer_rnd: false,
1658 });
1659
1660 dst
1661 }
1662 nir_op_ushr => {
1663 if alu.def.bit_size() == 64 {
1664 let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1665 (s as u32).into()
1666 } else {
1667 srcs[1]
1668 };
1669 b.shr64(srcs[0], shift, false)
1670 } else {
1671 assert!(alu.def.bit_size() == 32);
1672 b.shr(srcs[0], srcs[1], false)
1673 }
1674 }
1675 _ => panic!("Unsupported ALU instruction: {}", alu.info().name()),
1676 };
1677 self.set_dst(&alu.def, dst);
1678 }
1679
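// Translates a nir_tex_instr into the matching NAK texture op (Tex, Tld, Tld4,
// Txd, Tmml or Txq).  The texture arguments arrive pre-packed in the
// backend1/backend2 sources, the result is split across two destinations on
// Volta+, and a predicate destination carries the sparse-residency bit when
// requested.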
1680 fn parse_tex(&mut self, b: &mut impl SSABuilder, tex: &nir_tex_instr) {
1681 let dim = match tex.sampler_dim {
1682 GLSL_SAMPLER_DIM_1D => {
1683 if tex.is_array {
1684 TexDim::Array1D
1685 } else {
1686 TexDim::_1D
1687 }
1688 }
1689 GLSL_SAMPLER_DIM_2D => {
1690 if tex.is_array {
1691 TexDim::Array2D
1692 } else {
1693 TexDim::_2D
1694 }
1695 }
1696 GLSL_SAMPLER_DIM_3D => {
1697 assert!(!tex.is_array);
1698 TexDim::_3D
1699 }
1700 GLSL_SAMPLER_DIM_CUBE => {
1701 if tex.is_array {
1702 TexDim::ArrayCube
1703 } else {
1704 TexDim::Cube
1705 }
1706 }
1707 GLSL_SAMPLER_DIM_BUF => TexDim::_1D,
1708 GLSL_SAMPLER_DIM_MS => {
1709 if tex.is_array {
1710 TexDim::Array2D
1711 } else {
1712 TexDim::_2D
1713 }
1714 }
1715 _ => panic!("Unsupported texture dimension: {}", tex.sampler_dim),
1716 };
1717
1718 let srcs = tex.srcs_as_slice();
1719 assert!(srcs[0].src_type == nir_tex_src_backend1);
1720 if srcs.len() > 1 {
1721 assert!(srcs.len() == 2);
1722 assert!(srcs[1].src_type == nir_tex_src_backend2);
1723 }
1724
1725 let flags: nak_nir_tex_flags =
1726 unsafe { std::mem::transmute_copy(&tex.backend_flags) };
1727
1728 let mask = tex.def.components_read();
1729 let mut mask = u8::try_from(mask).unwrap();
1730 if flags.is_sparse() {
1731 mask &= !(1 << (tex.def.num_components - 1));
1732 }
1733
1734 let dst_comps = u8::try_from(mask.count_ones()).unwrap();
1735 let dst = b.alloc_ssa(RegFile::GPR, dst_comps);
1736
1737 // On Volta and later, a destination with more than two components is split in two
1738 let mut dsts = [Dst::None; 2];
1739 if dst_comps > 2 && b.sm() >= 70 {
1740 dsts[0] = SSARef::try_from(&dst[0..2]).unwrap().into();
1741 dsts[1] = SSARef::try_from(&dst[2..]).unwrap().into();
1742 } else {
1743 dsts[0] = dst.into();
1744 }
1745
1746 let fault = if flags.is_sparse() {
1747 b.alloc_ssa(RegFile::Pred, 1).into()
1748 } else {
1749 Dst::None
1750 };
1751
1752 if tex.op == nir_texop_hdr_dim_nv {
1753 let src = self.get_src(&srcs[0].src);
1754 assert!(fault.is_none());
1755 b.push_op(OpTxq {
1756 dsts: dsts,
1757 src: src,
1758 query: TexQuery::Dimension,
1759 mask: mask,
1760 });
1761 } else if tex.op == nir_texop_tex_type_nv {
1762 let src = self.get_src(&srcs[0].src);
1763 assert!(fault.is_none());
1764 b.push_op(OpTxq {
1765 dsts: dsts,
1766 src: src,
1767 query: TexQuery::TextureType,
1768 mask: mask,
1769 });
1770 } else {
1771 let lod_mode = match flags.lod_mode() {
1772 NAK_NIR_LOD_MODE_AUTO => TexLodMode::Auto,
1773 NAK_NIR_LOD_MODE_ZERO => TexLodMode::Zero,
1774 NAK_NIR_LOD_MODE_BIAS => TexLodMode::Bias,
1775 NAK_NIR_LOD_MODE_LOD => TexLodMode::Lod,
1776 NAK_NIR_LOD_MODE_CLAMP => TexLodMode::Clamp,
1777 NAK_NIR_LOD_MODE_BIAS_CLAMP => TexLodMode::BiasClamp,
1778 _ => panic!("Invalid LOD mode"),
1779 };
1780
1781 let offset_mode = match flags.offset_mode() {
1782 NAK_NIR_OFFSET_MODE_NONE => Tld4OffsetMode::None,
1783 NAK_NIR_OFFSET_MODE_AOFFI => Tld4OffsetMode::AddOffI,
1784 NAK_NIR_OFFSET_MODE_PER_PX => Tld4OffsetMode::PerPx,
1785 _ => panic!("Invalid offset mode"),
1786 };
1787
1788 let srcs = [self.get_src(&srcs[0].src), self.get_src(&srcs[1].src)];
1789
1790 if tex.op == nir_texop_txd {
1791 assert!(lod_mode == TexLodMode::Auto);
1792 assert!(offset_mode != Tld4OffsetMode::PerPx);
1793 assert!(!flags.has_z_cmpr());
1794 b.push_op(OpTxd {
1795 dsts: dsts,
1796 fault,
1797 srcs: srcs,
1798 dim: dim,
1799 offset: offset_mode == Tld4OffsetMode::AddOffI,
1800 mask: mask,
1801 });
1802 } else if tex.op == nir_texop_lod {
1803 assert!(offset_mode == Tld4OffsetMode::None);
1804 b.push_op(OpTmml {
1805 dsts: dsts,
1806 srcs: srcs,
1807 dim: dim,
1808 mask: mask,
1809 });
1810 } else if tex.op == nir_texop_txf || tex.op == nir_texop_txf_ms {
1811 assert!(offset_mode != Tld4OffsetMode::PerPx);
1812 b.push_op(OpTld {
1813 dsts: dsts,
1814 fault,
1815 srcs: srcs,
1816 dim: dim,
1817 lod_mode: lod_mode,
1818 is_ms: tex.op == nir_texop_txf_ms,
1819 offset: offset_mode == Tld4OffsetMode::AddOffI,
1820 mask: mask,
1821 });
1822 } else if tex.op == nir_texop_tg4 {
1823 b.push_op(OpTld4 {
1824 dsts: dsts,
1825 fault,
1826 srcs: srcs,
1827 dim: dim,
1828 comp: tex.component().try_into().unwrap(),
1829 offset_mode: offset_mode,
1830 z_cmpr: flags.has_z_cmpr(),
1831 mask: mask,
1832 });
1833 } else {
1834 assert!(offset_mode != Tld4OffsetMode::PerPx);
1835 b.push_op(OpTex {
1836 dsts: dsts,
1837 fault,
1838 srcs: srcs,
1839 dim: dim,
1840 lod_mode: lod_mode,
1841 z_cmpr: flags.has_z_cmpr(),
1842 offset: offset_mode == Tld4OffsetMode::AddOffI,
1843 mask: mask,
1844 });
1845 }
1846 }
1847
1848 let mut di = 0_usize;
1849 let mut nir_dst = Vec::new();
1850 for i in 0..tex.def.num_components() {
1851 if flags.is_sparse() && i == tex.def.num_components - 1 {
1852 let Dst::SSA(fault) = fault else {
1853 panic!("No fault value for sparse op");
1854 };
1855 nir_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
1856 } else if mask & (1 << i) == 0 {
1857 nir_dst.push(b.copy(0.into())[0]);
1858 } else {
1859 nir_dst.push(dst[di]);
1860 di += 1;
1861 }
1862 }
1863 self.set_ssa(tex.def.as_def(), nir_dst);
1864 }
1865
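// Maps a NIR atomic op onto the NAK atomic operand type (signed, unsigned or
// float) at the intrinsic's destination bit size.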
1866 fn get_atomic_type(&self, intrin: &nir_intrinsic_instr) -> AtomType {
1867 let bit_size = intrin.def.bit_size();
1868 match intrin.atomic_op() {
1869 nir_atomic_op_iadd => AtomType::U(bit_size),
1870 nir_atomic_op_imin => AtomType::I(bit_size),
1871 nir_atomic_op_umin => AtomType::U(bit_size),
1872 nir_atomic_op_imax => AtomType::I(bit_size),
1873 nir_atomic_op_umax => AtomType::U(bit_size),
1874 nir_atomic_op_iand => AtomType::U(bit_size),
1875 nir_atomic_op_ior => AtomType::U(bit_size),
1876 nir_atomic_op_ixor => AtomType::U(bit_size),
1877 nir_atomic_op_xchg => AtomType::U(bit_size),
1878 nir_atomic_op_fadd => AtomType::F(bit_size),
1879 nir_atomic_op_fmin => AtomType::F(bit_size),
1880 nir_atomic_op_fmax => AtomType::F(bit_size),
1881 nir_atomic_op_cmpxchg => AtomType::U(bit_size),
1882 _ => panic!("Unsupported NIR atomic op"),
1883 }
1884 }
1885
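// Maps a NIR atomic op onto the NAK AtomOp.  cmp_src tells compare-and-swap
// where its compare value comes from and is ignored by every other op.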
1886 fn get_atomic_op(
1887 &self,
1888 intrin: &nir_intrinsic_instr,
1889 cmp_src: AtomCmpSrc,
1890 ) -> AtomOp {
1891 match intrin.atomic_op() {
1892 nir_atomic_op_iadd => AtomOp::Add,
1893 nir_atomic_op_imin => AtomOp::Min,
1894 nir_atomic_op_umin => AtomOp::Min,
1895 nir_atomic_op_imax => AtomOp::Max,
1896 nir_atomic_op_umax => AtomOp::Max,
1897 nir_atomic_op_iand => AtomOp::And,
1898 nir_atomic_op_ior => AtomOp::Or,
1899 nir_atomic_op_ixor => AtomOp::Xor,
1900 nir_atomic_op_xchg => AtomOp::Exch,
1901 nir_atomic_op_fadd => AtomOp::Add,
1902 nir_atomic_op_fmin => AtomOp::Min,
1903 nir_atomic_op_fmax => AtomOp::Max,
1904 nir_atomic_op_cmpxchg => AtomOp::CmpExch(cmp_src),
1905 _ => panic!("Unsupported NIR atomic op"),
1906 }
1907 }
1908
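// ACCESS_NON_TEMPORAL maps to the "evict first" cache priority on SM70 and
// later; older hardware only gets the normal priority here.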
1909 fn get_eviction_priority(
1910 &mut self,
1911 access: gl_access_qualifier,
1912 ) -> MemEvictionPriority {
1913 if self.sm.sm() >= 70 && access & ACCESS_NON_TEMPORAL != 0 {
1914 MemEvictionPriority::First
1915 } else {
1916 MemEvictionPriority::Normal
1917 }
1918 }
1919
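// Translates the NIR image dimension to the NAK one.  Cube images are handled
// as 2D arrays and buffer images use the dedicated 1D-buffer dimension.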
1920 fn get_image_dim(&mut self, intrin: &nir_intrinsic_instr) -> ImageDim {
1921 let is_array = intrin.image_array();
1922 let image_dim = intrin.image_dim();
1923         match image_dim {
1924 GLSL_SAMPLER_DIM_1D => {
1925 if is_array {
1926 ImageDim::_1DArray
1927 } else {
1928 ImageDim::_1D
1929 }
1930 }
1931 GLSL_SAMPLER_DIM_2D => {
1932 if is_array {
1933 ImageDim::_2DArray
1934 } else {
1935 ImageDim::_2D
1936 }
1937 }
1938 GLSL_SAMPLER_DIM_3D => {
1939 assert!(!is_array);
1940 ImageDim::_3D
1941 }
1942 GLSL_SAMPLER_DIM_CUBE => ImageDim::_2DArray,
1943 GLSL_SAMPLER_DIM_BUF => {
1944 assert!(!is_array);
1945 ImageDim::_1DBuffer
1946 }
1947 _ => panic!("Unsupported image dimension: {}", image_dim),
1948 }
1949 }
1950
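    // Only take as many coordinate components as the hardware dimension
    // needs.  Note that cube images were already lowered to 2D arrays by
    // get_image_dim().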
1951     fn get_image_coord(
1952 &mut self,
1953 intrin: &nir_intrinsic_instr,
1954 dim: ImageDim,
1955 ) -> Src {
1956 let vec = self.get_ssa(intrin.get_src(1).as_def());
1957 // let sample = self.get_src(&srcs[2]);
1958 let comps = usize::from(dim.coord_comps());
1959 SSARef::try_from(&vec[0..comps]).unwrap().into()
1960 }
1961
1962     fn parse_intrinsic(
1963 &mut self,
1964 b: &mut impl SSABuilder,
1965 intrin: &nir_intrinsic_instr,
1966 ) {
1967 let srcs = intrin.srcs_as_slice();
1968 match intrin.intrinsic {
1969 nir_intrinsic_al2p_nv => {
1970 let offset = self.get_src(&srcs[0]);
1971 let addr = u16::try_from(intrin.base()).unwrap();
1972
1973 let flags = intrin.flags();
1974 let flags: nak_nir_attr_io_flags =
1975 unsafe { std::mem::transmute_copy(&flags) };
1976
1977 let access = AttrAccess {
1978 addr: addr,
1979 comps: 1,
1980 patch: flags.patch(),
1981 output: flags.output(),
1982 phys: false,
1983 };
1984
1985 let dst = b.alloc_ssa(RegFile::GPR, 1);
1986 b.push_op(OpAL2P {
1987 dst: dst.into(),
1988 offset: offset,
1989 access: access,
1990 });
1991 self.set_dst(&intrin.def, dst);
1992 }
1993 nir_intrinsic_ald_nv | nir_intrinsic_ast_nv => {
1994 let addr = u16::try_from(intrin.base()).unwrap();
1995 let base = u16::try_from(intrin.range_base()).unwrap();
1996 let range = u16::try_from(intrin.range()).unwrap();
1997 let range = base..(base + range);
1998
1999 let flags = intrin.flags();
2000 let flags: nak_nir_attr_io_flags =
2001 unsafe { std::mem::transmute_copy(&flags) };
2002 assert!(!flags.patch() || !flags.phys());
2003
2004 if let ShaderIoInfo::Vtg(io) = &mut self.info.io {
2005 if flags.patch() {
2006 match &mut self.info.stage {
2007 ShaderStageInfo::TessellationInit(stage) => {
2008 assert!(flags.output());
2009 stage.per_patch_attribute_count = max(
2010 stage.per_patch_attribute_count,
2011 (range.end / 4).try_into().unwrap(),
2012 );
2013 }
2014 ShaderStageInfo::Tessellation(_) => (),
2015 _ => panic!("Patch I/O not supported"),
2016 }
2017 } else {
2018 if flags.output() {
2019 if intrin.intrinsic == nir_intrinsic_ast_nv {
2020 io.mark_store_req(range.clone());
2021 }
2022 io.mark_attrs_written(range);
2023 } else {
2024 io.mark_attrs_read(range);
2025 }
2026 }
2027 } else {
2028 panic!("Must be a VTG stage");
2029 }
2030
2031 let access = AttrAccess {
2032 addr: addr,
2033 comps: intrin.num_components,
2034 patch: flags.patch(),
2035 output: flags.output(),
2036 phys: flags.phys(),
2037 };
2038
2039 if intrin.intrinsic == nir_intrinsic_ald_nv {
2040 let vtx = self.get_src(&srcs[0]);
2041 let offset = self.get_src(&srcs[1]);
2042
2043 assert!(intrin.def.bit_size() == 32);
2044 let dst = b.alloc_ssa(RegFile::GPR, access.comps);
2045 b.push_op(OpALd {
2046 dst: dst.into(),
2047 vtx: vtx,
2048 offset: offset,
2049 access: access,
2050 });
2051 self.set_dst(&intrin.def, dst);
2052 } else if intrin.intrinsic == nir_intrinsic_ast_nv {
2053 assert!(srcs[0].bit_size() == 32);
2054 let data = self.get_src(&srcs[0]);
2055 let vtx = self.get_src(&srcs[1]);
2056 let offset = self.get_src(&srcs[2]);
2057
2058 b.push_op(OpASt {
2059 data: data,
2060 vtx: vtx,
2061 offset: offset,
2062 access: access,
2063 });
2064 } else {
2065 panic!("Invalid VTG I/O intrinsic");
2066 }
2067 }
2068 nir_intrinsic_as_uniform => {
2069 let src = self.get_ssa(srcs[0].as_def());
2070 let mut dst = Vec::new();
2071 for comp in src {
2072 let u = b.alloc_ssa(RegFile::UGPR, 1);
2073 b.push_op(OpR2UR {
2074 src: [*comp].into(),
2075 dst: u.into(),
2076 });
2077 dst.push(u[0]);
2078 }
2079 self.set_ssa(&intrin.def, dst);
2080 }
2081 nir_intrinsic_ddx
2082 | nir_intrinsic_ddx_coarse
2083 | nir_intrinsic_ddx_fine => {
2084 // TODO: Real coarse derivatives
2085
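                // SHFL.BFLY with a lane mask of 1 exchanges the value between
                // the two horizontally adjacent pixels of each quad, then
                // FSWZADD computes the signed left/right difference so every
                // lane ends up with d/dx.  The 0x1c03 constant appears to
                // confine the shuffle to the 4-lane quad (clamp = 3, segment
                // mask = 0x1c).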
2086 assert!(intrin.def.bit_size() == 32);
2087 let ftype = FloatType::F32;
2088 let scratch = b.alloc_ssa(RegFile::GPR, 1);
2089
2090 b.push_op(OpShfl {
2091 dst: scratch[0].into(),
2092 in_bounds: Dst::None,
2093 src: self.get_src(&srcs[0]),
2094 lane: 1_u32.into(),
2095 c: (0x3_u32 | 0x1c_u32 << 8).into(),
2096 op: ShflOp::Bfly,
2097 });
2098
2099 let dst = b.alloc_ssa(RegFile::GPR, 1);
2100
2101 b.push_op(OpFSwzAdd {
2102 dst: dst[0].into(),
2103 srcs: [scratch[0].into(), self.get_src(&srcs[0])],
2104 ops: [
2105 FSwzAddOp::SubLeft,
2106 FSwzAddOp::SubRight,
2107 FSwzAddOp::SubLeft,
2108 FSwzAddOp::SubRight,
2109 ],
2110 rnd_mode: self.float_ctl[ftype].rnd_mode,
2111 ftz: self.float_ctl[ftype].ftz,
2112 });
2113
2114 self.set_dst(&intrin.def, dst);
2115 }
2116 nir_intrinsic_ddy
2117 | nir_intrinsic_ddy_coarse
2118 | nir_intrinsic_ddy_fine => {
2119 // TODO: Real coarse derivatives
2120
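                // Same trick as ddx above, but a butterfly lane mask of 2
                // swaps vertically within the quad and the swizzle-add
                // pattern flips per row to produce d/dy.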
2121 assert!(intrin.def.bit_size() == 32);
2122 let ftype = FloatType::F32;
2123 let scratch = b.alloc_ssa(RegFile::GPR, 1);
2124
2125 b.push_op(OpShfl {
2126 dst: scratch[0].into(),
2127 in_bounds: Dst::None,
2128 src: self.get_src(&srcs[0]),
2129 lane: 2_u32.into(),
2130 c: (0x3_u32 | 0x1c_u32 << 8).into(),
2131 op: ShflOp::Bfly,
2132 });
2133
2134 let dst = b.alloc_ssa(RegFile::GPR, 1);
2135
2136 b.push_op(OpFSwzAdd {
2137 dst: dst[0].into(),
2138 srcs: [scratch[0].into(), self.get_src(&srcs[0])],
2139 ops: [
2140 FSwzAddOp::SubLeft,
2141 FSwzAddOp::SubLeft,
2142 FSwzAddOp::SubRight,
2143 FSwzAddOp::SubRight,
2144 ],
2145 rnd_mode: self.float_ctl[ftype].rnd_mode,
2146 ftz: self.float_ctl[ftype].ftz,
2147 });
2148
2149 self.set_dst(&intrin.def, dst);
2150 }
2151 nir_intrinsic_ballot => {
2152 assert!(srcs[0].bit_size() == 1);
2153 let src = self.get_src(&srcs[0]);
2154
2155 assert!(intrin.def.bit_size() == 32);
2156 let dst = b.alloc_ssa(RegFile::GPR, 1);
2157
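                // ballot maps onto VOTE.ANY: the per-lane predicate is
                // gathered into a 32-bit ballot mask and the boolean vote
                // result itself is discarded.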
2158 b.push_op(OpVote {
2159 op: VoteOp::Any,
2160 ballot: dst.into(),
2161 vote: Dst::None,
2162 pred: src,
2163 });
2164 self.set_dst(&intrin.def, dst);
2165 }
2166 nir_intrinsic_bar_break_nv => {
2167 let src = self.get_src(&srcs[0]);
2168 let bar_in = b.bmov_to_bar(src);
2169 let cond = self.get_src(&srcs[1]);
2170
2171 let bar_out = b.alloc_ssa(RegFile::Bar, 1);
2172 b.push_op(OpBreak {
2173 bar_out: bar_out.into(),
2174 bar_in: bar_in.into(),
2175 cond: cond.into(),
2176 });
2177
2178 self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
2179 }
2180 nir_intrinsic_bar_set_nv => {
2181 let label = self.label_alloc.alloc();
2182 let old = self.bar_label.insert(intrin.def.index, label);
2183 assert!(old.is_none());
2184
2185 let bar_clear = b.alloc_ssa(RegFile::Bar, 1);
2186 b.push_op(OpBClear {
2187 dst: bar_clear.into(),
2188 });
2189
2190 let bar_out = b.alloc_ssa(RegFile::Bar, 1);
2191 b.push_op(OpBSSy {
2192 bar_out: bar_out.into(),
2193 bar_in: bar_clear.into(),
2194 cond: SrcRef::True.into(),
2195 target: label,
2196 });
2197
2198 self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
2199 }
2200 nir_intrinsic_bar_sync_nv => {
2201 let src = self.get_src(&srcs[0]);
2202
2203 let bar = b.bmov_to_bar(src);
2204 b.push_op(OpBSync {
2205 bar: bar.into(),
2206 cond: SrcRef::True.into(),
2207 });
2208
2209 let bar_set_idx = &srcs[1].as_def().index;
2210 if let Some(label) = self.bar_label.get(bar_set_idx) {
2211 b.push_op(OpNop {
2212 label: Some(*label),
2213 });
2214 }
2215 }
2216 nir_intrinsic_bindless_image_atomic
2217 | nir_intrinsic_bindless_image_atomic_swap => {
2218 let handle = self.get_src(&srcs[0]);
2219 let dim = self.get_image_dim(intrin);
2220 let coord = self.get_image_coord(intrin, dim);
2221 // let sample = self.get_src(&srcs[2]);
2222 let atom_type = self.get_atomic_type(intrin);
2223 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Packed);
2224
2225 assert!(
2226 intrin.def.bit_size() == 32 || intrin.def.bit_size() == 64
2227 );
2228 assert!(intrin.def.num_components() == 1);
2229 let dst = b.alloc_ssa(RegFile::GPR, intrin.def.bit_size() / 32);
2230
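                // For compare-and-swap, the compare and swap values are
                // packed into a single interleaved source (hence
                // AtomCmpSrc::Packed above).  Plain atomics just take the
                // data source directly.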
2231 let data = if intrin.intrinsic
2232 == nir_intrinsic_bindless_image_atomic_swap
2233 {
2234 if intrin.def.bit_size() == 64 {
2235 SSARef::from([
2236 self.get_ssa(srcs[3].as_def())[0],
2237 self.get_ssa(srcs[3].as_def())[1],
2238 self.get_ssa(srcs[4].as_def())[0],
2239 self.get_ssa(srcs[4].as_def())[1],
2240 ])
2241 .into()
2242 } else {
2243 SSARef::from([
2244 self.get_ssa(srcs[3].as_def())[0],
2245 self.get_ssa(srcs[4].as_def())[0],
2246 ])
2247 .into()
2248 }
2249 } else {
2250 self.get_src(&srcs[3])
2251 };
2252
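                // If the shader never reads the result, the atomic behaves as
                // a reduction; on SM70+ we drop the destination in that case.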
2253 let is_reduction =
2254 atom_op.is_reduction() && intrin.def.components_read() == 0;
2255
2256 b.push_op(OpSuAtom {
2257 dst: if self.sm.sm() >= 70 && is_reduction {
2258 Dst::None
2259 } else {
2260 dst.into()
2261 },
2262 fault: Dst::None,
2263 handle: handle,
2264 coord: coord,
2265 data: data,
2266 atom_op: atom_op,
2267 atom_type: atom_type,
2268 image_dim: dim,
2269 mem_order: MemOrder::Strong(MemScope::System),
2270 mem_eviction_priority: self
2271 .get_eviction_priority(intrin.access()),
2272 });
2273 self.set_dst(&intrin.def, dst);
2274 }
2275 nir_intrinsic_bindless_image_load => {
2276 let handle = self.get_src(&srcs[0]);
2277 let dim = self.get_image_dim(intrin);
2278 let coord = self.get_image_coord(intrin, dim);
2279 // let sample = self.get_src(&srcs[2]);
2280
2281 let comps = intrin.num_components;
2282 assert!(intrin.def.bit_size() == 32);
2283 assert!(comps == 1 || comps == 2 || comps == 4);
2284
2285 let dst = b.alloc_ssa(RegFile::GPR, comps);
2286
2287 b.push_op(OpSuLd {
2288 dst: dst.into(),
2289 fault: Dst::None,
2290 image_dim: dim,
2291 mem_order: MemOrder::Strong(MemScope::System),
2292 mem_eviction_priority: self
2293 .get_eviction_priority(intrin.access()),
2294 mask: (1 << comps) - 1,
2295 handle: handle,
2296 coord: coord,
2297 });
2298 self.set_dst(&intrin.def, dst);
2299 }
2300 nir_intrinsic_bindless_image_sparse_load => {
2301 let handle = self.get_src(&srcs[0]);
2302 let dim = self.get_image_dim(intrin);
2303 let coord = self.get_image_coord(intrin, dim);
2304 // let sample = self.get_src(&srcs[2]);
2305
2306 let comps = intrin.num_components;
2307 assert!(intrin.def.bit_size() == 32);
2308 assert!(comps == 5);
2309
2310 let dst = b.alloc_ssa(RegFile::GPR, comps - 1);
2311 let fault = b.alloc_ssa(RegFile::Pred, 1);
2312
2313 b.push_op(OpSuLd {
2314 dst: dst.into(),
2315 fault: fault.into(),
2316 image_dim: dim,
2317 mem_order: MemOrder::Strong(MemScope::System),
2318 mem_eviction_priority: self
2319 .get_eviction_priority(intrin.access()),
2320 mask: (1 << (comps - 1)) - 1,
2321 handle: handle,
2322 coord: coord,
2323 });
2324
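                // Convert the fault predicate into the 0/1 texel-residency
                // value NIR expects as the trailing component (0 if the load
                // faulted).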
2325 let mut final_dst = Vec::new();
2326 for i in 0..usize::from(comps) - 1 {
2327 final_dst.push(dst[i]);
2328 }
2329 final_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
2330
2331 self.set_ssa(&intrin.def, final_dst);
2332 }
2333 nir_intrinsic_bindless_image_store => {
2334 let handle = self.get_src(&srcs[0]);
2335 let dim = self.get_image_dim(intrin);
2336 let coord = self.get_image_coord(intrin, dim);
2337 // let sample = self.get_src(&srcs[2]);
2338 let data = self.get_src(&srcs[3]);
2339
2340 let comps = intrin.num_components;
2341 assert!(srcs[3].bit_size() == 32);
2342 assert!(comps == 1 || comps == 2 || comps == 4);
2343
2344 b.push_op(OpSuSt {
2345 image_dim: dim,
2346 mem_order: MemOrder::Strong(MemScope::System),
2347 mem_eviction_priority: self
2348 .get_eviction_priority(intrin.access()),
2349 mask: (1 << comps) - 1,
2350 handle: handle,
2351 coord: coord,
2352 data: data,
2353 });
2354 }
2355 nir_intrinsic_copy_fs_outputs_nv => {
2356 let ShaderIoInfo::Fragment(info) = &mut self.info.io else {
2357 panic!(
2358 "copy_fs_outputs_nv is only allowed in fragment shaders"
2359 );
2360 };
2361
2362 for i in 0..32 {
2363 if !self.fs_out_regs[i].is_none() {
2364 info.writes_color |= 1 << i;
2365 }
2366 }
2367 let mask_idx = (NAK_FS_OUT_SAMPLE_MASK / 4) as usize;
2368 info.writes_sample_mask = !self.fs_out_regs[mask_idx].is_none();
2369 let depth_idx = (NAK_FS_OUT_DEPTH / 4) as usize;
2370 info.writes_depth = !self.fs_out_regs[depth_idx].is_none();
2371
2372 let mut srcs = Vec::new();
2373 for i in 0..8 {
2374                     // Even though the mask is per-component, the output
2375                     // space is allocated as one whole vec4 per color output.
2376 if info.writes_color & (0xf << (i * 4)) != 0 {
2377 for c in 0..4 {
2378 let reg = self.fs_out_regs[i * 4 + c];
2379 if reg.is_none() {
2380 srcs.push(b.undef().into());
2381 } else {
2382 srcs.push(reg.into());
2383 }
2384 }
2385 }
2386 }
2387
2388 // These always come together for some reason
2389 if info.writes_sample_mask || info.writes_depth {
2390 if info.writes_sample_mask {
2391 srcs.push(self.fs_out_regs[mask_idx].into());
2392 } else {
2393 srcs.push(b.undef().into());
2394 }
2395 if info.writes_depth {
2396 srcs.push(self.fs_out_regs[depth_idx].into());
2397 }
2398 }
2399
2400 b.push_op(OpRegOut { srcs: srcs });
2401 }
2402 nir_intrinsic_demote => {
2403 if let ShaderStageInfo::Fragment(info) = &mut self.info.stage {
2404 info.uses_kill = true;
2405 } else {
2406 panic!("OpKill is only available in fragment shaders");
2407 }
2408 b.push_op(OpKill {});
2409 }
2410 nir_intrinsic_demote_if => {
2411 if let ShaderStageInfo::Fragment(info) = &mut self.info.stage {
2412 info.uses_kill = true;
2413 } else {
2414 panic!("OpKill is only available in fragment shaders");
2415 }
2416 let cond = self.get_ssa(srcs[0].as_def())[0];
2417 b.predicate(cond.into()).push_op(OpKill {});
2418 }
2419 nir_intrinsic_global_atomic => {
2420 let bit_size = intrin.def.bit_size();
2421 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2422 let data = self.get_src(&srcs[1]);
2423 let atom_type = self.get_atomic_type(intrin);
2424 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
2425
2426 assert!(intrin.def.num_components() == 1);
2427 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2428
2429 let is_reduction =
2430 atom_op.is_reduction() && intrin.def.components_read() == 0;
2431
2432 b.push_op(OpAtom {
2433 dst: if is_reduction { Dst::None } else { dst.into() },
2434 addr: addr,
2435 cmpr: 0.into(),
2436 data: data,
2437 atom_op: atom_op,
2438 atom_type: atom_type,
2439 addr_offset: offset,
2440 mem_space: MemSpace::Global(MemAddrType::A64),
2441 mem_order: MemOrder::Strong(MemScope::System),
2442                     mem_eviction_priority: MemEvictionPriority::Normal, // Note: the intrinsic has no access qualifier
2443 });
2444 self.set_dst(&intrin.def, dst);
2445 }
2446 nir_intrinsic_global_atomic_swap => {
2447 assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
2448 let bit_size = intrin.def.bit_size();
2449 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2450 let cmpr = self.get_src(&srcs[1]);
2451 let data = self.get_src(&srcs[2]);
2452 let atom_type = AtomType::U(bit_size);
2453
2454 assert!(intrin.def.num_components() == 1);
2455 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2456
2457 b.push_op(OpAtom {
2458 dst: dst.into(),
2459 addr: addr,
2460 cmpr: cmpr,
2461 data: data,
2462 atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate),
2463 atom_type: atom_type,
2464 addr_offset: offset,
2465 mem_space: MemSpace::Global(MemAddrType::A64),
2466 mem_order: MemOrder::Strong(MemScope::System),
2467                     mem_eviction_priority: MemEvictionPriority::Normal, // Note: the intrinsic has no access qualifier
2468 });
2469 self.set_dst(&intrin.def, dst);
2470 }
2471 nir_intrinsic_ipa_nv => {
2472 let addr = u16::try_from(intrin.base()).unwrap();
2473
2474 let flags = intrin.flags();
2475 let flags: nak_nir_ipa_flags =
2476 unsafe { std::mem::transmute_copy(&flags) };
2477
2478 let mode = match flags.interp_mode() {
2479 NAK_INTERP_MODE_PERSPECTIVE => PixelImap::Perspective,
2480 NAK_INTERP_MODE_SCREEN_LINEAR => PixelImap::ScreenLinear,
2481 NAK_INTERP_MODE_CONSTANT => PixelImap::Constant,
2482 _ => panic!("Unsupported interp mode"),
2483 };
2484
2485 let freq = match flags.interp_freq() {
2486 NAK_INTERP_FREQ_PASS => InterpFreq::Pass,
2487 NAK_INTERP_FREQ_PASS_MUL_W => InterpFreq::PassMulW,
2488 NAK_INTERP_FREQ_CONSTANT => InterpFreq::Constant,
2489 NAK_INTERP_FREQ_STATE => InterpFreq::State,
2490 _ => panic!("Invalid interp freq"),
2491 };
2492
2493 let loc = match flags.interp_loc() {
2494 NAK_INTERP_LOC_DEFAULT => InterpLoc::Default,
2495 NAK_INTERP_LOC_CENTROID => InterpLoc::Centroid,
2496 NAK_INTERP_LOC_OFFSET => InterpLoc::Offset,
2497 _ => panic!("Invalid interp loc"),
2498 };
2499
2500 let inv_w = if freq == InterpFreq::PassMulW {
2501 self.get_src(&srcs[0])
2502 } else {
2503 0.into()
2504 };
2505
2506 let offset = if loc == InterpLoc::Offset {
2507 self.get_src(&srcs[1])
2508 } else {
2509 0.into()
2510 };
2511
2512 let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
2513 panic!("OpIpa is only used for fragment shaders");
2514 };
2515
2516 io.mark_attr_read(addr, mode);
2517
2518 let dst = b.alloc_ssa(RegFile::GPR, 1);
2519 b.push_op(OpIpa {
2520 dst: dst.into(),
2521 addr: addr,
2522 freq: freq,
2523 loc: loc,
2524 inv_w: inv_w,
2525 offset: offset,
2526 });
2527 self.set_dst(&intrin.def, dst);
2528 }
2529 nir_intrinsic_isberd_nv => {
2530 let dst = b.alloc_ssa(RegFile::GPR, 1);
2531 b.push_op(OpIsberd {
2532 dst: dst.into(),
2533 idx: self.get_src(&srcs[0]),
2534 });
2535 self.set_dst(&intrin.def, dst);
2536 }
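            // The barycentric intrinsics need no code of their own here; the
            // actual interpolation is emitted when the corresponding ipa_nv
            // intrinsic is parsed, so these arms are intentionally empty.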
2537 nir_intrinsic_load_barycentric_at_offset_nv => (),
2538 nir_intrinsic_load_barycentric_centroid => (),
2539 nir_intrinsic_load_barycentric_pixel => (),
2540 nir_intrinsic_load_barycentric_sample => (),
2541 nir_intrinsic_load_global | nir_intrinsic_load_global_constant => {
2542 let size_B =
2543 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2544 assert!(u32::from(size_B) <= intrin.align());
2545 let order =
2546 if intrin.intrinsic == nir_intrinsic_load_global_constant {
2547 MemOrder::Constant
2548 } else {
2549 MemOrder::Strong(MemScope::System)
2550 };
2551 let access = MemAccess {
2552 mem_type: MemType::from_size(size_B, false),
2553 space: MemSpace::Global(MemAddrType::A64),
2554 order: order,
2555 eviction_priority: self
2556 .get_eviction_priority(intrin.access()),
2557 };
2558 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2559 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2560
2561 b.push_op(OpLd {
2562 dst: dst.into(),
2563 addr: addr,
2564 offset: offset,
2565 access: access,
2566 });
2567 self.set_dst(&intrin.def, dst);
2568 }
2569 nir_intrinsic_ldtram_nv => {
2570 let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
2571 panic!("ldtram_nv is only used for fragment shaders");
2572 };
2573
2574 assert!(
2575 intrin.def.bit_size() == 32
2576 && intrin.def.num_components == 2
2577 );
2578
2579 let flags = intrin.flags();
2580 let use_c = flags != 0;
2581
2582 let addr = u16::try_from(intrin.base()).unwrap();
2583
2584 io.mark_barycentric_attr_in(addr);
2585
2586 let dst = b.alloc_ssa(RegFile::GPR, 2);
2587 b.push_op(OpLdTram {
2588 dst: dst.into(),
2589 addr,
2590 use_c,
2591 });
2592 self.set_dst(&intrin.def, dst);
2593 }
2594 nir_intrinsic_load_sample_id => {
2595 let dst = b.alloc_ssa(RegFile::GPR, 1);
2596 b.push_op(OpPixLd {
2597 dst: dst.into(),
2598 val: PixVal::MyIndex,
2599 });
2600 self.set_dst(&intrin.def, dst);
2601 }
2602 nir_intrinsic_load_sample_mask_in => {
2603 if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
2604 info.reads_sample_mask = true;
2605 } else {
2606 panic!(
2607 "sample_mask_in is only available in fragment shaders"
2608 );
2609 }
2610
2611 let dst = b.alloc_ssa(RegFile::GPR, 1);
2612 b.push_op(OpPixLd {
2613 dst: dst.into(),
2614 val: PixVal::CovMask,
2615 });
2616 self.set_dst(&intrin.def, dst);
2617 }
2618 nir_intrinsic_load_tess_coord_xy => {
2619 // Loading gl_TessCoord in tessellation evaluation shaders is
2620 // weird. It's treated as a per-vertex output which is indexed
2621 // by LANEID.
2622 match &self.info.stage {
2623 ShaderStageInfo::Tessellation(_) => (),
2624 _ => panic!(
2625 "load_tess_coord is only available in tessellation \
2626 shaders"
2627 ),
2628 };
2629
2630 assert!(intrin.def.bit_size() == 32);
2631 assert!(intrin.def.num_components() == 2);
2632
2633 let vtx = b.alloc_ssa(RegFile::GPR, 1);
2634 b.push_op(OpS2R {
2635 dst: vtx.into(),
2636 idx: 0,
2637 });
2638
2639 let access = AttrAccess {
2640 addr: NAK_ATTR_TESS_COORD,
2641 comps: 2,
2642 patch: false,
2643 output: true,
2644 phys: false,
2645 };
2646
2647 // This is recorded as a patch output in parse_shader() because
2648 // the hardware requires it be in the SPH, whether we use it or
2649 // not.
2650
2651 let dst = b.alloc_ssa(RegFile::GPR, access.comps);
2652 b.push_op(OpALd {
2653 dst: dst.into(),
2654 vtx: vtx.into(),
2655 offset: 0.into(),
2656 access: access,
2657 });
2658 self.set_dst(&intrin.def, dst);
2659 }
2660 nir_intrinsic_load_scratch => {
2661 let size_B =
2662 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2663 assert!(u32::from(size_B) <= intrin.align());
2664 let access = MemAccess {
2665 mem_type: MemType::from_size(size_B, false),
2666 space: MemSpace::Local,
2667 order: MemOrder::Strong(MemScope::CTA),
2668 eviction_priority: MemEvictionPriority::Normal,
2669 };
2670 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2671 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2672
2673 b.push_op(OpLd {
2674 dst: dst.into(),
2675 addr: addr,
2676 offset: offset,
2677 access: access,
2678 });
2679 self.set_dst(&intrin.def, dst);
2680 }
2681 nir_intrinsic_load_shared => {
2682 let size_B =
2683 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2684 assert!(u32::from(size_B) <= intrin.align());
2685 let access = MemAccess {
2686 mem_type: MemType::from_size(size_B, false),
2687 space: MemSpace::Shared,
2688 order: MemOrder::Strong(MemScope::CTA),
2689 eviction_priority: MemEvictionPriority::Normal,
2690 };
2691 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2692 let offset = offset + intrin.base();
2693 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2694
2695 b.push_op(OpLd {
2696 dst: dst.into(),
2697 addr: addr,
2698 offset: offset,
2699 access: access,
2700 });
2701 self.set_dst(&intrin.def, dst);
2702 }
2703 nir_intrinsic_load_sysval_nv => {
2704 let idx = u8::try_from(intrin.base()).unwrap();
2705 debug_assert!(intrin.def.num_components == 1);
2706 debug_assert!(
2707 intrin.def.bit_size == 32 || intrin.def.bit_size == 64
2708 );
2709 let comps = intrin.def.bit_size / 32;
2710 let dst = b.alloc_ssa(RegFile::GPR, comps);
2711 if idx == NAK_SV_CLOCK || idx == NAK_SV_CLOCK + 1 {
2712 debug_assert!(idx + comps <= NAK_SV_CLOCK + 2);
2713 b.push_op(OpCS2R {
2714 dst: dst.into(),
2715 idx: idx,
2716 });
2717 } else {
2718 debug_assert!(intrin.def.bit_size == 32);
2719 b.push_op(OpS2R {
2720 dst: dst.into(),
2721 idx: idx,
2722 });
2723 }
2724 self.set_dst(&intrin.def, dst);
2725 }
2726 nir_intrinsic_ldc_nv => {
2727 let size_B =
2728 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2729 let idx = &srcs[0];
2730
2731 let (off, off_imm) = self.get_cbuf_addr_offset(&srcs[1]);
2732
2733 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2734
2735 if let Some(idx_imm) = idx.as_uint() {
2736 let idx_imm: u8 = idx_imm.try_into().unwrap();
2737 let cb = CBufRef {
2738 buf: CBuf::Binding(idx_imm),
2739 offset: off_imm,
2740 };
2741 if off.is_zero() {
2742 for (i, comp) in dst.iter().enumerate() {
2743 let i = u16::try_from(i).unwrap();
2744 b.copy_to((*comp).into(), cb.offset(i * 4).into());
2745 }
2746 } else {
2747 b.push_op(OpLdc {
2748 dst: dst.into(),
2749 cb: cb.into(),
2750 offset: off,
2751 mode: LdcMode::Indexed,
2752 mem_type: MemType::from_size(size_B, false),
2753 });
2754 }
2755 } else {
2756 // In the IndexedSegmented mode, the hardware computes the
2757 // actual index and offset as follows:
2758 //
2759 // idx = imm_idx + reg[31:16]
2760 // offset = imm_offset + reg[15:0]
2761 // ldc c[idx][offset]
2762 //
2763 // So pack the index and offset accordingly
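                    // For example, b.prmt(off, idx, [0, 1, 4, 5]) selects the
                    // bytes {off[0], off[1], idx[0], idx[1]}, i.e. off[15:0]
                    // in the low half and idx[15:0] in the high half, which
                    // matches the register layout described above.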
2764 let idx = self.get_src(idx);
2765 let off_idx = b.prmt(off, idx, [0, 1, 4, 5]);
2766 let cb = CBufRef {
2767 buf: CBuf::Binding(0),
2768 offset: off_imm,
2769 };
2770 b.push_op(OpLdc {
2771 dst: dst.into(),
2772 cb: cb.into(),
2773 offset: off_idx.into(),
2774 mode: LdcMode::IndexedSegmented,
2775 mem_type: MemType::from_size(size_B, false),
2776 });
2777 }
2778 self.set_dst(&intrin.def, dst);
2779 }
2780 nir_intrinsic_ldcx_nv => {
2781 let size_B =
2782 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2783
2784 let handle = self.get_ssa_ref(&srcs[0]);
2785 let (off, off_imm) = self.get_cbuf_addr_offset(&srcs[1]);
2786
2787 let cb = CBufRef {
2788 buf: CBuf::BindlessSSA(handle),
2789 offset: off_imm,
2790 };
2791
2792 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2793 if off.is_zero() {
2794 for (i, comp) in dst.iter().enumerate() {
2795 let i = u16::try_from(i).unwrap();
2796 b.copy_to((*comp).into(), cb.offset(i * 4).into());
2797 }
2798 } else {
2799 b.push_op(OpLdc {
2800 dst: dst.into(),
2801 cb: cb.into(),
2802 offset: off,
2803 mode: LdcMode::Indexed,
2804 mem_type: MemType::from_size(size_B, false),
2805 });
2806 }
2807 self.set_dst(&intrin.def, dst);
2808 }
2809 nir_intrinsic_pin_cx_handle_nv => {
2810 let handle = self.get_ssa_ref(&srcs[0]);
2811 b.push_op(OpPin {
2812 src: handle.into(),
2813 dst: handle.into(),
2814 });
2815 }
2816 nir_intrinsic_unpin_cx_handle_nv => {
2817 let handle = self.get_ssa_ref(&srcs[0]);
2818 b.push_op(OpUnpin {
2819 src: handle.into(),
2820 dst: handle.into(),
2821 });
2822 }
2823 nir_intrinsic_barrier => {
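                // A NIR barrier may carry an execution scope and/or memory
                // semantics.  Release semantics on global memory become a
                // WBAll cache write-back (SM70+ only), a workgroup execution
                // scope becomes BAR.SYNC, the memory scope picks the MEMBAR
                // level, and acquire semantics become an IVAll invalidate.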
2824 let modes = intrin.memory_modes();
2825 let semantics = intrin.memory_semantics();
2826 if (modes & nir_var_mem_global) != 0
2827 && (semantics & NIR_MEMORY_RELEASE) != 0
2828 {
2829 // Pre-Volta doesn't have WBAll but it also seems that we
2830 // don't need it.
2831 if self.sm.sm() >= 70 {
2832 b.push_op(OpCCtl {
2833 op: CCtlOp::WBAll,
2834 mem_space: MemSpace::Global(MemAddrType::A64),
2835 addr: 0.into(),
2836 addr_offset: 0,
2837 });
2838 }
2839 }
2840 match intrin.execution_scope() {
2841 SCOPE_NONE => (),
2842 SCOPE_WORKGROUP => {
2843 assert!(
2844 self.nir.info.stage() == MESA_SHADER_COMPUTE
2845 || self.nir.info.stage() == MESA_SHADER_KERNEL
2846 );
2847 self.info.num_control_barriers = 1;
2848 b.push_op(OpBar {});
2849 }
2850 _ => panic!("Unhandled execution scope"),
2851 }
2852 if intrin.memory_scope() != SCOPE_NONE {
2853 let mem_scope = match intrin.memory_scope() {
2854 SCOPE_INVOCATION | SCOPE_SUBGROUP => MemScope::CTA,
2855 SCOPE_WORKGROUP | SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => {
2856 MemScope::GPU
2857 }
2858 _ => panic!("Unhandled memory scope"),
2859 };
2860 b.push_op(OpMemBar { scope: mem_scope });
2861 }
2862 if (modes & nir_var_mem_global) != 0
2863 && (semantics & NIR_MEMORY_ACQUIRE) != 0
2864 {
2865 b.push_op(OpCCtl {
2866 op: CCtlOp::IVAll,
2867 mem_space: MemSpace::Global(MemAddrType::A64),
2868 addr: 0.into(),
2869 addr_offset: 0,
2870 });
2871 }
2872 }
2873 nir_intrinsic_quad_broadcast
2874 | nir_intrinsic_read_invocation
2875 | nir_intrinsic_shuffle
2876 | nir_intrinsic_shuffle_down
2877 | nir_intrinsic_shuffle_up
2878 | nir_intrinsic_shuffle_xor => {
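                // All of these map onto SHFL; only the mode and the
                // clamp/segment-mask constant differ.  0x1c03 appears to
                // confine quad_broadcast to its quad, 0x1f is the full-warp
                // clamp, and shuffle_up clamps at lane 0.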
2879 assert!(srcs[0].bit_size() == 32);
2880 assert!(srcs[0].num_components() == 1);
2881 let data = self.get_src(&srcs[0]);
2882
2883 assert!(srcs[1].bit_size() == 32);
2884 let idx = self.get_src(&srcs[1]);
2885
2886 assert!(intrin.def.bit_size() == 32);
2887 let dst = b.alloc_ssa(RegFile::GPR, 1);
2888
2889 b.push_op(OpShfl {
2890 dst: dst.into(),
2891 in_bounds: Dst::None,
2892 src: data,
2893 lane: idx,
2894 c: match intrin.intrinsic {
2895 nir_intrinsic_quad_broadcast => 0x1c_03.into(),
2896 nir_intrinsic_shuffle_up => 0.into(),
2897 _ => 0x1f.into(),
2898 },
2899 op: match intrin.intrinsic {
2900 nir_intrinsic_shuffle_down => ShflOp::Down,
2901 nir_intrinsic_shuffle_up => ShflOp::Up,
2902 nir_intrinsic_shuffle_xor => ShflOp::Bfly,
2903 _ => ShflOp::Idx,
2904 },
2905 });
2906 self.set_dst(&intrin.def, dst);
2907 }
2908 nir_intrinsic_quad_swap_horizontal
2909 | nir_intrinsic_quad_swap_vertical
2910 | nir_intrinsic_quad_swap_diagonal => {
2911 assert!(srcs[0].bit_size() == 32);
2912 assert!(srcs[0].num_components() == 1);
2913 let data = self.get_src(&srcs[0]);
2914
2915 assert!(intrin.def.bit_size() == 32);
2916 let dst = b.alloc_ssa(RegFile::GPR, 1);
2917 b.push_op(OpShfl {
2918 dst: dst.into(),
2919 in_bounds: Dst::None,
2920 src: data,
2921 lane: match intrin.intrinsic {
2922 nir_intrinsic_quad_swap_horizontal => 1_u32.into(),
2923 nir_intrinsic_quad_swap_vertical => 2_u32.into(),
2924 nir_intrinsic_quad_swap_diagonal => 3_u32.into(),
2925 op => panic!("Unknown quad intrinsic {}", op),
2926 },
2927 c: 0x1c_03.into(),
2928 op: ShflOp::Bfly,
2929 });
2930 self.set_dst(&intrin.def, dst);
2931 }
2932 nir_intrinsic_shared_atomic => {
2933 let bit_size = intrin.def.bit_size();
2934 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2935 let data = self.get_src(&srcs[1]);
2936 let atom_type = self.get_atomic_type(intrin);
2937 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
2938
2939 assert!(intrin.def.num_components() == 1);
2940 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2941
2942 b.push_op(OpAtom {
2943 dst: dst.into(),
2944 addr: addr,
2945 cmpr: 0.into(),
2946 data: data,
2947 atom_op: atom_op,
2948 atom_type: atom_type,
2949 addr_offset: offset,
2950 mem_space: MemSpace::Shared,
2951 mem_order: MemOrder::Strong(MemScope::CTA),
2952 mem_eviction_priority: MemEvictionPriority::Normal,
2953 });
2954 self.set_dst(&intrin.def, dst);
2955 }
2956 nir_intrinsic_shared_atomic_swap => {
2957 assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
2958 let bit_size = intrin.def.bit_size();
2959 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2960 let cmpr = self.get_src(&srcs[1]);
2961 let data = self.get_src(&srcs[2]);
2962 let atom_type = AtomType::U(bit_size);
2963
2964 assert!(intrin.def.num_components() == 1);
2965 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2966
2967 b.push_op(OpAtom {
2968 dst: dst.into(),
2969 addr: addr,
2970 cmpr: cmpr,
2971 data: data,
2972 atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate),
2973 atom_type: atom_type,
2974 addr_offset: offset,
2975 mem_space: MemSpace::Shared,
2976 mem_order: MemOrder::Strong(MemScope::CTA),
2977 mem_eviction_priority: MemEvictionPriority::Normal,
2978 });
2979 self.set_dst(&intrin.def, dst);
2980 }
2981 nir_intrinsic_ssa_bar_nv => {
2982 let src = self.get_src(&srcs[0]);
2983 b.push_op(OpSrcBar { src });
2984 }
2985 nir_intrinsic_store_global => {
2986 let data = self.get_src(&srcs[0]);
2987 let size_B =
2988 (srcs[0].bit_size() / 8) * srcs[0].num_components();
2989 assert!(u32::from(size_B) <= intrin.align());
2990 let access = MemAccess {
2991 mem_type: MemType::from_size(size_B, false),
2992 space: MemSpace::Global(MemAddrType::A64),
2993 order: MemOrder::Strong(MemScope::System),
2994 eviction_priority: self
2995 .get_eviction_priority(intrin.access()),
2996 };
2997 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
2998
2999 b.push_op(OpSt {
3000 addr: addr,
3001 data: data,
3002 offset: offset,
3003 access: access,
3004 });
3005 }
3006 nir_intrinsic_fs_out_nv => {
3007 let data = self.get_ssa(srcs[0].as_def());
3008 assert!(data.len() == 1);
3009 let data = data[0];
3010
3011 let addr = u16::try_from(intrin.base()).unwrap();
3012 assert!(addr % 4 == 0);
3013
3014 self.fs_out_regs[usize::from(addr / 4)] = data;
3015 }
3016 nir_intrinsic_store_scratch => {
3017 let data = self.get_src(&srcs[0]);
3018 let size_B =
3019 (srcs[0].bit_size() / 8) * srcs[0].num_components();
3020 assert!(u32::from(size_B) <= intrin.align());
3021 let access = MemAccess {
3022 mem_type: MemType::from_size(size_B, false),
3023 space: MemSpace::Local,
3024 order: MemOrder::Strong(MemScope::CTA),
3025 eviction_priority: MemEvictionPriority::Normal,
3026 };
3027 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
3028
3029 b.push_op(OpSt {
3030 addr: addr,
3031 data: data,
3032 offset: offset,
3033 access: access,
3034 });
3035 }
3036 nir_intrinsic_store_shared => {
3037 let data = self.get_src(&srcs[0]);
3038 let size_B =
3039 (srcs[0].bit_size() / 8) * srcs[0].num_components();
3040 assert!(u32::from(size_B) <= intrin.align());
3041 let access = MemAccess {
3042 mem_type: MemType::from_size(size_B, false),
3043 space: MemSpace::Shared,
3044 order: MemOrder::Strong(MemScope::CTA),
3045 eviction_priority: MemEvictionPriority::Normal,
3046 };
3047 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
3048 let offset = offset + intrin.base();
3049
3050 b.push_op(OpSt {
3051 addr: addr,
3052 data: data,
3053 offset: offset,
3054 access: access,
3055 });
3056 }
3057 nir_intrinsic_emit_vertex_nv | nir_intrinsic_end_primitive_nv => {
3058 assert!(intrin.def.bit_size() == 32);
3059 assert!(intrin.def.num_components() == 1);
3060
3061 let dst = b.alloc_ssa(RegFile::GPR, 1);
3062 let handle = self.get_src(&srcs[0]);
3063 let stream_id = intrin.stream_id();
3064
3065 b.push_op(OpOut {
3066 dst: dst.into(),
3067 handle: handle,
3068 stream: stream_id.into(),
3069 out_type: if intrin.intrinsic
3070 == nir_intrinsic_emit_vertex_nv
3071 {
3072 OutType::Emit
3073 } else {
3074 OutType::Cut
3075 },
3076 });
3077 self.set_dst(&intrin.def, dst);
3078 }
3079
3080 nir_intrinsic_final_primitive_nv => {
3081 let handle = self.get_src(&srcs[0]);
3082
3083 if self.sm.sm() >= 70 {
3084 b.push_op(OpOutFinal { handle: handle });
3085 } else {
3086 b.push_op(OpRegOut { srcs: vec![handle] });
3087 }
3088 }
3089 nir_intrinsic_vote_all
3090 | nir_intrinsic_vote_any
3091 | nir_intrinsic_vote_ieq => {
3092 assert!(srcs[0].bit_size() == 1);
3093 let src = self.get_src(&srcs[0]);
3094
3095 assert!(intrin.def.bit_size() == 1);
3096 let dst = b.alloc_ssa(RegFile::Pred, 1);
3097
3098 b.push_op(OpVote {
3099 op: match intrin.intrinsic {
3100 nir_intrinsic_vote_all => VoteOp::All,
3101 nir_intrinsic_vote_any => VoteOp::Any,
3102 nir_intrinsic_vote_ieq => VoteOp::Eq,
3103 _ => panic!("Unknown vote intrinsic"),
3104 },
3105 ballot: Dst::None,
3106 vote: dst.into(),
3107 pred: src,
3108 });
3109 self.set_dst(&intrin.def, dst);
3110 }
3111 nir_intrinsic_is_sparse_texels_resident => {
3112 let src = self.get_src(&srcs[0]);
3113 let dst = b.isetp(IntCmpType::I32, IntCmpOp::Ne, src, 0.into());
3114 self.set_dst(&intrin.def, dst);
3115 }
3116 _ => panic!(
3117 "Unsupported intrinsic instruction: {}",
3118 intrin.info().name()
3119 ),
3120 }
3121 }
3122
3123     fn parse_load_const(
3124 &mut self,
3125 b: &mut impl SSABuilder,
3126 load_const: &nir_load_const_instr,
3127 ) {
3128 let values = &load_const.values();
3129
3130 let mut dst = Vec::new();
3131 match load_const.def.bit_size {
3132 1 => {
3133 for c in 0..load_const.def.num_components {
3134 let imm_b1 = unsafe { values[usize::from(c)].b };
3135 dst.push(b.copy(imm_b1.into())[0]);
3136 }
3137 }
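            // 8- and 16-bit constants are packed into 32-bit registers.  For
            // example, an 8-bit vec3 (a, b, c) ends up as the single dword
            // 0x00ccbbaa: component 0 in the low byte, missing bytes zero.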
3138 8 => {
3139 for dw in 0..load_const.def.num_components.div_ceil(4) {
3140 let mut imm_u32 = 0;
3141 for b in 0..4 {
3142 let c = dw * 4 + b;
3143 if c < load_const.def.num_components {
3144 let imm_u8 = unsafe { values[usize::from(c)].u8_ };
3145                             imm_u32 |= u32::from(imm_u8) << (b * 8);
3146 }
3147 }
3148 dst.push(b.copy(imm_u32.into())[0]);
3149 }
3150 }
3151 16 => {
3152 for dw in 0..load_const.def.num_components.div_ceil(2) {
3153 let mut imm_u32 = 0;
3154 for w in 0..2 {
3155 let c = dw * 2 + w;
3156 if c < load_const.def.num_components {
3157 let imm_u16 =
3158 unsafe { values[usize::from(c)].u16_ };
3159                             imm_u32 |= u32::from(imm_u16) << (w * 16);
3160 }
3161 }
3162 dst.push(b.copy(imm_u32.into())[0]);
3163 }
3164 }
3165 32 => {
3166 for c in 0..load_const.def.num_components {
3167 let imm_u32 = unsafe { values[usize::from(c)].u32_ };
3168 dst.push(b.copy(imm_u32.into())[0]);
3169 }
3170 }
3171 64 => {
3172 for c in 0..load_const.def.num_components {
3173 let imm_u64 = unsafe { values[c as usize].u64_ };
3174 dst.push(b.copy((imm_u64 as u32).into())[0]);
3175 dst.push(b.copy(((imm_u64 >> 32) as u32).into())[0]);
3176 }
3177 }
3178 _ => panic!("Unknown bit size: {}", load_const.def.bit_size),
3179 }
3180
3181 self.set_ssa(&load_const.def, dst);
3182 }
3183
3184     fn parse_undef(
3185 &mut self,
3186 b: &mut impl SSABuilder,
3187 undef: &nir_undef_instr,
3188 ) {
3189 let dst = alloc_ssa_for_nir(b, &undef.def);
3190 for c in &dst {
3191 b.push_op(OpUndef { dst: (*c).into() });
3192 }
3193 self.set_ssa(&undef.def, dst);
3194 }
3195
3196     fn emit_jump(
3197 &mut self,
3198 b: &mut impl SSABuilder,
3199 nb: &nir_block,
3200 target: &nir_block,
3201 ) {
3202 if target.index == self.end_block_id {
3203 b.push_op(OpExit {});
3204 } else {
3205 self.cfg.add_edge(nb.index, target.index);
3206 let target_label = self.get_block_label(target);
3207
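            // On pre-Volta hardware, a jump to a block that was pushed onto
            // the CRS stack (via SSY/PBK/PCNT elsewhere in this file) must
            // use the matching SYNC/BRK/CONT instruction; anything else is a
            // plain branch.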
3208 match self.peek_crs(target) {
3209 Some(SyncType::Sync) => {
3210 b.push_op(OpSync {
3211 target: target_label,
3212 });
3213 }
3214 Some(SyncType::Brk) => {
3215 b.push_op(OpBrk {
3216 target: target_label,
3217 });
3218 }
3219 Some(SyncType::Cont) => {
3220 b.push_op(OpCont {
3221 target: target_label,
3222 });
3223 }
3224 None => {
3225 b.push_op(OpBra {
3226 target: target_label,
3227 });
3228 }
3229 }
3230 }
3231 }
3232
3233     fn emit_pred_jump(
3234 &mut self,
3235 b: &mut impl SSABuilder,
3236 nb: &nir_block,
3237 pred: Pred,
3238 target: &nir_block,
3239 fallthrough: &nir_block,
3240 ) {
3241 // The fall-through edge has to come first
3242 self.cfg.add_edge(nb.index, fallthrough.index);
3243 let op = if target.index == self.end_block_id {
3244 Op::Exit(OpExit {})
3245 } else {
3246 self.cfg.add_edge(nb.index, target.index);
3247 Op::Bra(OpBra {
3248 target: self.get_block_label(target),
3249 })
3250 };
3251 b.predicate(pred).push_op(op);
3252 }
3253
3254     fn parse_block(
3255 &mut self,
3256 ssa_alloc: &mut SSAValueAllocator,
3257 phi_map: &mut PhiAllocMap,
3258 nb: &nir_block,
3259 ) {
3260 let sm = self.sm;
3261 let mut b = SSAInstrBuilder::new(sm, ssa_alloc);
3262
3263 if self.sm.sm() >= 70 && nb.index == 0 && self.nir.info.shared_size > 0
3264 {
3265 // The blob seems to always do a BSYNC before accessing shared
3266 // memory. Perhaps this is to ensure that our allocation is
3267 // actually available and not in use by another thread?
3268 let label = self.label_alloc.alloc();
3269 let bar_clear = b.alloc_ssa(RegFile::Bar, 1);
3270
3271 b.push_op(OpBClear {
3272 dst: bar_clear.into(),
3273 });
3274
3275 let bar = b.alloc_ssa(RegFile::Bar, 1);
3276 b.push_op(OpBSSy {
3277 bar_out: bar.into(),
3278 bar_in: bar_clear.into(),
3279 cond: SrcRef::True.into(),
3280 target: label,
3281 });
3282
3283 b.push_op(OpBSync {
3284 bar: bar.into(),
3285 cond: SrcRef::True.into(),
3286 });
3287
3288 b.push_op(OpNop { label: Some(label) });
3289 }
3290
3291 let mut phi = OpPhiDsts::new();
3292 for ni in nb.iter_instr_list() {
3293 let Some(np) = ni.as_phi() else {
3294 break;
3295 };
3296
3297 if DEBUG.annotate() {
3298 let annotation = self
3299 .nir_instr_printer
3300 .instr_to_string(ni)
3301 .split_whitespace()
3302 .collect::<Vec<_>>()
3303 .join(" ");
3304 b.push_op(OpAnnotate {
3305 annotation: format!("generated by \"{}\"", annotation,),
3306 });
3307 }
3308
3309 let uniform = !nb.divergent
3310 && self.sm.sm() >= 75
3311 && !DEBUG.no_ugpr()
3312 && !np.def.divergent;
3313
3314 // This should be ensured by nak_nir_lower_cf()
3315 if uniform {
3316 for ps in np.iter_srcs() {
3317 assert!(!ps.pred().divergent);
3318 }
3319 }
3320
3321 let mut b = UniformBuilder::new(&mut b, uniform);
3322 let dst = alloc_ssa_for_nir(&mut b, np.def.as_def());
3323 for i in 0..dst.len() {
3324 let phi_id = phi_map.get_phi_id(np, i.try_into().unwrap());
3325 phi.dsts.push(phi_id, dst[i].into());
3326 }
3327 self.set_ssa(np.def.as_def(), dst);
3328 }
3329
3330 if !phi.dsts.is_empty() {
3331 b.push_op(phi);
3332 }
3333
3334 if self.sm.sm() < 75 && nb.cf_node.prev().is_none() {
3335 if let Some(_) = nb.parent().as_loop() {
3336 b.push_op(OpPCnt {
3337 target: self.get_block_label(nb),
3338 });
3339 self.push_crs(nb, SyncType::Cont);
3340 }
3341 }
3342
3343 let mut goto = None;
3344 for ni in nb.iter_instr_list() {
3345 if DEBUG.annotate() && ni.type_ != nir_instr_type_phi {
3346 let annotation = self
3347 .nir_instr_printer
3348 .instr_to_string(ni)
3349 .split_whitespace()
3350 .collect::<Vec<_>>()
3351 .join(" ");
3352 b.push_op(OpAnnotate {
3353 annotation: format!("generated by \"{}\"", annotation,),
3354 });
3355 }
3356
3357 let uniform = !nb.divergent
3358 && self.sm.sm() >= 75
3359 && !DEBUG.no_ugpr()
3360 && ni.def().is_some_and(|d| !d.divergent);
3361 let mut b = UniformBuilder::new(&mut b, uniform);
3362
3363 match ni.type_ {
3364 nir_instr_type_alu => {
3365 self.parse_alu(&mut b, ni.as_alu().unwrap())
3366 }
3367 nir_instr_type_jump => {
3368 let jump = ni.as_jump().unwrap();
3369 if jump.type_ == nir_jump_goto
3370 || jump.type_ == nir_jump_goto_if
3371 {
3372 goto = Some(jump);
3373 }
3374 }
3375 nir_instr_type_tex => {
3376 self.parse_tex(&mut b, ni.as_tex().unwrap())
3377 }
3378 nir_instr_type_intrinsic => {
3379 self.parse_intrinsic(&mut b, ni.as_intrinsic().unwrap())
3380 }
3381 nir_instr_type_load_const => {
3382 self.parse_load_const(&mut b, ni.as_load_const().unwrap())
3383 }
3384 nir_instr_type_undef => {
3385 self.parse_undef(&mut b, ni.as_undef().unwrap())
3386 }
3387 nir_instr_type_phi => (),
3388 _ => panic!("Unsupported instruction type"),
3389 }
3390 }
3391
3392 if self.sm.sm() < 70 {
3393 if let Some(ni) = nb.following_if() {
3394 let fb = ni.following_block();
3395 b.push_op(OpSSy {
3396 target: self.get_block_label(fb),
3397 });
3398 self.push_crs(fb, SyncType::Sync);
3399 } else if let Some(nl) = nb.following_loop() {
3400 let fb = nl.following_block();
3401 b.push_op(OpPBk {
3402 target: self.get_block_label(fb),
3403 });
3404 self.push_crs(fb, SyncType::Brk);
3405 }
3406 }
3407
3408 let succ = nb.successors();
3409 for sb in succ {
3410 let sb = match sb {
3411 Some(b) => b,
3412 None => continue,
3413 };
3414
3415 let mut phi = OpPhiSrcs::new();
3416
3417 for ni in sb.iter_instr_list() {
3418 let Some(np) = ni.as_phi() else {
3419 break;
3420 };
3421
3422 if DEBUG.annotate() {
3423 let annotation = self
3424 .nir_instr_printer
3425 .instr_to_string(ni)
3426 .split_whitespace()
3427 .collect::<Vec<_>>()
3428 .join(" ");
3429 b.push_op(OpAnnotate {
3430 annotation: format!("generated by \"{}\"", annotation,),
3431 });
3432 }
3433
3434 for ps in np.iter_srcs() {
3435 if ps.pred().index == nb.index {
3436 let src = *self.get_src(&ps.src).as_ssa().unwrap();
3437 for (i, src) in src.iter().enumerate() {
3438 let phi_id =
3439 phi_map.get_phi_id(np, i.try_into().unwrap());
3440 phi.srcs.push(phi_id, (*src).into());
3441 }
3442 break;
3443 }
3444 }
3445 }
3446
3447 if !phi.srcs.is_empty() {
3448 b.push_op(phi);
3449 }
3450 }
3451
3452 if let Some(goto) = goto {
3453 let target = goto.target().unwrap();
3454 if goto.type_ == nir_jump_goto {
3455 self.emit_jump(&mut b, nb, target);
3456 } else {
3457 let cond = self.get_ssa(goto.condition.as_def())[0];
3458 let else_target = goto.else_target().unwrap();
3459
3460                 // Next block in the NIR CF list
3461 let next_block = nb.cf_node.next().unwrap().as_block().unwrap();
3462
3463 if else_target as *const _ == next_block as *const _ {
3464 self.emit_pred_jump(
3465 &mut b,
3466 nb,
3467                         // Branch to the goto target when cond is true;
3468                         // the else target is the fall-through block
3468 cond.into(),
3469 target,
3470 else_target,
3471 );
3472 } else if target as *const _ == next_block as *const _ {
3473 self.emit_pred_jump(
3474 &mut b,
3475 nb,
3476 Pred::from(cond).bnot(),
3477 else_target,
3478 target,
3479 );
3480 } else {
3481 panic!(
3482 "One of the two goto targets must be the next block in \
3483 the NIR CF list"
3484 );
3485 }
3486 }
3487 } else {
3488 if let Some(ni) = nb.following_if() {
3489 let cond = self.get_ssa(ni.condition.as_def())[0];
3490 self.emit_pred_jump(
3491 &mut b,
3492 nb,
3493 // This is the branch to jump to the else
3494 Pred::from(cond).bnot(),
3495 ni.first_else_block(),
3496 ni.first_then_block(),
3497 );
3498 } else {
3499 assert!(succ[1].is_none());
3500 let s0 = succ[0].unwrap();
3501 self.emit_jump(&mut b, nb, s0);
3502 }
3503 }
3504
3505 let bb = BasicBlock {
3506 label: self.get_block_label(nb),
3507 uniform: !nb.divergent,
3508 instrs: b.as_vec(),
3509 };
3510 self.cfg.add_node(nb.index, bb);
3511 }
3512
3513     fn parse_if(
3514 &mut self,
3515 ssa_alloc: &mut SSAValueAllocator,
3516 phi_map: &mut PhiAllocMap,
3517 ni: &nir_if,
3518 ) {
3519 self.parse_cf_list(ssa_alloc, phi_map, ni.iter_then_list());
3520 self.parse_cf_list(ssa_alloc, phi_map, ni.iter_else_list());
3521
3522 if self.sm.sm() < 70 {
3523 let next_block = ni.cf_node.next().unwrap().as_block().unwrap();
3524 self.pop_crs(next_block, SyncType::Sync);
3525 }
3526 }
3527
3528     fn parse_loop(
3529 &mut self,
3530 ssa_alloc: &mut SSAValueAllocator,
3531 phi_map: &mut PhiAllocMap,
3532 nl: &nir_loop,
3533 ) {
3534 self.parse_cf_list(ssa_alloc, phi_map, nl.iter_body());
3535
3536 if self.sm.sm() < 70 {
3537 let header = nl.iter_body().next().unwrap().as_block().unwrap();
3538 self.pop_crs(header, SyncType::Cont);
3539 let next_block = nl.cf_node.next().unwrap().as_block().unwrap();
3540 self.pop_crs(next_block, SyncType::Brk);
3541 }
3542 }
3543
3544     fn parse_cf_list(
3545 &mut self,
3546 ssa_alloc: &mut SSAValueAllocator,
3547 phi_map: &mut PhiAllocMap,
3548 list: ExecListIter<nir_cf_node>,
3549 ) {
3550 for node in list {
3551 match node.type_ {
3552 nir_cf_node_block => {
3553 let nb = node.as_block().unwrap();
3554 self.parse_block(ssa_alloc, phi_map, nb);
3555 }
3556 nir_cf_node_if => {
3557 let ni = node.as_if().unwrap();
3558 self.parse_if(ssa_alloc, phi_map, ni);
3559 }
3560 nir_cf_node_loop => {
3561 let nl = node.as_loop().unwrap();
3562 self.parse_loop(ssa_alloc, phi_map, nl);
3563 }
3564 _ => panic!("Invalid inner CF node type"),
3565 }
3566 }
3567 }
3568
3569     pub fn parse_function_impl(&mut self, nfi: &nir_function_impl) -> Function {
3570 let mut ssa_alloc = SSAValueAllocator::new();
3571 let end_nb = nfi.end_block();
3572 self.end_block_id = end_nb.index;
3573
3574 let mut phi_alloc = PhiAllocator::new();
3575 let mut phi_map = PhiAllocMap::new(&mut phi_alloc);
3576
3577 self.parse_cf_list(&mut ssa_alloc, &mut phi_map, nfi.iter_body());
3578
3579 let cfg = std::mem::take(&mut self.cfg).as_cfg();
3580 assert!(cfg.len() > 0);
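        // Sanity check: any block that falls through must be laid out
        // immediately before its fall-through successor.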
3581 for i in 0..cfg.len() {
3582 if cfg[i].falls_through() {
3583 assert!(cfg.succ_indices(i)[0] == i + 1);
3584 }
3585 }
3586
3587 let mut f = Function {
3588 ssa_alloc: ssa_alloc,
3589 phi_alloc: phi_alloc,
3590 blocks: cfg,
3591 };
3592 f.repair_ssa();
3593 f
3594 }
3595
3596     pub fn parse_shader(mut self) -> Shader<'a> {
3597 let mut functions = Vec::new();
3598 for nf in self.nir.iter_functions() {
3599 if let Some(nfi) = nf.get_impl() {
3600 let f = self.parse_function_impl(nfi);
3601 functions.push(f);
3602 }
3603 }
3604
3605 // Tessellation evaluation shaders MUST claim to read gl_TessCoord or
3606 // the hardware will throw an SPH error.
3607 if matches!(self.info.stage, ShaderStageInfo::Tessellation(_)) {
3608 match &mut self.info.io {
3609 ShaderIoInfo::Vtg(io) => {
3610 let tc = NAK_ATTR_TESS_COORD;
3611 io.mark_attrs_written(tc..(tc + 8));
3612 }
3613 _ => panic!("Tessellation must have ShaderIoInfo::Vtg"),
3614 }
3615 }
3616
3617 Shader {
3618 sm: self.sm,
3619 info: self.info,
3620 functions: functions,
3621 }
3622 }
3623 }
3624
3625 pub fn nak_shader_from_nir<'a>(
3626 ns: &'a nir_shader,
3627 sm: &'a dyn ShaderModel,
3628 ) -> Shader<'a> {
3629 ShaderFromNir::new(ns, sm).parse_shader()
3630 }
3631