/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "aco_builder.h"

#include "util/u_debug.h"

#include "c11/threads.h"

namespace aco {

thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;

uint64_t debug_flags = 0;

static const struct debug_control aco_debug_options[] = {
   {"validateir", DEBUG_VALIDATE_IR},
   {"validatera", DEBUG_VALIDATE_RA},
   {"validate-livevars", DEBUG_VALIDATE_LIVE_VARS},
   {"novalidateir", DEBUG_NO_VALIDATE_IR},
   {"force-waitcnt", DEBUG_FORCE_WAITCNT},
   {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
   {"novn", DEBUG_NO_VN},
   {"noopt", DEBUG_NO_OPT},
   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
   {"nosched-ilp", DEBUG_NO_SCHED_ILP},
   {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
   {"perfinfo", DEBUG_PERF_INFO},
   {"liveinfo", DEBUG_LIVE_INFO},
   {NULL, 0}};

static once_flag init_once_flag = ONCE_FLAG_INIT;

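/* Parses the ACO_DEBUG environment variable once per process. Flag names come
 * from aco_debug_options above, e.g. ACO_DEBUG=validateir,perfinfo (assuming
 * the usual comma-separated parse_debug_string syntax).
 */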
static void
init_once()
{
   debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);

#ifndef NDEBUG
   /* enable some flags by default on debug builds */
   debug_flags |= aco::DEBUG_VALIDATE_IR;
#endif

   if (debug_flags & aco::DEBUG_NO_VALIDATE_IR)
      debug_flags &= ~aco::DEBUG_VALIDATE_IR;
}

void
init()
{
   call_once(&init_once_flag, init_once);
}

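/* Fills in the Program's target description for the given stage and GPU:
 * chip family, wave size, register and LDS limits, and default FP modes.
 */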
void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   instruction_buffer = &program->m;
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      case GFX10_3: program->family = CHIP_NAVI21; break;
      case GFX11: program->family = CHIP_NAVI31; break;
      case GFX12: program->family = CHIP_GFX1200; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024
                                       : gfx_level >= GFX7                        ? 512
                                                                                  : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;

   /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;

   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   program->dev.vgpr_limit = stage == raytracing_cs ? 128 : 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 128 * 20; /* enough for max waves */
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */

      if (family == CHIP_NAVI31 || family == CHIP_NAVI32 || family == CHIP_GFX1151 ||
          gfx_level >= GFX12) {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
      } else {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
         if (gfx_level >= GFX10_3)
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
         else
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
      }
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   program->dev.scratch_alloc_granule = gfx_level >= GFX11 ? 256 : 1024;

   program->dev.max_waves_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_waves_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_waves_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_waves_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUs */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level == GFX10;
   program->dev.has_fmac_legacy32 = program->gfx_level >= GFX10_3 && program->gfx_level < GFX12;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_MI100 || program->family == CHIP_MI200)
      program->dev.fused_mad_mix = true;

   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   if (program->gfx_level >= GFX12) {
      /* Same as GFX11, except one less for VSAMPLE. */
      program->dev.max_nsa_vgprs = 3;
   } else if (program->gfx_level >= GFX11) {
      /* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
       * rest of the address.
       */
      program->dev.max_nsa_vgprs = 4;
   } else if (program->gfx_level >= GFX10_3) {
      /* GFX10.3 can have up to 3 NSA dwords. */
      program->dev.max_nsa_vgprs = 13;
   } else if (program->gfx_level >= GFX10) {
      /* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
      program->dev.max_nsa_vgprs = 5;
   } else {
      program->dev.max_nsa_vgprs = 0;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}

bool
is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr)
{
   return instr->opcode == aco_opcode::s_wait_event &&
          (gfx_level >= GFX12 ? (instr->salu().imm & wait_event_imm_wait_export_ready_gfx12)
                              : !(instr->salu().imm & wait_event_imm_dont_wait_export_ready_gfx11));
}

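/* Returns which storage classes an instruction accesses and with what
 * ordering semantics and scope.
 */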
memory_sync_info
get_sync_info(const Instruction* instr)
{
   /* Primitive Ordered Pixel Shading barriers are necessary for accesses to memory shared between
    * overlapping waves in the queue family.
    */
   if (instr->opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done ||
       instr->opcode == aco_opcode::s_wait_event) {
      return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);
   } else if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
      return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);
   }

   switch (instr->format) {
   case Format::SMEM: return instr->smem().sync;
   case Format::MUBUF: return instr->mubuf().sync;
   case Format::MIMG: return instr->mimg().sync;
   case Format::MTBUF: return instr->mtbuf().sync;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return instr->flatlike().sync;
   case Format::DS: return instr->ds().sync;
   case Format::LDSDIR: return instr->ldsdir().sync;
   default: return memory_sync_info();
   }
}

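/* Whether the instruction can be encoded as (or converted to) SDWA. SDWA only
 * exists on GFX8-GFX10.3. With pre_ra, checks that depend on the final
 * register assignment (definitions and operands that must end up in VCC) are
 * skipped.
 */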
bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VALU_instruction& vop3 = instr->valu();
      if (instr->format == Format::VOP3)
         return false;
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         if (instr->operands[i].isLiteral())
            return false;
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}

/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = asSDWA(withoutVOP3(tmp->format));
   instr.reset(
      create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   if (tmp->isVOP3()) {
      VALU_instruction& vop3 = tmp->valu();
      sdwa.neg = vop3.neg;
      sdwa.abs = vop3.abs;
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
      instr->definitions[0].setFixed(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setFixed(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}

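/* Whether the instruction supports the DPP8 (dpp8) or DPP16 (!dpp8) cross-lane
 * swizzle modifiers. Expects a VALU instruction with at least one operand.
 */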
bool
can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
{
   assert(instr->isVALU() && !instr->operands.empty());

   if (instr->isDPP())
      return instr->isDPP8() == dpp8;

   if (instr->isSDWA() || instr->isVINTERP_INREG())
      return false;

   if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
      return false;

   if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
       instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
      return false;

   if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
       instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
       gfx_level < GFX11)
      return false;

   if (instr->isVOP3() && gfx_level < GFX11) {
      const VALU_instruction* vop3 = &instr->valu();
      if (vop3->clamp || vop3->omod)
         return false;
      if (dpp8)
         return false;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isLiteral())
         return false;
      if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
         return false;
   }

   /* According to LLVM, it's unsafe to combine DPP into v_cmpx. */
   if (instr->writes_exec())
      return false;

   /* simpler than listing all VOP3P opcodes which do not support DPP */
   if (instr->isVOP3P()) {
      return instr->opcode == aco_opcode::v_fma_mix_f32 ||
             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
             instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_bf16;
   }

   if (instr->opcode == aco_opcode::v_pk_fmac_f16)
      return gfx_level < GFX11;

   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 &&
          instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
          instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
          instr->opcode != aco_opcode::v_mul_hi_i32 &&
          instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
          instr->opcode != aco_opcode::v_mad_u64_u32 &&
          instr->opcode != aco_opcode::v_mad_i64_i32 &&
          instr->opcode != aco_opcode::v_permlane16_b32 &&
          instr->opcode != aco_opcode::v_permlanex16_b32 &&
          instr->opcode != aco_opcode::v_permlane64_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
          instr->opcode != aco_opcode::v_writelane_b32_e64 &&
          instr->opcode != aco_opcode::p_v_cvt_pk_u8_f32;
}

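/* Updates "instr" to the DPP8/DPP16 encoding with an identity swizzle and
 * returns the old instruction (or NULL if it was already DPP).
 */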
aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
{
   if (instr->isDPP())
      return NULL;

   aco_ptr<Instruction> tmp = std::move(instr);
   Format format =
      (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16));
   instr.reset(
      create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   if (dpp8) {
      DPP8_instruction* dpp = &instr->dpp8();
      dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
      dpp->fetch_inactive = gfx_level >= GFX10;
   } else {
      DPP16_instruction* dpp = &instr->dpp16();
      dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
      dpp->row_mask = 0xf;
      dpp->bank_mask = 0xf;
      dpp->fetch_inactive = gfx_level >= GFX10;
   }

   instr->valu().neg = tmp->valu().neg;
   instr->valu().abs = tmp->valu().abs;
   instr->valu().omod = tmp->valu().omod;
   instr->valu().clamp = tmp->valu().clamp;
   instr->valu().opsel = tmp->valu().opsel;
   instr->valu().opsel_lo = tmp->valu().opsel_lo;
   instr->valu().opsel_hi = tmp->valu().opsel_hi;

   if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11)
      instr->definitions.back().setFixed(vcc);

   if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) &&
       gfx_level < GFX11)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   /* DPP16 supports input modifiers, so we might no longer need VOP3. */
   bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
                      (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());

   /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
   remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
                  !instr->definitions.back().isFixed() ||
                  instr->definitions.back().physReg() == vcc;

   /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
   remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
                  instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;

   if (remove_vop3)
      instr->format = withoutVOP3(instr->format);

   return tmp;
}

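/* Whether operand "idx" of "op" supports the abs/neg input modifiers. */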
bool
can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   if (op == aco_opcode::v_mov_b32)
      return gfx_level >= GFX10;

   if (op == aco_opcode::v_ldexp_f16 || op == aco_opcode::v_ldexp_f32 ||
       op == aco_opcode::v_ldexp_f64)
      return idx == 0;

   return instr_info.can_use_input_modifiers[(int)op];
}

bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
   case aco_opcode::v_cndmask_b16: return idx != 2;
   case aco_opcode::v_interp_p10_f16_f32_inreg:
   case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
   case aco_opcode::v_interp_p2_f16_f32_inreg:
   case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
   default:
      return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
   }
}

bool
can_write_m0(const aco_ptr<Instruction>& instr)
{
   if (instr->isSALU())
      return true;

   /* VALU can't write m0 on any GPU generation. */
   if (instr->isVALU())
      return false;

   switch (instr->opcode) {
   case aco_opcode::p_parallelcopy:
   case aco_opcode::p_extract:
   case aco_opcode::p_insert:
      /* These pseudo instructions are implemented with SALU when writing m0. */
      return true;
   default:
      /* Assume that no other instructions can write m0. */
      return false;
   }
}

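/* Whether a 16-bit write of "op" preserves the high 16 bits of the
 * destination VGPR instead of clobbering the full dword.
 */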
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+ only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   case aco_opcode::v_mad_legacy_f16:
   case aco_opcode::v_mad_legacy_u16:
   case aco_opcode::v_mad_legacy_i16:
   case aco_opcode::v_fma_legacy_f16:
   case aco_opcode::v_div_fixup_legacy_f16: return false;
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_interp_p2_hi_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::p_v_cvt_f16_f32_rtne:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
   /* all non-legacy opsel instructions preserve the high bits */
   default: return can_use_opsel(gfx_level, op, -1);
   }
}

/* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
 * only supports v0-v127.
 * The first three bits are used for operands 0-2, and the 4th bit is used for the destination.
 */
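/* For example, v_add_f16 yields 0x3 | 0x8 (true16 applies to src0, src1 and
 * the destination), while v_cvt_f32_f16 yields 0x1 (src0 only).
 */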
uint8_t
get_gfx11_true16_mask(aco_opcode op)
{
   switch (op) {
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_swap_b16:
   case aco_opcode::v_mov_b16: return 0x1 | 0x8;
   case aco_opcode::v_add_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16: return 0x3 | 0x8;
   case aco_opcode::v_cvt_f32_f16:
   case aco_opcode::v_cvt_i32_i16:
   case aco_opcode::v_cvt_u32_u16: return 0x1;
   case aco_opcode::v_cmp_class_f16:
   case aco_opcode::v_cmp_eq_f16:
   case aco_opcode::v_cmp_eq_i16:
   case aco_opcode::v_cmp_eq_u16:
   case aco_opcode::v_cmp_ge_f16:
   case aco_opcode::v_cmp_ge_i16:
   case aco_opcode::v_cmp_ge_u16:
   case aco_opcode::v_cmp_gt_f16:
   case aco_opcode::v_cmp_gt_i16:
   case aco_opcode::v_cmp_gt_u16:
   case aco_opcode::v_cmp_le_f16:
   case aco_opcode::v_cmp_le_i16:
   case aco_opcode::v_cmp_le_u16:
   case aco_opcode::v_cmp_lg_f16:
   case aco_opcode::v_cmp_lg_i16:
   case aco_opcode::v_cmp_lg_u16:
   case aco_opcode::v_cmp_lt_f16:
   case aco_opcode::v_cmp_lt_i16:
   case aco_opcode::v_cmp_lt_u16:
   case aco_opcode::v_cmp_neq_f16:
   case aco_opcode::v_cmp_nge_f16:
   case aco_opcode::v_cmp_ngt_f16:
   case aco_opcode::v_cmp_nle_f16:
   case aco_opcode::v_cmp_nlg_f16:
   case aco_opcode::v_cmp_nlt_f16:
   case aco_opcode::v_cmp_o_f16:
   case aco_opcode::v_cmp_u_f16:
   case aco_opcode::v_cmpx_class_f16:
   case aco_opcode::v_cmpx_eq_f16:
   case aco_opcode::v_cmpx_eq_i16:
   case aco_opcode::v_cmpx_eq_u16:
   case aco_opcode::v_cmpx_ge_f16:
   case aco_opcode::v_cmpx_ge_i16:
   case aco_opcode::v_cmpx_ge_u16:
   case aco_opcode::v_cmpx_gt_f16:
   case aco_opcode::v_cmpx_gt_i16:
   case aco_opcode::v_cmpx_gt_u16:
   case aco_opcode::v_cmpx_le_f16:
   case aco_opcode::v_cmpx_le_i16:
   case aco_opcode::v_cmpx_le_u16:
   case aco_opcode::v_cmpx_lg_f16:
   case aco_opcode::v_cmpx_lg_i16:
   case aco_opcode::v_cmpx_lg_u16:
   case aco_opcode::v_cmpx_lt_f16:
   case aco_opcode::v_cmpx_lt_i16:
   case aco_opcode::v_cmpx_lt_u16:
   case aco_opcode::v_cmpx_neq_f16:
   case aco_opcode::v_cmpx_nge_f16:
   case aco_opcode::v_cmpx_ngt_f16:
   case aco_opcode::v_cmpx_nle_f16:
   case aco_opcode::v_cmpx_nlg_f16:
   case aco_opcode::v_cmpx_nlt_f16:
   case aco_opcode::v_cmpx_o_f16:
   case aco_opcode::v_cmpx_u_f16: return 0x3;
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_sat_pk_u8_i16: return 0x8;
   default: return 0x0;
   }
}

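/* Returns the identity (neutral) element of a reduction as a 32-bit dword:
 * idx selects the low (0) or high (1) half for 64-bit operations.
 */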
uint32_t
get_reduction_identity(ReduceOp op, unsigned idx)
{
   switch (op) {
   case iadd8:
   case iadd16:
   case iadd32:
   case iadd64:
   case fadd16:
   case fadd32:
   case fadd64:
   case ior8:
   case ior16:
   case ior32:
   case ior64:
   case ixor8:
   case ixor16:
   case ixor32:
   case ixor64:
   case umax8:
   case umax16:
   case umax32:
   case umax64: return 0;
   case imul8:
   case imul16:
   case imul32:
   case imul64: return idx ? 0 : 1;
   case fmul16: return 0x3c00u;                /* 1.0 */
   case fmul32: return 0x3f800000u;            /* 1.0 */
   case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
   case imin8: return INT8_MAX;
   case imin16: return INT16_MAX;
   case imin32: return INT32_MAX;
   case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
   case imax8: return INT8_MIN;
   case imax16: return INT16_MIN;
   case imax32: return INT32_MIN;
   case imax64: return idx ? 0x80000000u : 0;
   case umin8:
   case umin16:
   case iand8:
   case iand16: return 0xffffffffu;
   case umin32:
   case umin64:
   case iand32:
   case iand64: return 0xffffffffu;
   case fmin16: return 0x7c00u;                /* infinity */
   case fmin32: return 0x7f800000u;            /* infinity */
   case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
   case fmax16: return 0xfc00u;                /* negative infinity */
   case fmax32: return 0xff800000u;            /* negative infinity */
   case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
   default: unreachable("Invalid reduction operation"); break;
   }
   return 0;
}

unsigned
get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
{
   if (instr->isPseudo())
      return instr->operands[index].bytes() * 8u;
   else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
            instr->opcode == aco_opcode::v_mad_i64_i32)
      return index == 2 ? 64 : 32;
   else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
            instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
            instr->opcode == aco_opcode::v_fma_mixhi_f16)
      return instr->valu().opsel_hi[index] ? 16 : 32;
   else if (instr->opcode == aco_opcode::v_interp_p10_f16_f32_inreg ||
            instr->opcode == aco_opcode::v_interp_p10_rtz_f16_f32_inreg)
      return index == 1 ? 32 : 16;
   else if (instr->opcode == aco_opcode::v_interp_p2_f16_f32_inreg ||
            instr->opcode == aco_opcode::v_interp_p2_rtz_f16_f32_inreg)
      return index == 0 ? 16 : 32;
   else if (instr->isVALU() || instr->isSALU())
      return instr_info.operand_size[(int)instr->opcode];
   else
      return 0;
}

bool
needs_exec_mask(const Instruction* instr)
{
   if (instr->isVALU()) {
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_end_wqm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
      default: break;
      }
   }

   return true;
}

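/* For a VOPC comparison opcode: the opcode with swapped source operands, the
 * logically inverted comparison, and the exec-writing v_cmpx form, each
 * num_opcodes when unavailable.
 */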
struct CmpInfo {
   aco_opcode swapped;
   aco_opcode inverse;
   aco_opcode vcmpx;
};

static ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->vcmpx = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz                                        \
                         ? aco_opcode::v_cmp_##ord_swap##_f##sz                                    \
                         : aco_opcode::v_cmp_n##unord_swap##_f##sz;                                \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz                                     \
                         ? aco_opcode::v_cmp_##unord##_f##sz                                       \
                         : aco_opcode::v_cmp_n##ord##_f##sz;                                       \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz                                          \
                       ? aco_opcode::v_cmpx_##ord##_f##sz                                          \
                       : aco_opcode::v_cmpx_n##unord##_f##sz;                                      \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      return true;
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}

aco_opcode
get_vcmp_inverse(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
}

aco_opcode
get_vcmp_swapped(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.swapped : aco_opcode::num_opcodes;
}

aco_opcode
get_vcmpx(aco_opcode op)
{
   CmpInfo info;
   return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
}

bool
is_cmpx(aco_opcode op)
{
   CmpInfo info;
   return !get_cmp_info(op, &info);
}

bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
{
   if (idx0 == idx1) {
      *new_op = instr->opcode;
      return true;
   }

   if (idx0 > idx1)
      std::swap(idx0, idx1);

   if (instr->isDPP())
      return false;

   if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
      return false;

   if (instr->isVOPC()) {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
   }

   /* opcodes not relevant for DPP or SGPR optimizations are not included. */
   switch (instr->opcode) {
   case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_add3_u32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_i32_i24:
   case aco_opcode::v_mul_hi_i32_i24:
   case aco_opcode::v_mul_u32_u24:
   case aco_opcode::v_mul_hi_u32_u24:
   case aco_opcode::v_mul_lo_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_xnor_b32:
   case aco_opcode::v_xor3_b32:
   case aco_opcode::v_or3_b32:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_max3_f32:
   case aco_opcode::v_min3_f32:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_max3_u32:
   case aco_opcode::v_min3_u32:
   case aco_opcode::v_med3_u32:
   case aco_opcode::v_max3_i32:
   case aco_opcode::v_min3_i32:
   case aco_opcode::v_med3_i32:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
   case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
   case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
   case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
   case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
   case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
   case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
   case aco_opcode::v_addc_co_u32:
   case aco_opcode::v_mad_i32_i24:
   case aco_opcode::v_mad_u32_u24:
   case aco_opcode::v_lerp_u8:
   case aco_opcode::v_sad_u8:
   case aco_opcode::v_sad_hi_u8:
   case aco_opcode::v_sad_u16:
   case aco_opcode::v_sad_u32:
   case aco_opcode::v_xad_u32:
   case aco_opcode::v_add_lshl_u32:
   case aco_opcode::v_and_or_b32:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16:
   case aco_opcode::v_maxmin_f32:
   case aco_opcode::v_minmax_f32:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_u32:
   case aco_opcode::v_minmax_u32:
   case aco_opcode::v_maxmin_i32:
   case aco_opcode::v_minmax_i32:
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::v_dot2c_f32_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::v_pk_fmac_f16: {
      if (idx1 == 2)
         return false;
      *new_op = instr->opcode;
      return true;
   }
   case aco_opcode::v_subb_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subbrev_co_u32;
      return true;
   }
   case aco_opcode::v_subbrev_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subb_co_u32;
      return true;
   }
   default: return false;
   }
}

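/* wait_imm holds one value per hardware wait counter, where unset_counter
 * means no wait is required on that counter.
 */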
wait_imm::wait_imm()
    : exp(unset_counter), lgkm(unset_counter), vm(unset_counter), vs(unset_counter),
      sample(unset_counter), bvh(unset_counter), km(unset_counter)
{}
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : exp(exp_), lgkm(lgkm_), vm(vm_), vs(vs_), sample(unset_counter), bvh(unset_counter),
      km(unset_counter)
{}

uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   if (gfx_level >= GFX11) {
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
   } else if (gfx_level >= GFX10) {
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else if (gfx_level >= GFX9) {
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else {
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   }
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
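
/* A sketch of how these helpers compose (illustrative only, not called here):
 *
 *   wait_imm imm;
 *   imm.unpack(gfx_level, instr);            // decode an existing s_waitcnt*
 *   imm.combine(wait_imm(0, 7, 0, 63));      // vm=0, exp=7, lgkm=0, vs=63
 *   uint16_t encoded = imm.pack(gfx_level);  // re-encode vm/exp/lgkm
 */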

wait_imm
wait_imm::max(enum amd_gfx_level gfx_level)
{
   wait_imm imm;
   imm.vm = gfx_level >= GFX9 ? 63 : 15;
   imm.exp = 7;
   imm.lgkm = gfx_level >= GFX10 ? 63 : 15;
   imm.vs = gfx_level >= GFX10 ? 63 : 0;
   imm.sample = gfx_level >= GFX12 ? 63 : 0;
   imm.bvh = gfx_level >= GFX12 ? 7 : 0;
   imm.km = gfx_level >= GFX12 ? 31 : 0;
   return imm;
}

bool
wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr)
{
   if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null))
      return false;

   aco_opcode op = instr->opcode;
   uint16_t packed = instr->salu().imm;

   if (op == aco_opcode::s_wait_loadcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_wait_storecnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_wait_samplecnt) {
      sample = std::min<uint8_t>(sample, packed);
   } else if (op == aco_opcode::s_wait_bvhcnt) {
      bvh = std::min<uint8_t>(bvh, packed);
   } else if (op == aco_opcode::s_wait_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_wait_dscnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_wait_kmcnt) {
      km = std::min<uint8_t>(km, packed);
   } else if (op == aco_opcode::s_wait_loadcnt_dscnt) {
      uint32_t vm2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vm = std::min<uint8_t>(vm, vm2 == 0x3f ? wait_imm::unset_counter : vm2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_wait_storecnt_dscnt) {
      uint32_t vs2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vs = std::min<uint8_t>(vs, vs2 == 0x3f ? wait_imm::unset_counter : vs2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_waitcnt_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_waitcnt_lgkmcnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_waitcnt_vmcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_waitcnt_vscnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_waitcnt) {
      uint8_t vm2, lgkm2, exp2;
      if (gfx_level >= GFX11) {
         vm2 = (packed >> 10) & 0x3f;
         lgkm2 = (packed >> 4) & 0x3f;
         exp2 = packed & 0x7;
      } else {
         vm2 = packed & 0xf;
         if (gfx_level >= GFX9)
            vm2 |= (packed >> 10) & 0x30;

         exp2 = (packed >> 4) & 0x7;

         lgkm2 = (packed >> 8) & 0xf;
         if (gfx_level >= GFX10)
            lgkm2 |= (packed >> 8) & 0x30;
      }

      if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf))
         vm2 = wait_imm::unset_counter;
      if (exp2 == 0x7)
         exp2 = wait_imm::unset_counter;
      if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf))
         lgkm2 = wait_imm::unset_counter;

      vm = std::min(vm, vm2);
      exp = std::min(exp, exp2);
      lgkm = std::min(lgkm, lgkm2);
   } else {
      return false;
   }
   return true;
}

bool
wait_imm::combine(const wait_imm& other)
{
   bool changed = false;
   for (unsigned i = 0; i < wait_type_num; i++) {
      if (other[i] < (*this)[i])
         changed = true;
      (*this)[i] = std::min((*this)[i], other[i]);
   }
   return changed;
}

bool
wait_imm::empty() const
{
   for (unsigned i = 0; i < wait_type_num; i++) {
      if ((*this)[i] != unset_counter)
         return false;
   }
   return true;
}

void
wait_imm::print(FILE* output) const
{
   const char* names[wait_type_num];
   names[wait_type_exp] = "exp";
   names[wait_type_vm] = "vm";
   names[wait_type_lgkm] = "lgkm";
   names[wait_type_vs] = "vs";
   names[wait_type_sample] = "sample";
   names[wait_type_bvh] = "bvh";
   names[wait_type_km] = "km";
   for (unsigned i = 0; i < wait_type_num; i++) {
      if ((*this)[i] != unset_counter)
         fprintf(output, "%s: %u\n", names[i], (*this)[i]);
   }
}

bool
should_form_clause(const Instruction* a, const Instruction* b)
{
   if (a->definitions.empty() != b->definitions.empty())
      return false;

   if (a->format != b->format)
      return false;

   if (a->operands.empty() || b->operands.empty())
      return false;

   /* Assume loads which don't use descriptors might load from similar addresses. */
   if (a->isFlatLike() || a->accessesLDS())
      return true;
   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
      return true;

   /* If they load from the same descriptor, assume they might load from similar
    * addresses.
    */
   if (a->isVMEM() || a->isSMEM())
      return a->operands[0].tempId() == b->operands[0].tempId();

   if (a->isEXP() && b->isEXP())
      return true;

   return false;
}

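/* Returns the index of the operand that must be allocated to the same
 * register as the first definition (e.g. the accumulator of v_mac/v_fmac or
 * an SOPK source), or -1 if there is none.
 */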
int
get_op_fixed_to_def(Instruction* instr)
{
   if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
       instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
       instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
       instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
       instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 ||
       instr->opcode == aco_opcode::v_writelane_b32_e64 ||
       instr->opcode == aco_opcode::v_dot4c_i32_i8 || instr->opcode == aco_opcode::s_fmac_f32 ||
       instr->opcode == aco_opcode::s_fmac_f16) {
      return 2;
   } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 ||
              instr->opcode == aco_opcode::s_cmovk_i32) {
      return 0;
   } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) {
      return 3;
   } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
              !instr->operands[2].isUndefined()) {
      return 2;
   }
   return -1;
}

uint8_t
get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
{
   if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
      return vmem_bvh;
   else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
      return vmem_sampler;
   else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
            instr->operands[1].regClass() == s4)
      return vmem_sampler;
   else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
      return vmem_nosampler;
   return 0;
}

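/* Returns the VA_VDST wait imposed by the instruction: LDSDIR and
 * s_waitcnt_depctr encode it explicitly, VMEM/FLAT/DS/export count as 0, and
 * everything else as 15 (no wait).
 */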
unsigned
parse_vdst_wait(Instruction* instr)
{
   if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP())
      return 0;
   else if (instr->isLDSDIR())
      return instr->ldsdir().wait_vdst;
   else if (instr->opcode == aco_opcode::s_waitcnt_depctr)
      return (instr->salu().imm >> 12) & 0xf;
   else
      return 15;
}

bool
dealloc_vgprs(Program* program)
{
   if (program->gfx_level < GFX11)
      return false;

   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress
    * scratch store. */
   if (uses_scratch(program))
      return false;

   /* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
    * a wait after exports. There might still be pending VMEM stores for PS parameter exports,
    * except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
    * pending VMEM stores or exports if we insert the sendmsg for these stages. */
   if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
                                         program->stage.hw == AC_HW_PIXEL_SHADER))
      return false;

   Block& block = program->blocks.back();

   /* don't bother checking if there is a pending VMEM store or export: there almost always is */
   Builder bld(program);
   if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
      bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
      /* Due to a hazard, an s_nop is needed before "s_sendmsg sendmsg_dealloc_vgprs". */
      bld.sopp(aco_opcode::s_nop, 0);
      bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
   }

   return true;
}

bool
Instruction::isTrans() const noexcept
{
   return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
          instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental ||
          instr_info.classes[(int)opcode] == instr_class::valu_pseudo_scalar_trans;
}

size_t
get_instr_data_size(Format format)
{
   switch (format) {
   case Format::SOP1:
   case Format::SOP2:
   case Format::SOPC:
   case Format::SOPK:
   case Format::SOPP: return sizeof(SALU_instruction);
   case Format::SMEM: return sizeof(SMEM_instruction);
   case Format::PSEUDO: return sizeof(Pseudo_instruction);
   case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction);
   case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction);
   case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction);
   case Format::DS: return sizeof(DS_instruction);
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return sizeof(FLAT_instruction);
   case Format::LDSDIR: return sizeof(LDSDIR_instruction);
   case Format::MTBUF: return sizeof(MTBUF_instruction);
   case Format::MUBUF: return sizeof(MUBUF_instruction);
   case Format::MIMG: return sizeof(MIMG_instruction);
   case Format::VOPD: return sizeof(VOPD_instruction);
   case Format::VINTERP_INREG: return sizeof(VINTERP_inreg_instruction);
   case Format::VINTRP: return sizeof(VINTRP_instruction);
   case Format::EXP: return sizeof(Export_instruction);
   default:
      if ((uint16_t)format & (uint16_t)Format::DPP16)
         return sizeof(DPP16_instruction);
      else if ((uint16_t)format & (uint16_t)Format::DPP8)
         return sizeof(DPP8_instruction);
      else if ((uint16_t)format & (uint16_t)Format::SDWA)
         return sizeof(SDWA_instruction);
      else
         return sizeof(VALU_instruction);
   }
}

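/* Allocates an instruction from the thread-local monotonic buffer. Operands
 * and definitions live directly after the format-specific struct; the spans
 * store 16-bit offsets relative to their own fields rather than pointers.
 */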
Instruction*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   size_t size = get_instr_data_size(format);
   size_t total_size = size + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);

   void* data = instruction_buffer->allocate(total_size, alignof(uint32_t));
   memset(data, 0, total_size);
   Instruction* inst = (Instruction*)data;

   inst->opcode = opcode;
   inst->format = format;

   uint16_t operands_offset = size - offsetof(Instruction, operands);
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}

} // namespace aco