1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9
10 #include "util/bitset.h"
11
12 #include <algorithm>
13 #include <bitset>
14 #include <set>
15 #include <stack>
16 #include <vector>
17
18 namespace aco {
19 namespace {
20
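/* Per-block pass state: "block" is the block currently being processed and
 * "old_instructions" holds its original instructions, which are moved into
 * block->instructions one by one as hazards are resolved. */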
21 struct State {
22 Program* program;
23 Block* block;
24 std::vector<aco_ptr<Instruction>> old_instructions;
25 };
26
27 struct NOP_ctx_gfx6 {
28 void join(const NOP_ctx_gfx6& other)
29 {
30 set_vskip_mode_then_vector =
31 MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
32 valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
33 salu_wr_m0_then_gds_msg_ttrace =
34 MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
35 valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
36 salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
37 salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
38 setreg_then_getsetreg = MAX2(setreg_then_getsetreg, other.setreg_then_getsetreg);
39 vmem_store_then_wr_data |= other.vmem_store_then_wr_data;
40 smem_clause |= other.smem_clause;
41 smem_write |= other.smem_write;
42 for (unsigned i = 0; i < BITSET_WORDS(128); i++) {
43 smem_clause_read_write[i] |= other.smem_clause_read_write[i];
44 smem_clause_write[i] |= other.smem_clause_write[i];
45 }
46 }
47
48 bool operator==(const NOP_ctx_gfx6& other)
49 {
50 return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
51 valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
52 vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
53 salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
54 valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
55 salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
56 salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
57 setreg_then_getsetreg == other.setreg_then_getsetreg &&
58 smem_clause == other.smem_clause && smem_write == other.smem_write &&
59 BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
60 BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
61 }
62
63 void add_wait_states(unsigned amount)
64 {
65 if ((set_vskip_mode_then_vector -= amount) < 0)
66 set_vskip_mode_then_vector = 0;
67
68 if ((valu_wr_vcc_then_div_fmas -= amount) < 0)
69 valu_wr_vcc_then_div_fmas = 0;
70
71 if ((salu_wr_m0_then_gds_msg_ttrace -= amount) < 0)
72 salu_wr_m0_then_gds_msg_ttrace = 0;
73
74 if ((valu_wr_exec_then_dpp -= amount) < 0)
75 valu_wr_exec_then_dpp = 0;
76
77 if ((salu_wr_m0_then_lds -= amount) < 0)
78 salu_wr_m0_then_lds = 0;
79
80 if ((salu_wr_m0_then_moverel -= amount) < 0)
81 salu_wr_m0_then_moverel = 0;
82
83 if ((setreg_then_getsetreg -= amount) < 0)
84 setreg_then_getsetreg = 0;
85
86 vmem_store_then_wr_data.reset();
87 }
88
89 /* setting MODE.vskip and then any vector op requires 2 wait states */
90 int8_t set_vskip_mode_then_vector = 0;
91
92 /* VALU writing VCC followed by v_div_fmas requires 4 wait states */
93 int8_t valu_wr_vcc_then_div_fmas = 0;
94
95 /* SALU writing M0 followed by GDS, s_sendmsg or s_ttrace_data requires 1 wait state */
96 int8_t salu_wr_m0_then_gds_msg_ttrace = 0;
97
98 /* VALU writing EXEC followed by DPP requires 5 wait states */
99 int8_t valu_wr_exec_then_dpp = 0;
100
101 /* SALU writing M0 followed by some LDS instructions requires 1 wait state on GFX10 */
102 int8_t salu_wr_m0_then_lds = 0;
103
104 /* SALU writing M0 followed by s_moverel requires 1 wait state on GFX9 */
105 int8_t salu_wr_m0_then_moverel = 0;
106
107 /* s_setreg followed by an s_getreg/s_setreg of the same register needs 2 wait states;
108 * currently we don't look at the actual register */
109 int8_t setreg_then_getsetreg = 0;
110
111 /* Some memory instructions writing more than 64 bits, followed by an instruction
112 * writing the VGPRs holding the write data, require 1 wait state */
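/* Indexed by VGPR number (cf. the "(physReg() & 0xff) + i" updates below). */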
113 std::bitset<256> vmem_store_then_wr_data;
114
115 /* we break up SMEM clauses that contain stores or overwrite an
116 * operand/definition of another instruction in the clause */
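/* Indexed by physical SGPR number, maintained with set_bitset_range()/test_bitset_range(). */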
117 bool smem_clause = false;
118 bool smem_write = false;
119 BITSET_DECLARE(smem_clause_read_write, 128) = {0};
120 BITSET_DECLARE(smem_clause_write, 128) = {0};
121 };
122
123 struct NOP_ctx_gfx10 {
124 bool has_VOPC_write_exec = false;
125 bool has_nonVALU_exec_read = false;
126 bool has_VMEM = false;
127 bool has_branch_after_VMEM = false;
128 bool has_DS = false;
129 bool has_branch_after_DS = false;
130 bool has_NSA_MIMG = false;
131 bool has_writelane = false;
132 std::bitset<128> sgprs_read_by_VMEM;
133 std::bitset<128> sgprs_read_by_VMEM_store;
134 std::bitset<128> sgprs_read_by_DS;
135 std::bitset<128> sgprs_read_by_SMEM;
136
137 void join(const NOP_ctx_gfx10& other)
138 {
139 has_VOPC_write_exec |= other.has_VOPC_write_exec;
140 has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
141 has_VMEM |= other.has_VMEM;
142 has_branch_after_VMEM |= other.has_branch_after_VMEM;
143 has_DS |= other.has_DS;
144 has_branch_after_DS |= other.has_branch_after_DS;
145 has_NSA_MIMG |= other.has_NSA_MIMG;
146 has_writelane |= other.has_writelane;
147 sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
148 sgprs_read_by_DS |= other.sgprs_read_by_DS;
149 sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
150 sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
151 }
152
153 bool operator==(const NOP_ctx_gfx10& other)
154 {
155 return has_VOPC_write_exec == other.has_VOPC_write_exec &&
156 has_nonVALU_exec_read == other.has_nonVALU_exec_read && has_VMEM == other.has_VMEM &&
157 has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS &&
158 has_branch_after_DS == other.has_branch_after_DS &&
159 has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
160 sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
161 sgprs_read_by_DS == other.sgprs_read_by_DS &&
162 sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
163 sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
164 }
165 };
166
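/* Tracks, per VGPR, how many events have happened since the VGPR was last set(),
 * clamped at Max. The counters are lazy: inc() just bumps the shared "base" and
 * set() stores "-base", so get() returns val[idx] + base. For example, after
 * set(reg) followed by three inc() calls, get(reg) is 3, while VGPRs that were
 * never set() report Max. */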
167 template <int Max> struct VGPRCounterMap {
168 public:
169 int base = 0;
170 BITSET_DECLARE(resident, 256);
171 int val[256];
172
173 /* Initializes all counters to Max. */
174 VGPRCounterMap() { BITSET_ZERO(resident); }
175
176 /* Increase all counters, clamping at Max. */
177 void inc() { base++; }
178
179 /* Set counter to 0. */
180 void set(unsigned idx)
181 {
182 val[idx] = -base;
183 BITSET_SET(resident, idx);
184 }
185
186 void set(PhysReg reg, unsigned bytes)
187 {
188 if (reg.reg() < 256)
189 return;
190
191 for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
192 set(reg.reg() - 256 + i);
193 }
194
195 /* Reset all counters to Max. */
196 void reset()
197 {
198 base = 0;
199 BITSET_ZERO(resident);
200 }
201
202 void reset(PhysReg reg, unsigned bytes)
203 {
204 if (reg.reg() < 256)
205 return;
206
207 for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
208 BITSET_CLEAR(resident, reg.reg() - 256 + i);
209 }
210
211 uint8_t get(unsigned idx)
212 {
213 return BITSET_TEST(resident, idx) ? MIN2(val[idx] + base, Max) : Max;
214 }
215
216 uint8_t get(PhysReg reg, unsigned offset = 0)
217 {
218 assert(reg.reg() >= 256);
219 return get(reg.reg() - 256 + offset);
220 }
221
222 void join_min(const VGPRCounterMap& other)
223 {
224 unsigned i;
225 BITSET_FOREACH_SET (i, other.resident, 256) {
226 if (BITSET_TEST(resident, i))
227 val[i] = MIN2(val[i] + base, other.val[i] + other.base) - base;
228 else
229 val[i] = other.val[i] + other.base - base;
230 }
231 BITSET_OR(resident, resident, other.resident);
232 }
233
234 bool operator==(const VGPRCounterMap& other) const
235 {
236 if (!BITSET_EQUAL(resident, other.resident))
237 return false;
238
239 unsigned i;
240 BITSET_FOREACH_SET (i, other.resident, 256) {
241 if (!BITSET_TEST(resident, i))
242 return false;
243 if (val[i] + base != other.val[i] + other.base)
244 return false;
245 }
246 return true;
247 }
248 };
249
250 struct NOP_ctx_gfx11 {
251 /* VcmpxPermlaneHazard */
252 bool has_Vcmpx = false;
253
254 /* LdsDirectVMEMHazard */
255 std::bitset<256> vgpr_used_by_vmem_load;
256 std::bitset<256> vgpr_used_by_vmem_sample;
257 std::bitset<256> vgpr_used_by_vmem_bvh;
258 std::bitset<256> vgpr_used_by_vmem_store;
259 std::bitset<256> vgpr_used_by_ds;
260
261 /* VALUTransUseHazard */
262 VGPRCounterMap<15> valu_since_wr_by_trans;
263 VGPRCounterMap<2> trans_since_wr_by_trans;
264
265 /* VALUMaskWriteHazard */
266 std::bitset<128> sgpr_read_by_valu_as_lanemask;
267 std::bitset<128> sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
268
269 /* WMMAHazards */
270 std::bitset<256> vgpr_written_by_wmma;
271
272 void join(const NOP_ctx_gfx11& other)
273 {
274 has_Vcmpx |= other.has_Vcmpx;
275 vgpr_used_by_vmem_load |= other.vgpr_used_by_vmem_load;
276 vgpr_used_by_vmem_sample |= other.vgpr_used_by_vmem_sample;
277 vgpr_used_by_vmem_bvh |= other.vgpr_used_by_vmem_bvh;
278 vgpr_used_by_vmem_store |= other.vgpr_used_by_vmem_store;
279 vgpr_used_by_ds |= other.vgpr_used_by_ds;
280 valu_since_wr_by_trans.join_min(other.valu_since_wr_by_trans);
281 trans_since_wr_by_trans.join_min(other.trans_since_wr_by_trans);
282 sgpr_read_by_valu_as_lanemask |= other.sgpr_read_by_valu_as_lanemask;
283 sgpr_read_by_valu_as_lanemask_then_wr_by_salu |=
284 other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu;
285 vgpr_written_by_wmma |= other.vgpr_written_by_wmma;
286 }
287
288 bool operator==(const NOP_ctx_gfx11& other)
289 {
290 return has_Vcmpx == other.has_Vcmpx &&
291 vgpr_used_by_vmem_load == other.vgpr_used_by_vmem_load &&
292 vgpr_used_by_vmem_sample == other.vgpr_used_by_vmem_sample &&
293 vgpr_used_by_vmem_bvh == other.vgpr_used_by_vmem_bvh &&
294 vgpr_used_by_vmem_store == other.vgpr_used_by_vmem_store &&
295 vgpr_used_by_ds == other.vgpr_used_by_ds &&
296 valu_since_wr_by_trans == other.valu_since_wr_by_trans &&
297 trans_since_wr_by_trans == other.trans_since_wr_by_trans &&
298 sgpr_read_by_valu_as_lanemask == other.sgpr_read_by_valu_as_lanemask &&
299 sgpr_read_by_valu_as_lanemask_then_wr_by_salu ==
300 other.sgpr_read_by_valu_as_lanemask_then_wr_by_salu &&
301 vgpr_written_by_wmma == other.vgpr_written_by_wmma;
302 }
303 };
304
305 int
306 get_wait_states(aco_ptr<Instruction>& instr)
307 {
308 if (instr->opcode == aco_opcode::s_nop)
309 return instr->salu().imm + 1;
310 else if (instr->opcode == aco_opcode::p_constaddr)
311 return 3; /* lowered to 3 instructions in the assembler */
312 else
313 return 1;
314 }
315
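/* Returns true if the register ranges [a_reg, a_reg + a_size) and
 * [b_reg, b_reg + b_size) overlap. */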
316 bool
317 regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
318 {
319 return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
320 }
321
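/* Walks instructions backwards from the current point in state.block, then through
 * linear predecessor blocks, calling instr_cb on each instruction until it returns
 * true. block_cb (if non-null) runs before recursing into a block's predecessors and
 * can prune the search, e.g. to avoid revisiting loop headers. */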
322 template <typename GlobalState, typename BlockState,
323 bool (*block_cb)(GlobalState&, BlockState&, Block*),
324 bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
325 void
326 search_backwards_internal(State& state, GlobalState& global_state, BlockState block_state,
327 Block* block, bool start_at_end)
328 {
329 if (block == state.block && start_at_end) {
330 /* If it's the current block, block->instructions is incomplete. */
331 for (int pred_idx = state.old_instructions.size() - 1; pred_idx >= 0; pred_idx--) {
332 aco_ptr<Instruction>& instr = state.old_instructions[pred_idx];
333 if (!instr)
334 break; /* Instruction has been moved to block->instructions. */
335 if (instr_cb(global_state, block_state, instr))
336 return;
337 }
338 }
339
340 for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
341 if (instr_cb(global_state, block_state, block->instructions[pred_idx]))
342 return;
343 }
344
345 PRAGMA_DIAGNOSTIC_PUSH
346 PRAGMA_DIAGNOSTIC_IGNORED(-Waddress)
347 if (block_cb != nullptr && !block_cb(global_state, block_state, block))
348 return;
349 PRAGMA_DIAGNOSTIC_POP
350
351 for (unsigned lin_pred : block->linear_preds) {
352 search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
353 state, global_state, block_state, &state.program->blocks[lin_pred], true);
354 }
355 }
356
357 template <typename GlobalState, typename BlockState,
358 bool (*block_cb)(GlobalState&, BlockState&, Block*),
359 bool (*instr_cb)(GlobalState&, BlockState&, aco_ptr<Instruction>&)>
360 void
361 search_backwards(State& state, GlobalState& global_state, BlockState& block_state)
362 {
363 search_backwards_internal<GlobalState, BlockState, block_cb, instr_cb>(
364 state, global_state, block_state, state.block, false);
365 }
366
367 struct HandleRawHazardGlobalState {
368 PhysReg reg;
369 int nops_needed;
370 };
371
372 struct HandleRawHazardBlockState {
373 uint32_t mask;
374 int nops_needed;
375 };
376
377 template <bool Valu, bool Vintrp, bool Salu>
378 bool
379 handle_raw_hazard_instr(HandleRawHazardGlobalState& global_state,
380 HandleRawHazardBlockState& block_state, aco_ptr<Instruction>& pred)
381 {
382 unsigned mask_size = util_last_bit(block_state.mask);
383
384 uint32_t writemask = 0;
385 for (Definition& def : pred->definitions) {
386 if (regs_intersect(global_state.reg, mask_size, def.physReg(), def.size())) {
387 unsigned start = def.physReg() > global_state.reg ? def.physReg() - global_state.reg : 0;
388 unsigned end = MIN2(mask_size, start + def.size());
389 writemask |= u_bit_consecutive(start, end - start);
390 }
391 }
392
393 bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) || (pred->isVINTRP() && Vintrp) ||
394 (pred->isSALU() && Salu));
395 if (is_hazard) {
396 global_state.nops_needed = MAX2(global_state.nops_needed, block_state.nops_needed);
397 return true;
398 }
399
400 block_state.mask &= ~writemask;
401 block_state.nops_needed = MAX2(block_state.nops_needed - get_wait_states(pred), 0);
402
403 if (block_state.mask == 0)
404 block_state.nops_needed = 0;
405
406 return block_state.nops_needed == 0;
407 }
408
409 template <bool Valu, bool Vintrp, bool Salu>
410 void
411 handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
412 {
413 if (*NOPs >= min_states)
414 return;
415
416 HandleRawHazardGlobalState global = {op.physReg(), 0};
417 HandleRawHazardBlockState block = {u_bit_consecutive(0, op.size()), min_states};
418
419 /* Loops require branch instructions, which count towards the wait
420 * states. So even with loops this should finish unless nops_needed is some
421 * huge value. */
422 search_backwards<HandleRawHazardGlobalState, HandleRawHazardBlockState, nullptr,
423 handle_raw_hazard_instr<Valu, Vintrp, Salu>>(state, global, block);
424
425 *NOPs = MAX2(*NOPs, global.nops_needed);
426 }
427
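/* The <Valu, Vintrp, Salu> template arguments select which kinds of producer
 * instructions count as a hazard for the read. */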
428 static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
429 static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
430 static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;
431
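/* Set/test a range of bits that may straddle a BITSET_WORD boundary by splitting
 * the range at the word boundary, since the underlying macros operate within a
 * single word. */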
432 void
433 set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
434 {
435 unsigned end = start + size - 1;
436 unsigned start_mod = start % BITSET_WORDBITS;
437 if (start_mod + size <= BITSET_WORDBITS) {
438 BITSET_SET_RANGE_INSIDE_WORD(words, start, end);
439 } else {
440 unsigned first_size = BITSET_WORDBITS - start_mod;
441 set_bitset_range(words, start, BITSET_WORDBITS - start_mod);
442 set_bitset_range(words, start + first_size, size - first_size);
443 }
444 }
445
446 bool
447 test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
448 {
449 unsigned end = start + size - 1;
450 unsigned start_mod = start % BITSET_WORDBITS;
451 if (start_mod + size <= BITSET_WORDBITS) {
452 return BITSET_TEST_RANGE(words, start, end);
453 } else {
454 unsigned first_size = BITSET_WORDBITS - start_mod;
455 return test_bitset_range(words, start, BITSET_WORDBITS - start_mod) ||
456 test_bitset_range(words, start + first_size, size - first_size);
457 }
458 }
459
460 /* A SMEM clause is any group of consecutive SMEM instructions. The
461 * instructions in this group may return out of order and/or may be replayed.
462 *
463 * To fix this potential hazard correctly, we have to make sure that when a
464 * clause has more than one instruction, no instruction in the clause writes
465 * to a register that is read by another instruction in the clause (including
466 * itself). In this case, we have to break the SMEM clause by inserting
467 * non-SMEM instructions.
468 *
469 * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
470 */
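/* For example (illustrative only), with XNACK a clause roughly like
 *   s_load_dwordx2 s[4:5], s[0:1], 0x0
 *   s_load_dwordx2 s[0:1], s[4:5], 0x0   ; overwrites the first load's address
 * has to be broken up, e.g. by an s_nop between the two loads. */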
471 void
472 handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
473 int* NOPs)
474 {
475 /* break off from previous SMEM clause if needed */
476 if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
477 /* Don't allow clauses with store instructions since the clause's
478 * instructions may use the same address. */
479 if (ctx.smem_write || instr->definitions.empty() ||
480 instr_info.is_atomic[(unsigned)instr->opcode]) {
481 *NOPs = 1;
482 } else if (program->dev.xnack_enabled) {
483 for (Operand op : instr->operands) {
484 if (!op.isConstant() &&
485 test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
486 *NOPs = 1;
487 break;
488 }
489 }
490
491 Definition def = instr->definitions[0];
492 if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
493 *NOPs = 1;
494 }
495 }
496 }
497
498 /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
499 void
500 handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
501 std::vector<aco_ptr<Instruction>>& new_instructions)
502 {
503 /* check hazards */
504 int NOPs = 0;
505
506 if (instr->isSMEM()) {
507 if (state.program->gfx_level == GFX6) {
508 /* A read of an SGPR by an SMRD instruction requires 4 wait states
509 * when the SGPR was written by a VALU instruction. According to LLVM,
510 * there is also an undocumented hardware behavior when the buffer
511 * descriptor is written by a SALU instruction */
512 for (unsigned i = 0; i < instr->operands.size(); i++) {
513 Operand op = instr->operands[i];
514 if (op.isConstant())
515 continue;
516
517 bool is_buffer_desc = i == 0 && op.size() > 2;
518 if (is_buffer_desc)
519 handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
520 else
521 handle_valu_then_read_hazard(state, &NOPs, 4, op);
522 }
523 }
524
525 handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
526 } else if (instr->isSALU()) {
527 if (instr->opcode == aco_opcode::s_setreg_b32 ||
528 instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
529 instr->opcode == aco_opcode::s_getreg_b32) {
530 NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
531 }
532
533 if (state.program->gfx_level == GFX9) {
534 if (instr->opcode == aco_opcode::s_movrels_b32 ||
535 instr->opcode == aco_opcode::s_movrels_b64 ||
536 instr->opcode == aco_opcode::s_movreld_b32 ||
537 instr->opcode == aco_opcode::s_movreld_b64) {
538 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
539 }
540 }
541
542 if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_ttracedata)
543 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
544 } else if (instr->isDS() && instr->ds().gds) {
545 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
546 } else if (instr->isVALU() || instr->isVINTRP()) {
547 if (instr->isDPP()) {
548 NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
549 handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
550 }
551
552 for (Definition def : instr->definitions) {
553 if (def.regClass().type() != RegType::sgpr) {
554 for (unsigned i = 0; i < def.size(); i++)
555 NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data[(def.physReg() & 0xff) + i]);
556 }
557 }
558
559 if ((instr->opcode == aco_opcode::v_readlane_b32 ||
560 instr->opcode == aco_opcode::v_readlane_b32_e64 ||
561 instr->opcode == aco_opcode::v_writelane_b32 ||
562 instr->opcode == aco_opcode::v_writelane_b32_e64) &&
563 !instr->operands[1].isConstant()) {
564 handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
565 }
566
567 /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
568 * is followed by a read with v_readfirstlane or v_readlane to fix GPU
569 * hangs on GFX6. Note that v_writelane_* is apparently not affected.
570 * This hazard isn't documented anywhere but AMD confirmed that hazard.
571 */
572 if (state.program->gfx_level == GFX6 &&
573 (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
574 instr->opcode == aco_opcode::v_readfirstlane_b32)) {
575 handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
576 }
577
578 if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
579 instr->opcode == aco_opcode::v_div_fmas_f64)
580 NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
581 } else if (instr->isVMEM() || instr->isFlatLike()) {
582 /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
583 for (Operand op : instr->operands) {
584 if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
585 handle_valu_then_read_hazard(state, &NOPs, 5, op);
586 }
587 }
588
589 if (!instr->isSALU() && instr->format != Format::SMEM)
590 NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
591
592 if (state.program->gfx_level == GFX9) {
593 bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
594 if (instr->isVINTRP() || lds_scratch_global ||
595 instr->opcode == aco_opcode::ds_read_addtid_b32 ||
596 instr->opcode == aco_opcode::ds_write_addtid_b32 ||
597 instr->opcode == aco_opcode::buffer_store_lds_dword) {
598 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
599 }
600 }
601
602 ctx.add_wait_states(NOPs + get_wait_states(instr));
603
604 // TODO: try to schedule the NOP-causing instruction earlier to reduce the number of stall cycles
605 if (NOPs) {
606 /* create NOP */
607 aco_ptr<Instruction> nop{create_instruction(aco_opcode::s_nop, Format::SOPP, 0, 0)};
608 nop->salu().imm = NOPs - 1;
609 new_instructions.emplace_back(std::move(nop));
610 }
611
612 /* update information to check for later hazards */
613 if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
614 ctx.smem_clause = false;
615 ctx.smem_write = false;
616
617 if (state.program->dev.xnack_enabled) {
618 BITSET_ZERO(ctx.smem_clause_read_write);
619 BITSET_ZERO(ctx.smem_clause_write);
620 }
621 }
622
623 if (instr->isSMEM()) {
624 if (instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
625 ctx.smem_write = true;
626 } else {
627 ctx.smem_clause = true;
628
629 if (state.program->dev.xnack_enabled) {
630 for (Operand op : instr->operands) {
631 if (!op.isConstant()) {
632 set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
633 }
634 }
635
636 Definition def = instr->definitions[0];
637 set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
638 set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
639 }
640 }
641 } else if (instr->isVALU()) {
642 for (Definition def : instr->definitions) {
643 if (def.regClass().type() == RegType::sgpr) {
644 if (def.physReg() == vcc || def.physReg() == vcc_hi) {
645 ctx.valu_wr_vcc_then_div_fmas = 4;
646 }
647 if (def.physReg() == exec || def.physReg() == exec_hi) {
648 ctx.valu_wr_exec_then_dpp = 5;
649 }
650 }
651 }
652 } else if (instr->isSALU()) {
653 if (!instr->definitions.empty()) {
654 /* all other definitions should be SCC */
655 Definition def = instr->definitions[0];
656 if (def.physReg() == m0) {
657 ctx.salu_wr_m0_then_gds_msg_ttrace = 1;
658 ctx.salu_wr_m0_then_lds = 1;
659 ctx.salu_wr_m0_then_moverel = 1;
660 }
661 } else if (instr->opcode == aco_opcode::s_setreg_b32 ||
662 instr->opcode == aco_opcode::s_setreg_imm32_b32) {
663 SALU_instruction& sopk = instr->salu();
664 unsigned offset = (sopk.imm >> 6) & 0x1f;
665 unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
666 unsigned reg = sopk.imm & 0x3f;
667 ctx.setreg_then_getsetreg = 2;
668
669 if (reg == 1 && offset >= 28 && size > (28 - offset))
670 ctx.set_vskip_mode_then_vector = 2;
671 }
672 } else if (instr->isVMEM() || instr->isFlatLike()) {
673 /* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
674 bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
675 instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
676 /* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
677 * store) */
678 bool consider_mimg = instr->isMIMG() &&
679 instr->operands[1].regClass().type() == RegType::vgpr &&
680 instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
681 /* FLAT/GLOBAL/SCRATCH store with >64-bit data */
682 bool consider_flat =
683 instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
684 if (consider_buf || consider_mimg || consider_flat) {
685 PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
686 unsigned size = instr->operands[consider_flat ? 2 : 3].size();
687 for (unsigned i = 0; i < size; i++)
688 ctx.vmem_store_then_wr_data[(wrdata & 0xff) + i] = 1;
689 }
690 }
691 }
692
693 bool
694 is_latest_instr_vintrp(bool& global_state, bool& block_state, aco_ptr<Instruction>& pred)
695 {
696 if (pred->isVINTRP())
697 global_state = true;
698 return true;
699 }
700
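/* Handles hazards where a preceding SALU (Salu=true) or VALU/VINTRP (Salu=false)
 * instruction writes an SGPR (Sgpr=true) or VGPR (Sgpr=false) and min_states wait
 * states are required before the current instruction. */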
701 template <bool Salu, bool Sgpr>
702 bool
703 handle_wr_hazard_instr(int& global_state, int& block_state, aco_ptr<Instruction>& pred)
704 {
705 if (Salu ? pred->isSALU() : (pred->isVALU() || pred->isVINTRP())) {
706 for (Definition dst : pred->definitions) {
707 if ((dst.physReg().reg() < 256) == Sgpr) {
708 global_state = MAX2(global_state, block_state);
709 return true;
710 }
711 }
712 }
713
714 block_state -= get_wait_states(pred);
715 return block_state <= 0;
716 }
717
718 template <bool Salu, bool Sgpr>
719 void
720 handle_wr_hazard(State& state, int* NOPs, int min_states)
721 {
722 if (*NOPs >= min_states)
723 return;
724
725 int global = 0;
726 int block = min_states;
727 search_backwards<int, int, nullptr, handle_wr_hazard_instr<Salu, Sgpr>>(state, global, block);
728 *NOPs = MAX2(*NOPs, global);
729 }
730
731 void
732 resolve_all_gfx6(State& state, NOP_ctx_gfx6& ctx,
733 std::vector<aco_ptr<Instruction>>& new_instructions)
734 {
735 int NOPs = 0;
736
737 /* SGPR->SMEM hazards */
738 if (state.program->gfx_level == GFX6) {
739 handle_wr_hazard<true, true>(state, &NOPs, 4);
740 handle_wr_hazard<false, true>(state, &NOPs, 4);
741 }
742
743 /* Break up SMEM clauses */
744 if (ctx.smem_clause || ctx.smem_write)
745 NOPs = MAX2(NOPs, 1);
746
747 /* SALU/GDS hazards */
748 NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
749 if (state.program->gfx_level == GFX9)
750 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
751 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_gds_msg_ttrace);
752
753 /* VALU hazards */
754 NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
755 if (state.program->gfx_level >= GFX8)
756 handle_wr_hazard<false, false>(state, &NOPs, 2); /* VALU->DPP */
757 NOPs = MAX2(NOPs, ctx.vmem_store_then_wr_data.any() ? 1 : 0);
758 if (state.program->gfx_level == GFX6) {
759 /* VINTRP->v_readlane_b32/etc */
760 bool vintrp = false;
761 search_backwards<bool, bool, nullptr, is_latest_instr_vintrp>(state, vintrp, vintrp);
762 if (vintrp)
763 NOPs = MAX2(NOPs, 1);
764 }
765 NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
766
767 /* VALU(sgpr)->VMEM/v_readlane_b32/etc hazards. v_readlane_b32/etc require only 4 NOPs. */
768 handle_wr_hazard<false, true>(state, &NOPs, 5);
769
770 NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
771
772 if (state.program->gfx_level == GFX9)
773 NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
774
775 ctx.add_wait_states(NOPs);
776 if (NOPs) {
777 Builder bld(state.program, &new_instructions);
778 bld.sopp(aco_opcode::s_nop, NOPs - 1);
779 }
780 }
781
782 template <std::size_t N>
783 bool
784 check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
785 {
786 return std::any_of(instr->definitions.begin(), instr->definitions.end(),
787 [&check_regs](const Definition& def) -> bool
788 {
789 bool writes_any = false;
790 for (unsigned i = 0; i < def.size(); i++) {
791 unsigned def_reg = def.physReg() + i;
792 writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
793 }
794 return writes_any;
795 });
796 }
797
798 template <std::size_t N>
799 bool
800 check_read_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
801 {
802 return std::any_of(instr->operands.begin(), instr->operands.end(),
803 [&check_regs](const Operand& op) -> bool
804 {
805 if (op.isConstant())
806 return false;
807 bool writes_any = false;
808 for (unsigned i = 0; i < op.size(); i++) {
809 unsigned op_reg = op.physReg() + i;
810 writes_any |= op_reg < check_regs.size() && check_regs[op_reg];
811 }
812 return writes_any;
813 });
814 }
815
816 template <std::size_t N>
817 void
818 mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
819 {
820 for (const Operand& op : instr->operands) {
821 for (unsigned i = 0; i < op.size(); i++) {
822 unsigned reg = op.physReg() + i;
823 if (reg < reg_reads.size())
824 reg_reads.set(reg);
825 }
826 }
827 }
828
829 template <std::size_t N>
830 void
831 mark_read_regs_exec(State& state, const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
832 {
833 mark_read_regs(instr, reg_reads);
834 reg_reads.set(exec);
835 if (state.program->wave_size == 64)
836 reg_reads.set(exec_hi);
837 }
838
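/* VOPC writes the lane mask (VCC or an SGPR pair), VOP3 with two definitions has an
 * SGPR carry-out, and v_readfirstlane/v_readlane write their result to an SGPR. */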
839 bool
840 VALU_writes_sgpr(aco_ptr<Instruction>& instr)
841 {
842 if (instr->isVOPC())
843 return true;
844 if (instr->isVOP3() && instr->definitions.size() == 2)
845 return true;
846 if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
847 instr->opcode == aco_opcode::v_readlane_b32 ||
848 instr->opcode == aco_opcode::v_readlane_b32_e64)
849 return true;
850 return false;
851 }
852
853 bool
854 instr_writes_sgpr(const aco_ptr<Instruction>& instr)
855 {
856 return std::any_of(instr->definitions.begin(), instr->definitions.end(),
857 [](const Definition& def) -> bool
858 { return def.getTemp().type() == RegType::sgpr; });
859 }
860
861 inline bool
862 instr_is_branch(const aco_ptr<Instruction>& instr)
863 {
864 return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
865 instr->opcode == aco_opcode::s_cbranch_scc1 ||
866 instr->opcode == aco_opcode::s_cbranch_vccz ||
867 instr->opcode == aco_opcode::s_cbranch_vccnz ||
868 instr->opcode == aco_opcode::s_cbranch_execz ||
869 instr->opcode == aco_opcode::s_cbranch_execnz ||
870 instr->opcode == aco_opcode::s_cbranch_cdbgsys ||
871 instr->opcode == aco_opcode::s_cbranch_cdbguser ||
872 instr->opcode == aco_opcode::s_cbranch_cdbgsys_or_user ||
873 instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
874 instr->opcode == aco_opcode::s_subvector_loop_begin ||
875 instr->opcode == aco_opcode::s_subvector_loop_end ||
876 instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
877 instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
878 }
879
880 void
881 handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
882 std::vector<aco_ptr<Instruction>>& new_instructions)
883 {
884 // TODO: s_dcache_inv needs to be in its own group on GFX10
885
886 Builder bld(state.program, &new_instructions);
887
888 unsigned vm_vsrc = 7;
889 unsigned sa_sdst = 1;
890 if (debug_flags & DEBUG_FORCE_WAITDEPS) {
891 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
892 vm_vsrc = 0;
893 sa_sdst = 0;
894 } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
895 vm_vsrc = (instr->salu().imm >> 2) & 0x7;
896 sa_sdst = instr->salu().imm & 0x1;
897 }
898
899 /* VMEMtoScalarWriteHazard
900 * Handle EXEC/M0/SGPR write following a VMEM/DS instruction without a VALU or "waitcnt vmcnt(0)"
901 * in-between.
902 */
903 if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
904 /* Remember all SGPRs that are read by the VMEM/DS instruction */
905 if (instr->isVMEM() || instr->isFlatLike())
906 mark_read_regs_exec(
907 state, instr,
908 instr->definitions.empty() ? ctx.sgprs_read_by_VMEM_store : ctx.sgprs_read_by_VMEM);
909 if (instr->isFlat() || instr->isDS())
910 mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
911 } else if (instr->isSALU() || instr->isSMEM()) {
912 wait_imm imm;
913 if (imm.unpack(state.program->gfx_level, instr.get())) {
914 if (imm.vm == 0)
915 ctx.sgprs_read_by_VMEM.reset();
916 if (imm.lgkm == 0)
917 ctx.sgprs_read_by_DS.reset();
918 if (imm.vs == 0)
919 ctx.sgprs_read_by_VMEM_store.reset();
920 } else if (vm_vsrc == 0) {
921 ctx.sgprs_read_by_VMEM.reset();
922 ctx.sgprs_read_by_DS.reset();
923 ctx.sgprs_read_by_VMEM_store.reset();
924 }
925
926 /* Check if SALU writes an SGPR that was previously read by a VMEM/DS instruction */
927 if (check_written_regs(instr, ctx.sgprs_read_by_VMEM) ||
928 check_written_regs(instr, ctx.sgprs_read_by_DS) ||
929 check_written_regs(instr, ctx.sgprs_read_by_VMEM_store)) {
930 ctx.sgprs_read_by_VMEM.reset();
931 ctx.sgprs_read_by_DS.reset();
932 ctx.sgprs_read_by_VMEM_store.reset();
933
934 /* Insert s_waitcnt_depctr with vm_vsrc=0 (imm 0xffe3) to mitigate the problem */
935 bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
936 }
937 } else if (instr->isVALU()) {
938 /* Hazard is mitigated by any VALU instruction */
939 ctx.sgprs_read_by_VMEM.reset();
940 ctx.sgprs_read_by_DS.reset();
941 ctx.sgprs_read_by_VMEM_store.reset();
942 }
943
944 /* VcmpxPermlaneHazard
945 * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
946 */
947 if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
948 /* we only need to check definitions[0] because, since GFX10, v_cmpx only writes one dest */
949 ctx.has_VOPC_write_exec = true;
950 } else if (ctx.has_VOPC_write_exec && (instr->opcode == aco_opcode::v_permlane16_b32 ||
951 instr->opcode == aco_opcode::v_permlanex16_b32)) {
952 ctx.has_VOPC_write_exec = false;
953
954 /* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
955 bld.vop1(aco_opcode::v_mov_b32, Definition(instr->operands[0].physReg(), v1),
956 Operand(instr->operands[0].physReg(), v1));
957 } else if (instr->isVALU() && instr->opcode != aco_opcode::v_nop) {
958 ctx.has_VOPC_write_exec = false;
959 }
960
961 /* VcmpxExecWARHazard
962 * Handle any VALU instruction writing the exec mask after it was read by a non-VALU instruction.
963 */
964 if (!instr->isVALU() && instr->reads_exec()) {
965 ctx.has_nonVALU_exec_read = true;
966 } else if (instr->isVALU() && ctx.has_nonVALU_exec_read) {
967 if (instr->writes_exec()) {
968 ctx.has_nonVALU_exec_read = false;
969
970 /* Insert s_waitcnt_depctr with sa_sdst=0 (imm 0xfffe) to mitigate the problem */
971 bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
972 } else if (instr_writes_sgpr(instr)) {
973 /* Any VALU instruction that writes an SGPR mitigates the problem */
974 ctx.has_nonVALU_exec_read = false;
975 }
976 } else if (sa_sdst == 0) {
977 ctx.has_nonVALU_exec_read = false;
978 }
979
980 /* SMEMtoVectorWriteHazard
981 * Handle any VALU instruction writing an SGPR after an SMEM reads it.
982 */
983 if (instr->isSMEM()) {
984 /* Remember all SGPRs that are read by the SMEM instruction */
985 mark_read_regs(instr, ctx.sgprs_read_by_SMEM);
986 } else if (VALU_writes_sgpr(instr)) {
987 /* Check if VALU writes an SGPR that was previously read by SMEM */
988 if (check_written_regs(instr, ctx.sgprs_read_by_SMEM)) {
989 ctx.sgprs_read_by_SMEM.reset();
990
991 /* Insert s_mov to mitigate the problem */
992 bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
993 }
994 } else if (instr->isSALU()) {
995 wait_imm imm;
996 if (imm.unpack(state.program->gfx_level, instr.get()) && imm.lgkm == 0) {
997 /* Reducing lgkmcnt to 0 always mitigates the hazard. */
998 ctx.sgprs_read_by_SMEM.reset();
999 } else if (instr->format != Format::SOPP && instr->definitions.size()) {
1000 /* SALU can mitigate the hazard */
1001 ctx.sgprs_read_by_SMEM.reset();
1002 }
1003 }
1004
1005 /* LdsBranchVmemWARHazard
1006 * Handle VMEM/GLOBAL/SCRATCH->branch->DS and DS->branch->VMEM/GLOBAL/SCRATCH patterns.
1007 */
1008 if (instr->isVMEM() || instr->isGlobal() || instr->isScratch()) {
1009 if (ctx.has_branch_after_DS)
1010 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
1011 ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_DS = false;
1012 ctx.has_VMEM = true;
1013 } else if (instr->isDS()) {
1014 if (ctx.has_branch_after_VMEM)
1015 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
1016 ctx.has_branch_after_VMEM = ctx.has_branch_after_DS = ctx.has_VMEM = false;
1017 ctx.has_DS = true;
1018 } else if (instr_is_branch(instr)) {
1019 ctx.has_branch_after_VMEM |= ctx.has_VMEM;
1020 ctx.has_branch_after_DS |= ctx.has_DS;
1021 ctx.has_VMEM = ctx.has_DS = false;
1022 } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
1023 /* Only s_waitcnt_vscnt can mitigate the hazard */
1024 const SALU_instruction& sopk = instr->salu();
1025 if (sopk.operands[0].physReg() == sgpr_null && sopk.imm == 0)
1026 ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
1027 }
1028
1029 /* NSAToVMEMBug
1030 * Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
1031 * 0).
1032 */
1033 if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
1034 ctx.has_NSA_MIMG = true;
1035 } else if (ctx.has_NSA_MIMG) {
1036 ctx.has_NSA_MIMG = false;
1037
1038 if (instr->isMUBUF() || instr->isMTBUF()) {
1039 uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
1040 if (offset & 6)
1041 bld.sopp(aco_opcode::s_nop, 0);
1042 }
1043 }
1044
1045 /* waNsaCannotFollowWritelane
1046 * Handles NSA MIMG immediately following a v_writelane_b32.
1047 */
1048 if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
1049 ctx.has_writelane = true;
1050 } else if (ctx.has_writelane) {
1051 ctx.has_writelane = false;
1052 if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
1053 bld.sopp(aco_opcode::s_nop, 0);
1054 }
1055 }
1056
1057 void
1058 resolve_all_gfx10(State& state, NOP_ctx_gfx10& ctx,
1059 std::vector<aco_ptr<Instruction>>& new_instructions)
1060 {
1061 Builder bld(state.program, &new_instructions);
1062
1063 size_t prev_count = new_instructions.size();
1064
1065 /* VcmpxPermlaneHazard */
1066 if (ctx.has_VOPC_write_exec) {
1067 ctx.has_VOPC_write_exec = false;
1068 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1069
1070 /* VALU mitigates VMEMtoScalarWriteHazard. */
1071 ctx.sgprs_read_by_VMEM.reset();
1072 ctx.sgprs_read_by_DS.reset();
1073 ctx.sgprs_read_by_VMEM_store.reset();
1074 }
1075
1076 unsigned waitcnt_depctr = 0xffff;
1077
1078 /* VMEMtoScalarWriteHazard */
1079 if (ctx.sgprs_read_by_VMEM.any() || ctx.sgprs_read_by_DS.any() ||
1080 ctx.sgprs_read_by_VMEM_store.any()) {
1081 ctx.sgprs_read_by_VMEM.reset();
1082 ctx.sgprs_read_by_DS.reset();
1083 ctx.sgprs_read_by_VMEM_store.reset();
1084 waitcnt_depctr &= 0xffe3;
1085 }
1086
1087 /* VcmpxExecWARHazard */
1088 if (ctx.has_nonVALU_exec_read) {
1089 ctx.has_nonVALU_exec_read = false;
1090 waitcnt_depctr &= 0xfffe;
1091 }
1092
1093 if (waitcnt_depctr != 0xffff)
1094 bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);
1095
1096 /* SMEMtoVectorWriteHazard */
1097 if (ctx.sgprs_read_by_SMEM.any()) {
1098 ctx.sgprs_read_by_SMEM.reset();
1099 bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
1100 }
1101
1102 /* LdsBranchVmemWARHazard */
1103 if (ctx.has_VMEM || ctx.has_branch_after_VMEM || ctx.has_DS || ctx.has_branch_after_DS) {
1104 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
1105 ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
1106 }
1107
1108 /* NSAToVMEMBug/waNsaCannotFollowWritelane */
1109 if (ctx.has_NSA_MIMG || ctx.has_writelane) {
1110 ctx.has_NSA_MIMG = ctx.has_writelane = false;
1111 /* Any instruction resolves these hazards. */
1112 if (new_instructions.size() == prev_count)
1113 bld.sopp(aco_opcode::s_nop, 0);
1114 }
1115 }
1116
1117 void
1118 fill_vgpr_bitset(std::bitset<256>& set, PhysReg reg, unsigned bytes)
1119 {
1120 if (reg.reg() < 256)
1121 return;
1122 for (unsigned i = 0; i < DIV_ROUND_UP(bytes, 4); i++)
1123 set.set(reg.reg() - 256 + i);
1124 }
1125
1126 bool
1127 test_vgpr_bitset(std::bitset<256>& set, Operand op)
1128 {
1129 if (op.physReg().reg() < 256)
1130 return false;
1131 for (unsigned i = 0; i < op.size(); i++) {
1132 if (set[op.physReg().reg() - 256 + i])
1133 return true;
1134 }
1135 return false;
1136 }
1137
1138 /* GFX11 */
1139 struct LdsDirectVALUHazardGlobalState {
1140 unsigned wait_vdst = 15;
1141 PhysReg vgpr;
1142 std::set<unsigned> loop_headers_visited;
1143 };
1144
1145 struct LdsDirectVALUHazardBlockState {
1146 unsigned num_valu = 0;
1147 bool has_trans = false;
1148
1149 unsigned num_instrs = 0;
1150 unsigned num_blocks = 0;
1151 };
1152
1153 bool
1154 handle_lds_direct_valu_hazard_instr(LdsDirectVALUHazardGlobalState& global_state,
1155 LdsDirectVALUHazardBlockState& block_state,
1156 aco_ptr<Instruction>& instr)
1157 {
1158 if (instr->isVALU()) {
1159 block_state.has_trans |= instr->isTrans();
1160
1161 bool uses_vgpr = false;
1162 for (Definition& def : instr->definitions)
1163 uses_vgpr |= regs_intersect(def.physReg(), def.size(), global_state.vgpr, 1);
1164 for (Operand& op : instr->operands) {
1165 uses_vgpr |=
1166 !op.isConstant() && regs_intersect(op.physReg(), op.size(), global_state.vgpr, 1);
1167 }
1168 if (uses_vgpr) {
1169 /* Transcendentals execute in parallel with other VALU, so the va_vdst count becomes unusable */
1170 global_state.wait_vdst =
1171 MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
1172 return true;
1173 }
1174
1175 block_state.num_valu++;
1176 }
1177
1178 if (parse_vdst_wait(instr.get()) == 0)
1179 return true;
1180
1181 block_state.num_instrs++;
1182 if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
1183 /* Exit to limit compile times and set wait_vdst to be safe. */
1184 global_state.wait_vdst =
1185 MIN2(global_state.wait_vdst, block_state.has_trans ? 0 : block_state.num_valu);
1186 return true;
1187 }
1188
1189 return block_state.num_valu >= global_state.wait_vdst;
1190 }
1191
1192 bool
1193 handle_lds_direct_valu_hazard_block(LdsDirectVALUHazardGlobalState& global_state,
1194 LdsDirectVALUHazardBlockState& block_state, Block* block)
1195 {
1196 if (block->kind & block_kind_loop_header) {
1197 if (global_state.loop_headers_visited.count(block->index))
1198 return false;
1199 global_state.loop_headers_visited.insert(block->index);
1200 }
1201
1202 block_state.num_blocks++;
1203
1204 return true;
1205 }
1206
1207 unsigned
1208 handle_lds_direct_valu_hazard(State& state, aco_ptr<Instruction>& instr)
1209 {
1210 /* LdsDirectVALUHazard
1211 * Handle LDSDIR writing a VGPR after it's used by a VALU instruction.
1212 */
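/* Returns the smallest safe wait_vdst: the number of VALU instructions found between
 * the LDSDIR and the most recent VALU that accesses its destination VGPR (0 if a
 * transcendental makes the count unreliable). */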
1213 if (instr->ldsdir().wait_vdst == 0)
1214 return 0; /* early exit */
1215
1216 LdsDirectVALUHazardGlobalState global_state;
1217 global_state.wait_vdst = instr->ldsdir().wait_vdst;
1218 global_state.vgpr = instr->definitions[0].physReg();
1219 LdsDirectVALUHazardBlockState block_state;
1220 search_backwards<LdsDirectVALUHazardGlobalState, LdsDirectVALUHazardBlockState,
1221 &handle_lds_direct_valu_hazard_block, &handle_lds_direct_valu_hazard_instr>(
1222 state, global_state, block_state);
1223 return global_state.wait_vdst;
1224 }
1225
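/* State of the backwards scan in handle_valu_partial_forwarding_hazard_instr():
 * starting from the VALU read and scanning backwards, we first look for a write of
 * one of the read VGPRs (written_after_exec_write), then for an SALU write of exec
 * (exec_written); another write of a read VGPR within 3 VALU of the exec write
 * completes the hazard pattern. */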
1226 enum VALUPartialForwardingHazardState : uint8_t {
1227 nothing_written,
1228 written_after_exec_write,
1229 exec_written,
1230 };
1231
1232 struct VALUPartialForwardingHazardGlobalState {
1233 bool hazard_found = false;
1234 std::set<unsigned> loop_headers_visited;
1235 };
1236
1237 struct VALUPartialForwardingHazardBlockState {
1238 /* initialized to the number of VGPRs read by the VALU; decremented when one is encountered so we can return early */
1239 uint8_t num_vgprs_read = 0;
1240 BITSET_DECLARE(vgprs_read, 256) = {0};
1241 enum VALUPartialForwardingHazardState state = nothing_written;
1242 unsigned num_valu_since_read = 0;
1243 unsigned num_valu_since_write = 0;
1244
1245 unsigned num_instrs = 0;
1246 unsigned num_blocks = 0;
1247 };
1248
1249 bool
1250 handle_valu_partial_forwarding_hazard_instr(VALUPartialForwardingHazardGlobalState& global_state,
1251 VALUPartialForwardingHazardBlockState& block_state,
1252 aco_ptr<Instruction>& instr)
1253 {
1254 if (instr->isSALU() && !instr->definitions.empty()) {
1255 if (block_state.state == written_after_exec_write && instr->writes_exec())
1256 block_state.state = exec_written;
1257 } else if (instr->isVALU()) {
1258 bool vgpr_write = false;
1259 for (Definition& def : instr->definitions) {
1260 if (def.physReg().reg() < 256)
1261 continue;
1262
1263 for (unsigned i = 0; i < def.size(); i++) {
1264 unsigned reg = def.physReg().reg() - 256 + i;
1265 if (!BITSET_TEST(block_state.vgprs_read, reg))
1266 continue;
1267
1268 if (block_state.state == exec_written && block_state.num_valu_since_write < 3) {
1269 global_state.hazard_found = true;
1270 return true;
1271 }
1272
1273 BITSET_CLEAR(block_state.vgprs_read, reg);
1274 block_state.num_vgprs_read--;
1275 vgpr_write = true;
1276 }
1277 }
1278
1279 if (vgpr_write) {
1280 /* If the state is nothing_written: the check below should ensure that this write is
1281 * close enough to the read.
1282 *
1283 * If the state is exec_written: the current choice of second write has failed. Reset and
1284 * try with the current write as the second one, if it's close enough to the read.
1285 *
1286 * If the state is written_after_exec_write: a further second write would be better, if
1287 * it's close enough to the read.
1288 */
1289 if (block_state.state == nothing_written || block_state.num_valu_since_read < 5) {
1290 block_state.state = written_after_exec_write;
1291 block_state.num_valu_since_write = 0;
1292 } else {
1293 block_state.num_valu_since_write++;
1294 }
1295 } else {
1296 block_state.num_valu_since_write++;
1297 }
1298
1299 block_state.num_valu_since_read++;
1300 } else if (parse_vdst_wait(instr.get()) == 0) {
1301 return true;
1302 }
1303
1304 if (block_state.num_valu_since_read >= (block_state.state == nothing_written ? 5 : 8))
1305 return true; /* Hazard not possible at this distance. */
1306 if (block_state.num_vgprs_read == 0)
1307 return true; /* All VGPRs have been written and a hazard was never found. */
1308
1309 block_state.num_instrs++;
1310 if (block_state.num_instrs > 256 || block_state.num_blocks > 32) {
1311 /* Exit to limit compile times and set hazard_found=true to be safe. */
1312 global_state.hazard_found = true;
1313 return true;
1314 }
1315
1316 return false;
1317 }
1318
1319 bool
1320 handle_valu_partial_forwarding_hazard_block(VALUPartialForwardingHazardGlobalState& global_state,
1321 VALUPartialForwardingHazardBlockState& block_state,
1322 Block* block)
1323 {
1324 if (block->kind & block_kind_loop_header) {
1325 if (global_state.loop_headers_visited.count(block->index))
1326 return false;
1327 global_state.loop_headers_visited.insert(block->index);
1328 }
1329
1330 block_state.num_blocks++;
1331
1332 return true;
1333 }
1334
1335 bool
1336 handle_valu_partial_forwarding_hazard(State& state, aco_ptr<Instruction>& instr)
1337 {
1338 /* VALUPartialForwardingHazard
1339 * VALU instruction reads two VGPRs: one written before an exec write by SALU and one after.
1340 * For the hazard, there must be less than 3 VALU between the first and second VGPR writes.
1341 * There also must be less than 5 VALU between the second VGPR write and the current instruction.
1342 */
1343 if (state.program->wave_size != 64 || !instr->isVALU())
1344 return false;
1345
1346 unsigned num_vgprs = 0;
1347 for (Operand& op : instr->operands)
1348 num_vgprs += op.physReg().reg() < 256 ? op.size() : 1;
1349 if (num_vgprs <= 1)
1350 return false; /* early exit */
1351
1352 VALUPartialForwardingHazardBlockState block_state;
1353
1354 for (unsigned i = 0; i < instr->operands.size(); i++) {
1355 Operand& op = instr->operands[i];
1356 if (op.physReg().reg() < 256)
1357 continue;
1358 for (unsigned j = 0; j < op.size(); j++)
1359 BITSET_SET(block_state.vgprs_read, op.physReg().reg() - 256 + j);
1360 }
1361 block_state.num_vgprs_read = BITSET_COUNT(block_state.vgprs_read);
1362
1363 if (block_state.num_vgprs_read <= 1)
1364 return false; /* early exit */
1365
1366 VALUPartialForwardingHazardGlobalState global_state;
1367 search_backwards<VALUPartialForwardingHazardGlobalState, VALUPartialForwardingHazardBlockState,
1368 &handle_valu_partial_forwarding_hazard_block,
1369 &handle_valu_partial_forwarding_hazard_instr>(state, global_state, block_state);
1370 return global_state.hazard_found;
1371 }
1372
1373 void
1374 handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>& instr,
1375 std::vector<aco_ptr<Instruction>>& new_instructions)
1376 {
1377 Builder bld(state.program, &new_instructions);
1378
1379 /* VcmpxPermlaneHazard
1380 * Handle any permlane following a VOPC instruction writing exec, insert v_mov between them.
1381 */
1382 if (instr->isVOPC() && instr->definitions[0].physReg() == exec) {
1383 ctx.has_Vcmpx = true;
1384 } else if (ctx.has_Vcmpx && (instr->opcode == aco_opcode::v_permlane16_b32 ||
1385 instr->opcode == aco_opcode::v_permlanex16_b32 ||
1386 instr->opcode == aco_opcode::v_permlane64_b32 ||
1387 instr->opcode == aco_opcode::v_permlane16_var_b32 ||
1388 instr->opcode == aco_opcode::v_permlanex16_var_b32)) {
1389 ctx.has_Vcmpx = false;
1390
1391 /* Unlike on GFX10, v_nop should resolve the hazard on GFX11. */
1392 bld.vop1(aco_opcode::v_nop);
1393 } else if (instr->isVALU()) {
1394 ctx.has_Vcmpx = false;
1395 }
1396
1397 unsigned va_vdst = parse_vdst_wait(instr.get());
1398 unsigned vm_vsrc = 7;
1399 unsigned sa_sdst = 1;
1400
1401 if (debug_flags & DEBUG_FORCE_WAITDEPS) {
1402 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0000);
1403 va_vdst = 0;
1404 vm_vsrc = 0;
1405 sa_sdst = 0;
1406 } else if (instr->opcode == aco_opcode::s_waitcnt_depctr) {
1407 /* va_vdst already obtained through parse_vdst_wait(). */
1408 vm_vsrc = (instr->salu().imm >> 2) & 0x7;
1409 sa_sdst = instr->salu().imm & 0x1;
1410 } else if (instr->isLDSDIR() && state.program->gfx_level >= GFX12) {
1411 vm_vsrc = instr->ldsdir().wait_vsrc ? 7 : 0;
1412 }
1413
1414 if (instr->isLDSDIR()) {
1415 unsigned count = handle_lds_direct_valu_hazard(state, instr);
1416 LDSDIR_instruction* ldsdir = &instr->ldsdir();
1417 if (count < va_vdst) {
1418 ldsdir->wait_vdst = MIN2(ldsdir->wait_vdst, count);
1419 va_vdst = MIN2(va_vdst, count);
1420 }
1421 }
1422
1423 /* VALUTransUseHazard
1424 * VALU reads VGPR written by transcendental instruction without 6+ VALU or 2+ transcendental
1425 * in-between.
1426 */
1427 if (state.program->gfx_level < GFX11_5 && va_vdst > 0 && instr->isVALU()) {
1428 uint8_t num_valu = 15;
1429 uint8_t num_trans = 15;
1430 for (Operand& op : instr->operands) {
1431 if (op.physReg().reg() < 256)
1432 continue;
1433 for (unsigned i = 0; i < op.size(); i++) {
1434 num_valu = std::min(num_valu, ctx.valu_since_wr_by_trans.get(op.physReg(), i));
1435 num_trans = std::min(num_trans, ctx.trans_since_wr_by_trans.get(op.physReg(), i));
1436 }
1437 }
1438 if (num_trans <= 1 && num_valu <= 5) {
1439 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1440 va_vdst = 0;
1441 }
1442 }
1443
1444 if (va_vdst > 0 && state.program->gfx_level < GFX12 &&
1445 handle_valu_partial_forwarding_hazard(state, instr)) {
1446 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
1447 va_vdst = 0;
1448 }
1449
1450 if (state.program->gfx_level < GFX12) {
1451 /* VALUMaskWriteHazard
1452 * An SGPR that is read by a VALU as a lane mask and then written by an SALU cannot safely
1453 * be read again by an SALU or VALU.
1454 */
      if (state.program->wave_size == 64 && (instr->isSALU() || instr->isVALU()) &&
          check_read_regs(instr, ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu)) {
         bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
         sa_sdst = 0;
      }

      if (va_vdst == 0) {
         ctx.valu_since_wr_by_trans.reset();
         ctx.trans_since_wr_by_trans.reset();
      }

      if (sa_sdst == 0)
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();

      if (state.program->wave_size == 64 && instr->isSALU() &&
          check_written_regs(instr, ctx.sgpr_read_by_valu_as_lanemask)) {
         unsigned reg = instr->definitions[0].physReg().reg();
         for (unsigned i = 0; i < instr->definitions[0].size(); i++)
            ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu[reg + i] = 1;
      }

      if (instr->isVALU()) {
         bool is_trans = instr->isTrans();

         ctx.valu_since_wr_by_trans.inc();
         if (is_trans)
            ctx.trans_since_wr_by_trans.inc();

         if (is_trans) {
            for (Definition& def : instr->definitions) {
               ctx.valu_since_wr_by_trans.set(def.physReg(), def.bytes());
               ctx.trans_since_wr_by_trans.set(def.physReg(), def.bytes());
            }
         }

         if (state.program->wave_size == 64) {
            for (Operand& op : instr->operands) {
               /* This should ignore exec reads */
               if (!op.isConstant() && op.physReg().reg() < 126)
                  ctx.sgpr_read_by_valu_as_lanemask.reset();
            }
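            /* These opcodes read their last operand as a 64-bit lane mask (carry-in
             * or condition), so remember both SGPR halves unless the mask is exec. */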
            switch (instr->opcode) {
            case aco_opcode::v_addc_co_u32:
            case aco_opcode::v_subb_co_u32:
            case aco_opcode::v_subbrev_co_u32:
            case aco_opcode::v_cndmask_b16:
            case aco_opcode::v_cndmask_b32:
            case aco_opcode::v_div_fmas_f32:
            case aco_opcode::v_div_fmas_f64:
               if (instr->operands.back().physReg() != exec) {
                  ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg());
                  ctx.sgpr_read_by_valu_as_lanemask.set(instr->operands.back().physReg().reg() + 1);
               }
               break;
            default: break;
            }
         }
      }
   }

   /* LdsDirectVMEMHazard
    * Handle LDSDIR writing a VGPR after it's used by a VMEM/DS instruction.
    */
   if (instr->isVMEM() || instr->isFlatLike()) {
      if (instr->definitions.empty()) {
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(ctx.vgpr_used_by_vmem_store, op.physReg(), op.bytes());
      } else {
         uint8_t vmem_type = state.program->gfx_level >= GFX12
                                ? get_vmem_type(state.program->gfx_level, instr.get())
                                : vmem_nosampler;
         std::bitset<256>* vgprs = &ctx.vgpr_used_by_vmem_load;
         if (vmem_type == vmem_sampler)
            vgprs = &ctx.vgpr_used_by_vmem_sample;
         else if (vmem_type == vmem_bvh)
            vgprs = &ctx.vgpr_used_by_vmem_bvh;

         for (Definition& def : instr->definitions)
            fill_vgpr_bitset(*vgprs, def.physReg(), def.bytes());
         for (Operand& op : instr->operands)
            fill_vgpr_bitset(*vgprs, op.physReg(), op.bytes());
      }
   }
   if (instr->isDS() || instr->isFlat()) {
      for (Definition& def : instr->definitions)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, def.physReg(), def.bytes());
      for (Operand& op : instr->operands)
         fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
   }
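   /* Any VALU/EXP instruction or an explicit vm_vsrc=0 wait resolves the hazard for
    * all tracked VGPRs; an s_waitcnt with zeroed counters only clears the sets that
    * correspond to the counters it waits on. */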
   wait_imm imm;
   if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_sample.reset();
      ctx.vgpr_used_by_vmem_bvh.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   } else if (imm.unpack(state.program->gfx_level, instr.get())) {
      if (imm.vm == 0)
         ctx.vgpr_used_by_vmem_load.reset();
      if (imm.sample == 0)
         ctx.vgpr_used_by_vmem_sample.reset();
      if (imm.bvh == 0)
         ctx.vgpr_used_by_vmem_bvh.reset();
      if (imm.lgkm == 0)
         ctx.vgpr_used_by_ds.reset();
      if (imm.vs == 0)
         ctx.vgpr_used_by_vmem_store.reset();
   }
   if (instr->isLDSDIR()) {
      if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_vmem_sample[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_vmem_bvh[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_vmem_store[instr->definitions[0].physReg().reg() - 256] ||
          ctx.vgpr_used_by_ds[instr->definitions[0].physReg().reg() - 256]) {
         if (state.program->gfx_level >= GFX12)
            instr->ldsdir().wait_vsrc = 0;
         else
            bld.sopp(aco_opcode::s_waitcnt_depctr, 0xffe3);
         ctx.vgpr_used_by_vmem_load.reset();
         ctx.vgpr_used_by_vmem_sample.reset();
         ctx.vgpr_used_by_vmem_bvh.reset();
         ctx.vgpr_used_by_vmem_store.reset();
         ctx.vgpr_used_by_ds.reset();
      }
   }

   /* WMMA Hazards */
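   /* A WMMA whose sources overlap the VGPRs written by the immediately preceding
    * WMMA needs a VALU in between; a v_nop is inserted when that happens. */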
   if (instr_info.classes[(int)instr->opcode] == instr_class::wmma) {
      assert(instr->operands.back().regClass() == instr->definitions[0].regClass());

      bool is_swmma = instr->operands.size() == 4;
      if (test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[0]) ||
          test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[1]) ||
          (is_swmma && test_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->operands[2]))) {
         bld.vop1(aco_opcode::v_nop);
      }

      ctx.vgpr_written_by_wmma.reset();
      fill_vgpr_bitset(ctx.vgpr_written_by_wmma, instr->definitions[0].physReg(),
                       instr->definitions[0].bytes());
   } else if (instr->isVALU()) {
      ctx.vgpr_written_by_wmma.reset();
   }
}

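/* Callback for search_backwards(): returns true once the scan can stop. global_state
 * ends up false if the depth limit is hit or a VALU accessing VGPRs is found before
 * an instruction that already waits for VA_VDST=0. */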
bool
has_vdst0_since_valu_instr(bool& global_state, unsigned& block_state, aco_ptr<Instruction>& pred)
{
   if (parse_vdst_wait(pred.get()) == 0)
      return true;

   if (--block_state == 0) {
      global_state = false;
      return true;
   }

   if (pred->isVALU()) {
      bool vgpr_rd_or_wr = false;
      for (Definition def : pred->definitions) {
         if (def.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      for (Operand op : pred->operands) {
         if (op.physReg().reg() >= 256)
            vgpr_rd_or_wr = true;
      }
      if (vgpr_rd_or_wr) {
         global_state = false;
         return true;
      }
   }

   return false;
}

void
resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
                  std::vector<aco_ptr<Instruction>>& new_instructions)
{
   Builder bld(state.program, &new_instructions);

   unsigned waitcnt_depctr = 0xffff;
   bool valu_read_sgpr = false;
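   /* The masks below are ANDed into waitcnt_depctr so that a single s_waitcnt_depctr,
    * emitted at the end and only if any field was cleared, resolves several hazards
    * at once. */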

   /* LdsDirectVALUHazard/VALUPartialForwardingHazard/VALUTransUseHazard */
   bool has_vdst0_since_valu = true;
   unsigned depth = 16;
   search_backwards<bool, unsigned, nullptr, has_vdst0_since_valu_instr>(
      state, has_vdst0_since_valu, depth);
   if (!has_vdst0_since_valu) {
      waitcnt_depctr &= 0x0fff;
      ctx.valu_since_wr_by_trans.reset();
      ctx.trans_since_wr_by_trans.reset();
   }

   /* VcmpxPermlaneHazard/WMMAHazards */
   if (ctx.has_Vcmpx || ctx.vgpr_written_by_wmma.any()) {
      ctx.has_Vcmpx = false;
      ctx.vgpr_written_by_wmma.reset();
      bld.vop1(aco_opcode::v_nop);
   }

   /* VALUMaskWriteHazard */
   if (state.program->gfx_level < GFX12 && state.program->wave_size == 64) {
      if (ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.any()) {
         waitcnt_depctr &= 0xfffe;
         ctx.sgpr_read_by_valu_as_lanemask_then_wr_by_salu.reset();
      }
      if (ctx.sgpr_read_by_valu_as_lanemask.any()) {
         valu_read_sgpr = true;
         ctx.sgpr_read_by_valu_as_lanemask.reset();
      }
   }

   /* LdsDirectVMEMHazard */
   if (ctx.vgpr_used_by_vmem_load.any() || ctx.vgpr_used_by_vmem_store.any() ||
       ctx.vgpr_used_by_ds.any() || ctx.vgpr_used_by_vmem_sample.any() ||
       ctx.vgpr_used_by_vmem_bvh.any()) {
      waitcnt_depctr &= 0xffe3;
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   }

   if (waitcnt_depctr != 0xffff)
      bld.sopp(aco_opcode::s_waitcnt_depctr, waitcnt_depctr);

   if (valu_read_sgpr) {
      /* This has to be after the s_waitcnt_depctr so that the instruction is not involved in any
       * other hazards. */
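      /* v_xor3_b32 v0, v0, s0, s0 reads an SGPR from a VALU while leaving v0 unchanged
       * (x ^ s ^ s == x); a VALU SGPR read is what clears sgpr_read_by_valu_as_lanemask
       * in handle_instruction_gfx11. */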
      bld.vop3(aco_opcode::v_xor3_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
               Operand(PhysReg(0), s1), Operand(PhysReg(0), s1));

      /* workaround possible LdsDirectVALUHazard/VALUPartialForwardingHazard */
      bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
   }
}

template <typename Ctx>
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
                             std::vector<aco_ptr<Instruction>>&);

template <typename Ctx>
using ResolveAll = void (*)(State& state, Ctx&, std::vector<aco_ptr<Instruction>>&);

template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
   if (block.instructions.empty())
      return;

   State state;
   state.program = program;
   state.block = &block;
   state.old_instructions = std::move(block.instructions);

   block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
   block.instructions.reserve(state.old_instructions.size());

   bool found_end = false;
   for (aco_ptr<Instruction>& instr : state.old_instructions) {
      Handle(state, ctx, instr, block.instructions);

      /* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
      if (instr->opcode == aco_opcode::s_setpc_b64) {
         block.instructions.emplace_back(std::move(instr));

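         /* The s_setpc_b64 was already appended, so splice the resolve instructions in
          * just before it; they then execute before the jump. */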
         std::vector<aco_ptr<Instruction>> resolve_instrs;
         Resolve(state, ctx, resolve_instrs);
         block.instructions.insert(std::prev(block.instructions.end()),
                                   std::move_iterator(resolve_instrs.begin()),
                                   std::move_iterator(resolve_instrs.end()));

         found_end = true;
         continue;
      }

      found_end |= instr->opcode == aco_opcode::s_endpgm;
      block.instructions.emplace_back(std::move(instr));
   }

   /* Resolve all possible hazards (we don't know what the shader is concatenated with). */
   if (block.linear_succs.empty() && !found_end)
      Resolve(state, ctx, block.instructions);
}

template <typename Ctx, HandleInstr<Ctx> Handle, ResolveAll<Ctx> Resolve>
void
mitigate_hazards(Program* program)
{
   std::vector<Ctx> all_ctx(program->blocks.size());
   std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
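   /* Blocks are visited in program order, so contexts arriving over loop back-edges are
    * unknown on the first pass. At each loop exit the loop blocks are processed a second
    * time with contexts joined from all predecessors; the re-pass stops early if the
    * header's context did not change. */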

   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& block = program->blocks[i];
      Ctx& ctx = all_ctx[i];

      if (block.kind & block_kind_loop_header) {
         loop_header_indices.push(i);
      } else if (block.kind & block_kind_loop_exit) {
         /* Go through the whole loop again */
         for (unsigned idx = loop_header_indices.top(); idx < i; idx++) {
            Ctx loop_block_ctx;
            for (unsigned b : program->blocks[idx].linear_preds)
               loop_block_ctx.join(all_ctx[b]);

            handle_block<Ctx, Handle, Resolve>(program, loop_block_ctx, program->blocks[idx]);

            /* We only need to continue if the loop header context changed */
            if (idx == loop_header_indices.top() && loop_block_ctx == all_ctx[idx])
               break;

            all_ctx[idx] = loop_block_ctx;
         }

         loop_header_indices.pop();
      }

      for (unsigned b : block.linear_preds)
         ctx.join(all_ctx[b]);

      handle_block<Ctx, Handle, Resolve>(program, ctx, block);
   }
}

/* FeatureRequiredExportPriority in LLVM */
void
required_export_priority(Program* program)
{
   /* Skip callees, assuming that the caller has already increased the priority. */
   bool increase_priority = !program->is_epilog && !program->info.vs.has_prolog &&
                            (!program->info.merged_shader_compiled_separately ||
                             program->stage.sw == SWStage::VS || program->stage.sw == SWStage::TES);
   increase_priority |= program->is_prolog;

   for (Block& block : program->blocks) {
      std::vector<aco_ptr<Instruction>> new_instructions;
      new_instructions.reserve(block.instructions.size() + 6);

      Builder bld(program, &new_instructions);

      if (increase_priority && block.index == 0) {
         if (!block.instructions.empty() && block.instructions[0]->opcode == aco_opcode::s_setprio)
            block.instructions[0]->salu().imm = MAX2(block.instructions[0]->salu().imm, 2);
         else
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      for (unsigned i = 0; i < block.instructions.size(); i++) {
         Instruction* instr = block.instructions[i].get();
         new_instructions.push_back(std::move(block.instructions[i]));

         if (instr->opcode == aco_opcode::s_setprio) {
            instr->salu().imm = MAX2(instr->salu().imm, 2);
            continue;
         }

         bool end_of_export_sequence = instr->isEXP() && (i == block.instructions.size() - 1 ||
                                                          !block.instructions[i + 1]->isEXP());
         if (!end_of_export_sequence)
            continue;

         bool before_endpgm = false;
         if (i != block.instructions.size() - 1) {
            before_endpgm = block.instructions[i + 1]->opcode == aco_opcode::s_endpgm;
         } else {
            /* Does this fallthrough to a s_endpgm? */
            for (unsigned j = block.index + 1; j < program->blocks.size(); j++) {
               if (program->blocks[j].instructions.size() == 1 &&
                   program->blocks[j].instructions[0]->opcode == aco_opcode::s_endpgm)
                  before_endpgm = true;
               if (!program->blocks[j].instructions.empty())
                  break;
            }
         }

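         /* End of an export sequence: drop to priority 0 and, unless the program ends
          * right here, wait for the exports (expcnt) and pad with two s_nop before
          * restoring priority 2 for the rest of the shader. */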
         bld.sopp(aco_opcode::s_setprio, 0);
         if (!before_endpgm)
            bld.sopk(aco_opcode::s_waitcnt_expcnt, Operand(sgpr_null, s1), 0);
         bld.sopp(aco_opcode::s_nop, 0);
         bld.sopp(aco_opcode::s_nop, 0);
         if (!before_endpgm)
            bld.sopp(aco_opcode::s_setprio, 2);
      }

      block.instructions = std::move(new_instructions);
   }
}

} /* end namespace */

void
insert_NOPs(Program* program)
{
   if (program->gfx_level >= GFX11)
      mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program);
   else if (program->gfx_level >= GFX10_3)
      ; /* no hazards/bugs to mitigate */
   else if (program->gfx_level >= GFX10)
      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
   else
      mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);

   if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
                                         program->stage.hw == AC_HW_PIXEL_SHADER))
      required_export_priority(program);
}

} // namespace aco