xref: /aosp_15_r20/external/mesa3d/src/amd/compiler/aco_insert_delay_alu.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9 
10 #include <map>
11 #include <stack>
12 #include <vector>
13 
14 namespace aco {
15 
16 namespace {
17 
18 /* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
19  * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
20  * that we should switch to a different wave and contains info on dependencies as to
21  * when we can switch back.
22  *
23  * This seems to apply only for ALU->ALU dependencies as other instructions have better
24  * integration with the frontend.
25  *
26  * Note that if we do not emit s_delay_alu things will still be correct, but the wave
27  * will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
28  * I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
29  * instructions can take a different number of cycles based on the exec mask)
30  */
31 struct alu_delay_info {
32    /* These are the values directly above the max representable value, i.e. the wait
33     * would turn into a no-op when we try to wait for something further back than
34     * this.
35     */
36    static constexpr int8_t valu_nop = 5;
37    static constexpr int8_t trans_nop = 4;
38 
39    /* How many VALU instructions ago this value was written */
40    int8_t valu_instrs = valu_nop;
41    /* Cycles until the writing VALU instruction is finished */
42    int8_t valu_cycles = 0;
43 
44    /* How many Transcedent instructions ago this value was written */
45    int8_t trans_instrs = trans_nop;
46    /* Cycles until the writing Transcendent instruction is finished */
47    int8_t trans_cycles = 0;
48 
49    /* Cycles until the writing SALU instruction is finished*/
50    int8_t salu_cycles = 0;
51 
combineaco::__anon2031e1fc0111::alu_delay_info52    bool combine(const alu_delay_info& other)
53    {
54       bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
55                      other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
56                      other.trans_cycles > trans_cycles;
57       valu_instrs = std::min(valu_instrs, other.valu_instrs);
58       trans_instrs = std::min(trans_instrs, other.trans_instrs);
59       salu_cycles = std::max(salu_cycles, other.salu_cycles);
60       valu_cycles = std::max(valu_cycles, other.valu_cycles);
61       trans_cycles = std::max(trans_cycles, other.trans_cycles);
62       return changed;
63    }
64 
65    /* Needs to be called after any change to keep the data consistent. */
fixupaco::__anon2031e1fc0111::alu_delay_info66    bool fixup()
67    {
68       if (valu_instrs >= valu_nop || valu_cycles <= 0) {
69          valu_instrs = valu_nop;
70          valu_cycles = 0;
71       }
72 
73       if (trans_instrs >= trans_nop || trans_cycles <= 0) {
74          trans_instrs = trans_nop;
75          trans_cycles = 0;
76       }
77 
78       salu_cycles = std::max<int8_t>(salu_cycles, 0);
79 
80       return empty();
81    }
82 
83    /* Returns true if a wait would be a no-op */
emptyaco::__anon2031e1fc0111::alu_delay_info84    bool empty() const
85    {
86       return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
87    }
88 
printaco::__anon2031e1fc0111::alu_delay_info89    UNUSED void print(FILE* output) const
90    {
91       if (valu_instrs != valu_nop)
92          fprintf(output, "valu_instrs: %u\n", valu_instrs);
93       if (valu_cycles)
94          fprintf(output, "valu_cycles: %u\n", valu_cycles);
95       if (trans_instrs != trans_nop)
96          fprintf(output, "trans_instrs: %u\n", trans_instrs);
97       if (trans_cycles)
98          fprintf(output, "trans_cycles: %u\n", trans_cycles);
99       if (salu_cycles)
100          fprintf(output, "salu_cycles: %u\n", salu_cycles);
101    }
102 };
103 
104 struct delay_ctx {
105    Program* program;
106    std::map<PhysReg, alu_delay_info> gpr_map;
107 
delay_ctxaco::__anon2031e1fc0111::delay_ctx108    delay_ctx() {}
delay_ctxaco::__anon2031e1fc0111::delay_ctx109    delay_ctx(Program* program_) : program(program_) {}
110 
joinaco::__anon2031e1fc0111::delay_ctx111    bool join(const delay_ctx* other)
112    {
113       bool changed = false;
114       for (const auto& entry : other->gpr_map) {
115          using iterator = std::map<PhysReg, alu_delay_info>::iterator;
116          const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
117          if (insert_pair.second)
118             changed = true;
119          else
120             changed |= insert_pair.first->second.combine(entry.second);
121       }
122 
123       return changed;
124    }
125 
printaco::__anon2031e1fc0111::delay_ctx126    UNUSED void print(FILE* output) const
127    {
128       for (const auto& entry : gpr_map) {
129          fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
130                  entry.first.reg() & 0xff);
131          entry.second.print(output);
132          fprintf(output, "}\n");
133       }
134    }
135 };
136 
137 void
check_alu(delay_ctx & ctx,alu_delay_info & delay,Instruction * instr)138 check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
139 {
140    for (const Operand op : instr->operands) {
141       if (op.isConstant() || op.isUndefined())
142          continue;
143 
144       /* check consecutively read gprs */
145       for (unsigned j = 0; j < op.size(); j++) {
146          std::map<PhysReg, alu_delay_info>::iterator it =
147             ctx.gpr_map.find(PhysReg{op.physReg() + j});
148          if (it != ctx.gpr_map.end())
149             delay.combine(it->second);
150       }
151    }
152 }
153 
154 bool
parse_delay_alu(delay_ctx & ctx,alu_delay_info & delay,Instruction * instr)155 parse_delay_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
156 {
157    if (instr->opcode != aco_opcode::s_delay_alu)
158       return false;
159 
160    unsigned imm[2] = {instr->salu().imm & 0xf, (instr->salu().imm >> 7) & 0xf};
161    for (unsigned i = 0; i < 2; ++i) {
162       alu_delay_wait wait = (alu_delay_wait)imm[i];
163       if (wait >= alu_delay_wait::VALU_DEP_1 && wait <= alu_delay_wait::VALU_DEP_4)
164          delay.valu_instrs = imm[i] - (uint32_t)alu_delay_wait::VALU_DEP_1 + 1;
165       else if (wait >= alu_delay_wait::TRANS32_DEP_1 && wait <= alu_delay_wait::TRANS32_DEP_3)
166          delay.trans_instrs = imm[i] - (uint32_t)alu_delay_wait::TRANS32_DEP_1 + 1;
167       else if (wait >= alu_delay_wait::SALU_CYCLE_1)
168          delay.salu_cycles = imm[i] - (uint32_t)alu_delay_wait::SALU_CYCLE_1 + 1;
169    }
170 
171    delay.valu_cycles = instr->pass_flags & 0xffff;
172    delay.trans_cycles = instr->pass_flags >> 16;
173 
174    return true;
175 }
176 
177 void
update_alu(delay_ctx & ctx,bool is_valu,bool is_trans,int cycles)178 update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
179 {
180    std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
181    while (it != ctx.gpr_map.end()) {
182       alu_delay_info& entry = it->second;
183       entry.valu_instrs += is_valu ? 1 : 0;
184       entry.trans_instrs += is_trans ? 1 : 0;
185       entry.salu_cycles -= cycles;
186       entry.valu_cycles -= cycles;
187       entry.trans_cycles -= cycles;
188       it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
189    }
190 }
191 
192 void
kill_alu(alu_delay_info & delay,Instruction * instr,delay_ctx & ctx)193 kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
194 {
195    if (parse_vdst_wait(instr) == 0) {
196       std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
197       while (it != ctx.gpr_map.end()) {
198          alu_delay_info& entry = it->second;
199          entry.valu_instrs = alu_delay_info::valu_nop;
200          entry.trans_instrs = alu_delay_info::trans_nop;
201          it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
202       }
203    }
204 
205    if (instr->isVALU() || instr->isSALU())
206       check_alu(ctx, delay, instr);
207 
208    if (!delay.empty()) {
209       update_alu(ctx, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
210 
211       /* remove all gprs with higher counter from map */
212       std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
213       while (it != ctx.gpr_map.end()) {
214          if (delay.valu_instrs <= it->second.valu_instrs)
215             it->second.valu_instrs = alu_delay_info::valu_nop;
216          if (delay.trans_instrs <= it->second.trans_instrs)
217             it->second.trans_instrs = alu_delay_info::trans_nop;
218          it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
219       }
220    }
221 }
222 
223 void
gen_alu(Instruction * instr,delay_ctx & ctx)224 gen_alu(Instruction* instr, delay_ctx& ctx)
225 {
226    Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
227    bool is_valu = instr->isVALU();
228    bool is_trans = instr->isTrans();
229 
230    if (is_trans || is_valu || instr->isSALU()) {
231       alu_delay_info delay;
232       if (is_trans) {
233          delay.trans_instrs = 0;
234          delay.trans_cycles = cycle_info.latency;
235       } else if (is_valu) {
236          delay.valu_instrs = 0;
237          delay.valu_cycles = cycle_info.latency;
238       } else if (instr->isSALU()) {
239          delay.salu_cycles = cycle_info.latency;
240       }
241 
242       for (const Definition& def : instr->definitions) {
243          for (unsigned i = 0; i < def.size(); i++) {
244             auto it = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + i}, delay);
245             if (!it.second)
246                it.first->second.combine(delay);
247          }
248       }
249    }
250 
251    update_alu(ctx, is_valu && instr_info.classes[(int)instr->opcode] != instr_class::wmma, is_trans,
252               cycle_info.issue_cycles);
253 }
254 
255 void
emit_delay_alu(delay_ctx & ctx,std::vector<aco_ptr<Instruction>> & instructions,alu_delay_info & delay)256 emit_delay_alu(delay_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
257                alu_delay_info& delay)
258 {
259    uint32_t imm = 0;
260    if (delay.trans_instrs != delay.trans_nop) {
261       imm |= (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;
262    }
263 
264    if (delay.valu_instrs != delay.valu_nop) {
265       imm |= ((uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1) << (imm ? 7 : 0);
266    }
267 
268    /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
269     * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
270     * getting this wrong isn't an issue. */
271    if (delay.salu_cycles && imm <= 0xf) {
272       unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
273       imm |= ((uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1) << (imm ? 7 : 0);
274    }
275 
276    Instruction* inst = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
277    inst->salu().imm = imm;
278    inst->pass_flags = (delay.valu_cycles | (delay.trans_cycles << 16));
279    instructions.emplace_back(inst);
280    delay = alu_delay_info();
281 }
282 
283 void
handle_block(Program * program,Block & block,delay_ctx & ctx)284 handle_block(Program* program, Block& block, delay_ctx& ctx)
285 {
286    std::vector<aco_ptr<Instruction>> new_instructions;
287    alu_delay_info queued_delay;
288 
289    for (size_t i = 0; i < block.instructions.size(); i++) {
290       aco_ptr<Instruction>& instr = block.instructions[i];
291       bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());
292 
293       kill_alu(queued_delay, instr.get(), ctx);
294       gen_alu(instr.get(), ctx);
295 
296       if (!is_delay_alu) {
297          if (!queued_delay.empty())
298             emit_delay_alu(ctx, new_instructions, queued_delay);
299          new_instructions.emplace_back(std::move(instr));
300       }
301    }
302 
303    if (!queued_delay.empty())
304       emit_delay_alu(ctx, new_instructions, queued_delay);
305    block.instructions.swap(new_instructions);
306 }
307 
308 } /* end namespace */
309 
310 void
insert_delay_alu(Program * program)311 insert_delay_alu(Program* program)
312 {
313    /* per BB ctx */
314    std::vector<bool> done(program->blocks.size());
315    std::vector<delay_ctx> in_ctx(program->blocks.size(), delay_ctx(program));
316    std::vector<delay_ctx> out_ctx(program->blocks.size(), delay_ctx(program));
317 
318    std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
319    unsigned loop_progress = 0;
320 
321    for (unsigned i = 0; i < program->blocks.size();) {
322       Block& current = program->blocks[i++];
323 
324       if (current.kind & block_kind_discard_early_exit) {
325          /* Because the jump to the discard early exit block may happen anywhere in a block, it's
326           * not possible to join it with its predecessors this way.
327           */
328          continue;
329       }
330 
331       delay_ctx ctx = in_ctx[current.index];
332 
333       if (current.kind & block_kind_loop_header) {
334          loop_header_indices.push(current.index);
335       } else if (current.kind & block_kind_loop_exit) {
336          bool repeat = false;
337          if (loop_progress == loop_header_indices.size()) {
338             i = loop_header_indices.top();
339             repeat = true;
340          }
341          loop_header_indices.pop();
342          loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
343          if (repeat)
344             continue;
345       }
346 
347       bool changed = false;
348       for (unsigned b : current.linear_preds)
349          changed |= ctx.join(&out_ctx[b]);
350 
351       if (done[current.index] && !changed) {
352          in_ctx[current.index] = std::move(ctx);
353          continue;
354       } else {
355          in_ctx[current.index] = ctx;
356       }
357 
358       loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
359       done[current.index] = true;
360 
361       handle_block(program, current, ctx);
362 
363       out_ctx[current.index] = std::move(ctx);
364    }
365 }
366 
367 void
combine_delay_alu(Program * program)368 combine_delay_alu(Program* program)
369 {
370    /* Combine s_delay_alu using the skip field. */
371    for (Block& block : program->blocks) {
372       int i = 0;
373       int prev_delay_alu = -1;
374       for (aco_ptr<Instruction>& instr : block.instructions) {
375          if (instr->opcode != aco_opcode::s_delay_alu) {
376             block.instructions[i++] = std::move(instr);
377             continue;
378          }
379 
380          uint16_t imm = instr->salu().imm;
381          int skip = i - prev_delay_alu - 1;
382          if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
383             if (imm >> 7 == 0)
384                prev_delay_alu = i;
385             block.instructions[i++] = std::move(instr);
386             continue;
387          }
388 
389          block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
390          prev_delay_alu = -1;
391       }
392       block.instructions.resize(i);
393    }
394 }
395 
396 } // namespace aco
397