/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include <algorithm>
#include <map>
#include <vector>

namespace aco {
namespace {

struct phi_info_item {
   Definition def;
   Operand op;
};

struct ssa_elimination_ctx {
   /* The outer vectors should be indexed by block index. The inner vectors store phi information
    * for each block. */
   std::vector<std::vector<phi_info_item>> logical_phi_info;
   std::vector<std::vector<phi_info_item>> linear_phi_info;
   std::vector<bool> empty_blocks;
   std::vector<bool> blocks_incoming_exec_used;
   Program* program;

   ssa_elimination_ctx(Program* program_)
       : logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()),
         empty_blocks(program_->blocks.size(), true),
         blocks_incoming_exec_used(program_->blocks.size(), true), program(program_)
   {}
};

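/* For every phi, record a (definition <- operand) copy in the corresponding predecessor
 * block, skipping undefined operands and operands that already live in the phi's register.
 * Any predecessor that receives a copy is also marked as non-empty. */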
void
collect_phi_info(ssa_elimination_ctx& ctx)
{
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         for (unsigned i = 0; i < phi->operands.size(); i++) {
            if (phi->operands[i].isUndefined())
               continue;
            if (phi->operands[i].physReg() == phi->definitions[0].physReg())
               continue;

            assert(phi->definitions[0].size() == phi->operands[i].size());

            Block::edge_vec& preds =
               phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
            uint32_t pred_idx = preds[i];
            auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx]
                                                              : ctx.linear_phi_info[pred_idx];
            info_vec.push_back({phi->definitions[0], phi->operands[i]});
            ctx.empty_blocks[pred_idx] = false;
         }
      }
   }
}

void
insert_parallelcopies(ssa_elimination_ctx& ctx)
{
   /* insert the parallelcopies from logical phis before p_logical_end */
   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
      auto& logical_phi_info = ctx.logical_phi_info[block_idx];
      if (logical_phi_info.empty())
         continue;

      Block& block = ctx.program->blocks[block_idx];
      unsigned idx = block.instructions.size() - 1;
      while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
         assert(idx > 0);
         idx--;
      }

      std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
      aco_ptr<Instruction> pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                 logical_phi_info.size(), logical_phi_info.size())};
      unsigned i = 0;
      for (auto& phi_info : logical_phi_info) {
         pc->definitions[i] = phi_info.def;
         pc->operands[i] = phi_info.op;
         i++;
      }
      /* this shouldn't be needed since we're only copying vgprs */
      pc->pseudo().tmp_in_scc = false;
      block.instructions.insert(it, std::move(pc));
   }

   /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
      auto& linear_phi_info = ctx.linear_phi_info[block_idx];
      if (linear_phi_info.empty())
         continue;

      Block& block = ctx.program->blocks[block_idx];
      std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
      --it;
      assert((*it)->isBranch());
      PhysReg scratch_sgpr = (*it)->definitions[0].physReg();
      aco_ptr<Instruction> pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                 linear_phi_info.size(), linear_phi_info.size())};
      unsigned i = 0;
      for (auto& phi_info : linear_phi_info) {
         pc->definitions[i] = phi_info.def;
         pc->operands[i] = phi_info.op;
         i++;
      }
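      /* Lowering SGPR copies may need to write SCC, so note whether SCC is live at the
       * end of the block and reuse the branch's scratch SGPR definition. */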
      pc->pseudo().tmp_in_scc = block.scc_live_out;
      pc->pseudo().scratch_sgpr = scratch_sgpr;
      pc->pseudo().needs_scratch_reg = true;
      block.instructions.insert(it, std::move(pc));
   }
}

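/* A block counts as empty if it only contains phis, p_logical_start/end, the branch and
 * copies that are no-ops (same source and destination register). With ignore_exec_writes,
 * copies and s_andn2 that write exec are tolerated as well. */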
bool
is_empty_block(Block* block, bool ignore_exec_writes)
{
   /* check if this block is empty and the exec mask is not needed */
   for (aco_ptr<Instruction>& instr : block->instructions) {
      switch (instr->opcode) {
      case aco_opcode::p_linear_phi:
      case aco_opcode::p_phi:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_branch: break;
      case aco_opcode::p_parallelcopy:
         for (unsigned i = 0; i < instr->definitions.size(); i++) {
            if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
               continue;
            if (instr->definitions[i].physReg() != instr->operands[i].physReg())
               return false;
         }
         break;
      case aco_opcode::s_andn2_b64:
      case aco_opcode::s_andn2_b32:
         if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
            break;
         return false;
      default: return false;
      }
   }
   return true;
}

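/* Called for empty merge/loop-exit blocks with a single linear successor. If that
 * successor is a merge block, it restores exec anyway, so everything except the
 * branch can be dropped from this block. */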
void
try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
{
   /* check if the successor is another merge block which restores exec */
   // TODO: divergent loops also restore exec
   if (block->linear_succs.size() != 1 ||
       !(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge))
      return;

   /* check if this block is empty */
   if (!is_empty_block(block, true))
      return;

   /* keep the branch instruction and remove the rest */
   aco_ptr<Instruction> branch = std::move(block->instructions.back());
   block->instructions.clear();
   block->instructions.emplace_back(std::move(branch));
}

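/* Invert blocks sit between the two sides of a divergent if. Once both linear
 * successors are the same block, the invert block is no longer needed: point both
 * predecessors' branches directly at the successor and unlink this block. */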
void
try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
{
   assert(block->linear_succs.size() == 2);
   /* only remove this block if the successor got removed as well */
   if (block->linear_succs[0] != block->linear_succs[1])
      return;

   /* check if block is otherwise empty */
   if (!is_empty_block(block, true))
      return;

   unsigned succ_idx = block->linear_succs[0];
   assert(block->linear_preds.size() == 2);
   for (unsigned i = 0; i < 2; i++) {
      Block* pred = &ctx.program->blocks[block->linear_preds[i]];
      pred->linear_succs[0] = succ_idx;
      ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;

      Pseudo_branch_instruction& branch = pred->instructions.back()->branch();
      assert(branch.isBranch());
      branch.target[0] = succ_idx;
      branch.target[1] = succ_idx;
   }

   block->instructions.clear();
   block->linear_preds.clear();
   block->linear_succs.clear();
}

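/* Remove an empty block with a single linear predecessor by retargeting the
 * predecessor's branch to the successor. The cases below keep fall-through paths
 * intact and, for uniform break/continue blocks, invert the branch condition. */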
void
try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
{
   if (!is_empty_block(block, false))
      return;

   Block& pred = ctx.program->blocks[block->linear_preds[0]];
   Block& succ = ctx.program->blocks[block->linear_succs[0]];
   Pseudo_branch_instruction& branch = pred.instructions.back()->branch();
   if (branch.opcode == aco_opcode::p_branch) {
      branch.target[0] = succ.index;
      branch.target[1] = succ.index;
   } else if (branch.target[0] == block->index) {
      branch.target[0] = succ.index;
   } else if (branch.target[0] == succ.index) {
      assert(branch.target[1] == block->index);
      branch.target[1] = succ.index;
      branch.opcode = aco_opcode::p_branch;
      branch.rarely_taken = branch.never_taken = false;
   } else if (branch.target[1] == block->index) {
      /* check if there is a fall-through path from block to succ */
      bool falls_through = block->index < succ.index;
      for (unsigned j = block->index + 1; falls_through && j < succ.index; j++) {
         assert(ctx.program->blocks[j].index == j);
         if (!ctx.program->blocks[j].instructions.empty())
            falls_through = false;
      }
      if (falls_through) {
         branch.target[1] = succ.index;
      } else {
         /* check if there is a fall-through path for the alternative target */
         if (block->index >= branch.target[0])
            return;
         for (unsigned j = block->index + 1; j < branch.target[0]; j++) {
            if (!ctx.program->blocks[j].instructions.empty())
               return;
         }

         /* This is a (uniform) break or continue block. The branch condition has to be inverted. */
         if (branch.opcode == aco_opcode::p_cbranch_z)
            branch.opcode = aco_opcode::p_cbranch_nz;
         else if (branch.opcode == aco_opcode::p_cbranch_nz)
            branch.opcode = aco_opcode::p_cbranch_z;
         else
            assert(false);
         /* also invert the linear successors */
         pred.linear_succs[0] = pred.linear_succs[1];
         pred.linear_succs[1] = succ.index;
         branch.target[1] = branch.target[0];
         branch.target[0] = succ.index;
      }
   } else {
      assert(false);
   }

   if (branch.target[0] == branch.target[1]) {
      while (branch.operands.size())
         branch.operands.pop_back();

      branch.opcode = aco_opcode::p_branch;
      branch.rarely_taken = branch.never_taken = false;
   }

   for (unsigned i = 0; i < pred.linear_succs.size(); i++)
      if (pred.linear_succs[i] == block->index)
         pred.linear_succs[i] = succ.index;

   for (unsigned i = 0; i < succ.linear_preds.size(); i++)
      if (succ.linear_preds[i] == block->index)
         succ.linear_preds[i] = pred.index;

   block->instructions.clear();
   block->linear_preds.clear();
   block->linear_succs.clear();
}

bool
instr_writes_exec(Instruction* instr)
{
   for (Definition& def : instr->definitions)
      if (def.physReg() == exec || def.physReg() == exec_hi)
         return true;

   return false;
}

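/* Treat each register as the half-open dword interval [physReg, physReg + size)
 * and test the two intervals for overlap. */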
template <typename T, typename U>
bool
regs_intersect(const T& a, const U& b)
{
   const unsigned a_lo = a.physReg();
   const unsigned a_hi = a_lo + a.size();
   const unsigned b_lo = b.physReg();
   const unsigned b_hi = b_lo + b.size();

   return a_hi > b_lo && b_hi > a_lo;
}

void
try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const int exec_val_idx,
                                const int exec_copy_idx)
{
   /* Try to optimize the branching sequence at the end of a block.
    *
    * We are looking for blocks that look like this:
    *
    * BB:
    * ... instructions ...
    * s[N:M] = <exec_val instruction>
    * ... other instructions that don't depend on exec ...
    * p_logical_end
    * exec = <exec_copy instruction> s[N:M]
    * p_cbranch exec
    *
    * The main motivation is to eliminate exec_copy.
    * Depending on the context, we try to do the following:
    *
    * 1. Reassign exec_val to write exec directly
    * 2. If possible, eliminate exec_copy
    * 3. When exec_copy also saves the old exec mask, insert a
    *    new copy instruction before exec_val
    * 4. Reassign any instruction that used s[N:M] to use exec
    *
    * This is beneficial for the following reasons:
    *
    * - Fewer instructions in the block when exec_copy can be eliminated
    * - As a result, when exec_val is VOPC this also improves the stalls
    *   due to SALU waiting for VALU. This works best when we can also
    *   remove the branching instruction, in which case the stall
    *   is entirely eliminated.
    * - When exec_copy can't be removed, the reassignment may still be
    *   very slightly beneficial to latency.
    */

   aco_ptr<Instruction>& exec_val = block.instructions[exec_val_idx];
   aco_ptr<Instruction>& exec_copy = block.instructions[exec_copy_idx];

   const aco_opcode and_saveexec = ctx.program->lane_mask == s2 ? aco_opcode::s_and_saveexec_b64
                                                                : aco_opcode::s_and_saveexec_b32;

   if (exec_copy->opcode != and_saveexec && exec_copy->opcode != aco_opcode::p_parallelcopy)
      return;

   if (exec_val->definitions.size() > 1)
      return;

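   /* On GFX10+, v_cmpx writes only exec; on earlier chips it also writes its
    * SGPR destination. */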
   const bool vcmpx_exec_only = ctx.program->gfx_level >= GFX10;

   /* Check if a suitable v_cmpx opcode exists. */
   const aco_opcode v_cmpx_op =
      exec_val->isVOPC() ? get_vcmpx(exec_val->opcode) : aco_opcode::num_opcodes;
   const bool vopc = v_cmpx_op != aco_opcode::num_opcodes;

   /* V_CMPX+DPP returns 0 with reads from disabled lanes, unlike V_CMP+DPP (RDNA3 ISA doc, 7.7) */
   if (vopc && exec_val->isDPP())
      return;

   /* If s_and_saveexec is used, we'll need to insert a new instruction to save the old exec. */
   bool save_original_exec = exec_copy->opcode == and_saveexec;

   const Definition exec_wr_def = exec_val->definitions[0];
   const Definition exec_copy_def = exec_copy->definitions[0];

   if (save_original_exec) {
      for (int i = exec_copy_idx - 1; i >= 0; i--) {
         const aco_ptr<Instruction>& instr = block.instructions[i];
         if (instr->opcode == aco_opcode::p_parallelcopy &&
             instr->definitions[0].physReg() == exec &&
             instr->definitions[0].regClass() == ctx.program->lane_mask &&
             instr->operands[0].physReg() == exec_copy_def.physReg()) {
            /* The register that we should save exec to already contains the same value as exec. */
            save_original_exec = false;
            break;
         }
         /* exec_copy_def is clobbered or exec written before we found a copy. */
         if ((i != exec_val_idx || !vcmpx_exec_only) &&
             std::any_of(instr->definitions.begin(), instr->definitions.end(),
                         [&exec_copy_def, &ctx](const Definition& def) -> bool
                         {
                            return regs_intersect(exec_copy_def, def) ||
                                   regs_intersect(Definition(exec, ctx.program->lane_mask), def);
                         }))
            break;

         if (instr->isPseudo() && instr->pseudo().needs_scratch_reg &&
             regs_intersect(exec_copy_def, Definition(instr->pseudo().scratch_sgpr, s1)))
            break;
      }
   }

   /* Position where the original exec mask copy should be inserted. */
   const int save_original_exec_idx = exec_val_idx;
   /* The copy can be removed when it kills its operand.
    * v_cmpx also writes the original destination pre GFX10.
    */
   const bool can_remove_copy = exec_copy->operands[0].isKill() || (vopc && !vcmpx_exec_only);

   /* Always allow reassigning when the value is written by (usable) VOPC.
    * Note, VOPC implicitly contains "& exec" because it yields zero on inactive lanes.
    * Additionally, when the value is copied as-is, also allow SALU and parallelcopies.
    */
   const bool can_reassign =
      vopc || (exec_copy->opcode == aco_opcode::p_parallelcopy &&
               (exec_val->isSALU() || exec_val->opcode == aco_opcode::p_parallelcopy ||
                exec_val->opcode == aco_opcode::p_create_vector));

   /* The reassignment is not worth it when both the original exec needs to be copied
    * and the new exec copy can't be removed. In this case we'd end up with more instructions.
    */
   if (!can_reassign || (save_original_exec && !can_remove_copy))
      return;

   /* When exec_val and exec_copy are non-adjacent, check whether there are any
    * instructions in between (besides p_logical_end) which may inhibit the optimization.
    */
   for (int idx = exec_val_idx + 1; idx < exec_copy_idx; ++idx) {
      aco_ptr<Instruction>& instr = block.instructions[idx];

      if (save_original_exec) {
         /* Check if the instruction uses the exec_copy_def register, in which case we can't
          * optimize. */
         for (const Operand& op : instr->operands)
            if (regs_intersect(exec_copy_def, op))
               return;
         for (const Definition& def : instr->definitions)
            if (regs_intersect(exec_copy_def, def))
               return;
         if (instr->isPseudo() && instr->pseudo().needs_scratch_reg &&
             regs_intersect(exec_copy_def, Definition(instr->pseudo().scratch_sgpr, s1)))
            return;
      }

      /* Check if the instruction may implicitly read VCC, e.g. v_cndmask or add with carry.
       * Rewriting these operands may require format conversion because of encoding limitations.
       */
      if (exec_wr_def.physReg() == vcc && instr->isVALU() && instr->operands.size() >= 3 &&
          !instr->isVOP3())
         return;
   }

   if (save_original_exec) {
      /* We insert the exec copy before exec_val, so exec_val can't use those registers. */
      for (const Operand& op : exec_val->operands)
         if (regs_intersect(exec_copy_def, op))
            return;
      /* We would write over the saved exec value in this case. */
      if (((vopc && !vcmpx_exec_only) || !can_remove_copy) &&
          regs_intersect(exec_copy_def, exec_wr_def))
         return;
   }

   if (vopc) {
      /* Add one extra definition for exec and copy the VOP3-specific fields if present. */
      if (!vcmpx_exec_only) {
         if (exec_val->isSDWA()) {
            /* This might work but it needs testing and more code to copy the instruction. */
            return;
         } else {
            aco_ptr<Instruction> tmp = std::move(exec_val);
            exec_val.reset(create_instruction(tmp->opcode, tmp->format, tmp->operands.size(),
                                              tmp->definitions.size() + 1));
            std::copy(tmp->operands.cbegin(), tmp->operands.cend(), exec_val->operands.begin());
            std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(),
                      exec_val->definitions.begin());

            VALU_instruction& src = tmp->valu();
            VALU_instruction& dst = exec_val->valu();
            dst.opsel = src.opsel;
            dst.omod = src.omod;
            dst.clamp = src.clamp;
            dst.neg = src.neg;
            dst.abs = src.abs;
         }
      }

      /* Set v_cmpx opcode. */
      exec_val->opcode = v_cmpx_op;

      *exec_val->definitions.rbegin() = Definition(exec, ctx.program->lane_mask);

      /* Change instruction from VOP3 to plain VOPC when possible. */
      if (vcmpx_exec_only && !exec_val->usesModifiers() &&
          (exec_val->operands.size() < 2 || exec_val->operands[1].isOfType(RegType::vgpr)))
         exec_val->format = Format::VOPC;
   } else {
      /* Reassign the instruction to write exec directly. */
      exec_val->definitions[0] = Definition(exec, ctx.program->lane_mask);
   }

   /* If there are other instructions (besides p_logical_end) between
    * writing the value and copying it to exec, reassign uses
    * of the old definition.
    */
   for (int idx = exec_val_idx + 1; idx < exec_copy_idx; ++idx) {
      aco_ptr<Instruction>& instr = block.instructions[idx];
      for (Operand& op : instr->operands) {
         if (op.physReg() == exec_wr_def.physReg())
            op = Operand(exec, op.regClass());
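         /* With a 64-bit lane mask, reads of the high dword of the old
          * destination must be redirected to exec_hi as well. */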
         if (exec_wr_def.size() == 2 && op.physReg() == exec_wr_def.physReg().advance(4))
            op = Operand(exec_hi, op.regClass());
      }
   }

   if (can_remove_copy) {
      /* Remove the copy. */
      exec_copy.reset();
   } else {
      /* Reassign the copy to write the register of the original value. */
      exec_copy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1));
      exec_copy->definitions[0] = exec_wr_def;
      exec_copy->operands[0] = Operand(exec, ctx.program->lane_mask);
   }

   bool has_nonzero_op =
      std::any_of(exec_val->operands.begin(), exec_val->operands.end(),
                  [](const Operand& op) -> bool { return op.isConstant() && op.constantValue(); });
   if (exec_val->isPseudo() && has_nonzero_op) {
      /* Remove the branch instruction when exec is constant non-zero. */
      aco_ptr<Instruction>& branch = block.instructions.back();
      if (branch->opcode == aco_opcode::p_cbranch_z && branch->operands[0].physReg() == exec)
         block.instructions.back().reset();
   }

   if (save_original_exec) {
      /* Insert a new instruction that saves the original exec before it is overwritten.
       * Do this last, because inserting in the instructions vector may invalidate the exec_val
       * reference.
       */
      const auto it = std::next(block.instructions.begin(), save_original_exec_idx);
      aco_ptr<Instruction> copy(
         create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1));
      copy->definitions[0] = exec_copy_def;
      copy->operands[0] = Operand(exec, ctx.program->lane_mask);
      block.instructions.insert(it, std::move(copy));
   }
}

void
eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
{
   /* Check if any successor needs the outgoing exec mask from the current block. */

   bool exec_write_used;
   if (block.kind & block_kind_end_with_regs) {
      /* The last block of a program with a succeeding shader part must keep its final exec write. */
      exec_write_used = true;
   } else {
      bool copy_to_exec = false;
      bool copy_from_exec = false;

      for (const auto& successor_phi_info : ctx.linear_phi_info[block.index]) {
         copy_to_exec |= successor_phi_info.def.physReg() == exec;
         copy_from_exec |= successor_phi_info.op.physReg() == exec;
      }

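      /* A linear phi that reads exec keeps the outgoing mask alive; if the phis only
       * write exec, the successors will overwrite the mask and the current value is dead. */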
      if (copy_from_exec)
         exec_write_used = true;
      else if (copy_to_exec)
         exec_write_used = false;
      else
         /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
         exec_write_used =
            std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
                        [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
   }

   /* Collect information about the branching sequence. */

   bool branch_exec_val_found = false;
   int branch_exec_val_idx = -1;
   int branch_exec_copy_idx = -1;
   unsigned branch_exec_tempid = 0;

   /* Go through all instructions and eliminate useless exec writes. */

   for (int i = block.instructions.size() - 1; i >= 0; --i) {
      aco_ptr<Instruction>& instr = block.instructions[i];

      /* We already take information from phis into account before the loop, so let's just break on
       * phis. */
      if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
         break;

      /* See if the current instruction needs or writes exec. */
      bool needs_exec =
         needs_exec_mask(instr.get()) ||
         (instr->opcode == aco_opcode::p_logical_end && !ctx.logical_phi_info[block.index].empty());
      bool writes_exec = instr_writes_exec(instr.get());

      /* See if we found an unused exec write. */
      if (writes_exec && !exec_write_used) {
         /* Don't eliminate an instruction that writes registers other than exec and scc.
          * It is possible that this is e.g. an s_and_saveexec and the saved value is
          * used by a later branch.
          */
         bool writes_other = std::any_of(instr->definitions.begin(), instr->definitions.end(),
                                         [](const Definition& def) -> bool
                                         { return def.physReg() != exec && def.physReg() != scc; });
         if (!writes_other) {
            instr.reset();
            continue;
         }
      }

      /* For a newly encountered exec write, clear the used flag. */
      if (writes_exec) {
         if (instr->operands.size() && !branch_exec_val_found) {
            /* We are in a branch that jumps according to exec.
             * We just found the instruction that copies to exec before the branch.
             */
            assert(branch_exec_copy_idx == -1);
            branch_exec_copy_idx = i;
            branch_exec_tempid = instr->operands[0].tempId();
            branch_exec_val_found = true;
         } else if (branch_exec_val_idx == -1) {
            /* The current instruction overwrites exec before branch_exec_val_idx was
             * found, therefore we can't optimize the branching sequence.
             */
            branch_exec_copy_idx = -1;
            branch_exec_tempid = 0;
         }

         exec_write_used = false;
      } else if (branch_exec_tempid && instr->definitions.size() &&
                 instr->definitions[0].tempId() == branch_exec_tempid) {
         /* We just found the instruction that produces the exec mask that is copied. */
         assert(branch_exec_val_idx == -1);
         branch_exec_val_idx = i;
      } else if (branch_exec_tempid && branch_exec_val_idx == -1 && needs_exec) {
         /* There is an instruction that needs the original exec mask before
          * branch_exec_val_idx was found, so we can't optimize the branching sequence. */
         branch_exec_copy_idx = -1;
         branch_exec_tempid = 0;
      }

      /* If the current instruction needs exec, mark it as used. */
      exec_write_used |= needs_exec;
   }

   /* Remember if the current block needs an incoming exec mask from its predecessors. */
   ctx.blocks_incoming_exec_used[block.index] = exec_write_used;

   /* See if we can optimize the instruction that produces the exec mask. */
   if (branch_exec_val_idx != -1) {
      assert(branch_exec_tempid && branch_exec_copy_idx != -1);
      try_optimize_branching_sequence(ctx, block, branch_exec_val_idx, branch_exec_copy_idx);
   }

   /* Cleanup: remove deleted instructions from the vector. */
   auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
   block.instructions.resize(new_end - block.instructions.begin());
}

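/* Process blocks from back to front so that each block's successors are visited first
 * (blocks_incoming_exec_used must be up to date), pruning dead exec writes and then
 * removing blocks that are still marked empty. */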
void
jump_threading(ssa_elimination_ctx& ctx)
{
   for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
      Block* block = &ctx.program->blocks[i];
      eliminate_useless_exec_writes_in_block(ctx, *block);

      if (!ctx.empty_blocks[i])
         continue;

      if (block->kind & block_kind_invert) {
         try_remove_invert_block(ctx, block);
         continue;
      }

      if (block->linear_succs.size() > 1)
         continue;

      if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit)
         try_remove_merge_block(ctx, block);

      if (block->linear_preds.size() == 1)
         try_remove_simple_block(ctx, block);
   }
}

} /* end namespace */

void
ssa_elimination(Program* program)
{
   ssa_elimination_ctx ctx(program);

   /* Collect information about every phi-instruction */
   collect_phi_info(ctx);

   /* eliminate empty blocks */
   jump_threading(ctx);

   /* insert parallelcopies from SSA elimination */
   insert_parallelcopies(ctx);
}
} // namespace aco