/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include <algorithm>
#include <map>
#include <vector>

namespace aco {
namespace {

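/* One phi-resolution copy: the phi's definition paired with the operand that
 * arrives from a particular predecessor block. */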
struct phi_info_item {
   Definition def;
   Operand op;
};

struct ssa_elimination_ctx {
   /* The outer vectors should be indexed by block index. The inner vectors store phi information
    * for each block. */
   std::vector<std::vector<phi_info_item>> logical_phi_info;
   std::vector<std::vector<phi_info_item>> linear_phi_info;
   std::vector<bool> empty_blocks;
   std::vector<bool> blocks_incoming_exec_used;
   Program* program;

   ssa_elimination_ctx(Program* program_)
       : logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()),
         empty_blocks(program_->blocks.size(), true),
         blocks_incoming_exec_used(program_->blocks.size(), true), program(program_)
   {}
};

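/* Gather, for every block, the copies implied by the phis of its successors.
 * A block is marked non-empty as soon as one of its successors' phis requires
 * a copy from it. */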
void
collect_phi_info(ssa_elimination_ctx& ctx)
{
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         for (unsigned i = 0; i < phi->operands.size(); i++) {
            if (phi->operands[i].isUndefined())
               continue;
            if (phi->operands[i].physReg() == phi->definitions[0].physReg())
               continue;

            assert(phi->definitions[0].size() == phi->operands[i].size());

            Block::edge_vec& preds =
               phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
            uint32_t pred_idx = preds[i];
            auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx]
                                                              : ctx.linear_phi_info[pred_idx];
            info_vec.push_back({phi->definitions[0], phi->operands[i]});
            ctx.empty_blocks[pred_idx] = false;
         }
      }
   }
}

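/* Materialize the collected phi copies as p_parallelcopy instructions in the predecessor
 * blocks: logical-phi copies go right before p_logical_end, linear-phi copies right before
 * the final branch. */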
void
insert_parallelcopies(ssa_elimination_ctx& ctx)
{
   /* insert the parallelcopies from logical phis before p_logical_end */
   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
      auto& logical_phi_info = ctx.logical_phi_info[block_idx];
      if (logical_phi_info.empty())
         continue;

      Block& block = ctx.program->blocks[block_idx];
      unsigned idx = block.instructions.size() - 1;
      while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
         assert(idx > 0);
         idx--;
      }

      std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
      aco_ptr<Instruction> pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                 logical_phi_info.size(), logical_phi_info.size())};
      unsigned i = 0;
      for (auto& phi_info : logical_phi_info) {
         pc->definitions[i] = phi_info.def;
         pc->operands[i] = phi_info.op;
         i++;
      }
      /* this shouldn't be needed since we're only copying vgprs */
      pc->pseudo().tmp_in_scc = false;
      block.instructions.insert(it, std::move(pc));
   }

   /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
   for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
      auto& linear_phi_info = ctx.linear_phi_info[block_idx];
      if (linear_phi_info.empty())
         continue;

      Block& block = ctx.program->blocks[block_idx];
      std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
      --it;
      assert((*it)->isBranch());
      PhysReg scratch_sgpr = (*it)->definitions[0].physReg();
      aco_ptr<Instruction> pc{create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO,
                                                 linear_phi_info.size(), linear_phi_info.size())};
      unsigned i = 0;
      for (auto& phi_info : linear_phi_info) {
         pc->definitions[i] = phi_info.def;
         pc->operands[i] = phi_info.op;
         i++;
      }
      pc->pseudo().tmp_in_scc = block.scc_live_out;
      pc->pseudo().scratch_sgpr = scratch_sgpr;
      pc->pseudo().needs_scratch_reg = true;
      block.instructions.insert(it, std::move(pc));
   }
}

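/* A block counts as empty if it contains only phis, p_logical_start/end, the branch and
 * no-op parallelcopies. With ignore_exec_writes, writes to exec are tolerated as well. */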
bool
is_empty_block(Block* block, bool ignore_exec_writes)
{
   /* check if this block is empty and the exec mask is not needed */
   for (aco_ptr<Instruction>& instr : block->instructions) {
      switch (instr->opcode) {
      case aco_opcode::p_linear_phi:
      case aco_opcode::p_phi:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_branch: break;
      case aco_opcode::p_parallelcopy:
         for (unsigned i = 0; i < instr->definitions.size(); i++) {
            if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
               continue;
            if (instr->definitions[i].physReg() != instr->operands[i].physReg())
               return false;
         }
         break;
      case aco_opcode::s_andn2_b64:
      case aco_opcode::s_andn2_b32:
         if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
            break;
         return false;
      default: return false;
      }
   }
   return true;
}

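/* Strip an empty merge block down to its branch when its single linear successor is
 * another merge block, since that successor restores exec anyway. */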
void
try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
{
   /* check if the successor is another merge block which restores exec */
   // TODO: divergent loops also restore exec
   if (block->linear_succs.size() != 1 ||
       !(ctx.program->blocks[block->linear_succs[0]].kind & block_kind_merge))
      return;

   /* check if this block is empty */
   if (!is_empty_block(block, true))
      return;

   /* keep the branch instruction and remove the rest */
   aco_ptr<Instruction> branch = std::move(block->instructions.back());
   block->instructions.clear();
   block->instructions.emplace_back(std::move(branch));
}

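/* Remove an empty invert block whose two linear successors have collapsed into one,
 * rerouting both predecessors' branches straight to that successor. */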
void
try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
{
   assert(block->linear_succs.size() == 2);
   /* only remove this block if the successor got removed as well */
   if (block->linear_succs[0] != block->linear_succs[1])
      return;

   /* check if block is otherwise empty */
   if (!is_empty_block(block, true))
      return;

   unsigned succ_idx = block->linear_succs[0];
   assert(block->linear_preds.size() == 2);
   for (unsigned i = 0; i < 2; i++) {
      Block* pred = &ctx.program->blocks[block->linear_preds[i]];
      pred->linear_succs[0] = succ_idx;
      ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;

      Pseudo_branch_instruction& branch = pred->instructions.back()->branch();
      assert(branch.isBranch());
      branch.target[0] = succ_idx;
      branch.target[1] = succ_idx;
   }

   block->instructions.clear();
   block->linear_preds.clear();
   block->linear_succs.clear();
}

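/* Remove an empty block with a single linear predecessor and successor by retargeting the
 * predecessor's branch around it, inverting the branch condition if necessary. */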
void
try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
{
   if (!is_empty_block(block, false))
      return;

   Block& pred = ctx.program->blocks[block->linear_preds[0]];
   Block& succ = ctx.program->blocks[block->linear_succs[0]];
   Pseudo_branch_instruction& branch = pred.instructions.back()->branch();
   if (branch.opcode == aco_opcode::p_branch) {
      branch.target[0] = succ.index;
      branch.target[1] = succ.index;
   } else if (branch.target[0] == block->index) {
      branch.target[0] = succ.index;
   } else if (branch.target[0] == succ.index) {
      assert(branch.target[1] == block->index);
      branch.target[1] = succ.index;
      branch.opcode = aco_opcode::p_branch;
      branch.rarely_taken = branch.never_taken = false;
   } else if (branch.target[1] == block->index) {
      /* check if there is a fall-through path from block to succ */
      bool falls_through = block->index < succ.index;
      for (unsigned j = block->index + 1; falls_through && j < succ.index; j++) {
         assert(ctx.program->blocks[j].index == j);
         if (!ctx.program->blocks[j].instructions.empty())
            falls_through = false;
      }
      if (falls_through) {
         branch.target[1] = succ.index;
      } else {
         /* check if there is a fall-through path for the alternative target */
         if (block->index >= branch.target[0])
            return;
         for (unsigned j = block->index + 1; j < branch.target[0]; j++) {
            if (!ctx.program->blocks[j].instructions.empty())
               return;
         }

         /* This is a (uniform) break or continue block. The branch condition has to be inverted. */
         if (branch.opcode == aco_opcode::p_cbranch_z)
            branch.opcode = aco_opcode::p_cbranch_nz;
         else if (branch.opcode == aco_opcode::p_cbranch_nz)
            branch.opcode = aco_opcode::p_cbranch_z;
         else
            assert(false);
         /* also invert the linear successors */
         pred.linear_succs[0] = pred.linear_succs[1];
         pred.linear_succs[1] = succ.index;
         branch.target[1] = branch.target[0];
         branch.target[0] = succ.index;
      }
   } else {
      assert(false);
   }

   if (branch.target[0] == branch.target[1]) {
      while (branch.operands.size())
         branch.operands.pop_back();

      branch.opcode = aco_opcode::p_branch;
      branch.rarely_taken = branch.never_taken = false;
   }

   for (unsigned i = 0; i < pred.linear_succs.size(); i++)
      if (pred.linear_succs[i] == block->index)
         pred.linear_succs[i] = succ.index;

   for (unsigned i = 0; i < succ.linear_preds.size(); i++)
      if (succ.linear_preds[i] == block->index)
         succ.linear_preds[i] = pred.index;

   block->instructions.clear();
   block->linear_preds.clear();
   block->linear_succs.clear();
}

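/* Returns true if any definition of the instruction writes exec_lo or exec_hi. */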
bool
instr_writes_exec(Instruction* instr)
{
   for (Definition& def : instr->definitions)
      if (def.physReg() == exec || def.physReg() == exec_hi)
         return true;

   return false;
}

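/* Returns true if the physical register ranges of a and b overlap. */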
template <typename T, typename U>
bool
regs_intersect(const T& a, const U& b)
{
   const unsigned a_lo = a.physReg();
   const unsigned a_hi = a_lo + a.size();
   const unsigned b_lo = b.physReg();
   const unsigned b_hi = b_lo + b.size();

   return a_hi > b_lo && b_hi > a_lo;
}

void
try_optimize_branching_sequence(ssa_elimination_ctx& ctx, Block& block, const int exec_val_idx,
                                const int exec_copy_idx)
{
   /* Try to optimize the branching sequence at the end of a block.
    *
    * We are looking for blocks that look like this:
    *
    * BB:
    * ... instructions ...
    * s[N:M] = <exec_val instruction>
    * ... other instructions that don't depend on exec ...
    * p_logical_end
    * exec = <exec_copy instruction> s[N:M]
    * p_cbranch exec
    *
    * The main motivation is to eliminate exec_copy.
    * Depending on the context, we try to do the following:
    *
    * 1. Reassign exec_val to write exec directly
    * 2. If possible, eliminate exec_copy
    * 3. When exec_copy also saves the old exec mask, insert a
    *    new copy instruction before exec_val
    * 4. Reassign any instruction that used s[N:M] to use exec
    *
    * This is beneficial for the following reasons:
    *
    * - Fewer instructions in the block when exec_copy can be eliminated
    * - As a result, when exec_val is VOPC this also reduces the stalls
    *   caused by SALU waiting for VALU. This works best when we can also
    *   remove the branching instruction, in which case the stall
    *   is entirely eliminated.
    * - When exec_copy can't be removed, the reassignment may still be
    *   very slightly beneficial to latency.
    */

   aco_ptr<Instruction>& exec_val = block.instructions[exec_val_idx];
   aco_ptr<Instruction>& exec_copy = block.instructions[exec_copy_idx];

   const aco_opcode and_saveexec = ctx.program->lane_mask == s2 ? aco_opcode::s_and_saveexec_b64
                                                                : aco_opcode::s_and_saveexec_b32;

   if (exec_copy->opcode != and_saveexec && exec_copy->opcode != aco_opcode::p_parallelcopy)
      return;

   if (exec_val->definitions.size() > 1)
      return;

   const bool vcmpx_exec_only = ctx.program->gfx_level >= GFX10;

   /* Check if a suitable v_cmpx opcode exists. */
   const aco_opcode v_cmpx_op =
      exec_val->isVOPC() ? get_vcmpx(exec_val->opcode) : aco_opcode::num_opcodes;
   const bool vopc = v_cmpx_op != aco_opcode::num_opcodes;

   /* V_CMPX+DPP returns 0 with reads from disabled lanes, unlike V_CMP+DPP (RDNA3 ISA doc, 7.7) */
   if (vopc && exec_val->isDPP())
      return;

   /* If s_and_saveexec is used, we'll need to insert a new instruction to save the old exec. */
   bool save_original_exec = exec_copy->opcode == and_saveexec;

   const Definition exec_wr_def = exec_val->definitions[0];
   const Definition exec_copy_def = exec_copy->definitions[0];

   if (save_original_exec) {
      for (int i = exec_copy_idx - 1; i >= 0; i--) {
         const aco_ptr<Instruction>& instr = block.instructions[i];
         if (instr->opcode == aco_opcode::p_parallelcopy &&
             instr->definitions[0].physReg() == exec &&
             instr->definitions[0].regClass() == ctx.program->lane_mask &&
             instr->operands[0].physReg() == exec_copy_def.physReg()) {
            /* The register that we should save exec to already contains the same value as exec. */
            save_original_exec = false;
            break;
         }
         /* exec_copy_def is clobbered or exec written before we found a copy. */
         if ((i != exec_val_idx || !vcmpx_exec_only) &&
             std::any_of(instr->definitions.begin(), instr->definitions.end(),
                         [&exec_copy_def, &ctx](const Definition& def) -> bool
                         {
                            return regs_intersect(exec_copy_def, def) ||
                                   regs_intersect(Definition(exec, ctx.program->lane_mask), def);
                         }))
            break;

         if (instr->isPseudo() && instr->pseudo().needs_scratch_reg &&
             regs_intersect(exec_copy_def, Definition(instr->pseudo().scratch_sgpr, s1)))
            break;
      }
   }

   /* Position where the original exec mask copy should be inserted. */
   const int save_original_exec_idx = exec_val_idx;
   /* The copy can be removed when it kills its operand.
    * v_cmpx also writes the original destination pre GFX10.
    */
   const bool can_remove_copy = exec_copy->operands[0].isKill() || (vopc && !vcmpx_exec_only);

   /* Always allow reassigning when the value is written by (usable) VOPC.
    * Note, VOPC implicitly contains "& exec" because it yields zero on inactive lanes.
    * Additionally, when the value is copied as-is, also allow SALU and parallelcopies.
    */
   const bool can_reassign =
      vopc || (exec_copy->opcode == aco_opcode::p_parallelcopy &&
               (exec_val->isSALU() || exec_val->opcode == aco_opcode::p_parallelcopy ||
                exec_val->opcode == aco_opcode::p_create_vector));

   /* The reassignment is not worth it when both the original exec needs to be copied
    * and the new exec copy can't be removed. In this case we'd end up with more instructions.
    */
   if (!can_reassign || (save_original_exec && !can_remove_copy))
      return;

   /* When exec_val and exec_copy are non-adjacent, check whether there are any
    * instructions in between (besides p_logical_end) which may inhibit the optimization.
    */
   for (int idx = exec_val_idx + 1; idx < exec_copy_idx; ++idx) {
      aco_ptr<Instruction>& instr = block.instructions[idx];

      if (save_original_exec) {
         /* Check if the instruction uses the exec_copy_def register, in which case we can't
          * optimize. */
         for (const Operand& op : instr->operands)
            if (regs_intersect(exec_copy_def, op))
               return;
         for (const Definition& def : instr->definitions)
            if (regs_intersect(exec_copy_def, def))
               return;
         if (instr->isPseudo() && instr->pseudo().needs_scratch_reg &&
             regs_intersect(exec_copy_def, Definition(instr->pseudo().scratch_sgpr, s1)))
            return;
      }

      /* Check if the instruction may implicitly read VCC, e.g. v_cndmask or add with carry.
       * Rewriting these operands may require format conversion because of encoding limitations.
       */
      if (exec_wr_def.physReg() == vcc && instr->isVALU() && instr->operands.size() >= 3 &&
          !instr->isVOP3())
         return;
   }

   if (save_original_exec) {
      /* We insert the exec copy before exec_val, so exec_val can't use those registers. */
      for (const Operand& op : exec_val->operands)
         if (regs_intersect(exec_copy_def, op))
            return;
      /* We would write over the saved exec value in this case. */
      if (((vopc && !vcmpx_exec_only) || !can_remove_copy) &&
          regs_intersect(exec_copy_def, exec_wr_def))
         return;
   }

   if (vopc) {
      /* Add one extra definition for exec and copy the VOP3-specific fields if present. */
      if (!vcmpx_exec_only) {
         if (exec_val->isSDWA()) {
            /* This might work but it needs testing and more code to copy the instruction. */
            return;
         } else {
            aco_ptr<Instruction> tmp = std::move(exec_val);
            exec_val.reset(create_instruction(tmp->opcode, tmp->format, tmp->operands.size(),
                                              tmp->definitions.size() + 1));
            std::copy(tmp->operands.cbegin(), tmp->operands.cend(), exec_val->operands.begin());
            std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(),
                      exec_val->definitions.begin());

            VALU_instruction& src = tmp->valu();
            VALU_instruction& dst = exec_val->valu();
            dst.opsel = src.opsel;
            dst.omod = src.omod;
            dst.clamp = src.clamp;
            dst.neg = src.neg;
            dst.abs = src.abs;
         }
      }

      /* Set v_cmpx opcode. */
      exec_val->opcode = v_cmpx_op;

      *exec_val->definitions.rbegin() = Definition(exec, ctx.program->lane_mask);

      /* Change instruction from VOP3 to plain VOPC when possible. */
      if (vcmpx_exec_only && !exec_val->usesModifiers() &&
          (exec_val->operands.size() < 2 || exec_val->operands[1].isOfType(RegType::vgpr)))
         exec_val->format = Format::VOPC;
   } else {
      /* Reassign the instruction to write exec directly. */
      exec_val->definitions[0] = Definition(exec, ctx.program->lane_mask);
   }

   /* If there are other instructions (besides p_logical_end) between
    * writing the value and copying it to exec, reassign uses
    * of the old definition.
    */
   for (int idx = exec_val_idx + 1; idx < exec_copy_idx; ++idx) {
      aco_ptr<Instruction>& instr = block.instructions[idx];
      for (Operand& op : instr->operands) {
         if (op.physReg() == exec_wr_def.physReg())
            op = Operand(exec, op.regClass());
         if (exec_wr_def.size() == 2 && op.physReg() == exec_wr_def.physReg().advance(4))
            op = Operand(exec_hi, op.regClass());
      }
   }

   if (can_remove_copy) {
      /* Remove the copy. */
      exec_copy.reset();
   } else {
      /* Reassign the copy to write the register of the original value. */
      exec_copy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1));
      exec_copy->definitions[0] = exec_wr_def;
      exec_copy->operands[0] = Operand(exec, ctx.program->lane_mask);
   }

   bool has_nonzero_op =
      std::any_of(exec_val->operands.begin(), exec_val->operands.end(),
                  [](const Operand& op) -> bool { return op.isConstant() && op.constantValue(); });
   if (exec_val->isPseudo() && has_nonzero_op) {
      /* Remove the branch instruction when exec is constant non-zero. */
      aco_ptr<Instruction>& branch = block.instructions.back();
      if (branch->opcode == aco_opcode::p_cbranch_z && branch->operands[0].physReg() == exec)
         block.instructions.back().reset();
   }

   if (save_original_exec) {
      /* Insert a new instruction that saves the original exec before it is overwritten.
       * Do this last, because inserting in the instructions vector may invalidate the exec_val
       * reference.
       */
      const auto it = std::next(block.instructions.begin(), save_original_exec_idx);
      aco_ptr<Instruction> copy(
         create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1));
      copy->definitions[0] = exec_copy_def;
      copy->operands[0] = Operand(exec, ctx.program->lane_mask);
      block.instructions.insert(it, std::move(copy));
   }
}

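/* Walk a block bottom-up, remove exec writes whose value is never used, and look for a
 * value-producing instruction plus exec copy that try_optimize_branching_sequence can fold. */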
void
eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
{
   /* Check if any successor needs the outgoing exec mask from the current block. */

   bool exec_write_used;
   if (block.kind & block_kind_end_with_regs) {
      /* The last block of a program with a succeeding shader part must keep its final exec
       * write. */
      exec_write_used = true;
   } else {
      bool copy_to_exec = false;
      bool copy_from_exec = false;

      for (const auto& successor_phi_info : ctx.linear_phi_info[block.index]) {
         copy_to_exec |= successor_phi_info.def.physReg() == exec;
         copy_from_exec |= successor_phi_info.op.physReg() == exec;
      }

      if (copy_from_exec)
         exec_write_used = true;
      else if (copy_to_exec)
         exec_write_used = false;
      else
         /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
         exec_write_used =
            std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
                        [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
   }

   /* Collect information about the branching sequence. */

   bool branch_exec_val_found = false;
   int branch_exec_val_idx = -1;
   int branch_exec_copy_idx = -1;
   unsigned branch_exec_tempid = 0;

   /* Go through all instructions and eliminate useless exec writes. */

   for (int i = block.instructions.size() - 1; i >= 0; --i) {
      aco_ptr<Instruction>& instr = block.instructions[i];

      /* We already take information from phis into account before the loop, so let's just break on
       * phis. */
      if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
         break;

      /* See if the current instruction needs or writes exec. */
      bool needs_exec =
         needs_exec_mask(instr.get()) ||
         (instr->opcode == aco_opcode::p_logical_end && !ctx.logical_phi_info[block.index].empty());
      bool writes_exec = instr_writes_exec(instr.get());

      /* See if we found an unused exec write. */
      if (writes_exec && !exec_write_used) {
         /* Don't eliminate an instruction that writes registers other than exec and scc.
          * It is possible that this is e.g. an s_and_saveexec and the saved value is
          * used by a later branch.
          */
         bool writes_other = std::any_of(instr->definitions.begin(), instr->definitions.end(),
                                         [](const Definition& def) -> bool
                                         { return def.physReg() != exec && def.physReg() != scc; });
         if (!writes_other) {
            instr.reset();
            continue;
         }
      }

      /* For a newly encountered exec write, clear the used flag. */
      if (writes_exec) {
         if (instr->operands.size() && !branch_exec_val_found) {
            /* We are in a branch that jumps according to exec.
             * We just found the instruction that copies to exec before the branch.
             */
            assert(branch_exec_copy_idx == -1);
            branch_exec_copy_idx = i;
            branch_exec_tempid = instr->operands[0].tempId();
            branch_exec_val_found = true;
         } else if (branch_exec_val_idx == -1) {
            /* The current instruction overwrites exec before branch_exec_val_idx was
             * found, therefore we can't optimize the branching sequence.
             */
            branch_exec_copy_idx = -1;
            branch_exec_tempid = 0;
         }

         exec_write_used = false;
      } else if (branch_exec_tempid && instr->definitions.size() &&
                 instr->definitions[0].tempId() == branch_exec_tempid) {
         /* We just found the instruction that produces the exec mask that is copied. */
         assert(branch_exec_val_idx == -1);
         branch_exec_val_idx = i;
      } else if (branch_exec_tempid && branch_exec_val_idx == -1 && needs_exec) {
         /* There is an instruction that needs the original exec mask before
          * branch_exec_val_idx was found, so we can't optimize the branching sequence. */
         branch_exec_copy_idx = -1;
         branch_exec_tempid = 0;
      }

      /* If the current instruction needs exec, mark it as used. */
      exec_write_used |= needs_exec;
   }

   /* Remember if the current block needs an incoming exec mask from its predecessors. */
   ctx.blocks_incoming_exec_used[block.index] = exec_write_used;

   /* See if we can optimize the instruction that produces the exec mask. */
   if (branch_exec_val_idx != -1) {
      assert(branch_exec_tempid && branch_exec_copy_idx != -1);
      try_optimize_branching_sequence(ctx, block, branch_exec_val_idx, branch_exec_copy_idx);
   }

   /* Cleanup: remove deleted instructions from the vector. */
   auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
   block.instructions.resize(new_end - block.instructions.begin());
}

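/* Top-level block cleanup: visit blocks in reverse order, drop useless exec writes and
 * remove empty invert, merge and simple blocks from the linear CFG. */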
void
jump_threading(ssa_elimination_ctx& ctx)
{
   for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
      Block* block = &ctx.program->blocks[i];
      eliminate_useless_exec_writes_in_block(ctx, *block);

      if (!ctx.empty_blocks[i])
         continue;

      if (block->kind & block_kind_invert) {
         try_remove_invert_block(ctx, block);
         continue;
      }

      if (block->linear_succs.size() > 1)
         continue;

      if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit)
         try_remove_merge_block(ctx, block);

      if (block->linear_preds.size() == 1)
         try_remove_simple_block(ctx, block);
   }
}

} /* end namespace */

void
ssa_elimination(Program* program)
{
   ssa_elimination_ctx ctx(program);

   /* Collect information about every phi-instruction */
   collect_phi_info(ctx);

   /* eliminate empty blocks */
   jump_threading(ctx);

   /* insert parallelcopies from SSA elimination */
   insert_parallelcopies(ctx);
}
} // namespace aco