xref: /aosp_15_r20/external/mesa3d/src/amd/compiler/aco_reduce_assign.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2018 Valve Corporation
3  * Copyright © 2018 Google
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "aco_builder.h"
9 #include "aco_ir.h"
10 
11 #include <vector>
12 
/*
 * Insert p_start_linear_vgpr/p_end_linear_vgpr instructions right before RA to
 * correctly allocate linear VGPR temporaries for reductions (and for
 * p_interp_gfx11 and p_bpermute_permlane) that have to disrespect EXEC by
 * executing in WWM.
 */
18 
19 namespace aco {
20 
21 void
setup_reduce_temp(Program * program)22 setup_reduce_temp(Program* program)
23 {
24    unsigned last_top_level_block_idx = 0;
25    unsigned maxSize = 0;
26 
27    std::vector<bool> hasReductions(program->blocks.size());
28    for (Block& block : program->blocks) {
29       for (aco_ptr<Instruction>& instr : block.instructions) {
30          if (instr->opcode == aco_opcode::p_interp_gfx11 ||
31              instr->opcode == aco_opcode::p_bpermute_permlane) {
32             maxSize = MAX2(maxSize, 1);
33             hasReductions[block.index] = true;
34          } else if (instr->format == Format::PSEUDO_REDUCTION) {
35             maxSize = MAX2(maxSize, instr->operands[0].size());
36             hasReductions[block.index] = true;
37          }
38       }
39    }
40 
41    if (maxSize == 0)
42       return;
43 
44    assert(maxSize == 1 || maxSize == 2);
45    Temp reduceTmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
46    Temp vtmp(0, RegClass(RegType::vgpr, maxSize).as_linear());
47    int inserted_at = -1;
48    int vtmp_inserted_at = -1;
49 
50    for (Block& block : program->blocks) {
51 
52       if (block.kind & block_kind_top_level) {
53          last_top_level_block_idx = block.index;
54 
55          /* TODO: this could be improved in this case:
56           *    start_linear_vgpr
57           *    if (...) {
58           *       use_linear_vgpr
59           *    }
60           *    end_linear_vgpr
61           * Here, the linear vgpr is used before any phi copies, so this isn't necessary.
62           */
63          if (inserted_at >= 0) {
64             aco_ptr<Instruction> end{create_instruction(
65                aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_inserted_at >= 0 ? 2 : 1, 0)};
66             end->operands[0] = Operand(reduceTmp);
67             if (vtmp_inserted_at >= 0)
68                end->operands[1] = Operand(vtmp);
69 
70             /* insert after the phis of the block */
71             std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.begin();
72             while ((*it)->opcode == aco_opcode::p_linear_phi || (*it)->opcode == aco_opcode::p_phi)
73                ++it;
74             block.instructions.insert(it, std::move(end));
75             inserted_at = vtmp_inserted_at = -1;
76          }
77       }
78 
79       if (!hasReductions[block.index])
80          continue;
81 
82       std::vector<aco_ptr<Instruction>>::iterator it;
83       for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
84          Instruction* instr = (*it).get();
85          if (instr->format != Format::PSEUDO_REDUCTION &&
86              instr->opcode != aco_opcode::p_interp_gfx11 &&
87              instr->opcode != aco_opcode::p_bpermute_permlane)
88             continue;
89 
90          if ((int)last_top_level_block_idx != inserted_at) {
91             reduceTmp = program->allocateTmp(reduceTmp.regClass());
92             aco_ptr<Instruction> create{
93                create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
94             create->definitions[0] = Definition(reduceTmp);
95             /* find the right place to insert this definition */
96             if (last_top_level_block_idx == block.index) {
97                /* insert right before the current instruction */
98                it = block.instructions.insert(it, std::move(create));
99                it++;
100                /* inserted_at is intentionally not updated here, so later blocks
101                 * would insert at the end instead of using this one. */
102             } else {
103                assert(last_top_level_block_idx < block.index);
104                /* insert after p_logical_end of the last top-level block */
105                std::vector<aco_ptr<Instruction>>& instructions =
106                   program->blocks[last_top_level_block_idx].instructions;
107                auto insert_point =
108                   std::find_if(instructions.rbegin(), instructions.rend(),
109                                [](const auto& iter) {
110                                   return iter->opcode == aco_opcode::p_logical_end;
111                                })
112                      .base();
113                instructions.insert(insert_point, std::move(create));
114                inserted_at = last_top_level_block_idx;
115             }
116          }
117 
118          /* same as before, except for the vector temporary instead of the reduce temporary */
119          bool need_vtmp = false;
120          if (instr->isReduction()) {
121             ReduceOp op = instr->reduction().reduce_op;
122             unsigned cluster_size = instr->reduction().cluster_size;
123             need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
124                         op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
125                         op == imax64 || op == imul64;
126             bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
127                                    op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
128                                    op == iadd64;
129 
130             if (program->gfx_level >= GFX10 && cluster_size == 64)
131                need_vtmp = true;
132             if (program->gfx_level >= GFX10 && gfx10_need_vtmp)
133                need_vtmp = true;
134             if (program->gfx_level <= GFX7)
135                need_vtmp = true;
136 
137             need_vtmp |= cluster_size == 32;
138          }
139 
140          if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
141             vtmp = program->allocateTmp(vtmp.regClass());
142             aco_ptr<Instruction> create{
143                create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
144             create->definitions[0] = Definition(vtmp);
145             if (last_top_level_block_idx == block.index) {
146                it = block.instructions.insert(it, std::move(create));
147                it++;
148             } else {
149                assert(last_top_level_block_idx < block.index);
150                std::vector<aco_ptr<Instruction>>& instructions =
151                   program->blocks[last_top_level_block_idx].instructions;
152                auto insert_point =
153                   std::find_if(instructions.rbegin(), instructions.rend(),
154                                [](const auto& iter) {
155                                   return iter->opcode == aco_opcode::p_logical_end;
156                                })
157                      .base();
158                instructions.insert(insert_point, std::move(create));
159                vtmp_inserted_at = last_top_level_block_idx;
160             }
161          }
162 
163          if (instr->isReduction()) {
164             instr->operands[1] = Operand(reduceTmp);
165             if (need_vtmp)
166                instr->operands[2] = Operand(vtmp);
167          } else {
168             assert(instr->opcode == aco_opcode::p_interp_gfx11 ||
169                    instr->opcode == aco_opcode::p_bpermute_permlane);
170             instr->operands[0] = Operand(reduceTmp);
171          }
172       }
173    }
174 }
175 
176 }; // namespace aco
177