1*61046927SAndroid Build Coastguard Worker /*
2*61046927SAndroid Build Coastguard Worker * Copyright © 2020 Valve Corporation
3*61046927SAndroid Build Coastguard Worker *
4*61046927SAndroid Build Coastguard Worker * SPDX-License-Identifier: MIT
5*61046927SAndroid Build Coastguard Worker */
6*61046927SAndroid Build Coastguard Worker
7*61046927SAndroid Build Coastguard Worker #include "aco_ir.h"
8*61046927SAndroid Build Coastguard Worker
9*61046927SAndroid Build Coastguard Worker #include "util/crc32.h"
10*61046927SAndroid Build Coastguard Worker
11*61046927SAndroid Build Coastguard Worker #include <algorithm>
12*61046927SAndroid Build Coastguard Worker #include <deque>
13*61046927SAndroid Build Coastguard Worker #include <set>
14*61046927SAndroid Build Coastguard Worker #include <vector>
15*61046927SAndroid Build Coastguard Worker
16*61046927SAndroid Build Coastguard Worker namespace aco {
17*61046927SAndroid Build Coastguard Worker
18*61046927SAndroid Build Coastguard Worker namespace {
19*61046927SAndroid Build Coastguard Worker
20*61046927SAndroid Build Coastguard Worker class BlockCycleEstimator {
21*61046927SAndroid Build Coastguard Worker public:
22*61046927SAndroid Build Coastguard Worker enum resource {
23*61046927SAndroid Build Coastguard Worker null = 0,
24*61046927SAndroid Build Coastguard Worker scalar,
25*61046927SAndroid Build Coastguard Worker branch_sendmsg,
26*61046927SAndroid Build Coastguard Worker valu,
27*61046927SAndroid Build Coastguard Worker valu_complex,
28*61046927SAndroid Build Coastguard Worker lds,
29*61046927SAndroid Build Coastguard Worker export_gds,
30*61046927SAndroid Build Coastguard Worker vmem,
31*61046927SAndroid Build Coastguard Worker resource_count,
32*61046927SAndroid Build Coastguard Worker };
33*61046927SAndroid Build Coastguard Worker
BlockCycleEstimator(Program * program_)34*61046927SAndroid Build Coastguard Worker BlockCycleEstimator(Program* program_) : program(program_) {}
35*61046927SAndroid Build Coastguard Worker
36*61046927SAndroid Build Coastguard Worker Program* program;
37*61046927SAndroid Build Coastguard Worker
38*61046927SAndroid Build Coastguard Worker int32_t cur_cycle = 0;
39*61046927SAndroid Build Coastguard Worker int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
40*61046927SAndroid Build Coastguard Worker unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
41*61046927SAndroid Build Coastguard Worker int32_t reg_available[512] = {0};
42*61046927SAndroid Build Coastguard Worker std::deque<int32_t> mem_ops[wait_type_num];
43*61046927SAndroid Build Coastguard Worker
44*61046927SAndroid Build Coastguard Worker void add(aco_ptr<Instruction>& instr);
45*61046927SAndroid Build Coastguard Worker void join(const BlockCycleEstimator& other);
46*61046927SAndroid Build Coastguard Worker
47*61046927SAndroid Build Coastguard Worker private:
48*61046927SAndroid Build Coastguard Worker unsigned get_waitcnt_cost(wait_imm imm);
49*61046927SAndroid Build Coastguard Worker unsigned get_dependency_cost(aco_ptr<Instruction>& instr);
50*61046927SAndroid Build Coastguard Worker
51*61046927SAndroid Build Coastguard Worker void use_resources(aco_ptr<Instruction>& instr);
52*61046927SAndroid Build Coastguard Worker int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
53*61046927SAndroid Build Coastguard Worker };
54*61046927SAndroid Build Coastguard Worker
55*61046927SAndroid Build Coastguard Worker struct perf_info {
56*61046927SAndroid Build Coastguard Worker int latency;
57*61046927SAndroid Build Coastguard Worker
58*61046927SAndroid Build Coastguard Worker BlockCycleEstimator::resource rsrc0;
59*61046927SAndroid Build Coastguard Worker unsigned cost0;
60*61046927SAndroid Build Coastguard Worker
61*61046927SAndroid Build Coastguard Worker BlockCycleEstimator::resource rsrc1;
62*61046927SAndroid Build Coastguard Worker unsigned cost1;
63*61046927SAndroid Build Coastguard Worker };
64*61046927SAndroid Build Coastguard Worker
65*61046927SAndroid Build Coastguard Worker static bool
is_dual_issue_capable(const Program & program,const Instruction & instr)66*61046927SAndroid Build Coastguard Worker is_dual_issue_capable(const Program& program, const Instruction& instr)
67*61046927SAndroid Build Coastguard Worker {
68*61046927SAndroid Build Coastguard Worker if (program.gfx_level < GFX11 || !instr.isVALU() || instr.isDPP())
69*61046927SAndroid Build Coastguard Worker return false;
70*61046927SAndroid Build Coastguard Worker
71*61046927SAndroid Build Coastguard Worker switch (instr.opcode) {
72*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fma_f32:
73*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fmac_f32:
74*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fmaak_f32:
75*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fmamk_f32:
76*61046927SAndroid Build Coastguard Worker case aco_opcode::v_mul_f32:
77*61046927SAndroid Build Coastguard Worker case aco_opcode::v_add_f32:
78*61046927SAndroid Build Coastguard Worker case aco_opcode::v_sub_f32:
79*61046927SAndroid Build Coastguard Worker case aco_opcode::v_subrev_f32:
80*61046927SAndroid Build Coastguard Worker case aco_opcode::v_mul_legacy_f32:
81*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fma_legacy_f32:
82*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fmac_legacy_f32:
83*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fma_f16:
84*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fmac_f16:
85*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fmaak_f16:
86*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fmamk_f16:
87*61046927SAndroid Build Coastguard Worker case aco_opcode::v_mul_f16:
88*61046927SAndroid Build Coastguard Worker case aco_opcode::v_add_f16:
89*61046927SAndroid Build Coastguard Worker case aco_opcode::v_sub_f16:
90*61046927SAndroid Build Coastguard Worker case aco_opcode::v_subrev_f16:
91*61046927SAndroid Build Coastguard Worker case aco_opcode::v_mov_b32:
92*61046927SAndroid Build Coastguard Worker case aco_opcode::v_movreld_b32:
93*61046927SAndroid Build Coastguard Worker case aco_opcode::v_movrels_b32:
94*61046927SAndroid Build Coastguard Worker case aco_opcode::v_movrelsd_b32:
95*61046927SAndroid Build Coastguard Worker case aco_opcode::v_movrelsd_2_b32:
96*61046927SAndroid Build Coastguard Worker case aco_opcode::v_cndmask_b32:
97*61046927SAndroid Build Coastguard Worker case aco_opcode::v_writelane_b32_e64:
98*61046927SAndroid Build Coastguard Worker case aco_opcode::v_mov_b16:
99*61046927SAndroid Build Coastguard Worker case aco_opcode::v_cndmask_b16:
100*61046927SAndroid Build Coastguard Worker case aco_opcode::v_max_f32:
101*61046927SAndroid Build Coastguard Worker case aco_opcode::v_min_f32:
102*61046927SAndroid Build Coastguard Worker case aco_opcode::v_max_f16:
103*61046927SAndroid Build Coastguard Worker case aco_opcode::v_min_f16:
104*61046927SAndroid Build Coastguard Worker case aco_opcode::v_max_i16_e64:
105*61046927SAndroid Build Coastguard Worker case aco_opcode::v_min_i16_e64:
106*61046927SAndroid Build Coastguard Worker case aco_opcode::v_max_u16_e64:
107*61046927SAndroid Build Coastguard Worker case aco_opcode::v_min_u16_e64:
108*61046927SAndroid Build Coastguard Worker case aco_opcode::v_add_i16:
109*61046927SAndroid Build Coastguard Worker case aco_opcode::v_sub_i16:
110*61046927SAndroid Build Coastguard Worker case aco_opcode::v_mad_i16:
111*61046927SAndroid Build Coastguard Worker case aco_opcode::v_add_u16_e64:
112*61046927SAndroid Build Coastguard Worker case aco_opcode::v_sub_u16_e64:
113*61046927SAndroid Build Coastguard Worker case aco_opcode::v_mad_u16:
114*61046927SAndroid Build Coastguard Worker case aco_opcode::v_mul_lo_u16_e64:
115*61046927SAndroid Build Coastguard Worker case aco_opcode::v_not_b16:
116*61046927SAndroid Build Coastguard Worker case aco_opcode::v_and_b16:
117*61046927SAndroid Build Coastguard Worker case aco_opcode::v_or_b16:
118*61046927SAndroid Build Coastguard Worker case aco_opcode::v_xor_b16:
119*61046927SAndroid Build Coastguard Worker case aco_opcode::v_lshrrev_b16_e64:
120*61046927SAndroid Build Coastguard Worker case aco_opcode::v_ashrrev_i16_e64:
121*61046927SAndroid Build Coastguard Worker case aco_opcode::v_lshlrev_b16_e64:
122*61046927SAndroid Build Coastguard Worker case aco_opcode::v_dot2_bf16_bf16:
123*61046927SAndroid Build Coastguard Worker case aco_opcode::v_dot2_f32_bf16:
124*61046927SAndroid Build Coastguard Worker case aco_opcode::v_dot2_f16_f16:
125*61046927SAndroid Build Coastguard Worker case aco_opcode::v_dot2_f32_f16:
126*61046927SAndroid Build Coastguard Worker case aco_opcode::v_dot2c_f32_f16: return true;
127*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fma_mix_f32:
128*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fma_mixlo_f16:
129*61046927SAndroid Build Coastguard Worker case aco_opcode::v_fma_mixhi_f16: {
130*61046927SAndroid Build Coastguard Worker /* dst and acc type must match */
131*61046927SAndroid Build Coastguard Worker if (instr.valu().opsel_hi[2] == (instr.opcode == aco_opcode::v_fma_mix_f32))
132*61046927SAndroid Build Coastguard Worker return false;
133*61046927SAndroid Build Coastguard Worker
134*61046927SAndroid Build Coastguard Worker /* If all operands are vgprs, two must be the same. */
135*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < 3; i++) {
136*61046927SAndroid Build Coastguard Worker if (instr.operands[i].isConstant() || instr.operands[i].isOfType(RegType::sgpr))
137*61046927SAndroid Build Coastguard Worker return true;
138*61046927SAndroid Build Coastguard Worker for (unsigned j = 0; j < i; j++) {
139*61046927SAndroid Build Coastguard Worker if (instr.operands[i].physReg() == instr.operands[j].physReg())
140*61046927SAndroid Build Coastguard Worker return true;
141*61046927SAndroid Build Coastguard Worker }
142*61046927SAndroid Build Coastguard Worker }
143*61046927SAndroid Build Coastguard Worker return false;
144*61046927SAndroid Build Coastguard Worker }
145*61046927SAndroid Build Coastguard Worker default: return false;
146*61046927SAndroid Build Coastguard Worker }
147*61046927SAndroid Build Coastguard Worker }
148*61046927SAndroid Build Coastguard Worker
149*61046927SAndroid Build Coastguard Worker static perf_info
get_perf_info(const Program & program,const Instruction & instr)150*61046927SAndroid Build Coastguard Worker get_perf_info(const Program& program, const Instruction& instr)
151*61046927SAndroid Build Coastguard Worker {
152*61046927SAndroid Build Coastguard Worker instr_class cls = instr_info.classes[(int)instr.opcode];
153*61046927SAndroid Build Coastguard Worker
154*61046927SAndroid Build Coastguard Worker #define WAIT(res) BlockCycleEstimator::res, 0
155*61046927SAndroid Build Coastguard Worker #define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
156*61046927SAndroid Build Coastguard Worker
157*61046927SAndroid Build Coastguard Worker if (program.gfx_level >= GFX10) {
158*61046927SAndroid Build Coastguard Worker /* fp64 might be incorrect */
159*61046927SAndroid Build Coastguard Worker switch (cls) {
160*61046927SAndroid Build Coastguard Worker case instr_class::valu32:
161*61046927SAndroid Build Coastguard Worker case instr_class::valu_convert32:
162*61046927SAndroid Build Coastguard Worker case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
163*61046927SAndroid Build Coastguard Worker case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
164*61046927SAndroid Build Coastguard Worker case instr_class::valu_quarter_rate32:
165*61046927SAndroid Build Coastguard Worker return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
166*61046927SAndroid Build Coastguard Worker case instr_class::valu_transcendental32:
167*61046927SAndroid Build Coastguard Worker return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
168*61046927SAndroid Build Coastguard Worker case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
169*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_add:
170*61046927SAndroid Build Coastguard Worker return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
171*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_convert:
172*61046927SAndroid Build Coastguard Worker return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
173*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_transcendental:
174*61046927SAndroid Build Coastguard Worker return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
175*61046927SAndroid Build Coastguard Worker case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
176*61046927SAndroid Build Coastguard Worker case instr_class::sfpu: return {4, WAIT_USE(scalar, 1)};
177*61046927SAndroid Build Coastguard Worker case instr_class::valu_pseudo_scalar_trans:
178*61046927SAndroid Build Coastguard Worker return {7, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 1)};
179*61046927SAndroid Build Coastguard Worker case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
180*61046927SAndroid Build Coastguard Worker case instr_class::branch:
181*61046927SAndroid Build Coastguard Worker case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
182*61046927SAndroid Build Coastguard Worker case instr_class::ds:
183*61046927SAndroid Build Coastguard Worker return instr.isDS() && instr.ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
184*61046927SAndroid Build Coastguard Worker : perf_info{0, WAIT_USE(lds, 1)};
185*61046927SAndroid Build Coastguard Worker case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
186*61046927SAndroid Build Coastguard Worker case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
187*61046927SAndroid Build Coastguard Worker case instr_class::wmma: {
188*61046927SAndroid Build Coastguard Worker /* int8 and (b)f16 have the same performance. */
189*61046927SAndroid Build Coastguard Worker uint8_t cost = instr.opcode == aco_opcode::v_wmma_i32_16x16x16_iu4 ? 16 : 32;
190*61046927SAndroid Build Coastguard Worker return {cost, WAIT_USE(valu, cost)};
191*61046927SAndroid Build Coastguard Worker }
192*61046927SAndroid Build Coastguard Worker case instr_class::barrier:
193*61046927SAndroid Build Coastguard Worker case instr_class::waitcnt:
194*61046927SAndroid Build Coastguard Worker case instr_class::other:
195*61046927SAndroid Build Coastguard Worker default: return {0};
196*61046927SAndroid Build Coastguard Worker }
197*61046927SAndroid Build Coastguard Worker } else {
198*61046927SAndroid Build Coastguard Worker switch (cls) {
199*61046927SAndroid Build Coastguard Worker case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
200*61046927SAndroid Build Coastguard Worker case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
201*61046927SAndroid Build Coastguard Worker case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
202*61046927SAndroid Build Coastguard Worker case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
203*61046927SAndroid Build Coastguard Worker case instr_class::valu_fma:
204*61046927SAndroid Build Coastguard Worker return program.dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
205*61046927SAndroid Build Coastguard Worker : perf_info{16, WAIT_USE(valu, 16)};
206*61046927SAndroid Build Coastguard Worker case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
207*61046927SAndroid Build Coastguard Worker case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
208*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
209*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
210*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
211*61046927SAndroid Build Coastguard Worker case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
212*61046927SAndroid Build Coastguard Worker case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
213*61046927SAndroid Build Coastguard Worker case instr_class::branch:
214*61046927SAndroid Build Coastguard Worker return {8, WAIT_USE(branch_sendmsg, 8)};
215*61046927SAndroid Build Coastguard Worker return {4, WAIT_USE(branch_sendmsg, 4)};
216*61046927SAndroid Build Coastguard Worker case instr_class::ds:
217*61046927SAndroid Build Coastguard Worker return instr.isDS() && instr.ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
218*61046927SAndroid Build Coastguard Worker : perf_info{4, WAIT_USE(lds, 4)};
219*61046927SAndroid Build Coastguard Worker case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
220*61046927SAndroid Build Coastguard Worker case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
221*61046927SAndroid Build Coastguard Worker case instr_class::barrier:
222*61046927SAndroid Build Coastguard Worker case instr_class::waitcnt:
223*61046927SAndroid Build Coastguard Worker case instr_class::other:
224*61046927SAndroid Build Coastguard Worker default: return {4};
225*61046927SAndroid Build Coastguard Worker }
226*61046927SAndroid Build Coastguard Worker }
227*61046927SAndroid Build Coastguard Worker
228*61046927SAndroid Build Coastguard Worker #undef WAIT_USE
229*61046927SAndroid Build Coastguard Worker #undef WAIT
230*61046927SAndroid Build Coastguard Worker }
231*61046927SAndroid Build Coastguard Worker
232*61046927SAndroid Build Coastguard Worker void
use_resources(aco_ptr<Instruction> & instr)233*61046927SAndroid Build Coastguard Worker BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
234*61046927SAndroid Build Coastguard Worker {
235*61046927SAndroid Build Coastguard Worker perf_info perf = get_perf_info(*program, *instr);
236*61046927SAndroid Build Coastguard Worker
237*61046927SAndroid Build Coastguard Worker if (perf.rsrc0 != resource_count) {
238*61046927SAndroid Build Coastguard Worker res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
239*61046927SAndroid Build Coastguard Worker res_usage[(int)perf.rsrc0] += perf.cost0;
240*61046927SAndroid Build Coastguard Worker }
241*61046927SAndroid Build Coastguard Worker
242*61046927SAndroid Build Coastguard Worker if (perf.rsrc1 != resource_count) {
243*61046927SAndroid Build Coastguard Worker res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
244*61046927SAndroid Build Coastguard Worker res_usage[(int)perf.rsrc1] += perf.cost1;
245*61046927SAndroid Build Coastguard Worker }
246*61046927SAndroid Build Coastguard Worker }
247*61046927SAndroid Build Coastguard Worker
248*61046927SAndroid Build Coastguard Worker int32_t
cycles_until_res_available(aco_ptr<Instruction> & instr)249*61046927SAndroid Build Coastguard Worker BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
250*61046927SAndroid Build Coastguard Worker {
251*61046927SAndroid Build Coastguard Worker perf_info perf = get_perf_info(*program, *instr);
252*61046927SAndroid Build Coastguard Worker
253*61046927SAndroid Build Coastguard Worker int32_t cost = 0;
254*61046927SAndroid Build Coastguard Worker if (perf.rsrc0 != resource_count)
255*61046927SAndroid Build Coastguard Worker cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
256*61046927SAndroid Build Coastguard Worker if (perf.rsrc1 != resource_count)
257*61046927SAndroid Build Coastguard Worker cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);
258*61046927SAndroid Build Coastguard Worker
259*61046927SAndroid Build Coastguard Worker return cost;
260*61046927SAndroid Build Coastguard Worker }
261*61046927SAndroid Build Coastguard Worker
262*61046927SAndroid Build Coastguard Worker static std::array<unsigned, wait_type_num>
get_wait_counter_info(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr)263*61046927SAndroid Build Coastguard Worker get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
264*61046927SAndroid Build Coastguard Worker {
265*61046927SAndroid Build Coastguard Worker /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
266*61046927SAndroid Build Coastguard Worker * depends a lot on the situation. */
267*61046927SAndroid Build Coastguard Worker
268*61046927SAndroid Build Coastguard Worker std::array<unsigned, wait_type_num> info{};
269*61046927SAndroid Build Coastguard Worker
270*61046927SAndroid Build Coastguard Worker if (instr->isEXP()) {
271*61046927SAndroid Build Coastguard Worker info[wait_type_exp] = 16;
272*61046927SAndroid Build Coastguard Worker } else if (instr->isLDSDIR()) {
273*61046927SAndroid Build Coastguard Worker info[wait_type_exp] = 13;
274*61046927SAndroid Build Coastguard Worker } else if (instr->isFlatLike()) {
275*61046927SAndroid Build Coastguard Worker info[wait_type_lgkm] = instr->isFlat() ? 20 : 0;
276*61046927SAndroid Build Coastguard Worker if (!instr->definitions.empty() || gfx_level < GFX10)
277*61046927SAndroid Build Coastguard Worker info[wait_type_vm] = 320;
278*61046927SAndroid Build Coastguard Worker else
279*61046927SAndroid Build Coastguard Worker info[wait_type_vs] = 320;
280*61046927SAndroid Build Coastguard Worker } else if (instr->isSMEM()) {
281*61046927SAndroid Build Coastguard Worker wait_type type = gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm;
282*61046927SAndroid Build Coastguard Worker if (instr->definitions.empty()) {
283*61046927SAndroid Build Coastguard Worker info[type] = 200;
284*61046927SAndroid Build Coastguard Worker } else if (instr->operands.empty()) { /* s_memtime and s_memrealtime */
285*61046927SAndroid Build Coastguard Worker info[type] = 1;
286*61046927SAndroid Build Coastguard Worker } else {
287*61046927SAndroid Build Coastguard Worker bool likely_desc_load = instr->operands[0].size() == 2;
288*61046927SAndroid Build Coastguard Worker bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
289*61046927SAndroid Build Coastguard Worker bool const_offset =
290*61046927SAndroid Build Coastguard Worker instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());
291*61046927SAndroid Build Coastguard Worker
292*61046927SAndroid Build Coastguard Worker if (likely_desc_load || const_offset)
293*61046927SAndroid Build Coastguard Worker info[type] = 30; /* likely to hit L0 cache */
294*61046927SAndroid Build Coastguard Worker else
295*61046927SAndroid Build Coastguard Worker info[type] = 200;
296*61046927SAndroid Build Coastguard Worker }
297*61046927SAndroid Build Coastguard Worker } else if (instr->isDS()) {
298*61046927SAndroid Build Coastguard Worker info[wait_type_lgkm] = 20;
299*61046927SAndroid Build Coastguard Worker } else if (instr->isVMEM() && instr->definitions.empty() && gfx_level >= GFX10) {
300*61046927SAndroid Build Coastguard Worker info[wait_type_vs] = 320;
301*61046927SAndroid Build Coastguard Worker } else if (instr->isVMEM()) {
302*61046927SAndroid Build Coastguard Worker uint8_t vm_type = get_vmem_type(gfx_level, instr.get());
303*61046927SAndroid Build Coastguard Worker wait_type type = wait_type_vm;
304*61046927SAndroid Build Coastguard Worker if (gfx_level >= GFX12 && vm_type == vmem_bvh)
305*61046927SAndroid Build Coastguard Worker type = wait_type_bvh;
306*61046927SAndroid Build Coastguard Worker else if (gfx_level >= GFX12 && vm_type == vmem_sampler)
307*61046927SAndroid Build Coastguard Worker type = wait_type_sample;
308*61046927SAndroid Build Coastguard Worker info[type] = 320;
309*61046927SAndroid Build Coastguard Worker }
310*61046927SAndroid Build Coastguard Worker
311*61046927SAndroid Build Coastguard Worker return info;
312*61046927SAndroid Build Coastguard Worker }
313*61046927SAndroid Build Coastguard Worker
314*61046927SAndroid Build Coastguard Worker static wait_imm
get_wait_imm(Program * program,aco_ptr<Instruction> & instr)315*61046927SAndroid Build Coastguard Worker get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
316*61046927SAndroid Build Coastguard Worker {
317*61046927SAndroid Build Coastguard Worker wait_imm imm;
318*61046927SAndroid Build Coastguard Worker if (instr->opcode == aco_opcode::s_endpgm) {
319*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < wait_type_num; i++)
320*61046927SAndroid Build Coastguard Worker imm[i] = 0;
321*61046927SAndroid Build Coastguard Worker } else if (imm.unpack(program->gfx_level, instr.get())) {
322*61046927SAndroid Build Coastguard Worker } else if (instr->isVINTERP_INREG()) {
323*61046927SAndroid Build Coastguard Worker imm.exp = instr->vinterp_inreg().wait_exp;
324*61046927SAndroid Build Coastguard Worker if (imm.exp == 0x7)
325*61046927SAndroid Build Coastguard Worker imm.exp = wait_imm::unset_counter;
326*61046927SAndroid Build Coastguard Worker } else {
327*61046927SAndroid Build Coastguard Worker /* If an instruction increases a counter, it waits for it to be below maximum first. */
328*61046927SAndroid Build Coastguard Worker std::array<unsigned, wait_type_num> wait_info =
329*61046927SAndroid Build Coastguard Worker get_wait_counter_info(program->gfx_level, instr);
330*61046927SAndroid Build Coastguard Worker wait_imm max = wait_imm::max(program->gfx_level);
331*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < wait_type_num; i++) {
332*61046927SAndroid Build Coastguard Worker if (wait_info[i])
333*61046927SAndroid Build Coastguard Worker imm[i] = max[i] - 1;
334*61046927SAndroid Build Coastguard Worker }
335*61046927SAndroid Build Coastguard Worker }
336*61046927SAndroid Build Coastguard Worker return imm;
337*61046927SAndroid Build Coastguard Worker }
338*61046927SAndroid Build Coastguard Worker
339*61046927SAndroid Build Coastguard Worker unsigned
get_dependency_cost(aco_ptr<Instruction> & instr)340*61046927SAndroid Build Coastguard Worker BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
341*61046927SAndroid Build Coastguard Worker {
342*61046927SAndroid Build Coastguard Worker int deps_available = cur_cycle;
343*61046927SAndroid Build Coastguard Worker
344*61046927SAndroid Build Coastguard Worker wait_imm imm = get_wait_imm(program, instr);
345*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < wait_type_num; i++) {
346*61046927SAndroid Build Coastguard Worker if (imm[i] == wait_imm::unset_counter)
347*61046927SAndroid Build Coastguard Worker continue;
348*61046927SAndroid Build Coastguard Worker for (int j = 0; j < (int)mem_ops[i].size() - imm[i]; j++)
349*61046927SAndroid Build Coastguard Worker deps_available = MAX2(deps_available, mem_ops[i][j]);
350*61046927SAndroid Build Coastguard Worker }
351*61046927SAndroid Build Coastguard Worker
352*61046927SAndroid Build Coastguard Worker if (instr->opcode == aco_opcode::s_endpgm) {
353*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < 512; i++)
354*61046927SAndroid Build Coastguard Worker deps_available = MAX2(deps_available, reg_available[i]);
355*61046927SAndroid Build Coastguard Worker } else if (program->gfx_level >= GFX10) {
356*61046927SAndroid Build Coastguard Worker for (Operand& op : instr->operands) {
357*61046927SAndroid Build Coastguard Worker if (op.isConstant() || op.isUndefined())
358*61046927SAndroid Build Coastguard Worker continue;
359*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < op.size(); i++)
360*61046927SAndroid Build Coastguard Worker deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
361*61046927SAndroid Build Coastguard Worker }
362*61046927SAndroid Build Coastguard Worker }
363*61046927SAndroid Build Coastguard Worker
364*61046927SAndroid Build Coastguard Worker if (program->gfx_level < GFX10)
365*61046927SAndroid Build Coastguard Worker deps_available = align(deps_available, 4);
366*61046927SAndroid Build Coastguard Worker
367*61046927SAndroid Build Coastguard Worker return deps_available - cur_cycle;
368*61046927SAndroid Build Coastguard Worker }
369*61046927SAndroid Build Coastguard Worker
370*61046927SAndroid Build Coastguard Worker static bool
is_vector(aco_opcode op)371*61046927SAndroid Build Coastguard Worker is_vector(aco_opcode op)
372*61046927SAndroid Build Coastguard Worker {
373*61046927SAndroid Build Coastguard Worker switch (instr_info.classes[(int)op]) {
374*61046927SAndroid Build Coastguard Worker case instr_class::valu32:
375*61046927SAndroid Build Coastguard Worker case instr_class::valu_convert32:
376*61046927SAndroid Build Coastguard Worker case instr_class::valu_fma:
377*61046927SAndroid Build Coastguard Worker case instr_class::valu_double:
378*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_add:
379*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_convert:
380*61046927SAndroid Build Coastguard Worker case instr_class::valu_double_transcendental:
381*61046927SAndroid Build Coastguard Worker case instr_class::vmem:
382*61046927SAndroid Build Coastguard Worker case instr_class::ds:
383*61046927SAndroid Build Coastguard Worker case instr_class::exp:
384*61046927SAndroid Build Coastguard Worker case instr_class::valu64:
385*61046927SAndroid Build Coastguard Worker case instr_class::valu_quarter_rate32:
386*61046927SAndroid Build Coastguard Worker case instr_class::valu_transcendental32: return true;
387*61046927SAndroid Build Coastguard Worker default: return false;
388*61046927SAndroid Build Coastguard Worker }
389*61046927SAndroid Build Coastguard Worker }
390*61046927SAndroid Build Coastguard Worker
391*61046927SAndroid Build Coastguard Worker void
add(aco_ptr<Instruction> & instr)392*61046927SAndroid Build Coastguard Worker BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
393*61046927SAndroid Build Coastguard Worker {
394*61046927SAndroid Build Coastguard Worker perf_info perf = get_perf_info(*program, *instr);
395*61046927SAndroid Build Coastguard Worker
396*61046927SAndroid Build Coastguard Worker cur_cycle += get_dependency_cost(instr);
397*61046927SAndroid Build Coastguard Worker
398*61046927SAndroid Build Coastguard Worker unsigned start;
399*61046927SAndroid Build Coastguard Worker bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 &&
400*61046927SAndroid Build Coastguard Worker is_vector(instr->opcode) && !is_dual_issue_capable(*program, *instr) &&
401*61046927SAndroid Build Coastguard Worker program->workgroup_size > 32;
402*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
403*61046927SAndroid Build Coastguard Worker cur_cycle += cycles_until_res_available(instr);
404*61046927SAndroid Build Coastguard Worker
405*61046927SAndroid Build Coastguard Worker start = cur_cycle;
406*61046927SAndroid Build Coastguard Worker use_resources(instr);
407*61046927SAndroid Build Coastguard Worker
408*61046927SAndroid Build Coastguard Worker /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
409*61046927SAndroid Build Coastguard Worker cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency;
410*61046927SAndroid Build Coastguard Worker }
411*61046927SAndroid Build Coastguard Worker
412*61046927SAndroid Build Coastguard Worker wait_imm imm = get_wait_imm(program, instr);
413*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < wait_type_num; i++) {
414*61046927SAndroid Build Coastguard Worker while (mem_ops[i].size() > imm[i])
415*61046927SAndroid Build Coastguard Worker mem_ops[i].pop_front();
416*61046927SAndroid Build Coastguard Worker }
417*61046927SAndroid Build Coastguard Worker
418*61046927SAndroid Build Coastguard Worker std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program->gfx_level, instr);
419*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < wait_type_num; i++) {
420*61046927SAndroid Build Coastguard Worker if (wait_info[i])
421*61046927SAndroid Build Coastguard Worker mem_ops[i].push_back(cur_cycle + wait_info[i]);
422*61046927SAndroid Build Coastguard Worker }
423*61046927SAndroid Build Coastguard Worker
424*61046927SAndroid Build Coastguard Worker /* This is inaccurate but shouldn't affect anything after waitcnt insertion.
425*61046927SAndroid Build Coastguard Worker * Before waitcnt insertion, this is necessary to consider memory operations.
426*61046927SAndroid Build Coastguard Worker */
427*61046927SAndroid Build Coastguard Worker unsigned latency = 0;
428*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < wait_type_num; i++)
429*61046927SAndroid Build Coastguard Worker latency = MAX2(latency, i == wait_type_vs ? 0 : wait_info[i]);
430*61046927SAndroid Build Coastguard Worker int32_t result_available = start + MAX2(perf.latency, (int32_t)latency);
431*61046927SAndroid Build Coastguard Worker
432*61046927SAndroid Build Coastguard Worker for (Definition& def : instr->definitions) {
433*61046927SAndroid Build Coastguard Worker int32_t* available = ®_available[def.physReg().reg()];
434*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < def.size(); i++)
435*61046927SAndroid Build Coastguard Worker available[i] = MAX2(available[i], result_available);
436*61046927SAndroid Build Coastguard Worker }
437*61046927SAndroid Build Coastguard Worker }
438*61046927SAndroid Build Coastguard Worker
439*61046927SAndroid Build Coastguard Worker void
join(const BlockCycleEstimator & pred)440*61046927SAndroid Build Coastguard Worker BlockCycleEstimator::join(const BlockCycleEstimator& pred)
441*61046927SAndroid Build Coastguard Worker {
442*61046927SAndroid Build Coastguard Worker assert(cur_cycle == 0);
443*61046927SAndroid Build Coastguard Worker
444*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < (unsigned)resource_count; i++) {
445*61046927SAndroid Build Coastguard Worker assert(res_usage[i] == 0);
446*61046927SAndroid Build Coastguard Worker res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
447*61046927SAndroid Build Coastguard Worker }
448*61046927SAndroid Build Coastguard Worker
449*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < 512; i++)
450*61046927SAndroid Build Coastguard Worker reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);
451*61046927SAndroid Build Coastguard Worker
452*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < wait_type_num; i++) {
453*61046927SAndroid Build Coastguard Worker std::deque<int32_t>& ops = mem_ops[i];
454*61046927SAndroid Build Coastguard Worker const std::deque<int32_t>& pred_ops = pred.mem_ops[i];
455*61046927SAndroid Build Coastguard Worker for (unsigned j = 0; j < MIN2(ops.size(), pred_ops.size()); j++)
456*61046927SAndroid Build Coastguard Worker ops.rbegin()[j] = MAX2(ops.rbegin()[j], pred_ops.rbegin()[j] - pred.cur_cycle);
457*61046927SAndroid Build Coastguard Worker for (int j = pred_ops.size() - ops.size() - 1; j >= 0; j--)
458*61046927SAndroid Build Coastguard Worker ops.push_front(pred_ops[j] - pred.cur_cycle);
459*61046927SAndroid Build Coastguard Worker }
460*61046927SAndroid Build Coastguard Worker }
461*61046927SAndroid Build Coastguard Worker
462*61046927SAndroid Build Coastguard Worker } /* end namespace */
463*61046927SAndroid Build Coastguard Worker
464*61046927SAndroid Build Coastguard Worker /* sgpr_presched/vgpr_presched */
465*61046927SAndroid Build Coastguard Worker void
collect_presched_stats(Program * program)466*61046927SAndroid Build Coastguard Worker collect_presched_stats(Program* program)
467*61046927SAndroid Build Coastguard Worker {
468*61046927SAndroid Build Coastguard Worker RegisterDemand presched_demand;
469*61046927SAndroid Build Coastguard Worker for (Block& block : program->blocks)
470*61046927SAndroid Build Coastguard Worker presched_demand.update(block.register_demand);
471*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_sgpr_presched] = presched_demand.sgpr;
472*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_vgpr_presched] = presched_demand.vgpr;
473*61046927SAndroid Build Coastguard Worker }
474*61046927SAndroid Build Coastguard Worker
475*61046927SAndroid Build Coastguard Worker /* instructions/branches/vmem_clauses/smem_clauses/cycles */
476*61046927SAndroid Build Coastguard Worker void
collect_preasm_stats(Program * program)477*61046927SAndroid Build Coastguard Worker collect_preasm_stats(Program* program)
478*61046927SAndroid Build Coastguard Worker {
479*61046927SAndroid Build Coastguard Worker for (Block& block : program->blocks) {
480*61046927SAndroid Build Coastguard Worker std::set<Instruction*> vmem_clause;
481*61046927SAndroid Build Coastguard Worker std::set<Instruction*> smem_clause;
482*61046927SAndroid Build Coastguard Worker
483*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_instructions] += block.instructions.size();
484*61046927SAndroid Build Coastguard Worker
485*61046927SAndroid Build Coastguard Worker for (aco_ptr<Instruction>& instr : block.instructions) {
486*61046927SAndroid Build Coastguard Worker const bool is_branch =
487*61046927SAndroid Build Coastguard Worker instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
488*61046927SAndroid Build Coastguard Worker if (is_branch)
489*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_branches]++;
490*61046927SAndroid Build Coastguard Worker
491*61046927SAndroid Build Coastguard Worker if (instr->isVALU() || instr->isVINTRP())
492*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_valu]++;
493*61046927SAndroid Build Coastguard Worker if (instr->isSALU() && !instr->isSOPP() &&
494*61046927SAndroid Build Coastguard Worker instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
495*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_salu]++;
496*61046927SAndroid Build Coastguard Worker if (instr->isVOPD())
497*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_vopd]++;
498*61046927SAndroid Build Coastguard Worker
499*61046927SAndroid Build Coastguard Worker if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
500*61046927SAndroid Build Coastguard Worker !instr->operands.empty()) {
501*61046927SAndroid Build Coastguard Worker if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
502*61046927SAndroid Build Coastguard Worker [&](Instruction* other)
503*61046927SAndroid Build Coastguard Worker { return should_form_clause(instr.get(), other); }))
504*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_vmem_clauses]++;
505*61046927SAndroid Build Coastguard Worker vmem_clause.insert(instr.get());
506*61046927SAndroid Build Coastguard Worker
507*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_vmem]++;
508*61046927SAndroid Build Coastguard Worker } else {
509*61046927SAndroid Build Coastguard Worker vmem_clause.clear();
510*61046927SAndroid Build Coastguard Worker }
511*61046927SAndroid Build Coastguard Worker
512*61046927SAndroid Build Coastguard Worker if (instr->isSMEM() && !instr->operands.empty()) {
513*61046927SAndroid Build Coastguard Worker if (std::none_of(smem_clause.begin(), smem_clause.end(),
514*61046927SAndroid Build Coastguard Worker [&](Instruction* other)
515*61046927SAndroid Build Coastguard Worker { return should_form_clause(instr.get(), other); }))
516*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_smem_clauses]++;
517*61046927SAndroid Build Coastguard Worker smem_clause.insert(instr.get());
518*61046927SAndroid Build Coastguard Worker
519*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_smem]++;
520*61046927SAndroid Build Coastguard Worker } else {
521*61046927SAndroid Build Coastguard Worker smem_clause.clear();
522*61046927SAndroid Build Coastguard Worker }
523*61046927SAndroid Build Coastguard Worker }
524*61046927SAndroid Build Coastguard Worker }
525*61046927SAndroid Build Coastguard Worker
526*61046927SAndroid Build Coastguard Worker double latency = 0;
527*61046927SAndroid Build Coastguard Worker double usage[(int)BlockCycleEstimator::resource_count] = {0};
528*61046927SAndroid Build Coastguard Worker std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);
529*61046927SAndroid Build Coastguard Worker
530*61046927SAndroid Build Coastguard Worker constexpr const unsigned vmem_latency = 320;
531*61046927SAndroid Build Coastguard Worker for (const Definition def : program->args_pending_vmem) {
532*61046927SAndroid Build Coastguard Worker blocks[0].mem_ops[wait_type_vm].push_back(vmem_latency);
533*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < def.size(); i++)
534*61046927SAndroid Build Coastguard Worker blocks[0].reg_available[def.physReg().reg() + i] = vmem_latency;
535*61046927SAndroid Build Coastguard Worker }
536*61046927SAndroid Build Coastguard Worker
537*61046927SAndroid Build Coastguard Worker for (Block& block : program->blocks) {
538*61046927SAndroid Build Coastguard Worker BlockCycleEstimator& block_est = blocks[block.index];
539*61046927SAndroid Build Coastguard Worker for (unsigned pred : block.linear_preds)
540*61046927SAndroid Build Coastguard Worker block_est.join(blocks[pred]);
541*61046927SAndroid Build Coastguard Worker
542*61046927SAndroid Build Coastguard Worker for (aco_ptr<Instruction>& instr : block.instructions) {
543*61046927SAndroid Build Coastguard Worker unsigned before = block_est.cur_cycle;
544*61046927SAndroid Build Coastguard Worker block_est.add(instr);
545*61046927SAndroid Build Coastguard Worker instr->pass_flags = block_est.cur_cycle - before;
546*61046927SAndroid Build Coastguard Worker }
547*61046927SAndroid Build Coastguard Worker
548*61046927SAndroid Build Coastguard Worker /* TODO: it would be nice to be able to consider estimated loop trip
549*61046927SAndroid Build Coastguard Worker * counts used for loop unrolling.
550*61046927SAndroid Build Coastguard Worker */
551*61046927SAndroid Build Coastguard Worker
552*61046927SAndroid Build Coastguard Worker /* TODO: estimate the trip_count of divergent loops (those which break
553*61046927SAndroid Build Coastguard Worker * divergent) higher than of uniform loops
554*61046927SAndroid Build Coastguard Worker */
555*61046927SAndroid Build Coastguard Worker
556*61046927SAndroid Build Coastguard Worker /* Assume loops execute 8-2 times, uniform branches are taken 50% the time,
557*61046927SAndroid Build Coastguard Worker * and any lane in the wave takes a side of a divergent branch 75% of the
558*61046927SAndroid Build Coastguard Worker * time.
559*61046927SAndroid Build Coastguard Worker */
560*61046927SAndroid Build Coastguard Worker double iter = 1.0;
561*61046927SAndroid Build Coastguard Worker iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
562*61046927SAndroid Build Coastguard Worker iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
563*61046927SAndroid Build Coastguard Worker iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
564*61046927SAndroid Build Coastguard Worker iter *= pow(0.5, block.uniform_if_depth);
565*61046927SAndroid Build Coastguard Worker iter *= pow(0.75, block.divergent_if_logical_depth);
566*61046927SAndroid Build Coastguard Worker
567*61046927SAndroid Build Coastguard Worker bool divergent_if_linear_else =
568*61046927SAndroid Build Coastguard Worker block.logical_preds.empty() && block.linear_preds.size() == 1 &&
569*61046927SAndroid Build Coastguard Worker block.linear_succs.size() == 1 &&
570*61046927SAndroid Build Coastguard Worker program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
571*61046927SAndroid Build Coastguard Worker if (divergent_if_linear_else)
572*61046927SAndroid Build Coastguard Worker iter *= 0.25;
573*61046927SAndroid Build Coastguard Worker
574*61046927SAndroid Build Coastguard Worker latency += block_est.cur_cycle * iter;
575*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
576*61046927SAndroid Build Coastguard Worker usage[i] += block_est.res_usage[i] * iter;
577*61046927SAndroid Build Coastguard Worker }
578*61046927SAndroid Build Coastguard Worker
579*61046927SAndroid Build Coastguard Worker /* This likely exaggerates the effectiveness of parallelism because it
580*61046927SAndroid Build Coastguard Worker * ignores instruction ordering. It can assume there might be SALU/VALU/etc
581*61046927SAndroid Build Coastguard Worker * work to from other waves while one is idle but that might not be the case
582*61046927SAndroid Build Coastguard Worker * because those other waves have not reached such a point yet.
583*61046927SAndroid Build Coastguard Worker */
584*61046927SAndroid Build Coastguard Worker
585*61046927SAndroid Build Coastguard Worker double parallelism = program->num_waves;
586*61046927SAndroid Build Coastguard Worker for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
587*61046927SAndroid Build Coastguard Worker if (usage[i] > 0.0)
588*61046927SAndroid Build Coastguard Worker parallelism = MIN2(parallelism, latency / usage[i]);
589*61046927SAndroid Build Coastguard Worker }
590*61046927SAndroid Build Coastguard Worker double waves_per_cycle = 1.0 / latency * parallelism;
591*61046927SAndroid Build Coastguard Worker double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);
592*61046927SAndroid Build Coastguard Worker
593*61046927SAndroid Build Coastguard Worker double max_utilization = 1.0;
594*61046927SAndroid Build Coastguard Worker if (program->workgroup_size != UINT_MAX)
595*61046927SAndroid Build Coastguard Worker max_utilization =
596*61046927SAndroid Build Coastguard Worker program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
597*61046927SAndroid Build Coastguard Worker wave64_per_cycle *= max_utilization;
598*61046927SAndroid Build Coastguard Worker
599*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_latency] = round(latency);
600*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
601*61046927SAndroid Build Coastguard Worker
602*61046927SAndroid Build Coastguard Worker if (debug_flags & DEBUG_PERF_INFO) {
603*61046927SAndroid Build Coastguard Worker aco_print_program(program, stderr, print_no_ssa | print_perf_info);
604*61046927SAndroid Build Coastguard Worker
605*61046927SAndroid Build Coastguard Worker fprintf(stderr, "num_waves: %u\n", program->num_waves);
606*61046927SAndroid Build Coastguard Worker fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
607*61046927SAndroid Build Coastguard Worker fprintf(stderr, "branch_sendmsg_usage: %f\n",
608*61046927SAndroid Build Coastguard Worker usage[(int)BlockCycleEstimator::branch_sendmsg]);
609*61046927SAndroid Build Coastguard Worker fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
610*61046927SAndroid Build Coastguard Worker fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
611*61046927SAndroid Build Coastguard Worker fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
612*61046927SAndroid Build Coastguard Worker fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
613*61046927SAndroid Build Coastguard Worker fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
614*61046927SAndroid Build Coastguard Worker fprintf(stderr, "latency: %f\n", latency);
615*61046927SAndroid Build Coastguard Worker fprintf(stderr, "parallelism: %f\n", parallelism);
616*61046927SAndroid Build Coastguard Worker fprintf(stderr, "max_utilization: %f\n", max_utilization);
617*61046927SAndroid Build Coastguard Worker fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
618*61046927SAndroid Build Coastguard Worker fprintf(stderr, "\n");
619*61046927SAndroid Build Coastguard Worker }
620*61046927SAndroid Build Coastguard Worker }
621*61046927SAndroid Build Coastguard Worker
622*61046927SAndroid Build Coastguard Worker void
collect_postasm_stats(Program * program,const std::vector<uint32_t> & code)623*61046927SAndroid Build Coastguard Worker collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
624*61046927SAndroid Build Coastguard Worker {
625*61046927SAndroid Build Coastguard Worker program->statistics[aco_statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
626*61046927SAndroid Build Coastguard Worker }
627*61046927SAndroid Build Coastguard Worker
628*61046927SAndroid Build Coastguard Worker Instruction_cycle_info
get_cycle_info(const Program & program,const Instruction & instr)629*61046927SAndroid Build Coastguard Worker get_cycle_info(const Program& program, const Instruction& instr)
630*61046927SAndroid Build Coastguard Worker {
631*61046927SAndroid Build Coastguard Worker perf_info info = get_perf_info(program, instr);
632*61046927SAndroid Build Coastguard Worker return Instruction_cycle_info{(unsigned)info.latency, std::max(info.cost0, info.cost1)};
633*61046927SAndroid Build Coastguard Worker }
634*61046927SAndroid Build Coastguard Worker
635*61046927SAndroid Build Coastguard Worker } // namespace aco
636