/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "util/crc32.h"

#include <algorithm>
#include <deque>
#include <set>
#include <vector>

namespace aco {

namespace {

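/* Rough per-block cycle model: tracks when each hardware resource and each
 * physical register becomes available again, plus the outstanding memory
 * operations per wait counter, so that block latency can be estimated.
 */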
class BlockCycleEstimator {
public:
   enum resource {
      null = 0,
      scalar,
      branch_sendmsg,
      valu,
      valu_complex,
      lds,
      export_gds,
      vmem,
      resource_count,
   };

   BlockCycleEstimator(Program* program_) : program(program_) {}

   Program* program;

   int32_t cur_cycle = 0;
   int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
   unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
   int32_t reg_available[512] = {0};
   std::deque<int32_t> mem_ops[wait_type_num];

   void add(aco_ptr<Instruction>& instr);
   void join(const BlockCycleEstimator& other);

private:
   unsigned get_waitcnt_cost(wait_imm imm);
   unsigned get_dependency_cost(aco_ptr<Instruction>& instr);

   void use_resources(aco_ptr<Instruction>& instr);
   int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
};

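/* Latency and per-resource issue cost of a single instruction. rsrc1/cost1
 * describe a second unit some instructions also occupy (e.g. valu_complex).
 */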
struct perf_info {
   int latency;

   BlockCycleEstimator::resource rsrc0;
   unsigned cost0;

   BlockCycleEstimator::resource rsrc1;
   unsigned cost1;
};

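/* Whether the instruction is a candidate for GFX11+ wave64 VALU dual issue:
 * only a subset of VALU opcodes qualify, and the FMA-mix opcodes additionally
 * require matching dst/acc types and, when all three sources are VGPRs, at
 * least two identical source registers.
 */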
static bool
is_dual_issue_capable(const Program& program, const Instruction& instr)
{
   if (program.gfx_level < GFX11 || !instr.isVALU() || instr.isDPP())
      return false;

   switch (instr.opcode) {
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmaak_f32:
   case aco_opcode::v_fmamk_f32:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_sub_f32:
   case aco_opcode::v_subrev_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mov_b32:
   case aco_opcode::v_movreld_b32:
   case aco_opcode::v_movrels_b32:
   case aco_opcode::v_movrelsd_b32:
   case aco_opcode::v_movrelsd_2_b32:
   case aco_opcode::v_cndmask_b32:
   case aco_opcode::v_writelane_b32_e64:
   case aco_opcode::v_mov_b16:
   case aco_opcode::v_cndmask_b16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2c_f32_f16: return true;
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16: {
      /* dst and acc type must match */
      if (instr.valu().opsel_hi[2] == (instr.opcode == aco_opcode::v_fma_mix_f32))
         return false;

      /* If all operands are vgprs, two must be the same. */
      for (unsigned i = 0; i < 3; i++) {
         if (instr.operands[i].isConstant() || instr.operands[i].isOfType(RegType::sgpr))
            return true;
         for (unsigned j = 0; j < i; j++) {
            if (instr.operands[i].physReg() == instr.operands[j].physReg())
               return true;
         }
      }
      return false;
   }
   default: return false;
   }
}

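/* Latency and issue-cost table per instruction class. The numbers are rough
 * estimates; GFX10+ and older generations use separate tables.
 */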
static perf_info
get_perf_info(const Program& program, const Instruction& instr)
{
   instr_class cls = instr_info.classes[(int)instr.opcode];

#define WAIT(res) BlockCycleEstimator::res, 0
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt

   if (program.gfx_level >= GFX10) {
      /* fp64 might be incorrect */
      switch (cls) {
      case instr_class::valu32:
      case instr_class::valu_convert32:
      case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
      case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
      case instr_class::valu_quarter_rate32:
         return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
      case instr_class::valu_transcendental32:
         return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
      case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_add:
         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_convert:
         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_transcendental:
         return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
      case instr_class::sfpu: return {4, WAIT_USE(scalar, 1)};
      case instr_class::valu_pseudo_scalar_trans:
         return {7, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 1)};
      case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
      case instr_class::branch:
      case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
      case instr_class::ds:
         return instr.isDS() && instr.ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
                                               : perf_info{0, WAIT_USE(lds, 1)};
      case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
      case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
      case instr_class::wmma: {
         /* int8 and (b)f16 have the same performance. */
         uint8_t cost = instr.opcode == aco_opcode::v_wmma_i32_16x16x16_iu4 ? 16 : 32;
         return {cost, WAIT_USE(valu, cost)};
      }
      case instr_class::barrier:
      case instr_class::waitcnt:
      case instr_class::other:
      default: return {0};
      }
   } else {
      switch (cls) {
      case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
      case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
      case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_fma:
         return program.dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
                                           : perf_info{16, WAIT_USE(valu, 16)};
      case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
      case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
      case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
      case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
      case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
      case instr_class::branch:
      case instr_class::sendmsg: return {8, WAIT_USE(branch_sendmsg, 8)};
      case instr_class::ds:
         return instr.isDS() && instr.ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
                                               : perf_info{4, WAIT_USE(lds, 4)};
      case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
      case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
      case instr_class::barrier:
      case instr_class::waitcnt:
      case instr_class::other:
      default: return {4};
      }
   }

#undef WAIT_USE
#undef WAIT
}

void
BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   if (perf.rsrc0 != resource_count) {
      res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
      res_usage[(int)perf.rsrc0] += perf.cost0;
   }

   if (perf.rsrc1 != resource_count) {
      res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
      res_usage[(int)perf.rsrc1] += perf.cost1;
   }
}

int32_t
BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   int32_t cost = 0;
   if (perf.rsrc0 != resource_count)
      cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
   if (perf.rsrc1 != resource_count)
      cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);

   return cost;
}

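/* Estimated completion latency of an instruction for each wait counter it
 * increments (zero for counters it does not touch).
 */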
static std::array<unsigned, wait_type_num>
get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
    * depends a lot on the situation. */

   std::array<unsigned, wait_type_num> info{};

   if (instr->isEXP()) {
      info[wait_type_exp] = 16;
   } else if (instr->isLDSDIR()) {
      info[wait_type_exp] = 13;
   } else if (instr->isFlatLike()) {
      info[wait_type_lgkm] = instr->isFlat() ? 20 : 0;
      if (!instr->definitions.empty() || gfx_level < GFX10)
         info[wait_type_vm] = 320;
      else
         info[wait_type_vs] = 320;
   } else if (instr->isSMEM()) {
      wait_type type = gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm;
      if (instr->definitions.empty()) {
         info[type] = 200;
      } else if (instr->operands.empty()) { /* s_memtime and s_memrealtime */
         info[type] = 1;
      } else {
         bool likely_desc_load = instr->operands[0].size() == 2;
         bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
         bool const_offset =
            instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());

         if (likely_desc_load || const_offset)
            info[type] = 30; /* likely to hit L0 cache */
         else
            info[type] = 200;
      }
   } else if (instr->isDS()) {
      info[wait_type_lgkm] = 20;
   } else if (instr->isVMEM() && instr->definitions.empty() && gfx_level >= GFX10) {
      info[wait_type_vs] = 320;
   } else if (instr->isVMEM()) {
      uint8_t vm_type = get_vmem_type(gfx_level, instr.get());
      wait_type type = wait_type_vm;
      if (gfx_level >= GFX12 && vm_type == vmem_bvh)
         type = wait_type_bvh;
      else if (gfx_level >= GFX12 && vm_type == vmem_sampler)
         type = wait_type_sample;
      info[type] = 320;
   }

   return info;
}

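/* Determine which counter values this instruction effectively waits on:
 * either an explicit wait encoded in the instruction, or the implicit wait
 * for a counter to drop below its maximum before it can be incremented.
 */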
static wait_imm
get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
{
   wait_imm imm;
   if (instr->opcode == aco_opcode::s_endpgm) {
      for (unsigned i = 0; i < wait_type_num; i++)
         imm[i] = 0;
   } else if (imm.unpack(program->gfx_level, instr.get())) {
   } else if (instr->isVINTERP_INREG()) {
      imm.exp = instr->vinterp_inreg().wait_exp;
      if (imm.exp == 0x7)
         imm.exp = wait_imm::unset_counter;
   } else {
      /* If an instruction increases a counter, it waits for it to be below maximum first. */
      std::array<unsigned, wait_type_num> wait_info =
         get_wait_counter_info(program->gfx_level, instr);
      wait_imm max = wait_imm::max(program->gfx_level);
      for (unsigned i = 0; i < wait_type_num; i++) {
         if (wait_info[i])
            imm[i] = max[i] - 1;
      }
   }
   return imm;
}

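/* Cycles the instruction has to stall before issue: waiting on outstanding
 * memory operations selected by its wait_imm and on the registers it reads.
 */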
unsigned
BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
{
   int deps_available = cur_cycle;

   wait_imm imm = get_wait_imm(program, instr);
   for (unsigned i = 0; i < wait_type_num; i++) {
      if (imm[i] == wait_imm::unset_counter)
         continue;
      for (int j = 0; j < (int)mem_ops[i].size() - imm[i]; j++)
         deps_available = MAX2(deps_available, mem_ops[i][j]);
   }

   if (instr->opcode == aco_opcode::s_endpgm) {
      for (unsigned i = 0; i < 512; i++)
         deps_available = MAX2(deps_available, reg_available[i]);
   } else if (program->gfx_level >= GFX10) {
      for (Operand& op : instr->operands) {
         if (op.isConstant() || op.isUndefined())
            continue;
         for (unsigned i = 0; i < op.size(); i++)
            deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
      }
   }

   if (program->gfx_level < GFX10)
      deps_available = align(deps_available, 4);

   return deps_available - cur_cycle;
}

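/* Whether the instruction executes on the vector pipelines (VALU/VMEM/LDS/
 * export), i.e. whether a wave64 wave may need two passes to issue it.
 */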
static bool
is_vector(aco_opcode op)
{
   switch (instr_info.classes[(int)op]) {
   case instr_class::valu32:
   case instr_class::valu_convert32:
   case instr_class::valu_fma:
   case instr_class::valu_double:
   case instr_class::valu_double_add:
   case instr_class::valu_double_convert:
   case instr_class::valu_double_transcendental:
   case instr_class::vmem:
   case instr_class::ds:
   case instr_class::exp:
   case instr_class::valu64:
   case instr_class::valu_quarter_rate32:
   case instr_class::valu_transcendental32: return true;
   default: return false;
   }
}

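/* Advance the block model by one instruction: stall for dependencies, wait
 * for the required execution resources, issue (twice for wave64 vector
 * instructions that cannot be dual-issued), then record when outstanding
 * memory operations finish and when the written registers become available.
 */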
void
BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   cur_cycle += get_dependency_cost(instr);

   unsigned start;
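   /* Wave64 vector instructions that the hardware cannot dual-issue are
    * modeled as two issue slots (a second pass over the other 32 lanes). */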
   bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 &&
                     is_vector(instr->opcode) && !is_dual_issue_capable(*program, *instr) &&
                     program->workgroup_size > 32;
   for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
      cur_cycle += cycles_until_res_available(instr);

      start = cur_cycle;
      use_resources(instr);

      /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
      cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency;
   }

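   /* Update outstanding memory operations per wait counter: the wait implied
    * by this instruction retires the oldest entries until at most imm[i]
    * remain, and each counter the instruction increments gains a new entry
    * with its expected completion cycle. */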
   wait_imm imm = get_wait_imm(program, instr);
   for (unsigned i = 0; i < wait_type_num; i++) {
      while (mem_ops[i].size() > imm[i])
         mem_ops[i].pop_front();
   }

   std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program->gfx_level, instr);
   for (unsigned i = 0; i < wait_type_num; i++) {
      if (wait_info[i])
         mem_ops[i].push_back(cur_cycle + wait_info[i]);
   }

   /* This is inaccurate but shouldn't affect anything after waitcnt insertion.
    * Before waitcnt insertion, this is necessary to consider memory operations.
    */
   unsigned latency = 0;
   for (unsigned i = 0; i < wait_type_num; i++)
      latency = MAX2(latency, i == wait_type_vs ? 0 : wait_info[i]);
   int32_t result_available = start + MAX2(perf.latency, (int32_t)latency);

   for (Definition& def : instr->definitions) {
      int32_t* available = &reg_available[def.physReg().reg()];
      for (unsigned i = 0; i < def.size(); i++)
         available[i] = MAX2(available[i], result_available);
   }
}

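/* Merge the end-of-block state of a predecessor into this block's start state,
 * shifting all timestamps so that they are relative to this block's cycle 0.
 */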
void
BlockCycleEstimator::join(const BlockCycleEstimator& pred)
{
   assert(cur_cycle == 0);

   for (unsigned i = 0; i < (unsigned)resource_count; i++) {
      assert(res_usage[i] == 0);
      res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
   }

   for (unsigned i = 0; i < 512; i++)
      reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);

   for (unsigned i = 0; i < wait_type_num; i++) {
      std::deque<int32_t>& ops = mem_ops[i];
      const std::deque<int32_t>& pred_ops = pred.mem_ops[i];
      for (unsigned j = 0; j < MIN2(ops.size(), pred_ops.size()); j++)
         ops.rbegin()[j] = MAX2(ops.rbegin()[j], pred_ops.rbegin()[j] - pred.cur_cycle);
      for (int j = pred_ops.size() - ops.size() - 1; j >= 0; j--)
         ops.push_front(pred_ops[j] - pred.cur_cycle);
   }
}

} /* end namespace */

/* sgpr_presched/vgpr_presched */
void
collect_presched_stats(Program* program)
{
   RegisterDemand presched_demand;
   for (Block& block : program->blocks)
      presched_demand.update(block.register_demand);
   program->statistics[aco_statistic_sgpr_presched] = presched_demand.sgpr;
   program->statistics[aco_statistic_vgpr_presched] = presched_demand.vgpr;
}

/* instructions/branches/vmem_clauses/smem_clauses/cycles */
void
collect_preasm_stats(Program* program)
{
   for (Block& block : program->blocks) {
      std::set<Instruction*> vmem_clause;
      std::set<Instruction*> smem_clause;

      program->statistics[aco_statistic_instructions] += block.instructions.size();

      for (aco_ptr<Instruction>& instr : block.instructions) {
         const bool is_branch =
            instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
         if (is_branch)
            program->statistics[aco_statistic_branches]++;

         if (instr->isVALU() || instr->isVINTRP())
            program->statistics[aco_statistic_valu]++;
         if (instr->isSALU() && !instr->isSOPP() &&
             instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
            program->statistics[aco_statistic_salu]++;
         if (instr->isVOPD())
            program->statistics[aco_statistic_vopd]++;

         if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
             !instr->operands.empty()) {
            if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
                             [&](Instruction* other)
                             { return should_form_clause(instr.get(), other); }))
               program->statistics[aco_statistic_vmem_clauses]++;
            vmem_clause.insert(instr.get());

            program->statistics[aco_statistic_vmem]++;
         } else {
            vmem_clause.clear();
         }

         if (instr->isSMEM() && !instr->operands.empty()) {
            if (std::none_of(smem_clause.begin(), smem_clause.end(),
                             [&](Instruction* other)
                             { return should_form_clause(instr.get(), other); }))
               program->statistics[aco_statistic_smem_clauses]++;
            smem_clause.insert(instr.get());

            program->statistics[aco_statistic_smem]++;
         } else {
            smem_clause.clear();
         }
      }
   }

   double latency = 0;
   double usage[(int)BlockCycleEstimator::resource_count] = {0};
   std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);

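   /* Shader arguments that are still being loaded from memory at program start
    * are treated as outstanding VMEM operations in the entry block. */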
   constexpr const unsigned vmem_latency = 320;
   for (const Definition def : program->args_pending_vmem) {
      blocks[0].mem_ops[wait_type_vm].push_back(vmem_latency);
      for (unsigned i = 0; i < def.size(); i++)
         blocks[0].reg_available[def.physReg().reg() + i] = vmem_latency;
   }

   for (Block& block : program->blocks) {
      BlockCycleEstimator& block_est = blocks[block.index];
      for (unsigned pred : block.linear_preds)
         block_est.join(blocks[pred]);

      for (aco_ptr<Instruction>& instr : block.instructions) {
         unsigned before = block_est.cur_cycle;
         block_est.add(instr);
         instr->pass_flags = block_est.cur_cycle - before;
      }

      /* TODO: it would be nice to be able to consider estimated loop trip
       * counts used for loop unrolling.
       */

      /* TODO: estimate the trip_count of divergent loops (those which break
       * divergent) higher than of uniform loops
       */

      /* Assume loops execute 8-2 times (8 iterations for the outermost loop,
       * 4 for the next level, 2 for any deeper level), uniform branches are
       * taken 50% of the time, and any lane in the wave takes a given side of
       * a divergent branch 75% of the time.
       */
      double iter = 1.0;
      iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
      iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
      iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
      iter *= pow(0.5, block.uniform_if_depth);
      iter *= pow(0.75, block.divergent_if_logical_depth);

      bool divergent_if_linear_else =
         block.logical_preds.empty() && block.linear_preds.size() == 1 &&
         block.linear_succs.size() == 1 &&
         program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
      if (divergent_if_linear_else)
         iter *= 0.25;

      latency += block_est.cur_cycle * iter;
      for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
         usage[i] += block_est.res_usage[i] * iter;
   }

   /* This likely exaggerates the effectiveness of parallelism because it
    * ignores instruction ordering. It assumes there might be SALU/VALU/etc.
    * work from other waves while one is idle, but that might not be the case
    * because those other waves have not reached such a point yet.
    */

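   /* A resource that is busy for `usage` cycles out of `latency` total can
    * overlap at most latency/usage waves, so the most contended resource
    * bounds the achievable parallelism, capped by the resident wave count. */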
   double parallelism = program->num_waves;
   for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
      if (usage[i] > 0.0)
         parallelism = MIN2(parallelism, latency / usage[i]);
   }
   double waves_per_cycle = 1.0 / latency * parallelism;
   double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);

   double max_utilization = 1.0;
   if (program->workgroup_size != UINT_MAX)
      max_utilization =
         program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
   wave64_per_cycle *= max_utilization;

   program->statistics[aco_statistic_latency] = round(latency);
   program->statistics[aco_statistic_inv_throughput] = round(1.0 / wave64_per_cycle);

   if (debug_flags & DEBUG_PERF_INFO) {
      aco_print_program(program, stderr, print_no_ssa | print_perf_info);

      fprintf(stderr, "num_waves: %u\n", program->num_waves);
      fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
      fprintf(stderr, "branch_sendmsg_usage: %f\n",
              usage[(int)BlockCycleEstimator::branch_sendmsg]);
      fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
      fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
      fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
      fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
      fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
      fprintf(stderr, "latency: %f\n", latency);
      fprintf(stderr, "parallelism: %f\n", parallelism);
      fprintf(stderr, "max_utilization: %f\n", max_utilization);
      fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
      fprintf(stderr, "\n");
   }
}

void
collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
{
   program->statistics[aco_statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
}

Instruction_cycle_info
get_cycle_info(const Program& program, const Instruction& instr)
{
   perf_info info = get_perf_info(program, instr);
   return Instruction_cycle_info{(unsigned)info.latency, std::max(info.cost0, info.cost1)};
}

} // namespace aco