1 /* -*- mesa-c++ -*-
2 * Copyright 2022 Collabora LTD
3 * Author: Gert Wollny <[email protected]>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "sfn_scheduler.h"
8
9 #include "../r600_isa.h"
10
11 #include "amd_family.h"
12 #include "sfn_alu_defines.h"
13 #include "sfn_debug.h"
14 #include "sfn_instr_alugroup.h"
15 #include "sfn_instr_controlflow.h"
16 #include "sfn_instr_export.h"
17 #include "sfn_instr_fetch.h"
18 #include "sfn_instr_lds.h"
19 #include "sfn_instr_mem.h"
20 #include "sfn_instr_tex.h"
21
22 #include <algorithm>
23 #include <sstream>
24
25 namespace r600 {
26
27 class CollectInstructions : public InstrVisitor {
28
29 public:
CollectInstructions(ValueFactory & vf)30 CollectInstructions(ValueFactory& vf):
31 m_value_factory(vf)
32 {
33 }
34
visit(AluInstr * instr)35 void visit(AluInstr *instr) override
36 {
37 if (instr->has_alu_flag(alu_is_trans))
38 alu_trans.push_back(instr);
39 else {
40 if (instr->alu_slots() == 1)
41 alu_vec.push_back(instr);
42 else
43 alu_groups.push_back(instr->split(m_value_factory));
44 }
45 }
visit(AluGroup * instr)46 void visit(AluGroup *instr) override { alu_groups.push_back(instr); }
visit(TexInstr * instr)47 void visit(TexInstr *instr) override { tex.push_back(instr); }
visit(ExportInstr * instr)48 void visit(ExportInstr *instr) override { exports.push_back(instr); }
visit(FetchInstr * instr)49 void visit(FetchInstr *instr) override { fetches.push_back(instr); }
visit(Block * instr)50 void visit(Block *instr) override
51 {
52 for (auto& i : *instr)
53 i->accept(*this);
54 }
55
visit(ControlFlowInstr * instr)56 void visit(ControlFlowInstr *instr) override
57 {
58 assert(!m_cf_instr);
59 m_cf_instr = instr;
60 }
61
visit(IfInstr * instr)62 void visit(IfInstr *instr) override
63 {
64 assert(!m_cf_instr);
65 m_cf_instr = instr;
66 }
67
visit(EmitVertexInstr * instr)68 void visit(EmitVertexInstr *instr) override
69 {
70 assert(!m_cf_instr);
71 m_cf_instr = instr;
72 }
73
visit(ScratchIOInstr * instr)74 void visit(ScratchIOInstr *instr) override { mem_write_instr.push_back(instr); }
75
visit(StreamOutInstr * instr)76 void visit(StreamOutInstr *instr) override { mem_write_instr.push_back(instr); }
77
visit(MemRingOutInstr * instr)78 void visit(MemRingOutInstr *instr) override { mem_ring_writes.push_back(instr); }
79
visit(GDSInstr * instr)80 void visit(GDSInstr *instr) override { gds_op.push_back(instr); }
81
visit(WriteTFInstr * instr)82 void visit(WriteTFInstr *instr) override { write_tf.push_back(instr); }
83
visit(LDSReadInstr * instr)84 void visit(LDSReadInstr *instr) override
85 {
86 std::vector<AluInstr *> buffer;
87 m_last_lds_instr = instr->split(buffer, m_last_lds_instr);
88 for (auto& i : buffer) {
89 i->accept(*this);
90 }
91 }
92
visit(LDSAtomicInstr * instr)93 void visit(LDSAtomicInstr *instr) override
94 {
95 std::vector<AluInstr *> buffer;
96 m_last_lds_instr = instr->split(buffer, m_last_lds_instr);
97 for (auto& i : buffer) {
98 i->accept(*this);
99 }
100 }
101
visit(RatInstr * instr)102 void visit(RatInstr *instr) override { rat_instr.push_back(instr); }
103
104 std::list<AluInstr *> alu_trans;
105 std::list<AluInstr *> alu_vec;
106 std::list<TexInstr *> tex;
107 std::list<AluGroup *> alu_groups;
108 std::list<ExportInstr *> exports;
109 std::list<FetchInstr *> fetches;
110 std::list<WriteOutInstr *> mem_write_instr;
111 std::list<MemRingOutInstr *> mem_ring_writes;
112 std::list<GDSInstr *> gds_op;
113 std::list<WriteTFInstr *> write_tf;
114 std::list<RatInstr *> rat_instr;
115
116 Instr *m_cf_instr{nullptr};
117 ValueFactory& m_value_factory;
118
119 AluInstr *m_last_lds_instr{nullptr};
120 };
121
/* Hash for (array id, channel) pairs: the second element (presumably the
 * channel, 0..7 — the shift width suggests so) occupies the low three bits
 * of the combined key. */
struct ArrayChanHash
{
   std::size_t operator()(std::pair<int, int> const& s) const noexcept
   {
      const std::size_t key =
         (static_cast<std::size_t>(s.first) << 3) | static_cast<std::size_t>(s.second);
      return std::hash<std::size_t>{}(key);
   }
};

/* Set of (array id, channel) pairs, used to record pending array accesses. */
using ArrayCheckSet = std::unordered_set<std::pair<int, int>, ArrayChanHash>;
131
/* Re-schedules the instructions of a shader into hardware blocks
 * (ALU/TEX/VTX/GDS/CF), honoring readiness, kcache reservations, LDS
 * grouping and AR/IDX register hazards. */
class BlockScheduler {
public:
   BlockScheduler(r600_chip_class chip_class,
                  radeon_family family);

   /* Schedule every block of the shader and replace the shader's
    * function body with the newly built block list. */
   void run(Shader *shader);

   /* Mark the last scheduled pos/pixel/param export as "last export". */
   void finalize();

private:
   /* Re-schedule one source block; the result (possibly several hardware
    * blocks) is appended to out_blocks. */
   void
   schedule_block(Block& in_block, Shader::ShaderBlocks& out_blocks, ValueFactory& vf);

   /* Move instructions whose dependencies are satisfied from the
    * collector's lists into the *_ready lists below; returns true if
    * anything became ready. */
   bool collect_ready(CollectInstructions& available);

   template <typename T>
   bool collect_ready_type(std::list<T *>& ready, std::list<T *>& orig);

   bool collect_ready_alu_vec(std::list<AluInstr *>& ready,
                              std::list<AluInstr *>& available);

   bool schedule_tex(Shader::ShaderBlocks& out_blocks);
   bool schedule_vtx(Shader::ShaderBlocks& out_blocks);

   template <typename I>
   bool schedule_gds(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list);

   template <typename I>
   bool schedule_cf(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list);

   /* Build and emit one ALU group from the ready lists (or emit a
    * pre-built group); may open a new ALU CF block. */
   bool schedule_alu(Shader::ShaderBlocks& out_blocks);
   void start_new_block(Shader::ShaderBlocks& out_blocks, Block::Type type);

   bool schedule_alu_to_group_vec(AluGroup *group);
   bool schedule_alu_to_group_trans(AluGroup *group, std::list<AluInstr *>& readylist);

   bool schedule_exports(Shader::ShaderBlocks& out_blocks,
                         std::list<ExportInstr *>& ready_list);

   /* Split an ALU block that exceeded the CF slot limit into sub-blocks. */
   void maybe_split_alu_block(Shader::ShaderBlocks& out_blocks);

   /* Schedule one instruction from the list into the current block. */
   template <typename I> bool schedule(std::list<I *>& ready_list);

   /* Schedule as many instructions as fit into the current block. */
   template <typename I> bool schedule_block(std::list<I *>& ready_list);

   /* Track the array writes of the last emitted group so later reads can
    * be checked for read-after-write conflicts. */
   void update_array_writes(const AluGroup& group);
   bool check_array_reads(const AluInstr& instr);
   bool check_array_reads(const AluGroup& group);

   /* Ready lists, one per instruction class. */
   std::list<AluInstr *> alu_vec_ready;
   std::list<AluInstr *> alu_trans_ready;
   std::list<AluGroup *> alu_groups_ready;
   std::list<TexInstr *> tex_ready;
   std::list<ExportInstr *> exports_ready;
   std::list<FetchInstr *> fetches_ready;
   std::list<WriteOutInstr *> memops_ready;
   std::list<MemRingOutInstr *> mem_ring_writes_ready;
   std::list<GDSInstr *> gds_ready;
   std::list<WriteTFInstr *> write_tf_ready;
   std::list<RatInstr *> rat_instr_ready;

   /* Which instruction class the scheduler currently tries to emit
    * ("shed" is the historical spelling used throughout this file). */
   enum {
      sched_alu,
      sched_tex,
      sched_fetch,
      sched_free,
      sched_mem_ring,
      sched_gds,
      sched_write_tf,
      sched_rat,
   } current_shed;

   /* Last export seen per type; finalize() flags them as final. */
   ExportInstr *m_last_pos;
   ExportInstr *m_last_pixel;
   ExportInstr *m_last_param;

   Block *m_current_block;

   /* Number of LDS address registers in flight; bounded when collecting
    * ready instructions to limit register pressure. */
   int m_lds_addr_count{0};
   int m_alu_groups_scheduled{0};
   r600_chip_class m_chip_class;
   radeon_family m_chip_family;
   /* IDX0/IDX1 load tracking: *_loading is set while the loading group is
    * being built, *_pending once that group has been emitted. */
   bool m_idx0_loading{false};
   bool m_idx1_loading{false};
   bool m_idx0_pending{false};
   bool m_idx1_pending{false};

   /* Chip-specific NOP workarounds (set from the family in the ctor). */
   bool m_nop_after_rel_dest{false};
   bool m_nop_befor_rel_src{false};
   uint32_t m_next_block_id{1};

   /* (array, channel) pairs written indirectly/directly by the last
    * group, used by check_array_reads(). */
   ArrayCheckSet m_last_indirect_array_write;
   ArrayCheckSet m_last_direct_array_write;
};
227
228 Shader *
schedule(Shader * original)229 schedule(Shader *original)
230 {
231 Block::set_chipclass(original->chip_class());
232 AluGroup::set_chipclass(original->chip_class());
233
234 sfn_log << SfnLog::schedule << "Original shader\n";
235 if (sfn_log.has_debug_flag(SfnLog::schedule)) {
236 std::stringstream ss;
237 original->print(ss);
238 sfn_log << ss.str() << "\n\n";
239 }
240
241 // TODO later it might be necessary to clone the shader
242 // to be able to re-start scheduling
243
244 auto scheduled_shader = original;
245
246 BlockScheduler s(original->chip_class(), original->chip_family());
247
248 s.run(scheduled_shader);
249 s.finalize();
250
251 sfn_log << SfnLog::schedule << "Scheduled shader\n";
252 if (sfn_log.has_debug_flag(SfnLog::schedule)) {
253 std::stringstream ss;
254 scheduled_shader->print(ss);
255 sfn_log << ss.str() << "\n\n";
256 }
257
258 return scheduled_shader;
259 }
260
BlockScheduler(r600_chip_class chip_class,radeon_family chip_family)261 BlockScheduler::BlockScheduler(r600_chip_class chip_class,
262 radeon_family chip_family):
263 current_shed(sched_alu),
264 m_last_pos(nullptr),
265 m_last_pixel(nullptr),
266 m_last_param(nullptr),
267 m_current_block(nullptr),
268 m_chip_class(chip_class),
269 m_chip_family(chip_family)
270 {
271 m_nop_after_rel_dest = chip_family == CHIP_RV770;
272
273 m_nop_befor_rel_src = m_chip_class == ISA_CC_R600 &&
274 chip_family != CHIP_RV670 &&
275 chip_family != CHIP_RS780 &&
276 chip_family != CHIP_RS880;
277 }
278
279 void
run(Shader * shader)280 BlockScheduler::run(Shader *shader)
281 {
282 Shader::ShaderBlocks scheduled_blocks;
283
284 for (auto& block : shader->func()) {
285 sfn_log << SfnLog::schedule << "Process block " << block->id() << "\n";
286 if (sfn_log.has_debug_flag(SfnLog::schedule)) {
287 std::stringstream ss;
288 block->print(ss);
289 sfn_log << ss.str() << "\n";
290 }
291 schedule_block(*block, scheduled_blocks, shader->value_factory());
292 }
293
294 shader->reset_function(scheduled_blocks);
295 }
296
297 void
schedule_block(Block & in_block,Shader::ShaderBlocks & out_blocks,ValueFactory & vf)298 BlockScheduler::schedule_block(Block& in_block,
299 Shader::ShaderBlocks& out_blocks,
300 ValueFactory& vf)
301 {
302
303 assert(in_block.id() >= 0);
304
305 current_shed = sched_fetch;
306 auto last_shed = sched_fetch;
307
308 CollectInstructions cir(vf);
309 in_block.accept(cir);
310
311 bool have_instr = collect_ready(cir);
312
313 m_current_block = new Block(in_block.nesting_depth(), m_next_block_id++);
314 m_current_block->set_instr_flag(Instr::force_cf);
315 assert(m_current_block->id() >= 0);
316
317 while (have_instr) {
318
319 sfn_log << SfnLog::schedule << "Have ready instructions\n";
320
321 if (alu_vec_ready.size())
322 sfn_log << SfnLog::schedule << " ALU V:" << alu_vec_ready.size() << "\n";
323
324 if (alu_trans_ready.size())
325 sfn_log << SfnLog::schedule << " ALU T:" << alu_trans_ready.size() << "\n";
326
327 if (alu_groups_ready.size())
328 sfn_log << SfnLog::schedule << " ALU G:" << alu_groups_ready.size() << "\n";
329
330 if (exports_ready.size())
331 sfn_log << SfnLog::schedule << " EXP:" << exports_ready.size() << "\n";
332 if (tex_ready.size())
333 sfn_log << SfnLog::schedule << " TEX:" << tex_ready.size() << "\n";
334 if (fetches_ready.size())
335 sfn_log << SfnLog::schedule << " FETCH:" << fetches_ready.size() << "\n";
336 if (mem_ring_writes_ready.size())
337 sfn_log << SfnLog::schedule << " MEM_RING:" << mem_ring_writes_ready.size()
338 << "\n";
339 if (memops_ready.size())
340 sfn_log << SfnLog::schedule << " MEM_OPS:" << mem_ring_writes_ready.size()
341 << "\n";
342
343 if (!m_current_block->lds_group_active() &&
344 m_current_block->expected_ar_uses() == 0) {
345 if (last_shed != sched_free && memops_ready.size() > 8)
346 current_shed = sched_free;
347 else if (mem_ring_writes_ready.size() > 15)
348 current_shed = sched_mem_ring;
349 else if (rat_instr_ready.size() > 3)
350 current_shed = sched_rat;
351 else if (tex_ready.size() > (m_chip_class >= ISA_CC_EVERGREEN ? 15 : 7))
352 current_shed = sched_tex;
353 }
354
355 switch (current_shed) {
356 case sched_alu:
357 if (!schedule_alu(out_blocks)) {
358 assert(!m_current_block->lds_group_active());
359 current_shed = sched_tex;
360 continue;
361 }
362 last_shed = current_shed;
363 break;
364 case sched_tex:
365 if (tex_ready.empty() || !schedule_tex(out_blocks)) {
366 current_shed = sched_fetch;
367 continue;
368 }
369 last_shed = current_shed;
370 break;
371 case sched_fetch:
372 if (!fetches_ready.empty()) {
373 schedule_vtx(out_blocks);
374 last_shed = current_shed;
375 }
376 current_shed = sched_gds;
377 continue;
378 case sched_gds:
379 if (!gds_ready.empty()) {
380 schedule_gds(out_blocks, gds_ready);
381 last_shed = current_shed;
382 }
383 current_shed = sched_mem_ring;
384 continue;
385 case sched_mem_ring:
386 if (mem_ring_writes_ready.empty() ||
387 !schedule_cf(out_blocks, mem_ring_writes_ready)) {
388 current_shed = sched_write_tf;
389 continue;
390 }
391 last_shed = current_shed;
392 break;
393 case sched_write_tf:
394 if (write_tf_ready.empty() || !schedule_gds(out_blocks, write_tf_ready)) {
395 current_shed = sched_rat;
396 continue;
397 }
398 last_shed = current_shed;
399 break;
400 case sched_rat:
401 if (rat_instr_ready.empty() || !schedule_cf(out_blocks, rat_instr_ready)) {
402 current_shed = sched_free;
403 continue;
404 }
405 last_shed = current_shed;
406 break;
407 case sched_free:
408 if (memops_ready.empty() || !schedule_cf(out_blocks, memops_ready)) {
409 current_shed = sched_alu;
410 break;
411 }
412 last_shed = current_shed;
413 }
414
415 have_instr = collect_ready(cir);
416 }
417
418 /* Emit exports always at end of a block */
419 while (collect_ready_type(exports_ready, cir.exports))
420 schedule_exports(out_blocks, exports_ready);
421
422 ASSERTED bool fail = false;
423
424 if (!cir.alu_groups.empty()) {
425 std::cerr << "Unscheduled ALU groups:\n";
426 for (auto& a : cir.alu_groups) {
427 std::cerr << " " << *a << "\n";
428 }
429 fail = true;
430 }
431
432 if (!cir.alu_vec.empty()) {
433 std::cerr << "Unscheduled ALU vec ops:\n";
434 for (auto& a : cir.alu_vec) {
435 std::cerr << " [" << a->block_id() << ":"
436 << a->index() <<"]:" << *a << "\n";
437 for (auto& d : a->required_instr())
438 std::cerr << " R["<< d->block_id() << ":" << d->index() <<"]:"
439 << *d << "\n";
440 }
441 fail = true;
442 }
443
444 if (!cir.alu_trans.empty()) {
445 std::cerr << "Unscheduled ALU trans ops:\n";
446 for (auto& a : cir.alu_trans) {
447 std::cerr << " " << " [" << a->block_id() << ":"
448 << a->index() <<"]:" << *a << "\n";
449 for (auto& d : a->required_instr())
450 std::cerr << " R:" << *d << "\n";
451 }
452 fail = true;
453 }
454 if (!cir.mem_write_instr.empty()) {
455 std::cerr << "Unscheduled MEM ops:\n";
456 for (auto& a : cir.mem_write_instr) {
457 std::cerr << " " << *a << "\n";
458 }
459 fail = true;
460 }
461
462 if (!cir.fetches.empty()) {
463 std::cerr << "Unscheduled Fetch ops:\n";
464 for (auto& a : cir.fetches) {
465 std::cerr << " " << *a << "\n";
466 }
467 fail = true;
468 }
469
470 if (!cir.tex.empty()) {
471 std::cerr << "Unscheduled Tex ops:\n";
472 for (auto& a : cir.tex) {
473 std::cerr << " " << *a << "\n";
474 }
475 fail = true;
476 }
477
478 if (fail) {
479 std::cerr << "Failing block:\n";
480 for (auto& i : in_block)
481 std::cerr << "[" << i->block_id() << ":" << i->index() << "] "
482 << (i->is_scheduled() ? "S " : "")
483 << *i << "\n";
484 std::cerr << "\nSo far scheduled: ";
485
486 for (auto i : *m_current_block)
487 std::cerr << "[" << i->block_id() << ":" << i->index() << "] " << *i << "\n";
488 std::cerr << "\n\n: ";
489 }
490
491 assert(cir.tex.empty());
492 assert(cir.exports.empty());
493 assert(cir.fetches.empty());
494 assert(cir.alu_vec.empty());
495 assert(cir.mem_write_instr.empty());
496 assert(cir.mem_ring_writes.empty());
497
498 assert(!fail);
499
500 if (cir.m_cf_instr) {
501 // Assert that if condition is ready
502 if (m_current_block->type() != Block::alu) {
503 start_new_block(out_blocks, Block::alu);
504 }
505 m_current_block->push_back(cir.m_cf_instr);
506 cir.m_cf_instr->set_scheduled();
507 }
508
509 if (m_current_block->type() == Block::alu)
510 maybe_split_alu_block(out_blocks);
511 else
512 out_blocks.push_back(m_current_block);
513 }
514
515 void
finalize()516 BlockScheduler::finalize()
517 {
518 if (m_last_pos)
519 m_last_pos->set_is_last_export(true);
520 if (m_last_pixel)
521 m_last_pixel->set_is_last_export(true);
522 if (m_last_param)
523 m_last_param->set_is_last_export(true);
524 }
525
/* Emit one ALU group: either a pre-built group from alu_groups_ready or a
 * fresh group filled from the vec/trans ready lists. May start a new ALU
 * CF block when kcache reservation fails or an IDX register hazard
 * requires it. Returns false when no ALU work could be scheduled. */
bool
BlockScheduler::schedule_alu(Shader::ShaderBlocks& out_blocks)
{
   bool success = false;
   AluGroup *group = nullptr;

   sfn_log << SfnLog::schedule << "Schedule alu with " <<
      m_current_block->expected_ar_uses()
           << " pending AR loads\n";

   bool has_alu_ready = !alu_vec_ready.empty() || !alu_trans_ready.empty();

   /* The ready list is priority-ordered; the head determines whether an
    * LDS access or an AR read is up next. */
   bool has_lds_ready =
      !alu_vec_ready.empty() && (*alu_vec_ready.begin())->has_lds_access();

   bool has_ar_read_ready = !alu_vec_ready.empty() &&
                            std::get<0>((*alu_vec_ready.begin())->indirect_addr());

   /* If we have ready ALU instructions we have to start a new ALU block */
   if (has_alu_ready || !alu_groups_ready.empty()) {
      if (m_current_block->type() != Block::alu) {
         start_new_block(out_blocks, Block::alu);
         m_alu_groups_scheduled = 0;
      }
   }

   /* Schedule groups first, unless we have a pending LDS instruction.
    * We don't want the LDS instructions to be too far apart because the
    * fetch + read from queue has to be in the same ALU CF block */
   if (!alu_groups_ready.empty() && !has_lds_ready && !has_ar_read_ready) {
      group = *alu_groups_ready.begin();

      if (!check_array_reads(*group)) {

         sfn_log << SfnLog::schedule << "try schedule " <<
            *group << "\n";

         /* Only start a new CF if we have no pending AR reads */
         if (m_current_block->try_reserve_kcache(*group)) {
            alu_groups_ready.erase(alu_groups_ready.begin());
            success = true;
         } else {
            if (m_current_block->expected_ar_uses() == 0) {
               start_new_block(out_blocks, Block::alu);

               if (!m_current_block->try_reserve_kcache(*group))
                  unreachable("Scheduling a group in a new block should always succeed");
               alu_groups_ready.erase(alu_groups_ready.begin());
               sfn_log << SfnLog::schedule << "Schedule ALU group\n";
               success = true;
            } else {
               sfn_log << SfnLog::schedule << "Don't add group because of " <<
                  m_current_block->expected_ar_uses()
                       << "pending AR loads\n";
               group = nullptr;
            }
         }
      }
   }

   /* No pre-built group was taken: open a fresh one and fill it from the
    * ready lists below; bail out if nothing at all can be emitted. */
   if (!group && has_alu_ready) {
      group = new AluGroup();
      sfn_log << SfnLog::schedule << "START new ALU group\n";
   } else if (!success) {
      return false;
   }

   assert(group);

   int free_slots = group->free_slots();

   while (free_slots && has_alu_ready) {
      if (!alu_vec_ready.empty())
         success |= schedule_alu_to_group_vec(group);

      /* Apparently one can't schedule a t-slot if there is already
       * an LDS instruction scheduled.
       * TODO: check whether this is only relevant for actual LDS instructions
       * or also for instructions that read from the LDS return value queue */
      if (free_slots & 0x10 && !has_lds_ready) {
         sfn_log << SfnLog::schedule << "Try schedule TRANS channel\n";
         if (!alu_trans_ready.empty())
            success |= schedule_alu_to_group_trans(group, alu_trans_ready);
         if (!alu_vec_ready.empty())
            success |= schedule_alu_to_group_trans(group, alu_vec_ready);
      }

      if (success) {
         ++m_alu_groups_scheduled;
         break;
      } else if (m_current_block->kcache_reservation_failed()) {
         // LDS read groups should not lead to impossible
         // kcache constellations
         assert(!m_current_block->lds_group_active());

         // AR is loaded but not all uses are done, we don't want
         // to start a new CF here
         assert(m_current_block->expected_ar_uses() == 0);

         // kcache reservation failed, so we have to start a new CF
         start_new_block(out_blocks, Block::alu);
      } else {
         // Ready is not empty, but we didn't schedule anything, this
         // means we had an indirect array read or write conflict that we
         // can resolve with an extra group that has a NOP instruction
         if (!alu_trans_ready.empty() || !alu_vec_ready.empty()) {
            group->add_vec_instructions(new AluInstr(op0_nop, 0));
            break;
         } else {
            return false;
         }
      }
   }

   sfn_log << SfnLog::schedule << "Finalize ALU group\n";
   group->set_scheduled();
   group->fix_last_flag();
   group->set_nesting_depth(m_current_block->nesting_depth());

   /* A group that uses an IDX register which is still pending from an
    * earlier group must go into a fresh CF block. */
   auto [addr, is_index] = group->addr();
   if (is_index) {
      if (addr->sel() == AddressRegister::idx0 && m_idx0_pending) {
         assert(!group->has_lds_group_start());
         assert(m_current_block->expected_ar_uses() == 0);
         start_new_block(out_blocks, Block::alu);
         m_current_block->try_reserve_kcache(*group);
      }
      if (addr->sel() == AddressRegister::idx1 && m_idx1_pending) {
         assert(!group->has_lds_group_start());
         assert(m_current_block->expected_ar_uses() == 0);
         start_new_block(out_blocks, Block::alu);
         m_current_block->try_reserve_kcache(*group);
      }
   }

   m_current_block->push_back(group);

   update_array_writes(*group);

   /* IDX loads become "pending" only once the loading group is emitted. */
   m_idx0_pending |= m_idx0_loading;
   m_idx0_loading = false;

   m_idx1_pending |= m_idx1_loading;
   m_idx1_loading = false;

   if (!m_current_block->lds_group_active() &&
       m_current_block->expected_ar_uses() == 0 &&
       (!addr || is_index)) {
      group->set_instr_flag(Instr::no_lds_or_addr_group);
   }

   /* Keep the block's LDS-group bookkeeping in sync with this group. */
   if (group->has_lds_group_start())
      m_current_block->lds_group_start(*group->begin());

   if (group->has_lds_group_end())
      m_current_block->lds_group_end();

   /* A kill op forces a new CF block after it — NOTE(review): presumably
    * a hardware requirement; confirm against the ISA documentation. */
   if (group->has_kill_op()) {
      assert(!group->has_lds_group_start());
      assert(m_current_block->expected_ar_uses() == 0);
      start_new_block(out_blocks, Block::alu);
   }

   return success;
}
695
696 bool
schedule_tex(Shader::ShaderBlocks & out_blocks)697 BlockScheduler::schedule_tex(Shader::ShaderBlocks& out_blocks)
698 {
699 if (m_current_block->type() != Block::tex || m_current_block->remaining_slots() == 0) {
700 start_new_block(out_blocks, Block::tex);
701 m_current_block->set_instr_flag(Instr::force_cf);
702 }
703
704 if (!tex_ready.empty() && m_current_block->remaining_slots() > 0) {
705 auto ii = tex_ready.begin();
706 sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
707
708 if ((unsigned)m_current_block->remaining_slots() < 1 + (*ii)->prepare_instr().size())
709 start_new_block(out_blocks, Block::tex);
710
711 for (auto prep : (*ii)->prepare_instr()) {
712 prep->set_scheduled();
713 m_current_block->push_back(prep);
714 }
715
716 (*ii)->set_scheduled();
717 m_current_block->push_back(*ii);
718 tex_ready.erase(ii);
719 return true;
720 }
721 return false;
722 }
723
724 bool
schedule_vtx(Shader::ShaderBlocks & out_blocks)725 BlockScheduler::schedule_vtx(Shader::ShaderBlocks& out_blocks)
726 {
727 if (m_current_block->type() != Block::vtx || m_current_block->remaining_slots() == 0) {
728 start_new_block(out_blocks, Block::vtx);
729 m_current_block->set_instr_flag(Instr::force_cf);
730 }
731 return schedule_block(fetches_ready);
732 }
733
734 template <typename I>
735 bool
schedule_gds(Shader::ShaderBlocks & out_blocks,std::list<I * > & ready_list)736 BlockScheduler::schedule_gds(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list)
737 {
738 bool was_full = m_current_block->remaining_slots() == 0;
739 if (m_current_block->type() != Block::gds || was_full) {
740 start_new_block(out_blocks, Block::gds);
741 if (was_full)
742 m_current_block->set_instr_flag(Instr::force_cf);
743 }
744 return schedule_block(ready_list);
745 }
746
747 void
start_new_block(Shader::ShaderBlocks & out_blocks,Block::Type type)748 BlockScheduler::start_new_block(Shader::ShaderBlocks& out_blocks, Block::Type type)
749 {
750 if (!m_current_block->empty()) {
751 sfn_log << SfnLog::schedule << "Start new block\n";
752 assert(!m_current_block->lds_group_active());
753
754 if (m_current_block->type() != Block::alu)
755 out_blocks.push_back(m_current_block);
756 else
757 maybe_split_alu_block(out_blocks);
758 m_current_block = new Block(m_current_block->nesting_depth(), m_next_block_id++);
759 m_current_block->set_instr_flag(Instr::force_cf);
760 m_idx0_pending = m_idx1_pending = false;
761
762 }
763 m_current_block->set_type(type, m_chip_class);
764 }
765
/* Split an ALU block that ran out of slots into several sub-blocks, each
 * starting at a group that is allowed to begin an ALU clause. */
void BlockScheduler::maybe_split_alu_block(Shader::ShaderBlocks& out_blocks)
{
   // TODO: needs fixing
   if (m_current_block->remaining_slots() > 0) {
      out_blocks.push_back(m_current_block);
      return;
   }

   /* First pass: mark split points. Whenever the accumulated slot count
    * would exceed the limit, flag the last viable block-start candidate
    * with force_cf. */
   int used_slots = 0;
   int pending_slots = 0;

   Instr *next_block_start = nullptr;
   for (auto cur_group : *m_current_block) {
      /* This limit is a bit fishy, it should be 128 */
      if (used_slots + pending_slots + cur_group->slots() < 128) {
         if (cur_group->can_start_alu_block()) {
            next_block_start = cur_group;
            used_slots += pending_slots;
            pending_slots = cur_group->slots();
         } else {
            pending_slots += cur_group->slots();
         }
      } else {
         assert(next_block_start);
         next_block_start->set_instr_flag(Instr::force_cf);
         /* NOTE(review): resetting used_slots to pending_slots here looks
          * inconsistent with the accumulation above — likely part of the
          * "needs fixing" TODO; confirm before relying on the limit. */
         used_slots = pending_slots;
         pending_slots = cur_group->slots();
      }
   }

   /* Second pass: emit sub-blocks, cutting before every group that now
    * carries the force-CF mark. */
   Block *sub_block = new Block(m_current_block->nesting_depth(),
                                m_next_block_id++);
   sub_block->set_type(Block::alu, m_chip_class);
   sub_block->set_instr_flag(Instr::force_cf);

   for (auto instr : *m_current_block) {
      auto group = instr->as_alu_group();
      if (!group) {
         sub_block->push_back(instr);
         continue;
      }

      if (group->group_force_alu_cf()) {
         /* LDS fetch+read pairs must not be torn across sub-blocks. */
         assert(!sub_block->lds_group_active());
         out_blocks.push_back(sub_block);
         sub_block = new Block(m_current_block->nesting_depth(),
                               m_next_block_id++);
         sub_block->set_type(Block::alu, m_chip_class);
         sub_block->set_instr_flag(Instr::force_cf);
      }
      sub_block->push_back(group);
      /* Keep LDS group bookkeeping consistent in the new sub-block. */
      if (group->has_lds_group_start())
         sub_block->lds_group_start(*group->begin());

      if (group->has_lds_group_end())
         sub_block->lds_group_end();

   }
   if (!sub_block->empty())
      out_blocks.push_back(sub_block);
}
827
828 template <typename I>
829 bool
schedule_cf(Shader::ShaderBlocks & out_blocks,std::list<I * > & ready_list)830 BlockScheduler::schedule_cf(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list)
831 {
832 if (ready_list.empty())
833 return false;
834 if (m_current_block->type() != Block::cf)
835 start_new_block(out_blocks, Block::cf);
836 return schedule(ready_list);
837 }
838
839 bool
schedule_alu_to_group_vec(AluGroup * group)840 BlockScheduler::schedule_alu_to_group_vec(AluGroup *group)
841 {
842 assert(group);
843 assert(!alu_vec_ready.empty());
844
845 bool success = false;
846 auto i = alu_vec_ready.begin();
847 auto e = alu_vec_ready.end();
848 while (i != e) {
849 sfn_log << SfnLog::schedule << "Try schedule to vec " << **i;
850
851 if (check_array_reads(**i)) {
852 ++i;
853 continue;
854 }
855
856 // precausion: don't kill while we hae LDS queue reads in the pipeline
857 if ((*i)->is_kill() && m_current_block->lds_group_active())
858 continue;
859
860 if (!m_current_block->try_reserve_kcache(**i)) {
861 sfn_log << SfnLog::schedule << " failed (kcache)\n";
862 ++i;
863 continue;
864 }
865
866 if (group->add_vec_instructions(*i)) {
867 auto old_i = i;
868 ++i;
869 if ((*old_i)->has_alu_flag(alu_is_lds)) {
870 --m_lds_addr_count;
871 }
872
873 if ((*old_i)->num_ar_uses())
874 m_current_block->set_expected_ar_uses((*old_i)->num_ar_uses());
875 auto addr = std::get<0>((*old_i)->indirect_addr());
876 bool has_indirect_reg_load = addr != nullptr && addr->has_flag(Register::addr_or_idx);
877
878 bool is_idx_load_on_eg = false;
879 if (!(*old_i)->has_alu_flag(alu_is_lds)) {
880 bool load_idx0_eg = (*old_i)->opcode() == op1_set_cf_idx0;
881 bool load_idx0_ca = ((*old_i)->opcode() == op1_mova_int &&
882 (*old_i)->dest()->sel() == AddressRegister::idx0);
883
884 bool load_idx1_eg = (*old_i)->opcode() == op1_set_cf_idx1;
885 bool load_idx1_ca = ((*old_i)->opcode() == op1_mova_int &&
886 (*old_i)->dest()->sel() == AddressRegister::idx1);
887
888 is_idx_load_on_eg = load_idx0_eg || load_idx1_eg;
889
890 bool load_idx0 = load_idx0_eg || load_idx0_ca;
891 bool load_idx1 = load_idx1_eg || load_idx1_ca;
892
893
894 assert(!m_idx0_pending || !load_idx0);
895 assert(!m_idx1_pending || !load_idx1);
896
897 m_idx0_loading |= load_idx0;
898 m_idx1_loading |= load_idx1;
899 }
900
901 if (has_indirect_reg_load || is_idx_load_on_eg)
902 m_current_block->dec_expected_ar_uses();
903
904 alu_vec_ready.erase(old_i);
905 success = true;
906 sfn_log << SfnLog::schedule << " success\n";
907 } else {
908 ++i;
909 sfn_log << SfnLog::schedule << " failed\n";
910 }
911 }
912 return success;
913 }
914
915 bool
schedule_alu_to_group_trans(AluGroup * group,std::list<AluInstr * > & readylist)916 BlockScheduler::schedule_alu_to_group_trans(AluGroup *group,
917 std::list<AluInstr *>& readylist)
918 {
919 assert(group);
920
921 bool success = false;
922 auto i = readylist.begin();
923 auto e = readylist.end();
924 while (i != e) {
925
926 if (check_array_reads(**i)) {
927 ++i;
928 continue;
929 }
930
931 sfn_log << SfnLog::schedule << "Try schedule to trans " << **i;
932 if (!m_current_block->try_reserve_kcache(**i)) {
933 sfn_log << SfnLog::schedule << " failed (kcache)\n";
934 ++i;
935 continue;
936 }
937
938 if (group->add_trans_instructions(*i)) {
939 auto old_i = i;
940 ++i;
941 auto addr = std::get<0>((*old_i)->indirect_addr());
942 if (addr && addr->has_flag(Register::addr_or_idx))
943 m_current_block->dec_expected_ar_uses();
944
945 readylist.erase(old_i);
946 success = true;
947 sfn_log << SfnLog::schedule << " success\n";
948 break;
949 } else {
950 ++i;
951 sfn_log << SfnLog::schedule << " failed\n";
952 }
953 }
954 return success;
955 }
956
957 template <typename I>
958 bool
schedule(std::list<I * > & ready_list)959 BlockScheduler::schedule(std::list<I *>& ready_list)
960 {
961 if (!ready_list.empty() && m_current_block->remaining_slots() > 0) {
962 auto ii = ready_list.begin();
963 sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
964 (*ii)->set_scheduled();
965 m_current_block->push_back(*ii);
966 ready_list.erase(ii);
967 return true;
968 }
969 return false;
970 }
971
972 template <typename I>
973 bool
schedule_block(std::list<I * > & ready_list)974 BlockScheduler::schedule_block(std::list<I *>& ready_list)
975 {
976 bool success = false;
977 while (!ready_list.empty() && m_current_block->remaining_slots() > 0) {
978 auto ii = ready_list.begin();
979 sfn_log << SfnLog::schedule << "Schedule: " << **ii << " "
980 << m_current_block->remaining_slots() << "\n";
981 (*ii)->set_scheduled();
982 m_current_block->push_back(*ii);
983 ready_list.erase(ii);
984 success = true;
985 }
986 return success;
987 }
988
989 bool
schedule_exports(Shader::ShaderBlocks & out_blocks,std::list<ExportInstr * > & ready_list)990 BlockScheduler::schedule_exports(Shader::ShaderBlocks& out_blocks,
991 std::list<ExportInstr *>& ready_list)
992 {
993 if (m_current_block->type() != Block::cf)
994 start_new_block(out_blocks, Block::cf);
995
996 if (!ready_list.empty()) {
997 auto ii = ready_list.begin();
998 sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
999 (*ii)->set_scheduled();
1000 m_current_block->push_back(*ii);
1001 switch ((*ii)->export_type()) {
1002 case ExportInstr::pos:
1003 m_last_pos = *ii;
1004 break;
1005 case ExportInstr::param:
1006 m_last_param = *ii;
1007 break;
1008 case ExportInstr::pixel:
1009 m_last_pixel = *ii;
1010 break;
1011 }
1012 (*ii)->set_is_last_export(false);
1013 ready_list.erase(ii);
1014 return true;
1015 }
1016 return false;
1017 }
1018
1019 bool
collect_ready(CollectInstructions & available)1020 BlockScheduler::collect_ready(CollectInstructions& available)
1021 {
1022 sfn_log << SfnLog::schedule << "Ready instructions\n";
1023 bool result = false;
1024 result |= collect_ready_alu_vec(alu_vec_ready, available.alu_vec);
1025 result |= collect_ready_type(alu_trans_ready, available.alu_trans);
1026 result |= collect_ready_type(alu_groups_ready, available.alu_groups);
1027 result |= collect_ready_type(gds_ready, available.gds_op);
1028 result |= collect_ready_type(tex_ready, available.tex);
1029 result |= collect_ready_type(fetches_ready, available.fetches);
1030 result |= collect_ready_type(memops_ready, available.mem_write_instr);
1031 result |= collect_ready_type(mem_ring_writes_ready, available.mem_ring_writes);
1032 result |= collect_ready_type(write_tf_ready, available.write_tf);
1033 result |= collect_ready_type(rat_instr_ready, available.rat_instr);
1034
1035 sfn_log << SfnLog::schedule << "\n";
1036 return result;
1037 }
1038
/* Move ready vector-slot ALU instructions from @available to @ready,
 * assign scheduling priorities, and keep @ready sorted by descending
 * priority. Both the number of instructions inspected and the size of
 * the ready list are capped at 64 to bound compile time.
 * Returns true if @ready is non-empty afterwards. */
bool
BlockScheduler::collect_ready_alu_vec(std::list<AluInstr *>& ready,
                                      std::list<AluInstr *>& available)
{
   auto i = available.begin();
   auto e = available.end();

   /* Re-boost instructions already on the ready list so their priority
    * stays comparable with newly added instructions, which get the same
    * register-priority bonus below. */
   for (auto alu : ready) {
      alu->add_priority(100 * alu->register_priority());
   }

   int max_check = 0;
   while (i != e && max_check++ < 64) {
      if (ready.size() < 64 && (*i)->ready()) {

         int priority = 0;
         /* LDS fetches that use static offsets are usually ready very fast,
          * so that they would get scheduled early, and this leaves the
          * problem that we allocate too many registers with just constant
          * values, and this will make problems with RA. So limit the number of
          * LDS address registers.
          */
         if ((*i)->has_alu_flag(alu_lds_address)) {
            if (m_lds_addr_count > 64) {
               ++i;
               continue;
            } else {
               ++m_lds_addr_count;
            }
         }

         /* LDS instructions are scheduled with high priority.
          * Instructions that can go into the t slot and don't have
          * indirect access are put in last, so that they don't block
          * vec-only instructions when scheduling to the vector slots.
          * For everything else we look at the register use. */

         auto [addr, dummy1, dummy2] = (*i)->indirect_addr();

         if ((*i)->has_lds_access()) {
            priority = 100000;
            if ((*i)->has_alu_flag(alu_is_lds))
               priority += 100000;
         } else if (addr) {
            priority = 10000;
         } else if (AluGroup::has_t()) {
            auto opinfo = alu_ops.find((*i)->opcode());
            assert(opinfo != alu_ops.end());
            if (opinfo->second.can_channel(AluOp::t, m_chip_class))
               priority = -1;
         }

         priority += 100 * (*i)->register_priority();

         (*i)->add_priority(priority);
         ready.push_back(*i);

         /* Erase via a saved iterator so the loop iterator stays valid. */
         auto old_i = i;
         ++i;
         available.erase(old_i);
      } else
         ++i;
   }

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << "V: " << *i << "\n";

   /* Highest priority first. */
   ready.sort([](const AluInstr *lhs, const AluInstr *rhs) {
      return lhs->priority() > rhs->priority();
   });

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << "V (S): " << i->priority() << " " << *i << "\n";

   return !ready.empty();
}
1115
1116 template <typename T> struct type_char {
1117 };
1118
1119 template <> struct type_char<AluInstr> {
valuer600::type_char1120 static char value() { return 'A';};
1121 };
1122
1123 template <> struct type_char<AluGroup> {
valuer600::type_char1124 static char value() { return 'G';};
1125 };
1126
1127 template <> struct type_char<ExportInstr> {
valuer600::type_char1128 static char value() { return 'E';};
1129 };
1130
1131 template <> struct type_char<TexInstr> {
valuer600::type_char1132 static char value() { return 'T';};
1133 };
1134
1135 template <> struct type_char<FetchInstr> {
valuer600::type_char1136 static char value() { return 'F';};
1137 };
1138
1139 template <> struct type_char<WriteOutInstr> {
valuer600::type_char1140 static char value() { return 'M';};
1141 };
1142
1143 template <> struct type_char<MemRingOutInstr> {
valuer600::type_char1144 static char value() { return 'R';};
1145 };
1146
1147 template <> struct type_char<WriteTFInstr> {
valuer600::type_char1148 static char value() { return 'X';};
1149 };
1150
1151 template <> struct type_char<GDSInstr> {
valuer600::type_char1152 static char value() { return 'S';};
1153 };
1154
1155 template <> struct type_char<RatInstr> {
valuer600::type_char1156 static char value() { return 'I';};
1157 };
1158
1159 template <typename T>
1160 bool
collect_ready_type(std::list<T * > & ready,std::list<T * > & available)1161 BlockScheduler::collect_ready_type(std::list<T *>& ready, std::list<T *>& available)
1162 {
1163 auto i = available.begin();
1164 auto e = available.end();
1165
1166 int lookahead = 16;
1167 while (i != e && ready.size() < 16 && lookahead-- > 0) {
1168 if ((*i)->ready()) {
1169 ready.push_back(*i);
1170 auto old_i = i;
1171 ++i;
1172 available.erase(old_i);
1173 } else
1174 ++i;
1175 }
1176
1177 for (auto& i : ready)
1178 sfn_log << SfnLog::schedule << type_char<T>::value() << "; " << *i << "\n";
1179
1180 return !ready.empty();
1181 }
1182
/* Base class for register visitors that only care about local-array
 * accesses: every non-array register type is swallowed by an empty
 * override here, so subclasses only implement
 * visit(const LocalArrayValue&). */
class CheckArrayAccessVisitor : public ConstRegisterVisitor {
public:
   using ConstRegisterVisitor::visit;
   void visit(const Register& value) override {(void)value;}
   void visit(const LocalArray& value) override {(void)value;}
   void visit(const UniformValue& value) override {(void)value;}
   void visit(const LiteralConstant& value) override {(void)value;}
   void visit(const InlineConstant& value) override {(void)value;}
};
1192
1193 class UpdateArrayWrite : public CheckArrayAccessVisitor {
1194 public:
UpdateArrayWrite(ArrayCheckSet & indirect_arrays,ArrayCheckSet & direct_arrays,bool tdw)1195 UpdateArrayWrite(ArrayCheckSet& indirect_arrays,
1196 ArrayCheckSet& direct_arrays,
1197 bool tdw):
1198 last_indirect_array_write(indirect_arrays),
1199 last_direct_array_write(direct_arrays),
1200 track_direct_writes(tdw)
1201 {
1202 }
1203
visit(const LocalArrayValue & value)1204 void visit(const LocalArrayValue& value) override {
1205 int array_base = value.array().base_sel();
1206 auto entry = std::make_pair(array_base, value.chan());
1207 if (value.addr())
1208 last_indirect_array_write.insert(entry);
1209 else if (track_direct_writes)
1210 last_direct_array_write.insert(entry);
1211 }
1212 private:
1213 ArrayCheckSet& last_indirect_array_write;
1214 ArrayCheckSet& last_direct_array_write;
1215 bool track_direct_writes {false};
1216 };
1217
1218
update_array_writes(const AluGroup & group)1219 void BlockScheduler::update_array_writes(const AluGroup& group)
1220 {
1221 if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
1222 m_last_direct_array_write.clear();
1223 m_last_indirect_array_write.clear();
1224
1225 UpdateArrayWrite visitor(m_last_indirect_array_write,
1226 m_last_direct_array_write,
1227 m_nop_befor_rel_src);
1228
1229 for (auto alu : group) {
1230 if (alu && alu->dest())
1231 alu->dest()->accept(visitor);
1232 }
1233 }
1234 }
1235
1236 class CheckArrayRead : public CheckArrayAccessVisitor {
1237 public:
CheckArrayRead(const ArrayCheckSet & indirect_arrays,const ArrayCheckSet & direct_arrays)1238 CheckArrayRead(const ArrayCheckSet& indirect_arrays,
1239 const ArrayCheckSet& direct_arrays):
1240 last_indirect_array_write(indirect_arrays),
1241 last_direct_array_write(direct_arrays)
1242 {
1243 }
1244
visit(const LocalArrayValue & value)1245 void visit(const LocalArrayValue& value) override {
1246 int array_base = value.array().base_sel();
1247 auto entry = std::make_pair(array_base, value.chan());
1248
1249 if (last_indirect_array_write.find(entry) !=
1250 last_indirect_array_write.end())
1251 need_extra_group = true;
1252
1253 if (value.addr() && last_direct_array_write.find(entry) !=
1254 last_direct_array_write.end()) {
1255 need_extra_group = true;
1256 }
1257 }
1258
1259 const ArrayCheckSet& last_indirect_array_write;
1260 const ArrayCheckSet& last_direct_array_write;
1261 bool need_extra_group {false};
1262 };
1263
1264
check_array_reads(const AluInstr & instr)1265 bool BlockScheduler::check_array_reads(const AluInstr& instr)
1266 {
1267 if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
1268
1269 CheckArrayRead visitor(m_last_indirect_array_write,
1270 m_last_direct_array_write);
1271
1272 for (auto& s : instr.sources()) {
1273 s->accept(visitor);
1274 }
1275 return visitor.need_extra_group;
1276 }
1277 return false;
1278 }
1279
check_array_reads(const AluGroup & group)1280 bool BlockScheduler::check_array_reads(const AluGroup& group)
1281 {
1282 if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
1283
1284 CheckArrayRead visitor(m_last_indirect_array_write,
1285 m_last_direct_array_write);
1286
1287 for (auto alu : group) {
1288 if (!alu)
1289 continue;
1290 for (auto& s : alu->sources()) {
1291 s->accept(visitor);
1292 }
1293 }
1294 return visitor.need_extra_group;
1295 }
1296 return false;
1297 }
1298
1299
1300 } // namespace r600
1301