/* -*- mesa-c++  -*-
 * Copyright 2022 Collabora LTD
 * Author: Gert Wollny <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#include "sfn_scheduler.h"

#include "../r600_isa.h"

#include "amd_family.h"
#include "sfn_alu_defines.h"
#include "sfn_debug.h"
#include "sfn_instr_alugroup.h"
#include "sfn_instr_controlflow.h"
#include "sfn_instr_export.h"
#include "sfn_instr_fetch.h"
#include "sfn_instr_lds.h"
#include "sfn_instr_mem.h"
#include "sfn_instr_tex.h"

#include <algorithm>
#include <sstream>

namespace r600 {

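/* Visitor that collects the instructions of one block into per-type work
 * lists (ALU vec/trans, ALU groups, TEX, fetches, exports, memory, GDS and
 * RAT ops) from which the block scheduler picks ready instructions.
 * LDS read/atomic instructions are split into their ALU components here.
 */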
class CollectInstructions : public InstrVisitor {

public:
   CollectInstructions(ValueFactory& vf):
       m_value_factory(vf)
   {
   }

   void visit(AluInstr *instr) override
   {
      if (instr->has_alu_flag(alu_is_trans))
         alu_trans.push_back(instr);
      else {
         if (instr->alu_slots() == 1)
            alu_vec.push_back(instr);
         else
            alu_groups.push_back(instr->split(m_value_factory));
      }
   }
   void visit(AluGroup *instr) override { alu_groups.push_back(instr); }
   void visit(TexInstr *instr) override { tex.push_back(instr); }
   void visit(ExportInstr *instr) override { exports.push_back(instr); }
   void visit(FetchInstr *instr) override { fetches.push_back(instr); }
   void visit(Block *instr) override
   {
      for (auto& i : *instr)
         i->accept(*this);
   }

   void visit(ControlFlowInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(IfInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(EmitVertexInstr *instr) override
   {
      assert(!m_cf_instr);
      m_cf_instr = instr;
   }

   void visit(ScratchIOInstr *instr) override { mem_write_instr.push_back(instr); }

   void visit(StreamOutInstr *instr) override { mem_write_instr.push_back(instr); }

   void visit(MemRingOutInstr *instr) override { mem_ring_writes.push_back(instr); }

   void visit(GDSInstr *instr) override { gds_op.push_back(instr); }

   void visit(WriteTFInstr *instr) override { write_tf.push_back(instr); }

   void visit(LDSReadInstr *instr) override
   {
      std::vector<AluInstr *> buffer;
      m_last_lds_instr = instr->split(buffer, m_last_lds_instr);
      for (auto& i : buffer) {
         i->accept(*this);
      }
   }

   void visit(LDSAtomicInstr *instr) override
   {
      std::vector<AluInstr *> buffer;
      m_last_lds_instr = instr->split(buffer, m_last_lds_instr);
      for (auto& i : buffer) {
         i->accept(*this);
      }
   }

   void visit(RatInstr *instr) override { rat_instr.push_back(instr); }

   std::list<AluInstr *> alu_trans;
   std::list<AluInstr *> alu_vec;
   std::list<TexInstr *> tex;
   std::list<AluGroup *> alu_groups;
   std::list<ExportInstr *> exports;
   std::list<FetchInstr *> fetches;
   std::list<WriteOutInstr *> mem_write_instr;
   std::list<MemRingOutInstr *> mem_ring_writes;
   std::list<GDSInstr *> gds_op;
   std::list<WriteTFInstr *> write_tf;
   std::list<RatInstr *> rat_instr;

   Instr *m_cf_instr{nullptr};
   ValueFactory& m_value_factory;

   AluInstr *m_last_lds_instr{nullptr};
};

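/* Hash functor for (array base sel, channel) pairs; used by the sets that
 * track which array elements were written by the previously scheduled
 * ALU group.
 */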
struct ArrayChanHash
{
    std::size_t operator()(std::pair<int, int> const& s) const noexcept
    {
       return std::hash<size_t>{}((size_t(s.first) << 3) | s.second);
    }
};

using ArrayCheckSet = std::unordered_set<std::pair<int, int>, ArrayChanHash>;

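/* Schedules the instructions of one block at a time: instructions are
 * pulled from the CollectInstructions work lists once their dependencies
 * are ready and are emitted into new blocks of a single type (ALU, TEX,
 * VTX, GDS, CF), starting a new block whenever the type changes or a
 * hardware limit (slots, kcache banks, AR/IDX loads) forces a split.
 */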
class BlockScheduler {
public:
   BlockScheduler(r600_chip_class chip_class,
                  radeon_family family);

   void run(Shader *shader);

   void finalize();

private:
   void
   schedule_block(Block& in_block, Shader::ShaderBlocks& out_blocks, ValueFactory& vf);

   bool collect_ready(CollectInstructions& available);

   template <typename T>
   bool collect_ready_type(std::list<T *>& ready, std::list<T *>& orig);

   bool collect_ready_alu_vec(std::list<AluInstr *>& ready,
                              std::list<AluInstr *>& available);

   bool schedule_tex(Shader::ShaderBlocks& out_blocks);
   bool schedule_vtx(Shader::ShaderBlocks& out_blocks);

   template <typename I>
   bool schedule_gds(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list);

   template <typename I>
   bool schedule_cf(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list);

   bool schedule_alu(Shader::ShaderBlocks& out_blocks);
   void start_new_block(Shader::ShaderBlocks& out_blocks, Block::Type type);

   bool schedule_alu_to_group_vec(AluGroup *group);
   bool schedule_alu_to_group_trans(AluGroup *group, std::list<AluInstr *>& readylist);

   bool schedule_exports(Shader::ShaderBlocks& out_blocks,
                         std::list<ExportInstr *>& ready_list);

   void maybe_split_alu_block(Shader::ShaderBlocks& out_blocks);

   template <typename I> bool schedule(std::list<I *>& ready_list);

   template <typename I> bool schedule_block(std::list<I *>& ready_list);

   void update_array_writes(const AluGroup& group);
   bool check_array_reads(const AluInstr& instr);
   bool check_array_reads(const AluGroup& group);

   std::list<AluInstr *> alu_vec_ready;
   std::list<AluInstr *> alu_trans_ready;
   std::list<AluGroup *> alu_groups_ready;
   std::list<TexInstr *> tex_ready;
   std::list<ExportInstr *> exports_ready;
   std::list<FetchInstr *> fetches_ready;
   std::list<WriteOutInstr *> memops_ready;
   std::list<MemRingOutInstr *> mem_ring_writes_ready;
   std::list<GDSInstr *> gds_ready;
   std::list<WriteTFInstr *> write_tf_ready;
   std::list<RatInstr *> rat_instr_ready;

   enum {
      sched_alu,
      sched_tex,
      sched_fetch,
      sched_free,
      sched_mem_ring,
      sched_gds,
      sched_write_tf,
      sched_rat,
   } current_shed;

   ExportInstr *m_last_pos;
   ExportInstr *m_last_pixel;
   ExportInstr *m_last_param;

   Block *m_current_block;

   int m_lds_addr_count{0};
   int m_alu_groups_scheduled{0};
   r600_chip_class m_chip_class;
   radeon_family m_chip_family;
   bool m_idx0_loading{false};
   bool m_idx1_loading{false};
   bool m_idx0_pending{false};
   bool m_idx1_pending{false};

   bool m_nop_after_rel_dest{false};
   bool m_nop_befor_rel_src{false};
   uint32_t m_next_block_id{1};

   ArrayCheckSet m_last_indirect_array_write;
   ArrayCheckSet m_last_direct_array_write;
};

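/* Entry point: runs the block scheduler over a shader and returns the
 * (currently in-place) rescheduled shader.
 */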
Shader *
schedule(Shader *original)
{
   Block::set_chipclass(original->chip_class());
   AluGroup::set_chipclass(original->chip_class());

   sfn_log << SfnLog::schedule << "Original shader\n";
   if (sfn_log.has_debug_flag(SfnLog::schedule)) {
      std::stringstream ss;
      original->print(ss);
      sfn_log << ss.str() << "\n\n";
   }

   // TODO later it might be necessary to clone the shader
   // to be able to re-start scheduling

   auto scheduled_shader = original;

   BlockScheduler s(original->chip_class(), original->chip_family());

   s.run(scheduled_shader);
   s.finalize();

   sfn_log << SfnLog::schedule << "Scheduled shader\n";
   if (sfn_log.has_debug_flag(SfnLog::schedule)) {
      std::stringstream ss;
      scheduled_shader->print(ss);
      sfn_log << ss.str() << "\n\n";
   }

   return scheduled_shader;
}

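/* The two NOP flags encode per-chip workarounds for relative (indirect)
 * register array access: RV770 needs a NOP group after a relative
 * destination write, and R600-class chips other than RV670/RS780/RS880
 * need one before a relative source read.
 */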
BlockScheduler::BlockScheduler(r600_chip_class chip_class,
                               radeon_family chip_family):
    current_shed(sched_alu),
    m_last_pos(nullptr),
    m_last_pixel(nullptr),
    m_last_param(nullptr),
    m_current_block(nullptr),
    m_chip_class(chip_class),
    m_chip_family(chip_family)
{
   m_nop_after_rel_dest = chip_family == CHIP_RV770;

   m_nop_befor_rel_src = m_chip_class == ISA_CC_R600 &&
                         chip_family != CHIP_RV670 &&
                         chip_family != CHIP_RS780 &&
                         chip_family != CHIP_RS880;
}

void
BlockScheduler::run(Shader *shader)
{
   Shader::ShaderBlocks scheduled_blocks;

   for (auto& block : shader->func()) {
      sfn_log << SfnLog::schedule << "Process block " << block->id() << "\n";
      if (sfn_log.has_debug_flag(SfnLog::schedule)) {
         std::stringstream ss;
         block->print(ss);
         sfn_log << ss.str() << "\n";
      }
      schedule_block(*block, scheduled_blocks, shader->value_factory());
   }

   shader->reset_function(scheduled_blocks);
}

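/* Core per-block loop: collect ready instructions, pick a scheduling
 * target type (preferring ALU, but switching to TEX/fetch/GDS/mem/RAT
 * when their ready queues grow or the ALU queue stalls), and emit
 * instructions until nothing is ready anymore.  Exports are always
 * emitted at the end of the block.
 */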
void
BlockScheduler::schedule_block(Block& in_block,
                               Shader::ShaderBlocks& out_blocks,
                               ValueFactory& vf)
{

   assert(in_block.id() >= 0);

   current_shed = sched_fetch;
   auto last_shed = sched_fetch;

   CollectInstructions cir(vf);
   in_block.accept(cir);

   bool have_instr = collect_ready(cir);

   m_current_block = new Block(in_block.nesting_depth(), m_next_block_id++);
   m_current_block->set_instr_flag(Instr::force_cf);
   assert(m_current_block->id() >= 0);

   while (have_instr) {

      sfn_log << SfnLog::schedule << "Have ready instructions\n";

      if (alu_vec_ready.size())
         sfn_log << SfnLog::schedule << "  ALU V:" << alu_vec_ready.size() << "\n";

      if (alu_trans_ready.size())
         sfn_log << SfnLog::schedule << "  ALU T:" << alu_trans_ready.size() << "\n";

      if (alu_groups_ready.size())
         sfn_log << SfnLog::schedule << "  ALU G:" << alu_groups_ready.size() << "\n";

      if (exports_ready.size())
         sfn_log << SfnLog::schedule << "  EXP:" << exports_ready.size() << "\n";
      if (tex_ready.size())
         sfn_log << SfnLog::schedule << "  TEX:" << tex_ready.size() << "\n";
      if (fetches_ready.size())
         sfn_log << SfnLog::schedule << "  FETCH:" << fetches_ready.size() << "\n";
      if (mem_ring_writes_ready.size())
         sfn_log << SfnLog::schedule << "  MEM_RING:" << mem_ring_writes_ready.size()
                 << "\n";
      if (memops_ready.size())
         sfn_log << SfnLog::schedule << "  MEM_OPS:" << memops_ready.size()
                 << "\n";

      if (!m_current_block->lds_group_active() &&
          m_current_block->expected_ar_uses() == 0) {
         if (last_shed != sched_free && memops_ready.size() > 8)
            current_shed = sched_free;
         else if (mem_ring_writes_ready.size() > 15)
            current_shed = sched_mem_ring;
         else if (rat_instr_ready.size() > 3)
            current_shed = sched_rat;
         else if (tex_ready.size() > (m_chip_class >= ISA_CC_EVERGREEN ? 15 : 7))
            current_shed = sched_tex;
      }

      switch (current_shed) {
      case sched_alu:
         if (!schedule_alu(out_blocks)) {
            assert(!m_current_block->lds_group_active());
            current_shed = sched_tex;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_tex:
         if (tex_ready.empty() || !schedule_tex(out_blocks)) {
            current_shed = sched_fetch;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_fetch:
         if (!fetches_ready.empty()) {
            schedule_vtx(out_blocks);
            last_shed = current_shed;
         }
         current_shed = sched_gds;
         continue;
      case sched_gds:
         if (!gds_ready.empty()) {
            schedule_gds(out_blocks, gds_ready);
            last_shed = current_shed;
         }
         current_shed = sched_mem_ring;
         continue;
      case sched_mem_ring:
         if (mem_ring_writes_ready.empty() ||
             !schedule_cf(out_blocks, mem_ring_writes_ready)) {
            current_shed = sched_write_tf;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_write_tf:
         if (write_tf_ready.empty() || !schedule_gds(out_blocks, write_tf_ready)) {
            current_shed = sched_rat;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_rat:
         if (rat_instr_ready.empty() || !schedule_cf(out_blocks, rat_instr_ready)) {
            current_shed = sched_free;
            continue;
         }
         last_shed = current_shed;
         break;
      case sched_free:
         if (memops_ready.empty() || !schedule_cf(out_blocks, memops_ready)) {
            current_shed = sched_alu;
            break;
         }
         last_shed = current_shed;
      }

      have_instr = collect_ready(cir);
   }

   /* Always emit exports at the end of the block */
   while (collect_ready_type(exports_ready, cir.exports))
      schedule_exports(out_blocks, exports_ready);

   ASSERTED bool fail = false;

   if (!cir.alu_groups.empty()) {
      std::cerr << "Unscheduled ALU groups:\n";
      for (auto& a : cir.alu_groups) {
         std::cerr << "   " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.alu_vec.empty()) {
      std::cerr << "Unscheduled ALU vec ops:\n";
      for (auto& a : cir.alu_vec) {
         std::cerr << "   [" << a->block_id() << ":"
                   << a->index() <<"]:" << *a << "\n";
         for (auto& d : a->required_instr())
            std::cerr << "      R["<< d->block_id() << ":" << d->index() <<"]:"
                      << *d << "\n";
      }
      fail = true;
   }

   if (!cir.alu_trans.empty()) {
      std::cerr << "Unscheduled ALU trans ops:\n";
      for (auto& a : cir.alu_trans) {
         std::cerr << "   " << "   [" << a->block_id() << ":"
                   << a->index() <<"]:" << *a << "\n";
         for (auto& d : a->required_instr())
            std::cerr << "      R:" << *d << "\n";
      }
      fail = true;
   }
   if (!cir.mem_write_instr.empty()) {
      std::cerr << "Unscheduled MEM ops:\n";
      for (auto& a : cir.mem_write_instr) {
         std::cerr << "   " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.fetches.empty()) {
      std::cerr << "Unscheduled Fetch ops:\n";
      for (auto& a : cir.fetches) {
         std::cerr << "   " << *a << "\n";
      }
      fail = true;
   }

   if (!cir.tex.empty()) {
      std::cerr << "Unscheduled Tex ops:\n";
      for (auto& a : cir.tex) {
         std::cerr << "   " << *a << "\n";
      }
      fail = true;
   }

   if (fail) {
      std::cerr << "Failing block:\n";
      for (auto& i : in_block)
         std::cerr << "[" << i->block_id() << ":" << i->index() << "] "
                   << (i->is_scheduled() ? "S " : "")
                   << *i << "\n";
      std::cerr << "\nSo far scheduled: ";

      for (auto i : *m_current_block)
         std::cerr << "[" << i->block_id() << ":" << i->index() << "] " << *i << "\n";
      std::cerr << "\n\n: ";
   }

   assert(cir.tex.empty());
   assert(cir.exports.empty());
   assert(cir.fetches.empty());
   assert(cir.alu_vec.empty());
   assert(cir.mem_write_instr.empty());
   assert(cir.mem_ring_writes.empty());

   assert(!fail);

   if (cir.m_cf_instr) {
      // Assert that the if condition is ready
      if (m_current_block->type() != Block::alu) {
         start_new_block(out_blocks, Block::alu);
      }
      m_current_block->push_back(cir.m_cf_instr);
      cir.m_cf_instr->set_scheduled();
   }

   if (m_current_block->type() == Block::alu)
      maybe_split_alu_block(out_blocks);
   else
      out_blocks.push_back(m_current_block);
}

void
BlockScheduler::finalize()
{
   if (m_last_pos)
      m_last_pos->set_is_last_export(true);
   if (m_last_pixel)
      m_last_pixel->set_is_last_export(true);
   if (m_last_param)
      m_last_param->set_is_last_export(true);
}

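/* Try to emit one ALU group: either take an already formed group from the
 * ready list or build a new one by packing ready vec/trans instructions
 * into the free slots, honoring kcache reservations, LDS groups, pending
 * AR/IDX register loads and array access hazards.  Starts a new ALU block
 * when a constraint cannot be met in the current one.
 */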
bool
BlockScheduler::schedule_alu(Shader::ShaderBlocks& out_blocks)
{
   bool success = false;
   AluGroup *group = nullptr;

   sfn_log << SfnLog::schedule << "Schedule alu with " <<
              m_current_block->expected_ar_uses()
           << " pending AR loads\n";

   bool has_alu_ready = !alu_vec_ready.empty() || !alu_trans_ready.empty();

   bool has_lds_ready =
      !alu_vec_ready.empty() && (*alu_vec_ready.begin())->has_lds_access();

   bool has_ar_read_ready = !alu_vec_ready.empty() &&
                            std::get<0>((*alu_vec_ready.begin())->indirect_addr());

   /* If we have ready ALU instructions we have to start a new ALU block */
   if (has_alu_ready || !alu_groups_ready.empty()) {
      if (m_current_block->type() != Block::alu) {
         start_new_block(out_blocks, Block::alu);
         m_alu_groups_scheduled = 0;
      }
   }

   /* Schedule groups first, unless we have a pending LDS instruction.
    * We don't want the LDS instructions to be too far apart, because the
    * fetch and the read from the queue have to be in the same ALU CF block. */
   if (!alu_groups_ready.empty() && !has_lds_ready && !has_ar_read_ready) {
      group = *alu_groups_ready.begin();

      if (!check_array_reads(*group)) {

         sfn_log << SfnLog::schedule << "try schedule " <<
                    *group << "\n";

         /* Only start a new CF if we have no pending AR reads */
         if (m_current_block->try_reserve_kcache(*group)) {
            alu_groups_ready.erase(alu_groups_ready.begin());
            success = true;
         } else {
            if (m_current_block->expected_ar_uses() == 0) {
               start_new_block(out_blocks, Block::alu);

               if (!m_current_block->try_reserve_kcache(*group))
                  unreachable("Scheduling a group in a new block should always succeed");
               alu_groups_ready.erase(alu_groups_ready.begin());
               sfn_log << SfnLog::schedule << "Schedule ALU group\n";
               success = true;
            } else {
               sfn_log << SfnLog::schedule << "Don't add group because of " <<
                          m_current_block->expected_ar_uses()
                       << " pending AR loads\n";
               group = nullptr;
            }
         }
      }
   }

   if (!group && has_alu_ready) {
      group = new AluGroup();
      sfn_log << SfnLog::schedule << "START new ALU group\n";
   } else if (!success) {
      return false;
   }

   assert(group);

   int free_slots = group->free_slots();

   while (free_slots && has_alu_ready) {
      if (!alu_vec_ready.empty())
         success |= schedule_alu_to_group_vec(group);

      /* Apparently one can't schedule a t-slot if there is already
       * an LDS instruction scheduled.
       * TODO: check whether this is only relevant for actual LDS instructions
       * or also for instructions that read from the LDS return value queue */

      if (free_slots & 0x10 && !has_lds_ready) {
         sfn_log << SfnLog::schedule << "Try schedule TRANS channel\n";
         if (!alu_trans_ready.empty())
            success |= schedule_alu_to_group_trans(group, alu_trans_ready);
         if (!alu_vec_ready.empty())
            success |= schedule_alu_to_group_trans(group, alu_vec_ready);
      }

      if (success) {
         ++m_alu_groups_scheduled;
         break;
      } else if (m_current_block->kcache_reservation_failed()) {
         // LDS read groups should not lead to impossible
         // kcache constellations
         assert(!m_current_block->lds_group_active());

         // AR is loaded but not all uses are done, we don't want
         // to start a new CF here
         assert(m_current_block->expected_ar_uses() == 0);

         // kcache reservation failed, so we have to start a new CF
         start_new_block(out_blocks, Block::alu);
      } else {
         // Ready is not empty, but we didn't schedule anything; this
         // means we had an indirect array read or write conflict that we
         // can resolve with an extra group that has a NOP instruction
         if (!alu_trans_ready.empty() || !alu_vec_ready.empty()) {
            group->add_vec_instructions(new AluInstr(op0_nop, 0));
            break;
         } else {
            return false;
         }
      }
   }

   sfn_log << SfnLog::schedule << "Finalize ALU group\n";
   group->set_scheduled();
   group->fix_last_flag();
   group->set_nesting_depth(m_current_block->nesting_depth());

   auto [addr, is_index] = group->addr();
   if (is_index) {
      if (addr->sel() == AddressRegister::idx0 && m_idx0_pending) {
         assert(!group->has_lds_group_start());
         assert(m_current_block->expected_ar_uses() == 0);
         start_new_block(out_blocks, Block::alu);
         m_current_block->try_reserve_kcache(*group);
      }
      if (addr->sel() == AddressRegister::idx1 && m_idx1_pending) {
         assert(!group->has_lds_group_start());
         assert(m_current_block->expected_ar_uses() == 0);
         start_new_block(out_blocks, Block::alu);
         m_current_block->try_reserve_kcache(*group);
      }
   }

   m_current_block->push_back(group);

   update_array_writes(*group);

   m_idx0_pending |= m_idx0_loading;
   m_idx0_loading = false;

   m_idx1_pending |= m_idx1_loading;
   m_idx1_loading = false;

   if (!m_current_block->lds_group_active() &&
       m_current_block->expected_ar_uses() == 0 &&
       (!addr || is_index)) {
      group->set_instr_flag(Instr::no_lds_or_addr_group);
   }

   if (group->has_lds_group_start())
      m_current_block->lds_group_start(*group->begin());

   if (group->has_lds_group_end())
      m_current_block->lds_group_end();

   if (group->has_kill_op()) {
      assert(!group->has_lds_group_start());
      assert(m_current_block->expected_ar_uses() == 0);
      start_new_block(out_blocks, Block::alu);
   }

   return success;
}

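/* Emit one TEX instruction (plus its prepare instructions) into a TEX
 * block, opening a new block when the block type changes or the clause
 * runs out of slots.
 */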
bool
BlockScheduler::schedule_tex(Shader::ShaderBlocks& out_blocks)
{
   if (m_current_block->type() != Block::tex || m_current_block->remaining_slots() == 0) {
      start_new_block(out_blocks, Block::tex);
      m_current_block->set_instr_flag(Instr::force_cf);
   }

   if (!tex_ready.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = tex_ready.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";

      if ((unsigned)m_current_block->remaining_slots() < 1 + (*ii)->prepare_instr().size())
         start_new_block(out_blocks, Block::tex);

      for (auto prep : (*ii)->prepare_instr()) {
         prep->set_scheduled();
         m_current_block->push_back(prep);
      }

      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      tex_ready.erase(ii);
      return true;
   }
   return false;
}

bool
BlockScheduler::schedule_vtx(Shader::ShaderBlocks& out_blocks)
{
   if (m_current_block->type() != Block::vtx || m_current_block->remaining_slots() == 0) {
      start_new_block(out_blocks, Block::vtx);
      m_current_block->set_instr_flag(Instr::force_cf);
   }
   return schedule_block(fetches_ready);
}

template <typename I>
bool
BlockScheduler::schedule_gds(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list)
{
   bool was_full = m_current_block->remaining_slots() == 0;
   if (m_current_block->type() != Block::gds || was_full) {
      start_new_block(out_blocks, Block::gds);
      if (was_full)
         m_current_block->set_instr_flag(Instr::force_cf);
   }
   return schedule_block(ready_list);
}

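/* Close the current block (splitting an oversized ALU block if needed)
 * and open a fresh one of the requested type; pending IDX loads do not
 * survive across the block boundary.
 */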
void
BlockScheduler::start_new_block(Shader::ShaderBlocks& out_blocks, Block::Type type)
{
   if (!m_current_block->empty()) {
      sfn_log << SfnLog::schedule << "Start new block\n";
      assert(!m_current_block->lds_group_active());

      if (m_current_block->type() != Block::alu)
         out_blocks.push_back(m_current_block);
      else
         maybe_split_alu_block(out_blocks);
      m_current_block = new Block(m_current_block->nesting_depth(), m_next_block_id++);
      m_current_block->set_instr_flag(Instr::force_cf);
      m_idx0_pending = m_idx1_pending = false;

   }
   m_current_block->set_type(type, m_chip_class);
}

void BlockScheduler::maybe_split_alu_block(Shader::ShaderBlocks& out_blocks)
{
   // TODO: needs fixing
   if (m_current_block->remaining_slots() > 0) {
      out_blocks.push_back(m_current_block);
      return;
   }

   int used_slots = 0;
   int pending_slots = 0;

   Instr *next_block_start = nullptr;
   for (auto cur_group : *m_current_block) {
      /* This limit is a bit fishy, it should be 128 */
      if (used_slots + pending_slots + cur_group->slots() < 128) {
         if (cur_group->can_start_alu_block()) {
            next_block_start = cur_group;
            used_slots += pending_slots;
            pending_slots = cur_group->slots();
         } else {
            pending_slots += cur_group->slots();
         }
      } else {
         assert(next_block_start);
         next_block_start->set_instr_flag(Instr::force_cf);
         used_slots = pending_slots;
         pending_slots = cur_group->slots();
      }
   }

   Block *sub_block = new Block(m_current_block->nesting_depth(),
                                m_next_block_id++);
   sub_block->set_type(Block::alu, m_chip_class);
   sub_block->set_instr_flag(Instr::force_cf);

   for (auto instr : *m_current_block) {
      auto group = instr->as_alu_group();
      if (!group) {
         sub_block->push_back(instr);
         continue;
      }

      if (group->group_force_alu_cf()) {
         assert(!sub_block->lds_group_active());
         out_blocks.push_back(sub_block);
         sub_block = new Block(m_current_block->nesting_depth(),
                               m_next_block_id++);
         sub_block->set_type(Block::alu, m_chip_class);
         sub_block->set_instr_flag(Instr::force_cf);
      }
      sub_block->push_back(group);
      if (group->has_lds_group_start())
         sub_block->lds_group_start(*group->begin());

      if (group->has_lds_group_end())
         sub_block->lds_group_end();

   }
   if (!sub_block->empty())
      out_blocks.push_back(sub_block);
}

template <typename I>
bool
BlockScheduler::schedule_cf(Shader::ShaderBlocks& out_blocks, std::list<I *>& ready_list)
{
   if (ready_list.empty())
      return false;
   if (m_current_block->type() != Block::cf)
      start_new_block(out_blocks, Block::cf);
   return schedule(ready_list);
}

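/* Walk the ready vec list and add as many instructions to the group's
 * vector slots as kcache space and array hazards allow, while keeping
 * the AR/IDX load bookkeeping up to date.
 */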
bool
BlockScheduler::schedule_alu_to_group_vec(AluGroup *group)
{
   assert(group);
   assert(!alu_vec_ready.empty());

   bool success = false;
   auto i = alu_vec_ready.begin();
   auto e = alu_vec_ready.end();
   while (i != e) {
      sfn_log << SfnLog::schedule << "Try schedule to vec " << **i;

      if (check_array_reads(**i)) {
         ++i;
         continue;
      }

      // precaution: don't kill while we have LDS queue reads in the pipeline
      if ((*i)->is_kill() && m_current_block->lds_group_active()) {
         ++i;
         continue;
      }

      if (!m_current_block->try_reserve_kcache(**i)) {
         sfn_log << SfnLog::schedule << " failed (kcache)\n";
         ++i;
         continue;
      }

      if (group->add_vec_instructions(*i)) {
         auto old_i = i;
         ++i;
         if ((*old_i)->has_alu_flag(alu_is_lds)) {
            --m_lds_addr_count;
         }

         if ((*old_i)->num_ar_uses())
            m_current_block->set_expected_ar_uses((*old_i)->num_ar_uses());
         auto addr = std::get<0>((*old_i)->indirect_addr());
         bool has_indirect_reg_load = addr != nullptr && addr->has_flag(Register::addr_or_idx);

         bool is_idx_load_on_eg = false;
         if (!(*old_i)->has_alu_flag(alu_is_lds)) {
            bool load_idx0_eg = (*old_i)->opcode() == op1_set_cf_idx0;
            bool load_idx0_ca = ((*old_i)->opcode() == op1_mova_int &&
                                 (*old_i)->dest()->sel() == AddressRegister::idx0);

            bool load_idx1_eg = (*old_i)->opcode() == op1_set_cf_idx1;
            bool load_idx1_ca = ((*old_i)->opcode() == op1_mova_int &&
                                 (*old_i)->dest()->sel() == AddressRegister::idx1);

            is_idx_load_on_eg = load_idx0_eg || load_idx1_eg;

            bool load_idx0 = load_idx0_eg || load_idx0_ca;
            bool load_idx1 = load_idx1_eg || load_idx1_ca;

            assert(!m_idx0_pending || !load_idx0);
            assert(!m_idx1_pending || !load_idx1);

            m_idx0_loading |= load_idx0;
            m_idx1_loading |= load_idx1;
         }

         if (has_indirect_reg_load || is_idx_load_on_eg)
            m_current_block->dec_expected_ar_uses();

         alu_vec_ready.erase(old_i);
         success = true;
         sfn_log << SfnLog::schedule << " success\n";
      } else {
         ++i;
         sfn_log << SfnLog::schedule << " failed\n";
      }
   }
   return success;
}

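/* Pick at most one ready instruction for the trans slot of the group,
 * again respecting kcache limits and array read hazards.
 */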
bool
BlockScheduler::schedule_alu_to_group_trans(AluGroup *group,
                                            std::list<AluInstr *>& readylist)
{
   assert(group);

   bool success = false;
   auto i = readylist.begin();
   auto e = readylist.end();
   while (i != e) {

      if (check_array_reads(**i)) {
         ++i;
         continue;
      }

      sfn_log << SfnLog::schedule << "Try schedule to trans " << **i;
      if (!m_current_block->try_reserve_kcache(**i)) {
         sfn_log << SfnLog::schedule << " failed (kcache)\n";
         ++i;
         continue;
      }

      if (group->add_trans_instructions(*i)) {
         auto old_i = i;
         ++i;
         auto addr = std::get<0>((*old_i)->indirect_addr());
         if (addr && addr->has_flag(Register::addr_or_idx))
            m_current_block->dec_expected_ar_uses();

         readylist.erase(old_i);
         success = true;
         sfn_log << SfnLog::schedule << " success\n";
         break;
      } else {
         ++i;
         sfn_log << SfnLog::schedule << " failed\n";
      }
   }
   return success;
}

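/* Generic helpers: schedule() emits a single ready instruction into the
 * current block, schedule_block() drains the ready list until the block
 * runs out of slots.
 */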
template <typename I>
bool
BlockScheduler::schedule(std::list<I *>& ready_list)
{
   if (!ready_list.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      ready_list.erase(ii);
      return true;
   }
   return false;
}

template <typename I>
bool
BlockScheduler::schedule_block(std::list<I *>& ready_list)
{
   bool success = false;
   while (!ready_list.empty() && m_current_block->remaining_slots() > 0) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << " "
              << m_current_block->remaining_slots() << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      ready_list.erase(ii);
      success = true;
   }
   return success;
}

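/* Emit one export into a CF block and remember the last export of each
 * type so that finalize() can mark it as the final one.
 */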
bool
BlockScheduler::schedule_exports(Shader::ShaderBlocks& out_blocks,
                                 std::list<ExportInstr *>& ready_list)
{
   if (m_current_block->type() != Block::cf)
      start_new_block(out_blocks, Block::cf);

   if (!ready_list.empty()) {
      auto ii = ready_list.begin();
      sfn_log << SfnLog::schedule << "Schedule: " << **ii << "\n";
      (*ii)->set_scheduled();
      m_current_block->push_back(*ii);
      switch ((*ii)->export_type()) {
      case ExportInstr::pos:
         m_last_pos = *ii;
         break;
      case ExportInstr::param:
         m_last_param = *ii;
         break;
      case ExportInstr::pixel:
         m_last_pixel = *ii;
         break;
      }
      (*ii)->set_is_last_export(false);
      ready_list.erase(ii);
      return true;
   }
   return false;
}

bool
BlockScheduler::collect_ready(CollectInstructions& available)
{
   sfn_log << SfnLog::schedule << "Ready instructions\n";
   bool result = false;
   result |= collect_ready_alu_vec(alu_vec_ready, available.alu_vec);
   result |= collect_ready_type(alu_trans_ready, available.alu_trans);
   result |= collect_ready_type(alu_groups_ready, available.alu_groups);
   result |= collect_ready_type(gds_ready, available.gds_op);
   result |= collect_ready_type(tex_ready, available.tex);
   result |= collect_ready_type(fetches_ready, available.fetches);
   result |= collect_ready_type(memops_ready, available.mem_write_instr);
   result |= collect_ready_type(mem_ring_writes_ready, available.mem_ring_writes);
   result |= collect_ready_type(write_tf_ready, available.write_tf);
   result |= collect_ready_type(rat_instr_ready, available.rat_instr);

   sfn_log << SfnLog::schedule << "\n";
   return result;
}

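/* Move ready vec ALU instructions into the ready list and sort them by
 * priority: LDS accesses come first, then instructions that need the
 * address register, while ops that could also go to the trans slot are
 * pushed to the back so they don't occupy vector slots needlessly.
 */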
bool
BlockScheduler::collect_ready_alu_vec(std::list<AluInstr *>& ready,
                                      std::list<AluInstr *>& available)
{
   auto i = available.begin();
   auto e = available.end();

   for (auto alu : ready) {
      alu->add_priority(100 * alu->register_priority());
   }

   int max_check = 0;
   while (i != e && max_check++ < 64) {
      if (ready.size() < 64 && (*i)->ready()) {

         int priority = 0;
         /* LDS fetches that use static offsets are usually ready very fast,
          * so they would get scheduled early, and this leaves the
          * problem that we allocate too many registers with just constant
          * values, and this will cause problems with RA. So limit the number
          * of LDS address registers.
          */
         if ((*i)->has_alu_flag(alu_lds_address)) {
            if (m_lds_addr_count > 64) {
               ++i;
               continue;
            } else {
               ++m_lds_addr_count;
            }
         }

         /* LDS instructions are scheduled with high priority.
          * Instructions that can go into the t slot and don't have
          * indirect access are put in last, so that they don't block
          * vec-only instructions when scheduling to the vector slots;
          * for everything else we look at the register use. */

         auto [addr, dummy1, dummy2] = (*i)->indirect_addr();

         if ((*i)->has_lds_access()) {
            priority = 100000;
            if ((*i)->has_alu_flag(alu_is_lds))
               priority += 100000;
         } else if (addr) {
            priority = 10000;
         } else if (AluGroup::has_t()) {
            auto opinfo = alu_ops.find((*i)->opcode());
            assert(opinfo != alu_ops.end());
            if (opinfo->second.can_channel(AluOp::t, m_chip_class))
               priority = -1;
         }

         priority += 100 * (*i)->register_priority();

         (*i)->add_priority(priority);
         ready.push_back(*i);

         auto old_i = i;
         ++i;
         available.erase(old_i);
      } else
         ++i;
   }

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << "V:  " << *i << "\n";

   ready.sort([](const AluInstr *lhs, const AluInstr *rhs) {
      return lhs->priority() > rhs->priority();
   });

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << "V (S):  " << i->priority() << " " << *i << "\n";

   return !ready.empty();
}

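/* Single-character tags per instruction type, used only to label the
 * entries that collect_ready_type() prints to the schedule debug log.
 */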
template <typename T> struct type_char {
};

template <> struct type_char<AluInstr> {
   static char value() { return 'A'; }
};

template <> struct type_char<AluGroup> {
   static char value() { return 'G'; }
};

template <> struct type_char<ExportInstr> {
   static char value() { return 'E'; }
};

template <> struct type_char<TexInstr> {
   static char value() { return 'T'; }
};

template <> struct type_char<FetchInstr> {
   static char value() { return 'F'; }
};

template <> struct type_char<WriteOutInstr> {
   static char value() { return 'M'; }
};

template <> struct type_char<MemRingOutInstr> {
   static char value() { return 'R'; }
};

template <> struct type_char<WriteTFInstr> {
   static char value() { return 'X'; }
};

template <> struct type_char<GDSInstr> {
   static char value() { return 'S'; }
};

template <> struct type_char<RatInstr> {
   static char value() { return 'I'; }
};

template <typename T>
bool
BlockScheduler::collect_ready_type(std::list<T *>& ready, std::list<T *>& available)
{
   auto i = available.begin();
   auto e = available.end();

   int lookahead = 16;
   while (i != e && ready.size() < 16 && lookahead-- > 0) {
      if ((*i)->ready()) {
         ready.push_back(*i);
         auto old_i = i;
         ++i;
         available.erase(old_i);
      } else
         ++i;
   }

   for (auto& i : ready)
      sfn_log << SfnLog::schedule << type_char<T>::value() << ";  " << *i << "\n";

   return !ready.empty();
}

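/* Visitors used to detect read-after-write hazards on indexed register
 * arrays; on chips that need NOP padding around relative access, a
 * conflicting read forces an extra (NOP) group before the instruction.
 */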
class CheckArrayAccessVisitor : public ConstRegisterVisitor {
public:
   using ConstRegisterVisitor::visit;
   void visit(const Register& value) override {(void)value;}
   void visit(const LocalArray& value) override {(void)value;}
   void visit(const UniformValue& value) override {(void)value;}
   void visit(const LiteralConstant& value) override {(void)value;}
   void visit(const InlineConstant& value) override {(void)value;}
};

class UpdateArrayWrite : public CheckArrayAccessVisitor {
public:
   UpdateArrayWrite(ArrayCheckSet& indirect_arrays,
                    ArrayCheckSet& direct_arrays,
                    bool tdw):
      last_indirect_array_write(indirect_arrays),
      last_direct_array_write(direct_arrays),
      track_direct_writes(tdw)
   {
   }

   void visit(const LocalArrayValue& value) override {
      int array_base = value.array().base_sel();
      auto entry = std::make_pair(array_base, value.chan());
      if (value.addr())
         last_indirect_array_write.insert(entry);
      else if (track_direct_writes)
         last_direct_array_write.insert(entry);
   }
private:
   ArrayCheckSet& last_indirect_array_write;
   ArrayCheckSet& last_direct_array_write;
   bool track_direct_writes {false};
};

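/* Record which array elements the group just scheduled wrote to, so the
 * next candidate instructions can be checked against them.
 */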
void BlockScheduler::update_array_writes(const AluGroup& group)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {
      m_last_direct_array_write.clear();
      m_last_indirect_array_write.clear();

      UpdateArrayWrite visitor(m_last_indirect_array_write,
                               m_last_direct_array_write,
                               m_nop_befor_rel_src);

      for (auto alu : group) {
         if (alu && alu->dest())
            alu->dest()->accept(visitor);
      }
   }
}

class CheckArrayRead : public CheckArrayAccessVisitor {
public:
   CheckArrayRead(const ArrayCheckSet& indirect_arrays,
                  const ArrayCheckSet& direct_arrays):
      last_indirect_array_write(indirect_arrays),
      last_direct_array_write(direct_arrays)
   {
   }

   void visit(const LocalArrayValue& value) override {
      int array_base = value.array().base_sel();
      auto entry = std::make_pair(array_base, value.chan());

      if (last_indirect_array_write.find(entry) !=
          last_indirect_array_write.end())
         need_extra_group = true;

      if (value.addr() && last_direct_array_write.find(entry) !=
          last_direct_array_write.end()) {
         need_extra_group = true;
      }
   }

   const ArrayCheckSet& last_indirect_array_write;
   const ArrayCheckSet& last_direct_array_write;
   bool need_extra_group {false};
};


bool BlockScheduler::check_array_reads(const AluInstr& instr)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {

      CheckArrayRead visitor(m_last_indirect_array_write,
                             m_last_direct_array_write);

      for (auto& s : instr.sources()) {
         s->accept(visitor);
      }
      return visitor.need_extra_group;
   }
   return false;
}

bool BlockScheduler::check_array_reads(const AluGroup& group)
{
   if (m_nop_after_rel_dest || m_nop_befor_rel_src) {

      CheckArrayRead visitor(m_last_indirect_array_write,
                             m_last_direct_array_write);

      for (auto alu : group) {
         if (!alu)
            continue;
         for (auto& s : alu->sources()) {
            s->accept(visitor);
         }
      }
      return visitor.need_extra_group;
   }
   return false;
}

} // namespace r600