/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "util/bitset.h"
#include "util/enum_operators.h"

#include <algorithm>
#include <array>
#include <bitset>
#include <map>
#include <optional>
#include <vector>

namespace aco {
namespace {

struct ra_ctx;
struct DefInfo;

unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                                     unsigned idx, RegClass rc);
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                          RegClass rc);
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
                             bool allow_16bit_write);

struct assignment {
   PhysReg reg;
   RegClass rc;
   union {
      struct {
         bool assigned : 1;
         bool vcc : 1;
         bool m0 : 1;
         bool renamed : 1;
      };
      uint8_t _ = 0;
   };
   uint32_t affinity = 0;
   assignment() = default;
   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }
   void set(const Definition& def)
   {
      assigned = true;
      reg = def.physReg();
      rc = def.regClass();
   }
};

/* Iterator type for making PhysRegInterval compatible with range-based for */
struct PhysRegIterator {
   using difference_type = int;
   using value_type = unsigned;
   using reference = const unsigned&;
   using pointer = const unsigned*;
   using iterator_category = std::bidirectional_iterator_tag;

   PhysReg reg;

   PhysReg operator*() const { return reg; }

   PhysRegIterator& operator++()
   {
      reg.reg_b += 4;
      return *this;
   }

   PhysRegIterator& operator--()
   {
      reg.reg_b -= 4;
      return *this;
   }

   bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }

   bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }

   bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
};
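
/* Stepping a PhysRegIterator advances reg_b by four bytes, i.e. one whole
 * register per step, so range-based iteration over a PhysRegInterval visits
 * each dword register of the interval exactly once. */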

struct ra_ctx {

   Program* program;
   Block* block = NULL;
   aco::monotonic_buffer_resource memory;
   std::vector<assignment> assignments;
   std::vector<aco::unordered_map<uint32_t, Temp>> renames;
   std::vector<uint32_t> loop_header;
   aco::unordered_map<uint32_t, Temp> orig_names;
   aco::unordered_map<uint32_t, Instruction*> vectors;
   aco::unordered_map<uint32_t, Instruction*> split_vectors;
   aco_ptr<Instruction> pseudo_dummy;
   aco_ptr<Instruction> phi_dummy;
   uint16_t max_used_sgpr = 0;
   uint16_t max_used_vgpr = 0;
   uint16_t sgpr_limit;
   uint16_t vgpr_limit;
   std::bitset<512> war_hint;
   PhysRegIterator rr_sgpr_it;
   PhysRegIterator rr_vgpr_it;

   uint16_t sgpr_bounds;
   uint16_t vgpr_bounds;
   uint16_t num_linear_vgprs;

   ra_test_policy policy;

   ra_ctx(Program* program_, ra_test_policy policy_)
       : program(program_), assignments(program->peekAllocationId()),
         renames(program->blocks.size(), aco::unordered_map<uint32_t, Temp>(memory)),
         orig_names(memory), vectors(memory), split_vectors(memory), policy(policy_)
   {
      pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
      phi_dummy.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
      sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
      vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

      sgpr_bounds = program->max_reg_demand.sgpr;
      vgpr_bounds = program->max_reg_demand.vgpr;
      num_linear_vgprs = 0;
   }
};

/* Half-open register interval used in "sliding window"-style for-loops */
struct PhysRegInterval {
   PhysReg lo_;
   unsigned size;

   /* Inclusive lower bound */
   PhysReg lo() const { return lo_; }

   /* Exclusive upper bound */
   PhysReg hi() const { return PhysReg{lo() + size}; }

   PhysRegInterval& operator+=(uint32_t stride)
   {
      lo_ = PhysReg{lo_.reg() + stride};
      return *this;
   }

   bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }

   /* Construct a half-open interval, excluding the end register */
   static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }

   bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }

   bool contains(const PhysRegInterval& needle) const
   {
      return needle.lo() >= lo() && needle.hi() <= hi();
   }

   PhysRegIterator begin() const { return {lo_}; }

   PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
};
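
/* Illustrative sketch of the "sliding window" usage (VGPRs start at
 * encoding 256, so PhysReg(258) is v2):
 *
 *    PhysRegInterval window{PhysReg(256 + 2), 4};
 *    for (PhysReg r : window)
 *       ;            // r visits v2, v3, v4, v5 (v6 is excluded: half-open)
 *    window += 1;    // slide the window up to [v3, v7)
 */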

bool
intersects(const PhysRegInterval& a, const PhysRegInterval& b)
{
   return a.hi() > b.lo() && b.hi() > a.lo();
}

/* Gets the stride for full (non-subdword) registers */
uint32_t
get_stride(RegClass rc)
{
   if (rc.type() == RegType::vgpr) {
      return 1;
   } else {
      uint32_t size = rc.size();
      if (size == 2) {
         return 2;
      } else if (size >= 4) {
         return 4;
      } else {
         return 1;
      }
   }
}
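
/* e.g. get_stride(s2) == 2 and get_stride(s8) == 4: wide SGPR classes need an
 * aligned starting register, while any VGPR class has stride 1 and can start
 * anywhere. */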

PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr)
{
   uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs;
   if (type == RegType::vgpr && linear_vgpr) {
      return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs};
   } else if (type == RegType::vgpr) {
      return PhysRegInterval{PhysReg(256), linear_vgpr_start};
   } else {
      return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds};
   }
}

PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegClass rc)
{
   return get_reg_bounds(ctx, rc.type(), rc.is_linear_vgpr());
}
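
/* Resulting layout of the VGPR file: linear VGPRs are carved out of the top,
 * so the two regions never overlap:
 *
 *    [256, 256 + linear_vgpr_start)                normal VGPRs
 *    [256 + linear_vgpr_start, 256 + vgpr_bounds)  linear VGPRs
 */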

struct DefInfo {
   PhysRegInterval bounds;
   uint8_t size;
   uint8_t stride;
   /* Even if stride=4, we might be able to write to the high half instead without preserving the
    * low half. In that case, data_stride=2. */
   uint8_t data_stride;
   RegClass rc;

   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);
      data_stride = 0;

      bounds = get_reg_bounds(ctx, rc);

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
      } else if (rc.is_subdword()) {
         get_subdword_definition_info(ctx.program, instr);
      } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
         /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
          *
          * The register use is not calculated correctly, and the hardware assumes a
          * full dword per component. Don't use the last registers of the register file.
          * Otherwise, the instruction will be skipped.
          *
          * https://reviews.llvm.org/D81172
          */
         bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
         assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");

         if (imageGather4D16Bug)
            bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0);
      }

      if (!data_stride)
         data_stride = rc.is_subdword() ? stride : (stride * 4);
   }

private:
   void get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr);
};
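
/* e.g. for a v2b D16 load on a GFX9 chip with SRAM-ECC,
 * get_subdword_definition_info() sets rc = v1, stride = 1, data_stride = 2:
 * the result needs a full dword slot, but the 16-bit payload may be written
 * to either half of it. */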

class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

   std::array<uint32_t, 512> regs;
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   unsigned count_zero(PhysRegInterval reg_interval) const
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes) const
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            auto it = subdword_regs.find(i);
            assert(it != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (it->second[j])
                  return true;
            }
         }
      }
      return false;
   }

   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   bool is_blocked(PhysReg start) const
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         auto it = subdword_regs.find(start);
         assert(it != subdword_regs.end());
         for (unsigned i = start.byte(); i < 4; i++)
            if (it->second[i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start) const
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         auto it = subdword_regs.find(start);
         assert(it != subdword_regs.end());
         return it->second[start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   void fill(Operand op)
   {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   unsigned get_id(PhysReg reg) const
   {
      return regs[reg] == 0xF0000000 ? subdword_regs.at(reg)[reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};
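
/* Encoding of RegisterFile::regs entries, as used by the accessors above:
 *
 *    0           register is free
 *    0xFFFFFFFF  register is blocked
 *    0xF0000000  register is partially used; subdword_regs holds the
 *                per-byte temp-ids, with the same encoding per byte
 *    otherwise   temp-id of the variable occupying the register
 */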

std::vector<unsigned> find_vars(ra_ctx& ctx, const RegisterFile& reg_file,
                                const PhysRegInterval reg_interval);

/* helper function for debugging */
UNUSED void
print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
{
   if (reg_file[reg] == 0xFFFFFFFF) {
      printf((const char*)u8"☐");
   } else if (reg_file[reg]) {
      const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
      if (show_subdword_alloc) {
         auto block_chars = {
            // clang-format off
            u8"?", u8"▘", u8"▝", u8"▀",
            u8"▖", u8"▌", u8"▞", u8"▛",
            u8"▗", u8"▚", u8"▐", u8"▜",
            u8"▄", u8"▙", u8"▟", u8"▉"
            // clang-format on
         };
         unsigned index = 0;
         for (int i = 0; i < 4; ++i) {
            if (reg_file.subdword_regs.at(reg)[i]) {
               index |= 1 << i;
            }
         }
         printf("%s", (const char*)(block_chars.begin()[index]));
      } else {
         /* Indicate filled register slot */
         if (!has_adjacent_variable) {
            printf((const char*)u8"█");
         } else {
            /* Use a slightly shorter box to leave a small gap between adjacent variables */
            printf((const char*)u8"▉");
         }
      }
   } else {
      printf((const char*)u8"·");
   }
}

/* helper function for debugging */
UNUSED void
print_regs(ra_ctx& ctx, PhysRegInterval regs, const RegisterFile& reg_file)
{
   char reg_char = regs.lo().reg() >= 256 ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf("       ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         printf("  %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */
   for (unsigned id : find_vars(ctx, reg_file, regs)) {
      const assignment& var = ctx.assignments[id];
      PhysReg reg = var.reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() % 256);
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() % 256);
      }
      printf("]");
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}

unsigned
get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                            unsigned idx, RegClass rc)
{
   assert(gfx_level >= GFX8);
   if (instr->isPseudo()) {
      /* v_readfirstlane_b32 cannot use SDWA */
      if (instr->opcode == aco_opcode::p_as_uniform)
         return 4;
      else
         return rc.bytes() % 2 == 0 ? 2 : 1;
   }

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (can_use_SDWA(gfx_level, instr, false))
         return rc.bytes();
      if (can_use_opsel(gfx_level, instr->opcode, idx))
         return 2;
      if (instr->isVOP3P())
         return 2;
   }

   switch (instr->opcode) {
   case aco_opcode::v_cvt_f32_ubyte0: return 1;
   case aco_opcode::ds_write_b8:
   case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
   case aco_opcode::buffer_store_byte:
   case aco_opcode::buffer_store_short:
   case aco_opcode::buffer_store_format_d16_x:
   case aco_opcode::flat_store_byte:
   case aco_opcode::flat_store_short:
   case aco_opcode::scratch_store_byte:
   case aco_opcode::scratch_store_short:
   case aco_opcode::global_store_byte:
   case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
   default: return 4;
   }
}
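
/* The returned stride is in bytes: e.g. an SDWA-capable VALU instruction can
 * read a v2b operand from either half of a VGPR (stride 2), whereas
 * p_as_uniform lowers to v_readfirstlane_b32, which cannot use SDWA, so its
 * operand must stay dword-aligned (stride 4). */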

void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                     RegClass rc)
{
   amd_gfx_level gfx_level = ctx.program->gfx_level;
   if (instr->isPseudo() || byte == 0)
      return;

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
         switch (byte) {
         case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
         case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
         case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
         case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
         }
         return;
      }

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      /* use opsel */
      if (instr->isVOP3P()) {
         assert(byte == 2 && !instr->valu().opsel_lo[idx]);
         instr->valu().opsel_lo[idx] = true;
         instr->valu().opsel_hi[idx] = true;
         return;
      }

      assert(can_use_opsel(gfx_level, instr->opcode, idx));
      instr->valu().opsel[idx] = true;
      return;
   }

   assert(byte == 2);
   if (instr->opcode == aco_opcode::ds_write_b8)
      instr->opcode = aco_opcode::ds_write_b8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_write_b16)
      instr->opcode = aco_opcode::ds_write_b16_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_byte)
      instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_short)
      instr->opcode = aco_opcode::buffer_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
      instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_store_byte)
      instr->opcode = aco_opcode::flat_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_store_short)
      instr->opcode = aco_opcode::flat_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_byte)
      instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_short)
      instr->opcode = aco_opcode::scratch_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_byte)
      instr->opcode = aco_opcode::global_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_short)
      instr->opcode = aco_opcode::global_store_short_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
   return;
}

void
DefInfo::get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr)
{
   amd_gfx_level gfx_level = program->gfx_level;
   assert(gfx_level >= GFX8);

   stride = rc.bytes() % 2 == 0 ? 2 : 1;

   if (instr->isPseudo()) {
      if (instr->opcode == aco_opcode::p_interp_gfx11) {
         rc = RegClass(RegType::vgpr, rc.size());
         stride = 1;
      }
      return;
   }

   if (instr->isVALU()) {
      assert(rc.bytes() <= 2);

      if (can_use_SDWA(gfx_level, instr, false) || instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return;

      rc = instr_is_16bit(gfx_level, instr->opcode) ? v2b : v1;
      stride = rc == v2b ? 4 : 1;
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
          can_use_opsel(gfx_level, instr->opcode, -1)) {
         data_stride = 2;
         stride = rc == v2b ? 2 : stride;
      }
      return;
   }

   switch (instr->opcode) {
   case aco_opcode::v_interp_p2_f16: return;
   /* D16 loads with _hi version */
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_i8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_sbyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_sbyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_sbyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16:
   case aco_opcode::buffer_load_format_d16_x: {
      assert(gfx_level >= GFX9);
      if (program->dev.sram_ecc_enabled) {
         rc = v1;
         stride = 1;
         data_stride = 2;
      } else {
         stride = 2;
      }
      return;
   }
   /* 3-component D16 loads */
   case aco_opcode::buffer_load_format_d16_xyz:
   case aco_opcode::tbuffer_load_format_d16_xyz: {
      assert(gfx_level >= GFX9);
      if (program->dev.sram_ecc_enabled) {
         rc = v2;
         stride = 1;
      } else {
         stride = 4;
      }
      return;
   }
   default: break;
   }

   if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
      assert(gfx_level >= GFX9);
      stride = 4;
   } else {
      rc = RegClass(RegType::vgpr, rc.size());
      stride = 1;
   }
}

void
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
                        bool allow_16bit_write)
{
   if (instr->isPseudo())
      return;

   if (instr->isVALU()) {
      amd_gfx_level gfx_level = program->gfx_level;
      assert(instr->definitions[0].bytes() <= 2);

      if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return;

      if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode))
         return;

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      assert(allow_16bit_write);

      if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
         instr->opcode = aco_opcode::v_fma_mixhi_f16;
         return;
      }

      /* use opsel */
      assert(reg.byte() == 2);
      assert(can_use_opsel(gfx_level, instr->opcode, -1));
      instr->valu().opsel[3] = true; /* dst in high half */
      return;
   }

   if (reg.byte() == 0)
      return;
   else if (instr->opcode == aco_opcode::v_interp_p2_f16)
      instr->opcode = aco_opcode::v_interp_p2_hi_f16;
   else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
      instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
      instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_short_d16)
      instr->opcode = aco_opcode::buffer_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
      instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
      instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
      instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_short_d16)
      instr->opcode = aco_opcode::flat_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
      instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
      instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_short_d16)
      instr->opcode = aco_opcode::scratch_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
      instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
      instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_short_d16)
      instr->opcode = aco_opcode::global_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u8_d16)
      instr->opcode = aco_opcode::ds_read_u8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_i8_d16)
      instr->opcode = aco_opcode::ds_read_i8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u16_d16)
      instr->opcode = aco_opcode::ds_read_u16_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
}

void
adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
{
   uint16_t max_addressible_sgpr = ctx.sgpr_limit;
   unsigned size = rc.size();
   if (rc.type() == RegType::vgpr) {
      assert(reg >= 256);
      uint16_t hi = reg - 256 + size - 1;
      assert(hi <= 255);
      ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
   } else if (reg + rc.size() <= max_addressible_sgpr) {
      uint16_t hi = reg + size - 1;
      ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
   }
}

enum UpdateRenames {
   rename_not_killed_ops = 0x1,
   fill_killed_ops = 0x2,
   rename_precolored_ops = 0x4,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);

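/* After new parallelcopies have been created, this allocates fresh temp-ids
 * for copy definitions that lack one, rewrites the affected operands and
 * definitions of instr, and keeps reg_file in sync. The flags control whether
 * copies of killed operands are entered into reg_file and whether not-killed
 * or precolored operands get renamed. */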
void
update_renames(ra_ctx& ctx, RegisterFile& reg_file,
               std::vector<std::pair<Operand, Definition>>& parallelcopies,
               aco_ptr<Instruction>& instr, UpdateRenames flags)
{
   /* clear operands */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;
      reg_file.clear(copy.first);
   }

   /* allocate id's and rename operands: this is done transparently here */
   auto it = parallelcopies.begin();
   while (it != parallelcopies.end()) {
      if (it->second.isTemp()) {
         ++it;
         continue;
      }

      /* check if we moved a definition: change the register and remove copy */
      bool is_def = false;
      for (Definition& def : instr->definitions) {
         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
            // FIXME: ensure that the definition can use this reg
            def.setFixed(it->second.physReg());
            reg_file.fill(def);
            ctx.assignments[def.tempId()].reg = def.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            break;
         }
      }
      if (is_def)
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (it->first.getTemp() == other.second.getTemp()) {
            other.second.setFixed(it->second.physReg());
            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            /* check if we moved an operand, again */
            bool fill = true;
            for (Operand& op : instr->operands) {
               if (op.isTemp() && op.tempId() == other.second.tempId()) {
                  // FIXME: ensure that the operand can use this reg
                  op.setFixed(other.second.physReg());
                  fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
               }
            }
            if (fill)
               reg_file.fill(other.second);
            break;
         }
      }
      if (is_def)
         continue;

      std::pair<Operand, Definition>& copy = *it;
      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());

      /* check if we moved an operand */
      bool first[2] = {true, true};
      bool fill = true;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* only rename precolored operands if the copy-location matches */
            bool omit_renaming = (flags & rename_precolored_ops) && op.isFixed() &&
                                 op.physReg() != copy.second.physReg();

            /* Omit renaming in some cases for p_create_vector in order to avoid
             * unnecessary shuffle code. */
            if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) {
               omit_renaming = true;
               for (std::pair<Operand, Definition>& pc : parallelcopies) {
                  PhysReg def_reg = pc.second.physReg();
                  omit_renaming &= def_reg > copy.first.physReg()
                                      ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
                                      : (def_reg + pc.second.size() <= copy.first.physReg().reg());
               }
            }

            /* Fix the kill flags */
            if (first[omit_renaming])
               op.setFirstKill(omit_renaming || op.isKill());
            else
               op.setKill(omit_renaming || op.isKill());
            first[omit_renaming] = false;

            if (omit_renaming)
               continue;

            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());

            fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
         }
      }

      if (fill)
         reg_file.fill(copy.second);

      ++it;
   }
}

std::optional<PhysReg>
get_reg_simple(ra_ctx& ctx, const RegisterFile& reg_file, DefInfo info)
{
   PhysRegInterval bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
   RegClass rc = info.rc;

   if (stride < size && !rc.is_subdword()) {
      DefInfo new_info = info;
      new_info.stride = stride * 2;
      if (size % new_info.stride == 0) {
         std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info);
         if (res)
            return res;
      }
   }

   PhysRegIterator& rr_it = rc.type() == RegType::vgpr ? ctx.rr_vgpr_it : ctx.rr_sgpr_it;
   if (stride == 1) {
      if (rr_it != bounds.begin() && bounds.contains(rr_it.reg)) {
         assert(bounds.begin() < rr_it);
         assert(rr_it < bounds.end());
         info.bounds = PhysRegInterval::from_until(rr_it.reg, bounds.hi());
         std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
         if (res)
            return res;
         bounds = PhysRegInterval::from_until(bounds.lo(), rr_it.reg);
      }
   }

   auto is_free = [&](PhysReg reg_index)
   { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };

   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      if (std::all_of(reg_win.begin(), reg_win.end(), is_free)) {
         if (stride == 1) {
            PhysRegIterator new_rr_it{PhysReg{reg_win.lo() + size}};
            if (new_rr_it < bounds.end())
               rr_it = new_rr_it;
         }
         adjust_max_used_regs(ctx, rc, reg_win.lo());
         return reg_win.lo();
      }
   }

   /* do this late because using the upper bytes of a register can require
    * larger instruction encodings or copies
    * TODO: don't do this in situations where it doesn't benefit */
   if (rc.is_subdword()) {
      for (const std::pair<const uint32_t, std::array<uint32_t, 4>>& entry :
           reg_file.subdword_regs) {
         assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
         if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
            continue;

         auto it = entry.second.begin();
         for (unsigned i = 0; i < 4; i += info.stride) {
            /* check if there's a block of free bytes large enough to hold the register */
            bool reg_found =
               std::all_of(std::next(it, i), std::next(it, std::min(4u, i + rc.bytes())),
                           [](unsigned v) { return v == 0; });

            /* check if also the neighboring reg is free if needed */
            if (reg_found && i + rc.bytes() > 4)
               reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);

            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               adjust_max_used_regs(ctx, rc, entry.first);
               return res;
            }
         }
      }
   }

   return {};
}
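
/* For stride == 1, the search above first resumes from the per-type
 * round-robin iterator (rr_vgpr_it / rr_sgpr_it) and only then wraps around
 * to the start of the bounds; this spreads short-lived temporaries across the
 * file instead of always reusing the lowest free registers. */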

/* collect variables from a register area */
std::vector<unsigned>
find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> vars;
   for (PhysReg j : reg_interval) {
      if (reg_file.is_blocked(j))
         continue;
      if (reg_file[j] == 0xF0000000) {
         for (unsigned k = 0; k < 4; k++) {
            unsigned id = reg_file.subdword_regs.at(j)[k];
            if (id && (vars.empty() || id != vars.back()))
               vars.emplace_back(id);
         }
      } else {
         unsigned id = reg_file[j];
         if (id && (vars.empty() || id != vars.back()))
            vars.emplace_back(id);
      }
   }
   return vars;
}

/* collect variables from a register area and clear reg_file
 * variables are sorted in decreasing size and
 * increasing assigned register
 */
std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
   std::sort(ids.begin(), ids.end(),
             [&](unsigned a, unsigned b)
             {
                assignment& var_a = ctx.assignments[a];
                assignment& var_b = ctx.assignments[b];
                return var_a.rc.bytes() > var_b.rc.bytes() ||
                       (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
             });

   for (unsigned id : ids) {
      assignment& var = ctx.assignments[id];
      reg_file.clear(var.reg, var.rc);
   }
   return ids;
}
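
/* e.g. collecting hypothetical temporaries {%9:s1 @ s[2], %8:s2 @ s[4-5]}
 * from an interval yields the order [%8, %9]: larger variables are re-placed
 * first so that smaller ones cannot fragment the space they need. */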

std::optional<PhysReg>
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
                               aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
                               DefInfo info, unsigned id)
{
   PhysReg reg = def_reg.lo();
   /* dead operand: return position in vector */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
          instr->operands[i].isKillBeforeDef()) {
         assert(!reg_file.test(reg, instr->operands[i].bytes()));
         if (info.rc.is_subdword() || reg.byte() == 0)
            return reg;
         else
            return {};
      }
      reg.reg_b += instr->operands[i].bytes();
   }

   /* GFX9+ has a VGPR swap instruction. */
   if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr)
      return {};

   /* check if the previous position was in vector */
   assignment& var = ctx.assignments[id];
   if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
      reg = def_reg.lo();
      /* try to use the previous register of the operand */
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         if (reg != var.reg) {
            reg.reg_b += instr->operands[i].bytes();
            continue;
         }

         /* check if we can swap positions */
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
             instr->operands[i].regClass() == info.rc) {
            assignment& op = ctx.assignments[instr->operands[i].tempId()];
            /* if everything matches, create parallelcopy for the killed operand */
            if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc &&
                reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
               Definition pc_def = Definition(reg, info.rc);
               parallelcopies.emplace_back(instr->operands[i], pc_def);
               return op.reg;
            }
         }
         return {};
      }
   }
   return {};
}

bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
                    const PhysRegInterval def_reg)
{
   /* Variables are sorted from large to small and with increasing assigned register */
   for (unsigned id : vars) {
      assignment& var = ctx.assignments[id];
      PhysRegInterval bounds = get_reg_bounds(ctx, var.rc);
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition
       * also use the correct stride for sub-dword operands */
      bool is_dead_operand = false;
      std::optional<PhysReg> res;
      if (instr->opcode == aco_opcode::p_create_vector) {
         res =
            get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
      } else {
         for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
            if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
               info = DefInfo(ctx, instr, var.rc, i);
               if (instr->operands[i].isKillBeforeDef()) {
                  info.bounds = def_reg;
                  res = get_reg_simple(ctx, reg_file, info);
                  is_dead_operand = true;
               }
               break;
            }
         }
      }
      if (!res && !def_reg.size) {
         /* If this is before definitions are handled, def_reg may be an empty interval. */
         info.bounds = bounds;
         res = get_reg_simple(ctx, reg_file, info);
      } else if (!res) {
         /* Try to find space within the bounds but outside of the definition */
         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
         res = get_reg_simple(ctx, reg_file, info);
         if (!res && def_reg.hi() <= bounds.hi()) {
            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res) {
         /* mark the area as blocked */
         reg_file.block(*res, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(*res, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      PhysReg best_pos = bounds.lo();
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
           reg_win += stride) {
         if (!is_dead_operand && intersects(reg_win, def_reg))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (PhysReg j : reg_win) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(j) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_win.lo();
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      PhysRegInterval reg_win{best_pos, size};

      /* collect variables and block reg file */
      std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);

      /* mark the area as blocked */
      reg_file.block(reg_win.lo(), var.rc);
      adjust_max_used_regs(ctx, var.rc, reg_win.lo());

      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg))
         return false;

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}
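
/* Note: the recursive call above handles variables displaced from the chosen
 * window. A window is only accepted if every fully-allocated, non-killed
 * variable in it is strictly smaller than the variable being placed, which
 * keeps the displacement chains bounded. */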

std::optional<PhysReg>
get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
             std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
             aco_ptr<Instruction>& instr)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(bounds);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   std::bitset<256> is_killed_operand; /* per-register */
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      Operand& op = instr->operands[j];
      if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
          !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
         assert(op.isFixed());

         for (unsigned i = 0; i < op.size(); ++i) {
            is_killed_operand[(op.physReg() & 0xff) + i] = true;
         }

         killed_ops += op.getTemp().size();
      }
   }

   assert((regs_free + ctx.num_linear_vgprs) >= size);

   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   PhysRegInterval best_win = {bounds.lo(), size};
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      /* first check if the register window starts in the middle of an
       * allocated variable: this is what we have to fix to allow for
       * num_moves > size */
      if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      for (const PhysReg j : reg_win) {
         /* dead operands effectively reduce the number of estimated moves */
         if (is_killed_operand[j & 0xFF]) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs */
         if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_win = reg_win;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF)
      return {};

   /* now, we figured the placement for our definition */
   RegisterFile tmp_file(reg_file);

   /* p_create_vector: also re-place killed operands in the definition space */
   if (instr->opcode == aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);

   /* re-enable killed operands */
   if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
      return {};

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   adjust_max_used_regs(ctx, rc, best_win.lo());
   return best_win.lo();
}
1383 
1384 bool
get_reg_specified(ra_ctx & ctx,const RegisterFile & reg_file,RegClass rc,aco_ptr<Instruction> & instr,PhysReg reg,int operand)1385 get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
1386                   aco_ptr<Instruction>& instr, PhysReg reg, int operand)
1387 {
1388    /* catch out-of-range registers */
1389    if (reg >= PhysReg{512})
1390       return false;
1391 
1392    DefInfo info(ctx, instr, rc, operand);
1393 
1394    if (reg.reg_b % info.data_stride)
1395       return false;
1396 
1397    assert(util_is_power_of_two_nonzero(info.stride));
1398    reg.reg_b &= ~(info.stride - 1);
1399 
1400    PhysRegInterval reg_win = {PhysReg(reg.reg()), info.rc.size()};
1401    PhysRegInterval vcc_win = {vcc, 2};
1402    /* VCC is outside the bounds */
1403    bool is_vcc =
1404       info.rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
1405    bool is_m0 = info.rc == s1 && reg == m0 && can_write_m0(instr);
1406    if (!info.bounds.contains(reg_win) && !is_vcc && !is_m0)
1407       return false;
1408 
1409    if (reg_file.test(reg, info.rc.bytes()))
1410       return false;
1411 
1412    adjust_max_used_regs(ctx, info.rc, reg_win.lo());
1413    return true;
1414 }
1415 
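     /* Grows the VGPR or SGPR bounds by one register (within the limits) and
      * lets update_vgpr_sgpr_demand() recompute the program's demand. VGPR
      * growth is refused while linear VGPRs exist, because they occupy the top
      * of the VGPR file. */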
1416 bool
1417 increase_register_file(ra_ctx& ctx, RegClass rc)
1418 {
1419    if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
1420        ctx.vgpr_bounds < ctx.vgpr_limit) {
1421       /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
1422       update_vgpr_sgpr_demand(
1423          ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));
1424 
1425       ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
1426    } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
1427       update_vgpr_sgpr_demand(
1428          ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));
1429 
1430       ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
1431    } else {
1432       return false;
1433    }
1434 
1435    return true;
1436 }
1437 
1438 struct IDAndRegClass {
1439    IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}
1440 
1441    unsigned id;
1442    RegClass rc;
1443 };
1444 
1445 struct IDAndInfo {
1446    IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}
1447 
1448    unsigned id;
1449    DefInfo info;
1450 };
1451 
1452 void
1453 add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val)
1454 {
1455    ctx.renames[ctx.block->index][orig_val.id()] = new_val;
1456    ctx.orig_names.emplace(new_val.id(), orig_val);
1457    ctx.assignments[orig_val.id()].renamed = true;
1458 }
1459 
1460 /* Reallocates vars by sorting them and placing each variable after the previous
1461  * one. If one of the variables has 0xffffffff as an ID, the register assigned
1462  * for that variable will be returned.
1463  */
1464 PhysReg
1465 compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
1466                       std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
1467 {
1468    /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
1469     * temporary sizes to dwords.
1470     */
1471    std::vector<IDAndInfo> sorted;
1472    for (IDAndRegClass var : vars) {
1473       DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
1474       sorted.emplace_back(var.id, info);
1475    }
1476 
1477    std::sort(
1478       sorted.begin(), sorted.end(),
1479       [&ctx](const IDAndInfo& a, const IDAndInfo& b)
1480       {
1481          unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
1482          unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
1483          if (a_stride > b_stride)
1484             return true;
1485          if (a_stride < b_stride)
1486             return false;
1487          if (a.id == 0xffffffff || b.id == 0xffffffff)
1488             return a.id ==
1489                    0xffffffff; /* place 0xffffffff before others if possible, for no particular reason */
1490          return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
1491       });
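        /* With this ordering, e.g. a v2 with dword stride 2 sorts before two v1
         * temps at v[3] and v[7]: descending stride avoids padding holes, and
         * equal-stride variables keep their current register order. */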
1492 
1493    PhysReg next_reg = start;
1494    PhysReg space_reg;
1495    for (IDAndInfo& var : sorted) {
1496       unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
1497       next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));
1498 
1499       /* 0xffffffff is a special variable ID used to reserve space for killed
1500        * operands and definitions.
1501        */
1502       if (var.id != 0xffffffff) {
1503          if (next_reg != ctx.assignments[var.id].reg) {
1504             RegClass rc = ctx.assignments[var.id].rc;
1505             Temp tmp(var.id, rc);
1506 
1507             Operand pc_op(tmp);
1508             pc_op.setFixed(ctx.assignments[var.id].reg);
1509             Definition pc_def(next_reg, rc);
1510             parallelcopies.emplace_back(pc_op, pc_def);
1511          }
1512       } else {
1513          space_reg = next_reg;
1514       }
1515 
1516       adjust_max_used_regs(ctx, var.info.rc, next_reg);
1517 
1518       next_reg = next_reg.advance(var.info.rc.size() * 4);
1519    }
1520 
1521    return space_reg;
1522 }
1523 
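     /* Checks whether the MIMG vaddr components (operands 3..n) can still end
      * up in consecutive registers: every component that is already assigned
      * must sit at its vector-relative position, and the slots of unassigned
      * components must still be free. */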
1524 bool
1525 is_mimg_vaddr_intact(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
1526 {
1527    PhysReg first{512};
1528    for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
1529       Operand op = instr->operands[i + 3];
1530 
1531       if (ctx.assignments[op.tempId()].assigned) {
1532          PhysReg reg = ctx.assignments[op.tempId()].reg;
1533 
1534          if (first.reg() == 512) {
1535             PhysRegInterval bounds = get_reg_bounds(ctx, RegType::vgpr, false);
1536             first = reg.advance(i * -4);
1537             PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
1538             if (!bounds.contains(vec)) /* not enough space for other operands */
1539                return false;
1540          } else {
1541             if (reg != first.advance(i * 4)) /* not at the best position */
1542                return false;
1543          }
1544       } else {
1545          /* If there's an unexpected temporary, this operand is unlikely to be
1546           * placed in the best position.
1547           */
1548          if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
1549             return false;
1550       }
1551    }
1552 
1553    return true;
1554 }
1555 
1556 std::optional<PhysReg>
1557 get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr,
1558                int operand)
1559 {
1560    Instruction* vec = ctx.vectors[temp.id()];
1561    unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
1562    unsigned our_offset = 0;
1563    for (unsigned i = first_operand; i < vec->operands.size(); i++) {
1564       Operand& op = vec->operands[i];
1565       if (op.isTemp() && op.tempId() == temp.id())
1566          break;
1567       else
1568          our_offset += op.bytes();
1569    }
1570 
1571    if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
1572       unsigned their_offset = 0;
1573       /* check for every operand of the vector
1574        * - whether the operand is assigned and
1575        * - we can use the register relative to that operand
1576        */
1577       for (unsigned i = first_operand; i < vec->operands.size(); i++) {
1578          Operand& op = vec->operands[i];
1579          if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
1580              ctx.assignments[op.tempId()].assigned) {
1581             PhysReg reg = ctx.assignments[op.tempId()].reg;
1582             reg.reg_b += (our_offset - their_offset);
1583             if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg, operand))
1584                return reg;
1585 
1586             /* return if MIMG vaddr components don't remain vector-aligned */
1587             if (vec->format == Format::MIMG)
1588                return {};
1589          }
1590          their_offset += op.bytes();
1591       }
1592 
1593       /* We didn't find a register relative to other vector operands.
1594        * Try to find new space which fits the whole vector.
1595        */
1596       RegClass vec_rc = RegClass::get(temp.type(), their_offset);
1597       DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
1598       std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info);
1599       if (reg) {
1600          reg->reg_b += our_offset;
1601          /* make sure to only use byte offset if the instruction supports it */
1602          if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg, operand))
1603             return reg;
1604       }
1605    }
1606    return {};
1607 }
1608 
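     /* Packs all linear VGPRs into one contiguous block at the top of the VGPR
      * file, reclaiming the gaps left behind by dead linear VGPRs. Returns
      * false if there was no gap to reclaim. */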
1609 bool
1610 compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
1611                      std::vector<std::pair<Operand, Definition>>& parallelcopies)
1612 {
1613    PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
1614    int zeros = reg_file.count_zero(linear_vgpr_bounds);
1615    if (zeros == 0)
1616       return false;
1617 
1618    std::vector<IDAndRegClass> vars;
1619    for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds))
1620       vars.emplace_back(id, ctx.assignments[id].rc);
1621 
1622    ctx.num_linear_vgprs -= zeros;
1623    compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo());
1624 
1625    return true;
1626 }
1627 
1628 /* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate
1629  * from normal VGPRs. This is for two reasons:
1630  * - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a
1631  *   linear one, we never have to swap a normal VGPR and a linear one.
1632  * - As linear VGPRs' live ranges only start and end on top-level blocks, we never have to move a
1633  *   linear VGPR in control flow.
1634  */
1635 PhysReg
1636 alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
1637                   std::vector<std::pair<Operand, Definition>>& parallelcopies)
1638 {
1639    assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
1640    assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
1641 
1642    RegClass rc = instr->definitions[0].regClass();
1643 
1644    /* Try to choose an unused space in the linear VGPR bounds. */
1645    for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) {
1646       PhysReg reg(256 + ctx.vgpr_bounds - i);
1647       if (!reg_file.test(reg, rc.bytes())) {
1648          adjust_max_used_regs(ctx, rc, reg);
1649          return reg;
1650       }
1651    }
1652 
1653    PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
1654 
1655    /* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */
1656    compact_linear_vgprs(ctx, reg_file, parallelcopies);
1657 
1658    PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size()));
1659    /* Space that was for normal VGPRs, but is now for linear VGPRs. */
1660    PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg));
1661 
1662    RegisterFile tmp_file(reg_file);
1663    PhysRegInterval reg_win{reg, rc.size()};
1664    std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win);
1665 
1666    /* Re-enable killed operands */
1667    for (Operand& op : instr->operands) {
1668       if (op.isTemp() && op.isFirstKillBeforeDef())
1669          tmp_file.fill(op);
1670    }
1671 
1672    /* Find new assignments for blocking vars. */
1673    std::vector<std::pair<Operand, Definition>> pc;
1674    if (!ctx.policy.skip_optimistic_path &&
1675        get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
1676       parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1677    } else {
1678       /* Fallback algorithm: reallocate all variables at once. */
1679       std::vector<IDAndRegClass> vars;
1680       for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds))
1681          vars.emplace_back(id, ctx.assignments[id].rc);
1682       compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256));
1683 
1684       std::vector<IDAndRegClass> killed_op_vars;
1685       for (Operand& op : instr->operands) {
1686          if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr)
1687             killed_op_vars.emplace_back(op.tempId(), op.regClass());
1688       }
1689       compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo());
1690    }
1691 
1692    /* Only update this now: if it were updated earlier, a killed operand couldn't be placed inside the definition. */
1693    ctx.num_linear_vgprs += rc.size();
1694 
1695    adjust_max_used_regs(ctx, rc, reg);
1696    return reg;
1697 }
1698 
1699 bool
1700 should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file)
1701 {
1702    if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty())
1703       return false;
1704 
1705    /* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to
1706     * ensure in advance that there is enough space for normal VGPRs. */
1707    unsigned max_vgpr_usage = 0;
1708    unsigned next_toplevel = ctx.block->index + 1;
1709    for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) {
1710       max_vgpr_usage =
1711          MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr);
1712    }
1713    max_vgpr_usage =
1714       MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].live_in_demand.vgpr);
1715 
1716    for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true)))
1717       max_vgpr_usage -= ctx.assignments[tmp].rc.size();
1718 
1719    return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
1720 }
1721 
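     /* Allocates a register for one temporary. Preference order: the affinity
      * implied by a p_split_vector use, the temp's own affinity, VCC/m0 hints,
      * a position implied by an enclosing vector, the register of a killed
      * operand, then any free spot (get_reg_simple). Failing that, live ranges
      * are split (get_reg_impl), linear VGPRs are compacted, the register file
      * is grown, and as a last resort everything is compacted at once. */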
1722 PhysReg
1723 get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
1724         std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
1725         int operand_index = -1)
1726 {
1727    auto split_vec = ctx.split_vectors.find(temp.id());
1728    if (split_vec != ctx.split_vectors.end()) {
1729       unsigned offset = 0;
1730       for (Definition def : split_vec->second->definitions) {
1731          if (ctx.assignments[def.tempId()].affinity) {
1732             assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
1733             if (affinity.assigned) {
1734                PhysReg reg = affinity.reg;
1735                reg.reg_b -= offset;
1736                if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg, operand_index))
1737                   return reg;
1738             }
1739          }
1740          offset += def.bytes();
1741       }
1742    }
1743 
1744    if (ctx.assignments[temp.id()].affinity) {
1745       assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
1746       if (affinity.assigned) {
1747          if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg, operand_index))
1748             return affinity.reg;
1749       }
1750    }
1751    if (ctx.assignments[temp.id()].vcc) {
1752       if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc, operand_index))
1753          return vcc;
1754    }
1755    if (ctx.assignments[temp.id()].m0) {
1756       if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0, operand_index))
1757          return m0;
1758    }
1759 
1760    std::optional<PhysReg> res;
1761 
1762    if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
1763       res = get_reg_vector(ctx, reg_file, temp, instr, operand_index);
1764       if (res)
1765          return *res;
1766    }
1767 
1768    if (temp.size() == 1 && operand_index == -1) {
1769       for (const Operand& op : instr->operands) {
1770          if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass() == temp.regClass()) {
1771             assert(op.isFixed());
1772             if (op.physReg() == vcc || op.physReg() == vcc_hi)
1773                continue;
1774             if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, op.physReg(),
1775                                   operand_index))
1776                return op.physReg();
1777          }
1778       }
1779    }
1780 
1781    DefInfo info(ctx, instr, temp.regClass(), operand_index);
1782 
1783    if (!ctx.policy.skip_optimistic_path) {
1784       /* try to find space without live-range splits */
1785       res = get_reg_simple(ctx, reg_file, info);
1786 
1787       if (res)
1788          return *res;
1789    }
1790 
1791    /* try to find space with live-range splits */
1792    res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
1793 
1794    if (res)
1795       return *res;
1796 
1797    /* try compacting the linear vgprs to make more space */
1798    std::vector<std::pair<Operand, Definition>> pc;
1799    if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
1800        compact_linear_vgprs(ctx, reg_file, pc)) {
1801       parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1802 
1803       /* We don't need to fill the copy definitions in because we don't care about the linear VGPR
1804        * space here. */
1805       RegisterFile tmp_file(reg_file);
1806       for (std::pair<Operand, Definition>& copy : pc)
1807          tmp_file.clear(copy.first);
1808 
1809       return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
1810    }
1811 
1812    /* We should only fail here because keeping under the limit would require
1813     * too many moves. */
1814    assert(reg_file.count_zero(info.bounds) >= info.size);
1815 
1816    /* try using more registers */
1817    if (!increase_register_file(ctx, info.rc)) {
1818       /* fallback algorithm: reallocate all variables at once (linear VGPRs should already be
1819        * compact at the end) */
1820       unsigned def_size = info.rc.size();
1821       for (Definition def : instr->definitions) {
1822          if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1823             def_size += def.regClass().size();
1824       }
1825 
1826       unsigned killed_op_size = 0;
1827       for (Operand op : instr->operands) {
1828          if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == info.rc.type())
1829             killed_op_size += op.regClass().size();
1830       }
1831 
1832       const PhysRegInterval regs = get_reg_bounds(ctx, info.rc);
1833 
1834       /* reallocate passthrough variables and non-killed operands */
1835       std::vector<IDAndRegClass> vars;
1836       for (unsigned id : find_vars(ctx, reg_file, regs))
1837          vars.emplace_back(id, ctx.assignments[id].rc);
1838       vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));
1839 
1840       PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());
1841 
1842       /* reallocate killed operands */
1843       std::vector<IDAndRegClass> killed_op_vars;
1844       for (Operand op : instr->operands) {
1845          if (op.isFirstKillBeforeDef() && op.regClass().type() == info.rc.type())
1846             killed_op_vars.emplace_back(op.tempId(), op.regClass());
1847       }
1848       compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);
1849 
1850       /* reallocate definitions */
1851       std::vector<IDAndRegClass> def_vars;
1852       for (Definition def : instr->definitions) {
1853          if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1854             def_vars.emplace_back(def.tempId(), def.regClass());
1855       }
1856       def_vars.emplace_back(0xffffffff, info.rc);
1857       return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
1858    }
1859 
1860    return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
1861 }
1862 
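     /* Chooses the definition register of a p_create_vector by testing, for
      * each killed operand, the window that would leave that operand in place,
      * and scoring each candidate by the number of bytes that still have to
      * move. */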
1863 PhysReg
1864 get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
1865                       std::vector<std::pair<Operand, Definition>>& parallelcopies,
1866                       aco_ptr<Instruction>& instr)
1867 {
1868    RegClass rc = temp.regClass();
1869    /* create_vector instructions have different costs w.r.t. register coalescing */
1870    uint32_t size = rc.size();
1871    uint32_t bytes = rc.bytes();
1872    uint32_t stride = get_stride(rc);
1873    PhysRegInterval bounds = get_reg_bounds(ctx, rc);
1874 
1875    // TODO: improve p_create_vector for sub-dword vectors
1876 
1877    PhysReg best_pos{0xFFF};
1878    unsigned num_moves = 0xFF;
1879    bool best_avoid = true;
1880    uint32_t correct_pos_mask = 0;
1881 
1882    /* test for each operand which definition placement causes the least shuffle instructions */
1883    for (unsigned i = 0, offset = 0; i < instr->operands.size();
1884         offset += instr->operands[i].bytes(), i++) {
1885       // TODO: think about whether we can alias live operands on the same register
1886       if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
1887           instr->operands[i].getTemp().type() != rc.type())
1888          continue;
1889 
1890       if (offset > instr->operands[i].physReg().reg_b)
1891          continue;
1892 
1893       unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
1894       if (reg_lower % 4)
1895          continue;
1896       PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
1897       unsigned k = 0;
1898 
1899       /* no need to check multiple times */
1900       if (reg_win.lo() == best_pos)
1901          continue;
1902 
1903       /* check borders */
1904       // TODO: this can be improved
1905       if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
1906          continue;
1907       if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
1908           reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
1909          continue;
1910       if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
1911           reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
1912          continue;
1913 
1914       /* count variables to be moved and check "avoid" */
1915       bool avoid = false;
1916       bool linear_vgpr = false;
1917       for (PhysReg j : reg_win) {
1918          if (reg_file[j] != 0) {
1919             if (reg_file[j] == 0xF0000000) {
1920                PhysReg reg;
1921                reg.reg_b = j * 4;
1922                unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
1923                for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
1924                   k += reg_file.test(reg, 1);
1925             } else {
1926                k += 4;
1927                linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
1928             }
1929          }
1930          avoid |= ctx.war_hint[j];
1931       }
1932 
1933       /* we cannot split live ranges of linear vgprs */
1934       if (linear_vgpr)
1935          continue;
1936 
1937       if (avoid && !best_avoid)
1938          continue;
1939 
1940       /* count operands in wrong positions */
1941       uint32_t correct_pos_mask_new = 0;
1942       for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
1943            offset2 += instr->operands[j].bytes(), j++) {
1944          Operand& op = instr->operands[j];
1945          if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
1946             correct_pos_mask_new |= 1 << j;
1947          else
1948             k += op.bytes();
1949       }
1950       bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
1951       if (k > num_moves || (!aligned && k == num_moves))
1952          continue;
1953 
1954       best_pos = reg_win.lo();
1955       num_moves = k;
1956       best_avoid = avoid;
1957       correct_pos_mask = correct_pos_mask_new;
1958    }
1959 
1960    /* too many moves: try the generic get_reg() function */
1961    if (num_moves >= 2 * bytes) {
1962       return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1963    } else if (num_moves > bytes) {
1964       DefInfo info(ctx, instr, rc, -1);
1965       std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
1966       if (res)
1967          return *res;
1968    }
1969 
1970    /* re-enable killed operands which are in the wrong position */
1971    RegisterFile tmp_file(reg_file);
1972    for (Operand& op : instr->operands) {
1973       if (op.isTemp() && op.isFirstKillBeforeDef())
1974          tmp_file.fill(op);
1975    }
1976    for (unsigned i = 0; i < instr->operands.size(); i++) {
1977       if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
1978          tmp_file.clear(instr->operands[i]);
1979    }
1980 
1981    /* collect variables to be moved */
1982    std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
1983 
1984    bool success = false;
1985    std::vector<std::pair<Operand, Definition>> pc;
1986    success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});
1987 
1988    if (!success) {
1989       if (!increase_register_file(ctx, temp.regClass())) {
1990          /* use the fallback algorithm in get_reg() */
1991          return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1992       }
1993       return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
1994    }
1995 
1996    parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1997    adjust_max_used_regs(ctx, rc, best_pos);
1998 
1999    return best_pos;
2000 }
2001 
2002 void
2003 handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
2004 {
2005    if (instr->format != Format::PSEUDO)
2006       return;
2007 
2008    /* all instructions which use handle_operands() need this information */
2009    switch (instr->opcode) {
2010    case aco_opcode::p_extract_vector:
2011    case aco_opcode::p_create_vector:
2012    case aco_opcode::p_split_vector:
2013    case aco_opcode::p_parallelcopy:
2014    case aco_opcode::p_start_linear_vgpr: break;
2015    default: return;
2016    }
2017 
2018    bool writes_linear = false;
2019    /* if all definitions are logical VGPRs, no need to care about SCC */
2020    for (Definition& def : instr->definitions) {
2021       if (def.getTemp().regClass().is_linear())
2022          writes_linear = true;
2023    }
2024    /* if all operands are constant, no need to care either */
2025    bool reads_linear = false;
2026    for (Operand& op : instr->operands) {
2027       if (op.isTemp() && op.getTemp().regClass().is_linear())
2028          reads_linear = true;
2029    }
2030 
2031    if (!writes_linear || !reads_linear || !reg_file[scc])
2032       return;
2033 
2034    instr->pseudo().needs_scratch_reg = true;
2035    instr->pseudo().tmp_in_scc = reg_file[scc];
2036 
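        /* Choose a scratch SGPR for the lowering: search downwards from the
         * highest SGPR used so far, and only if nothing below is free, search
         * upwards within the current demand. */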
2037    int reg = ctx.max_used_sgpr;
2038    for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
2039       ;
2040    if (reg < 0) {
2041       reg = ctx.max_used_sgpr + 1;
2042       for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
2043          ;
2044    }
2045 
2046    adjust_max_used_regs(ctx, s1, reg);
2047    instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
2048 }
2049 
2050 bool
2051 operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
2052                     RegClass rc)
2053 {
2054    if (reg.byte()) {
2055       unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
2056       if (reg.byte() % stride)
2057          return false;
2058    }
2059 
2060    switch (instr->format) {
2061    case Format::SMEM:
2062       return reg != scc && reg != exec &&
2063              (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
2064              (reg != vcc || (instr->definitions.empty() && idx == 2) ||
2065               gfx_level >= GFX10); /* sdata can be vcc */
2066    case Format::MUBUF:
2067    case Format::MTBUF: return idx != 2 || gfx_level < GFX12 || reg != scc;
2068    default:
2069       // TODO: there are more instructions with restrictions on registers
2070       return true;
2071    }
2072 }
2073 
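     /* Satisfies operands that are pre-fixed to specific registers: a
      * parallelcopy is recorded for every such operand that is not already in
      * place, and the variables blocking the target registers are evicted via
      * get_regs_for_copies(). */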
2074 void
2075 handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
2076                       std::vector<std::pair<Operand, Definition>>& parallelcopy,
2077                       aco_ptr<Instruction>& instr)
2078 {
2079    assert(instr->operands.size() <= 128);
2080 
2081    RegisterFile tmp_file(register_file);
2082 
2083    BITSET_DECLARE(mask, 128) = {0};
2084 
2085    for (unsigned i = 0; i < instr->operands.size(); i++) {
2086       Operand& op = instr->operands[i];
2087 
2088       if (!op.isTemp() || !op.isFixed())
2089          continue;
2090 
2091       PhysReg src = ctx.assignments[op.tempId()].reg;
2092       adjust_max_used_regs(ctx, op.regClass(), op.physReg());
2093 
2094       if (op.physReg() == src) {
2095          tmp_file.block(op.physReg(), op.regClass());
2096          continue;
2097       }
2098 
2099       unsigned j;
2100       bool found = false;
2101       BITSET_FOREACH_SET (j, mask, i) {
2102          if (instr->operands[j].tempId() == op.tempId() &&
2103              instr->operands[j].physReg() == op.physReg()) {
2104             found = true;
2105             break;
2106          }
2107       }
2108       if (found)
2109          continue; /* the copy is already added to the list */
2110 
2111       /* clear from register_file so fixed operands are not collected by collect_vars() */
2112       tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
2113 
2114       BITSET_SET(mask, i);
2115 
2116       Operand pc_op(instr->operands[i].getTemp(), src);
2117       Definition pc_def = Definition(op.physReg(), pc_op.regClass());
2118       parallelcopy.emplace_back(pc_op, pc_def);
2119    }
2120 
2121    if (BITSET_IS_EMPTY(mask))
2122       return;
2123 
2124    unsigned i;
2125    std::vector<unsigned> blocking_vars;
2126    BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
2127       Operand& op = instr->operands[i];
2128       PhysRegInterval target{op.physReg(), op.size()};
2129       std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
2130       blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
2131 
2132       /* prevent get_regs_for_copies() from using these registers */
2133       tmp_file.block(op.physReg(), op.regClass());
2134    }
2135 
2136    get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
2137    update_renames(ctx, register_file, parallelcopy, instr,
2138                   rename_not_killed_ops | fill_killed_ops | rename_precolored_ops);
2139 }
2140 
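     /* Relocates an operand whose current register violates the instruction's
      * constraints: the old location is cleared, a new register is allocated
      * with get_reg(), and the move is recorded as a parallelcopy. */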
2141 void
2142 get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
2143                     std::vector<std::pair<Operand, Definition>>& parallelcopy,
2144                     aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
2145 {
2146    /* clear the operand in case it's only a stride mismatch */
2147    PhysReg src = ctx.assignments[operand.tempId()].reg;
2148    register_file.clear(src, operand.regClass());
2149    PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
2150 
2151    Operand pc_op = operand;
2152    pc_op.setFixed(src);
2153    Definition pc_def = Definition(dst, pc_op.regClass());
2154    parallelcopy.emplace_back(pc_op, pc_def);
2155    update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
2156 }
2157 
2158 PhysReg
2159 get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
2160             std::vector<aco_ptr<Instruction>>& instructions, Block& block,
2161             aco_ptr<Instruction>& phi, Temp tmp)
2162 {
2163    std::vector<std::pair<Operand, Definition>> parallelcopy;
2164    PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
2165    update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
2166 
2167    /* process parallelcopy */
2168    for (std::pair<Operand, Definition> pc : parallelcopy) {
2169       /* see if it's a copy from a different phi */
2170       // TODO: prefer moving some previous phis over live-ins
2171       // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
2172       // problem in practice since they can only be fixed to exec)
2173       Instruction* prev_phi = NULL;
2174       for (auto phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
2175          if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
2176             prev_phi = phi_it->get();
2177       }
2178       if (prev_phi) {
2179          /* if so, just update that phi's register */
2180          prev_phi->definitions[0].setFixed(pc.second.physReg());
2181          register_file.fill(prev_phi->definitions[0]);
2182          ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
2183                                                                pc.second.regClass()};
2184          continue;
2185       }
2186 
2187       /* rename */
2188       auto orig_it = ctx.orig_names.find(pc.first.tempId());
2189       Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
2190       add_rename(ctx, orig, pc.second.getTemp());
2191 
2192       /* otherwise, this is a live-in and we need to create a new phi
2193        * to move it in this block's predecessors */
2194       aco_opcode opcode =
2195          pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2196       Block::edge_vec& preds =
2197          pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
2198       aco_ptr<Instruction> new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
2199       new_phi->definitions[0] = pc.second;
2200       for (unsigned i = 0; i < preds.size(); i++)
2201          new_phi->operands[i] = Operand(pc.first);
2202       instructions.emplace_back(std::move(new_phi));
2203 
2204       /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
2205        * a loop header.
2206        */
2207       live_in.erase(orig.id());
2208    }
2209 
2210    return reg;
2211 }
2212 
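     /* Assigns registers to all phis of a block, in decreasing order of
      * preference: a register on which all fixed operands agree, the phi's
      * affinity, any operand's register (scanned backwards to avoid copies in
      * else-blocks), and finally a fresh register via get_reg_phi(), which may
      * split live ranges and append new phis for moved live-ins. */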
2213 void
2214 get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
2215                   std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
2216 {
2217    /* move all phis to instructions */
2218    for (aco_ptr<Instruction>& phi : block.instructions) {
2219       if (!is_phi(phi))
2220          break;
2221       if (!phi->definitions[0].isKill())
2222          instructions.emplace_back(std::move(phi));
2223    }
2224 
2225    /* assign phis with all-matching registers to that register */
2226    for (aco_ptr<Instruction>& phi : instructions) {
2227       Definition& definition = phi->definitions[0];
2228       if (definition.isFixed())
2229          continue;
2230 
2231       if (!phi->operands[0].isTemp())
2232          continue;
2233 
2234       PhysReg reg = phi->operands[0].physReg();
2235       auto OpsSame = [=](const Operand& op) -> bool
2236       { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
2237       bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
2238       if (!all_same)
2239          continue;
2240 
2241       if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg, -1))
2242          continue;
2243 
2244       definition.setFixed(reg);
2245       register_file.fill(definition);
2246       ctx.assignments[definition.tempId()].set(definition);
2247    }
2248 
2249    /* try to find a register that is used by at least one operand */
2250    for (aco_ptr<Instruction>& phi : instructions) {
2251       Definition& definition = phi->definitions[0];
2252       if (definition.isFixed())
2253          continue;
2254 
2255       /* use affinity if available */
2256       if (ctx.assignments[definition.tempId()].affinity &&
2257           ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
2258          assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
2259          assert(affinity.rc == definition.regClass());
2260          if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg, -1)) {
2261             definition.setFixed(affinity.reg);
2262             register_file.fill(definition);
2263             ctx.assignments[definition.tempId()].set(definition);
2264             continue;
2265          }
2266       }
2267 
2268       /* by going backwards, we aim to avoid copies in else-blocks */
2269       for (int i = phi->operands.size() - 1; i >= 0; i--) {
2270          const Operand& op = phi->operands[i];
2271          if (!op.isTemp() || !op.isFixed())
2272             continue;
2273 
2274          PhysReg reg = op.physReg();
2275          if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg, -1)) {
2276             definition.setFixed(reg);
2277             register_file.fill(definition);
2278             ctx.assignments[definition.tempId()].set(definition);
2279             break;
2280          }
2281       }
2282    }
2283 
2284    /* find registers for phis where the register was blocked or no operand was assigned */
2285 
2286    /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
2287    for (unsigned i = 0; i < instructions.size(); i++) {
2288       aco_ptr<Instruction>& phi = instructions[i];
2289       Definition& definition = phi->definitions[0];
2290       if (definition.isFixed())
2291          continue;
2292 
2293       definition.setFixed(
2294          get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));
2295 
2296       register_file.fill(definition);
2297       ctx.assignments[definition.tempId()].set(definition);
2298    }
2299 }
2300 
2301 inline Temp
2302 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
2303 {
2304    /* This variable didn't get renamed yet. */
2305    if (!ctx.assignments[val.id()].renamed)
2306       return val;
2307 
2308    auto it = ctx.renames[block_idx].find(val.id());
2309    if (it == ctx.renames[block_idx].end())
2310       return val;
2311    else
2312       return it->second;
2313 }
2314 
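     /* Resolves the current name of a live-in value. With a single predecessor,
      * the name is looked up there; if multiple predecessors disagree, SSA is
      * repaired by inserting a new phi at the top of the block. */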
2315 Temp
2316 handle_live_in(ra_ctx& ctx, Temp val, Block* block)
2317 {
2318    /* This variable didn't get renamed yet. */
2319    if (!ctx.assignments[val.id()].renamed)
2320       return val;
2321 
2322    Block::edge_vec& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
2323    if (preds.size() == 0)
2324       return val;
2325 
2326    if (preds.size() == 1) {
2327       /* if the block has only one predecessor, just look there for the name */
2328       return read_variable(ctx, val, preds[0]);
2329    }
2330 
2331    /* there are multiple predecessors and the block is sealed */
2332    Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));
2333 
2334    /* get the rename from each predecessor and check if they are the same */
2335    Temp new_val;
2336    bool needs_phi = false;
2337    for (unsigned i = 0; i < preds.size(); i++) {
2338       ops[i] = read_variable(ctx, val, preds[i]);
2339       if (i == 0)
2340          new_val = ops[i];
2341       else
2342          needs_phi |= !(new_val == ops[i]);
2343    }
2344 
2345    if (needs_phi) {
2346       assert(!val.regClass().is_linear_vgpr());
2347 
2348       /* the variable has been renamed differently in the predecessors: we need to insert a phi */
2349       aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2350       aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
2351       new_val = ctx.program->allocateTmp(val.regClass());
2352       phi->definitions[0] = Definition(new_val);
2353       ctx.assignments.emplace_back();
2354       assert(ctx.assignments.size() == ctx.program->peekAllocationId());
2355       for (unsigned i = 0; i < preds.size(); i++) {
2356          /* update the operands so that it uses the new affinity */
2357          phi->operands[i] = Operand(ops[i]);
2358          assert(ctx.assignments[ops[i].id()].assigned);
2359          assert(ops[i].regClass() == new_val.regClass());
2360          phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
2361       }
2362       block->instructions.insert(block->instructions.begin(), std::move(phi));
2363    }
2364 
2365    return new_val;
2366 }
2367 
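     /* Creates loop-header phis for live-in values that were renamed inside the
      * loop and propagates the new names through every block of the loop, so
      * that uses after the rename point see the renamed value. */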
2368 void
2369 handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
2370                  uint32_t loop_exit_idx)
2371 {
2372    Block& loop_header = ctx.program->blocks[loop_header_idx];
2373    aco::unordered_map<uint32_t, Temp> renames(ctx.memory);
2374 
2375    /* create phis for variables renamed during the loop */
2376    for (unsigned t : live_in) {
2377       if (!ctx.assignments[t].renamed)
2378          continue;
2379 
2380       Temp val = Temp(t, ctx.program->temp_rc[t]);
2381       Temp prev = read_variable(ctx, val, loop_header_idx - 1);
2382       Temp renamed = handle_live_in(ctx, val, &loop_header);
2383       if (renamed == prev)
2384          continue;
2385 
2386       /* insert additional renames at block end, but don't overwrite */
2387       renames[prev.id()] = renamed;
2388       ctx.orig_names[renamed.id()] = val;
2389       for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2390          auto it = ctx.renames[idx].emplace(val.id(), renamed);
2391          /* if insertion is unsuccessful, update if necessary */
2392          if (!it.second && it.first->second == prev)
2393             it.first->second = renamed;
2394       }
2395 
2396       /* update loop-carried values of the phi created by handle_live_in() */
2397       for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
2398          Operand& op = loop_header.instructions[0]->operands[i];
2399          if (op.getTemp() == prev)
2400             op.setTemp(renamed);
2401       }
2402 
2403       /* use the assignment from the loop preheader and fix def reg */
2404       assignment& var = ctx.assignments[prev.id()];
2405       ctx.assignments[renamed.id()] = var;
2406       loop_header.instructions[0]->definitions[0].setFixed(var.reg);
2407    }
2408 
2409    /* rename loop carried phi operands */
2410    for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
2411       aco_ptr<Instruction>& phi = loop_header.instructions[i];
2412       if (!is_phi(phi))
2413          break;
2414       const Block::edge_vec& preds =
2415          phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
2416       for (unsigned j = 1; j < phi->operands.size(); j++) {
2417          Operand& op = phi->operands[j];
2418          if (!op.isTemp())
2419             continue;
2420 
2421          /* Find the original name, since this operand might not use the original name if the phi
2422           * was created after init_reg_file().
2423           */
2424          auto it = ctx.orig_names.find(op.tempId());
2425          Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();
2426 
2427          op.setTemp(read_variable(ctx, orig, preds[j]));
2428          op.setFixed(ctx.assignments[op.tempId()].reg);
2429       }
2430    }
2431 
2432    /* return early if no new phi was created */
2433    if (renames.empty())
2434       return;
2435 
2436    /* propagate new renames through loop */
2437    for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2438       Block& current = ctx.program->blocks[idx];
2439       /* rename all uses in this block */
2440       for (aco_ptr<Instruction>& instr : current.instructions) {
2441          /* phis are renamed after RA */
2442          if (idx == loop_header_idx && is_phi(instr))
2443             continue;
2444 
2445          for (Operand& op : instr->operands) {
2446             if (!op.isTemp())
2447                continue;
2448 
2449             auto rename = renames.find(op.tempId());
2450             if (rename != renames.end()) {
2451                assert(rename->second.id());
2452                op.setTemp(rename->second);
2453             }
2454          }
2455       }
2456    }
2457 }
2458 
2459 /**
2460  * Correctly initializes the register file at the beginning of a block
2461  * (before any existing phis).
2462  * In order to do so, all live-in variables are entered into the RegisterFile.
2463  * Reg-to-reg moves (renames) from previous blocks are taken into account and
2464  * the SSA is repaired by inserting corresponding phi-nodes.
2465  */
2466 RegisterFile
2467 init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
2468 {
2469    if (block.kind & block_kind_loop_exit) {
2470       uint32_t header = ctx.loop_header.back();
2471       ctx.loop_header.pop_back();
2472       handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
2473    }
2474 
2475    RegisterFile register_file;
2476    const IDSet& live_in = live_out_per_block[block.index];
2477    assert(block.index != 0 || live_in.empty());
2478 
2479    if (block.kind & block_kind_loop_header) {
2480       ctx.loop_header.emplace_back(block.index);
2481       /* already rename the phis' incoming values */
2482       for (aco_ptr<Instruction>& instr : block.instructions) {
2483          if (!is_phi(instr))
2484             break;
2485          Operand& operand = instr->operands[0];
2486          if (operand.isTemp()) {
2487             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
2488             operand.setFixed(ctx.assignments[operand.tempId()].reg);
2489          }
2490       }
2491       for (unsigned t : live_in) {
2492          Temp val = Temp(t, ctx.program->temp_rc[t]);
2493          Temp renamed = read_variable(ctx, val, block.index - 1);
2494          if (renamed != val)
2495             add_rename(ctx, val, renamed);
2496          assignment& var = ctx.assignments[renamed.id()];
2497          assert(var.assigned);
2498          register_file.fill(Definition(renamed.id(), var.reg, var.rc));
2499       }
2500    } else {
2501       /* rename phi operands */
2502       for (aco_ptr<Instruction>& instr : block.instructions) {
2503          if (!is_phi(instr))
2504             break;
2505          const Block::edge_vec& preds =
2506             instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
2507 
2508          for (unsigned i = 0; i < instr->operands.size(); i++) {
2509             Operand& operand = instr->operands[i];
2510             if (!operand.isTemp())
2511                continue;
2512             operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2513             operand.setFixed(ctx.assignments[operand.tempId()].reg);
2514          }
2515       }
2516       for (unsigned t : live_in) {
2517          Temp val = Temp(t, ctx.program->temp_rc[t]);
2518          Temp renamed = handle_live_in(ctx, val, &block);
2519          assignment& var = ctx.assignments[renamed.id()];
2520          /* due to live-range splits, the live-in might be a phi now */
2521          if (var.assigned) {
2522             register_file.fill(Definition(renamed.id(), var.reg, var.rc));
2523          }
2524          if (renamed != val) {
2525             add_rename(ctx, val, renamed);
2526          }
2527       }
2528    }
2529 
2530    return register_file;
2531 }
2532 
2533 bool
2534 vop3_can_use_vop2acc(ra_ctx& ctx, Instruction* instr)
2535 {
2536    if (!instr->isVOP3() && !instr->isVOP3P())
2537       return false;
2538 
2539    switch (instr->opcode) {
2540    case aco_opcode::v_mad_f32:
2541    case aco_opcode::v_mad_f16:
2542    case aco_opcode::v_mad_legacy_f16: break;
2543    case aco_opcode::v_fma_f32:
2544    case aco_opcode::v_pk_fma_f16:
2545    case aco_opcode::v_fma_f16:
2546    case aco_opcode::v_dot4_i32_i8:
2547       if (ctx.program->gfx_level < GFX10)
2548          return false;
2549       break;
2550    case aco_opcode::v_mad_legacy_f32:
2551       if (!ctx.program->dev.has_mac_legacy32)
2552          return false;
2553       break;
2554    case aco_opcode::v_fma_legacy_f32:
2555       if (!ctx.program->dev.has_fmac_legacy32)
2556          return false;
2557       break;
2558    default: return false;
2559    }
2560 
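        /* The VOP2 mac/fmac form overwrites src2 with the result, so src2 must
         * be a killed VGPR, and at least one of src0/src1 must be a VGPR so the
         * other can take the constant/SGPR slot. */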
2561    if (!instr->operands[2].isOfType(RegType::vgpr) || !instr->operands[2].isKillBeforeDef() ||
2562        (!instr->operands[0].isOfType(RegType::vgpr) && !instr->operands[1].isOfType(RegType::vgpr)))
2563       return false;
2564 
2565    if (instr->isVOP3P()) {
2566       for (unsigned i = 0; i < 3; i++) {
2567          if (instr->operands[i].isLiteral())
2568             continue;
2569 
2570          if (instr->valu().opsel_lo[i])
2571             return false;
2572 
2573          /* v_pk_fmac_f16 inline constants are replicated to hi bits starting with gfx11. */
2574          if (instr->valu().opsel_hi[i] ==
2575              (instr->operands[i].isConstant() && ctx.program->gfx_level >= GFX11))
2576             return false;
2577       }
2578    } else {
2579       if (instr->valu().opsel & (ctx.program->gfx_level < GFX11 ? 0xf : ~0x3))
2580          return false;
2581       for (unsigned i = 0; i < 2; i++) {
2582          if (!instr->operands[i].isOfType(RegType::vgpr) && instr->valu().opsel[i])
2583             return false;
2584       }
2585    }
2586 
2587    unsigned im_mask = instr->isDPP16() && instr->isVOP3() ? 0x3 : 0;
2588    if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) ||
2589        (instr->valu().neg & ~im_mask))
2590       return false;
2591 
2592    return true;
2593 }
2594 
2595 bool
2596 sop2_can_use_sopk(ra_ctx& ctx, Instruction* instr)
2597 {
2598    if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_add_u32 &&
2599        instr->opcode != aco_opcode::s_mul_i32 && instr->opcode != aco_opcode::s_cselect_b32)
2600       return false;
2601 
2602    if (instr->opcode == aco_opcode::s_add_u32 && !instr->definitions[1].isKill())
2603       return false;
2604 
2605    uint32_t literal_idx = 0;
2606 
2607    if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
2608       literal_idx = 1;
2609 
2610    if (!instr->operands[!literal_idx].isTemp() || !instr->operands[!literal_idx].isKillBeforeDef())
2611       return false;
2612 
2613    if (!instr->operands[literal_idx].isLiteral())
2614       return false;
2615 
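        /* SOPK encodes the literal as a 16-bit signed immediate, so the 32-bit
         * value must equal the sign-extension of its low 16 bits: e.g.
         * 0x00007fff and 0xffff8000 pass the check below, 0x00008000 does not. */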
2616    const uint32_t i16_mask = 0xffff8000u;
2617    uint32_t value = instr->operands[literal_idx].constantValue();
2618    if ((value & i16_mask) && (value & i16_mask) != i16_mask)
2619       return false;
2620 
2621    return true;
2622 }
2623 
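     /* Walks the program in reverse, grouping each phi definition with its
      * killed operands (and with coalescable parallelcopy/VOP2-acc/SOPK
      * operands) into merge sets, so that affinities later steer all members
      * of a set towards the same register. */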
2624 void
2625 get_affinities(ra_ctx& ctx)
2626 {
2627    std::vector<std::vector<Temp>> phi_resources;
2628    aco::unordered_map<uint32_t, uint32_t> temp_to_phi_resources(ctx.memory);
2629 
2630    for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
2631         block_rit++) {
2632       Block& block = *block_rit;
2633 
2634       std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
2635       for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
2636          aco_ptr<Instruction>& instr = *rit;
2637          if (is_phi(instr))
2638             break;
2639 
2640          /* add vector affinities */
2641          if (instr->opcode == aco_opcode::p_create_vector) {
2642             for (const Operand& op : instr->operands) {
2643                if (op.isTemp() && op.isFirstKill() &&
2644                    op.getTemp().type() == instr->definitions[0].getTemp().type())
2645                   ctx.vectors[op.tempId()] = instr.get();
2646             }
2647          } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
2648                     !instr->mimg().strict_wqm && ctx.program->gfx_level < GFX12) {
2649             for (unsigned i = 3; i < instr->operands.size(); i++)
2650                ctx.vectors[instr->operands[i].tempId()] = instr.get();
2651          } else if (instr->opcode == aco_opcode::p_split_vector &&
2652                     instr->operands[0].isFirstKillBeforeDef()) {
2653             ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
2654          } else if (instr->isVOPC() && !instr->isVOP3()) {
2655             if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
2656                ctx.assignments[instr->definitions[0].tempId()].vcc = true;
2657          } else if (instr->isVOP2() && !instr->isVOP3()) {
2658             if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
2659                 instr->operands[2].regClass().type() == RegType::sgpr)
2660                ctx.assignments[instr->operands[2].tempId()].vcc = true;
2661             if (instr->definitions.size() == 2)
2662                ctx.assignments[instr->definitions[1].tempId()].vcc = true;
2663          } else if (instr->opcode == aco_opcode::s_and_b32 ||
2664                     instr->opcode == aco_opcode::s_and_b64) {
2665             /* If SCC is used by a branch, we might be able to use
2666              * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
2667              */
2668             if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
2669                 instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
2670                ctx.assignments[instr->operands[0].tempId()].vcc = true;
2671          } else if (instr->opcode == aco_opcode::s_sendmsg) {
2672             ctx.assignments[instr->operands[0].tempId()].m0 = true;
2673          }
2674 
2675          int op_fixed_to_def0 = get_op_fixed_to_def(instr.get());
2676          for (unsigned i = 0; i < instr->definitions.size(); i++) {
2677             const Definition& def = instr->definitions[i];
2678             if (!def.isTemp())
2679                continue;
2680             /* mark last-seen phi operand */
2681             auto it = temp_to_phi_resources.find(def.tempId());
2682             if (it != temp_to_phi_resources.end() &&
2683                 def.regClass() == phi_resources[it->second][0].regClass()) {
2684                phi_resources[it->second][0] = def.getTemp();
2685                /* try to coalesce phi affinities with parallelcopies */
2686                Operand op;
2687                if (instr->opcode == aco_opcode::p_parallelcopy) {
2688                   op = instr->operands[i];
2689                } else if (i == 0 && op_fixed_to_def0 != -1) {
2690                   op = instr->operands[op_fixed_to_def0];
2691                } else if (vop3_can_use_vop2acc(ctx, instr.get())) {
2692                   op = instr->operands[2];
2693                } else if (i == 0 && sop2_can_use_sopk(ctx, instr.get())) {
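                    /* the non-literal source: operands[1] if operands[0] is the
                     * literal, operands[0] otherwise */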
2694                   op = instr->operands[instr->operands[0].isLiteral()];
2695                } else {
2696                   continue;
2697                }
2698 
2699                if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
2700                   phi_resources[it->second].emplace_back(op.getTemp());
2701                   temp_to_phi_resources[op.tempId()] = it->second;
2702                }
2703             }
2704          }
2705       }
2706 
2707       /* collect phi affinities */
2708       for (; rit != block.instructions.rend(); ++rit) {
2709          aco_ptr<Instruction>& instr = *rit;
2710          assert(is_phi(instr));
2711 
2712          if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
2713             continue;
2714 
2715          assert(instr->definitions[0].isTemp());
2716          auto it = temp_to_phi_resources.find(instr->definitions[0].tempId());
2717          unsigned index = phi_resources.size();
2718          std::vector<Temp>* affinity_related;
2719          if (it != temp_to_phi_resources.end()) {
2720             index = it->second;
2721             phi_resources[index][0] = instr->definitions[0].getTemp();
2722             affinity_related = &phi_resources[index];
2723          } else {
2724             phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
2725             affinity_related = &phi_resources.back();
2726          }
2727 
2728          for (const Operand& op : instr->operands) {
2729             if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
2730                affinity_related->emplace_back(op.getTemp());
2731                if (block.kind & block_kind_loop_header)
2732                   continue;
2733                temp_to_phi_resources[op.tempId()] = index;
2734             }
2735          }
2736       }
2737 
2738       /* visit the loop header phis first in order to create nested affinities */
2739       if (block.kind & block_kind_loop_exit) {
2740          /* find loop header */
2741          auto header_rit = block_rit;
2742          while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
2743             header_rit++;
2744 
2745          for (aco_ptr<Instruction>& phi : header_rit->instructions) {
2746             if (!is_phi(phi))
2747                break;
2748             if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
2749                continue;
2750 
2751             /* create an (empty) merge-set for the phi-related variables */
2752             auto it = temp_to_phi_resources.find(phi->definitions[0].tempId());
2753             unsigned index = phi_resources.size();
2754             if (it == temp_to_phi_resources.end()) {
2755                temp_to_phi_resources[phi->definitions[0].tempId()] = index;
2756                phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
2757             } else {
2758                index = it->second;
2759             }
2760             for (unsigned i = 1; i < phi->operands.size(); i++) {
2761                const Operand& op = phi->operands[i];
2762                if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
2763                   temp_to_phi_resources[op.tempId()] = index;
2764                }
2765             }
2766          }
2767       }
2768    }
2769    /* create affinities */
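        /* Point each member of a merge set at the set's representative (vec[0]),
         * so that the allocator prefers the same register for all of them. */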
2770    for (std::vector<Temp>& vec : phi_resources) {
2771       for (unsigned i = 1; i < vec.size(); i++)
2772          if (vec[i].id() != vec[0].id())
2773             ctx.assignments[vec[i].id()].affinity = vec[0].id();
2774    }
2775 }
2776 
2777 void
2778 optimize_encoding_vop2(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2779 {
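        /* Rewrite an eligible VOP3 mad/fma into the shorter VOP2 accumulator form,
         * where the destination doubles as the third source, e.g.
         *    v_fma_f32 %d, %a, %b, %c  ->  v_fmac_f32 %d=%c, %a, %b
         * (the definition is fixed to the accumulator operand's register later on,
         * via get_op_fixed_to_def). */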
2780    if (!vop3_can_use_vop2acc(ctx, instr.get()))
2781       return;
2782 
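        /* operands at a nonzero byte offset can't be encoded here
         * (on GFX11+, only the accumulator operand needs checking) */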
2783    for (unsigned i = ctx.program->gfx_level < GFX11 ? 0 : 2; i < 3; i++) {
2784       if (instr->operands[i].physReg().byte())
2785          return;
2786    }
2787 
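        /* If the definition has an affinity to an already-assigned register other
         * than the accumulator and that register is still free, keep VOP3 so that
         * the affinity can be honored. */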
2788    unsigned def_id = instr->definitions[0].tempId();
2789    if (ctx.assignments[def_id].affinity) {
2790       assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2791       if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
2792           !register_file.test(affinity.reg, instr->operands[2].bytes()))
2793          return;
2794    }
2795 
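        /* VOP2's second source must be a VGPR */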
2796    if (!instr->operands[1].isOfType(RegType::vgpr))
2797       instr->valu().swapOperands(0, 1);
2798 
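        /* VOP2 has no opsel: pre-apply the per-half selects to the packed literal */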
2799    if (instr->isVOP3P() && instr->operands[0].isLiteral()) {
2800       unsigned literal = instr->operands[0].constantValue();
2801       unsigned lo = (literal >> (instr->valu().opsel_lo[0] * 16)) & 0xffff;
2802       unsigned hi = (literal >> (instr->valu().opsel_hi[0] * 16)) & 0xffff;
2803       instr->operands[0] = Operand::literal32(lo | (hi << 16));
2804    }
2805 
2806    instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) |
2807                             (unsigned)Format::VOP2);
2808    instr->valu().opsel_lo = 0;
2809    instr->valu().opsel_hi = 0;
2810    switch (instr->opcode) {
2811    case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
2812    case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
2813    case aco_opcode::v_mad_f16:
2814    case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
2815    case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
2816    case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
2817    case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
2818    case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
2819    case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
2820    default: break;
2821    }
2822 }
2823 
2824 void
2825 optimize_encoding_sopk(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2826 {
2827    /* try to optimize sop2 with literal source to sopk */
2828    if (!sop2_can_use_sopk(ctx, instr.get()))
2829       return;
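        /* index of the literal source: 1 if operands[1] is the literal, 0 otherwise */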
2830    unsigned literal_idx = instr->operands[1].isLiteral();
2831 
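        /* SOPK's 7-bit SDST field can only encode registers 0-127 */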
2832    if (instr->operands[!literal_idx].physReg() >= 128)
2833       return;
2834 
2835    unsigned def_id = instr->definitions[0].tempId();
2836    if (ctx.assignments[def_id].affinity) {
2837       assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2838       if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
2839           !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
2840          return;
2841    }
2842 
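        /* Fold the literal into the 16-bit immediate and drop its operand. SOPK is
         * destructive (the remaining source doubles as the destination), e.g.
         *    s_add_u32 %d, %a, 0x42  ->  s_addk_i32 %d=%a, 0x42 */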
2843    instr->format = Format::SOPK;
2844    instr->salu().imm = instr->operands[literal_idx].constantValue() & 0xffff;
2845    if (literal_idx == 0)
2846       std::swap(instr->operands[0], instr->operands[1]);
2847    if (instr->operands.size() > 2)
2848       std::swap(instr->operands[1], instr->operands[2]);
2849    instr->operands.pop_back();
2850 
2851    switch (instr->opcode) {
2852    case aco_opcode::s_add_u32:
2853    case aco_opcode::s_add_i32: instr->opcode = aco_opcode::s_addk_i32; break;
2854    case aco_opcode::s_mul_i32: instr->opcode = aco_opcode::s_mulk_i32; break;
2855    case aco_opcode::s_cselect_b32: instr->opcode = aco_opcode::s_cmovk_i32; break;
2856    default: unreachable("illegal instruction");
2857    }
2858 }
2859 
2860 void
2861 optimize_encoding(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2862 {
2863    if (instr->isVALU())
2864       optimize_encoding_vop2(ctx, register_file, instr);
2865    if (instr->isSALU())
2866       optimize_encoding_sopk(ctx, register_file, instr);
2867 }
2868 
2869 void
2870 emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
2871                             aco_ptr<Instruction>& instr,
2872                             std::vector<aco_ptr<Instruction>>& instructions, bool temp_in_scc,
2873                             RegisterFile& register_file)
2874 {
2875    if (parallelcopy.empty())
2876       return;
2877 
2878    aco_ptr<Instruction> pc;
2879    pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(),
2880                                parallelcopy.size()));
2881    bool linear_vgpr = false;
2882    bool sgpr_operands_alias_defs = false;
2883    uint64_t sgpr_operands[4] = {0, 0, 0, 0};
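        /* bitmap of the scalar registers read by the copies seen so far,
         * used to detect operand/definition overlap below */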
2884    for (unsigned i = 0; i < parallelcopy.size(); i++) {
2885       linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
2886 
2887       if (temp_in_scc && parallelcopy[i].first.isTemp() &&
2888           parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
2889          if (!sgpr_operands_alias_defs) {
2890             unsigned reg = parallelcopy[i].first.physReg().reg();
2891             unsigned size = parallelcopy[i].first.getTemp().size();
2892             sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
2893 
2894             reg = parallelcopy[i].second.physReg().reg();
2895             size = parallelcopy[i].second.getTemp().size();
2896             if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
2897                sgpr_operands_alias_defs = true;
2898          }
2899       }
2900 
2901       pc->operands[i] = parallelcopy[i].first;
2902       pc->definitions[i] = parallelcopy[i].second;
2903       assert(pc->operands[i].size() == pc->definitions[i].size());
2904 
2905       /* It might happen that the operand is already renamed;
2906        * we have to restore the original name. */
2907       auto it = ctx.orig_names.find(pc->operands[i].tempId());
2908       Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
2909       add_rename(ctx, orig, pc->definitions[i].getTemp());
2910    }
2911 
2912    if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
2913       /* disable definitions and re-enable operands */
2914       RegisterFile tmp_file(register_file);
2915       for (const Definition& def : instr->definitions) {
2916          if (def.isTemp() && !def.isKill())
2917             tmp_file.clear(def);
2918       }
2919       for (const Operand& op : instr->operands) {
2920          if (op.isTemp() && op.isFirstKill())
2921             tmp_file.block(op.physReg(), op.regClass());
2922       }
2923 
2924       handle_pseudo(ctx, tmp_file, pc.get());
2925    } else {
2926       pc->pseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr;
2927       pc->pseudo().tmp_in_scc = false;
2928    }
2929 
2930    instructions.emplace_back(std::move(pc));
2931 
2932    parallelcopy.clear();
2933 }
2934 
2935 void
2936 emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
2937                    aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& instructions,
2938                    bool temp_in_scc, RegisterFile& register_file)
2939 {
2940    if (parallelcopy.empty())
2941       return;
2942 
2943    std::vector<std::pair<Operand, Definition>> linear_vgpr;
2944    if (ctx.num_linear_vgprs) {
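           /* stable in-place partition: split out linear-VGPR copies and compact the rest */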
2945       unsigned next = 0;
2946       for (unsigned i = 0; i < parallelcopy.size(); i++) {
2947          if (parallelcopy[i].first.regClass().is_linear_vgpr()) {
2948             linear_vgpr.push_back(parallelcopy[i]);
2949             continue;
2950          }
2951 
2952          if (next != i)
2953             parallelcopy[next] = parallelcopy[i];
2954          next++;
2955       }
2956       parallelcopy.resize(next);
2957    }
2958 
2959    /* Because of how linear VGPRs are allocated, we should never have to move a linear VGPR into the
2960     * space of a normal one. This means the copy can be done entirely before normal VGPR copies. */
2961    emit_parallel_copy_internal(ctx, linear_vgpr, instr, instructions, temp_in_scc,
2962                                register_file);
2963    emit_parallel_copy_internal(ctx, parallelcopy, instr, instructions, temp_in_scc,
2964                                register_file);
2965 }
2966 
2967 } /* end namespace */
2968 
2969 void
2970 register_allocation(Program* program, ra_test_policy policy)
2971 {
2972    ra_ctx ctx(program, policy);
2973    get_affinities(ctx);
2974 
2975    for (Block& block : program->blocks) {
2976       ctx.block = &block;
2977 
2978       /* initialize register file */
2979       RegisterFile register_file = init_reg_file(ctx, program->live.live_in, block);
2980       ctx.war_hint.reset();
2981       ctx.rr_vgpr_it = {PhysReg{256}};
2982       ctx.rr_sgpr_it = {PhysReg{0}};
2983 
2984       std::vector<aco_ptr<Instruction>> instructions;
2985       instructions.reserve(block.instructions.size());
2986 
2987       /* This is a slight adjustment from the paper, as we already have phi nodes:
2988        * we consider them incomplete phis and only handle the definition. */
2989       get_regs_for_phis(ctx, block, register_file, instructions,
2990                         program->live.live_in[block.index]);
2991 
2992       /* If this is a merge block, the state of the register file at the branch instruction of the
2993        * predecessors corresponds to the state after phis at the merge block. So, we allocate a
2994        * register for the predecessor's branch definitions as if there was a phi.
2995        */
2996       if (!block.linear_preds.empty() &&
2997           (block.linear_preds.size() != 1 ||
2998            program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) {
2999          PhysReg br_reg = get_reg_phi(ctx, program->live.live_in[block.index], register_file,
3000                                       instructions, block, ctx.phi_dummy, Temp(0, s2));
3001          for (unsigned pred : block.linear_preds) {
3002             program->blocks[pred].scc_live_out = register_file[scc];
3003             aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
3004 
3005             assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 &&
3006                    br->definitions[0].isKill());
3007 
3008             br->definitions[0].setFixed(br_reg);
3009          }
3010       }
3011 
3012       /* Handle all other instructions of the block */
3013       auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
3014       auto instr_it = std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
3015       for (; instr_it != block.instructions.end(); ++instr_it) {
3016          aco_ptr<Instruction>& instr = *instr_it;
3017          std::vector<std::pair<Operand, Definition>> parallelcopy;
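              /* remember whether SCC holds a live value: parallelcopies inserted for
               * this instruction must not clobber it */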
3018          bool temp_in_scc = register_file[scc];
3019 
3020          if (instr->opcode == aco_opcode::p_branch) {
3021             /* unconditional branches are handled after phis of the target */
3022             instructions.emplace_back(std::move(instr));
3023             break;
3024          }
3025 
3026          assert(!is_phi(instr));
3027 
3028          /* handle operands */
3029          bool fixed = false;
3030          for (unsigned i = 0; i < instr->operands.size(); ++i) {
3031             auto& operand = instr->operands[i];
3032             if (!operand.isTemp())
3033                continue;
3034 
3035             /* rename operands */
3036             operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
3037             assert(ctx.assignments[operand.tempId()].assigned);
3038 
3039             fixed |=
3040                operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg();
3041          }
3042 
3043          bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
3044                              instr->opcode == aco_opcode::v_writelane_b32_e64;
3045          if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
3046              instr->operands[1].isTemp()) {
3047             /* v_writelane_b32 can take two sgprs but only if one is m0. */
3048             if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
3049                 ctx.assignments[instr->operands[1].tempId()].reg != m0) {
3050                instr->operands[0].setFixed(m0);
3051                fixed = true;
3052             }
3053          }
3054 
3055          if (fixed)
3056             handle_fixed_operands(ctx, register_file, parallelcopy, instr);
3057 
3058          for (unsigned i = 0; i < instr->operands.size(); ++i) {
3059             auto& operand = instr->operands[i];
3060             if (!operand.isTemp() || operand.isFixed())
3061                continue;
3062 
3063             PhysReg reg = ctx.assignments[operand.tempId()].reg;
3064             if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
3065                operand.setFixed(reg);
3066             else
3067                get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
3068 
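                 /* These operands appear to be read late by the hardware, so discourage
                  * the allocator from reusing their registers right away
                  * (write-after-read hint). */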
3069             if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
3070                 (instr->isDS() && instr->ds().gds)) {
3071                for (unsigned j = 0; j < operand.size(); j++)
3072                   ctx.war_hint.set(operand.physReg().reg() + j);
3073             }
3074          }
3075 
3076          /* remove dead vars from register file */
3077          for (const Operand& op : instr->operands) {
3078             if (op.isTemp() && op.isFirstKillBeforeDef())
3079                register_file.clear(op);
3080          }
3081 
3082          optimize_encoding(ctx, register_file, instr);
3083 
3084          /* Handle definitions which must have the same register as an operand.
3085           * We expect that the definition has the same size as the operand, otherwise the new
3086           * location for the operand (if it's not killed) might intersect with the old one.
3087           * We can't read from the old location because it's corrupted, and we can't write the new
3088           * location because that's used by a live-through operand.
3089           */
3090          int op_fixed_to_def = get_op_fixed_to_def(instr.get());
3091          if (op_fixed_to_def != -1)
3092             instr->definitions[0].setFixed(instr->operands[op_fixed_to_def].physReg());
3093 
3094          /* handle fixed definitions first */
3095          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
3096             auto& definition = instr->definitions[i];
3097             if (!definition.isFixed())
3098                continue;
3099 
3100             adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
3101             /* check if the target register is blocked */
3102             if (register_file.test(definition.physReg(), definition.bytes())) {
3103                const PhysRegInterval def_regs{definition.physReg(), definition.size()};
3104 
3105                /* create parallelcopy pair to move blocking vars */
3106                std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);
3107 
3108                RegisterFile tmp_file(register_file);
3109                /* re-enable the killed operands, so that we don't move the blocking vars there */
3110                for (const Operand& op : instr->operands) {
3111                   if (op.isTemp() && op.isFirstKillBeforeDef())
3112                      tmp_file.fill(op);
3113                }
3114 
3115                ASSERTED bool success = false;
3116                success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
3117                assert(success);
3118 
3119                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
3120             }
3121 
3122             if (!definition.isTemp())
3123                continue;
3124 
3125             ctx.assignments[definition.tempId()].set(definition);
3126             register_file.fill(definition);
3127          }
3128 
3129          /* handle all other definitions */
3130          for (unsigned i = 0; i < instr->definitions.size(); ++i) {
3131             Definition* definition = &instr->definitions[i];
3132 
3133             if (definition->isFixed() || !definition->isTemp())
3134                continue;
3135 
3136             /* find free reg */
3137             if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
3138                /* Allocation of linear VGPRs is special. */
3139                definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy));
3140                update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
3141             } else if (instr->opcode == aco_opcode::p_split_vector) {
3142                PhysReg reg = instr->operands[0].physReg();
3143                RegClass rc = definition->regClass();
3144                for (unsigned j = 0; j < i; j++)
3145                   reg.reg_b += instr->definitions[j].bytes();
3146                if (get_reg_specified(ctx, register_file, rc, instr, reg, -1)) {
3147                   definition->setFixed(reg);
3148                } else if (i == 0) {
3149                   RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
3150                   DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
3151                   std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info);
3152                   if (res && get_reg_specified(ctx, register_file, rc, instr, *res, -1))
3153                      definition->setFixed(*res);
3154                } else if (instr->definitions[i - 1].isFixed()) {
3155                   reg = instr->definitions[i - 1].physReg();
3156                   reg.reg_b += instr->definitions[i - 1].bytes();
3157                   if (get_reg_specified(ctx, register_file, rc, instr, reg, -1))
3158                      definition->setFixed(reg);
3159                }
3160             } else if (instr->opcode == aco_opcode::p_parallelcopy) {
3161                PhysReg reg = instr->operands[i].physReg();
3162                if (instr->operands[i].isTemp() &&
3163                    instr->operands[i].getTemp().type() == definition->getTemp().type() &&
3164                    !register_file.test(reg, definition->bytes()))
3165                   definition->setFixed(reg);
3166             } else if (instr->opcode == aco_opcode::p_extract_vector) {
3167                PhysReg reg = instr->operands[0].physReg();
3168                reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
3169                if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg, -1))
3170                   definition->setFixed(reg);
3171             } else if (instr->opcode == aco_opcode::p_create_vector) {
3172                PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
3173                                                    parallelcopy, instr);
3174                update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
3175                definition->setFixed(reg);
3176             } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma &&
3177                        instr->operands[2].isTemp() && instr->operands[2].isKill() &&
3178                        instr->operands[2].regClass() == definition->regClass()) {
3179                /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it.
3180                 * Here we set a policy of forcing them the same if operands[2] gets killed (and
3181                 * otherwise they don't overlap). This may not be optimal if RA would select a
3182                 * different location due to affinity, but that gets complicated very quickly. */
3183                definition->setFixed(instr->operands[2].physReg());
3184             }
3185 
3186             if (!definition->isFixed()) {
3187                Temp tmp = definition->getTemp();
3188                if (definition->regClass().is_subdword() && definition->bytes() < 4) {
3189                   PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
3190                   definition->setFixed(reg);
3191                   if (reg.byte() || register_file.test(reg, 4)) {
3192                      bool allow_16bit_write = reg.byte() % 2 == 0 && !register_file.test(reg, 2);
3193                      add_subdword_definition(program, instr, reg, allow_16bit_write);
3194                      definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
3195                                                              the reference */
3196                   }
3197                } else {
3198                   definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
3199                }
3200                update_renames(ctx, register_file, parallelcopy, instr,
3201                               instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
3202                                                                            : (UpdateRenames)0);
3203             }
3204 
3205             assert(
3206                definition->isFixed() &&
3207                ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
3208                 (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
3209             ctx.assignments[definition->tempId()].set(*definition);
3210             register_file.fill(*definition);
3211          }
3212 
3213          handle_pseudo(ctx, register_file, instr.get());
3214 
3215          /* kill definitions and late-kill operands, and ensure that sub-dword
3216           * operands can actually be read */
3217          for (const Definition& def : instr->definitions) {
3218             if (def.isTemp() && def.isKill())
3219                register_file.clear(def);
3220          }
3221          for (unsigned i = 0; i < instr->operands.size(); i++) {
3222             const Operand& op = instr->operands[i];
3223             if (op.isTemp() && op.isFirstKill() && op.isLateKill())
3224                register_file.clear(op);
3225             if (op.isTemp() && op.physReg().byte() != 0)
3226                add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
3227          }
3228 
3229          emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file);
3230 
3231          /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
3232          bool instr_needs_vop3 =
3233             !instr->isVOP3() &&
3234             ((withoutDPP(instr->format) == Format::VOPC &&
3235               instr->definitions[0].physReg() != vcc) ||
3236              (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) ||
3237              ((instr->opcode == aco_opcode::v_add_co_u32 ||
3238                instr->opcode == aco_opcode::v_addc_co_u32 ||
3239                instr->opcode == aco_opcode::v_sub_co_u32 ||
3240                instr->opcode == aco_opcode::v_subb_co_u32 ||
3241                instr->opcode == aco_opcode::v_subrev_co_u32 ||
3242                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
3243               instr->definitions[1].physReg() != vcc) ||
3244              ((instr->opcode == aco_opcode::v_addc_co_u32 ||
3245                instr->opcode == aco_opcode::v_subb_co_u32 ||
3246                instr->opcode == aco_opcode::v_subbrev_co_u32) &&
3247               instr->operands[2].physReg() != vcc));
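              /* e.g. a VOPC compare whose boolean definition ended up in an arbitrary
               * SGPR pair instead of VCC needs VOP3's explicit SDST field */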
3248          if (instr_needs_vop3) {
3249 
3250             /* If the first operand is a literal, we have to move it to an sgpr
3251              * for generations without VOP3+literal support.
3252              * Both literals and sgprs count towards the constant bus limit,
3253              * so this is always valid.
3254              */
3255             if (instr->operands.size() && instr->operands[0].isLiteral() &&
3256                 program->gfx_level < GFX10) {
3257                /* Re-use the register we already allocated for the definition.
3258                 * This works because the instruction cannot have any other SGPR operand.
3259                 */
3260                Temp tmp = program->allocateTmp(instr->operands[0].size() == 2 ? s2 : s1);
3261                const Definition& def =
3262                   instr->isVOPC() ? instr->definitions[0] : instr->definitions.back();
3263                assert(def.regClass() == s2);
3264                ctx.assignments.emplace_back(def.physReg(), tmp.regClass());
3265 
3266                Instruction* copy =
3267                   create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1);
3268                copy->operands[0] = instr->operands[0];
3269                if (copy->operands[0].bytes() < 4)
3270                   copy->operands[0] = Operand::c32(copy->operands[0].constantValue());
3271                copy->definitions[0] = Definition(tmp);
3272                copy->definitions[0].setFixed(def.physReg());
3273 
3274                instr->operands[0] = Operand(tmp);
3275                instr->operands[0].setFixed(def.physReg());
3276                instr->operands[0].setFirstKill(true);
3277 
3278                instructions.emplace_back(copy);
3279             }
3280 
3281             /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
3282             instr->format = asVOP3(instr->format);
3283          }
3284 
3285          instructions.emplace_back(std::move(*instr_it));
3286 
3287       } /* end for Instr */
3288 
3289       if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) {
3290          /* Reset this for block_kind_resume. */
3291          ctx.num_linear_vgprs = 0;
3292 
3293          ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
3294          ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false);
3295          assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds);
3296          assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds);
3297       } else if (should_compact_linear_vgprs(ctx, register_file)) {
3298          aco_ptr<Instruction> br = std::move(instructions.back());
3299          instructions.pop_back();
3300 
3301          bool temp_in_scc =
3302             register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);
3303 
3304          std::vector<std::pair<Operand, Definition>> parallelcopy;
3305          compact_linear_vgprs(ctx, register_file, parallelcopy);
3306          update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
3307          emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file);
3308 
3309          instructions.push_back(std::move(br));
3310       }
3311 
3312       block.instructions = std::move(instructions);
3313    } /* end for BB */
3314 
3315    /* num_gpr = rnd_up(max_used_gpr + 1) */
3316    program->config->num_vgprs =
3317       std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
3318    program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
3319 
3320    program->progress = CompilationProgress::after_ra;
3321 }
3322 
3323 } // namespace aco
3324