/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "util/bitset.h"
#include "util/enum_operators.h"

#include <algorithm>
#include <array>
#include <bitset>
#include <map>
#include <optional>
#include <vector>

namespace aco {
namespace {

struct ra_ctx;
struct DefInfo;

unsigned get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                                     unsigned idx, RegClass rc);
void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                          RegClass rc);
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
                             bool allow_16bit_write);

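/* Per-temporary allocation state, indexed by temp ID. The anonymous union lets
 * all flag bits be cleared at once through "_" while the bit-field members
 * stay individually addressable. */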
struct assignment {
   PhysReg reg;
   RegClass rc;
   union {
      struct {
         bool assigned : 1;
         bool vcc : 1;
         bool m0 : 1;
         bool renamed : 1;
      };
      uint8_t _ = 0;
   };
   uint32_t affinity = 0;
   assignment() = default;
   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }
   void set(const Definition& def)
   {
      assigned = true;
      reg = def.physReg();
      rc = def.regClass();
   }
};

/* Iterator type for making PhysRegInterval compatible with range-based for */
struct PhysRegIterator {
   using difference_type = int;
   using value_type = unsigned;
   using reference = const unsigned&;
   using pointer = const unsigned*;
   using iterator_category = std::bidirectional_iterator_tag;

   PhysReg reg;

   PhysReg operator*() const { return reg; }

   PhysRegIterator& operator++()
   {
      reg.reg_b += 4;
      return *this;
   }

   PhysRegIterator& operator--()
   {
      reg.reg_b -= 4;
      return *this;
   }

   bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }

   bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }

   bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
};

struct ra_ctx {

   Program* program;
   Block* block = NULL;
   aco::monotonic_buffer_resource memory;
   std::vector<assignment> assignments;
   std::vector<aco::unordered_map<uint32_t, Temp>> renames;
   std::vector<uint32_t> loop_header;
   aco::unordered_map<uint32_t, Temp> orig_names;
   aco::unordered_map<uint32_t, Instruction*> vectors;
   aco::unordered_map<uint32_t, Instruction*> split_vectors;
   aco_ptr<Instruction> pseudo_dummy;
   aco_ptr<Instruction> phi_dummy;
   uint16_t max_used_sgpr = 0;
   uint16_t max_used_vgpr = 0;
   uint16_t sgpr_limit;
   uint16_t vgpr_limit;
   std::bitset<512> war_hint;
   PhysRegIterator rr_sgpr_it;
   PhysRegIterator rr_vgpr_it;

   uint16_t sgpr_bounds;
   uint16_t vgpr_bounds;
   uint16_t num_linear_vgprs;

   ra_test_policy policy;

   ra_ctx(Program* program_, ra_test_policy policy_)
       : program(program_), assignments(program->peekAllocationId()),
         renames(program->blocks.size(), aco::unordered_map<uint32_t, Temp>(memory)),
         orig_names(memory), vectors(memory), split_vectors(memory), policy(policy_)
   {
      pseudo_dummy.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
      phi_dummy.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, 0, 0));
      sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
      vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

      sgpr_bounds = program->max_reg_demand.sgpr;
      vgpr_bounds = program->max_reg_demand.vgpr;
      num_linear_vgprs = 0;
   }
};

/* Half-open register interval used in "sliding window"-style for-loops */
struct PhysRegInterval {
   PhysReg lo_;
   unsigned size;

   /* Inclusive lower bound */
   PhysReg lo() const { return lo_; }

   /* Exclusive upper bound */
   PhysReg hi() const { return PhysReg{lo() + size}; }

   PhysRegInterval& operator+=(uint32_t stride)
   {
      lo_ = PhysReg{lo_.reg() + stride};
      return *this;
   }

   bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }

   /* Construct a half-open interval, excluding the end register */
   static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }

   bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }

   bool contains(const PhysRegInterval& needle) const
   {
      return needle.lo() >= lo() && needle.hi() <= hi();
   }

   PhysRegIterator begin() const { return {lo_}; }

   PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
};
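
/* Usage sketch (hypothetical registers): iterating a window of four VGPRs,
 * v4..v7, one dword at a time:
 *
 *    for (PhysReg reg : PhysRegInterval{PhysReg(256 + 4), 4})
 *       ; // visits v4, v5, v6, v7
 *
 * VGPRs start at PhysReg 256; the interval is half-open, so hi() (v8 here) is
 * never visited. */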

bool
intersects(const PhysRegInterval& a, const PhysRegInterval& b)
{
   return a.hi() > b.lo() && b.hi() > a.lo();
}

/* Gets the stride for full (non-subdword) registers */
uint32_t
get_stride(RegClass rc)
{
   if (rc.type() == RegType::vgpr) {
      return 1;
   } else {
      uint32_t size = rc.size();
      if (size == 2) {
         return 2;
      } else if (size >= 4) {
         return 4;
      } else {
         return 1;
      }
   }
}

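/* Returns the allocatable interval for the given register type: SGPRs start at
 * register 0, normal VGPRs at 256, and linear VGPRs occupy the top
 * num_linear_vgprs registers of the VGPR bounds. */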
PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegType type, bool linear_vgpr)
{
   uint16_t linear_vgpr_start = ctx.vgpr_bounds - ctx.num_linear_vgprs;
   if (type == RegType::vgpr && linear_vgpr) {
      return PhysRegInterval{PhysReg(256 + linear_vgpr_start), ctx.num_linear_vgprs};
   } else if (type == RegType::vgpr) {
      return PhysRegInterval{PhysReg(256), linear_vgpr_start};
   } else {
      return PhysRegInterval{PhysReg(0), ctx.sgpr_bounds};
   }
}

PhysRegInterval
get_reg_bounds(ra_ctx& ctx, RegClass rc)
{
   return get_reg_bounds(ctx, rc.type(), rc.is_linear_vgpr());
}

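/* Placement constraints for one definition (or for operand "operand" when
 * operand >= 0): the interval to search, the size in dwords, and the required
 * start alignment. For subdword register classes, stride is in bytes;
 * otherwise it is in registers. */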
struct DefInfo {
   PhysRegInterval bounds;
   uint8_t size;
   uint8_t stride;
   /* Even if stride=4, we might be able to write to the high half instead without preserving the
    * low half. In that case, data_stride=2. */
   uint8_t data_stride;
   RegClass rc;

   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);
      data_stride = 0;

      bounds = get_reg_bounds(ctx, rc);

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->gfx_level, instr, operand, rc);
      } else if (rc.is_subdword()) {
         get_subdword_definition_info(ctx.program, instr);
      } else if (instr->isMIMG() && instr->mimg().d16 && ctx.program->gfx_level <= GFX9) {
         /* Workaround GFX9 hardware bug for D16 image instructions: FeatureImageGather4D16Bug
          *
          * The register use is not calculated correctly, and the hardware assumes a
          * full dword per component. Don't use the last registers of the register file.
          * Otherwise, the instruction will be skipped.
          *
          * https://reviews.llvm.org/D81172
          */
         bool imageGather4D16Bug = operand == -1 && rc == v2 && instr->mimg().dmask != 0xF;
         assert(ctx.program->gfx_level == GFX9 && "Image D16 on GFX8 not supported.");

         if (imageGather4D16Bug)
            bounds.size -= MAX2(rc.bytes() / 4 - ctx.num_linear_vgprs, 0);
      }

      if (!data_stride)
         data_stride = rc.is_subdword() ? stride : (stride * 4);
   }

private:
   void get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr);
};

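/* Tracks the contents of the register file at the current program point. Each
 * entry of regs holds one of:
 *   0x00000000    - the dword is free
 *   0xFFFFFFFF    - the dword is blocked (reserved while copies are placed)
 *   0xF0000000    - the dword is subdword-allocated; the per-byte temp IDs
 *                   live in subdword_regs
 *   anything else - the ID of the temporary occupying the dword
 */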
class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

   std::array<uint32_t, 512> regs;
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   unsigned count_zero(PhysRegInterval reg_interval) const
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes) const
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            auto it = subdword_regs.find(i);
            assert(it != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (it->second[j])
                  return true;
            }
         }
      }
      return false;
   }

   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   bool is_blocked(PhysReg start) const
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         auto it = subdword_regs.find(start);
         assert(it != subdword_regs.end());
         for (unsigned i = start.byte(); i < 4; i++)
            if (it->second[i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start) const
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         auto it = subdword_regs.find(start);
         assert(it != subdword_regs.end());
         return it->second[start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   void fill(Operand op)
   {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   unsigned get_id(PhysReg reg) const
   {
      return regs[reg] == 0xF0000000 ? subdword_regs.at(reg)[reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};

std::vector<unsigned> find_vars(ra_ctx& ctx, const RegisterFile& reg_file,
                                const PhysRegInterval reg_interval);

/* helper function for debugging */
UNUSED void
print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
{
   if (reg_file[reg] == 0xFFFFFFFF) {
      printf((const char*)u8"☐");
   } else if (reg_file[reg]) {
      const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
      if (show_subdword_alloc) {
         auto block_chars = {
            // clang-format off
            u8"?", u8"▘", u8"▝", u8"▀",
            u8"▖", u8"▌", u8"▞", u8"▛",
            u8"▗", u8"▚", u8"▐", u8"▜",
            u8"▄", u8"▙", u8"▟", u8"▉"
            // clang-format on
         };
         unsigned index = 0;
         for (int i = 0; i < 4; ++i) {
            if (reg_file.subdword_regs.at(reg)[i]) {
               index |= 1 << i;
            }
         }
         printf("%s", (const char*)(block_chars.begin()[index]));
      } else {
         /* Indicate filled register slot */
         if (!has_adjacent_variable) {
            printf((const char*)u8"█");
         } else {
            /* Use a slightly shorter box to leave a small gap between adjacent variables */
            printf((const char*)u8"▉");
         }
      }
   } else {
      printf((const char*)u8"·");
   }
}

/* helper function for debugging */
UNUSED void
print_regs(ra_ctx& ctx, PhysRegInterval regs, const RegisterFile& reg_file)
{
   char reg_char = regs.lo().reg() >= 256 ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf("       ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         printf("  %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>> regs_to_vars; /* maps to byte size and temp id */
   for (unsigned id : find_vars(ctx, reg_file, regs)) {
      const assignment& var = ctx.assignments[id];
      PhysReg reg = var.reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, std::make_pair(var.rc.bytes(), id));
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() % 256);
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() % 256);
      }
      printf("]");
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}

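/* Returns the byte alignment the instruction requires for a subdword operand:
 * 1 or 2 if some form of subdword access (SDWA, opsel, a _d16 opcode variant)
 * is available, 4 if the operand must start at a dword boundary. */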
unsigned
get_subdword_operand_stride(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
                            unsigned idx, RegClass rc)
{
   assert(gfx_level >= GFX8);
   if (instr->isPseudo()) {
      /* v_readfirstlane_b32 cannot use SDWA */
      if (instr->opcode == aco_opcode::p_as_uniform)
         return 4;
      else
         return rc.bytes() % 2 == 0 ? 2 : 1;
   }

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (can_use_SDWA(gfx_level, instr, false))
         return rc.bytes();
      if (can_use_opsel(gfx_level, instr->opcode, idx))
         return 2;
      if (instr->isVOP3P())
         return 2;
   }

   switch (instr->opcode) {
   case aco_opcode::v_cvt_f32_ubyte0: return 1;
   case aco_opcode::ds_write_b8:
   case aco_opcode::ds_write_b16: return gfx_level >= GFX9 ? 2 : 4;
   case aco_opcode::buffer_store_byte:
   case aco_opcode::buffer_store_short:
   case aco_opcode::buffer_store_format_d16_x:
   case aco_opcode::flat_store_byte:
   case aco_opcode::flat_store_short:
   case aco_opcode::scratch_store_byte:
   case aco_opcode::scratch_store_short:
   case aco_opcode::global_store_byte:
   case aco_opcode::global_store_short: return gfx_level >= GFX9 ? 2 : 4;
   default: return 4;
   }
}

void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                     RegClass rc)
{
   amd_gfx_level gfx_level = ctx.program->gfx_level;
   if (instr->isPseudo() || byte == 0)
      return;

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
         switch (byte) {
         case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
         case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
         case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
         case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
         }
         return;
      }

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      /* use opsel */
      if (instr->isVOP3P()) {
         assert(byte == 2 && !instr->valu().opsel_lo[idx]);
         instr->valu().opsel_lo[idx] = true;
         instr->valu().opsel_hi[idx] = true;
         return;
      }

      assert(can_use_opsel(gfx_level, instr->opcode, idx));
      instr->valu().opsel[idx] = true;
      return;
   }

   assert(byte == 2);
   if (instr->opcode == aco_opcode::ds_write_b8)
      instr->opcode = aco_opcode::ds_write_b8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_write_b16)
      instr->opcode = aco_opcode::ds_write_b16_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_byte)
      instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_short)
      instr->opcode = aco_opcode::buffer_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_format_d16_x)
      instr->opcode = aco_opcode::buffer_store_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_store_byte)
      instr->opcode = aco_opcode::flat_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_store_short)
      instr->opcode = aco_opcode::flat_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_byte)
      instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_short)
      instr->opcode = aco_opcode::scratch_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_byte)
      instr->opcode = aco_opcode::global_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_short)
      instr->opcode = aco_opcode::global_store_short_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
   return;
}

void
DefInfo::get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr)
{
   amd_gfx_level gfx_level = program->gfx_level;
   assert(gfx_level >= GFX8);

   stride = rc.bytes() % 2 == 0 ? 2 : 1;

   if (instr->isPseudo()) {
      if (instr->opcode == aco_opcode::p_interp_gfx11) {
         rc = RegClass(RegType::vgpr, rc.size());
         stride = 1;
      }
      return;
   }

   if (instr->isVALU()) {
      assert(rc.bytes() <= 2);

      if (can_use_SDWA(gfx_level, instr, false) || instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return;

      rc = instr_is_16bit(gfx_level, instr->opcode) ? v2b : v1;
      stride = rc == v2b ? 4 : 1;
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
          can_use_opsel(gfx_level, instr->opcode, -1)) {
         data_stride = 2;
         stride = rc == v2b ? 2 : stride;
      }
      return;
   }

   switch (instr->opcode) {
   case aco_opcode::v_interp_p2_f16: return;
   /* D16 loads with _hi version */
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_i8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_sbyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_sbyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_sbyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16:
   case aco_opcode::buffer_load_format_d16_x: {
      assert(gfx_level >= GFX9);
      if (program->dev.sram_ecc_enabled) {
         rc = v1;
         stride = 1;
         data_stride = 2;
      } else {
         stride = 2;
      }
      return;
   }
   /* 3-component D16 loads */
   case aco_opcode::buffer_load_format_d16_xyz:
   case aco_opcode::tbuffer_load_format_d16_xyz: {
      assert(gfx_level >= GFX9);
      if (program->dev.sram_ecc_enabled) {
         rc = v2;
         stride = 1;
      } else {
         stride = 4;
      }
      return;
   }
   default: break;
   }

   if (instr->isMIMG() && instr->mimg().d16 && !program->dev.sram_ecc_enabled) {
      assert(gfx_level >= GFX9);
      stride = 4;
   } else {
      rc = RegClass(RegType::vgpr, rc.size());
      stride = 1;
   }
}

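/* Rewrites the instruction so that it writes the allocated subdword register:
 * either by switching to an opcode variant that targets the high half
 * (*_d16_hi, v_fma_mixhi_f16), by converting to SDWA, or by setting the
 * destination opsel bit. */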
void
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
                        bool allow_16bit_write)
{
   if (instr->isPseudo())
      return;

   if (instr->isVALU()) {
      amd_gfx_level gfx_level = program->gfx_level;
      assert(instr->definitions[0].bytes() <= 2);

      if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return;

      if (reg.byte() == 0 && allow_16bit_write && instr_is_16bit(gfx_level, instr->opcode))
         return;

      /* use SDWA */
      if (can_use_SDWA(gfx_level, instr, false)) {
         convert_to_SDWA(gfx_level, instr);
         return;
      }

      assert(allow_16bit_write);

      if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
         instr->opcode = aco_opcode::v_fma_mixhi_f16;
         return;
      }

      /* use opsel */
      assert(reg.byte() == 2);
      assert(can_use_opsel(gfx_level, instr->opcode, -1));
      instr->valu().opsel[3] = true; /* dst in high half */
      return;
   }

   if (reg.byte() == 0)
      return;
   else if (instr->opcode == aco_opcode::v_interp_p2_f16)
      instr->opcode = aco_opcode::v_interp_p2_hi_f16;
   else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
      instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
      instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_short_d16)
      instr->opcode = aco_opcode::buffer_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_format_d16_x)
      instr->opcode = aco_opcode::buffer_load_format_d16_hi_x;
   else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
      instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
      instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_short_d16)
      instr->opcode = aco_opcode::flat_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
      instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
      instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_short_d16)
      instr->opcode = aco_opcode::scratch_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
      instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
      instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_short_d16)
      instr->opcode = aco_opcode::global_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u8_d16)
      instr->opcode = aco_opcode::ds_read_u8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_i8_d16)
      instr->opcode = aco_opcode::ds_read_i8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u16_d16)
      instr->opcode = aco_opcode::ds_read_u16_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
}

void
adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
{
   uint16_t max_addressible_sgpr = ctx.sgpr_limit;
   unsigned size = rc.size();
   if (rc.type() == RegType::vgpr) {
      assert(reg >= 256);
      uint16_t hi = reg - 256 + size - 1;
      assert(hi <= 255);
      ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
   } else if (reg + rc.size() <= max_addressible_sgpr) {
      uint16_t hi = reg + size - 1;
      ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
   }
}

enum UpdateRenames {
   rename_not_killed_ops = 0x1,
   fill_killed_ops = 0x2,
   rename_precolored_ops = 0x4,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);

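/* Integrates the parallelcopies created while assigning registers: each copy
 * whose definition has no temp ID yet gets a fresh temporary, the register
 * file is updated, and uses in the current instruction are renamed to the
 * copied temporary. Copies that turn out to target one of the instruction's
 * own definitions (or another copy's definition) are folded away instead. */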
void
update_renames(ra_ctx& ctx, RegisterFile& reg_file,
               std::vector<std::pair<Operand, Definition>>& parallelcopies,
               aco_ptr<Instruction>& instr, UpdateRenames flags)
{
   /* clear operands */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;
      reg_file.clear(copy.first);
   }

   /* allocate id's and rename operands: this is done transparently here */
   auto it = parallelcopies.begin();
   while (it != parallelcopies.end()) {
      if (it->second.isTemp()) {
         ++it;
         continue;
      }

      /* check if we moved a definition: change the register and remove copy */
      bool is_def = false;
      for (Definition& def : instr->definitions) {
         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
            // FIXME: ensure that the definition can use this reg
            def.setFixed(it->second.physReg());
            reg_file.fill(def);
            ctx.assignments[def.tempId()].reg = def.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            break;
         }
      }
      if (is_def)
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (it->first.getTemp() == other.second.getTemp()) {
            other.second.setFixed(it->second.physReg());
            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            /* check if we moved an operand, again */
            bool fill = true;
            for (Operand& op : instr->operands) {
               if (op.isTemp() && op.tempId() == other.second.tempId()) {
                  // FIXME: ensure that the operand can use this reg
                  op.setFixed(other.second.physReg());
                  fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
               }
            }
            if (fill)
               reg_file.fill(other.second);
            break;
         }
      }
      if (is_def)
         continue;

      std::pair<Operand, Definition>& copy = *it;
      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());

      /* check if we moved an operand */
      bool first[2] = {true, true};
      bool fill = true;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* only rename precolored operands if the copy-location matches */
            bool omit_renaming = (flags & rename_precolored_ops) && op.isFixed() &&
                                 op.physReg() != copy.second.physReg();

            /* Omit renaming in some cases for p_create_vector in order to avoid
             * unnecessary shuffle code. */
            if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) {
               omit_renaming = true;
               for (std::pair<Operand, Definition>& pc : parallelcopies) {
                  PhysReg def_reg = pc.second.physReg();
                  omit_renaming &= def_reg > copy.first.physReg()
                                      ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
                                      : (def_reg + pc.second.size() <= copy.first.physReg().reg());
               }
            }

            /* Fix the kill flags */
            if (first[omit_renaming])
               op.setFirstKill(omit_renaming || op.isKill());
            else
               op.setKill(omit_renaming || op.isKill());
            first[omit_renaming] = false;

            if (omit_renaming)
               continue;

            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());

            fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
         }
      }

      if (fill)
         reg_file.fill(copy.second);

      ++it;
   }
}

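/* Fast path: first-fit scan for a free, correctly aligned window, starting at
 * the round-robin iterator for stride-1 classes so consecutive allocations
 * spread across the register file. For subdword classes it falls back to free
 * bytes of partially used registers. Returns no register rather than moving
 * other variables. */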
std::optional<PhysReg>
get_reg_simple(ra_ctx& ctx, const RegisterFile& reg_file, DefInfo info)
{
   PhysRegInterval bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
   RegClass rc = info.rc;

   if (stride < size && !rc.is_subdword()) {
      DefInfo new_info = info;
      new_info.stride = stride * 2;
      if (size % new_info.stride == 0) {
         std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, new_info);
         if (res)
            return res;
      }
   }

   PhysRegIterator& rr_it = rc.type() == RegType::vgpr ? ctx.rr_vgpr_it : ctx.rr_sgpr_it;
   if (stride == 1) {
      if (rr_it != bounds.begin() && bounds.contains(rr_it.reg)) {
         assert(bounds.begin() < rr_it);
         assert(rr_it < bounds.end());
         info.bounds = PhysRegInterval::from_until(rr_it.reg, bounds.hi());
         std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
         if (res)
            return res;
         bounds = PhysRegInterval::from_until(bounds.lo(), rr_it.reg);
      }
   }

   auto is_free = [&](PhysReg reg_index)
   { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };

   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      if (std::all_of(reg_win.begin(), reg_win.end(), is_free)) {
         if (stride == 1) {
            PhysRegIterator new_rr_it{PhysReg{reg_win.lo() + size}};
            if (new_rr_it < bounds.end())
               rr_it = new_rr_it;
         }
         adjust_max_used_regs(ctx, rc, reg_win.lo());
         return reg_win.lo();
      }
   }

   /* do this late because using the upper bytes of a register can require
    * larger instruction encodings or copies
    * TODO: don't do this in situations where it doesn't benefit */
   if (rc.is_subdword()) {
      for (const std::pair<const uint32_t, std::array<uint32_t, 4>>& entry :
           reg_file.subdword_regs) {
         assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
         if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
            continue;

         auto it = entry.second.begin();
         for (unsigned i = 0; i < 4; i += info.stride) {
            /* check if there's a block of free bytes large enough to hold the register */
            bool reg_found =
               std::all_of(std::next(it, i), std::next(it, std::min(4u, i + rc.bytes())),
                           [](unsigned v) { return v == 0; });

            /* check if also the neighboring reg is free if needed */
            if (reg_found && i + rc.bytes() > 4)
               reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);

            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               adjust_max_used_regs(ctx, rc, entry.first);
               return res;
            }
         }
      }
   }

   return {};
}

/* collect variables from a register area */
std::vector<unsigned>
find_vars(ra_ctx& ctx, const RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> vars;
   for (PhysReg j : reg_interval) {
      if (reg_file.is_blocked(j))
         continue;
      if (reg_file[j] == 0xF0000000) {
         for (unsigned k = 0; k < 4; k++) {
            unsigned id = reg_file.subdword_regs.at(j)[k];
            if (id && (vars.empty() || id != vars.back()))
               vars.emplace_back(id);
         }
      } else {
         unsigned id = reg_file[j];
         if (id && (vars.empty() || id != vars.back()))
            vars.emplace_back(id);
      }
   }
   return vars;
}

/* collect variables from a register area and clear reg_file
 * variables are sorted in decreasing size and
 * increasing assigned register
 */
std::vector<unsigned>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::vector<unsigned> ids = find_vars(ctx, reg_file, reg_interval);
   std::sort(ids.begin(), ids.end(),
             [&](unsigned a, unsigned b)
             {
                assignment& var_a = ctx.assignments[a];
                assignment& var_b = ctx.assignments[b];
                return var_a.rc.bytes() > var_b.rc.bytes() ||
                       (var_a.rc.bytes() == var_b.rc.bytes() && var_a.reg < var_b.reg);
             });

   for (unsigned id : ids) {
      assignment& var = ctx.assignments[id];
      reg_file.clear(var.reg, var.rc);
   }
   return ids;
}

std::optional<PhysReg>
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
                               std::vector<std::pair<Operand, Definition>>& parallelcopies,
                               aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
                               DefInfo info, unsigned id)
{
   PhysReg reg = def_reg.lo();
   /* dead operand: return position in vector */
   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id &&
          instr->operands[i].isKillBeforeDef()) {
         assert(!reg_file.test(reg, instr->operands[i].bytes()));
         if (info.rc.is_subdword() || reg.byte() == 0)
            return reg;
         else
            return {};
      }
      reg.reg_b += instr->operands[i].bytes();
   }

   /* GFX9+ has a VGPR swap instruction. */
   if (ctx.program->gfx_level <= GFX8 || info.rc.type() == RegType::sgpr)
      return {};

   /* check if the previous position was in vector */
   assignment& var = ctx.assignments[id];
   if (def_reg.contains(PhysRegInterval{var.reg, info.size})) {
      reg = def_reg.lo();
      /* try to use the previous register of the operand */
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         if (reg != var.reg) {
            reg.reg_b += instr->operands[i].bytes();
            continue;
         }

         /* check if we can swap positions */
         if (instr->operands[i].isTemp() && instr->operands[i].isFirstKill() &&
             instr->operands[i].regClass() == info.rc) {
            assignment& op = ctx.assignments[instr->operands[i].tempId()];
            /* if everything matches, create parallelcopy for the killed operand */
            if (!intersects(def_reg, PhysRegInterval{op.reg, op.rc.size()}) && op.reg != scc &&
                reg_file.get_id(op.reg) == instr->operands[i].tempId()) {
               Definition pc_def = Definition(reg, info.rc);
               parallelcopies.emplace_back(instr->operands[i], pc_def);
               return op.reg;
            }
         }
         return {};
      }
   }
   return {};
}

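/* Finds new registers for all variables in "vars" (which the caller has
 * evicted from the register file) and records the necessary parallelcopies.
 * If a variable doesn't fit into a free slot, a sliding window searches for
 * the cheapest position, recursively evicting and re-placing whatever lives
 * there. Returns false if no placement could be found. */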
bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
                    const PhysRegInterval def_reg)
{
   /* Variables are sorted from large to small and with increasing assigned register */
   for (unsigned id : vars) {
      assignment& var = ctx.assignments[id];
      PhysRegInterval bounds = get_reg_bounds(ctx, var.rc);
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition
       * also use the correct stride for sub-dword operands */
      bool is_dead_operand = false;
      std::optional<PhysReg> res;
      if (instr->opcode == aco_opcode::p_create_vector) {
         res =
            get_reg_for_create_vector_copy(ctx, reg_file, parallelcopies, instr, def_reg, info, id);
      } else {
         for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
            if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
               info = DefInfo(ctx, instr, var.rc, i);
               if (instr->operands[i].isKillBeforeDef()) {
                  info.bounds = def_reg;
                  res = get_reg_simple(ctx, reg_file, info);
                  is_dead_operand = true;
               }
               break;
            }
         }
      }
      if (!res && !def_reg.size) {
         /* If this is before definitions are handled, def_reg may be an empty interval. */
         info.bounds = bounds;
         res = get_reg_simple(ctx, reg_file, info);
      } else if (!res) {
         /* Try to find space within the bounds but outside of the definition */
         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
         res = get_reg_simple(ctx, reg_file, info);
         if (!res && def_reg.hi() <= bounds.hi()) {
            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res) {
         /* mark the area as blocked */
         reg_file.block(*res, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(*res, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      PhysReg best_pos = bounds.lo();
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
           reg_win += stride) {
         if (!is_dead_operand && intersects(reg_win, def_reg))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0;
         unsigned n = 0;
         unsigned last_var = 0;
         bool found = true;
         for (PhysReg j : reg_win) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(j) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs */
            if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_win.lo();
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      PhysRegInterval reg_win{best_pos, size};

      /* collect variables and block reg file */
      std::vector<unsigned> new_vars = collect_vars(ctx, reg_file, reg_win);

      /* mark the area as blocked */
      reg_file.block(reg_win.lo(), var.rc);
      adjust_max_used_regs(ctx, var.rc, reg_win.lo());

      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, instr, def_reg))
         return false;

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}

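/* Slow path: slides a window of the definition's size across the bounds and,
 * for each candidate position, estimates the cost as k (dwords that would
 * have to move) and n (variables that would move). Killed operands inside the
 * window may be overwritten, so they reduce k. The cheapest window wins, and
 * get_regs_for_copies() then relocates its current inhabitants. */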
std::optional<PhysReg>
get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
             std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
             aco_ptr<Instruction>& instr)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(bounds);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   std::bitset<256> is_killed_operand; /* per-register */
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      Operand& op = instr->operands[j];
      if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
          !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
         assert(op.isFixed());

         for (unsigned i = 0; i < op.size(); ++i) {
            is_killed_operand[(op.physReg() & 0xff) + i] = true;
         }

         killed_ops += op.getTemp().size();
      }
   }

   assert((regs_free + ctx.num_linear_vgprs) >= size);

   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   PhysRegInterval best_win = {bounds.lo(), size};
   unsigned num_moves = 0xFF;
   unsigned num_vars = 0;

   /* we use a sliding window to check potential positions */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      /* first check if the register window starts in the middle of an
       * allocated variable: this is what we have to fix to allow for
       * num_moves > size */
      if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves;
      unsigned n = 0;
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      for (const PhysReg j : reg_win) {
         /* dead operands effectively reduce the number of estimated moves */
         if (is_killed_operand[j & 0xFF]) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         if (reg_file[j] == 0xF0000000) {
            k += 1;
            n++;
            continue;
         }

         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs */
         if (ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_win = reg_win;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF)
      return {};

   /* now, we figured the placement for our definition */
   RegisterFile tmp_file(reg_file);

   /* p_create_vector: also re-place killed operands in the definition space */
   if (instr->opcode == aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<unsigned> vars = collect_vars(ctx, tmp_file, best_win);

   /* re-enable killed operands */
   if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) {
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
      return {};

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   adjust_max_used_regs(ctx, rc, best_win.lo());
   return best_win.lo();
}

bool
get_reg_specified(ra_ctx& ctx, const RegisterFile& reg_file, RegClass rc,
                  aco_ptr<Instruction>& instr, PhysReg reg, int operand)
{
   /* catch out-of-range registers */
   if (reg >= PhysReg{512})
      return false;

   DefInfo info(ctx, instr, rc, operand);

   if (reg.reg_b % info.data_stride)
      return false;

   assert(util_is_power_of_two_nonzero(info.stride));
   reg.reg_b &= ~(info.stride - 1);

   PhysRegInterval reg_win = {PhysReg(reg.reg()), info.rc.size()};
   PhysRegInterval vcc_win = {vcc, 2};
   /* VCC is outside the bounds */
   bool is_vcc =
      info.rc.type() == RegType::sgpr && vcc_win.contains(reg_win) && ctx.program->needs_vcc;
   bool is_m0 = info.rc == s1 && reg == m0 && can_write_m0(instr);
   if (!info.bounds.contains(reg_win) && !is_vcc && !is_m0)
      return false;

   if (reg_file.test(reg, info.rc.bytes()))
      return false;

   adjust_max_used_regs(ctx, info.rc, reg_win.lo());
   return true;
}

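/* Tries to grow the allocatable bounds by one register of the given type.
 * This raises the program's maximum register demand and fails once the limit
 * imposed by the minimum wave count is reached. */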
bool
increase_register_file(ra_ctx& ctx, RegClass rc)
{
   if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
       ctx.vgpr_bounds < ctx.vgpr_limit) {
      /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
      update_vgpr_sgpr_demand(
         ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));

      ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
   } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
      update_vgpr_sgpr_demand(
         ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));

      ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
   } else {
      return false;
   }

   return true;
}

struct IDAndRegClass {
   IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}

   unsigned id;
   RegClass rc;
};

struct IDAndInfo {
   IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}

   unsigned id;
   DefInfo info;
};

void
add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val)
{
   ctx.renames[ctx.block->index][orig_val.id()] = new_val;
   ctx.orig_names.emplace(new_val.id(), orig_val);
   ctx.assignments[orig_val.id()].renamed = true;
}

/* Reallocates vars by sorting them and placing each variable after the previous
 * one. If one of the variables has 0xffffffff as an ID, the register assigned
 * for that variable will be returned.
 */
PhysReg
compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
{
   /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
    * temporary sizes to dwords.
    */
   std::vector<IDAndInfo> sorted;
   for (IDAndRegClass var : vars) {
      DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
      sorted.emplace_back(var.id, info);
   }

   std::sort(
      sorted.begin(), sorted.end(),
      [&ctx](const IDAndInfo& a, const IDAndInfo& b)
      {
         unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
         unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
         if (a_stride > b_stride)
            return true;
         if (a_stride < b_stride)
            return false;
         if (a.id == 0xffffffff || b.id == 0xffffffff)
            return a.id ==
                   0xffffffff; /* place 0xffffffff before others if possible, not for any reason */
         return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
      });

   PhysReg next_reg = start;
   PhysReg space_reg;
   for (IDAndInfo& var : sorted) {
      unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
      next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));

      /* 0xffffffff is a special variable ID used to reserve a space for killed
       * operands and definitions.
       */
      if (var.id != 0xffffffff) {
         if (next_reg != ctx.assignments[var.id].reg) {
            RegClass rc = ctx.assignments[var.id].rc;
            Temp tmp(var.id, rc);

            Operand pc_op(tmp);
            pc_op.setFixed(ctx.assignments[var.id].reg);
            Definition pc_def(next_reg, rc);
            parallelcopies.emplace_back(pc_op, pc_def);
         }
      } else {
         space_reg = next_reg;
      }

      adjust_max_used_regs(ctx, var.info.rc, next_reg);

      next_reg = next_reg.advance(var.info.rc.size() * 4);
   }

   return space_reg;
}

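/* Checks whether the VADDR operands of a MIMG instruction (operands 3 and up)
 * can still be placed as one contiguous vector: every already-assigned
 * component must sit at its expected offset from the first component, and the
 * registers for unassigned components must still be free. */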
bool
is_mimg_vaddr_intact(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
{
   PhysReg first{512};
   for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
      Operand op = instr->operands[i + 3];

      if (ctx.assignments[op.tempId()].assigned) {
         PhysReg reg = ctx.assignments[op.tempId()].reg;

         if (first.reg() == 512) {
            PhysRegInterval bounds = get_reg_bounds(ctx, RegType::vgpr, false);
            first = reg.advance(i * -4);
            PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
            if (!bounds.contains(vec)) /* not enough space for other operands */
               return false;
         } else {
            if (reg != first.advance(i * 4)) /* not at the best position */
               return false;
         }
      } else {
         /* If there's an unexpected temporary, this operand is unlikely to be
          * placed in the best position.
          */
         if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
            return false;
      }
   }

   return true;
}

std::optional<PhysReg>
get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr,
               int operand)
{
   Instruction* vec = ctx.vectors[temp.id()];
   unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
   unsigned our_offset = 0;
   for (unsigned i = first_operand; i < vec->operands.size(); i++) {
      Operand& op = vec->operands[i];
      if (op.isTemp() && op.tempId() == temp.id())
         break;
      else
         our_offset += op.bytes();
   }

   if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
      unsigned their_offset = 0;
      /* check for every operand of the vector
       * - whether the operand is assigned and
       * - we can use the register relative to that operand
       */
      for (unsigned i = first_operand; i < vec->operands.size(); i++) {
         Operand& op = vec->operands[i];
         if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
             ctx.assignments[op.tempId()].assigned) {
            PhysReg reg = ctx.assignments[op.tempId()].reg;
            reg.reg_b += (our_offset - their_offset);
            if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg, operand))
               return reg;

            /* return if MIMG vaddr components don't remain vector-aligned */
            if (vec->format == Format::MIMG)
               return {};
         }
         their_offset += op.bytes();
      }

      /* We didn't find a register relative to other vector operands.
       * Try to find new space which fits the whole vector.
       */
      RegClass vec_rc = RegClass::get(temp.type(), their_offset);
      DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
      std::optional<PhysReg> reg = get_reg_simple(ctx, reg_file, info);
      if (reg) {
         reg->reg_b += our_offset;
         /* make sure to only use byte offset if the instruction supports it */
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, *reg, operand))
            return reg;
      }
   }
   return {};
}

bool
compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
                     std::vector<std::pair<Operand, Definition>>& parallelcopies)
{
   PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
   int zeros = reg_file.count_zero(linear_vgpr_bounds);
   if (zeros == 0)
      return false;

   std::vector<IDAndRegClass> vars;
   for (unsigned id : find_vars(ctx, reg_file, linear_vgpr_bounds))
      vars.emplace_back(id, ctx.assignments[id].rc);

   ctx.num_linear_vgprs -= zeros;
   compact_relocate_vars(ctx, vars, parallelcopies, get_reg_bounds(ctx, RegType::vgpr, true).lo());

   return true;
}

1628 /* Allocates a linear VGPR. We allocate them at the end of the register file and keep them separate
1629 * from normal VGPRs. This is for two reasons:
1630 * - Because we only ever move linear VGPRs into an empty space or a space previously occupied by a
1631 * linear one, we never have to swap a normal VGPR and a linear one.
1632 * - As linear VGPRs' live ranges only start and end at top-level blocks, we never have to move a
1633 * linear VGPR in control flow.
1634 */
1635 PhysReg
1636 alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
1637 std::vector<std::pair<Operand, Definition>>& parallelcopies)
1638 {
1639 assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
1640 assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
1641
1642 RegClass rc = instr->definitions[0].regClass();
1643
1644 /* Try to choose an unused space in the linear VGPR bounds. */
1645 for (unsigned i = rc.size(); i <= ctx.num_linear_vgprs; i++) {
1646 PhysReg reg(256 + ctx.vgpr_bounds - i);
1647 if (!reg_file.test(reg, rc.bytes())) {
1648 adjust_max_used_regs(ctx, rc, reg);
1649 return reg;
1650 }
1651 }
1652
1653 PhysRegInterval old_normal_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
1654
1655 /* Compact linear VGPRs, grow the bounds if necessary, and choose a space at the beginning: */
1656 compact_linear_vgprs(ctx, reg_file, parallelcopies);
1657
1658 PhysReg reg(256 + ctx.vgpr_bounds - (ctx.num_linear_vgprs + rc.size()));
1659 /* Space that was for normal VGPRs, but is now for linear VGPRs. */
1660 PhysRegInterval new_win = PhysRegInterval::from_until(reg, MAX2(old_normal_bounds.hi(), reg));
1661
1662 RegisterFile tmp_file(reg_file);
1663 PhysRegInterval reg_win{reg, rc.size()};
1664 std::vector<unsigned> blocking_vars = collect_vars(ctx, tmp_file, new_win);
1665
1666 /* Re-enable killed operands */
1667 for (Operand& op : instr->operands) {
1668 if (op.isTemp() && op.isFirstKillBeforeDef())
1669 tmp_file.fill(op);
1670 }
1671
1672 /* Find new assignments for blocking vars. */
1673 std::vector<std::pair<Operand, Definition>> pc;
1674 if (!ctx.policy.skip_optimistic_path &&
1675 get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
1676 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1677 } else {
1678 /* Fallback algorithm: reallocate all variables at once. */
1679 std::vector<IDAndRegClass> vars;
1680 for (unsigned id : find_vars(ctx, reg_file, old_normal_bounds))
1681 vars.emplace_back(id, ctx.assignments[id].rc);
1682 compact_relocate_vars(ctx, vars, parallelcopies, PhysReg(256));
1683
1684 std::vector<IDAndRegClass> killed_op_vars;
1685 for (Operand& op : instr->operands) {
1686 if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == RegType::vgpr)
1687 killed_op_vars.emplace_back(op.tempId(), op.regClass());
1688 }
1689 compact_relocate_vars(ctx, killed_op_vars, parallelcopies, reg_win.lo());
1690 }
1691
1692 /* If this is updated earlier, a killed operand can't be placed inside the definition. */
1693 ctx.num_linear_vgprs += rc.size();
1694
1695 adjust_max_used_regs(ctx, rc, reg);
1696 return reg;
1697 }
1698
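/* Returns true if the linear VGPR area should be compacted at the end of
 * this top-level block: inside the following control flow, linear VGPRs
 * cannot be moved, so the maximum normal VGPR demand up to the next
 * top-level block has to fit into the normal VGPR area in advance.
 */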
1699 bool
1700 should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file)
1701 {
1702 if (!(ctx.block->kind & block_kind_top_level) || ctx.block->linear_succs.empty())
1703 return false;
1704
1705 /* Since we won't be able to copy linear VGPRs to make space when in control flow, we have to
1706 * ensure in advance that there is enough space for normal VGPRs. */
1707 unsigned max_vgpr_usage = 0;
1708 unsigned next_toplevel = ctx.block->index + 1;
1709 for (; !(ctx.program->blocks[next_toplevel].kind & block_kind_top_level); next_toplevel++) {
1710 max_vgpr_usage =
1711 MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].register_demand.vgpr);
1712 }
1713 max_vgpr_usage =
1714 MAX2(max_vgpr_usage, (unsigned)ctx.program->blocks[next_toplevel].live_in_demand.vgpr);
1715
1716 for (unsigned tmp : find_vars(ctx, reg_file, get_reg_bounds(ctx, RegType::vgpr, true)))
1717 max_vgpr_usage -= ctx.assignments[tmp].rc.size();
1718
1719 return max_vgpr_usage > get_reg_bounds(ctx, RegType::vgpr, false).size;
1720 }
1721
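/* Allocates a register for "temp". Candidates are tried roughly in order of
 * preference: the affinity of a p_split_vector definition, the temp's own
 * affinity, the vcc/m0 hints, a position relative to the vector the temp is
 * part of, the register of a killed operand of the same size, a free spot
 * without live-range splits (get_reg_simple), live-range splitting
 * (get_reg_impl), compaction of the linear VGPRs and, as a last resort,
 * growing the register file or reallocating all variables at once.
 */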
1722 PhysReg
1723 get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
1724 std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
1725 int operand_index = -1)
1726 {
1727 auto split_vec = ctx.split_vectors.find(temp.id());
1728 if (split_vec != ctx.split_vectors.end()) {
1729 unsigned offset = 0;
1730 for (Definition def : split_vec->second->definitions) {
1731 if (ctx.assignments[def.tempId()].affinity) {
1732 assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
1733 if (affinity.assigned) {
1734 PhysReg reg = affinity.reg;
1735 reg.reg_b -= offset;
1736 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg, operand_index))
1737 return reg;
1738 }
1739 }
1740 offset += def.bytes();
1741 }
1742 }
1743
1744 if (ctx.assignments[temp.id()].affinity) {
1745 assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
1746 if (affinity.assigned) {
1747 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg, operand_index))
1748 return affinity.reg;
1749 }
1750 }
1751 if (ctx.assignments[temp.id()].vcc) {
1752 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, vcc, operand_index))
1753 return vcc;
1754 }
1755 if (ctx.assignments[temp.id()].m0) {
1756 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, m0, operand_index))
1757 return m0;
1758 }
1759
1760 std::optional<PhysReg> res;
1761
1762 if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
1763 res = get_reg_vector(ctx, reg_file, temp, instr, operand_index);
1764 if (res)
1765 return *res;
1766 }
1767
1768 if (temp.size() == 1 && operand_index == -1) {
1769 for (const Operand& op : instr->operands) {
1770 if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass() == temp.regClass()) {
1771 assert(op.isFixed());
1772 if (op.physReg() == vcc || op.physReg() == vcc_hi)
1773 continue;
1774 if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, op.physReg(),
1775 operand_index))
1776 return op.physReg();
1777 }
1778 }
1779 }
1780
1781 DefInfo info(ctx, instr, temp.regClass(), operand_index);
1782
1783 if (!ctx.policy.skip_optimistic_path) {
1784 /* try to find space without live-range splits */
1785 res = get_reg_simple(ctx, reg_file, info);
1786
1787 if (res)
1788 return *res;
1789 }
1790
1791 /* try to find space with live-range splits */
1792 res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
1793
1794 if (res)
1795 return *res;
1796
1797 /* try compacting the linear vgprs to make more space */
1798 std::vector<std::pair<Operand, Definition>> pc;
1799 if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
1800 compact_linear_vgprs(ctx, reg_file, pc)) {
1801 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1802
1803 /* We don't need to fill the copy definitions in because we don't care about the linear VGPR
1804 * space here. */
1805 RegisterFile tmp_file(reg_file);
1806 for (std::pair<Operand, Definition>& copy : pc)
1807 tmp_file.clear(copy.first);
1808
1809 return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
1810 }
1811
1812 /* We should only fail here because keeping under the limit would require
1813 * too many moves. */
1814 assert(reg_file.count_zero(info.bounds) >= info.size);
1815
1816 /* try using more registers */
1817 if (!increase_register_file(ctx, info.rc)) {
1818 /* fallback algorithm: reallocate all variables at once (linear VGPRs should already be
1819 * compact at the end) */
1820 unsigned def_size = info.rc.size();
1821 for (Definition def : instr->definitions) {
1822 if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1823 def_size += def.regClass().size();
1824 }
1825
1826 unsigned killed_op_size = 0;
1827 for (Operand op : instr->operands) {
1828 if (op.isTemp() && op.isFirstKillBeforeDef() && op.regClass().type() == info.rc.type())
1829 killed_op_size += op.regClass().size();
1830 }
1831
1832 const PhysRegInterval regs = get_reg_bounds(ctx, info.rc);
1833
1834 /* reallocate passthrough variables and non-killed operands */
1835 std::vector<IDAndRegClass> vars;
1836 for (unsigned id : find_vars(ctx, reg_file, regs))
1837 vars.emplace_back(id, ctx.assignments[id].rc);
1838 vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));
1839
1840 PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());
1841
1842 /* reallocate killed operands */
1843 std::vector<IDAndRegClass> killed_op_vars;
1844 for (Operand op : instr->operands) {
1845 if (op.isFirstKillBeforeDef() && op.regClass().type() == info.rc.type())
1846 killed_op_vars.emplace_back(op.tempId(), op.regClass());
1847 }
1848 compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);
1849
1850 /* reallocate definitions */
1851 std::vector<IDAndRegClass> def_vars;
1852 for (Definition def : instr->definitions) {
1853 if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
1854 def_vars.emplace_back(def.tempId(), def.regClass());
1855 }
1856 def_vars.emplace_back(0xffffffff, info.rc);
1857 return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
1858 }
1859
1860 return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
1861 }
1862
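/* Chooses a definition register for p_create_vector such that as many killed
 * operands as possible are already in their final position, by comparing the
 * number of bytes that would have to be moved for each candidate position.
 */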
1863 PhysReg
1864 get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
1865 std::vector<std::pair<Operand, Definition>>& parallelcopies,
1866 aco_ptr<Instruction>& instr)
1867 {
1868 RegClass rc = temp.regClass();
1869 /* create_vector instructions have different costs w.r.t. register coalescing */
1870 uint32_t size = rc.size();
1871 uint32_t bytes = rc.bytes();
1872 uint32_t stride = get_stride(rc);
1873 PhysRegInterval bounds = get_reg_bounds(ctx, rc);
1874
1875 // TODO: improve p_create_vector for sub-dword vectors
1876
1877 PhysReg best_pos{0xFFF};
1878 unsigned num_moves = 0xFF;
1879 bool best_avoid = true;
1880 uint32_t correct_pos_mask = 0;
1881
1882 /* test for each operand which definition placement causes the least shuffle instructions */
1883 for (unsigned i = 0, offset = 0; i < instr->operands.size();
1884 offset += instr->operands[i].bytes(), i++) {
1885 // TODO: think about whether we can alias live operands on the same register
1886 if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
1887 instr->operands[i].getTemp().type() != rc.type())
1888 continue;
1889
1890 if (offset > instr->operands[i].physReg().reg_b)
1891 continue;
1892
1893 unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
1894 if (reg_lower % 4)
1895 continue;
1896 PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
1897 unsigned k = 0;
1898
1899 /* no need to check multiple times */
1900 if (reg_win.lo() == best_pos)
1901 continue;
1902
1903 /* check borders */
1904 // TODO: this can be improved
1905 if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
1906 continue;
1907 if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
1908 reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
1909 continue;
1910 if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
1911 reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
1912 continue;
1913
1914 /* count variables to be moved and check "avoid" */
1915 bool avoid = false;
1916 bool linear_vgpr = false;
1917 for (PhysReg j : reg_win) {
1918 if (reg_file[j] != 0) {
1919 if (reg_file[j] == 0xF0000000) {
1920 PhysReg reg;
1921 reg.reg_b = j * 4;
1922 unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
1923 for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
1924 k += reg_file.test(reg, 1);
1925 } else {
1926 k += 4;
1927 linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
1928 }
1929 }
1930 avoid |= ctx.war_hint[j];
1931 }
1932
1933 /* we cannot split live ranges of linear vgprs */
1934 if (linear_vgpr)
1935 continue;
1936
1937 if (avoid && !best_avoid)
1938 continue;
1939
1940 /* count operands in wrong positions */
1941 uint32_t correct_pos_mask_new = 0;
1942 for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
1943 offset2 += instr->operands[j].bytes(), j++) {
1944 Operand& op = instr->operands[j];
1945 if (op.isTemp() && op.physReg().reg_b == reg_win.lo() * 4 + offset2)
1946 correct_pos_mask_new |= 1 << j;
1947 else
1948 k += op.bytes();
1949 }
1950 bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
1951 if (k > num_moves || (!aligned && k == num_moves))
1952 continue;
1953
1954 best_pos = reg_win.lo();
1955 num_moves = k;
1956 best_avoid = avoid;
1957 correct_pos_mask = correct_pos_mask_new;
1958 }
1959
1960 /* too many moves: try the generic get_reg() function */
1961 if (num_moves >= 2 * bytes) {
1962 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1963 } else if (num_moves > bytes) {
1964 DefInfo info(ctx, instr, rc, -1);
1965 std::optional<PhysReg> res = get_reg_simple(ctx, reg_file, info);
1966 if (res)
1967 return *res;
1968 }
1969
1970 /* re-enable killed operands which are in the wrong position */
1971 RegisterFile tmp_file(reg_file);
1972 for (Operand& op : instr->operands) {
1973 if (op.isTemp() && op.isFirstKillBeforeDef())
1974 tmp_file.fill(op);
1975 }
1976 for (unsigned i = 0; i < instr->operands.size(); i++) {
1977 if ((correct_pos_mask >> i) & 1u && instr->operands[i].isKill())
1978 tmp_file.clear(instr->operands[i]);
1979 }
1980
1981 /* collect variables to be moved */
1982 std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
1983
1984 bool success = false;
1985 std::vector<std::pair<Operand, Definition>> pc;
1986 success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});
1987
1988 if (!success) {
1989 if (!increase_register_file(ctx, temp.regClass())) {
1990 /* use the fallback algorithm in get_reg() */
1991 return get_reg(ctx, reg_file, temp, parallelcopies, instr);
1992 }
1993 return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
1994 }
1995
1996 parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
1997 adjust_max_used_regs(ctx, rc, best_pos);
1998
1999 return best_pos;
2000 }
2001
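/* Pseudo instructions which both read and write linear registers may clobber
 * SCC when they are lowered. If SCC holds a value across such an
 * instruction, remember that and reserve a free SGPR as scratch register.
 */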
2002 void
2003 handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
2004 {
2005 if (instr->format != Format::PSEUDO)
2006 return;
2007
2008 /* all instructions which use handle_operands() need this information */
2009 switch (instr->opcode) {
2010 case aco_opcode::p_extract_vector:
2011 case aco_opcode::p_create_vector:
2012 case aco_opcode::p_split_vector:
2013 case aco_opcode::p_parallelcopy:
2014 case aco_opcode::p_start_linear_vgpr: break;
2015 default: return;
2016 }
2017
2018 bool writes_linear = false;
2019 /* if all definitions are logical vgpr, no need to care for SCC */
2020 for (Definition& def : instr->definitions) {
2021 if (def.getTemp().regClass().is_linear())
2022 writes_linear = true;
2023 }
2024 /* if all operands are constant, no need to care either */
2025 bool reads_linear = false;
2026 for (Operand& op : instr->operands) {
2027 if (op.isTemp() && op.getTemp().regClass().is_linear())
2028 reads_linear = true;
2029 }
2030
2031 if (!writes_linear || !reads_linear || !reg_file[scc])
2032 return;
2033
2034 instr->pseudo().needs_scratch_reg = true;
2035 instr->pseudo().tmp_in_scc = reg_file[scc];
2036
2037 int reg = ctx.max_used_sgpr;
2038 for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
2039 ;
2040 if (reg < 0) {
2041 reg = ctx.max_used_sgpr + 1;
2042 for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
2043 ;
2044 }
2045
2046 adjust_max_used_regs(ctx, s1, reg);
2047 instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
2048 }
2049
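/* Checks encoding restrictions on operand registers, e.g. sub-dword stride
 * rules and the special registers which SMEM/MUBUF/MTBUF operands must not
 * use.
 */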
2050 bool
2051 operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
2052 RegClass rc)
2053 {
2054 if (reg.byte()) {
2055 unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
2056 if (reg.byte() % stride)
2057 return false;
2058 }
2059
2060 switch (instr->format) {
2061 case Format::SMEM:
2062 return reg != scc && reg != exec &&
2063 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
2064 (reg != vcc || (instr->definitions.empty() && idx == 2) ||
2065 gfx_level >= GFX10); /* sdata can be vcc */
2066 case Format::MUBUF:
2067 case Format::MTBUF: return idx != 2 || gfx_level < GFX12 || reg != scc;
2068 default:
2069 // TODO: there are more instructions with restrictions on registers
2070 return true;
2071 }
2072 }
2073
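/* Creates parallelcopies which move pre-colored operands into their fixed
 * registers, after evicting whatever variables currently block those
 * registers. Repeated uses of the same fixed operand get a single copy.
 */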
2074 void
2075 handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
2076 std::vector<std::pair<Operand, Definition>>& parallelcopy,
2077 aco_ptr<Instruction>& instr)
2078 {
2079 assert(instr->operands.size() <= 128);
2080
2081 RegisterFile tmp_file(register_file);
2082
2083 BITSET_DECLARE(mask, 128) = {0};
2084
2085 for (unsigned i = 0; i < instr->operands.size(); i++) {
2086 Operand& op = instr->operands[i];
2087
2088 if (!op.isTemp() || !op.isFixed())
2089 continue;
2090
2091 PhysReg src = ctx.assignments[op.tempId()].reg;
2092 adjust_max_used_regs(ctx, op.regClass(), op.physReg());
2093
2094 if (op.physReg() == src) {
2095 tmp_file.block(op.physReg(), op.regClass());
2096 continue;
2097 }
2098
2099 unsigned j;
2100 bool found = false;
2101 BITSET_FOREACH_SET (j, mask, i) {
2102 if (instr->operands[j].tempId() == op.tempId() &&
2103 instr->operands[j].physReg() == op.physReg()) {
2104 found = true;
2105 break;
2106 }
2107 }
2108 if (found)
2109 continue; /* the copy is already added to the list */
2110
2111 /* clear from register_file so fixed operands are not collected by collect_vars() */
2112 tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
2113
2114 BITSET_SET(mask, i);
2115
2116 Operand pc_op(instr->operands[i].getTemp(), src);
2117 Definition pc_def = Definition(op.physReg(), pc_op.regClass());
2118 parallelcopy.emplace_back(pc_op, pc_def);
2119 }
2120
2121 if (BITSET_IS_EMPTY(mask))
2122 return;
2123
2124 unsigned i;
2125 std::vector<unsigned> blocking_vars;
2126 BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
2127 Operand& op = instr->operands[i];
2128 PhysRegInterval target{op.physReg(), op.size()};
2129 std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
2130 blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
2131
2132 /* prevent get_regs_for_copies() from using these registers */
2133 tmp_file.block(op.physReg(), op.regClass());
2134 }
2135
2136 get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
2137 update_renames(ctx, register_file, parallelcopy, instr,
2138 rename_not_killed_ops | fill_killed_ops | rename_precolored_ops);
2139 }
2140
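/* Moves an operand whose current register is unusable for this instruction
 * (e.g. because of a stride mismatch) to a newly allocated register and
 * records the move as a parallelcopy.
 */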
2141 void
2142 get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
2143 std::vector<std::pair<Operand, Definition>>& parallelcopy,
2144 aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
2145 {
2146 /* clear the operand in case it's only a stride mismatch */
2147 PhysReg src = ctx.assignments[operand.tempId()].reg;
2148 register_file.clear(src, operand.regClass());
2149 PhysReg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
2150
2151 Operand pc_op = operand;
2152 pc_op.setFixed(src);
2153 Definition pc_def = Definition(dst, pc_op.regClass());
2154 parallelcopy.emplace_back(pc_op, pc_def);
2155 update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
2156 }
2157
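/* Allocates a register for a phi definition. Parallelcopies this creates are
 * turned into register updates of previously processed phis, or into new
 * phis which move the affected live-ins in this block's predecessors.
 */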
2158 PhysReg
2159 get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
2160 std::vector<aco_ptr<Instruction>>& instructions, Block& block,
2161 aco_ptr<Instruction>& phi, Temp tmp)
2162 {
2163 std::vector<std::pair<Operand, Definition>> parallelcopy;
2164 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
2165 update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
2166
2167 /* process parallelcopy */
2168 for (std::pair<Operand, Definition> pc : parallelcopy) {
2169 /* see if it's a copy from a different phi */
2170 // TODO: prefer moving some previous phis over live-ins
2171 // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
2172 // problem in practice since they can only be fixed to exec)
2173 Instruction* prev_phi = NULL;
2174 for (auto phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
2175 if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
2176 prev_phi = phi_it->get();
2177 }
2178 if (prev_phi) {
2179 /* if so, just update that phi's register */
2180 prev_phi->definitions[0].setFixed(pc.second.physReg());
2181 register_file.fill(prev_phi->definitions[0]);
2182 ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
2183 pc.second.regClass()};
2184 continue;
2185 }
2186
2187 /* rename */
2188 auto orig_it = ctx.orig_names.find(pc.first.tempId());
2189 Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
2190 add_rename(ctx, orig, pc.second.getTemp());
2191
2192 /* otherwise, this is a live-in and we need to create a new phi
2193 * to move it in this block's predecessors */
2194 aco_opcode opcode =
2195 pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2196 Block::edge_vec& preds =
2197 pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
2198 aco_ptr<Instruction> new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
2199 new_phi->definitions[0] = pc.second;
2200 for (unsigned i = 0; i < preds.size(); i++)
2201 new_phi->operands[i] = Operand(pc.first);
2202 instructions.emplace_back(std::move(new_phi));
2203
2204 /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
2205 * a loop header.
2206 */
2207 live_in.erase(orig.id());
2208 }
2209
2210 return reg;
2211 }
2212
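/* Assigns registers to all phis of the block: first phis whose operands all
 * agree on one register, then phis with a usable affinity or operand
 * register, and finally the remaining phis via get_reg_phi().
 */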
2213 void
2214 get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
2215 std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
2216 {
2217 /* move all phis to instructions */
2218 for (aco_ptr<Instruction>& phi : block.instructions) {
2219 if (!is_phi(phi))
2220 break;
2221 if (!phi->definitions[0].isKill())
2222 instructions.emplace_back(std::move(phi));
2223 }
2224
2225 /* assign phis with all-matching registers to that register */
2226 for (aco_ptr<Instruction>& phi : instructions) {
2227 Definition& definition = phi->definitions[0];
2228 if (definition.isFixed())
2229 continue;
2230
2231 if (!phi->operands[0].isTemp())
2232 continue;
2233
2234 PhysReg reg = phi->operands[0].physReg();
2235 auto OpsSame = [=](const Operand& op) -> bool
2236 { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
2237 bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
2238 if (!all_same)
2239 continue;
2240
2241 if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg, -1))
2242 continue;
2243
2244 definition.setFixed(reg);
2245 register_file.fill(definition);
2246 ctx.assignments[definition.tempId()].set(definition);
2247 }
2248
2249 /* try to find a register that is used by at least one operand */
2250 for (aco_ptr<Instruction>& phi : instructions) {
2251 Definition& definition = phi->definitions[0];
2252 if (definition.isFixed())
2253 continue;
2254
2255 /* use affinity if available */
2256 if (ctx.assignments[definition.tempId()].affinity &&
2257 ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
2258 assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
2259 assert(affinity.rc == definition.regClass());
2260 if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg, -1)) {
2261 definition.setFixed(affinity.reg);
2262 register_file.fill(definition);
2263 ctx.assignments[definition.tempId()].set(definition);
2264 continue;
2265 }
2266 }
2267
2268 /* by going backwards, we aim to avoid copies in else-blocks */
2269 for (int i = phi->operands.size() - 1; i >= 0; i--) {
2270 const Operand& op = phi->operands[i];
2271 if (!op.isTemp() || !op.isFixed())
2272 continue;
2273
2274 PhysReg reg = op.physReg();
2275 if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg, -1)) {
2276 definition.setFixed(reg);
2277 register_file.fill(definition);
2278 ctx.assignments[definition.tempId()].set(definition);
2279 break;
2280 }
2281 }
2282 }
2283
2284 /* find registers for phis where the register was blocked or no operand was assigned */
2285
2286 /* Don't use iterators because get_reg_phi() can add phis to the end of the vector. */
2287 for (unsigned i = 0; i < instructions.size(); i++) {
2288 aco_ptr<Instruction>& phi = instructions[i];
2289 Definition& definition = phi->definitions[0];
2290 if (definition.isFixed())
2291 continue;
2292
2293 definition.setFixed(
2294 get_reg_phi(ctx, live_in, register_file, instructions, block, phi, definition.getTemp()));
2295
2296 register_file.fill(definition);
2297 ctx.assignments[definition.tempId()].set(definition);
2298 }
2299 }
2300
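/* Returns the current name of "val" at the end of block "block_idx",
 * following the renames created by live-range splits.
 */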
2301 inline Temp
2302 read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
2303 {
2304 /* This variable didn't get renamed yet. */
2305 if (!ctx.assignments[val.id()].renamed)
2306 return val;
2307
2308 auto it = ctx.renames[block_idx].find(val.id());
2309 if (it == ctx.renames[block_idx].end())
2310 return val;
2311 else
2312 return it->second;
2313 }
2314
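/* Resolves the name of a live-in variable by looking up the rename in each
 * predecessor. If the predecessors disagree, a phi is inserted at the top of
 * the block to repair SSA.
 */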
2315 Temp
2316 handle_live_in(ra_ctx& ctx, Temp val, Block* block)
2317 {
2318 /* This variable didn't get renamed yet. */
2319 if (!ctx.assignments[val.id()].renamed)
2320 return val;
2321
2322 Block::edge_vec& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
2323 if (preds.size() == 0)
2324 return val;
2325
2326 if (preds.size() == 1) {
2327 /* if the block has only one predecessor, just look there for the name */
2328 return read_variable(ctx, val, preds[0]);
2329 }
2330
2331 /* there are multiple predecessors and the block is sealed */
2332 Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));
2333
2334 /* get the rename from each predecessor and check if they are the same */
2335 Temp new_val;
2336 bool needs_phi = false;
2337 for (unsigned i = 0; i < preds.size(); i++) {
2338 ops[i] = read_variable(ctx, val, preds[i]);
2339 if (i == 0)
2340 new_val = ops[i];
2341 else
2342 needs_phi |= !(new_val == ops[i]);
2343 }
2344
2345 if (needs_phi) {
2346 assert(!val.regClass().is_linear_vgpr());
2347
2348 /* the variable has been renamed differently in the predecessors: we need to insert a phi */
2349 aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
2350 aco_ptr<Instruction> phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
2351 new_val = ctx.program->allocateTmp(val.regClass());
2352 phi->definitions[0] = Definition(new_val);
2353 ctx.assignments.emplace_back();
2354 assert(ctx.assignments.size() == ctx.program->peekAllocationId());
2355 for (unsigned i = 0; i < preds.size(); i++) {
2356 /* update the operands so that the phi uses the renamed variables */
2357 phi->operands[i] = Operand(ops[i]);
2358 assert(ctx.assignments[ops[i].id()].assigned);
2359 assert(ops[i].regClass() == new_val.regClass());
2360 phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
2361 }
2362 block->instructions.insert(block->instructions.begin(), std::move(phi));
2363 }
2364
2365 return new_val;
2366 }
2367
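/* Called when the loop is left: creates header phis for the live-in
 * variables which got renamed inside the loop and propagates the new names
 * through all blocks of the loop.
 */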
2368 void
2369 handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
2370 uint32_t loop_exit_idx)
2371 {
2372 Block& loop_header = ctx.program->blocks[loop_header_idx];
2373 aco::unordered_map<uint32_t, Temp> renames(ctx.memory);
2374
2375 /* create phis for variables renamed during the loop */
2376 for (unsigned t : live_in) {
2377 if (!ctx.assignments[t].renamed)
2378 continue;
2379
2380 Temp val = Temp(t, ctx.program->temp_rc[t]);
2381 Temp prev = read_variable(ctx, val, loop_header_idx - 1);
2382 Temp renamed = handle_live_in(ctx, val, &loop_header);
2383 if (renamed == prev)
2384 continue;
2385
2386 /* insert additional renames at block end, but don't overwrite */
2387 renames[prev.id()] = renamed;
2388 ctx.orig_names[renamed.id()] = val;
2389 for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2390 auto it = ctx.renames[idx].emplace(val.id(), renamed);
2391 /* if insertion is unsuccessful, update if necessary */
2392 if (!it.second && it.first->second == prev)
2393 it.first->second = renamed;
2394 }
2395
2396 /* update loop-carried values of the phi created by handle_live_in() */
2397 for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
2398 Operand& op = loop_header.instructions[0]->operands[i];
2399 if (op.getTemp() == prev)
2400 op.setTemp(renamed);
2401 }
2402
2403 /* use the assignment from the loop preheader and fix def reg */
2404 assignment& var = ctx.assignments[prev.id()];
2405 ctx.assignments[renamed.id()] = var;
2406 loop_header.instructions[0]->definitions[0].setFixed(var.reg);
2407 }
2408
2409 /* rename loop carried phi operands */
2410 for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
2411 aco_ptr<Instruction>& phi = loop_header.instructions[i];
2412 if (!is_phi(phi))
2413 break;
2414 const Block::edge_vec& preds =
2415 phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
2416 for (unsigned j = 1; j < phi->operands.size(); j++) {
2417 Operand& op = phi->operands[j];
2418 if (!op.isTemp())
2419 continue;
2420
2421 /* Find the original name, since this operand might not use the original name if the phi
2422 * was created after init_reg_file().
2423 */
2424 auto it = ctx.orig_names.find(op.tempId());
2425 Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();
2426
2427 op.setTemp(read_variable(ctx, orig, preds[j]));
2428 op.setFixed(ctx.assignments[op.tempId()].reg);
2429 }
2430 }
2431
2432 /* return early if no new phi was created */
2433 if (renames.empty())
2434 return;
2435
2436 /* propagate new renames through loop */
2437 for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
2438 Block& current = ctx.program->blocks[idx];
2439 /* rename all uses in this block */
2440 for (aco_ptr<Instruction>& instr : current.instructions) {
2441 /* phis are renamed after RA */
2442 if (idx == loop_header_idx && is_phi(instr))
2443 continue;
2444
2445 for (Operand& op : instr->operands) {
2446 if (!op.isTemp())
2447 continue;
2448
2449 auto rename = renames.find(op.tempId());
2450 if (rename != renames.end()) {
2451 assert(rename->second.id());
2452 op.setTemp(rename->second);
2453 }
2454 }
2455 }
2456 }
2457 }
2458
2459 /**
2460 * Initializes the register file at the beginning of a block (before any
2461 * existing phis).
2462 * In order to do so, all live-in variables are entered into the RegisterFile.
2463 * Reg-to-reg moves (renames) from previous blocks are taken into account and
2464 * the SSA is repaired by inserting corresponding phi-nodes.
2465 */
2466 RegisterFile
2467 init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
2468 {
2469 if (block.kind & block_kind_loop_exit) {
2470 uint32_t header = ctx.loop_header.back();
2471 ctx.loop_header.pop_back();
2472 handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
2473 }
2474
2475 RegisterFile register_file;
2476 const IDSet& live_in = live_out_per_block[block.index];
2477 assert(block.index != 0 || live_in.empty());
2478
2479 if (block.kind & block_kind_loop_header) {
2480 ctx.loop_header.emplace_back(block.index);
2481 /* already rename the phis' incoming values */
2482 for (aco_ptr<Instruction>& instr : block.instructions) {
2483 if (!is_phi(instr))
2484 break;
2485 Operand& operand = instr->operands[0];
2486 if (operand.isTemp()) {
2487 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
2488 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2489 }
2490 }
2491 for (unsigned t : live_in) {
2492 Temp val = Temp(t, ctx.program->temp_rc[t]);
2493 Temp renamed = read_variable(ctx, val, block.index - 1);
2494 if (renamed != val)
2495 add_rename(ctx, val, renamed);
2496 assignment& var = ctx.assignments[renamed.id()];
2497 assert(var.assigned);
2498 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
2499 }
2500 } else {
2501 /* rename phi operands */
2502 for (aco_ptr<Instruction>& instr : block.instructions) {
2503 if (!is_phi(instr))
2504 break;
2505 const Block::edge_vec& preds =
2506 instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
2507
2508 for (unsigned i = 0; i < instr->operands.size(); i++) {
2509 Operand& operand = instr->operands[i];
2510 if (!operand.isTemp())
2511 continue;
2512 operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
2513 operand.setFixed(ctx.assignments[operand.tempId()].reg);
2514 }
2515 }
2516 for (unsigned t : live_in) {
2517 Temp val = Temp(t, ctx.program->temp_rc[t]);
2518 Temp renamed = handle_live_in(ctx, val, &block);
2519 assignment& var = ctx.assignments[renamed.id()];
2520 /* due to live-range splits, the live-in might be a phi now */
2521 if (var.assigned) {
2522 register_file.fill(Definition(renamed.id(), var.reg, var.rc));
2523 }
2524 if (renamed != val) {
2525 add_rename(ctx, val, renamed);
2526 }
2527 }
2528 }
2529
2530 return register_file;
2531 }
2532
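/* Returns true if this mad/fma-like VOP3/VOP3P instruction can be rewritten
 * into the smaller VOP2 accumulator form (v_mac_f32, v_fmac_f32, ...), which
 * requires the addend to be a killed VGPR operand and restricts modifiers.
 */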
2533 bool
2534 vop3_can_use_vop2acc(ra_ctx& ctx, Instruction* instr)
2535 {
2536 if (!instr->isVOP3() && !instr->isVOP3P())
2537 return false;
2538
2539 switch (instr->opcode) {
2540 case aco_opcode::v_mad_f32:
2541 case aco_opcode::v_mad_f16:
2542 case aco_opcode::v_mad_legacy_f16: break;
2543 case aco_opcode::v_fma_f32:
2544 case aco_opcode::v_pk_fma_f16:
2545 case aco_opcode::v_fma_f16:
2546 case aco_opcode::v_dot4_i32_i8:
2547 if (ctx.program->gfx_level < GFX10)
2548 return false;
2549 break;
2550 case aco_opcode::v_mad_legacy_f32:
2551 if (!ctx.program->dev.has_mac_legacy32)
2552 return false;
2553 break;
2554 case aco_opcode::v_fma_legacy_f32:
2555 if (!ctx.program->dev.has_fmac_legacy32)
2556 return false;
2557 break;
2558 default: return false;
2559 }
2560
2561 if (!instr->operands[2].isOfType(RegType::vgpr) || !instr->operands[2].isKillBeforeDef() ||
2562 (!instr->operands[0].isOfType(RegType::vgpr) && !instr->operands[1].isOfType(RegType::vgpr)))
2563 return false;
2564
2565 if (instr->isVOP3P()) {
2566 for (unsigned i = 0; i < 3; i++) {
2567 if (instr->operands[i].isLiteral())
2568 continue;
2569
2570 if (instr->valu().opsel_lo[i])
2571 return false;
2572
2573 /* v_pk_fmac_f16 inline constants are replicated to hi bits starting with gfx11. */
2574 if (instr->valu().opsel_hi[i] ==
2575 (instr->operands[i].isConstant() && ctx.program->gfx_level >= GFX11))
2576 return false;
2577 }
2578 } else {
2579 if (instr->valu().opsel & (ctx.program->gfx_level < GFX11 ? 0xf : ~0x3))
2580 return false;
2581 for (unsigned i = 0; i < 2; i++) {
2582 if (!instr->operands[i].isOfType(RegType::vgpr) && instr->valu().opsel[i])
2583 return false;
2584 }
2585 }
2586
2587 unsigned im_mask = instr->isDPP16() && instr->isVOP3() ? 0x3 : 0;
2588 if (instr->valu().omod || instr->valu().clamp || (instr->valu().abs & ~im_mask) ||
2589 (instr->valu().neg & ~im_mask))
2590 return false;
2591
2592 return true;
2593 }
2594
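/* Returns true if this SOP2 instruction has a literal operand which fits
 * into a signed 16-bit immediate, so the SOPK encoding can be used instead.
 */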
2595 bool
2596 sop2_can_use_sopk(ra_ctx& ctx, Instruction* instr)
2597 {
2598 if (instr->opcode != aco_opcode::s_add_i32 && instr->opcode != aco_opcode::s_add_u32 &&
2599 instr->opcode != aco_opcode::s_mul_i32 && instr->opcode != aco_opcode::s_cselect_b32)
2600 return false;
2601
2602 if (instr->opcode == aco_opcode::s_add_u32 && !instr->definitions[1].isKill())
2603 return false;
2604
2605 uint32_t literal_idx = 0;
2606
2607 if (instr->opcode != aco_opcode::s_cselect_b32 && instr->operands[1].isLiteral())
2608 literal_idx = 1;
2609
2610 if (!instr->operands[!literal_idx].isTemp() || !instr->operands[!literal_idx].isKillBeforeDef())
2611 return false;
2612
2613 if (!instr->operands[literal_idx].isLiteral())
2614 return false;
2615
2616 const uint32_t i16_mask = 0xffff8000u;
2617 uint32_t value = instr->operands[literal_idx].constantValue();
2618 if ((value & i16_mask) && (value & i16_mask) != i16_mask)
2619 return false;
2620
2621 return true;
2622 }
2623
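/* Walks the program in reverse and builds merge sets of phi-related
 * temporaries, extended through parallelcopies and through operands which
 * can be coalesced with a definition. It also records the vector, vcc and m0
 * placement hints used by get_reg().
 */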
2624 void
2625 get_affinities(ra_ctx& ctx)
2626 {
2627 std::vector<std::vector<Temp>> phi_resources;
2628 aco::unordered_map<uint32_t, uint32_t> temp_to_phi_resources(ctx.memory);
2629
2630 for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
2631 block_rit++) {
2632 Block& block = *block_rit;
2633
2634 std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
2635 for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
2636 aco_ptr<Instruction>& instr = *rit;
2637 if (is_phi(instr))
2638 break;
2639
2640 /* add vector affinities */
2641 if (instr->opcode == aco_opcode::p_create_vector) {
2642 for (const Operand& op : instr->operands) {
2643 if (op.isTemp() && op.isFirstKill() &&
2644 op.getTemp().type() == instr->definitions[0].getTemp().type())
2645 ctx.vectors[op.tempId()] = instr.get();
2646 }
2647 } else if (instr->format == Format::MIMG && instr->operands.size() > 4 &&
2648 !instr->mimg().strict_wqm && ctx.program->gfx_level < GFX12) {
2649 for (unsigned i = 3; i < instr->operands.size(); i++)
2650 ctx.vectors[instr->operands[i].tempId()] = instr.get();
2651 } else if (instr->opcode == aco_opcode::p_split_vector &&
2652 instr->operands[0].isFirstKillBeforeDef()) {
2653 ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
2654 } else if (instr->isVOPC() && !instr->isVOP3()) {
2655 if (!instr->isSDWA() || ctx.program->gfx_level == GFX8)
2656 ctx.assignments[instr->definitions[0].tempId()].vcc = true;
2657 } else if (instr->isVOP2() && !instr->isVOP3()) {
2658 if (instr->operands.size() == 3 && instr->operands[2].isTemp() &&
2659 instr->operands[2].regClass().type() == RegType::sgpr)
2660 ctx.assignments[instr->operands[2].tempId()].vcc = true;
2661 if (instr->definitions.size() == 2)
2662 ctx.assignments[instr->definitions[1].tempId()].vcc = true;
2663 } else if (instr->opcode == aco_opcode::s_and_b32 ||
2664 instr->opcode == aco_opcode::s_and_b64) {
2665 /* If SCC is used by a branch, we might be able to use
2666 * s_cbranch_vccz/s_cbranch_vccnz if the operand is VCC.
2667 */
2668 if (!instr->definitions[1].isKill() && instr->operands[0].isTemp() &&
2669 instr->operands[1].isFixed() && instr->operands[1].physReg() == exec)
2670 ctx.assignments[instr->operands[0].tempId()].vcc = true;
2671 } else if (instr->opcode == aco_opcode::s_sendmsg) {
2672 ctx.assignments[instr->operands[0].tempId()].m0 = true;
2673 }
2674
2675 int op_fixed_to_def0 = get_op_fixed_to_def(instr.get());
2676 for (unsigned i = 0; i < instr->definitions.size(); i++) {
2677 const Definition& def = instr->definitions[i];
2678 if (!def.isTemp())
2679 continue;
2680 /* mark last-seen phi operand */
2681 auto it = temp_to_phi_resources.find(def.tempId());
2682 if (it != temp_to_phi_resources.end() &&
2683 def.regClass() == phi_resources[it->second][0].regClass()) {
2684 phi_resources[it->second][0] = def.getTemp();
2685 /* try to coalesce phi affinities with parallelcopies */
2686 Operand op;
2687 if (instr->opcode == aco_opcode::p_parallelcopy) {
2688 op = instr->operands[i];
2689 } else if (i == 0 && op_fixed_to_def0 != -1) {
2690 op = instr->operands[op_fixed_to_def0];
2691 } else if (vop3_can_use_vop2acc(ctx, instr.get())) {
2692 op = instr->operands[2];
2693 } else if (i == 0 && sop2_can_use_sopk(ctx, instr.get())) {
2694 op = instr->operands[instr->operands[0].isLiteral()];
2695 } else {
2696 continue;
2697 }
2698
2699 if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
2700 phi_resources[it->second].emplace_back(op.getTemp());
2701 temp_to_phi_resources[op.tempId()] = it->second;
2702 }
2703 }
2704 }
2705 }
2706
2707 /* collect phi affinities */
2708 for (; rit != block.instructions.rend(); ++rit) {
2709 aco_ptr<Instruction>& instr = *rit;
2710 assert(is_phi(instr));
2711
2712 if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
2713 continue;
2714
2715 assert(instr->definitions[0].isTemp());
2716 auto it = temp_to_phi_resources.find(instr->definitions[0].tempId());
2717 unsigned index = phi_resources.size();
2718 std::vector<Temp>* affinity_related;
2719 if (it != temp_to_phi_resources.end()) {
2720 index = it->second;
2721 phi_resources[index][0] = instr->definitions[0].getTemp();
2722 affinity_related = &phi_resources[index];
2723 } else {
2724 phi_resources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
2725 affinity_related = &phi_resources.back();
2726 }
2727
2728 for (const Operand& op : instr->operands) {
2729 if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
2730 affinity_related->emplace_back(op.getTemp());
2731 if (block.kind & block_kind_loop_header)
2732 continue;
2733 temp_to_phi_resources[op.tempId()] = index;
2734 }
2735 }
2736 }
2737
2738 /* visit the loop header phis first in order to create nested affinities */
2739 if (block.kind & block_kind_loop_exit) {
2740 /* find loop header */
2741 auto header_rit = block_rit;
2742 while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
2743 header_rit++;
2744
2745 for (aco_ptr<Instruction>& phi : header_rit->instructions) {
2746 if (!is_phi(phi))
2747 break;
2748 if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
2749 continue;
2750
2751 /* create an (empty) merge-set for the phi-related variables */
2752 auto it = temp_to_phi_resources.find(phi->definitions[0].tempId());
2753 unsigned index = phi_resources.size();
2754 if (it == temp_to_phi_resources.end()) {
2755 temp_to_phi_resources[phi->definitions[0].tempId()] = index;
2756 phi_resources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
2757 } else {
2758 index = it->second;
2759 }
2760 for (unsigned i = 1; i < phi->operands.size(); i++) {
2761 const Operand& op = phi->operands[i];
2762 if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
2763 temp_to_phi_resources[op.tempId()] = index;
2764 }
2765 }
2766 }
2767 }
2768 }
2769 /* create affinities */
2770 for (std::vector<Temp>& vec : phi_resources) {
2771 for (unsigned i = 1; i < vec.size(); i++)
2772 if (vec[i].id() != vec[0].id())
2773 ctx.assignments[vec[i].id()].affinity = vec[0].id();
2774 }
2775 }
2776
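/* Rewrites an eligible VOP3/VOP3P instruction into its VOP2 accumulator form
 * once the operand registers are known, e.g. v_mad_f32 with the addend
 * already in place becomes v_mac_f32.
 */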
2777 void
2778 optimize_encoding_vop2(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2779 {
2780 if (!vop3_can_use_vop2acc(ctx, instr.get()))
2781 return;
2782
2783 for (unsigned i = ctx.program->gfx_level < GFX11 ? 0 : 2; i < 3; i++) {
2784 if (instr->operands[i].physReg().byte())
2785 return;
2786 }
2787
2788 unsigned def_id = instr->definitions[0].tempId();
2789 if (ctx.assignments[def_id].affinity) {
2790 assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2791 if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
2792 !register_file.test(affinity.reg, instr->operands[2].bytes()))
2793 return;
2794 }
2795
2796 if (!instr->operands[1].isOfType(RegType::vgpr))
2797 instr->valu().swapOperands(0, 1);
2798
2799 if (instr->isVOP3P() && instr->operands[0].isLiteral()) {
2800 unsigned literal = instr->operands[0].constantValue();
2801 unsigned lo = (literal >> (instr->valu().opsel_lo[0] * 16)) & 0xffff;
2802 unsigned hi = (literal >> (instr->valu().opsel_hi[0] * 16)) & 0xffff;
2803 instr->operands[0] = Operand::literal32(lo | (hi << 16));
2804 }
2805
2806 instr->format = (Format)(((unsigned)withoutVOP3(instr->format) & ~(unsigned)Format::VOP3P) |
2807 (unsigned)Format::VOP2);
2808 instr->valu().opsel_lo = 0;
2809 instr->valu().opsel_hi = 0;
2810 switch (instr->opcode) {
2811 case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
2812 case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
2813 case aco_opcode::v_mad_f16:
2814 case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
2815 case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
2816 case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
2817 case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
2818 case aco_opcode::v_mad_legacy_f32: instr->opcode = aco_opcode::v_mac_legacy_f32; break;
2819 case aco_opcode::v_fma_legacy_f32: instr->opcode = aco_opcode::v_fmac_legacy_f32; break;
2820 default: break;
2821 }
2822 }
2823
2824 void
2825 optimize_encoding_sopk(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2826 {
2827 /* try to optimize sop2 with literal source to sopk */
2828 if (!sop2_can_use_sopk(ctx, instr.get()))
2829 return;
2830 unsigned literal_idx = instr->operands[1].isLiteral();
2831
2832 if (instr->operands[!literal_idx].physReg() >= 128)
2833 return;
2834
2835 unsigned def_id = instr->definitions[0].tempId();
2836 if (ctx.assignments[def_id].affinity) {
2837 assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
2838 if (affinity.assigned && affinity.reg != instr->operands[!literal_idx].physReg() &&
2839 !register_file.test(affinity.reg, instr->operands[!literal_idx].bytes()))
2840 return;
2841 }
2842
2843 instr->format = Format::SOPK;
2844 instr->salu().imm = instr->operands[literal_idx].constantValue() & 0xffff;
2845 if (literal_idx == 0)
2846 std::swap(instr->operands[0], instr->operands[1]);
2847 if (instr->operands.size() > 2)
2848 std::swap(instr->operands[1], instr->operands[2]);
2849 instr->operands.pop_back();
2850
2851 switch (instr->opcode) {
2852 case aco_opcode::s_add_u32:
2853 case aco_opcode::s_add_i32: instr->opcode = aco_opcode::s_addk_i32; break;
2854 case aco_opcode::s_mul_i32: instr->opcode = aco_opcode::s_mulk_i32; break;
2855 case aco_opcode::s_cselect_b32: instr->opcode = aco_opcode::s_cmovk_i32; break;
2856 default: unreachable("illegal instruction");
2857 }
2858 }
2859
2860 void
2861 optimize_encoding(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>& instr)
2862 {
2863 if (instr->isVALU())
2864 optimize_encoding_vop2(ctx, register_file, instr);
2865 if (instr->isSALU())
2866 optimize_encoding_sopk(ctx, register_file, instr);
2867 }
2868
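/* Turns the collected copies into a single p_parallelcopy instruction. If
 * SGPR copy definitions overlap other SGPR operands or a linear VGPR is
 * moved while SCC is live, the parallelcopy additionally gets a scratch SGPR
 * via handle_pseudo().
 */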
2869 void
2870 emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
2871 aco_ptr<Instruction>& instr,
2872 std::vector<aco_ptr<Instruction>>& instructions, bool temp_in_scc,
2873 RegisterFile& register_file)
2874 {
2875 if (parallelcopy.empty())
2876 return;
2877
2878 aco_ptr<Instruction> pc;
2879 pc.reset(create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, parallelcopy.size(),
2880 parallelcopy.size()));
2881 bool linear_vgpr = false;
2882 bool sgpr_operands_alias_defs = false;
2883 uint64_t sgpr_operands[4] = {0, 0, 0, 0};
2884 for (unsigned i = 0; i < parallelcopy.size(); i++) {
2885 linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
2886
2887 if (temp_in_scc && parallelcopy[i].first.isTemp() &&
2888 parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
2889 if (!sgpr_operands_alias_defs) {
2890 unsigned reg = parallelcopy[i].first.physReg().reg();
2891 unsigned size = parallelcopy[i].first.getTemp().size();
2892 sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
2893
2894 reg = parallelcopy[i].second.physReg().reg();
2895 size = parallelcopy[i].second.getTemp().size();
2896 if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
2897 sgpr_operands_alias_defs = true;
2898 }
2899 }
2900
2901 pc->operands[i] = parallelcopy[i].first;
2902 pc->definitions[i] = parallelcopy[i].second;
2903 assert(pc->operands[i].size() == pc->definitions[i].size());
2904
2905 /* It might happen that the operand is already renamed. We have to restore the
2906 * original name. */
2907 auto it = ctx.orig_names.find(pc->operands[i].tempId());
2908 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
2909 add_rename(ctx, orig, pc->definitions[i].getTemp());
2910 }
2911
2912 if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
2913 /* disable definitions and re-enable operands */
2914 RegisterFile tmp_file(register_file);
2915 for (const Definition& def : instr->definitions) {
2916 if (def.isTemp() && !def.isKill())
2917 tmp_file.clear(def);
2918 }
2919 for (const Operand& op : instr->operands) {
2920 if (op.isTemp() && op.isFirstKill())
2921 tmp_file.block(op.physReg(), op.regClass());
2922 }
2923
2924 handle_pseudo(ctx, tmp_file, pc.get());
2925 } else {
2926 pc->pseudo().needs_scratch_reg = sgpr_operands_alias_defs || linear_vgpr;
2927 pc->pseudo().tmp_in_scc = false;
2928 }
2929
2930 instructions.emplace_back(std::move(pc));
2931
2932 parallelcopy.clear();
2933 }
2934
2935 void
2936 emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
2937 aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& instructions,
2938 bool temp_in_scc, RegisterFile& register_file)
2939 {
2940 if (parallelcopy.empty())
2941 return;
2942
2943 std::vector<std::pair<Operand, Definition>> linear_vgpr;
2944 if (ctx.num_linear_vgprs) {
2945 unsigned next = 0;
2946 for (unsigned i = 0; i < parallelcopy.size(); i++) {
2947 if (parallelcopy[i].first.regClass().is_linear_vgpr()) {
2948 linear_vgpr.push_back(parallelcopy[i]);
2949 continue;
2950 }
2951
2952 if (next != i)
2953 parallelcopy[next] = parallelcopy[i];
2954 next++;
2955 }
2956 parallelcopy.resize(next);
2957 }
2958
2959 /* Because of how linear VGPRs are allocated, we should never have to move a linear VGPR into the
2960 * space of a normal one. This means the copy can be done entirely before normal VGPR copies. */
2961 emit_parallel_copy_internal(ctx, linear_vgpr, instr, instructions, temp_in_scc,
2962 register_file);
2963 emit_parallel_copy_internal(ctx, parallelcopy, instr, instructions, temp_in_scc,
2964 register_file);
2965 }
2966
2967 } /* end namespace */
2968
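/* Top-level entry point: performs SSA-based register allocation block by
 * block, renaming operands, handling pre-colored registers, allocating
 * definitions and emitting the parallelcopies that shuffle values around.
 */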
2969 void
2970 register_allocation(Program* program, ra_test_policy policy)
2971 {
2972 ra_ctx ctx(program, policy);
2973 get_affinities(ctx);
2974
2975 for (Block& block : program->blocks) {
2976 ctx.block = █
2977
2978 /* initialize register file */
2979 RegisterFile register_file = init_reg_file(ctx, program->live.live_in, block);
2980 ctx.war_hint.reset();
2981 ctx.rr_vgpr_it = {PhysReg{256}};
2982 ctx.rr_sgpr_it = {PhysReg{0}};
2983
2984 std::vector<aco_ptr<Instruction>> instructions;
2985 instructions.reserve(block.instructions.size());
2986
2987 /* this is a slight adjustment from the paper as we already have phi nodes:
2988 * We consider them incomplete phis and only handle the definition. */
2989 get_regs_for_phis(ctx, block, register_file, instructions,
2990 program->live.live_in[block.index]);
2991
2992 /* If this is a merge block, the state of the register file at the branch instruction of the
2993 * predecessors corresponds to the state after phis at the merge block. So, we allocate a
2994 * register for the predecessor's branch definitions as if there was a phi.
2995 */
2996 if (!block.linear_preds.empty() &&
2997 (block.linear_preds.size() != 1 ||
2998 program->blocks[block.linear_preds[0]].linear_succs.size() == 1)) {
2999 PhysReg br_reg = get_reg_phi(ctx, program->live.live_in[block.index], register_file,
3000 instructions, block, ctx.phi_dummy, Temp(0, s2));
3001 for (unsigned pred : block.linear_preds) {
3002 program->blocks[pred].scc_live_out = register_file[scc];
3003 aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
3004
3005 assert(br->definitions.size() == 1 && br->definitions[0].regClass() == s2 &&
3006 br->definitions[0].isKill());
3007
3008 br->definitions[0].setFixed(br_reg);
3009 }
3010 }
3011
3012 /* Handle all other instructions of the block */
3013 auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
3014 auto instr_it = std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
3015 for (; instr_it != block.instructions.end(); ++instr_it) {
3016 aco_ptr<Instruction>& instr = *instr_it;
3017 std::vector<std::pair<Operand, Definition>> parallelcopy;
3018 bool temp_in_scc = register_file[scc];
3019
3020 if (instr->opcode == aco_opcode::p_branch) {
3021 /* unconditional branches are handled after phis of the target */
3022 instructions.emplace_back(std::move(instr));
3023 break;
3024 }
3025
3026 assert(!is_phi(instr));
3027
3028 /* handle operands */
3029 bool fixed = false;
3030 for (unsigned i = 0; i < instr->operands.size(); ++i) {
3031 auto& operand = instr->operands[i];
3032 if (!operand.isTemp())
3033 continue;
3034
3035 /* rename operands */
3036 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
3037 assert(ctx.assignments[operand.tempId()].assigned);
3038
3039 fixed |=
3040 operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg();
3041 }
3042
3043 bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
3044 instr->opcode == aco_opcode::v_writelane_b32_e64;
3045 if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
3046 instr->operands[1].isTemp()) {
3047 /* v_writelane_b32 can take two sgprs but only if one is m0. */
3048 if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
3049 ctx.assignments[instr->operands[1].tempId()].reg != m0) {
3050 instr->operands[0].setFixed(m0);
3051 fixed = true;
3052 }
3053 }
3054
         if (fixed)
            handle_fixed_operands(ctx, register_file, parallelcopy, instr);

         for (unsigned i = 0; i < instr->operands.size(); ++i) {
            auto& operand = instr->operands[i];
            if (!operand.isTemp() || operand.isFixed())
               continue;

            PhysReg reg = ctx.assignments[operand.tempId()].reg;
            if (operand_can_use_reg(program->gfx_level, instr, i, reg, operand.regClass()))
               operand.setFixed(reg);
            else
               get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);

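            /* Hint that these operand registers are read late (exports, GDS and
             * some GFX6 VMEM operands), so that new definitions prefer other
             * registers while the value might still be read. */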
            if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->gfx_level == GFX6) ||
                (instr->isDS() && instr->ds().gds)) {
               for (unsigned j = 0; j < operand.size(); j++)
                  ctx.war_hint.set(operand.physReg().reg() + j);
            }
         }

         /* remove dead vars from register file */
         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isFirstKillBeforeDef())
               register_file.clear(op);
         }

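         /* Now that operand registers are known, try to switch the instruction
          * to a more compact encoding if the assignment allows it. */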
         optimize_encoding(ctx, register_file, instr);

         /* Handle definitions which must have the same register as an operand.
          * We expect that the definition has the same size as the operand, otherwise the new
          * location for the operand (if it's not killed) might intersect with the old one.
          * We can't read from the old location because it's corrupted, and we can't write the new
          * location because that's used by a live-through operand.
          */
         int op_fixed_to_def = get_op_fixed_to_def(instr.get());
         if (op_fixed_to_def != -1)
            instr->definitions[0].setFixed(instr->operands[op_fixed_to_def].physReg());

         /* handle fixed definitions first */
         for (unsigned i = 0; i < instr->definitions.size(); ++i) {
            auto& definition = instr->definitions[i];
            if (!definition.isFixed())
               continue;

            adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
            /* check if the target register is blocked */
            if (register_file.test(definition.physReg(), definition.bytes())) {
               const PhysRegInterval def_regs{definition.physReg(), definition.size()};

               /* create parallelcopy pair to move blocking vars */
               std::vector<unsigned> vars = collect_vars(ctx, register_file, def_regs);

               RegisterFile tmp_file(register_file);
               /* re-enable the killed operands, so that we don't move the blocking vars there */
               for (const Operand& op : instr->operands) {
                  if (op.isTemp() && op.isFirstKillBeforeDef())
                     tmp_file.fill(op);
               }

               ASSERTED bool success = false;
               success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
               assert(success);

               update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
            }

            if (!definition.isTemp())
               continue;

            ctx.assignments[definition.tempId()].set(definition);
            register_file.fill(definition);
         }

         /* handle all other definitions */
         for (unsigned i = 0; i < instr->definitions.size(); ++i) {
            Definition* definition = &instr->definitions[i];

            if (definition->isFixed() || !definition->isTemp())
               continue;

            /* find free reg */
            if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
               /* Allocation of linear VGPRs is special. */
               definition->setFixed(alloc_linear_vgpr(ctx, register_file, instr, parallelcopy));
               update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
            } else if (instr->opcode == aco_opcode::p_split_vector) {
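               /* Try to place each result at its byte offset inside the source
                * vector so that the split needs no actual copies. */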
               PhysReg reg = instr->operands[0].physReg();
               RegClass rc = definition->regClass();
               for (unsigned j = 0; j < i; j++)
                  reg.reg_b += instr->definitions[j].bytes();
               if (get_reg_specified(ctx, register_file, rc, instr, reg, -1)) {
                  definition->setFixed(reg);
               } else if (i == 0) {
                  RegClass vec_rc = RegClass::get(rc.type(), instr->operands[0].bytes());
                  DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
                  std::optional<PhysReg> res = get_reg_simple(ctx, register_file, info);
                  if (res && get_reg_specified(ctx, register_file, rc, instr, *res, -1))
                     definition->setFixed(*res);
               } else if (instr->definitions[i - 1].isFixed()) {
                  reg = instr->definitions[i - 1].physReg();
                  reg.reg_b += instr->definitions[i - 1].bytes();
                  if (get_reg_specified(ctx, register_file, rc, instr, reg, -1))
                     definition->setFixed(reg);
               }
            } else if (instr->opcode == aco_opcode::p_parallelcopy) {
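               /* Prefer to keep the value in place: if the source and destination
                * have the same register type and the source register is otherwise
                * free, the copy becomes a no-op. */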
               PhysReg reg = instr->operands[i].physReg();
               if (instr->operands[i].isTemp() &&
                   instr->operands[i].getTemp().type() == definition->getTemp().type() &&
                   !register_file.test(reg, definition->bytes()))
                  definition->setFixed(reg);
            } else if (instr->opcode == aco_opcode::p_extract_vector) {
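               /* The extracted element already lives at a known byte offset inside
                * the source vector; reuse that location if the constraints allow. */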
               PhysReg reg = instr->operands[0].physReg();
               reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
               if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg, -1))
                  definition->setFixed(reg);
            } else if (instr->opcode == aco_opcode::p_create_vector) {
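               /* Use the dedicated helper, which tries to position the vector so
                * that killed operands already lie at their target offsets. */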
               PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
                                                   parallelcopy, instr);
               update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
               definition->setFixed(reg);
            } else if (instr_info.classes[(int)instr->opcode] == instr_class::wmma &&
                       instr->operands[2].isTemp() && instr->operands[2].isKill() &&
                       instr->operands[2].regClass() == definition->regClass()) {
               /* For WMMA, the dest needs to either be equal to operands[2], or not overlap it.
                * Here we set a policy of forcing them to be the same if operands[2] gets killed
                * (and otherwise they don't overlap). This may not be optimal if RA would select a
                * different location due to affinity, but that gets complicated very quickly. */
               definition->setFixed(instr->operands[2].physReg());
            }

            if (!definition->isFixed()) {
               Temp tmp = definition->getTemp();
               if (definition->regClass().is_subdword() && definition->bytes() < 4) {
                  PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
                  definition->setFixed(reg);
                  if (reg.byte() || register_file.test(reg, 4)) {
                     bool allow_16bit_write = reg.byte() % 2 == 0 && !register_file.test(reg, 2);
                     add_subdword_definition(program, instr, reg, allow_16bit_write);
                     /* add_subdword_definition can invalidate the reference */
                     definition = &instr->definitions[i];
                  }
               } else {
                  definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
               }
               update_renames(ctx, register_file, parallelcopy, instr,
                              instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
                                                                           : (UpdateRenames)0);
            }

            assert(
               definition->isFixed() &&
               ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
                (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
            ctx.assignments[definition->tempId()].set(*definition);
            register_file.fill(*definition);
         }

         handle_pseudo(ctx, register_file, instr.get());

         /* kill definitions and late-kill operands, and ensure that sub-dword operands
          * can actually be read */
         for (const Definition& def : instr->definitions) {
            if (def.isTemp() && def.isKill())
               register_file.clear(def);
         }
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            const Operand& op = instr->operands[i];
            if (op.isTemp() && op.isFirstKill() && op.isLateKill())
               register_file.clear(op);
            if (op.isTemp() && op.physReg().byte() != 0)
               add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
         }

         emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file);

         /* some instructions need VOP3 encoding if an operand/definition is not assigned to VCC */
         bool instr_needs_vop3 =
            !instr->isVOP3() &&
            ((withoutDPP(instr->format) == Format::VOPC &&
              instr->definitions[0].physReg() != vcc) ||
             (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[2].physReg() != vcc) ||
             ((instr->opcode == aco_opcode::v_add_co_u32 ||
               instr->opcode == aco_opcode::v_addc_co_u32 ||
               instr->opcode == aco_opcode::v_sub_co_u32 ||
               instr->opcode == aco_opcode::v_subb_co_u32 ||
               instr->opcode == aco_opcode::v_subrev_co_u32 ||
               instr->opcode == aco_opcode::v_subbrev_co_u32) &&
              instr->definitions[1].physReg() != vcc) ||
             ((instr->opcode == aco_opcode::v_addc_co_u32 ||
               instr->opcode == aco_opcode::v_subb_co_u32 ||
               instr->opcode == aco_opcode::v_subbrev_co_u32) &&
              instr->operands[2].physReg() != vcc));
         if (instr_needs_vop3) {
            /* If the first operand is a literal, we have to move it to an sgpr
             * for generations without VOP3+literal support.
             * Both literals and sgprs count towards the constant bus limit,
             * so this is always valid.
             */
            if (instr->operands.size() && instr->operands[0].isLiteral() &&
                program->gfx_level < GFX10) {
               /* Re-use the register we already allocated for the definition.
                * This works because the instruction cannot have any other SGPR operand.
                */
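               /* Illustrative example (hypothetical registers), pre-GFX10:
                *    s[0:1] = v_cmp_lt_f32 0x40490fdb, v2
                * becomes
                *    s0 = p_parallelcopy 0x40490fdb
                *    s[0:1] = v_cmp_lt_f32_e64 s0, v2
                * The literal is copied into the low half of the definition's
                * register pair, which the compare then overwrites. */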
               Temp tmp = program->allocateTmp(instr->operands[0].size() == 2 ? s2 : s1);
               const Definition& def =
                  instr->isVOPC() ? instr->definitions[0] : instr->definitions.back();
               assert(def.regClass() == s2);
               ctx.assignments.emplace_back(def.physReg(), tmp.regClass());

               Instruction* copy =
                  create_instruction(aco_opcode::p_parallelcopy, Format::PSEUDO, 1, 1);
               copy->operands[0] = instr->operands[0];
               if (copy->operands[0].bytes() < 4)
                  copy->operands[0] = Operand::c32(copy->operands[0].constantValue());
               copy->definitions[0] = Definition(tmp);
               copy->definitions[0].setFixed(def.physReg());

               instr->operands[0] = Operand(tmp);
               instr->operands[0].setFixed(def.physReg());
               instr->operands[0].setFirstKill(true);

               instructions.emplace_back(copy);
            }

            /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
            instr->format = asVOP3(instr->format);
         }

         instructions.emplace_back(std::move(*instr_it));

      } /* end for Instr */

      if ((block.kind & block_kind_top_level) && block.linear_succs.empty()) {
         /* Reset this for block_kind_resume. */
         ctx.num_linear_vgprs = 0;

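         /* At the end of a top-level block without successors, nothing can be
          * live anymore, so the entire register file should be free again. */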
         ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
         ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false);
         assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds);
         assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds);
      } else if (should_compact_linear_vgprs(ctx, register_file)) {
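         /* Compact the linear VGPRs so that holes between them are removed,
          * reducing fragmentation. The branch is temporarily taken out so that
          * the compaction copies are emitted before it. */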
         aco_ptr<Instruction> br = std::move(instructions.back());
         instructions.pop_back();

         bool temp_in_scc =
            register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);

         std::vector<std::pair<Operand, Definition>> parallelcopy;
         compact_linear_vgprs(ctx, register_file, parallelcopy);
         update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
         emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc,
                                     register_file);

         instructions.push_back(std::move(br));
      }

      block.instructions = std::move(instructions);
   } /* end for BB */

   /* num_gpr = rnd_up(max_used_gpr + 1) */
   program->config->num_vgprs =
      std::min<uint16_t>(get_vgpr_alloc(program, ctx.max_used_vgpr + 1), 256);
   program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);

   program->progress = CompilationProgress::after_ra;
}

} // namespace aco