/*
 * Copyright © 2018 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
6
7 #include "aco_ir.h"
8
9 #include "util/memstream.h"
10 #include "util/ralloc.h"
11
12 #include <array>
13 #include <map>
14 #include <set>
15 #include <vector>
16
17 namespace aco {
18
19 static void
aco_log(Program * program,enum aco_compiler_debug_level level,const char * prefix,const char * file,unsigned line,const char * fmt,va_list args)20 aco_log(Program* program, enum aco_compiler_debug_level level, const char* prefix, const char* file,
21 unsigned line, const char* fmt, va_list args)
22 {
23 char* msg;
24
25 if (program->debug.shorten_messages) {
26 msg = ralloc_vasprintf(NULL, fmt, args);
27 } else {
28 msg = ralloc_strdup(NULL, prefix);
29 ralloc_asprintf_append(&msg, " In file %s:%u\n", file, line);
30 ralloc_asprintf_append(&msg, " ");
31 ralloc_vasprintf_append(&msg, fmt, args);
32 }
33
34 if (program->debug.func)
35 program->debug.func(program->debug.private_data, level, msg);
36
37 fprintf(program->debug.output, "%s\n", msg);
38
39 ralloc_free(msg);
40 }
41
42 void
_aco_err(Program * program,const char * file,unsigned line,const char * fmt,...)43 _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...)
44 {
45 va_list args;
46
47 va_start(args, fmt);
48 aco_log(program, ACO_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args);
49 va_end(args);
50 }
51
52 bool
validate_ir(Program * program)53 validate_ir(Program* program)
54 {
55 bool is_valid = true;
56 auto check = [&program, &is_valid](bool success, const char* msg,
57 aco::Instruction* instr) -> void
58 {
59 if (!success) {
60 char* out;
61 size_t outsize;
62 struct u_memstream mem;
63 u_memstream_open(&mem, &out, &outsize);
64 FILE* const memf = u_memstream_get(&mem);
65
66 fprintf(memf, "%s: ", msg);
67 aco_print_instr(program->gfx_level, instr, memf);
68 u_memstream_close(&mem);
69
70 aco_err(program, "%s", out);
71 free(out);
72
73 is_valid = false;
74 }
75 };
76
77 /* check reachability */
78 if (program->progress < CompilationProgress::after_lower_to_hw) {
79 std::map<uint32_t, std::pair<uint32_t, bool>> def_blocks;
80 for (Block& block : program->blocks) {
81 for (aco_ptr<Instruction>& instr : block.instructions) {
82 for (Definition def : instr->definitions) {
83 if (!def.isTemp())
84 continue;
85 check(!def_blocks.count(def.tempId()), "Temporary defined twice", instr.get());
86 def_blocks[def.tempId()] = std::make_pair(block.index, false);
87 }
88 }
89 }
90
91 for (Block& block : program->blocks) {
92 for (aco_ptr<Instruction>& instr : block.instructions) {
93 for (unsigned i = 0; i < instr->operands.size(); i++) {
94 Operand op = instr->operands[i];
95 if (!op.isTemp())
96 continue;
97
98 uint32_t use_block_idx = block.index;
99 if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_boolean_phi)
100 use_block_idx = block.logical_preds[i];
101 else if (instr->opcode == aco_opcode::p_linear_phi)
102 use_block_idx = block.linear_preds[i];
103
104 auto it = def_blocks.find(op.tempId());
105 if (it != def_blocks.end()) {
106 Block& def_block = program->blocks[it->second.first];
107 Block& use_block = program->blocks[use_block_idx];
108 bool dominates =
109 def_block.index == use_block_idx
110 ? (use_block_idx == block.index ? it->second.second : true)
111 : (op.regClass().is_linear() ? dominates_linear(def_block, use_block)
112 : dominates_logical(def_block, use_block));
113 if (!dominates) {
114 char msg[256];
115 snprintf(msg, sizeof(msg), "Definition of %%%u does not dominate use",
116 op.tempId());
117 check(false, msg, instr.get());
118 }
119 } else {
120 char msg[256];
121 snprintf(msg, sizeof(msg), "%%%u never defined", op.tempId());
122 check(false, msg, instr.get());
123 }
124 }
125
126 for (Definition def : instr->definitions) {
127 if (def.isTemp())
128 def_blocks[def.tempId()].second = true;
129 }
130 }
131 }
132 }
133
134 for (Block& block : program->blocks) {
135 for (aco_ptr<Instruction>& instr : block.instructions) {
136
137 if (program->progress < CompilationProgress::after_lower_to_hw) {
138 for (const Operand& op : instr->operands)
139 check(!op.isTemp() || op.regClass() == program->temp_rc[op.tempId()],
140 "Operand RC not consistent.", instr.get());
141
142 for (const Definition& def : instr->definitions)
143 check(!def.isTemp() || def.regClass() == program->temp_rc[def.tempId()],
144 "Definition RC not consistent.", instr.get());
145 }
146
147 unsigned pck_defs = instr_info.definitions[(int)instr->opcode];
148 unsigned pck_ops = instr_info.operands[(int)instr->opcode];
149
150 if (pck_defs != 0) {
151 /* Before GFX10 v_cmpx also writes VCC. */
152 if (instr->isVOPC() && program->gfx_level < GFX10 && pck_defs == exec_hi)
153 pck_defs = vcc | (exec_hi << 8);
154
155 for (unsigned i = 0; i < 4; i++) {
156 uint32_t def = (pck_defs >> (i * 8)) & 0xff;
157 if (def == 0) {
158 check(i == instr->definitions.size(), "Too many definitions", instr.get());
159 break;
160 } else {
161 check(i < instr->definitions.size(), "Too few definitions", instr.get());
162 if (i >= instr->definitions.size())
163 break;
164 }
165
166 if (def == m0) {
167 check(instr->definitions[i].isFixed() && instr->definitions[i].physReg() == m0,
168 "Definition needs m0", instr.get());
169 } else if (def == scc) {
170 check(instr->definitions[i].isFixed() && instr->definitions[i].physReg() == scc,
171 "Definition needs scc", instr.get());
172 } else if (def == exec_hi) {
173 RegClass rc = instr->isSALU() ? s2 : program->lane_mask;
174 check(instr->definitions[i].isFixed() &&
175 instr->definitions[i].physReg() == exec &&
176 instr->definitions[i].regClass() == rc,
177 "Definition needs exec", instr.get());
178 } else if (def == exec_lo) {
179 check(instr->definitions[i].isFixed() &&
180 instr->definitions[i].physReg() == exec_lo &&
181 instr->definitions[i].regClass() == s1,
182 "Definition needs exec_lo", instr.get());
183 } else if (def == vcc) {
184 check(instr->definitions[i].regClass() == program->lane_mask,
185 "Definition has to be lane mask", instr.get());
186 check(!instr->definitions[i].isFixed() ||
187 instr->definitions[i].physReg() == vcc || instr->isVOP3() ||
188 instr->isSDWA(),
189 "Definition has to be vcc", instr.get());
190 } else {
191 check(instr->definitions[i].size() == def, "Definition has wrong size",
192 instr.get());
193 }
194 }
195 }
196
197 if (pck_ops != 0) {
198 for (unsigned i = 0; i < 4; i++) {
199 uint32_t op = (pck_ops >> (i * 8)) & 0xff;
200 if (op == 0) {
201 check(i == instr->operands.size(), "Too many operands", instr.get());
202 break;
203 } else {
204 check(i < instr->operands.size(), "Too few operands", instr.get());
205 if (i >= instr->operands.size())
206 break;
207 }
208
209 if (op == m0) {
210 check(instr->operands[i].isFixed() && instr->operands[i].physReg() == m0,
211 "Operand needs m0", instr.get());
212 } else if (op == scc) {
213 check(instr->operands[i].isFixed() && instr->operands[i].physReg() == scc,
214 "Operand needs scc", instr.get());
215 } else if (op == exec_hi) {
216 RegClass rc = instr->isSALU() ? s2 : program->lane_mask;
217 check(instr->operands[i].isFixed() && instr->operands[i].physReg() == exec &&
218 instr->operands[i].hasRegClass() && instr->operands[i].regClass() == rc,
219 "Operand needs exec", instr.get());
220 } else if (op == exec_lo) {
221 check(instr->operands[i].isFixed() && instr->operands[i].physReg() == exec_lo &&
222 instr->operands[i].hasRegClass() && instr->operands[i].regClass() == s1,
223 "Operand needs exec_lo", instr.get());
224 } else if (op == vcc) {
225 check(instr->operands[i].hasRegClass() &&
226 instr->operands[i].regClass() == program->lane_mask,
227 "Operand has to be lane mask", instr.get());
228 check(!instr->operands[i].isFixed() || instr->operands[i].physReg() == vcc ||
229 instr->isVOP3(),
230 "Operand has to be vcc", instr.get());
231 } else {
232 check(instr->operands[i].size() == op ||
233 (instr->operands[i].isFixed() && instr->operands[i].physReg() >= 128 &&
234 instr->operands[i].physReg() < 256),
235 "Operand has wrong size", instr.get());
236 }
237 }
238 }
239
240 /* check base format */
241 Format base_format = instr->format;
242 base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::SDWA);
243 base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP16);
244 base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP8);
245 if ((uint32_t)base_format & (uint32_t)Format::VOP1)
246 base_format = Format::VOP1;
247 else if ((uint32_t)base_format & (uint32_t)Format::VOP2)
248 base_format = Format::VOP2;
249 else if ((uint32_t)base_format & (uint32_t)Format::VOPC)
250 base_format = Format::VOPC;
251 else if (base_format == Format::VINTRP) {
252 if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||
253 instr->opcode == aco_opcode::v_interp_p1lv_f16 ||
254 instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
255 instr->opcode == aco_opcode::v_interp_p2_f16 ||
256 instr->opcode == aco_opcode::v_interp_p2_hi_f16) {
257 /* v_interp_*_fp16 are considered VINTRP by the compiler but
258 * they are emitted as VOP3.
259 */
260 base_format = Format::VOP3;
261 } else {
262 base_format = Format::VINTRP;
263 }
264 }
265 check(base_format == instr_info.format[(int)instr->opcode],
266 "Wrong base format for instruction", instr.get());
267
268 /* check VOP3 modifiers */
269 if (instr->isVOP3() && withoutDPP(instr->format) != Format::VOP3) {
270 check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
271 base_format == Format::VOPC || base_format == Format::VINTRP,
272 "Format cannot have VOP3/VOP3B applied", instr.get());
273 }
274
275 if (instr->isDPP()) {
276 check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
277 base_format == Format::VOPC || base_format == Format::VOP3 ||
278 base_format == Format::VOP3P,
279 "Format cannot have DPP applied", instr.get());
280 check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11,
281 "VOP3+DPP is GFX11+ only", instr.get());
282
283 bool fi =
284 instr->isDPP8() ? instr->dpp8().fetch_inactive : instr->dpp16().fetch_inactive;
285 check(!fi || program->gfx_level >= GFX10, "DPP Fetch-Inactive is GFX10+ only",
286 instr.get());
287 }
288
289 /* check SDWA */
290 if (instr->isSDWA()) {
291 check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
292 base_format == Format::VOPC,
293 "Format cannot have SDWA applied", instr.get());
294
295 check(program->gfx_level >= GFX8, "SDWA is GFX8 to GFX10.3 only", instr.get());
296 check(program->gfx_level < GFX11, "SDWA is GFX8 to GFX10.3 only", instr.get());
297
298 SDWA_instruction& sdwa = instr->sdwa();
299 check(sdwa.omod == 0 || program->gfx_level >= GFX9, "SDWA omod only supported on GFX9+",
300 instr.get());
301 if (base_format == Format::VOPC) {
302 check(sdwa.clamp == false || program->gfx_level == GFX8,
303 "SDWA VOPC clamp only supported on GFX8", instr.get());
304 check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) ||
305 program->gfx_level >= GFX9,
306 "SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get());
307 } else {
308 const Definition& def = instr->definitions[0];
309 check(def.bytes() <= 4, "SDWA definitions must not be larger than 4 bytes",
310 instr.get());
311 check(def.bytes() >= sdwa.dst_sel.size() + sdwa.dst_sel.offset(),
312 "SDWA definition selection size must be at most definition size", instr.get());
313 check(
314 sdwa.dst_sel.size() == 1 || sdwa.dst_sel.size() == 2 || sdwa.dst_sel.size() == 4,
315 "SDWA definition selection size must be 1, 2 or 4 bytes", instr.get());
316 check(sdwa.dst_sel.offset() % sdwa.dst_sel.size() == 0, "Invalid selection offset",
317 instr.get());
318 check(def.bytes() == 4 || def.bytes() == sdwa.dst_sel.size(),
319 "SDWA dst_sel size must be definition size for subdword definitions",
320 instr.get());
321 check(def.bytes() == 4 || sdwa.dst_sel.offset() == 0,
322 "SDWA dst_sel offset must be 0 for subdword definitions", instr.get());
323 }
324
325 for (unsigned i = 0; i < std::min<unsigned>(2, instr->operands.size()); i++) {
326 const Operand& op = instr->operands[i];
327 check(op.bytes() <= 4, "SDWA operands must not be larger than 4 bytes", instr.get());
328 check(op.bytes() >= sdwa.sel[i].size() + sdwa.sel[i].offset(),
329 "SDWA operand selection size must be at most operand size", instr.get());
330 check(sdwa.sel[i].size() == 1 || sdwa.sel[i].size() == 2 || sdwa.sel[i].size() == 4,
331 "SDWA operand selection size must be 1, 2 or 4 bytes", instr.get());
332 check(sdwa.sel[i].offset() % sdwa.sel[i].size() == 0, "Invalid selection offset",
333 instr.get());
334 }
335 if (instr->operands.size() >= 3) {
336 check(instr->operands[2].isFixed() && instr->operands[2].physReg() == vcc,
337 "3rd operand must be fixed to vcc with SDWA", instr.get());
338 }
339 if (instr->definitions.size() >= 2) {
340 check(instr->definitions[1].isFixed() && instr->definitions[1].physReg() == vcc,
341 "2nd definition must be fixed to vcc with SDWA", instr.get());
342 }
343
344 const bool sdwa_opcodes =
345 instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 &&
346 instr->opcode != aco_opcode::v_fmamk_f32 &&
347 instr->opcode != aco_opcode::v_fmaak_f32 &&
348 instr->opcode != aco_opcode::v_fmamk_f16 &&
349 instr->opcode != aco_opcode::v_fmaak_f16 &&
350 instr->opcode != aco_opcode::v_madmk_f32 &&
351 instr->opcode != aco_opcode::v_madak_f32 &&
352 instr->opcode != aco_opcode::v_madmk_f16 &&
353 instr->opcode != aco_opcode::v_madak_f16 &&
354 instr->opcode != aco_opcode::v_readfirstlane_b32 &&
355 instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
356
357 const bool feature_mac =
358 program->gfx_level == GFX8 &&
359 (instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16);
360
361 check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get());
362 }
363
364 /* check opsel */
365 if (instr->opcode == aco_opcode::v_permlane16_b32 ||
366 instr->opcode == aco_opcode::v_permlanex16_b32) {
367 check(instr->valu().opsel <= 0x3, "Unexpected opsel for permlane", instr.get());
368 } else if (instr->isVOP3() || instr->isVOP1() || instr->isVOP2() || instr->isVOPC()) {
369 VALU_instruction& valu = instr->valu();
370 check(valu.opsel == 0 || program->gfx_level >= GFX9, "Opsel is only supported on GFX9+",
371 instr.get());
372 check(valu.opsel == 0 || instr->format == Format::VOP3 || program->gfx_level >= GFX11,
373 "Opsel is only supported for VOP3 before GFX11", instr.get());
374
375 for (unsigned i = 0; i < 3; i++) {
376 if (i >= instr->operands.size() ||
377 (!instr->isVOP3() && !instr->operands[i].isOfType(RegType::vgpr)) ||
378 (instr->operands[i].hasRegClass() &&
379 instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
380 check(!valu.opsel[i], "Unexpected opsel for operand", instr.get());
381 }
382 if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed())
383 check(!valu.opsel[3], "Unexpected opsel for sub-dword definition", instr.get());
384 } else if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
385 instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
386 instr->opcode == aco_opcode::v_fma_mix_f32) {
387 check(instr->definitions[0].regClass() ==
388 (instr->opcode == aco_opcode::v_fma_mix_f32 ? v1 : v2b),
389 "v_fma_mix_f32/v_fma_mix_f16 must have v1/v2b definition", instr.get());
390 } else if (instr->isVOP3P()) {
391 VALU_instruction& vop3p = instr->valu();
392 for (unsigned i = 0; i < instr->operands.size(); i++) {
393 if (instr->operands[i].hasRegClass() &&
394 instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())
395 check(!vop3p.opsel_lo[i] && !vop3p.opsel_hi[i],
396 "Unexpected opsel for subdword operand", instr.get());
397 }
398 check(instr->definitions[0].regClass() == v1 ||
399 instr_info.classes[(int)instr->opcode] == instr_class::wmma,
400 "VOP3P must have v1 definition", instr.get());
401 }
402
403 /* check for undefs */
404 for (unsigned i = 0; i < instr->operands.size(); i++) {
405 if (instr->operands[i].isUndefined()) {
406 bool flat = instr->isFlatLike();
407 bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
408 instr->opcode == aco_opcode::p_create_vector ||
409 instr->opcode == aco_opcode::p_start_linear_vgpr ||
410 instr->opcode == aco_opcode::p_jump_to_epilog ||
411 instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
412 instr->opcode == aco_opcode::p_end_with_regs ||
413 (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
414 (instr->opcode == aco_opcode::p_bpermute_permlane && i == 0) ||
415 (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
416 ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
417 (instr->isScratch() && i == 0) || (instr->isDS() && i == 0) ||
418 (instr->opcode == aco_opcode::p_init_scratch && i == 0);
419 check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
420 } else {
421 check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||
422 instr->operands[i].isConstant(),
423 "Uninitialized Operand", instr.get());
424 }
425 }
426
427 for (Operand& op : instr->operands) {
428 if (op.isFixed() || !op.hasRegClass() || !op.regClass().is_linear_vgpr() ||
429 op.isUndefined())
430 continue;
431
432 /* Only kill linear VGPRs in top-level blocks. Otherwise, we might have to move linear
433 * VGPRs to make space for normal ones and that isn't possible inside control flow. */
434 if (op.isKill()) {
435 check(block.kind & block_kind_top_level,
436 "Linear VGPR operands must only be killed at top-level blocks", instr.get());
437 }
438 }
439
440 /* check subdword definitions */
441 for (unsigned i = 0; i < instr->definitions.size(); i++) {
442 if (instr->definitions[i].regClass().is_subdword())
443 check(instr->definitions[i].bytes() <= 4 || instr->isPseudo() || instr->isVMEM(),
444 "Only Pseudo and VMEM instructions can write subdword registers > 4 bytes",
445 instr.get());
446 }
447
448 if ((instr->isSALU() && instr->opcode != aco_opcode::p_constaddr_addlo &&
449 instr->opcode != aco_opcode::p_resumeaddr_addlo) ||
450 instr->isVALU()) {
451 /* check literals */
452 Operand literal(s1);
453 for (unsigned i = 0; i < instr->operands.size(); i++) {
454 Operand op = instr->operands[i];
455 if (!op.isLiteral())
456 continue;
457
458 check(!instr->isDPP() && !instr->isSDWA() &&
459 (!instr->isVOP3() || program->gfx_level >= GFX10) &&
460 (!instr->isVOP3P() || program->gfx_level >= GFX10),
461 "Literal applied on wrong instruction format", instr.get());
462
463 check(literal.isUndefined() || (literal.size() == op.size() &&
464 literal.constantValue() == op.constantValue()),
465 "Only 1 Literal allowed", instr.get());
466 literal = op;
467 check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2,
468 "Wrong source position for Literal argument", instr.get());
469 }
470
471 /* check num sgprs for VALU */
472 if (instr->isVALU()) {
473 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64_e64 ||
474 instr->opcode == aco_opcode::v_lshlrev_b64 ||
475 instr->opcode == aco_opcode::v_lshrrev_b64 ||
476 instr->opcode == aco_opcode::v_ashrrev_i64;
477 unsigned const_bus_limit = 1;
478 if (program->gfx_level >= GFX10 && !is_shift64)
479 const_bus_limit = 2;
480
481 uint32_t scalar_mask =
482 instr->isVOP3() || instr->isVOP3P() || instr->isVINTERP_INREG() ? 0x7 : 0x5;
483 if (instr->isSDWA())
484 scalar_mask = program->gfx_level >= GFX9 ? 0x7 : 0x4;
485 else if (instr->isDPP())
486 scalar_mask = 0x4;
487
488 if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 ||
489 instr->opcode == aco_opcode::v_readlane_b32 ||
490 instr->opcode == aco_opcode::v_readlane_b32_e64 ||
491 instr_info.classes[(int)instr->opcode] ==
492 instr_class::valu_pseudo_scalar_trans) {
493 check(instr->definitions[0].regClass().type() == RegType::sgpr,
494 "Wrong Definition type for VALU instruction", instr.get());
495 } else {
496 check(instr->definitions[0].regClass().type() == RegType::vgpr,
497 "Wrong Definition type for VALU instruction", instr.get());
498 }
499
500 unsigned num_sgprs = 0;
501 unsigned sgpr[] = {0, 0};
502 for (unsigned i = 0; i < instr->operands.size(); i++) {
503 Operand op = instr->operands[i];
504 if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
505 instr->opcode == aco_opcode::v_readlane_b32 ||
506 instr->opcode == aco_opcode::v_readlane_b32_e64) {
507 check(i != 1 || op.isOfType(RegType::sgpr) || op.isConstant(),
508 "Must be a SGPR or a constant", instr.get());
509 check(i == 1 || (op.isOfType(RegType::vgpr) && op.bytes() <= 4),
510 "Wrong Operand type for VALU instruction", instr.get());
511 continue;
512 }
513 if (instr->opcode == aco_opcode::v_permlane16_b32 ||
514 instr->opcode == aco_opcode::v_permlanex16_b32 ||
515 instr->opcode == aco_opcode::v_permlane64_b32) {
516 check(i != 0 || op.isOfType(RegType::vgpr),
517 "Operand 0 of v_permlane must be VGPR", instr.get());
518 check(i == 0 || op.isOfType(RegType::sgpr) || op.isConstant(),
519 "Lane select operands of v_permlane must be SGPR or constant",
520 instr.get());
521 }
522
523 if (instr->opcode == aco_opcode::v_writelane_b32 ||
524 instr->opcode == aco_opcode::v_writelane_b32_e64) {
525 check(i != 2 || (op.isOfType(RegType::vgpr) && op.bytes() <= 4),
526 "Wrong Operand type for VALU instruction", instr.get());
527 check(i == 2 || op.isOfType(RegType::sgpr) || op.isConstant(),
528 "Must be a SGPR or a constant", instr.get());
529 continue;
530 }
531 if (op.isOfType(RegType::sgpr)) {
532 check(scalar_mask & (1 << i), "Wrong source position for SGPR argument",
533 instr.get());
534
535 if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
536 if (num_sgprs < 2)
537 sgpr[num_sgprs++] = op.tempId();
538 }
539 }
540
541 if (op.isConstant() && !op.isLiteral())
542 check(scalar_mask & (1 << i), "Wrong source position for constant argument",
543 instr.get());
544 }
545 check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit,
546 "Too many SGPRs/literals", instr.get());
547
548 /* Validate modifiers. */
549 check(!instr->valu().opsel || instr->isVOP3() || instr->isVOP1() ||
550 instr->isVOP2() || instr->isVOPC() || instr->isVINTERP_INREG(),
551 "OPSEL set for unsupported instruction format", instr.get());
552 check(!instr->valu().opsel_lo || instr->isVOP3P(),
553 "OPSEL_LO set for unsupported instruction format", instr.get());
554 check(!instr->valu().opsel_hi || instr->isVOP3P(),
555 "OPSEL_HI set for unsupported instruction format", instr.get());
556 check(!instr->valu().omod || instr->isVOP3() || instr->isSDWA(),
557 "OMOD set for unsupported instruction format", instr.get());
558 check(!instr->valu().clamp || instr->isVOP3() || instr->isVOP3P() ||
559 instr->isSDWA() || instr->isVINTERP_INREG(),
560 "CLAMP set for unsupported instruction format", instr.get());
561
562 for (bool abs : instr->valu().abs) {
563 check(!abs || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() ||
564 instr->isDPP16(),
565 "ABS/NEG_HI set for unsupported instruction format", instr.get());
566 }
567 for (bool neg : instr->valu().neg) {
568 check(!neg || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() ||
569 instr->isDPP16() || instr->isVINTERP_INREG(),
570 "NEG/NEG_LO set for unsupported instruction format", instr.get());
571 }
572 }
573
574 if (instr->isSOP1() || instr->isSOP2()) {
575 if (!instr->definitions.empty())
576 check(instr->definitions[0].regClass().type() == RegType::sgpr,
577 "Wrong Definition type for SALU instruction", instr.get());
578 for (const Operand& op : instr->operands) {
579 check(op.isConstant() || op.isOfType(RegType::sgpr),
580 "Wrong Operand type for SALU instruction", instr.get());
581 }
582 }
583 }
584
585 switch (instr->format) {
586 case Format::PSEUDO: {
587 if (instr->opcode == aco_opcode::p_create_vector ||
588 instr->opcode == aco_opcode::p_start_linear_vgpr) {
589 unsigned size = 0;
590 for (const Operand& op : instr->operands) {
591 check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
592 size += op.bytes();
593 }
594 if (!instr->operands.empty() || instr->opcode == aco_opcode::p_create_vector) {
595 check(size == instr->definitions[0].bytes(),
596 "Definition size does not match operand sizes", instr.get());
597 }
598 if (instr->definitions[0].regClass().type() == RegType::sgpr) {
599 for (const Operand& op : instr->operands) {
600 check(op.isConstant() || op.regClass().type() == RegType::sgpr,
601 "Wrong Operand type for scalar vector", instr.get());
602 }
603 }
604 if (instr->opcode == aco_opcode::p_start_linear_vgpr)
605 check(instr->definitions[0].regClass().is_linear_vgpr(),
606 "Definition must be linear VGPR", instr.get());
607 } else if (instr->opcode == aco_opcode::p_extract_vector) {
608 check(!instr->operands[0].isConstant() && instr->operands[1].isConstant(),
609 "Wrong Operand types", instr.get());
610 check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <=
611 instr->operands[0].bytes(),
612 "Index out of range", instr.get());
613 check(instr->definitions[0].regClass().type() == RegType::vgpr ||
614 instr->operands[0].regClass().type() == RegType::sgpr,
615 "Cannot extract SGPR value from VGPR vector", instr.get());
616 check(program->gfx_level >= GFX9 ||
617 !instr->definitions[0].regClass().is_subdword() ||
618 instr->operands[0].regClass().type() == RegType::vgpr,
619 "Cannot extract subdword from SGPR before GFX9+", instr.get());
620 } else if (instr->opcode == aco_opcode::p_split_vector) {
621 check(!instr->operands[0].isConstant(), "Operand must not be constant", instr.get());
622 unsigned size = 0;
623 for (const Definition& def : instr->definitions) {
624 size += def.bytes();
625 }
626 check(size == instr->operands[0].bytes(),
627 "Operand size does not match definition sizes", instr.get());
628 if (instr->operands[0].isOfType(RegType::vgpr)) {
629 for (const Definition& def : instr->definitions)
630 check(def.regClass().type() == RegType::vgpr,
631 "Wrong Definition type for VGPR split_vector", instr.get());
632 } else {
633 for (const Definition& def : instr->definitions)
634 check(program->gfx_level >= GFX9 || !def.regClass().is_subdword(),
635 "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
636 }
637 } else if (instr->opcode == aco_opcode::p_parallelcopy) {
638 check(instr->definitions.size() == instr->operands.size(),
639 "Number of Operands does not match number of Definitions", instr.get());
640 for (unsigned i = 0; i < instr->operands.size(); i++) {
641 check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
642 "Operand and Definition size must match", instr.get());
643 if (instr->operands[i].hasRegClass()) {
644 check((instr->definitions[i].regClass().type() ==
645 instr->operands[i].regClass().type()) ||
646 (instr->definitions[i].regClass().type() == RegType::vgpr &&
647 instr->operands[i].regClass().type() == RegType::sgpr),
648 "Operand and Definition types do not match", instr.get());
649 check(instr->definitions[i].regClass().is_linear_vgpr() ==
650 instr->operands[i].regClass().is_linear_vgpr(),
651 "Operand and Definition types do not match", instr.get());
652 } else {
653 check(!instr->definitions[i].regClass().is_linear_vgpr(),
654 "Can only copy linear VGPRs into linear VGPRs, not constant/undef",
655 instr.get());
656 }
657 }
658 } else if (instr->opcode == aco_opcode::p_phi) {
659 check(instr->operands.size() == block.logical_preds.size(),
660 "Number of Operands does not match number of predecessors", instr.get());
661 check(instr->definitions[0].regClass().type() == RegType::vgpr,
662 "Logical Phi Definition must be vgpr", instr.get());
663 for (const Operand& op : instr->operands)
664 check(instr->definitions[0].size() == op.size(),
665 "Operand sizes must match Definition size", instr.get());
666 } else if (instr->opcode == aco_opcode::p_linear_phi) {
667 for (const Operand& op : instr->operands) {
668 check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type",
669 instr.get());
670 check(instr->definitions[0].size() == op.size(),
671 "Operand sizes must match Definition size", instr.get());
672 }
673 check(instr->operands.size() == block.linear_preds.size(),
674 "Number of Operands does not match number of predecessors", instr.get());
675 } else if (instr->opcode == aco_opcode::p_extract ||
676 instr->opcode == aco_opcode::p_insert) {
677 check(!instr->operands[0].isConstant(), "Data operand must not be constant",
678 instr.get());
679 check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
680 if (instr->opcode == aco_opcode::p_extract)
681 check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
682 instr.get());
683
684 check(instr->definitions[0].regClass().type() != RegType::sgpr ||
685 instr->operands[0].regClass().type() == RegType::sgpr,
686 "Can't extract/insert VGPR to SGPR", instr.get());
687
688 if (instr->opcode == aco_opcode::p_insert)
689 check(instr->operands[0].bytes() == instr->definitions[0].bytes(),
690 "Sizes of p_insert data operand and definition must match", instr.get());
691
692 if (instr->definitions[0].regClass().type() == RegType::sgpr)
693 check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
694 instr->definitions[1].physReg() == scc,
695 "SGPR extract/insert needs an SCC definition", instr.get());
696
697 unsigned data_bits = instr->operands[0].bytes() * 8u;
698 unsigned op_bits = instr->operands[2].constantValue();
699
700 if (instr->opcode == aco_opcode::p_insert) {
701 check(op_bits == 8 || op_bits == 16, "Size must be 8 or 16", instr.get());
702 check(op_bits < data_bits, "Size must be smaller than source", instr.get());
703 } else if (instr->opcode == aco_opcode::p_extract) {
704 check(op_bits == 8 || op_bits == 16 || op_bits == 32,
705 "Size must be 8 or 16 or 32", instr.get());
706 check(data_bits >= op_bits, "Can't extract more bits than what the data has.",
707 instr.get());
708 }
709
710 unsigned comp = data_bits / MAX2(op_bits, 1);
711 check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
712 instr.get());
713 } else if (instr->opcode == aco_opcode::p_jump_to_epilog) {
714 check(instr->definitions.size() == 0, "p_jump_to_epilog must have 0 definitions",
715 instr.get());
716 check(instr->operands.size() > 0 && instr->operands[0].isOfType(RegType::sgpr) &&
717 instr->operands[0].size() == 2,
718 "First operand of p_jump_to_epilog must be a SGPR", instr.get());
719 for (unsigned i = 1; i < instr->operands.size(); i++) {
720 check(instr->operands[i].isOfType(RegType::vgpr) ||
721 instr->operands[i].isOfType(RegType::sgpr) ||
722 instr->operands[i].isUndefined(),
723 "Other operands of p_jump_to_epilog must be VGPRs, SGPRs or undef",
724 instr.get());
725 }
726 } else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) {
727 check(instr->definitions.size() == 6,
728 "p_dual_src_export_gfx11 must have 6 definitions", instr.get());
729 check(instr->definitions[2].regClass() == program->lane_mask,
730 "Third definition of p_dual_src_export_gfx11 must be a lane mask",
731 instr.get());
732 check(instr->definitions[3].regClass() == program->lane_mask,
733 "Fourth definition of p_dual_src_export_gfx11 must be a lane mask",
734 instr.get());
735 check(instr->definitions[4].physReg() == vcc,
736 "Fifth definition of p_dual_src_export_gfx11 must be vcc", instr.get());
737 check(instr->definitions[5].physReg() == scc,
738 "Sixth definition of p_dual_src_export_gfx11 must be scc", instr.get());
739 check(instr->operands.size() == 8, "p_dual_src_export_gfx11 must have 8 operands",
740 instr.get());
741 for (unsigned i = 0; i < instr->operands.size(); i++) {
742 check(
743 instr->operands[i].isOfType(RegType::vgpr) || instr->operands[i].isUndefined(),
744 "Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get());
745 }
746 }
747 break;
748 }
749 case Format::PSEUDO_REDUCTION: {
750 for (const Operand& op : instr->operands)
751 check(op.regClass().type() == RegType::vgpr,
752 "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
753 instr.get());
754
755 if (instr->opcode == aco_opcode::p_reduce &&
756 instr->reduction().cluster_size == program->wave_size)
757 check(instr->definitions[0].regClass().type() == RegType::sgpr ||
758 program->wave_size == 32,
759 "The result of unclustered reductions must go into an SGPR.", instr.get());
760 else
761 check(instr->definitions[0].regClass().type() == RegType::vgpr,
762 "The result of scans and clustered reductions must go into a VGPR.",
763 instr.get());
764
765 break;
766 }
767 case Format::SMEM: {
768 if (instr->operands.size() >= 1)
769 check(instr->operands[0].isOfType(RegType::sgpr), "SMEM operands must be sgpr",
770 instr.get());
771 if (instr->operands.size() >= 2)
772 check(instr->operands[1].isConstant() || instr->operands[1].isOfType(RegType::sgpr),
773 "SMEM offset must be constant or sgpr", instr.get());
774 if (!instr->definitions.empty())
775 check(instr->definitions[0].regClass().type() == RegType::sgpr,
776 "SMEM result must be sgpr", instr.get());
777 break;
778 }
779 case Format::MTBUF:
780 case Format::MUBUF: {
781 check(instr->operands.size() > 1, "VMEM instructions must have at least one operand",
782 instr.get());
783 check(instr->operands[1].isOfType(RegType::vgpr),
784 "VADDR must be in vgpr for VMEM instructions", instr.get());
785 check(instr->operands[0].isOfType(RegType::sgpr), "VMEM resource constant must be sgpr",
786 instr.get());
787 check(instr->operands.size() < 4 || instr->operands[3].isOfType(RegType::vgpr),
788 "VMEM write data must be vgpr", instr.get());
789 if (instr->operands.size() >= 3 && instr->operands[2].isConstant())
790 check(program->gfx_level < GFX12 || instr->operands[2].constantValue() == 0,
791 "VMEM SOFFSET must not be non-zero constant on GFX12+", instr.get());
792
793 const bool d16 =
794 instr->opcode ==
795 aco_opcode::buffer_load_dword || // FIXME: used to spill subdword variables
796 instr->opcode == aco_opcode::buffer_load_ubyte ||
797 instr->opcode == aco_opcode::buffer_load_sbyte ||
798 instr->opcode == aco_opcode::buffer_load_ushort ||
799 instr->opcode == aco_opcode::buffer_load_sshort ||
800 instr->opcode == aco_opcode::buffer_load_ubyte_d16 ||
801 instr->opcode == aco_opcode::buffer_load_ubyte_d16_hi ||
802 instr->opcode == aco_opcode::buffer_load_sbyte_d16 ||
803 instr->opcode == aco_opcode::buffer_load_sbyte_d16_hi ||
804 instr->opcode == aco_opcode::buffer_load_short_d16 ||
805 instr->opcode == aco_opcode::buffer_load_short_d16_hi ||
806 instr->opcode == aco_opcode::buffer_load_format_d16_x ||
807 instr->opcode == aco_opcode::buffer_load_format_d16_hi_x ||
808 instr->opcode == aco_opcode::buffer_load_format_d16_xy ||
809 instr->opcode == aco_opcode::buffer_load_format_d16_xyz ||
810 instr->opcode == aco_opcode::buffer_load_format_d16_xyzw ||
811 instr->opcode == aco_opcode::tbuffer_load_format_d16_x ||
812 instr->opcode == aco_opcode::tbuffer_load_format_d16_xy ||
813 instr->opcode == aco_opcode::tbuffer_load_format_d16_xyz ||
814 instr->opcode == aco_opcode::tbuffer_load_format_d16_xyzw;
815 if (instr->definitions.size()) {
816 check(instr->definitions[0].regClass().type() == RegType::vgpr,
817 "VMEM definitions[0] (VDATA) must be VGPR", instr.get());
818 check(d16 || !instr->definitions[0].regClass().is_subdword(),
819 "Only D16 opcodes can load subdword values.", instr.get());
820 check(instr->definitions[0].bytes() <= 8 || !d16,
821 "D16 opcodes can only load up to 8 bytes.", instr.get());
822 }
823 break;
824 }
825 case Format::MIMG: {
826 check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
827 instr.get());
828 check(instr->operands[0].hasRegClass() &&
829 (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
830 "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
831 if (instr->operands[1].hasRegClass())
832 check(instr->operands[1].regClass() == s4,
833 "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
834 if (!instr->operands[2].isUndefined()) {
835 bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
836 instr->opcode == aco_opcode::image_atomic_fcmpswap;
837 check(instr->definitions.empty() ||
838 (instr->definitions[0].regClass() == instr->operands[2].regClass() ||
839 is_cmpswap),
840 "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
841 "TFE/LWE loads",
842 instr.get());
843 }
844
845 if (instr->mimg().strict_wqm) {
846 check(instr->operands[3].hasRegClass() &&
847 instr->operands[3].regClass().is_linear_vgpr(),
848 "MIMG operands[3] must be temp linear VGPR.", instr.get());
849
850 unsigned total_size = 0;
851 for (unsigned i = 4; i < instr->operands.size(); i++) {
852 check(instr->operands[i].hasRegClass() && instr->operands[i].regClass() == v1,
853 "MIMG operands[4+] (VADDR) must be v1", instr.get());
854 total_size += instr->operands[i].bytes();
855 }
856 check(total_size <= instr->operands[3].bytes(),
857 "MIMG operands[4+] must fit within operands[3].", instr.get());
858 } else {
859 check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
860 "NSA is only supported on GFX10+", instr.get());
861 for (unsigned i = 3; i < instr->operands.size(); i++) {
862 check(instr->operands[i].hasRegClass() &&
863 instr->operands[i].regClass().type() == RegType::vgpr,
864 "MIMG operands[3+] (VADDR) must be VGPR", instr.get());
865 if (instr->operands.size() > 4) {
866 if (program->gfx_level < GFX11) {
867 check(instr->operands[i].regClass() == v1,
868 "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
869 } else {
870 unsigned num_scalar =
871 program->gfx_level >= GFX12 ? (instr->operands.size() - 4) : 4;
872 if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
873 instr->opcode != aco_opcode::image_bvh64_intersect_ray &&
874 i < 3 + num_scalar) {
875 check(instr->operands[i].regClass() == v1,
876 "first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
877 }
878 }
879 }
880 }
881 }
882
883 if (instr->definitions.size()) {
884 check(instr->definitions[0].regClass().type() == RegType::vgpr,
885 "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
886 check(instr->mimg().d16 || !instr->definitions[0].regClass().is_subdword(),
887 "Only D16 MIMG instructions can load subdword values.", instr.get());
888 check(instr->definitions[0].bytes() <= 8 || !instr->mimg().d16,
889 "D16 MIMG instructions can only load up to 8 bytes.", instr.get());
890 }
891 break;
892 }
893 case Format::DS: {
894 for (const Operand& op : instr->operands) {
895 check(op.isOfType(RegType::vgpr) || op.physReg() == m0 || op.isUndefined(),
896 "Only VGPRs are valid DS instruction operands", instr.get());
897 }
898 if (!instr->definitions.empty())
899 check(instr->definitions[0].regClass().type() == RegType::vgpr,
900 "DS instruction must return VGPR", instr.get());
901 break;
902 }
903 case Format::EXP: {
904 for (unsigned i = 0; i < 4; i++)
905 check(instr->operands[i].isOfType(RegType::vgpr),
906 "Only VGPRs are valid Export arguments", instr.get());
907 break;
908 }
909 case Format::FLAT:
910 check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
911 instr.get());
912 FALLTHROUGH;
913 case Format::GLOBAL:
914 check(instr->operands[0].isOfType(RegType::vgpr), "FLAT/GLOBAL address must be vgpr",
915 instr.get());
916 FALLTHROUGH;
917 case Format::SCRATCH: {
918 check(instr->operands[0].isOfType(RegType::vgpr),
919 "FLAT/GLOBAL/SCRATCH address must be undefined or vgpr", instr.get());
920 check(instr->operands[1].isOfType(RegType::sgpr),
921 "FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
922 if (instr->format == Format::SCRATCH && program->gfx_level < GFX10_3)
923 check(!instr->operands[0].isUndefined() || !instr->operands[1].isUndefined(),
924 "SCRATCH must have either SADDR or ADDR operand", instr.get());
925 if (!instr->definitions.empty())
926 check(instr->definitions[0].regClass().type() == RegType::vgpr,
927 "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
928 else
929 check(instr->operands[2].isOfType(RegType::vgpr),
930 "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
931 break;
932 }
933 case Format::LDSDIR: {
934 check(instr->definitions.size() == 1 && instr->definitions[0].regClass() == v1,
935 "LDSDIR must have an v1 definition", instr.get());
936 check(instr->operands.size() == 1, "LDSDIR must have an operand", instr.get());
937 if (!instr->operands.empty()) {
938 check(instr->operands[0].regClass() == s1, "LDSDIR must have an s1 operand",
939 instr.get());
940 check(instr->operands[0].isFixed() && instr->operands[0].physReg() == m0,
941 "LDSDIR must have an operand fixed to m0", instr.get());
942 }
943 break;
944 }
945 default: break;
946 }
947 }
948 }
949
950 return is_valid;
951 }
952
953 bool
validate_cfg(Program * program)954 validate_cfg(Program* program)
955 {
956 if (!(debug_flags & DEBUG_VALIDATE_IR))
957 return true;
958
959 bool is_valid = true;
960 auto check_block = [&program, &is_valid](bool success, const char* msg,
961 aco::Block* block) -> void
962 {
963 if (!success) {
964 aco_err(program, "%s: BB%u", msg, block->index);
965 is_valid = false;
966 }
967 };
968
969 /* validate CFG */
970 for (unsigned i = 0; i < program->blocks.size(); i++) {
971 Block& block = program->blocks[i];
972 check_block(block.index == i, "block.index must match actual index", &block);
973
974 /* predecessors/successors should be sorted */
975 for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
976 check_block(block.linear_preds[j] < block.linear_preds[j + 1],
977 "linear predecessors must be sorted", &block);
978 for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
979 check_block(block.logical_preds[j] < block.logical_preds[j + 1],
980 "logical predecessors must be sorted", &block);
981 for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
982 check_block(block.linear_succs[j] < block.linear_succs[j + 1],
983 "linear successors must be sorted", &block);
984 for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
985 check_block(block.logical_succs[j] < block.logical_succs[j + 1],
986 "logical successors must be sorted", &block);
987
988 /* critical edges are not allowed */
989 if (block.linear_preds.size() > 1) {
990 for (unsigned pred : block.linear_preds)
991 check_block(program->blocks[pred].linear_succs.size() == 1,
992 "linear critical edges are not allowed", &program->blocks[pred]);
993 for (unsigned pred : block.logical_preds)
994 check_block(program->blocks[pred].logical_succs.size() == 1,
995 "logical critical edges are not allowed", &program->blocks[pred]);
996 }
997 }
998
999 return is_valid;
1000 }
1001
/* Recomputes live-variable analysis from scratch and verifies that the
 * register-demand values and live-in sets stored on the program match the
 * fresh results. Returns true if everything is consistent (or the check is
 * disabled via debug_flags). */
bool
validate_live_vars(Program* program)
{
   if (!(debug_flags & DEBUG_VALIDATE_LIVE_VARS))
      return true;

   bool is_valid = true;
   /* Snapshot the previously computed results before live_var_analysis()
    * overwrites them. The live-in sets and their backing memory are moved
    * (not copied); keeping old_memory alive keeps prev_live_in valid for the
    * comparison below. */
   const int prev_num_waves = program->num_waves;
   const monotonic_buffer_resource old_memory = std::move(program->live.memory);
   const std::vector<IDSet> prev_live_in = std::move(program->live.live_in);
   const RegisterDemand prev_max_demand = program->max_reg_demand;
   std::vector<RegisterDemand> block_demands(program->blocks.size());
   std::vector<RegisterDemand> live_in_demands(program->blocks.size());
   std::vector<std::vector<RegisterDemand>> register_demands(program->blocks.size());

   /* Save the per-block and per-instruction register demands. */
   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& b = program->blocks[i];
      block_demands[i] = b.register_demand;
      live_in_demands[i] = b.live_in_demand;
      register_demands[i].reserve(b.instructions.size());
      for (unsigned j = 0; j < b.instructions.size(); j++)
         register_demands[i].emplace_back(b.instructions[j]->register_demand);
   }

   /* Recompute everything; results land back in program->live and the
    * blocks'/instructions' demand fields. */
   aco::live_var_analysis(program);

   /* Validate RegisterDemand calculation */
   for (unsigned i = 0; i < program->blocks.size(); i++) {
      Block& b = program->blocks[i];

      if (!(b.register_demand == block_demands[i])) {
         is_valid = false;
         aco_err(program,
                 "Register Demand not updated correctly for BB%d: got (%3u vgpr, %3u sgpr), but "
                 "should be (%3u vgpr, %3u sgpr)",
                 i, block_demands[i].vgpr, block_demands[i].sgpr, b.register_demand.vgpr,
                 b.register_demand.sgpr);
      }
      if (!(b.live_in_demand == live_in_demands[i])) {
         is_valid = false;
         aco_err(program,
                 "Live-in Demand not updated correctly for BB%d: got (%3u vgpr, %3u sgpr), but "
                 "should be (%3u vgpr, %3u sgpr)",
                 i, live_in_demands[i].vgpr, live_in_demands[i].sgpr, b.live_in_demand.vgpr,
                 b.live_in_demand.sgpr);
      }

      for (unsigned j = 0; j < b.instructions.size(); j++) {
         if (b.instructions[j]->register_demand == register_demands[i][j])
            continue;

         /* Build the error message in memory so the offending instruction can
          * be printed into the same report. */
         char* out;
         size_t outsize;
         struct u_memstream mem;
         u_memstream_open(&mem, &out, &outsize);
         FILE* const memf = u_memstream_get(&mem);

         fprintf(memf,
                 "Register Demand not updated correctly: got (%3u vgpr, %3u sgpr), but should be "
                 "(%3u vgpr, %3u sgpr): \n\t",
                 register_demands[i][j].vgpr, register_demands[i][j].sgpr,
                 b.instructions[j]->register_demand.vgpr, b.instructions[j]->register_demand.sgpr);
         aco_print_instr(program->gfx_level, b.instructions[j].get(), memf, print_kill);
         u_memstream_close(&mem);

         aco_err(program, "%s", out);
         free(out);

         is_valid = false;
      }
   }
   if (!(program->max_reg_demand == prev_max_demand) || program->num_waves != prev_num_waves) {
      is_valid = false;
      aco_err(program,
              "Max Register Demand and Num Waves not updated correctly: got (%3u vgpr, %3u sgpr) "
              "and %2u waves, but should be (%3u vgpr, %3u sgpr) and %2u waves",
              prev_max_demand.vgpr, prev_max_demand.sgpr, prev_num_waves,
              program->max_reg_demand.vgpr, program->max_reg_demand.sgpr, program->num_waves);
   }

   /* Validate Live-in sets: report the symmetric difference between the old
    * and the recomputed set for each block. */
   for (unsigned i = 0; i < program->blocks.size(); i++) {
      if (prev_live_in[i] != program->live.live_in[i]) {
         char* out;
         size_t outsize;
         struct u_memstream mem;
         u_memstream_open(&mem, &out, &outsize);
         FILE* const memf = u_memstream_get(&mem);

         fprintf(memf, "Live-in set not updated correctly for BB%d:", i);
         fprintf(memf, "\nMissing values: ");
         for (unsigned t : program->live.live_in[i]) {
            if (prev_live_in[i].count(t) == 0)
               fprintf(memf, "%%%d, ", t);
         }
         fprintf(memf, "\nAdditional values: ");
         for (unsigned t : prev_live_in[i]) {
            if (program->live.live_in[i].count(t) == 0)
               fprintf(memf, "%%%d, ", t);
         }
         u_memstream_close(&mem);
         aco_err(program, "%s", out);
         free(out);
         is_valid = false;
      }
   }

   return is_valid;
}
1111
1112 /* RA validation */
1113 namespace {
1114
1115 struct Location {
Locationaco::__anon701c7ea20311::Location1116 Location() : block(NULL), instr(NULL) {}
1117
1118 Block* block;
1119 Instruction* instr; // NULL if it's the block's live-in
1120 };
1121
1122 struct Assignment {
1123 Location defloc;
1124 Location firstloc;
1125 PhysReg reg;
1126 bool valid;
1127 };
1128
1129 bool
ra_fail(Program * program,Location loc,Location loc2,const char * fmt,...)1130 ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
1131 {
1132 va_list args;
1133 va_start(args, fmt);
1134 char msg[1024];
1135 vsprintf(msg, fmt, args);
1136 va_end(args);
1137
1138 char* out;
1139 size_t outsize;
1140 struct u_memstream mem;
1141 u_memstream_open(&mem, &out, &outsize);
1142 FILE* const memf = u_memstream_get(&mem);
1143
1144 fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
1145 if (loc.instr) {
1146 aco_print_instr(program->gfx_level, loc.instr, memf);
1147 fprintf(memf, "\n%s", msg);
1148 } else {
1149 fprintf(memf, "%s", msg);
1150 }
1151 if (loc2.block) {
1152 fprintf(memf, " in BB%d:\n", loc2.block->index);
1153 aco_print_instr(program->gfx_level, loc2.instr, memf);
1154 }
1155 fprintf(memf, "\n\n");
1156 u_memstream_close(&mem);
1157
1158 aco_err(program, "%s", out);
1159 free(out);
1160
1161 return true;
1162 }
1163
1164 bool
validate_subdword_operand(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr,unsigned index)1165 validate_subdword_operand(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
1166 unsigned index)
1167 {
1168 Operand op = instr->operands[index];
1169 unsigned byte = op.physReg().byte();
1170
1171 if (instr->opcode == aco_opcode::p_as_uniform)
1172 return byte == 0;
1173 if (instr->isPseudo() && gfx_level >= GFX8)
1174 return true;
1175 if (instr->isSDWA())
1176 return byte + instr->sdwa().sel[index].offset() + instr->sdwa().sel[index].size() <= 4 &&
1177 byte % instr->sdwa().sel[index].size() == 0;
1178 if (instr->isVOP3P()) {
1179 bool fma_mix = instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
1180 instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
1181 instr->opcode == aco_opcode::v_fma_mix_f32;
1182 return instr->valu().opsel_lo[index] == (byte >> 1) &&
1183 instr->valu().opsel_hi[index] == (fma_mix || (byte >> 1));
1184 }
1185 if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, index))
1186 return true;
1187
1188 switch (instr->opcode) {
1189 case aco_opcode::v_cvt_f32_ubyte1:
1190 if (byte == 1)
1191 return true;
1192 break;
1193 case aco_opcode::v_cvt_f32_ubyte2:
1194 if (byte == 2)
1195 return true;
1196 break;
1197 case aco_opcode::v_cvt_f32_ubyte3:
1198 if (byte == 3)
1199 return true;
1200 break;
1201 case aco_opcode::ds_write_b8_d16_hi:
1202 case aco_opcode::ds_write_b16_d16_hi:
1203 if (byte == 2 && index == 1)
1204 return true;
1205 break;
1206 case aco_opcode::buffer_store_byte_d16_hi:
1207 case aco_opcode::buffer_store_short_d16_hi:
1208 case aco_opcode::buffer_store_format_d16_hi_x:
1209 if (byte == 2 && index == 3)
1210 return true;
1211 break;
1212 case aco_opcode::flat_store_byte_d16_hi:
1213 case aco_opcode::flat_store_short_d16_hi:
1214 case aco_opcode::scratch_store_byte_d16_hi:
1215 case aco_opcode::scratch_store_short_d16_hi:
1216 case aco_opcode::global_store_byte_d16_hi:
1217 case aco_opcode::global_store_short_d16_hi:
1218 if (byte == 2 && index == 2)
1219 return true;
1220 break;
1221 default: break;
1222 }
1223
1224 return byte == 0;
1225 }
1226
1227 bool
validate_subdword_definition(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr)1228 validate_subdword_definition(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr)
1229 {
1230 Definition def = instr->definitions[0];
1231 unsigned byte = def.physReg().byte();
1232
1233 if (instr->isPseudo() && gfx_level >= GFX8)
1234 return true;
1235 if (instr->isSDWA())
1236 return byte + instr->sdwa().dst_sel.offset() + instr->sdwa().dst_sel.size() <= 4 &&
1237 byte % instr->sdwa().dst_sel.size() == 0;
1238 if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, -1))
1239 return true;
1240
1241 switch (instr->opcode) {
1242 case aco_opcode::v_interp_p2_hi_f16:
1243 case aco_opcode::v_fma_mixhi_f16:
1244 case aco_opcode::buffer_load_ubyte_d16_hi:
1245 case aco_opcode::buffer_load_sbyte_d16_hi:
1246 case aco_opcode::buffer_load_short_d16_hi:
1247 case aco_opcode::buffer_load_format_d16_hi_x:
1248 case aco_opcode::flat_load_ubyte_d16_hi:
1249 case aco_opcode::flat_load_short_d16_hi:
1250 case aco_opcode::scratch_load_ubyte_d16_hi:
1251 case aco_opcode::scratch_load_short_d16_hi:
1252 case aco_opcode::global_load_ubyte_d16_hi:
1253 case aco_opcode::global_load_short_d16_hi:
1254 case aco_opcode::ds_read_u8_d16_hi:
1255 case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
1256 case aco_opcode::p_v_cvt_pk_u8_f32: return true;
1257 default: break;
1258 }
1259
1260 return byte == 0;
1261 }
1262
/* Returns how many bytes of the register containing definitions[index] the
 * instruction actually clobbers. This can exceed def.bytes() for opcodes
 * that overwrite the whole dword (or don't preserve the other half), which
 * validate_instr_defs() uses to detect overwritten neighbors. */
unsigned
get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr, unsigned index)
{
   amd_gfx_level gfx_level = program->gfx_level;
   Definition def = instr->definitions[index];

   /* Pre-GFX8 lacks sub-dword writes, so pseudo instructions clobber whole
    * dwords there. */
   if (instr->isPseudo())
      return gfx_level >= GFX8 ? def.bytes() : def.size() * 4u;
   if (instr->isVALU() || instr->isVINTRP()) {
      assert(def.bytes() <= 2);
      if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
         return 1;

      /* SDWA writes exactly the selected destination bytes. */
      if (instr->isSDWA())
         return instr->sdwa().dst_sel.size();

      if (instr_is_16bit(gfx_level, instr->opcode))
         return 2;

      /* Other VALU/VINTRP writes clobber the full dword. */
      return 4;
   }

   if (instr->isMIMG()) {
      assert(instr->mimg().d16);
      /* With SRAM ECC, D16 loads still write full dwords. */
      return program->dev.sram_ecc_enabled ? def.size() * 4u : def.bytes();
   }

   /* D16 memory loads: with SRAM ECC the full dword(s) are written, otherwise
    * only the 16-bit halves (6 bytes for the xyz formats). */
   switch (instr->opcode) {
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16:
   case aco_opcode::buffer_load_format_d16_x:
   case aco_opcode::tbuffer_load_format_d16_x:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::buffer_load_ubyte_d16_hi:
   case aco_opcode::buffer_load_sbyte_d16_hi:
   case aco_opcode::buffer_load_short_d16_hi:
   case aco_opcode::buffer_load_format_d16_hi_x:
   case aco_opcode::flat_load_ubyte_d16_hi:
   case aco_opcode::flat_load_short_d16_hi:
   case aco_opcode::scratch_load_ubyte_d16_hi:
   case aco_opcode::scratch_load_short_d16_hi:
   case aco_opcode::global_load_ubyte_d16_hi:
   case aco_opcode::global_load_short_d16_hi:
   case aco_opcode::ds_read_u8_d16_hi:
   case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
   case aco_opcode::buffer_load_format_d16_xyz:
   case aco_opcode::tbuffer_load_format_d16_xyz: return program->dev.sram_ecc_enabled ? 8 : 6;
   default: return def.size() * 4;
   }
}
1321
/* Checks the definitions of `instr` against the byte-granular register file
 * `regs` (regs[b] holds the temp id occupying byte b, or 0 if free): reports
 * overlaps with still-live temps, marks the defined bytes as occupied, and
 * frees bytes of killed definitions. Returns true if an error was found. */
bool
validate_instr_defs(Program* program, std::array<unsigned, 2048>& regs,
                    const std::vector<Assignment>& assignments, const Location& loc,
                    aco_ptr<Instruction>& instr)
{
   bool err = false;

   for (unsigned i = 0; i < instr->definitions.size(); i++) {
      Definition& def = instr->definitions[i];
      if (!def.isTemp())
         continue;
      Temp tmp = def.getTemp();
      PhysReg reg = assignments[tmp.id()].reg;
      /* Every byte the definition occupies must currently be free. */
      for (unsigned j = 0; j < tmp.bytes(); j++) {
         if (regs[reg.reg_b + j])
            err |=
               ra_fail(program, loc, assignments[regs[reg.reg_b + j]].defloc,
                       "Assignment of element %d of %%%d already taken by %%%d from instruction", i,
                       tmp.id(), regs[reg.reg_b + j]);
         regs[reg.reg_b + j] = tmp.id();
      }
      if (def.regClass().is_subdword() && def.bytes() < 4) {
         /* A subdword write may clobber more bytes than the definition's own
          * size; check the clobbered region for other live temps. */
         unsigned written = get_subdword_bytes_written(program, instr, i);
         /* If written=4, the instruction still might write the upper half. In that case, it's
          * the lower half that isn't preserved */
         for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
            unsigned written_reg = reg.reg() * 4u + j;
            if (regs[written_reg] && regs[written_reg] != def.tempId())
               err |= ra_fail(program, loc, assignments[regs[written_reg]].defloc,
                              "Assignment of element %d of %%%d overwrites the full register "
                              "taken by %%%d from instruction",
                              i, tmp.id(), regs[written_reg]);
         }
      }
   }

   /* Definitions that die immediately release their bytes again. */
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp())
         continue;
      if (def.isKill()) {
         for (unsigned j = 0; j < def.getTemp().bytes(); j++)
            regs[def.physReg().reg_b + j] = 0;
      }
   }

   return err;
}
1369
1370 } /* end namespace */
1371
/* Validates the register allocation of `program`. Note the inverted return
 * convention compared to validate_ir/validate_cfg: returns true if an ERROR
 * was found, false otherwise (including when the check is disabled). */
bool
validate_ra(Program* program)
{
   if (!(debug_flags & DEBUG_VALIDATE_RA))
      return false;

   bool err = false;
   aco::live_var_analysis(program);
   /* For each block: the killed sgpr operands of p_phi instructions in its
    * logical successors; these die at the block's p_logical_end (see second
    * pass below). */
   std::vector<std::vector<Temp>> phi_sgpr_ops(program->blocks.size());
   uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->num_waves);

   /* First pass: record each temp's register assignment and check per-use /
    * per-def properties (fixed, consistent, in-bounds, aligned). */
   std::vector<Assignment> assignments(program->peekAllocationId());
   for (Block& block : program->blocks) {
      Location loc;
      loc.block = &block;
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (instr->opcode == aco_opcode::p_phi) {
            for (unsigned i = 0; i < instr->operands.size(); i++) {
               if (instr->operands[i].isTemp() &&
                   instr->operands[i].getTemp().type() == RegType::sgpr &&
                   instr->operands[i].isFirstKill())
                  phi_sgpr_ops[block.logical_preds[i]].emplace_back(instr->operands[i].getTemp());
            }
         }

         loc.instr = instr.get();
         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand& op = instr->operands[i];
            if (!op.isTemp())
               continue;
            if (!op.isFixed())
               err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
            /* A use must agree with the register recorded at the def (or at
             * an earlier use for live-ins). */
            if (assignments[op.tempId()].valid && assignments[op.tempId()].reg != op.physReg())
               err |=
                  ra_fail(program, loc, assignments[op.tempId()].firstloc,
                          "Operand %d has an inconsistent register assignment with instruction", i);
            /* vgpr registers are numbered from 256 in PhysReg space. */
            if ((op.getTemp().type() == RegType::vgpr &&
                 op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
                (op.getTemp().type() == RegType::sgpr &&
                 op.physReg() + op.size() > program->config->num_sgprs &&
                 op.physReg() < sgpr_limit))
               err |= ra_fail(program, loc, assignments[op.tempId()].firstloc,
                              "Operand %d has an out-of-bounds register assignment", i);
            if (op.physReg() == vcc && !program->needs_vcc)
               err |= ra_fail(program, loc, Location(),
                              "Operand %d fixed to vcc but needs_vcc=false", i);
            if (op.regClass().is_subdword() &&
                !validate_subdword_operand(program->gfx_level, instr, i))
               err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
            if (!assignments[op.tempId()].firstloc.block)
               assignments[op.tempId()].firstloc = loc;
            /* A use before any recorded def (e.g. a live-in) establishes the
             * temp's register. */
            if (!assignments[op.tempId()].defloc.block) {
               assignments[op.tempId()].reg = op.physReg();
               assignments[op.tempId()].valid = true;
            }
         }

         for (unsigned i = 0; i < instr->definitions.size(); i++) {
            Definition& def = instr->definitions[i];
            if (!def.isTemp())
               continue;
            if (!def.isFixed())
               err |=
                  ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
            /* Temps are SSA: a second definition is an error. */
            if (assignments[def.tempId()].defloc.block)
               err |= ra_fail(program, loc, assignments[def.tempId()].defloc,
                              "Temporary %%%d also defined by instruction", def.tempId());
            if ((def.getTemp().type() == RegType::vgpr &&
                 def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
                (def.getTemp().type() == RegType::sgpr &&
                 def.physReg() + def.size() > program->config->num_sgprs &&
                 def.physReg() < sgpr_limit))
               err |= ra_fail(program, loc, assignments[def.tempId()].firstloc,
                              "Definition %d has an out-of-bounds register assignment", i);
            if (def.physReg() == vcc && !program->needs_vcc)
               err |= ra_fail(program, loc, Location(),
                              "Definition %d fixed to vcc but needs_vcc=false", i);
            if (def.regClass().is_subdword() &&
                !validate_subdword_definition(program->gfx_level, instr))
               err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
            if (!assignments[def.tempId()].firstloc.block)
               assignments[def.tempId()].firstloc = loc;
            assignments[def.tempId()].defloc = loc;
            assignments[def.tempId()].reg = def.physReg();
            assignments[def.tempId()].valid = true;
         }
      }
   }

   /* Second pass: simulate the register file byte-by-byte through each block
    * and check for overlapping live ranges. */
   for (Block& block : program->blocks) {
      Location loc;
      loc.block = &block;

      std::array<unsigned, 2048> regs; /* register file in bytes */
      regs.fill(0);

      /* check live in */
      for (unsigned id : program->live.live_in[block.index]) {
         Temp tmp(id, program->temp_rc[id]);
         PhysReg reg = assignments[id].reg;
         for (unsigned i = 0; i < tmp.bytes(); i++) {
            if (regs[reg.reg_b + i]) {
               err |= ra_fail(program, loc, Location(),
                              "Assignment of element %d of %%%d already taken by %%%d in live-in",
                              i, id, regs[reg.reg_b + i]);
            }
            regs[reg.reg_b + i] = id;
         }
      }

      for (aco_ptr<Instruction>& instr : block.instructions) {
         loc.instr = instr.get();

         /* remove killed p_phi operands from regs */
         if (instr->opcode == aco_opcode::p_logical_end) {
            for (Temp tmp : phi_sgpr_ops[block.index]) {
               PhysReg reg = assignments[tmp.id()].reg;
               for (unsigned i = 0; i < tmp.bytes(); i++)
                  regs[reg.reg_b + i] = 0;
            }
         }

         /* Operands that die before the defs free their bytes first, so defs
          * may legally reuse them (phi operands die in the predecessor). */
         if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) {
            for (const Operand& op : instr->operands) {
               if (!op.isTemp())
                  continue;
               if (op.isFirstKillBeforeDef()) {
                  for (unsigned j = 0; j < op.getTemp().bytes(); j++)
                     regs[op.physReg().reg_b + j] = 0;
               }
            }
         }

         /* Defs of a fallthrough branch are checked in the successor block
          * instead (see the phi handling below). */
         if (!instr->isBranch() || block.linear_succs.size() != 1)
            err |= validate_instr_defs(program, regs, assignments, loc, instr);

         if (!is_phi(instr)) {
            /* Late-kill operands stay live across the defs and are only freed
             * afterwards. */
            for (const Operand& op : instr->operands) {
               if (!op.isTemp())
                  continue;
               if (op.isLateKill() && op.isFirstKill()) {
                  for (unsigned j = 0; j < op.getTemp().bytes(); j++)
                     regs[op.physReg().reg_b + j] = 0;
               }
            }
         } else if (block.linear_preds.size() != 1 ||
                    program->blocks[block.linear_preds[0]].linear_succs.size() == 1) {
            /* Validate the deferred defs of the predecessors' branches in the
             * context of this block's register file. */
            for (unsigned pred : block.linear_preds) {
               aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
               assert(br->isBranch());
               err |= validate_instr_defs(program, regs, assignments, loc, br);
            }
         }
      }
   }

   return err;
}
1530 } // namespace aco
1531