1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include "aco_instruction_selection.h"
9
10 #include "aco_builder.h"
11 #include "aco_interface.h"
12 #include "aco_ir.h"
13
14 #include "common/ac_descriptors.h"
15 #include "common/ac_gpu_info.h"
16 #include "common/ac_nir.h"
17 #include "common/sid.h"
18
19 #include "util/fast_idiv_by_const.h"
20 #include "util/memstream.h"
21
22 #include <array>
23 #include <functional>
24 #include <map>
25 #include <numeric>
26 #include <stack>
27 #include <utility>
28 #include <vector>
29
30 namespace aco {
31 namespace {
32
33 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
34
35 static void
36 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
37 const char* msg)
38 {
39 char* out;
40 size_t outsize;
41 struct u_memstream mem;
42 u_memstream_open(&mem, &out, &outsize);
43 FILE* const memf = u_memstream_get(&mem);
44
45 fprintf(memf, "%s: ", msg);
46 nir_print_instr(instr, memf);
47 u_memstream_close(&mem);
48
49 _aco_err(ctx->program, file, line, out);
50 free(out);
51 }
52
53 struct if_context {
54 Temp cond;
55
56 bool divergent_old;
57 bool had_divergent_discard_old;
58 bool had_divergent_discard_then;
59 bool has_divergent_continue_old;
60 bool has_divergent_continue_then;
61 struct exec_info exec_old;
62
63 unsigned BB_if_idx;
64 unsigned invert_idx;
65 Block BB_invert;
66 Block BB_endif;
67 };
68
69 struct loop_context {
70 Block loop_exit;
71
72 unsigned header_idx_old;
73 Block* exit_old;
74 bool divergent_cont_old;
75 bool divergent_branch_old;
76 bool divergent_if_old;
77 };
78
79 static void visit_cf_list(struct isel_context* ctx, struct exec_list* list);
80
81 static void
82 add_logical_edge(unsigned pred_idx, Block* succ)
83 {
84 succ->logical_preds.emplace_back(pred_idx);
85 }
86
87 static void
88 add_linear_edge(unsigned pred_idx, Block* succ)
89 {
90 succ->linear_preds.emplace_back(pred_idx);
91 }
92
93 static void
94 add_edge(unsigned pred_idx, Block* succ)
95 {
96 add_logical_edge(pred_idx, succ);
97 add_linear_edge(pred_idx, succ);
98 }
99
100 static void
101 append_logical_start(Block* b)
102 {
103 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
104 }
105
106 static void
107 append_logical_end(Block* b)
108 {
109 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
110 }
111
112 Temp
113 get_ssa_temp(struct isel_context* ctx, nir_def* def)
114 {
115 uint32_t id = ctx->first_temp_id + def->index;
116 return Temp(id, ctx->program->temp_rc[id]);
117 }
118
119 static Builder
120 create_alu_builder(isel_context* ctx, nir_alu_instr* instr)
121 {
122 Builder bld(ctx->program, ctx->block);
123 bld.is_precise = instr->exact;
124 bld.is_sz_preserve = nir_alu_instr_is_signed_zero_preserve(instr);
125 bld.is_inf_preserve = nir_alu_instr_is_inf_preserve(instr);
126 bld.is_nan_preserve = nir_alu_instr_is_nan_preserve(instr);
127 return bld;
128 }
129
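/* Emits v_mbcnt into dst: for each lane, the number of bits set in `mask` at
 * positions below the current lane index, added to `base`. With an undefined
 * mask all lower lanes are counted; wave64 needs a lo/hi pair of mbcnt ops.
 */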
130 Temp
131 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
132 {
133 Builder bld(ctx->program, ctx->block);
134 assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
135 assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
136
137 if (ctx->program->wave_size == 32) {
138 Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
139 return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
140 }
141
142 Operand mask_lo = Operand::c32(-1u);
143 Operand mask_hi = Operand::c32(-1u);
144
145 if (mask.isTemp()) {
146 RegClass rc = RegClass(mask.regClass().type(), 1);
147 Builder::Result mask_split =
148 bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
149 mask_lo = Operand(mask_split.def(0).getTemp());
150 mask_hi = Operand(mask_split.def(1).getTemp());
151 } else if (mask.physReg() == exec) {
152 mask_lo = Operand(exec_lo, s1);
153 mask_hi = Operand(exec_hi, s1);
154 }
155
156 Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
157
158 if (ctx->program->gfx_level <= GFX7)
159 return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
160 else
161 return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
162 }
163
164 inline void
165 set_wqm(isel_context* ctx, bool enable_helpers = false)
166 {
167 if (ctx->program->stage == fragment_fs) {
168 ctx->wqm_block_idx = ctx->block->index;
169 ctx->wqm_instruction_idx = ctx->block->instructions.size();
170 if (ctx->shader)
171 enable_helpers |= ctx->shader->info.fs.require_full_quads;
172 ctx->program->needs_wqm |= enable_helpers;
173 }
174 }
175
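/* Reads `data` from the lane selected by `index` (a wave-wide shuffle).
 * Uniform indices use readlane; otherwise a bpermute variant is chosen based
 * on the GFX level and wave size (see the branches below).
 */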
176 static Temp
177 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
178 {
179 if (index.regClass() == s1)
180 return bld.readlane(bld.def(s1), data, index);
181
182 /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
183 * of multiple binaries, because the VGPR use is not known when choosing
184 * which registers to use for the shared VGPRs.
185 */
186 const bool avoid_shared_vgprs =
187 ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
188 ctx->program->wave_size == 64 &&
189 (ctx->program->info.ps.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
190 ctx->program->info.vs.has_prolog || ctx->stage == raytracing_cs);
191
192 if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
193 /* GFX6-7: there is no bpermute instruction */
194 return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
195 bld.def(bld.lm, vcc), index, data);
196 } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
197
198 /* GFX10 wave64 mode: emulate full-wave bpermute */
199 Temp index_is_lo =
200 bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
201 Builder::Result index_is_lo_split =
202 bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
203 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
204 index_is_lo_split.def(1).getTemp());
205 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
206 index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
207 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
208
209 if (ctx->options->gfx_level <= GFX10_3) {
210 /* We need one pair of shared VGPRs:
211 * Note that these have twice the allocation granularity of normal VGPRs
212 */
213 ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
214
215 return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
216 bld.def(s1, scc), index_x4, data, same_half);
217 } else {
218 return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
219 bld.def(s1, scc), Operand(v1.as_linear()), index_x4, data, same_half);
220 }
221 } else {
222 /* GFX8-9 or GFX10 wave32: bpermute works normally */
223 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
224 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
225 }
226 }
227
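/* Emits a subgroup swizzle described by a ds_swizzle-style bit mask
 * (and_mask | or_mask << 5 | xor_mask << 10). On GFX8+ this is mapped onto
 * DPP or v_permlane(x)16_b32 where possible, otherwise ds_swizzle_b32 is used.
 */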
228 static Temp
229 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask, bool allow_fi)
230 {
231 if (ctx->options->gfx_level >= GFX8) {
232 unsigned and_mask = mask & 0x1f;
233 unsigned or_mask = (mask >> 5) & 0x1f;
234 unsigned xor_mask = (mask >> 10) & 0x1f;
235
236 /* Eliminate or_mask. */
237 and_mask &= ~or_mask;
238 xor_mask ^= or_mask;
239
240 uint16_t dpp_ctrl = 0xffff;
241
242 /* DPP16 before DPP8 before v_permlane(x)16_b32
243 * because DPP16 supports modifiers and v_permlane
244 * can't be folded into valu instructions.
245 */
246 if ((and_mask & 0x1c) == 0x1c && xor_mask < 4) {
247 unsigned res[4];
248 for (unsigned i = 0; i < 4; i++)
249 res[i] = ((i & and_mask) ^ xor_mask);
250 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
251 } else if (and_mask == 0x1f && xor_mask == 8) {
252 dpp_ctrl = dpp_row_rr(8);
253 } else if (and_mask == 0x1f && xor_mask == 0xf) {
254 dpp_ctrl = dpp_row_mirror;
255 } else if (and_mask == 0x1f && xor_mask == 0x7) {
256 dpp_ctrl = dpp_row_half_mirror;
257 } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && xor_mask < 0x10) {
258 dpp_ctrl = dpp_row_share(xor_mask);
259 } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) {
260 dpp_ctrl = dpp_row_xmask(xor_mask);
261 } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) {
262 uint32_t lane_sel = 0;
263 for (unsigned i = 0; i < 8; i++)
264 lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3);
265 return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel, allow_fi);
266 } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) {
267 uint64_t lane_mask = 0;
268 for (unsigned i = 0; i < 16; i++)
269 lane_mask |= uint64_t((i & and_mask) ^ (xor_mask & 0xf)) << i * 4;
270 aco_opcode opcode =
271 xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
272 Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
273 Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
274 Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
275 ret->valu().opsel[0] = allow_fi; /* set FETCH_INACTIVE */
276 ret->valu().opsel[1] = true; /* set BOUND_CTRL */
277 return ret;
278 }
279
280 if (dpp_ctrl != 0xffff)
281 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl, 0xf, 0xf, true,
282 allow_fi);
283 }
284
285 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
286 }
287
288 Temp
289 as_vgpr(Builder& bld, Temp val)
290 {
291 if (val.type() == RegType::sgpr)
292 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
293 assert(val.type() == RegType::vgpr);
294 return val;
295 }
296
297 Temp
298 as_vgpr(isel_context* ctx, Temp val)
299 {
300 Builder bld(ctx->program, ctx->block);
301 return as_vgpr(bld, val);
302 }
303
304 void
305 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
306 {
307 Builder bld(ctx->program, ctx->block);
308 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
309 }
310
311 Temp
312 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
313 {
314 /* no need to extract the whole vector */
315 if (src.regClass() == dst_rc) {
316 assert(idx == 0);
317 return src;
318 }
319
320 assert(src.bytes() > (idx * dst_rc.bytes()));
321 Builder bld(ctx->program, ctx->block);
322 auto it = ctx->allocated_vec.find(src.id());
323 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
324 if (it->second[idx].regClass() == dst_rc) {
325 return it->second[idx];
326 } else {
327 assert(!dst_rc.is_subdword());
328 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
329 return bld.copy(bld.def(dst_rc), it->second[idx]);
330 }
331 }
332
333 if (dst_rc.is_subdword())
334 src = as_vgpr(ctx, src);
335
336 if (src.bytes() == dst_rc.bytes()) {
337 assert(idx == 0);
338 return bld.copy(bld.def(dst_rc), src);
339 } else {
340 Temp dst = bld.tmp(dst_rc);
341 emit_extract_vector(ctx, src, idx, dst);
342 return dst;
343 }
344 }
345
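/* Splits `vec_src` into `num_components` equally sized elements and caches
 * them in ctx->allocated_vec so that later extracts can reuse them. Does
 * nothing if the vector was already split.
 */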
346 void
347 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
348 {
349 if (num_components == 1)
350 return;
351 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
352 return;
353 RegClass rc;
354 if (num_components > vec_src.size()) {
355 if (vec_src.type() == RegType::sgpr) {
356 /* should still help get_alu_src() */
357 emit_split_vector(ctx, vec_src, vec_src.size());
358 return;
359 }
360 /* sub-dword split */
361 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
362 } else {
363 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
364 }
365 aco_ptr<Instruction> split{
366 create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
367 split->operands[0] = Operand(vec_src);
368 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
369 for (unsigned i = 0; i < num_components; i++) {
370 elems[i] = ctx->program->allocateTmp(rc);
371 split->definitions[i] = Definition(elems[i]);
372 }
373 ctx->block->instructions.emplace_back(std::move(split));
374 ctx->allocated_vec.emplace(vec_src.id(), elems);
375 }
376
377 /* This vector expansion uses a mask to determine which elements in the new vector
378 * come from the original vector. The other elements are undefined. */
379 void
380 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
381 bool zero_padding = false)
382 {
383 assert(vec_src.type() == RegType::vgpr);
384 Builder bld(ctx->program, ctx->block);
385
386 if (dst.type() == RegType::sgpr && num_components > dst.size()) {
387 Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
388 expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
389 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
390 ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
391 return;
392 }
393
394 emit_split_vector(ctx, vec_src, util_bitcount(mask));
395
396 if (vec_src == dst)
397 return;
398
399 if (num_components == 1) {
400 if (dst.type() == RegType::sgpr)
401 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
402 else
403 bld.copy(Definition(dst), vec_src);
404 return;
405 }
406
407 unsigned component_bytes = dst.bytes() / num_components;
408 RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
409 RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
410 assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
411 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
412
413 Temp padding = Temp(0, dst_rc);
414 if (zero_padding)
415 padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));
416
417 aco_ptr<Instruction> vec{
418 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
419 vec->definitions[0] = Definition(dst);
420 unsigned k = 0;
421 for (unsigned i = 0; i < num_components; i++) {
422 if (mask & (1 << i)) {
423 Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
424 if (dst.type() == RegType::sgpr)
425 src = bld.as_uniform(src);
426 vec->operands[i] = Operand(src);
427 elems[i] = src;
428 } else {
429 vec->operands[i] = Operand::zero(component_bytes);
430 elems[i] = padding;
431 }
432 }
433 ctx->block->instructions.emplace_back(std::move(vec));
434 ctx->allocated_vec.emplace(dst.id(), elems);
435 }
436
437 /* adjust misaligned small bit size loads */
438 void
439 byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
440 {
441 Builder bld(ctx->program, ctx->block);
442 Operand shift;
443 Temp select = Temp();
444 if (offset.isConstant()) {
445 assert(offset.constantValue() && offset.constantValue() < 4);
446 shift = Operand::c32(offset.constantValue() * 8);
447 } else {
448 /* bit_offset = 8 * (offset & 0x3) */
449 Temp tmp =
450 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
451 select = bld.tmp(s1);
452 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
453 Operand::c32(3u));
454 }
455
456 if (vec.size() == 1) {
457 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
458 } else if (vec.size() == 2) {
459 Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
460 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
461 if (tmp == dst)
462 emit_split_vector(ctx, dst, 2);
463 else
464 emit_extract_vector(ctx, tmp, 0, dst);
465 } else if (vec.size() == 3 || vec.size() == 4) {
466 Temp lo = bld.tmp(s2), hi;
467 if (vec.size() == 3) {
468 /* this can happen if we use VMEM for a uniform load */
469 hi = bld.tmp(s1);
470 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
471 } else {
472 hi = bld.tmp(s2);
473 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
474 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
475 }
476 if (select != Temp())
477 hi =
478 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
479 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
480 Temp mid = bld.tmp(s1);
481 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
482 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
483 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
484 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
485 emit_split_vector(ctx, dst, 2);
486 }
487 }
488
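/* Byte-aligns a loaded vector: shifts `vec` right by `offset` bytes (using
 * v_alignbyte_b32 when the offset is not a constant) and packs the remaining
 * `component_size`-byte components into `dst`.
 */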
489 void
490 byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
491 {
492 Builder bld(ctx->program, ctx->block);
493 if (offset.isTemp()) {
494 Temp tmp[4] = {vec, vec, vec, vec};
495
496 if (vec.size() == 4) {
497 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
498 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
499 Definition(tmp[2]), Definition(tmp[3]), vec);
500 } else if (vec.size() == 3) {
501 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
502 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
503 Definition(tmp[2]), vec);
504 } else if (vec.size() == 2) {
505 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
506 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
507 }
508 for (unsigned i = 0; i < dst.size(); i++)
509 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
510
511 vec = tmp[0];
512 if (dst.size() == 2)
513 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
514
515 offset = Operand::zero();
516 }
517
518 unsigned num_components = vec.bytes() / component_size;
519 if (vec.regClass() == dst.regClass()) {
520 assert(offset.constantValue() == 0);
521 bld.copy(Definition(dst), vec);
522 emit_split_vector(ctx, dst, num_components);
523 return;
524 }
525
526 emit_split_vector(ctx, vec, num_components);
527 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
528 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
529
530 assert(offset.constantValue() % component_size == 0);
531 unsigned skip = offset.constantValue() / component_size;
532 for (unsigned i = skip; i < num_components; i++)
533 elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
534
535 if (dst.type() == RegType::vgpr) {
536 /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
537 num_components = dst.bytes() / component_size;
538 aco_ptr<Instruction> create_vec{
539 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
540 for (unsigned i = 0; i < num_components; i++)
541 create_vec->operands[i] = Operand(elems[i]);
542 create_vec->definitions[0] = Definition(dst);
543 bld.insert(std::move(create_vec));
544
545 } else if (skip) {
546 /* if dst is sgpr - split the src, but move the original to sgpr. */
547 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
548 byte_align_scalar(ctx, vec, offset, dst);
549 } else {
550 assert(dst.size() == vec.size());
551 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
552 }
553
554 ctx->allocated_vec.emplace(dst.id(), elems);
555 }
556
557 Temp
558 get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
559 {
560 RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
561 Temp tmp = get_ssa_temp(ctx, def);
562 if (tmp.bytes() != rc.bytes())
563 return emit_extract_vector(ctx, tmp, 0, rc);
564 else
565 return tmp;
566 }
567
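/* Converts a scalar boolean (s1, read through SCC) into a lane mask:
 * all-ones if the value is true, zero otherwise.
 */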
568 Temp
569 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
570 {
571 Builder bld(ctx->program, ctx->block);
572 if (!dst.id())
573 dst = bld.tmp(bld.lm);
574
575 assert(val.regClass() == s1);
576 assert(dst.regClass() == bld.lm);
577
578 return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
579 bld.scc(val));
580 }
581
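/* Converts a lane mask into a scalar boolean in SCC: true iff any currently
 * active lane (mask & exec) is set.
 */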
582 Temp
583 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
584 {
585 Builder bld(ctx->program, ctx->block);
586 if (!dst.id())
587 dst = bld.tmp(s1);
588
589 assert(val.regClass() == bld.lm);
590 assert(dst.regClass() == s1);
591
592 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
593 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
594 return dst;
595 }
596
597 /**
598 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
599 * src_bits and dst_bits are truncated.
600 *
601 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
602 * bit is indicated by src_bits in this case.
603 *
604 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
605 */
606 Temp
607 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
608 bool sign_extend, Temp dst = Temp())
609 {
610 assert(!(sign_extend && dst_bits < src_bits) &&
611 "Shrinking integers is not supported for signed inputs");
612
613 if (!dst.id()) {
614 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
615 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
616 else
617 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
618 }
619
620 assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
621 assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
622
623 if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
624 /* Copy the raw value, leaving an undefined value in the upper bits for
625 * the caller to handle appropriately */
626 return bld.copy(Definition(dst), src);
627 } else if (dst.bytes() < src.bytes()) {
628 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
629 }
630
631 Temp tmp = dst;
632 if (dst_bits == 64)
633 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
634
635 if (tmp == src) {
636 } else if (src.regClass() == s1) {
637 assert(src_bits < 32);
638 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
639 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
640 } else {
641 assert(src_bits < 32);
642 bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
643 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
644 }
645
646 if (dst_bits == 64) {
647 if (sign_extend && dst.regClass() == s2) {
648 Temp high =
649 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
650 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
651 } else if (sign_extend && dst.regClass() == v2) {
652 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
653 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
654 } else {
655 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
656 }
657 }
658
659 return dst;
660 }
661
662 enum sgpr_extract_mode {
663 sgpr_extract_sext,
664 sgpr_extract_zext,
665 sgpr_extract_undef,
666 };
667
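/* Extracts a single 8/16-bit swizzle component of an SGPR ALU source into
 * `dst`, sign-/zero-extending it or leaving the upper bits undefined
 * depending on `mode`.
 */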
668 Temp
669 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
670 {
671 Temp vec = get_ssa_temp(ctx, src->src.ssa);
672 unsigned src_size = src->src.ssa->bit_size;
673 unsigned swizzle = src->swizzle[0];
674
675 if (vec.size() > 1) {
676 assert(src_size == 16);
677 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
678 swizzle = swizzle & 1;
679 }
680
681 Builder bld(ctx->program, ctx->block);
682 Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
683
684 if (mode == sgpr_extract_undef && swizzle == 0)
685 bld.copy(Definition(tmp), vec);
686 else
687 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
688 Operand::c32(swizzle), Operand::c32(src_size),
689 Operand::c32((mode == sgpr_extract_sext)));
690
691 if (dst.regClass() == s2)
692 convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
693
694 return dst;
695 }
696
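/* Returns `size` components of a NIR ALU source as a single Temp, honoring
 * the swizzle: identity swizzles simply take a prefix of the vector, anything
 * else extracts the components and re-packs them.
 */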
697 Temp
698 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
699 {
700 if (src.src.ssa->num_components == 1 && size == 1)
701 return get_ssa_temp(ctx, src.src.ssa);
702
703 Temp vec = get_ssa_temp(ctx, src.src.ssa);
704 unsigned elem_size = src.src.ssa->bit_size / 8u;
705 bool identity_swizzle = true;
706
707 for (unsigned i = 0; identity_swizzle && i < size; i++) {
708 if (src.swizzle[i] != i)
709 identity_swizzle = false;
710 }
711 if (identity_swizzle)
712 return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
713
714 assert(elem_size > 0);
715 assert(vec.bytes() % elem_size == 0);
716
717 if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
718 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
719 return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
720 sgpr_extract_undef);
721 }
722
723 bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
724 if (as_uniform)
725 vec = as_vgpr(ctx, vec);
726
727 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
728 : RegClass(vec.type(), elem_size / 4);
729 if (size == 1) {
730 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
731 } else {
732 assert(size <= 4);
733 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
734 aco_ptr<Instruction> vec_instr{
735 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
736 for (unsigned i = 0; i < size; ++i) {
737 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
738 vec_instr->operands[i] = Operand{elems[i]};
739 }
740 Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
741 vec_instr->definitions[0] = Definition(dst);
742 ctx->block->instructions.emplace_back(std::move(vec_instr));
743 ctx->allocated_vec.emplace(dst.id(), elems);
744 return as_uniform ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
745 }
746 }
747
748 Temp
749 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
750 {
751 /* returns v2b or v1 for vop3p usage.
752 * The source expects exactly 2 16bit components
753 * which are within the same dword
754 */
755 assert(src.src.ssa->bit_size == 16);
756 assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
757
758 Temp tmp = get_ssa_temp(ctx, src.src.ssa);
759 if (tmp.size() == 1)
760 return tmp;
761
762 /* the size is larger than 1 dword: check the swizzle */
763 unsigned dword = src.swizzle[0] >> 1;
764
765 /* extract a full dword if possible */
766 if (tmp.bytes() >= (dword + 1) * 4) {
767 /* if the source is split into components, use p_create_vector */
768 auto it = ctx->allocated_vec.find(tmp.id());
769 if (it != ctx->allocated_vec.end()) {
770 unsigned index = dword << 1;
771 Builder bld(ctx->program, ctx->block);
772 if (it->second[index].regClass() == v2b)
773 return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
774 it->second[index + 1]);
775 }
776 return emit_extract_vector(ctx, tmp, dword, v1);
777 } else {
778 /* This must be a swizzled access to %a.zz where %a is v6b */
779 assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
780 assert(tmp.regClass() == v6b && dword == 1);
781 return emit_extract_vector(ctx, tmp, dword * 2, v2b);
782 }
783 }
784
785 uint32_t
786 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
787 {
788 nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
789 return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
790 }
791
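/* Extends a 32-bit pointer to a 64-bit address by appending the high dword
 * from ctx->options->address32_hi.
 */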
792 Temp
793 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
794 {
795 if (ptr.size() == 2)
796 return ptr;
797 Builder bld(ctx->program, ctx->block);
798 if (ptr.type() == RegType::vgpr && !non_uniform)
799 ptr = bld.as_uniform(ptr);
800 return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
801 Operand::c32((unsigned)ctx->options->address32_hi));
802 }
803
804 void
805 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
806 bool writes_scc, uint8_t uses_ub = 0)
807 {
808 Builder bld = create_alu_builder(ctx, instr);
809 bld.is_nuw = instr->no_unsigned_wrap;
810
811 Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])),
812 Operand(get_alu_src(ctx, instr->src[1]))};
813 u_foreach_bit (i, uses_ub) {
814 uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
815 if (src_ub <= 0xffff)
816 operands[i].set16bit(true);
817 else if (src_ub <= 0xffffff)
818 operands[i].set24bit(true);
819 }
820
821 if (writes_scc)
822 bld.sop2(op, Definition(dst), bld.def(s1, scc), operands[0], operands[1]);
823 else
824 bld.sop2(op, Definition(dst), operands[0], operands[1]);
825 }
826
827 void
828 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
829 bool commutative, bool swap_srcs = false, bool flush_denorms = false,
830 bool nuw = false, uint8_t uses_ub = 0)
831 {
832 Builder bld = create_alu_builder(ctx, instr);
833 bld.is_nuw = nuw;
834
835 Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])),
836 Operand(get_alu_src(ctx, instr->src[1]))};
837 u_foreach_bit (i, uses_ub) {
838 uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
839 if (src_ub <= 0xffff)
840 operands[i].set16bit(true);
841 else if (src_ub <= 0xffffff)
842 operands[i].set24bit(true);
843 }
844
845 if (swap_srcs)
846 std::swap(operands[0], operands[1]);
847
848 if (operands[1].isOfType(RegType::sgpr)) {
849 if (commutative && operands[0].isOfType(RegType::vgpr)) {
850 std::swap(operands[0], operands[1]);
851 } else {
852 operands[1] = bld.copy(bld.def(RegType::vgpr, operands[1].size()), operands[1]);
853 }
854 }
855
856 if (flush_denorms && ctx->program->gfx_level < GFX9) {
857 assert(dst.size() == 1);
858 Temp tmp = bld.vop2(opc, bld.def(dst.regClass()), operands[0], operands[1]);
859 if (dst.bytes() == 2)
860 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
861 else
862 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
863 } else {
864 bld.vop2(opc, Definition(dst), operands[0], operands[1]);
865 }
866 }
867
868 void
869 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
870 {
871 Builder bld = create_alu_builder(ctx, instr);
872
873 Temp src0 = get_alu_src(ctx, instr->src[0]);
874 Temp src1 = get_alu_src(ctx, instr->src[1]);
875
876 if (src1.type() == RegType::sgpr) {
877 assert(src0.type() == RegType::vgpr);
878 std::swap(src0, src1);
879 }
880
881 Temp src00 = bld.tmp(src0.type(), 1);
882 Temp src01 = bld.tmp(src0.type(), 1);
883 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
884 Temp src10 = bld.tmp(v1);
885 Temp src11 = bld.tmp(v1);
886 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
887 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
888 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
889 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
890 }
891
892 void
893 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
894 bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
895 {
896 assert(num_sources == 2 || num_sources == 3);
897 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
898 bool has_sgpr = false;
899 for (unsigned i = 0; i < num_sources; i++) {
900 src[i] = get_alu_src(ctx, instr->src[(swap_srcs && i < 2) ? 1 - i : i]);
901 if (has_sgpr)
902 src[i] = as_vgpr(ctx, src[i]);
903 else
904 has_sgpr = src[i].type() == RegType::sgpr;
905 }
906
907 Builder bld = create_alu_builder(ctx, instr);
908 if (flush_denorms && ctx->program->gfx_level < GFX9) {
909 Temp tmp;
910 if (num_sources == 3)
911 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
912 else
913 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
914 if (dst.size() == 1)
915 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
916 else
917 bld.vop3(aco_opcode::v_mul_f64_e64, Definition(dst), Operand::c64(0x3FF0000000000000),
918 tmp);
919 } else if (num_sources == 3) {
920 bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
921 } else {
922 bld.vop3(op, Definition(dst), src[0], src[1]);
923 }
924 }
925
926 Builder::Result
927 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
928 bool swap_srcs = false)
929 {
930 Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
931 Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
932 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
933 src1 = as_vgpr(ctx, src1);
934 assert(instr->def.num_components == 2);
935
936 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
937 unsigned opsel_lo =
938 (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
939 unsigned opsel_hi =
940 (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
941
942 Builder bld = create_alu_builder(ctx, instr);
943 Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
944 emit_split_vector(ctx, dst, 2);
945 return res;
946 }
947
948 void
949 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
950 unsigned neg_lo = 0)
951 {
952 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
953 bool has_sgpr = false;
954 for (unsigned i = 0; i < 3; i++) {
955 src[i] = get_alu_src(ctx, instr->src[i]);
956 if (has_sgpr)
957 src[i] = as_vgpr(ctx, src[i]);
958 else
959 has_sgpr = src[i].type() == RegType::sgpr;
960 }
961
962 Builder bld = create_alu_builder(ctx, instr);
963 VALU_instruction& vop3p =
964 bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
965 vop3p.clamp = clamp;
966 vop3p.neg_lo = neg_lo;
967 }
968
969 void
970 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
971 {
972 Builder bld = create_alu_builder(ctx, instr);
973 if (dst.type() == RegType::sgpr)
974 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
975 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
976 else
977 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
978 }
979
980 void
981 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
982 {
983 Temp src0 = get_alu_src(ctx, instr->src[0]);
984 Temp src1 = get_alu_src(ctx, instr->src[1]);
985 assert(src0.size() == src1.size());
986
987 aco_ptr<Instruction> vopc;
988 if (src1.type() == RegType::sgpr) {
989 if (src0.type() == RegType::vgpr) {
990 /* to swap the operands, we might also have to change the opcode */
991 op = get_vcmp_swapped(op);
992 Temp t = src0;
993 src0 = src1;
994 src1 = t;
995 } else {
996 src1 = as_vgpr(ctx, src1);
997 }
998 }
999
1000 Builder bld = create_alu_builder(ctx, instr);
1001 bld.vopc(op, Definition(dst), src0, src1);
1002 }
1003
1004 void
1005 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1006 {
1007 Temp src0 = get_alu_src(ctx, instr->src[0]);
1008 Temp src1 = get_alu_src(ctx, instr->src[1]);
1009 Builder bld = create_alu_builder(ctx, instr);
1010
1011 assert(dst.regClass() == bld.lm);
1012 assert(src0.type() == RegType::sgpr);
1013 assert(src1.type() == RegType::sgpr);
1014
1015 /* Emit the SALU comparison instruction */
1016 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
1017 /* Turn the result into a per-lane bool */
1018 bool_to_vector_condition(ctx, cmp, dst);
1019 }
1020
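/* Emits a NIR comparison, selecting between the SALU and VALU opcode variants
 * based on the source bit size, result divergence and source register types.
 */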
1021 void
1022 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1023 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s16_op = aco_opcode::num_opcodes,
1024 aco_opcode s32_op = aco_opcode::num_opcodes,
1025 aco_opcode s64_op = aco_opcode::num_opcodes)
1026 {
1027 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op
1028 : instr->src[0].src.ssa->bit_size == 32 ? s32_op
1029 : s16_op;
1030 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op
1031 : instr->src[0].src.ssa->bit_size == 32 ? v32_op
1032 : v16_op;
1033 bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
1034 get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1035 get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1036 aco_opcode op = use_valu ? v_op : s_op;
1037 assert(op != aco_opcode::num_opcodes);
1038 assert(dst.regClass() == ctx->program->lane_mask);
1039
1040 if (use_valu)
1041 emit_vopc_instruction(ctx, instr, op, dst);
1042 else
1043 emit_sopc_instruction(ctx, instr, op, dst);
1044 }
1045
1046 void
1047 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1048 Temp dst)
1049 {
1050 Builder bld(ctx->program, ctx->block);
1051 Temp src0 = get_alu_src(ctx, instr->src[0]);
1052 Temp src1 = get_alu_src(ctx, instr->src[1]);
1053
1054 assert(dst.regClass() == bld.lm);
1055 assert(src0.regClass() == bld.lm);
1056 assert(src1.regClass() == bld.lm);
1057
1058 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
1059 }
1060
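/* 64-bit select: splits both operands and emits v_cndmask_b32 separately on
 * the low and high halves.
 */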
1061 void
1062 select_vec2(isel_context* ctx, Temp dst, Temp cond, Temp then, Temp els)
1063 {
1064 Builder bld(ctx->program, ctx->block);
1065
1066 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1067 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
1068 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1069 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
1070
1071 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
1072 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
1073
1074 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1075 }
1076
1077 void
1078 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1079 {
1080 Builder bld(ctx->program, ctx->block);
1081 Temp cond = get_alu_src(ctx, instr->src[0]);
1082 Temp then = get_alu_src(ctx, instr->src[1]);
1083 Temp els = get_alu_src(ctx, instr->src[2]);
1084
1085 assert(cond.regClass() == bld.lm);
1086
1087 if (dst.type() == RegType::vgpr) {
1088 aco_ptr<Instruction> bcsel;
1089 if (dst.size() == 1) {
1090 then = as_vgpr(ctx, then);
1091 els = as_vgpr(ctx, els);
1092
1093 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
1094 } else if (dst.size() == 2) {
1095 select_vec2(ctx, dst, cond, then, els);
1096 } else {
1097 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1098 }
1099 return;
1100 }
1101
1102 if (instr->def.bit_size == 1) {
1103 assert(dst.regClass() == bld.lm);
1104 assert(then.regClass() == bld.lm);
1105 assert(els.regClass() == bld.lm);
1106 }
1107
1108 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
1109 if (dst.regClass() == s1 || dst.regClass() == s2) {
1110 assert((then.regClass() == s1 || then.regClass() == s2) &&
1111 els.regClass() == then.regClass());
1112 assert(dst.size() == then.size());
1113 aco_opcode op =
1114 dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
1115 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1116 } else {
1117 isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
1118 }
1119 return;
1120 }
1121
1122 /* divergent boolean bcsel
1123 * this implements bcsel on bools: dst = s0 ? s1 : s2
1124 * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
1125 assert(instr->def.bit_size == 1);
1126
1127 if (cond.id() != then.id())
1128 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1129
1130 if (cond.id() == els.id())
1131 bld.copy(Definition(dst), then);
1132 else
1133 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1134 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1135 }
1136
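/* Emits a 32-bit transcendental op (`vop`, or the scalar-destination variant
 * `sop` on GFX12). When 32-bit denormals are enabled, denormal inputs are
 * pre-scaled by 2^24 and the result is multiplied by `undo` to compensate
 * (see the comment below).
 */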
1137 void
1138 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode vop,
1139 aco_opcode sop, uint32_t undo)
1140 {
1141 if (ctx->block->fp_mode.denorm32 == 0) {
1142 if (dst.regClass() == v1)
1143 bld.vop1(vop, dst, val);
1144 else if (ctx->options->gfx_level >= GFX12)
1145 bld.vop3(sop, dst, val);
1146 else
1147 bld.pseudo(aco_opcode::p_as_uniform, dst, bld.vop1(vop, bld.def(v1), val));
1148 return;
1149 }
1150
1151 /* multiply by 16777216 to handle denormals */
1152 Temp scale, unscale;
1153 if (val.regClass() == v1) {
1154 val = as_vgpr(bld, val);
1155 Temp is_denormal = bld.tmp(bld.lm);
1156 VALU_instruction& valu = bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal),
1157 val, Operand::c32(1u << 4))
1158 ->valu();
1159 valu.neg[0] = true;
1160 valu.abs[0] = true;
1161 scale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000),
1162 bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), is_denormal);
1163 unscale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000),
1164 bld.copy(bld.def(s1), Operand::c32(undo)), is_denormal);
1165 } else {
1166 Temp abs = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), val,
1167 bld.copy(bld.def(s1), Operand::c32(0x7fffffff)));
1168 Temp denorm_cmp = bld.copy(bld.def(s1), Operand::c32(0x00800000));
1169 Temp is_denormal = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, denorm_cmp);
1170 scale = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
1171 bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), Operand::c32(0x3f800000),
1172 bld.scc(is_denormal));
1173 unscale =
1174 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(undo)),
1175 Operand::c32(0x3f800000), bld.scc(is_denormal));
1176 }
1177
1178 if (dst.regClass() == v1) {
1179 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), scale, as_vgpr(bld, val));
1180 scaled = bld.vop1(vop, bld.def(v1), scaled);
1181 bld.vop2(aco_opcode::v_mul_f32, dst, unscale, scaled);
1182 } else {
1183 assert(ctx->options->gfx_level >= GFX11_5);
1184 Temp scaled = bld.sop2(aco_opcode::s_mul_f32, bld.def(s1), scale, val);
1185 if (ctx->options->gfx_level >= GFX12)
1186 scaled = bld.vop3(sop, bld.def(s1), scaled);
1187 else
1188 scaled = bld.as_uniform(bld.vop1(vop, bld.def(v1), scaled));
1189 bld.sop2(aco_opcode::s_mul_f32, dst, unscale, scaled);
1190 }
1191 }
1192
1193 void
1194 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1195 {
1196 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, aco_opcode::v_s_rcp_f32, 0x4b800000u);
1197 }
1198
1199 void
1200 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1201 {
1202 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, aco_opcode::v_s_rsq_f32, 0x45800000u);
1203 }
1204
1205 void
1206 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1207 {
1208 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, aco_opcode::v_s_sqrt_f32,
1209 0x39800000u);
1210 }
1211
1212 void
1213 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1214 {
1215 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, aco_opcode::v_s_log_f32, 0xc1c00000u);
1216 }
1217
1218 Temp
1219 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1220 {
1221 if (ctx->options->gfx_level >= GFX7)
1222 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1223
1224 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1225 /* TODO: create more efficient code! */
1226 if (val.type() == RegType::sgpr)
1227 val = as_vgpr(ctx, val);
1228
1229 /* Split the input value. */
1230 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1231 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1232
1233 /* Extract the exponent and compute the unbiased value. */
1234 Temp exponent =
1235 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1236 exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1237
1238 /* Extract the fractional part. */
1239 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1240 Operand::c32(0x000fffffu));
1241 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1242
1243 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1244 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1245 fract_mask);
1246
1247 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1248 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1249 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1250 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1251 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1252
1253 /* Get the sign bit. */
1254 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1255
1256 /* Decide the operation to apply depending on the unbiased exponent. */
1257 Temp exp_lt0 =
1258 bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
1259 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1260 bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1261 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1262 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1263 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1264 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1265
1266 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1267 }
1268
1269 Temp
1270 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1271 {
1272 if (ctx->options->gfx_level >= GFX7)
1273 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1274
1275 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1276 * lowered at NIR level for precision reasons). */
1277 Temp src0 = as_vgpr(ctx, val);
1278
1279 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1280 Operand::c32(0x3fefffffu));
1281
1282 Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
1283 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1284 Temp min = bld.vop3(aco_opcode::v_min_f64_e64, bld.def(v2), fract, min_val);
1285
1286 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1287 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1288 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1289 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1290
1291 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1292 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1293
1294 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1295
1296 Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src0, v);
1297 add->valu().neg[1] = true;
1298
1299 return add->definitions[0].getTemp();
1300 }
1301
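/* 32-bit unsigned saturating add: uses the VALU clamp bit on GFX8+ and a
 * carry-out plus v_cndmask with UINT32_MAX on older GPUs.
 */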
1302 Temp
1303 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1304 {
1305 if (bld.program->gfx_level < GFX8) {
1306 Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1307 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1308 add.def(1).getTemp());
1309 }
1310
1311 Builder::Result add(NULL);
1312 if (bld.program->gfx_level >= GFX9) {
1313 add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1314 } else {
1315 add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
1316 }
1317 add->valu().clamp = 1;
1318 return dst.getTemp();
1319 }
1320
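/* 32-bit unsigned saturating subtract, mirroring uadd32_sat above (clamps to
 * zero on underflow).
 */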
1321 Temp
1322 usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1323 {
1324 if (bld.program->gfx_level < GFX8) {
1325 Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
1326 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
1327 sub.def(1).getTemp());
1328 }
1329
1330 Builder::Result sub(NULL);
1331 if (bld.program->gfx_level >= GFX9) {
1332 sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
1333 } else {
1334 sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
1335 }
1336 sub->valu().clamp = 1;
1337 return dst.getTemp();
1338 }
1339
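/* Converts a vec2 of f32 sources into a packed pair of f16 values using
 * s_cvt_pk_rtz_f16_f32 or v_cvt_pkrtz_f16_f32.
 */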
1340 void
1341 emit_vec2_f2f16(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1342 {
1343 Builder bld = create_alu_builder(ctx, instr);
1344 Temp src = get_ssa_temp(ctx, instr->src[0].src.ssa);
1345 RegClass rc = RegClass(src.regClass().type(), instr->src[0].src.ssa->bit_size / 32);
1346 Temp src0 = emit_extract_vector(ctx, src, instr->src[0].swizzle[0], rc);
1347 Temp src1 = emit_extract_vector(ctx, src, instr->src[0].swizzle[1], rc);
1348
1349 if (dst.regClass() == s1) {
1350 bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src0, src1);
1351 } else {
1352 src1 = as_vgpr(ctx, src1);
1353 if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
1354 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src0, src1);
1355 else
1356 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
1357 emit_split_vector(ctx, dst, 2);
1358 }
1359 }
1360
1361 void
1362 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1363 {
1364 Builder bld = create_alu_builder(ctx, instr);
1365 Temp dst = get_ssa_temp(ctx, &instr->def);
1366 switch (instr->op) {
1367 case nir_op_vec2:
1368 case nir_op_vec3:
1369 case nir_op_vec4:
1370 case nir_op_vec5:
1371 case nir_op_vec8:
1372 case nir_op_vec16: {
1373 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1374 unsigned num = instr->def.num_components;
1375 for (unsigned i = 0; i < num; ++i)
1376 elems[i] = get_alu_src(ctx, instr->src[i]);
1377
1378 if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
1379 aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
1380 instr->def.num_components, 1)};
1381 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u);
1382 for (unsigned i = 0; i < num; ++i) {
1383 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1384 elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1385 vec->operands[i] = Operand{elems[i]};
1386 }
1387 vec->definitions[0] = Definition(dst);
1388 ctx->block->instructions.emplace_back(std::move(vec));
1389 ctx->allocated_vec.emplace(dst.id(), elems);
1390 } else {
1391 bool use_s_pack = ctx->program->gfx_level >= GFX9;
1392 Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));
1393
1394 std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1395 uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1396 for (unsigned i = 0; i < num; i++) {
1397 unsigned packed_size = use_s_pack ? 16 : 32;
1398 unsigned idx = i * instr->def.bit_size / packed_size;
1399 unsigned offset = i * instr->def.bit_size % packed_size;
1400 if (nir_src_is_const(instr->src[i].src)) {
1401 const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1402 continue;
1403 }
1404 if (nir_src_is_undef(instr->src[i].src))
1405 continue;
1406
1407 if (offset != packed_size - instr->def.bit_size)
1408 elems[i] =
1409 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1410
1411 if (offset)
1412 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1413 Operand::c32(offset));
1414
1415 if (packed[idx].id())
1416 packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1417 packed[idx]);
1418 else
1419 packed[idx] = elems[i];
1420 }
1421
1422 if (use_s_pack) {
1423 for (unsigned i = 0; i < dst.size(); i++) {
1424 bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1425
1426 if (packed[i * 2].id() && packed[i * 2 + 1].id())
1427 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1428 packed[i * 2 + 1]);
1429 else if (packed[i * 2 + 1].id())
1430 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1431 Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1432 else if (packed[i * 2].id())
1433 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1434 Operand::c32(const_vals[i * 2 + 1]));
1435 else
1436 packed[i] = Temp(); /* Both constants, so reset the entry */
1437
1438 if (same)
1439 const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1440 else
1441 const_vals[i] = 0;
1442 }
1443 }
1444
1445 for (unsigned i = 0; i < dst.size(); i++) {
1446 if (const_vals[i] && packed[i].id())
1447 packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1448 Operand::c32(const_vals[i]), packed[i]);
1449 else if (!packed[i].id())
1450 packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1451 }
1452
1453 if (dst.size() == 1)
1454 bld.copy(Definition(dst), packed[0]);
1455 else {
1456 aco_ptr<Instruction> vec{
1457 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
1458 vec->definitions[0] = Definition(dst);
1459 for (unsigned i = 0; i < dst.size(); ++i)
1460 vec->operands[i] = Operand(packed[i]);
1461 bld.insert(std::move(vec));
1462 }
1463 }
1464 break;
1465 }
1466 case nir_op_mov: {
1467 Temp src = get_alu_src(ctx, instr->src[0]);
1468 if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1469 /* use size() instead of bytes() for 8/16-bit */
1470 assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1471 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1472 } else {
1473 assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1474 bld.copy(Definition(dst), src);
1475 }
1476 break;
1477 }
1478 case nir_op_inot: {
1479 Temp src = get_alu_src(ctx, instr->src[0]);
1480 if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1481 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1482 } else if (dst.regClass() == v2) {
1483 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1484 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1485 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1486 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1487 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1488 } else if (dst.type() == RegType::sgpr) {
1489 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1490 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1491 } else {
1492 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1493 }
1494 break;
1495 }
1496 case nir_op_iabs: {
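/* |x| is computed as max(x, 0 - x); the packed 16-bit path below does this per
 * lane, using opsel bits to select the swizzled halves of the source. */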
1497 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1498 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
1499
1500 unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
1501 unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;
1502
1503 Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
1504 src, opsel_lo, opsel_hi);
1505 bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
1506 emit_split_vector(ctx, dst, 2);
1507 break;
1508 }
1509 Temp src = get_alu_src(ctx, instr->src[0]);
1510 if (dst.regClass() == s1) {
1511 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1512 } else if (dst.regClass() == v1) {
1513 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1514 bld.vsub32(bld.def(v1), Operand::zero(), src));
1515 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1516 bld.vop3(
1517 aco_opcode::v_max_i16_e64, Definition(dst), src,
1518 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1519 } else if (dst.regClass() == v2b) {
1520 src = as_vgpr(ctx, src);
1521 bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
1522 bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1523 } else {
1524 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1525 }
1526 break;
1527 }
1528 case nir_op_isign: {
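/* For integers, sign(x) == clamp(x, -1, 1): a min/max pair on SALU, a single
 * med3 on VALU, and a shift/compare combination for the 64-bit cases. */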
1529 Temp src = get_alu_src(ctx, instr->src[0]);
1530 if (dst.regClass() == s1) {
1531 Temp tmp =
1532 bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1533 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1534 } else if (dst.regClass() == s2) {
1535 Temp neg =
1536 bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1537 Temp neqz;
1538 if (ctx->program->gfx_level >= GFX8)
1539 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1540 else
1541 neqz =
1542 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1543 .def(1)
1544 .getTemp();
1545 /* SCC gets zero-extended to 64 bit */
1546 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1547 } else if (dst.regClass() == v1) {
1548 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1549 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
1550 bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
1551 } else if (dst.regClass() == v2b) {
1552 src = as_vgpr(ctx, src);
1553 bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
1554 bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
1555 } else if (dst.regClass() == v2) {
1556 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1557 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1558 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
1559 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1560 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1561 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1562 } else {
1563 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1564 }
1565 break;
1566 }
1567 case nir_op_imax: {
1568 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1569 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1570 } else if (dst.regClass() == v2b) {
1571 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1572 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1573 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1574 } else if (dst.regClass() == v1) {
1575 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1576 } else if (dst.regClass() == s1) {
1577 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1578 } else {
1579 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1580 }
1581 break;
1582 }
1583 case nir_op_umax: {
1584 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1585 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1586 } else if (dst.regClass() == v2b) {
1587 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1588 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1589 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1590 } else if (dst.regClass() == v1) {
1591 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1592 } else if (dst.regClass() == s1) {
1593 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1594 } else {
1595 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1596 }
1597 break;
1598 }
1599 case nir_op_imin: {
1600 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1601 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1602 } else if (dst.regClass() == v2b) {
1603 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1604 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1605 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1606 } else if (dst.regClass() == v1) {
1607 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1608 } else if (dst.regClass() == s1) {
1609 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1610 } else {
1611 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1612 }
1613 break;
1614 }
1615 case nir_op_umin: {
1616 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1617 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1618 } else if (dst.regClass() == v2b) {
1619 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1620 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1621 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1622 } else if (dst.regClass() == v1) {
1623 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1624 } else if (dst.regClass() == s1) {
1625 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1626 } else {
1627 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1628 }
1629 break;
1630 }
1631 case nir_op_ior: {
1632 if (instr->def.bit_size == 1) {
1633 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1634 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1635 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1636 } else if (dst.regClass() == v2) {
1637 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1638 } else if (dst.regClass() == s1) {
1639 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1640 } else if (dst.regClass() == s2) {
1641 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1642 } else {
1643 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1644 }
1645 break;
1646 }
1647 case nir_op_iand: {
1648 if (instr->def.bit_size == 1) {
1649 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1650 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1651 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1652 } else if (dst.regClass() == v2) {
1653 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1654 } else if (dst.regClass() == s1) {
1655 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1656 } else if (dst.regClass() == s2) {
1657 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1658 } else {
1659 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1660 }
1661 break;
1662 }
1663 case nir_op_ixor: {
1664 if (instr->def.bit_size == 1) {
1665 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1666 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1667 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1668 } else if (dst.regClass() == v2) {
1669 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1670 } else if (dst.regClass() == s1) {
1671 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1672 } else if (dst.regClass() == s2) {
1673 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1674 } else {
1675 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1676 }
1677 break;
1678 }
1679 case nir_op_ushr: {
1680 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1681 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1682 } else if (dst.regClass() == v2b) {
1683 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1684 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1685 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1686 } else if (dst.regClass() == v1) {
1687 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1688 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1689 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1690 get_alu_src(ctx, instr->src[0]));
1691 } else if (dst.regClass() == v2) {
1692 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1693 } else if (dst.regClass() == s2) {
1694 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1695 } else if (dst.regClass() == s1) {
1696 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1697 } else {
1698 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1699 }
1700 break;
1701 }
1702 case nir_op_ishl: {
1703 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1704 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1705 } else if (dst.regClass() == v2b) {
1706 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1707 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1708 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1709 } else if (dst.regClass() == v1) {
1710 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1711 false, 1);
1712 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1713 bld.vop3(aco_opcode::v_lshlrev_b64_e64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1714 get_alu_src(ctx, instr->src[0]));
1715 } else if (dst.regClass() == v2) {
1716 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1717 } else if (dst.regClass() == s1) {
1718 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1719 } else if (dst.regClass() == s2) {
1720 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1721 } else {
1722 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1723 }
1724 break;
1725 }
1726 case nir_op_ishr: {
1727 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1728 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1729 } else if (dst.regClass() == v2b) {
1730 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1731 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1732 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1733 } else if (dst.regClass() == v1) {
1734 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1735 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1736 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1737 get_alu_src(ctx, instr->src[0]));
1738 } else if (dst.regClass() == v2) {
1739 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1740 } else if (dst.regClass() == s1) {
1741 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1742 } else if (dst.regClass() == s2) {
1743 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1744 } else {
1745 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1746 }
1747 break;
1748 }
1749 case nir_op_find_lsb: {
1750 Temp src = get_alu_src(ctx, instr->src[0]);
1751 if (src.regClass() == s1) {
1752 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1753 } else if (src.regClass() == v1) {
1754 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1755 } else if (src.regClass() == s2) {
1756 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1757 } else if (src.regClass() == v2) {
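/* 64-bit: find the first set bit in each half, bias the high half by 32 and
 * take the unsigned minimum. v_ffbl returns -1 (all ones) when no bit is set,
 * which survives both the OR and the min. */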
1758 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1759 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1760 lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo);
1761 hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi);
1762 hi = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32u), hi);
1763 bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi);
1764 } else {
1765 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1766 }
1767 break;
1768 }
1769 case nir_op_ufind_msb:
1770 case nir_op_ifind_msb: {
1771 Temp src = get_alu_src(ctx, instr->src[0]);
1772 if (src.regClass() == s1 || src.regClass() == s2) {
1773 aco_opcode op = src.regClass() == s2
1774 ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1775 : aco_opcode::s_flbit_i32_i64)
1776 : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1777 : aco_opcode::s_flbit_i32);
1778 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
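/* s_flbit counts from the MSB; convert to an LSB-based index by subtracting
 * from 31 (or 63). When no bit is found it returns -1, the subtraction borrows,
 * and the cselect below yields -1. */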
1779
1780 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1781 Operand::c32(src.size() * 32u - 1u), msb_rev);
1782 Temp msb = sub.def(0).getTemp();
1783 Temp carry = sub.def(1).getTemp();
1784
1785 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1786 bld.scc(carry));
1787 } else if (src.regClass() == v1) {
1788 aco_opcode op =
1789 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1790 Temp msb_rev = bld.tmp(v1);
1791 emit_vop1_instruction(ctx, instr, op, msb_rev);
1792 Temp msb = bld.tmp(v1);
1793 Temp carry =
1794 bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1795 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1796 } else if (src.regClass() == v2) {
1797 aco_opcode op =
1798 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1799
1800 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1801 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1802
1803 lo = bld.vop1(op, bld.def(v1), lo);
1804 lo = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32), lo);
1805 hi = bld.vop1(op, bld.def(v1), hi);
1806 Temp msb_rev = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), lo, hi);
1807
1808 Temp msb = bld.tmp(v1);
1809 Temp carry =
1810 bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1811 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1812 } else {
1813 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1814 }
1815 break;
1816 }
1817 case nir_op_ufind_msb_rev:
1818 case nir_op_ifind_msb_rev: {
1819 Temp src = get_alu_src(ctx, instr->src[0]);
1820 if (src.regClass() == s1) {
1821 aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
1822 : aco_opcode::s_flbit_i32;
1823 bld.sop1(op, Definition(dst), src);
1824 } else if (src.regClass() == v1) {
1825 aco_opcode op =
1826 instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1827 emit_vop1_instruction(ctx, instr, op, dst);
1828 } else {
1829 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1830 }
1831 break;
1832 }
1833 case nir_op_bitfield_reverse: {
1834 if (dst.regClass() == s1) {
1835 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1836 } else if (dst.regClass() == v1) {
1837 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1838 } else {
1839 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1840 }
1841 break;
1842 }
1843 case nir_op_iadd: {
1844 if (dst.regClass() == s1) {
1845 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1846 break;
1847 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
1848 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1849 break;
1850 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
1851 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1852 break;
1853 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1854 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1855 break;
1856 }
1857
1858 Temp src0 = get_alu_src(ctx, instr->src[0]);
1859 Temp src1 = get_alu_src(ctx, instr->src[1]);
1860 if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1861 if (instr->no_unsigned_wrap)
1862 bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
1863 else
1864 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1865 break;
1866 }
1867
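/* 64-bit addition: split both sources, add the low halves, then propagate the
 * carry into the high halves (s_addc_u32 on SALU, vadd32 with carry on VALU). */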
1868 assert(src0.size() == 2 && src1.size() == 2);
1869 Temp src00 = bld.tmp(src0.type(), 1);
1870 Temp src01 = bld.tmp(dst.type(), 1);
1871 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1872 Temp src10 = bld.tmp(src1.type(), 1);
1873 Temp src11 = bld.tmp(dst.type(), 1);
1874 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1875
1876 if (dst.regClass() == s2) {
1877 Temp carry = bld.tmp(s1);
1878 Temp dst0 =
1879 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1880 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1881 bld.scc(carry));
1882 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1883 } else if (dst.regClass() == v2) {
1884 Temp dst0 = bld.tmp(v1);
1885 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1886 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1887 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1888 } else {
1889 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1890 }
1891 break;
1892 }
1893 case nir_op_uadd_sat: {
1894 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1895 Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1896 add_instr->valu().clamp = 1;
1897 break;
1898 }
1899 Temp src0 = get_alu_src(ctx, instr->src[0]);
1900 Temp src1 = get_alu_src(ctx, instr->src[1]);
1901 if (dst.regClass() == s1) {
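/* Unsigned saturation: if the addition carries out, select UINT32_MAX (-1). */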
1902 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1903 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1904 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1905 bld.scc(carry));
1906 break;
1907 } else if (dst.regClass() == v2b) {
1908 Instruction* add_instr;
1909 if (ctx->program->gfx_level >= GFX10) {
1910 add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1911 } else {
1912 if (src1.type() == RegType::sgpr)
1913 std::swap(src0, src1);
1914 add_instr =
1915 bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1916 }
1917 add_instr->valu().clamp = 1;
1918 break;
1919 } else if (dst.regClass() == v1) {
1920 uadd32_sat(bld, Definition(dst), src0, src1);
1921 break;
1922 }
1923
1924 assert(src0.size() == 2 && src1.size() == 2);
1925
1926 Temp src00 = bld.tmp(src0.type(), 1);
1927 Temp src01 = bld.tmp(src0.type(), 1);
1928 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1929 Temp src10 = bld.tmp(src1.type(), 1);
1930 Temp src11 = bld.tmp(src1.type(), 1);
1931 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1932
1933 if (dst.regClass() == s2) {
1934 Temp carry0 = bld.tmp(s1);
1935 Temp carry1 = bld.tmp(s1);
1936
1937 Temp no_sat0 =
1938 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
1939 Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
1940 src01, src11, bld.scc(carry0));
1941
1942 Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
1943
1944 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
1945 bld.scc(carry1));
1946 } else if (dst.regClass() == v2) {
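/* The high half saturates via the clamp bit (or a cndmask pre-GFX8); if it
 * carries out, the whole 64-bit result overflowed, so the low half is forced
 * to UINT32_MAX as well. */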
1947 Temp no_sat0 = bld.tmp(v1);
1948 Temp dst0 = bld.tmp(v1);
1949 Temp dst1 = bld.tmp(v1);
1950
1951 Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
1952 Temp carry1;
1953
1954 if (ctx->program->gfx_level >= GFX8) {
1955 carry1 = bld.tmp(bld.lm);
1956 bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
1957 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
1958 ->valu()
1959 .clamp = 1;
1960 } else {
1961 Temp no_sat1 = bld.tmp(v1);
1962 carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
1963 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
1964 carry1);
1965 }
1966
1967 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
1968 carry1);
1969 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1970 } else {
1971 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1972 }
1973 break;
1974 }
1975 case nir_op_iadd_sat: {
1976 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1977 Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
1978 add_instr->valu().clamp = 1;
1979 break;
1980 }
1981 Temp src0 = get_alu_src(ctx, instr->src[0]);
1982 Temp src1 = get_alu_src(ctx, instr->src[1]);
1983 if (dst.regClass() == s1) {
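/* bound is INT32_MAX when src1 >= 0 and wraps to INT32_MIN when src1 < 0; it
 * replaces the sum if the signed addition overflows. */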
1984 Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
1985 Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
1986 Operand::c32(INT32_MAX), cond);
1987 Temp overflow = bld.tmp(s1);
1988 Temp add =
1989 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
1990 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
1991 break;
1992 }
1993
1994 src1 = as_vgpr(ctx, src1);
1995
1996 if (dst.regClass() == v2b) {
1997 Instruction* add_instr =
1998 bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
1999 add_instr->valu().clamp = 1;
2000 } else if (dst.regClass() == v1) {
2001 Instruction* add_instr =
2002 bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
2003 add_instr->valu().clamp = 1;
2004 } else {
2005 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2006 }
2007 break;
2008 }
2009 case nir_op_uadd_carry: {
2010 Temp src0 = get_alu_src(ctx, instr->src[0]);
2011 Temp src1 = get_alu_src(ctx, instr->src[1]);
2012 if (dst.regClass() == s1) {
2013 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2014 break;
2015 }
2016 if (dst.regClass() == v1) {
2017 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
2018 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2019 carry);
2020 break;
2021 }
2022
2023 Temp src00 = bld.tmp(src0.type(), 1);
2024 Temp src01 = bld.tmp(dst.type(), 1);
2025 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2026 Temp src10 = bld.tmp(src1.type(), 1);
2027 Temp src11 = bld.tmp(dst.type(), 1);
2028 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2029 if (dst.regClass() == s2) {
2030 Temp carry = bld.tmp(s1);
2031 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
2032 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2033 bld.scc(carry))
2034 .def(1)
2035 .getTemp();
2036 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2037 } else if (dst.regClass() == v2) {
2038 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
2039 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
2040 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2041 Operand::c32(1u), carry);
2042 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2043 } else {
2044 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2045 }
2046 break;
2047 }
2048 case nir_op_isub: {
2049 if (dst.regClass() == s1) {
2050 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
2051 break;
2052 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2053 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2054 break;
2055 }
2056
2057 Temp src0 = get_alu_src(ctx, instr->src[0]);
2058 Temp src1 = get_alu_src(ctx, instr->src[1]);
2059 if (dst.regClass() == v1) {
2060 bld.vsub32(Definition(dst), src0, src1);
2061 break;
2062 } else if (dst.bytes() <= 2) {
2063 if (ctx->program->gfx_level >= GFX10)
2064 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
2065 else if (src1.type() == RegType::sgpr)
2066 bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
2067 else if (ctx->program->gfx_level >= GFX8)
2068 bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
2069 else
2070 bld.vsub32(Definition(dst), src0, src1);
2071 break;
2072 }
2073
2074 Temp src00 = bld.tmp(src0.type(), 1);
2075 Temp src01 = bld.tmp(dst.type(), 1);
2076 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2077 Temp src10 = bld.tmp(src1.type(), 1);
2078 Temp src11 = bld.tmp(dst.type(), 1);
2079 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2080 if (dst.regClass() == s2) {
2081 Temp borrow = bld.tmp(s1);
2082 Temp dst0 =
2083 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2084 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
2085 bld.scc(borrow));
2086 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2087 } else if (dst.regClass() == v2) {
2088 Temp lower = bld.tmp(v1);
2089 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
2090 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
2091 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2092 } else {
2093 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2094 }
2095 break;
2096 }
2097 case nir_op_usub_borrow: {
2098 Temp src0 = get_alu_src(ctx, instr->src[0]);
2099 Temp src1 = get_alu_src(ctx, instr->src[1]);
2100 if (dst.regClass() == s1) {
2101 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2102 break;
2103 } else if (dst.regClass() == v1) {
2104 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
2105 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2106 borrow);
2107 break;
2108 }
2109
2110 Temp src00 = bld.tmp(src0.type(), 1);
2111 Temp src01 = bld.tmp(dst.type(), 1);
2112 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2113 Temp src10 = bld.tmp(src1.type(), 1);
2114 Temp src11 = bld.tmp(dst.type(), 1);
2115 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2116 if (dst.regClass() == s2) {
2117 Temp borrow = bld.tmp(s1);
2118 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2119 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2120 bld.scc(borrow))
2121 .def(1)
2122 .getTemp();
2123 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2124 } else if (dst.regClass() == v2) {
2125 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
2126 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
2127 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2128 Operand::c32(1u), borrow);
2129 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2130 } else {
2131 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2132 }
2133 break;
2134 }
2135 case nir_op_usub_sat: {
2136 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2137 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2138 sub_instr->valu().clamp = 1;
2139 break;
2140 }
2141 Temp src0 = get_alu_src(ctx, instr->src[0]);
2142 Temp src1 = get_alu_src(ctx, instr->src[1]);
2143 if (dst.regClass() == s1) {
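/* Unsigned saturation: if the subtraction borrows, clamp the result to 0. */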
2144 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
2145 bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
2146 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
2147 break;
2148 } else if (dst.regClass() == v2b) {
2149 Instruction* sub_instr;
2150 if (ctx->program->gfx_level >= GFX10) {
2151 sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
2152 } else {
2153 aco_opcode op = aco_opcode::v_sub_u16;
2154 if (src1.type() == RegType::sgpr) {
2155 std::swap(src0, src1);
2156 op = aco_opcode::v_subrev_u16;
2157 }
2158 sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
2159 }
2160 sub_instr->valu().clamp = 1;
2161 break;
2162 } else if (dst.regClass() == v1) {
2163 usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
2164 break;
2165 }
2166
2167 assert(src0.size() == 2 && src1.size() == 2);
2168 Temp src00 = bld.tmp(src0.type(), 1);
2169 Temp src01 = bld.tmp(src0.type(), 1);
2170 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2171 Temp src10 = bld.tmp(src1.type(), 1);
2172 Temp src11 = bld.tmp(src1.type(), 1);
2173 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2174
2175 if (dst.regClass() == s2) {
2176 Temp carry0 = bld.tmp(s1);
2177 Temp carry1 = bld.tmp(s1);
2178
2179 Temp no_sat0 =
2180 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
2181 Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
2182 src01, src11, bld.scc(carry0));
2183
2184 Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
2185
2186 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
2187 bld.scc(carry1));
2188 } else if (dst.regClass() == v2) {
2189 Temp no_sat0 = bld.tmp(v1);
2190 Temp dst0 = bld.tmp(v1);
2191 Temp dst1 = bld.tmp(v1);
2192
2193 Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
2194 Temp carry1;
2195
2196 if (ctx->program->gfx_level >= GFX8) {
2197 carry1 = bld.tmp(bld.lm);
2198 bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
2199 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
2200 ->valu()
2201 .clamp = 1;
2202 } else {
2203 Temp no_sat1 = bld.tmp(v1);
2204 carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
2205 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
2206 carry1);
2207 }
2208
2209 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
2210 carry1);
2211 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2212 } else {
2213 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2214 }
2215 break;
2216 }
2217 case nir_op_isub_sat: {
2218 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2219 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
2220 sub_instr->valu().clamp = 1;
2221 break;
2222 }
2223 Temp src0 = get_alu_src(ctx, instr->src[0]);
2224 Temp src1 = get_alu_src(ctx, instr->src[1]);
2225 if (dst.regClass() == s1) {
2226 Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
2227 Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
2228 Operand::c32(INT32_MAX), cond);
2229 Temp overflow = bld.tmp(s1);
2230 Temp sub =
2231 bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
2232 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
2233 break;
2234 }
2235
2236 src1 = as_vgpr(ctx, src1);
2237
2238 if (dst.regClass() == v2b) {
2239 Instruction* sub_instr =
2240 bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
2241 sub_instr->valu().clamp = 1;
2242 } else if (dst.regClass() == v1) {
2243 Instruction* sub_instr =
2244 bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
2245 sub_instr->valu().clamp = 1;
2246 } else {
2247 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2248 }
2249 break;
2250 }
2251 case nir_op_imul: {
2252 if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
2253 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
2254 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
2255 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
2256 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2257 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
2258 } else if (dst.type() == RegType::vgpr) {
2259 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2260 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2261
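/* When both sources are known to fit in 24 bits, v_mul_u32_u24 produces the
 * same low 32 bits as a full 32-bit multiply but is cheaper. */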
2262 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2263 bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
2264 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
2265 true /* commutative */, false, false, nuw_16bit, 0x3);
2266 } else if (nir_src_is_const(instr->src[0].src)) {
2267 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
2268 nir_src_as_uint(instr->src[0].src), false);
2269 } else if (nir_src_is_const(instr->src[1].src)) {
2270 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
2271 nir_src_as_uint(instr->src[1].src), false);
2272 } else {
2273 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
2274 }
2275 } else if (dst.regClass() == s1) {
2276 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
2277 } else {
2278 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2279 }
2280 break;
2281 }
2282 case nir_op_umul_high: {
2283 if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2284 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
2285 } else if (dst.bytes() == 4) {
2286 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2287 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2288
2289 Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
2290 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2291 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2292 } else {
2293 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2294 }
2295
2296 if (dst.regClass() == s1)
2297 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2298 } else {
2299 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2300 }
2301 break;
2302 }
2303 case nir_op_imul_high: {
2304 if (dst.regClass() == v1) {
2305 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2306 } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2307 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2308 } else if (dst.regClass() == s1) {
2309 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2310 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2311 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2312 } else {
2313 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2314 }
2315 break;
2316 }
2317 case nir_op_fmul: {
2318 if (dst.regClass() == v2b) {
2319 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2320 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2321 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2322 } else if (dst.regClass() == v1) {
2323 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2324 } else if (dst.regClass() == v2) {
2325 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64_e64, dst);
2326 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2327 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f16, dst, false);
2328 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2329 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f32, dst, false);
2330 } else {
2331 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2332 }
2333 break;
2334 }
2335 case nir_op_fmulz: {
2336 if (dst.regClass() == v1) {
2337 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2338 } else {
2339 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2340 }
2341 break;
2342 }
2343 case nir_op_fadd: {
2344 if (dst.regClass() == v2b) {
2345 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2346 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2347 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2348 } else if (dst.regClass() == v1) {
2349 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2350 } else if (dst.regClass() == v2) {
2351 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64_e64, dst);
2352 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2353 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f16, dst, false);
2354 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2355 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f32, dst, false);
2356 } else {
2357 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2358 }
2359 break;
2360 }
2361 case nir_op_fsub: {
2362 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2363 Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2364 VALU_instruction& sub = add->valu();
2365 sub.neg_lo[1] = true;
2366 sub.neg_hi[1] = true;
2367 break;
2368 }
2369
2370 Temp src0 = get_alu_src(ctx, instr->src[0]);
2371 Temp src1 = get_alu_src(ctx, instr->src[1]);
2372 if (dst.regClass() == v2b) {
2373 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2374 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2375 else
2376 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2377 } else if (dst.regClass() == v1) {
2378 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2379 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2380 else
2381 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2382 } else if (dst.regClass() == v2) {
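/* 64-bit a - b is emitted as a + b with the neg modifier set on the second source. */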
2383 Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), as_vgpr(ctx, src0),
2384 as_vgpr(ctx, src1));
2385 add->valu().neg[1] = true;
2386 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2387 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f16, dst, false);
2388 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2389 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f32, dst, false);
2390 } else {
2391 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2392 }
2393 break;
2394 }
2395 case nir_op_ffma: {
2396 if (dst.regClass() == v2b) {
2397 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2398 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2399 assert(instr->def.num_components == 2);
2400
2401 Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2402 Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2403 Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2404
2405 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
2406 unsigned opsel_lo = 0, opsel_hi = 0;
2407 for (unsigned i = 0; i < 3; i++) {
2408 opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2409 opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2410 }
2411
2412 bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2413 emit_split_vector(ctx, dst, 2);
2414 } else if (dst.regClass() == v1) {
2415 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2416 ctx->block->fp_mode.must_flush_denorms32, 3);
2417 } else if (dst.regClass() == v2) {
2418 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2419 } else if (dst.regClass() == s1) {
2420 Temp src0 = get_alu_src(ctx, instr->src[0]);
2421 Temp src1 = get_alu_src(ctx, instr->src[1]);
2422 Temp src2 = get_alu_src(ctx, instr->src[2]);
2423 aco_opcode op =
2424 instr->def.bit_size == 16 ? aco_opcode::s_fmac_f16 : aco_opcode::s_fmac_f32;
2425 bld.sop2(op, Definition(dst), src0, src1, src2);
2426 } else {
2427 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2428 }
2429 break;
2430 }
2431 case nir_op_ffmaz: {
2432 if (dst.regClass() == v1) {
2433 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2434 ctx->block->fp_mode.must_flush_denorms32, 3);
2435 } else {
2436 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2437 }
2438 break;
2439 }
2440 case nir_op_fmax: {
2441 if (dst.regClass() == v2b) {
2442 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true, false,
2443 ctx->block->fp_mode.must_flush_denorms16_64);
2444 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2445 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2446 } else if (dst.regClass() == v1) {
2447 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2448 ctx->block->fp_mode.must_flush_denorms32);
2449 } else if (dst.regClass() == v2) {
2450 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64_e64, dst,
2451 ctx->block->fp_mode.must_flush_denorms16_64);
2452 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2453 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f16, dst, false);
2454 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2455 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f32, dst, false);
2456 } else {
2457 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2458 }
2459 break;
2460 }
2461 case nir_op_fmin: {
2462 if (dst.regClass() == v2b) {
2463 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true, false,
2464 ctx->block->fp_mode.must_flush_denorms16_64);
2465 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2466 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2467 } else if (dst.regClass() == v1) {
2468 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2469 ctx->block->fp_mode.must_flush_denorms32);
2470 } else if (dst.regClass() == v2) {
2471 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64_e64, dst,
2472 ctx->block->fp_mode.must_flush_denorms16_64);
2473 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2474 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f16, dst, false);
2475 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2476 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f32, dst, false);
2477 } else {
2478 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2479 }
2480 break;
2481 }
2482 case nir_op_sdot_4x8_iadd: {
2483 if (ctx->options->gfx_level >= GFX11)
2484 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2485 else
2486 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2487 break;
2488 }
2489 case nir_op_sdot_4x8_iadd_sat: {
2490 if (ctx->options->gfx_level >= GFX11)
2491 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2492 else
2493 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2494 break;
2495 }
2496 case nir_op_sudot_4x8_iadd: {
2497 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2498 break;
2499 }
2500 case nir_op_sudot_4x8_iadd_sat: {
2501 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2502 break;
2503 }
2504 case nir_op_udot_4x8_uadd: {
2505 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2506 break;
2507 }
2508 case nir_op_udot_4x8_uadd_sat: {
2509 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2510 break;
2511 }
2512 case nir_op_sdot_2x16_iadd: {
2513 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2514 break;
2515 }
2516 case nir_op_sdot_2x16_iadd_sat: {
2517 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2518 break;
2519 }
2520 case nir_op_udot_2x16_uadd: {
2521 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2522 break;
2523 }
2524 case nir_op_udot_2x16_uadd_sat: {
2525 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2526 break;
2527 }
2528 case nir_op_cube_amd: {
2529 Temp in = get_alu_src(ctx, instr->src[0], 3);
2530 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2531 emit_extract_vector(ctx, in, 2, v1)};
2532 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2533 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2534 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2535 Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2536 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2537 break;
2538 }
2539 case nir_op_bcsel: {
2540 emit_bcsel(ctx, instr, dst);
2541 break;
2542 }
2543 case nir_op_frsq: {
2544 if (instr->def.bit_size == 16) {
2545 if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2546 bld.vop3(aco_opcode::v_s_rsq_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2547 else
2548 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2549 } else if (instr->def.bit_size == 32) {
2550 emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2551 } else if (instr->def.bit_size == 64) {
2552 /* Lowered at NIR level for precision reasons. */
2553 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2554 } else {
2555 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2556 }
2557 break;
2558 }
2559 case nir_op_fneg: {
2560 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2561 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2562 Instruction* vop3p =
2563 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2564 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2565 vop3p->valu().neg_lo[0] = true;
2566 vop3p->valu().neg_hi[0] = true;
2567 emit_split_vector(ctx, dst, 2);
2568 break;
2569 }
2570 Temp src = get_alu_src(ctx, instr->src[0]);
2571 if (dst.regClass() == v2b) {
2572 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2573 } else if (dst.regClass() == v1) {
2574 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2575 as_vgpr(ctx, src));
2576 } else if (dst.regClass() == v2) {
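/* 64-bit negation just flips the sign bit of the high dword; multiply by 1.0
 * first if the mode requires flushing denormals. */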
2577 if (ctx->block->fp_mode.must_flush_denorms16_64)
2578 src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2579 as_vgpr(ctx, src));
2580 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2581 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2582 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2583 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2584 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2585 bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0xbc00u), src);
2586 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2587 bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0xbf800000u), src);
2588 } else {
2589 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2590 }
2591 break;
2592 }
2593 case nir_op_fabs: {
2594 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2595 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2596 Instruction* vop3p =
2597 bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2598 instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2599 .instr;
2600 vop3p->valu().neg_lo[1] = true;
2601 vop3p->valu().neg_hi[1] = true;
2602 emit_split_vector(ctx, dst, 2);
2603 break;
2604 }
2605 Temp src = get_alu_src(ctx, instr->src[0]);
2606 if (dst.regClass() == v2b) {
2607 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2608 Operand::c16(0x3c00), as_vgpr(ctx, src))
2609 .instr;
2610 mul->valu().abs[1] = true;
2611 } else if (dst.regClass() == v1) {
2612 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2613 Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2614 .instr;
2615 mul->valu().abs[1] = true;
2616 } else if (dst.regClass() == v2) {
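/* 64-bit |x| clears the sign bit of the high dword, again flushing denormals
 * via a multiply by 1.0 when required. */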
2617 if (ctx->block->fp_mode.must_flush_denorms16_64)
2618 src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2619 as_vgpr(ctx, src));
2620 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2621 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2622 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2623 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2624 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2625 Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fff));
2626 if (ctx->block->fp_mode.denorm16_64 == fp_denorm_keep) {
2627 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src);
2628 } else {
2629 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src);
2630 bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
2631 }
2632 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2633 Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff));
2634 if (ctx->block->fp_mode.denorm32 == fp_denorm_keep) {
2635 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src);
2636 } else {
2637 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src);
2638 bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0x3f800000), tmp);
2639 }
2640 } else {
2641 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2642 }
2643 break;
2644 }
2645 case nir_op_fsat: {
2646 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2647 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2648 Instruction* vop3p =
2649 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2650 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2651 vop3p->valu().clamp = true;
2652 emit_split_vector(ctx, dst, 2);
2653 break;
2654 }
2655 Temp src = get_alu_src(ctx, instr->src[0]);
2656 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
2657 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2658 src);
2659 } else if (dst.regClass() == v2b) {
2660 bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src)
2661 ->valu()
2662 .clamp = true;
2663 } else if (dst.regClass() == v1) {
2664 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2665 Operand::c32(0x3f800000u), src);
2666 /* apparently, it is not necessary to flush denorms if this instruction is used with these
2667 * operands */
2668 // TODO: confirm that this holds under any circumstances
2669 } else if (dst.regClass() == v2) {
2670 Instruction* add =
2671 bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src, Operand::zero());
2672 add->valu().clamp = true;
2673 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2674 Temp low = bld.sop2(aco_opcode::s_max_f16, bld.def(s1), src, Operand::c16(0));
2675 bld.sop2(aco_opcode::s_min_f16, Definition(dst), low, Operand::c16(0x3C00));
2676 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2677 Temp low = bld.sop2(aco_opcode::s_max_f32, bld.def(s1), src, Operand::c32(0));
2678 bld.sop2(aco_opcode::s_min_f32, Definition(dst), low, Operand::c32(0x3f800000));
2679 } else {
2680 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2681 }
2682 break;
2683 }
2684 case nir_op_flog2: {
2685 if (instr->def.bit_size == 16) {
2686 if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2687 bld.vop3(aco_opcode::v_s_log_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2688 else
2689 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2690 } else if (instr->def.bit_size == 32) {
2691 emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2692 } else {
2693 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2694 }
2695 break;
2696 }
2697 case nir_op_frcp: {
2698 if (instr->def.bit_size == 16) {
2699 if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2700 bld.vop3(aco_opcode::v_s_rcp_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2701 else
2702 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2703 } else if (instr->def.bit_size == 32) {
2704 emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2705 } else if (instr->def.bit_size == 64) {
2706 /* Lowered at NIR level for precision reasons. */
2707 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2708 } else {
2709 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2710 }
2711 break;
2712 }
2713 case nir_op_fexp2: {
2714 if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX12) {
2715 aco_opcode opcode =
2716 instr->def.bit_size == 16 ? aco_opcode::v_s_exp_f16 : aco_opcode::v_s_exp_f32;
2717 bld.vop3(opcode, Definition(dst), get_alu_src(ctx, instr->src[0]));
2718 } else if (instr->def.bit_size == 16) {
2719 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2720 } else if (instr->def.bit_size == 32) {
2721 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2722 } else {
2723 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2724 }
2725 break;
2726 }
2727 case nir_op_fsqrt: {
2728 if (instr->def.bit_size == 16) {
2729 if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2730 bld.vop3(aco_opcode::v_s_sqrt_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2731 else
2732 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2733 } else if (instr->def.bit_size == 32) {
2734 emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2735 } else if (instr->def.bit_size == 64) {
2736 /* Lowered at NIR level for precision reasons. */
2737 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2738 } else {
2739 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2740 }
2741 break;
2742 }
2743 case nir_op_ffract: {
2744 if (dst.regClass() == v2b) {
2745 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2746 } else if (dst.regClass() == v1) {
2747 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2748 } else if (dst.regClass() == v2) {
2749 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2750 } else if (dst.regClass() == s1) {
2751 Temp src = get_alu_src(ctx, instr->src[0]);
2752 aco_opcode op =
2753 instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32;
2754 Temp floor = bld.sop1(op, bld.def(s1), src);
2755 op = instr->def.bit_size == 16 ? aco_opcode::s_sub_f16 : aco_opcode::s_sub_f32;
2756 bld.sop2(op, Definition(dst), src, floor);
2757 } else {
2758 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2759 }
2760 break;
2761 }
2762 case nir_op_ffloor: {
2763 if (dst.regClass() == v2b) {
2764 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2765 } else if (dst.regClass() == v1) {
2766 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2767 } else if (dst.regClass() == v2) {
2768 Temp src = get_alu_src(ctx, instr->src[0]);
2769 emit_floor_f64(ctx, bld, Definition(dst), src);
2770 } else if (dst.regClass() == s1) {
2771 Temp src = get_alu_src(ctx, instr->src[0]);
2772 aco_opcode op =
2773 instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32;
2774 bld.sop1(op, Definition(dst), src);
2775 } else {
2776 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2777 }
2778 break;
2779 }
2780 case nir_op_fceil: {
2781 if (dst.regClass() == v2b) {
2782 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2783 } else if (dst.regClass() == v1) {
2784 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2785 } else if (dst.regClass() == v2) {
2786 if (ctx->options->gfx_level >= GFX7) {
2787 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2788 } else {
2789 /* GFX6 doesn't support V_CEIL_F64, lower it. */
2790 /* trunc = trunc(src0)
2791 * if (src0 > 0.0 && src0 != trunc)
2792 * trunc += 1.0
2793 */
2794 Temp src0 = get_alu_src(ctx, instr->src[0]);
2795 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2796 Temp tmp0 =
2797 bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2798 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2799 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
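            /* Select the high dword of either 1.0 (0x3ff00000) or 0.0; the low dword of the
             * increment is always zero. */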
2800 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2801 bld.copy(bld.def(v1), Operand::zero()),
2802 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2803 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2804 bld.copy(bld.def(v1), Operand::zero()), add);
2805 bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), trunc, add);
2806 }
2807 } else if (dst.regClass() == s1) {
2808 Temp src = get_alu_src(ctx, instr->src[0]);
2809 aco_opcode op =
2810 instr->def.bit_size == 16 ? aco_opcode::s_ceil_f16 : aco_opcode::s_ceil_f32;
2811 bld.sop1(op, Definition(dst), src);
2812 } else {
2813 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2814 }
2815 break;
2816 }
2817 case nir_op_ftrunc: {
2818 if (dst.regClass() == v2b) {
2819 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2820 } else if (dst.regClass() == v1) {
2821 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2822 } else if (dst.regClass() == v2) {
2823 Temp src = get_alu_src(ctx, instr->src[0]);
2824 emit_trunc_f64(ctx, bld, Definition(dst), src);
2825 } else if (dst.regClass() == s1) {
2826 Temp src = get_alu_src(ctx, instr->src[0]);
2827 aco_opcode op =
2828 instr->def.bit_size == 16 ? aco_opcode::s_trunc_f16 : aco_opcode::s_trunc_f32;
2829 bld.sop1(op, Definition(dst), src);
2830 } else {
2831 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2832 }
2833 break;
2834 }
2835 case nir_op_fround_even: {
2836 if (dst.regClass() == v2b) {
2837 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2838 } else if (dst.regClass() == v1) {
2839 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2840 } else if (dst.regClass() == v2) {
2841 if (ctx->options->gfx_level >= GFX7) {
2842 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2843 } else {
2844 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
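            /* Round to nearest even by adding and subtracting 2^52 (high dword 0x43300000):
             * at that magnitude the mantissa has no fraction bits left, so the addition rounds
             * the value to an integer (nearest-even with the default rounding mode). v_bfi
             * copies the sign of the source into the constant so the trick also works for
             * negative inputs, and values with |src| >= 2^52 (already integral) keep their
             * original value via the final cndmask. */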
2845 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2846 Temp src0 = get_alu_src(ctx, instr->src[0]);
2847 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2848
2849 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2850 bld.copy(bld.def(s1), Operand::c32(-2u)));
2851 Temp bfi =
2852 bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2853 bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2854 Temp tmp =
2855 bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), src0,
2856 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2857 Instruction* sub =
2858 bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), tmp,
2859 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2860 sub->valu().neg[1] = true;
2861 tmp = sub->definitions[0].getTemp();
2862
2863 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2864 Operand::c32(0x432fffffu));
2865 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2866 vop3->valu().abs[0] = true;
2867 Temp cond = vop3->definitions[0].getTemp();
2868
2869 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2870 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2871 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2872 as_vgpr(ctx, src0_lo), cond);
2873 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2874 as_vgpr(ctx, src0_hi), cond);
2875
2876 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2877 }
2878 } else if (dst.regClass() == s1) {
2879 Temp src = get_alu_src(ctx, instr->src[0]);
2880 aco_opcode op =
2881 instr->def.bit_size == 16 ? aco_opcode::s_rndne_f16 : aco_opcode::s_rndne_f32;
2882 bld.sop1(op, Definition(dst), src);
2883 } else {
2884 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2885 }
2886 break;
2887 }
2888 case nir_op_fsin_amd:
2889 case nir_op_fcos_amd: {
2890 if (instr->def.bit_size == 16 || instr->def.bit_size == 32) {
2891 bool is_sin = instr->op == nir_op_fsin_amd;
2892 aco_opcode opcode, fract;
2893 RegClass rc;
2894 if (instr->def.bit_size == 16) {
2895 opcode = is_sin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2896 fract = aco_opcode::v_fract_f16;
2897 rc = v2b;
2898 } else {
2899 opcode = is_sin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2900 fract = aco_opcode::v_fract_f32;
2901 rc = v1;
2902 }
2903
2904 Temp src = get_alu_src(ctx, instr->src[0]);
2905 /* before GFX9, v_sin and v_cos had a valid input domain of [-256, +256] */
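         /* v_sin/v_cos take their input in revolutions (period 1.0), so v_fract only brings
          * the argument into the supported range without changing the result. */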
2906 if (ctx->options->gfx_level < GFX9)
2907 src = bld.vop1(fract, bld.def(rc), src);
2908
2909 if (dst.regClass() == rc) {
2910 bld.vop1(opcode, Definition(dst), src);
2911 } else {
2912 Temp tmp = bld.vop1(opcode, bld.def(rc), src);
2913 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2914 }
2915 } else {
2916 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2917 }
2918 break;
2919 }
2920 case nir_op_ldexp: {
2921 if (dst.regClass() == v2b) {
2922 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2923 } else if (dst.regClass() == v1) {
2924 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2925 } else if (dst.regClass() == v2) {
2926 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2927 } else {
2928 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2929 }
2930 break;
2931 }
2932 case nir_op_frexp_sig: {
2933 if (dst.regClass() == v2b) {
2934 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2935 } else if (dst.regClass() == v1) {
2936 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2937 } else if (dst.regClass() == v2) {
2938 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2939 } else {
2940 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2941 }
2942 break;
2943 }
2944 case nir_op_frexp_exp: {
2945 if (instr->src[0].src.ssa->bit_size == 16) {
2946 Temp src = get_alu_src(ctx, instr->src[0]);
2947 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2948 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2949 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2950 } else if (instr->src[0].src.ssa->bit_size == 32) {
2951 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2952 } else if (instr->src[0].src.ssa->bit_size == 64) {
2953 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2954 } else {
2955 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2956 }
2957 break;
2958 }
2959 case nir_op_fsign: {
2960 Temp src = get_alu_src(ctx, instr->src[0]);
2961 if (dst.regClass() == v2b) {
2962 /* replace negative zero with positive zero */
2963 src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), as_vgpr(ctx, src));
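         /* Interpret the canonicalized bit pattern as a signed integer and clamp it to
          * [-1, 1] with med3: negative halves give -1, zero gives 0, positive halves give 1.
          * v_cvt_f16_i16 then turns that into -1.0/0.0/+1.0. */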
2964 if (ctx->program->gfx_level >= GFX9) {
2965 src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src,
2966 Operand::c16(1u));
2967 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2968 } else {
2969 src = convert_int(ctx, bld, src, 16, 32, true);
2970 src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src,
2971 Operand::c32(1u));
2972 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2973 }
2974 } else if (dst.regClass() == v1) {
2975          /* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers become
2976           * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
2977 */
2978 Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
2979 src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, as_vgpr(ctx, src));
2980 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
2981 Operand::c32(0xbf800000));
2982 } else if (dst.regClass() == v2) {
2983 src = as_vgpr(ctx, src);
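         /* Only the high dword needs computing: +1.0 (0x3FF00000) for src > 0.0,
          * -1.0 (0xBFF00000) for src < 0.0, and the source's own high dword for +-0.0 so the
          * zero's sign is preserved. The low dword of the result is always zero. */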
2984 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2985 Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2986 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2987 emit_extract_vector(ctx, src, 1, v1), cond);
2988
2989 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2990 tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2991 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2992
2993 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2994 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2995 Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f16, bld.def(s1, scc), Operand::c16(0), src);
2996 src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3c00), src,
2997 bld.scc(cond));
2998 cond = bld.sopc(aco_opcode::s_cmp_ge_f16, bld.def(s1, scc), src, Operand::c16(0));
2999 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbc00),
3000 bld.scc(cond));
3001 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
3002 Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f32, bld.def(s1, scc), Operand::c32(0), src);
3003 src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3f800000), src,
3004 bld.scc(cond));
3005 cond = bld.sopc(aco_opcode::s_cmp_ge_f32, bld.def(s1, scc), src, Operand::c32(0));
3006 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbf800000),
3007 bld.scc(cond));
3008 } else {
3009 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3010 }
3011 break;
3012 }
3013 case nir_op_f2f16:
3014 case nir_op_f2f16_rtne: {
3015 assert(instr->src[0].src.ssa->bit_size == 32);
3016 if (instr->def.num_components == 2) {
3017 /* Vectorizing f2f16 is only possible with rtz. */
3018 assert(instr->op != nir_op_f2f16_rtne);
3019 assert(ctx->block->fp_mode.round16_64 == fp_round_tz ||
3020 !ctx->block->fp_mode.care_about_round16_64);
3021 emit_vec2_f2f16(ctx, instr, dst);
3022 break;
3023 }
3024 Temp src = get_alu_src(ctx, instr->src[0]);
3025 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) {
3026 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
3027 * keep value numbering and the scheduler simpler.
3028 */
3029 if (dst.regClass() == v2b)
3030 bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src);
3031 else
3032 bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, Definition(dst), src);
3033 } else {
3034 if (dst.regClass() == v2b)
3035 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3036 else
3037 bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3038 }
3039 break;
3040 }
3041 case nir_op_f2f16_rtz: {
3042 assert(instr->src[0].src.ssa->bit_size == 32);
3043 if (instr->def.num_components == 2) {
3044 emit_vec2_f2f16(ctx, instr, dst);
3045 break;
3046 }
3047 Temp src = get_alu_src(ctx, instr->src[0]);
3048 if (ctx->block->fp_mode.round16_64 == fp_round_tz) {
3049 if (dst.regClass() == v2b)
3050 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3051 else
3052 bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3053 } else if (dst.regClass() == s1) {
3054 bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src, Operand::zero());
3055 } else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) {
3056 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
3057 } else {
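         /* The VOP2 form requires a VGPR second source; the corresponding upper half of the
          * packed result is simply not used. */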
3058 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
3059 }
3060 break;
3061 }
3062 case nir_op_f2f32: {
3063 if (dst.regClass() == s1) {
3064 assert(instr->src[0].src.ssa->bit_size == 16);
3065 Temp src = get_alu_src(ctx, instr->src[0]);
3066 bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src);
3067 } else if (instr->src[0].src.ssa->bit_size == 16) {
3068 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
3069 } else if (instr->src[0].src.ssa->bit_size == 64) {
3070 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
3071 } else {
3072 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3073 }
3074 break;
3075 }
3076 case nir_op_f2f64: {
3077 assert(instr->src[0].src.ssa->bit_size == 32);
3078 Temp src = get_alu_src(ctx, instr->src[0]);
3079 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
3080 break;
3081 }
3082 case nir_op_i2f16: {
3083 Temp src = get_alu_src(ctx, instr->src[0]);
3084 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3085 if (dst.regClass() == v2b) {
3086 if (input_size <= 16) {
3087             /* Expand integer to the size expected by the int→float converter used below */
3088 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3089 if (input_size != target_size) {
3090 src = convert_int(ctx, bld, src, input_size, target_size, true);
3091 }
3092 }
3093
3094 if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
3095 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
3096 } else {
3097             /* Large 32-bit inputs need to return +-inf/FLOAT_MAX.
3098              *
3099              * This is also the fallback path taken on GFX7 and earlier, which
3100              * do not support direct f16⟷i16 conversions.
3101 */
3102 src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
3103 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3104 }
3105 } else if (dst.regClass() == s1) {
3106 if (input_size <= 16) {
3107 src = convert_int(ctx, bld, src, input_size, 32, true);
3108 }
3109 src = bld.sop1(aco_opcode::s_cvt_f32_i32, bld.def(s1), src);
3110 bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3111 } else {
3112 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3113 }
3114 break;
3115 }
3116 case nir_op_i2f32: {
3117 assert(dst.size() == 1);
3118 Temp src = get_alu_src(ctx, instr->src[0]);
3119 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3120 if (input_size <= 32) {
3121 if (input_size <= 16) {
3122             /* Sign-extend to 32 bits */
3123 src = convert_int(ctx, bld, src, input_size, 32, true);
3124 }
3125 if (dst.regClass() == v1)
3126 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
3127 else
3128 bld.sop1(aco_opcode::s_cvt_f32_i32, Definition(dst), src);
3129 } else {
3130 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3131 }
3132 break;
3133 }
3134 case nir_op_i2f64: {
3135 if (instr->src[0].src.ssa->bit_size <= 32) {
3136 Temp src = get_alu_src(ctx, instr->src[0]);
3137 if (instr->src[0].src.ssa->bit_size <= 16)
3138 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
3139 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
3140 } else {
3141 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3142 }
3143 break;
3144 }
3145 case nir_op_u2f16: {
3146 Temp src = get_alu_src(ctx, instr->src[0]);
3147 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3148 if (dst.regClass() == v2b) {
3149 if (input_size <= 16) {
3150 /* Expand integer to the size expected by the uint→float converter used below */
3151 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3152 if (input_size != target_size) {
3153 src = convert_int(ctx, bld, src, input_size, target_size, false);
3154 }
3155 }
3156
3157 if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
3158 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
3159 } else {
3160             /* Large 32-bit inputs need to return inf/FLOAT_MAX.
3161              *
3162              * This is also the fallback path taken on GFX7 and earlier, which
3163              * do not support direct f16⟷u16 conversions.
3164 */
3165 src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
3166 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3167 }
3168 } else if (dst.regClass() == s1) {
3169 if (input_size <= 16) {
3170 src = convert_int(ctx, bld, src, input_size, 32, false);
3171 }
3172 src = bld.sop1(aco_opcode::s_cvt_f32_u32, bld.def(s1), src);
3173 bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3174 } else {
3175 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3176 }
3177 break;
3178 }
3179 case nir_op_u2f32: {
3180 assert(dst.size() == 1);
3181 Temp src = get_alu_src(ctx, instr->src[0]);
3182 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3183 if (input_size == 8 && dst.regClass() == v1) {
3184 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3185 } else if (input_size <= 32) {
3186 if (input_size <= 16)
3187 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3188 if (dst.regClass() == v1)
3189 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3190 else
3191 bld.sop1(aco_opcode::s_cvt_f32_u32, Definition(dst), src);
3192 } else {
3193 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3194 }
3195 break;
3196 }
3197 case nir_op_u2f64: {
3198 if (instr->src[0].src.ssa->bit_size <= 32) {
3199 Temp src = get_alu_src(ctx, instr->src[0]);
3200 if (instr->src[0].src.ssa->bit_size <= 16)
3201 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3202 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3203 } else {
3204 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3205 }
3206 break;
3207 }
3208 case nir_op_f2i8:
3209 case nir_op_f2i16: {
3210 if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3211 ctx->program->gfx_level >= GFX11_5) {
3212 Temp src = get_alu_src(ctx, instr->src[0]);
3213 Temp tmp = bld.as_uniform(src);
3214 if (instr->src[0].src.ssa->bit_size == 16)
3215 tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3216 bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp);
3217 } else if (instr->src[0].src.ssa->bit_size == 16) {
3218 if (ctx->program->gfx_level >= GFX8) {
3219 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3220 } else {
3221 /* GFX7 and earlier do not support direct f16⟷i16 conversions */
3222 Temp tmp = bld.tmp(v1);
3223 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3224 tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3225 tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3226 (dst.type() == RegType::sgpr) ? Temp() : dst);
3227 if (dst.type() == RegType::sgpr) {
3228 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3229 }
3230 }
3231 } else if (instr->src[0].src.ssa->bit_size == 32) {
3232 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3233 } else {
3234 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3235 }
3236 break;
3237 }
3238 case nir_op_f2u8:
3239 case nir_op_f2u16: {
3240 if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3241 ctx->program->gfx_level >= GFX11_5) {
3242 Temp src = get_alu_src(ctx, instr->src[0]);
3243 Temp tmp = bld.as_uniform(src);
3244 if (instr->src[0].src.ssa->bit_size == 16)
3245 tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3246 bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp);
3247 } else if (instr->src[0].src.ssa->bit_size == 16) {
3248 if (ctx->program->gfx_level >= GFX8) {
3249 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3250 } else {
3251 /* GFX7 and earlier do not support direct f16⟷u16 conversions */
3252 Temp tmp = bld.tmp(v1);
3253 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3254 tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3255 tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3256 (dst.type() == RegType::sgpr) ? Temp() : dst);
3257 if (dst.type() == RegType::sgpr) {
3258 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3259 }
3260 }
3261 } else if (instr->src[0].src.ssa->bit_size == 32) {
3262 if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX11)
3263 bld.vop3(aco_opcode::p_v_cvt_pk_u8_f32, Definition(dst),
3264 get_alu_src(ctx, instr->src[0]));
3265 else
3266 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3267 } else {
3268 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3269 }
3270 break;
3271 }
3272 case nir_op_f2i32: {
3273 Temp src = get_alu_src(ctx, instr->src[0]);
3274 if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3275 ctx->program->gfx_level >= GFX11_5) {
3276 Temp tmp = bld.as_uniform(src);
3277 if (instr->src[0].src.ssa->bit_size == 16)
3278 tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3279 bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp);
3280 } else if (instr->src[0].src.ssa->bit_size == 16) {
3281 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3282 if (dst.type() == RegType::vgpr) {
3283 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3284 } else {
3285 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3286 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3287 }
3288 } else if (instr->src[0].src.ssa->bit_size == 32) {
3289 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3290 } else if (instr->src[0].src.ssa->bit_size == 64) {
3291 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3292 } else {
3293 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3294 }
3295 break;
3296 }
3297 case nir_op_f2u32: {
3298 Temp src = get_alu_src(ctx, instr->src[0]);
3299 if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3300 ctx->program->gfx_level >= GFX11_5) {
3301 Temp tmp = bld.as_uniform(src);
3302 if (instr->src[0].src.ssa->bit_size == 16)
3303 tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3304 bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp);
3305 } else if (instr->src[0].src.ssa->bit_size == 16) {
3306 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3307 if (dst.type() == RegType::vgpr) {
3308 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3309 } else {
3310 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3311 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3312 }
3313 } else if (instr->src[0].src.ssa->bit_size == 32) {
3314 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3315 } else if (instr->src[0].src.ssa->bit_size == 64) {
3316 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3317 } else {
3318 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3319 }
3320 break;
3321 }
3322 case nir_op_b2f16: {
3323 Temp src = get_alu_src(ctx, instr->src[0]);
3324 assert(src.regClass() == bld.lm);
3325
3326 if (dst.regClass() == s1) {
3327 src = bool_to_scalar_condition(ctx, src);
3328 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3329 } else if (dst.regClass() == v2b) {
3330 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3331 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3332 } else {
3333 unreachable("Wrong destination register class for nir_op_b2f16.");
3334 }
3335 break;
3336 }
3337 case nir_op_b2f32: {
3338 Temp src = get_alu_src(ctx, instr->src[0]);
3339 assert(src.regClass() == bld.lm);
3340
3341 if (dst.regClass() == s1) {
3342 src = bool_to_scalar_condition(ctx, src);
3343 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3344 } else if (dst.regClass() == v1) {
3345 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3346 Operand::c32(0x3f800000u), src);
3347 } else {
3348 unreachable("Wrong destination register class for nir_op_b2f32.");
3349 }
3350 break;
3351 }
3352 case nir_op_b2f64: {
3353 Temp src = get_alu_src(ctx, instr->src[0]);
3354 assert(src.regClass() == bld.lm);
3355
3356 if (dst.regClass() == s2) {
3357 src = bool_to_scalar_condition(ctx, src);
3358 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3359 Operand::zero(), bld.scc(src));
3360 } else if (dst.regClass() == v2) {
3361 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3362 Temp upper =
3363 bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3364 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3365 } else {
3366 unreachable("Wrong destination register class for nir_op_b2f64.");
3367 }
3368 break;
3369 }
3370 case nir_op_i2i8:
3371 case nir_op_i2i16:
3372 case nir_op_i2i32: {
3373 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3374 /* no need to do the extract in get_alu_src() */
3375 sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3376 ? sgpr_extract_sext
3377 : sgpr_extract_undef;
3378 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3379 } else {
3380 const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3381 const unsigned output_bitsize = instr->def.bit_size;
3382 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3383 output_bitsize > input_bitsize, dst);
3384 }
3385 break;
3386 }
3387 case nir_op_u2u8:
3388 case nir_op_u2u16:
3389 case nir_op_u2u32: {
3390 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3391 /* no need to do the extract in get_alu_src() */
3392 sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3393 ? sgpr_extract_zext
3394 : sgpr_extract_undef;
3395 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3396 } else {
3397 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3398 instr->def.bit_size, false, dst);
3399 }
3400 break;
3401 }
3402 case nir_op_b2b32:
3403 case nir_op_b2i8:
3404 case nir_op_b2i16:
3405 case nir_op_b2i32: {
3406 Temp src = get_alu_src(ctx, instr->src[0]);
3407 assert(src.regClass() == bld.lm);
3408
3409 if (dst.regClass() == s1) {
3410 bool_to_scalar_condition(ctx, src, dst);
3411 } else if (dst.type() == RegType::vgpr) {
3412 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3413 src);
3414 } else {
3415 unreachable("Invalid register class for b2i32");
3416 }
3417 break;
3418 }
3419 case nir_op_b2b1: {
3420 Temp src = get_alu_src(ctx, instr->src[0]);
3421 assert(dst.regClass() == bld.lm);
3422
3423 if (src.type() == RegType::vgpr) {
3424 assert(src.regClass() == v1 || src.regClass() == v2);
3425 assert(dst.regClass() == bld.lm);
3426 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3427 Definition(dst), Operand::zero(), src);
3428 } else {
3429 assert(src.regClass() == s1 || src.regClass() == s2);
3430 Temp tmp;
3431 if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
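            /* GFX7 and earlier lack s_cmp_lg_u64, so OR the value with zero and use the SCC
             * definition of s_or_b64 as the comparison result instead. */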
3432 tmp =
3433 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3434 .def(1)
3435 .getTemp();
3436 } else {
3437 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3438 bld.scc(bld.def(s1)), Operand::zero(), src);
3439 }
3440 bool_to_vector_condition(ctx, tmp, dst);
3441 }
3442 break;
3443 }
3444 case nir_op_unpack_64_2x32:
3445 case nir_op_unpack_32_2x16:
3446 case nir_op_unpack_64_4x16:
3447 case nir_op_unpack_32_4x8:
3448 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3449 emit_split_vector(
3450 ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3451 break;
3452 case nir_op_pack_64_2x32_split: {
3453 Temp src0 = get_alu_src(ctx, instr->src[0]);
3454 Temp src1 = get_alu_src(ctx, instr->src[1]);
3455
3456 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3457 break;
3458 }
3459 case nir_op_unpack_64_2x32_split_x:
3460 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3461 get_alu_src(ctx, instr->src[0]));
3462 break;
3463 case nir_op_unpack_64_2x32_split_y:
3464 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3465 get_alu_src(ctx, instr->src[0]));
3466 break;
3467 case nir_op_unpack_32_2x16_split_x:
3468 if (dst.type() == RegType::vgpr) {
3469 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3470 get_alu_src(ctx, instr->src[0]));
3471 } else {
3472 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3473 }
3474 break;
3475 case nir_op_unpack_32_2x16_split_y:
3476 if (dst.type() == RegType::vgpr) {
3477 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3478 get_alu_src(ctx, instr->src[0]));
3479 } else {
3480 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3481 get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3482 Operand::zero());
3483 }
3484 break;
3485 case nir_op_pack_32_2x16_split: {
3486 Temp src0 = get_alu_src(ctx, instr->src[0]);
3487 Temp src1 = get_alu_src(ctx, instr->src[1]);
3488 if (dst.regClass() == v1) {
3489 src0 = emit_extract_vector(ctx, src0, 0, v2b);
3490 src1 = emit_extract_vector(ctx, src1, 0, v2b);
3491 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3492 } else if (ctx->program->gfx_level >= GFX9) {
3493 bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(dst), src0, src1);
3494 } else {
3495 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3496 Operand::c32(0xFFFFu));
3497 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3498 Operand::c32(16u));
3499 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3500 }
3501 break;
3502 }
3503 case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3504 case nir_op_pack_half_2x16_rtz_split:
3505 case nir_op_pack_half_2x16_split: {
3506 if (dst.regClass() == v1) {
3507 if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3508 emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3509 else
3510 emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3511 } else if (dst.regClass() == s1) {
3512 emit_sop2_instruction(ctx, instr, aco_opcode::s_cvt_pk_rtz_f16_f32, dst, false);
3513 } else {
3514 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3515 }
3516 break;
3517 }
3518 case nir_op_pack_unorm_2x16:
3519 case nir_op_pack_snorm_2x16: {
3520 unsigned bit_size = instr->src[0].src.ssa->bit_size;
3521       /* Only 16-bit and 32-bit sources are supported. */
3522 assert(bit_size == 32 || bit_size == 16);
3523
3524 RegClass src_rc = bit_size == 32 ? v1 : v2b;
3525 Temp src = get_alu_src(ctx, instr->src[0], 2);
3526 Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3527 Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
3528
3529       /* Workaround for pre-GFX9 GPUs, which don't have an fp16 pknorm instruction. */
3530 if (bit_size == 16 && ctx->program->gfx_level < GFX9) {
3531 src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3532 src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
3533 bit_size = 32;
3534 }
3535
3536 aco_opcode opcode;
3537 if (bit_size == 32) {
3538 opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3539 : aco_opcode::v_cvt_pknorm_i16_f32;
3540 } else {
3541 opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3542 : aco_opcode::v_cvt_pknorm_i16_f16;
3543 }
3544 bld.vop3(opcode, Definition(dst), src0, src1);
3545 break;
3546 }
3547 case nir_op_pack_uint_2x16:
3548 case nir_op_pack_sint_2x16: {
3549 Temp src = get_alu_src(ctx, instr->src[0], 2);
3550 Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3551 Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3552 aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3553 : aco_opcode::v_cvt_pk_i16_i32;
3554 bld.vop3(opcode, Definition(dst), src0, src1);
3555 break;
3556 }
3557 case nir_op_unpack_half_2x16_split_x: {
3558 Temp src = get_alu_src(ctx, instr->src[0]);
3559 if (dst.regClass() == s1) {
3560 bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src);
3561 break;
3562 }
3563 if (src.regClass() == v1)
3564 src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3565 if (dst.regClass() == v1) {
3566 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3567 } else {
3568 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3569 }
3570 break;
3571 }
3572 case nir_op_unpack_half_2x16_split_y: {
3573 Temp src = get_alu_src(ctx, instr->src[0]);
3574 if (dst.regClass() == s1) {
3575 bld.sop1(aco_opcode::s_cvt_hi_f32_f16, Definition(dst), src);
3576 break;
3577 }
3578 if (src.regClass() == s1)
3579 src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3580 Operand::c32(1u), Operand::c32(16u), Operand::zero());
3581 else
3582 src =
3583 bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3584 if (dst.regClass() == v1) {
3585 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3586 } else {
3587 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3588 }
3589 break;
3590 }
3591 case nir_op_msad_4x8: {
3592 assert(dst.regClass() == v1);
3593 emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true);
3594 break;
3595 }
3596 case nir_op_mqsad_4x8: {
3597 assert(dst.regClass() == v4);
3598 Temp ref = get_alu_src(ctx, instr->src[0]);
3599 Temp src = get_alu_src(ctx, instr->src[1], 2);
3600 Temp accum = get_alu_src(ctx, instr->src[2], 4);
3601 bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src), as_vgpr(ctx, ref),
3602 as_vgpr(ctx, accum));
3603 emit_split_vector(ctx, dst, 4);
3604 break;
3605 }
3606 case nir_op_shfr: {
3607 if (dst.regClass() == s1) {
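         /* Scalar funnel shift right: build the 64-bit value {src0:src1} (src0 in the high
          * dword), shift it right by the amount masked to 5 bits and keep the low dword. */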
3608 Temp src = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
3609 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
3610
3611 Temp amount;
3612 if (nir_src_is_const(instr->src[2].src)) {
3613 amount = bld.copy(bld.def(s1), Operand::c32(nir_src_as_uint(instr->src[2].src) & 0x1f));
3614 } else {
3615 amount = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3616 get_alu_src(ctx, instr->src[2]), Operand::c32(0x1f));
3617 }
3618
3619 Temp res = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), src, amount);
3620 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), res, Operand::zero());
3621 } else if (dst.regClass() == v1) {
3622 emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbit_b32, dst, false, 3u);
3623 } else {
3624 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3625 }
3626 break;
3627 }
3628 case nir_op_fquantize2f16: {
3629 Temp src = get_alu_src(ctx, instr->src[0]);
3630 if (dst.regClass() == v1) {
3631 Temp f16;
3632 if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3633 f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src);
3634 else
3635 f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
3636
3637 if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
3638 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16);
3639 break;
3640 }
3641
3642 Temp denorm_zero;
3643 Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3644 if (ctx->program->gfx_level >= GFX8) {
3645             /* the value is a denormal or zero of either sign */
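            /* Class mask 0x30 selects -denormal and -0.0; applying abs and then neg to the
             * input maps +denormal/+0.0 onto those classes as well. */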
3646 Instruction* tmp0 =
3647 bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, Operand::c32(0x30));
3648 tmp0->valu().abs[0] = true;
3649 tmp0->valu().neg[0] = true;
3650 denorm_zero = tmp0->definitions[0].getTemp();
3651 } else {
3652             /* 0x38800000 is the smallest normalized half-float value (2^-14) as a 32-bit
3653              * float, so compare the result against it and flush to 0 if it's smaller.
3654              */
3655 Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3656 Instruction* tmp0 =
3657 bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3658 tmp0->valu().abs[0] = true;
3659 denorm_zero = tmp0->definitions[0].getTemp();
3660 }
3661 if (nir_alu_instr_is_signed_zero_preserve(instr)) {
3662 Temp copysign_0 =
3663 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3664 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), f32, copysign_0, denorm_zero);
3665 } else {
3666 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), f32, Operand::zero(),
3667 denorm_zero);
3668 }
3669 } else if (dst.regClass() == s1) {
3670 Temp f16;
3671 if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3672 f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src);
3673 else
3674 f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src);
3675
3676 if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
3677 bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16);
3678 } else {
3679 Temp f32 = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), f16);
3680 Temp abs_mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff));
3681 Temp abs =
3682 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask);
3683 Operand sign;
3684 if (nir_alu_instr_is_signed_zero_preserve(instr)) {
3685 sign =
3686 bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask);
3687 } else {
3688 sign = Operand::c32(0);
3689 }
3690 Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
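            /* Non-negative float bit patterns order like unsigned integers, so an unsigned
             * compare of |f32| against the smallest normal f16 (as f32) detects values whose
             * f16 representation is a denormal or zero; the cselect then flushes them to
             * `sign`. */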
3691 Temp denorm_zero = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, smallest);
3692 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), sign, f32, bld.scc(denorm_zero));
3693 }
3694 } else {
3695 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3696 }
3697 break;
3698 }
3699 case nir_op_bfm: {
3700 Temp bits = get_alu_src(ctx, instr->src[0]);
3701 Temp offset = get_alu_src(ctx, instr->src[1]);
3702
3703 if (dst.regClass() == s1) {
3704 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3705 } else if (dst.regClass() == v1) {
3706 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3707 } else {
3708 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3709 }
3710 break;
3711 }
3712 case nir_op_bitfield_select: {
3713
3714 /* dst = (insert & bitmask) | (base & ~bitmask) */
3715 if (dst.regClass() == s1) {
3716 Temp bitmask = get_alu_src(ctx, instr->src[0]);
3717 Temp insert = get_alu_src(ctx, instr->src[1]);
3718 Temp base = get_alu_src(ctx, instr->src[2]);
3719 aco_ptr<Instruction> sop2;
3720 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3721 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3722 Operand lhs;
3723 if (const_insert && const_bitmask) {
3724 lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3725 } else {
3726 insert =
3727 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3728 lhs = Operand(insert);
3729 }
3730
3731 Operand rhs;
3732 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3733 if (const_base && const_bitmask) {
3734 rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3735 } else {
3736 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3737 rhs = Operand(base);
3738 }
3739
3740 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3741
3742 } else if (dst.regClass() == v1) {
3743 emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3744 } else {
3745 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3746 }
3747 break;
3748 }
3749 case nir_op_ubfe:
3750 case nir_op_ibfe: {
3751 if (dst.bytes() != 4)
3752 unreachable("Unsupported BFE bit size");
3753
3754 if (dst.type() == RegType::sgpr) {
3755 Temp base = get_alu_src(ctx, instr->src[0]);
3756
3757 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3758 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3759 aco_opcode opcode =
3760 instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3761 if (const_offset && const_bits) {
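            /* s_bfe packs the bit offset into the low bits of its second source and the
             * field width starting at bit 16. */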
3762 uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3763 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3764 break;
3765 }
3766
3767 Temp offset = get_alu_src(ctx, instr->src[1]);
3768 Temp bits = get_alu_src(ctx, instr->src[2]);
3769
3770 if (ctx->program->gfx_level >= GFX9) {
3771 Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3772 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3773 bld.def(s1, scc), bits, Operand::c32(0x1fu));
3774 Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3775 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3776 } else if (instr->op == nir_op_ubfe) {
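            /* No s_pack_* before GFX9: for the unsigned case, build the field mask with
             * s_bfm_b32 (`bits` ones starting at `offset`), mask the base and shift the
             * field down. */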
3777 Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3778 Temp masked =
3779 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3780 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3781 } else {
3782 Operand bits_op = const_bits
3783 ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3784 : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3785 bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3786 bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3787 Operand::c32(16u));
3788 Operand offset_op = const_offset
3789 ? Operand::c32(const_offset->u32 & 0x1fu)
3790 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3791 offset, Operand::c32(0x1fu));
3792
3793 Temp extract =
3794 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3795 bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3796 }
3797
3798 } else {
3799 aco_opcode opcode =
3800 instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3801 emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3802 }
3803 break;
3804 }
3805 case nir_op_extract_u8:
3806 case nir_op_extract_i8:
3807 case nir_op_extract_u16:
3808 case nir_op_extract_i16: {
3809 bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3810 unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3811 uint32_t bits = comp == 4 ? 8 : 16;
3812 unsigned index = nir_src_as_uint(instr->src[1].src);
3813 if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3814 assert(index == 0);
3815 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3816 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3817 Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3818 unsigned swizzle = instr->src[0].swizzle[0];
3819 if (vec.size() > 1) {
3820 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3821 swizzle = swizzle & 1;
3822 }
3823 index += swizzle * instr->def.bit_size / bits;
3824 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3825 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3826 } else {
3827 Temp src = get_alu_src(ctx, instr->src[0]);
3828 Definition def(dst);
3829 if (dst.bytes() == 8) {
3830 src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3831 index %= comp;
3832 def = bld.def(src.type(), 1);
3833 }
3834 assert(def.bytes() <= 4);
3835 if (def.regClass() == s1) {
3836 bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3837 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3838 } else {
3839 src = emit_extract_vector(ctx, src, 0, def.regClass());
3840 bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3841 Operand::c32(bits), Operand::c32(is_signed));
3842 }
3843 if (dst.size() == 2)
3844 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3845 Operand::zero());
3846 }
3847 break;
3848 }
3849 case nir_op_insert_u8:
3850 case nir_op_insert_u16: {
3851 unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3852 uint32_t bits = comp == 4 ? 8 : 16;
3853 unsigned index = nir_src_as_uint(instr->src[1].src);
3854 if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3855 assert(index == 0);
3856 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3857 } else {
3858 Temp src = get_alu_src(ctx, instr->src[0]);
3859 Definition def(dst);
3860 bool swap = false;
3861 if (dst.bytes() == 8) {
3862 src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3863 swap = index >= comp;
3864 index %= comp;
3865 def = bld.def(src.type(), 1);
3866 }
3867 if (def.regClass() == s1) {
3868 bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3869 Operand::c32(index), Operand::c32(bits));
3870 } else {
3871 src = emit_extract_vector(ctx, src, 0, def.regClass());
3872 bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3873 Operand::c32(bits));
3874 }
3875 if (dst.size() == 2 && swap)
3876 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3877 def.getTemp());
3878 else if (dst.size() == 2)
3879 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3880 Operand::zero());
3881 }
3882 break;
3883 }
3884 case nir_op_bit_count: {
3885 Temp src = get_alu_src(ctx, instr->src[0]);
3886 if (src.regClass() == s1) {
3887 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3888 } else if (src.regClass() == v1) {
3889 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3890 } else if (src.regClass() == v2) {
3891 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3892 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3893 emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3894 } else if (src.regClass() == s2) {
3895 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3896 } else {
3897 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3898 }
3899 break;
3900 }
3901 case nir_op_flt: {
3902 emit_comparison(
3903 ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3904 aco_opcode::v_cmp_lt_f64,
3905 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f16 : aco_opcode::num_opcodes,
3906 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f32 : aco_opcode::num_opcodes);
3907 break;
3908 }
3909 case nir_op_fge: {
3910 emit_comparison(
3911 ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3912 aco_opcode::v_cmp_ge_f64,
3913 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f16 : aco_opcode::num_opcodes,
3914 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f32 : aco_opcode::num_opcodes);
3915 break;
3916 }
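      /* The unordered comparisons below (fltu/fgeu/fequ) use the negated ordered opcodes
       * (nge/nlt/nlg), which also return true when either operand is NaN. */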
3917 case nir_op_fltu: {
3918 emit_comparison(
3919 ctx, instr, dst, aco_opcode::v_cmp_nge_f16, aco_opcode::v_cmp_nge_f32,
3920 aco_opcode::v_cmp_nge_f64,
3921 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f16 : aco_opcode::num_opcodes,
3922 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f32 : aco_opcode::num_opcodes);
3923 break;
3924 }
3925 case nir_op_fgeu: {
3926 emit_comparison(
3927 ctx, instr, dst, aco_opcode::v_cmp_nlt_f16, aco_opcode::v_cmp_nlt_f32,
3928 aco_opcode::v_cmp_nlt_f64,
3929 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f16 : aco_opcode::num_opcodes,
3930 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f32 : aco_opcode::num_opcodes);
3931 break;
3932 }
3933 case nir_op_feq: {
3934 emit_comparison(
3935 ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3936 aco_opcode::v_cmp_eq_f64,
3937 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f16 : aco_opcode::num_opcodes,
3938 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f32 : aco_opcode::num_opcodes);
3939 break;
3940 }
3941 case nir_op_fneu: {
3942 emit_comparison(
3943 ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3944 aco_opcode::v_cmp_neq_f64,
3945 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f16 : aco_opcode::num_opcodes,
3946 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f32 : aco_opcode::num_opcodes);
3947 break;
3948 }
3949 case nir_op_fequ: {
3950 emit_comparison(
3951 ctx, instr, dst, aco_opcode::v_cmp_nlg_f16, aco_opcode::v_cmp_nlg_f32,
3952 aco_opcode::v_cmp_nlg_f64,
3953 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f16 : aco_opcode::num_opcodes,
3954 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f32 : aco_opcode::num_opcodes);
3955 break;
3956 }
3957 case nir_op_fneo: {
3958 emit_comparison(
3959 ctx, instr, dst, aco_opcode::v_cmp_lg_f16, aco_opcode::v_cmp_lg_f32,
3960 aco_opcode::v_cmp_lg_f64,
3961 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f16 : aco_opcode::num_opcodes,
3962 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f32 : aco_opcode::num_opcodes);
3963 break;
3964 }
3965 case nir_op_funord: {
3966 emit_comparison(
3967 ctx, instr, dst, aco_opcode::v_cmp_u_f16, aco_opcode::v_cmp_u_f32, aco_opcode::v_cmp_u_f64,
3968 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f16 : aco_opcode::num_opcodes,
3969 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f32 : aco_opcode::num_opcodes);
3970 break;
3971 }
3972 case nir_op_ford: {
3973 emit_comparison(
3974 ctx, instr, dst, aco_opcode::v_cmp_o_f16, aco_opcode::v_cmp_o_f32, aco_opcode::v_cmp_o_f64,
3975 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f16 : aco_opcode::num_opcodes,
3976 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f32 : aco_opcode::num_opcodes);
3977 break;
3978 }
3979 case nir_op_ilt: {
3980 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3981 aco_opcode::v_cmp_lt_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_i32);
3982 break;
3983 }
3984 case nir_op_ige: {
3985 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3986 aco_opcode::v_cmp_ge_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_i32);
3987 break;
3988 }
3989 case nir_op_ieq: {
3990 if (instr->src[0].src.ssa->bit_size == 1)
3991 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3992 else
3993 emit_comparison(
3994 ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3995 aco_opcode::v_cmp_eq_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_eq_i32,
3996 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3997 break;
3998 }
3999 case nir_op_ine: {
4000 if (instr->src[0].src.ssa->bit_size == 1)
4001 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
4002 else
4003 emit_comparison(
4004 ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
4005 aco_opcode::v_cmp_lg_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lg_i32,
4006 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
4007 break;
4008 }
4009 case nir_op_ult: {
4010 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
4011 aco_opcode::v_cmp_lt_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_u32);
4012 break;
4013 }
4014 case nir_op_uge: {
4015 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
4016 aco_opcode::v_cmp_ge_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_u32);
4017 break;
4018 }
4019 case nir_op_bitz:
4020 case nir_op_bitnz: {
4021 assert(instr->src[0].src.ssa->bit_size != 1);
4022 bool test0 = instr->op == nir_op_bitz;
4023 Temp src0 = get_alu_src(ctx, instr->src[0]);
4024 Temp src1 = get_alu_src(ctx, instr->src[1]);
4025 bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
4026 if (!use_valu) {
4027 aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
4028 : aco_opcode::s_bitcmp1_b32;
4029 if (test0)
4030 op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
4031 : aco_opcode::s_bitcmp0_b32;
4032 emit_sopc_instruction(ctx, instr, op, dst);
4033 break;
4034 }
4035
4036 /* We do not have a VALU version of s_bitcmp.
4037 * But if the second source is constant, we can use
4038 * v_cmp_class_f32's LUT to check the bit.
4039 * The LUT only has 10 entries, so extract a higher byte if we have to.
4040        * For sign bits, a comparison with 0 is better because v_cmp_class
4041        * can't be inverted.
4042 */
4043 if (nir_src_is_const(instr->src[1].src)) {
4044 uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
4045 bit &= instr->src[0].src.ssa->bit_size - 1;
4046 src0 = as_vgpr(ctx, src0);
4047
4048 if (src0.regClass() == v2) {
4049 src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
4050 bit &= 31;
4051 }
4052
4053 if (bit == 31) {
4054 bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
4055 Operand::c32(0), src0);
4056 break;
4057 }
4058
4059 if (bit == 15 && ctx->program->gfx_level >= GFX8) {
4060 bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
4061 Operand::c32(0), src0);
4062 break;
4063 }
4064
4065 /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
4066 const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
4067 const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
4068 const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
4069 if (use_opsel) {
4070 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
4071 Operand::c32(16), Operand::c32(0));
4072 bit &= 0xf;
4073 }
4074
4075 /* If we can use sdwa the extract is free, while test0's s_not is not. */
4076 if (bit == 7 && test0 && can_sdwa) {
4077 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
4078 Operand::c32(8), Operand::c32(1));
4079 bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
4080 Operand::c32(0), src0);
4081 break;
4082 }
4083
4084 if (bit > max_bit) {
4085 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
4086 Operand::c32(8), Operand::c32(0));
4087 bit &= 0x7;
4088 }
4089
4090 /* denorm and snan/qnan inputs are preserved using all float control modes. */
4091 static const struct {
4092 uint32_t fp32;
4093 uint32_t fp16;
4094 bool negate;
4095 } float_lut[10] = {
4096 {0x7f800001, 0x7c01, false}, /* snan */
4097 {~0u, ~0u, false}, /* qnan */
4098 {0xff800000, 0xfc00, false}, /* -inf */
4099 {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
4100 {1, 1, true}, /* -denormal */
4101 {0, 0, true}, /* -0.0 */
4102 {0, 0, false}, /* +0.0 */
4103 {1, 1, false}, /* +denormal */
4104 {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
4105 {0x7f800000, 0x7c00, false}, /* +inf */
4106 };
4107
4108 Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
4109 /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */
4110 const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
4111 (ctx->program->gfx_level >= GFX11 && use_opsel);
4112 const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
4113 const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
4114
4115 VALU_instruction& res =
4116 bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
4117 if (float_lut[bit].negate) {
4118 res.format = asVOP3(res.format);
4119 res.neg[0] = true;
4120 }
4121
4122 if (test0)
4123 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
4124
4125 break;
4126 }
4127
4128 Temp res;
4129 aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
4130 if (instr->src[0].src.ssa->bit_size == 16) {
4131 op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
4132 if (ctx->program->gfx_level < GFX10)
4133 res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
4134 else
4135 res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
4136
4137 res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
4138 } else if (instr->src[0].src.ssa->bit_size == 32) {
4139 res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
4140 } else if (instr->src[0].src.ssa->bit_size == 64) {
4141 if (ctx->program->gfx_level < GFX8)
4142 res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
4143 else
4144 res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
4145
4146 res = emit_extract_vector(ctx, res, 0, v1);
4147 res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
4148 } else {
4149 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
4150 }
4151 bld.vopc(op, Definition(dst), Operand::c32(0), res);
4152 break;
4153 }
4154 default: isel_err(&instr->instr, "Unknown NIR ALU instr");
4155 }
4156 }
4157
4158 void
4159 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
4160 {
4161 Temp dst = get_ssa_temp(ctx, &instr->def);
4162
4163 // TODO: we really want to have the resulting type as this would allow for 64bit literals
4164 // which would get the lsb truncated if double and the msb if int
4165 // for now, we only use s_mov_b64 with 64bit inline constants
4166 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
4167 assert(dst.type() == RegType::sgpr);
4168
4169 Builder bld(ctx->program, ctx->block);
4170
4171 if (instr->def.bit_size == 1) {
4172 assert(dst.regClass() == bld.lm);
4173 int val = instr->value[0].b ? -1 : 0;
4174 Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
4175 bld.copy(Definition(dst), op);
4176 } else if (instr->def.bit_size == 8) {
4177 bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
4178 } else if (instr->def.bit_size == 16) {
4179 /* sign-extend to use s_movk_i32 instead of a literal */
4180 bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
4181 } else if (dst.size() == 1) {
4182 bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
4183 } else {
4184 assert(dst.size() != 1);
4185 aco_ptr<Instruction> vec{
4186 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4187 if (instr->def.bit_size == 64)
4188 for (unsigned i = 0; i < dst.size(); i++)
4189 vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
4190 else {
4191 for (unsigned i = 0; i < dst.size(); i++)
4192 vec->operands[i] = Operand::c32(instr->value[i].u32);
4193 }
4194 vec->definitions[0] = Definition(dst);
4195 ctx->block->instructions.emplace_back(std::move(vec));
4196 }
4197 }
4198
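/* Copy a value into an SGPR destination. VGPR sources are read with
 * v_readfirstlane_b32 one dword at a time; SGPR sources are copied directly.
 */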
4199 Temp
4200 emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
4201 {
4202 Builder bld(ctx->program, ctx->block);
4203
4204 if (src.regClass().type() == RegType::sgpr) {
4205 bld.copy(Definition(dst), src);
4206 } else if (src.size() == 1) {
4207 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
4208 } else {
4209 aco_ptr<Instruction> split{
4210 create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())};
4211 split->operands[0] = Operand(src);
4212
4213 for (unsigned i = 0; i < src.size(); i++) {
4214 split->definitions[i] =
4215 bld.def(RegClass::get(RegType::vgpr, MIN2(src.bytes() - i * 4, 4)));
4216 }
4217
4218 Instruction* split_raw = split.get();
4219 ctx->block->instructions.emplace_back(std::move(split));
4220
4221 aco_ptr<Instruction> vec{
4222 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)};
4223 vec->definitions[0] = Definition(dst);
4224 for (unsigned i = 0; i < src.size(); i++) {
4225 vec->operands[i] = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1),
4226 split_raw->definitions[i].getTemp());
4227 }
4228
4229 ctx->block->instructions.emplace_back(std::move(vec));
4230 if (src.bytes() % 4 == 0)
4231 emit_split_vector(ctx, dst, src.size());
4232 }
4233
4234 return dst;
4235 }
4236
4237 bool
4238 can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size,
4239 unsigned align_, bool support_12_byte)
4240 {
4241 /* Only use byte-align for 8/16-bit loads if we won't have to increase its size and won't have
4242 * to use unsupported load sizes.
4243 */
4244 assert(util_is_power_of_two_nonzero(align_));
4245 if (align_ < 4) {
4246 assert(component_size < 4);
4247 unsigned load_size = num_components * component_size;
4248 uint32_t new_size = align(load_size + (4 - align_), 4);
4249 return new_size == align(load_size, 4) && (new_size != 12 || support_12_byte);
4250 }
4251 return true;
4252 }
4253
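/* Describes a load for emit_load(): destination, component layout, resource or base
 * address, offsets, alignment and cache/sync info. The memory-type-specific callback in
 * EmitLoadParameters emits the actual hardware loads.
 */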
4254 struct LoadEmitInfo {
4255 Operand offset;
4256 Temp dst;
4257 unsigned num_components;
4258 unsigned component_size;
4259 Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
4260 Temp idx = Temp(0, v1); /* buffer index */
4261 unsigned component_stride = 0;
4262 unsigned const_offset = 0;
4263 unsigned align_mul = 0;
4264 unsigned align_offset = 0;
4265 pipe_format format;
4266
4267 ac_hw_cache_flags cache = {{0, 0, 0, 0, 0}};
4268 bool split_by_component_stride = true;
4269 bool readfirstlane_for_uniform = false;
4270 unsigned swizzle_component_size = 0;
4271 memory_sync_info sync;
4272 Temp soffset = Temp(0, s1);
4273 };
4274
4275 struct EmitLoadParameters {
4276 using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
4277 unsigned bytes_needed, unsigned align, unsigned const_offset,
4278 Temp dst_hint);
4279
4280 Callback callback;
4281 bool byte_align_loads;
4282 bool supports_8bit_16bit_loads;
4283 unsigned max_const_offset_plus_one;
4284 };
4285
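/* Split a load into chunks the callback can emit (respecting alignment, component
 * stride and the constant-offset limit), then recombine the results into
 * info.num_components components of info.dst.
 */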
4286 void
4287 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
4288 const EmitLoadParameters& params)
4289 {
4290 unsigned load_size = info.num_components * info.component_size;
4291 unsigned component_size = info.component_size;
4292
4293 unsigned num_vals = 0;
4294 Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
4295
4296 unsigned const_offset = info.const_offset;
4297
4298 const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
4299 unsigned align_offset = info.align_offset % align_mul;
4300
4301 unsigned bytes_read = 0;
4302 while (bytes_read < load_size) {
4303 unsigned bytes_needed = load_size - bytes_read;
4304
4305 /* add buffer for unaligned loads */
4306 int byte_align = 0;
4307 if (params.byte_align_loads) {
4308 byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
4309 }
4310
4311 if (byte_align) {
4312 if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
4313 !params.supports_8bit_16bit_loads) {
4314 if (info.component_stride) {
4315 assert(params.supports_8bit_16bit_loads && "unimplemented");
4316 bytes_needed = 2;
4317 byte_align = 0;
4318 } else {
4319 bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
4320 bytes_needed = align(bytes_needed, 4);
4321 }
4322 } else {
4323 byte_align = 0;
4324 }
4325 }
4326
4327 if (info.split_by_component_stride) {
4328 if (info.swizzle_component_size)
4329 bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
4330 if (info.component_stride)
4331 bytes_needed = MIN2(bytes_needed, info.component_size);
4332 }
4333
4334 bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
4335
4336 /* reduce constant offset */
4337 Operand offset = info.offset;
4338 unsigned reduced_const_offset = const_offset;
4339 bool remove_const_offset_completely = need_to_align_offset;
4340 if (const_offset &&
4341 (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
4342 unsigned to_add = const_offset;
4343 if (remove_const_offset_completely) {
4344 reduced_const_offset = 0;
4345 } else {
4346 to_add =
4347 const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4348 reduced_const_offset %= params.max_const_offset_plus_one;
4349 }
4350 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4351 if (offset.isConstant()) {
4352 offset = Operand::c32(offset.constantValue() + to_add);
4353 } else if (offset.isUndefined()) {
4354 offset = Operand::c32(to_add);
4355 } else if (offset_tmp.regClass() == s1) {
4356 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4357 Operand::c32(to_add));
4358 } else if (offset_tmp.regClass() == v1) {
4359 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4360 } else {
4361 Temp lo = bld.tmp(offset_tmp.type(), 1);
4362 Temp hi = bld.tmp(offset_tmp.type(), 1);
4363 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4364
4365 if (offset_tmp.regClass() == s2) {
4366 Temp carry = bld.tmp(s1);
4367 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4368 Operand::c32(to_add));
4369 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4370 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4371 } else {
4372 Temp new_lo = bld.tmp(v1);
4373 Temp carry =
4374 bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4375 hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4376 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4377 }
4378 }
4379 }
4380
4381 /* align offset down if needed */
4382 Operand aligned_offset = offset;
4383 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4384 if (need_to_align_offset) {
4385 align = 4;
4386 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4387 if (offset.isConstant()) {
4388 aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
4389 } else if (offset.isUndefined()) {
4390 aligned_offset = Operand::zero();
4391 } else if (offset_tmp.regClass() == s1) {
4392 aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
4393 Operand::c32(0xfffffffcu), offset_tmp);
4394 } else if (offset_tmp.regClass() == s2) {
4395 aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
4396 Operand::c64(0xfffffffffffffffcllu), offset_tmp);
4397 } else if (offset_tmp.regClass() == v1) {
4398 aligned_offset =
4399 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
4400 } else if (offset_tmp.regClass() == v2) {
4401 Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
4402 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4403 lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
4404 aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
4405 }
4406 }
4407 Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp()
4408 : aligned_offset.isConstant()
4409 ? bld.copy(bld.def(s1), aligned_offset)
4410 : Temp(0, s1);
4411
4412 Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
4413 reduced_const_offset, byte_align ? Temp() : info.dst);
4414
4415 /* the callback wrote directly to dst */
4416 if (val == info.dst) {
4417 assert(num_vals == 0);
4418 emit_split_vector(ctx, info.dst, info.num_components);
4419 return;
4420 }
4421
4422 /* shift result right if needed */
4423 if (params.byte_align_loads && info.component_size < 4) {
4424 Operand byte_align_off = Operand::c32(byte_align);
4425 if (byte_align == -1) {
4426 if (offset.isConstant())
4427 byte_align_off = Operand::c32(offset.constantValue() % 4u);
4428 else if (offset.isUndefined())
4429 byte_align_off = Operand::zero();
4430 else if (offset.size() == 2)
4431 byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
4432 RegClass(offset.getTemp().type(), 1)));
4433 else
4434 byte_align_off = offset;
4435 }
4436
4437 assert(val.bytes() >= load_size && "unimplemented");
4438 if (val.type() == RegType::sgpr)
4439 byte_align_scalar(ctx, val, byte_align_off, info.dst);
4440 else
4441 byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
4442 return;
4443 }
4444
4445 /* add result to list and advance */
4446 if (info.component_stride) {
4447 assert(val.bytes() % info.component_size == 0);
4448 unsigned num_loaded_components = val.bytes() / info.component_size;
4449 unsigned advance_bytes = info.component_stride * num_loaded_components;
4450 const_offset += advance_bytes;
4451 align_offset = (align_offset + advance_bytes) % align_mul;
4452 } else {
4453 const_offset += val.bytes();
4454 align_offset = (align_offset + val.bytes()) % align_mul;
4455 }
4456 bytes_read += val.bytes();
4457 vals[num_vals++] = val;
4458 }
4459
4460 /* create array of components */
4461 unsigned components_split = 0;
4462 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4463 bool has_vgprs = false;
4464 for (unsigned i = 0; i < num_vals;) {
4465 Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4466 unsigned num_tmps = 0;
4467 unsigned tmp_size = 0;
4468 RegType reg_type = RegType::sgpr;
4469 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4470 if (vals[i].type() == RegType::vgpr)
4471 reg_type = RegType::vgpr;
4472 tmp_size += vals[i].bytes();
4473 tmp[num_tmps++] = vals[i++];
4474 }
4475 if (num_tmps > 1) {
4476 aco_ptr<Instruction> vec{
4477 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4478 for (unsigned j = 0; j < num_tmps; j++)
4479 vec->operands[j] = Operand(tmp[j]);
4480 tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4481 vec->definitions[0] = Definition(tmp[0]);
4482 bld.insert(std::move(vec));
4483 }
4484
4485 if (tmp[0].bytes() % component_size) {
4486 /* trim tmp[0] */
4487 assert(i == num_vals);
4488 RegClass new_rc =
4489 RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4490 tmp[0] =
4491 bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4492 }
4493
4494 RegClass elem_rc = RegClass::get(reg_type, component_size);
4495
4496 unsigned start = components_split;
4497
4498 if (tmp_size == elem_rc.bytes()) {
4499 allocated_vec[components_split++] = tmp[0];
4500 } else {
4501 assert(tmp_size % elem_rc.bytes() == 0);
4502 aco_ptr<Instruction> split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO,
4503 1, tmp_size / elem_rc.bytes())};
4504 for (auto& def : split->definitions) {
4505 Temp component = bld.tmp(elem_rc);
4506 allocated_vec[components_split++] = component;
4507 def = Definition(component);
4508 }
4509 split->operands[0] = Operand(tmp[0]);
4510 bld.insert(std::move(split));
4511 }
4512
4513 /* try to p_as_uniform early so we can create more optimizable code and
4514 * also update allocated_vec */
4515 for (unsigned j = start; j < components_split; j++) {
4516 if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) {
4517 if (info.readfirstlane_for_uniform) {
4518 allocated_vec[j] = emit_readfirstlane(
4519 ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size())));
4520 } else {
4521 allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4522 }
4523 }
4524 has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4525 }
4526 }
4527
4528 /* concatenate components and p_as_uniform() result if needed */
4529 if (info.dst.type() == RegType::vgpr || !has_vgprs)
4530 ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4531
4532 int padding_bytes =
4533 MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4534
4535 aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
4536 info.num_components + !!padding_bytes, 1)};
4537 for (unsigned i = 0; i < info.num_components; i++)
4538 vec->operands[i] = Operand(allocated_vec[i]);
4539 if (padding_bytes)
4540 vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4541 if (info.dst.type() == RegType::sgpr && has_vgprs) {
4542 Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4543 vec->definitions[0] = Definition(tmp);
4544 bld.insert(std::move(vec));
4545 if (info.readfirstlane_for_uniform)
4546 emit_readfirstlane(ctx, tmp, info.dst);
4547 else
4548 bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4549 } else {
4550 vec->definitions[0] = Definition(info.dst);
4551 bld.insert(std::move(vec));
4552 }
4553 }
4554
4555 Operand
4556 load_lds_size_m0(Builder& bld)
4557 {
4558 /* m0 does not need to be initialized on GFX9+ */
4559 if (bld.program->gfx_level >= GFX9)
4560 return Operand(s1);
4561
4562 return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4563 }
4564
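/* Select the widest ds_read* opcode the alignment and gfx level allow, falling back
 * to ds_read2_* variants when the alignment only permits narrower single reads.
 */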
4565 Temp
4566 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4567 unsigned align, unsigned const_offset, Temp dst_hint)
4568 {
4569 offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4570
4571 Operand m = load_lds_size_m0(bld);
4572
4573 bool large_ds_read = bld.program->gfx_level >= GFX7;
4574 bool usable_read2 = bld.program->gfx_level >= GFX7;
4575
4576 bool read2 = false;
4577 unsigned size = 0;
4578 aco_opcode op;
4579 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4580 size = 16;
4581 op = aco_opcode::ds_read_b128;
4582 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4583 size = 16;
4584 read2 = true;
4585 op = aco_opcode::ds_read2_b64;
4586 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4587 size = 12;
4588 op = aco_opcode::ds_read_b96;
4589 } else if (bytes_needed >= 8 && align % 8 == 0) {
4590 size = 8;
4591 op = aco_opcode::ds_read_b64;
4592 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4593 size = 8;
4594 read2 = true;
4595 op = aco_opcode::ds_read2_b32;
4596 } else if (bytes_needed >= 4 && align % 4 == 0) {
4597 size = 4;
4598 op = aco_opcode::ds_read_b32;
4599 } else if (bytes_needed >= 2 && align % 2 == 0) {
4600 size = 2;
4601 op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4602 } else {
4603 size = 1;
4604 op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4605 }
4606
4607 unsigned const_offset_unit = read2 ? size / 2u : 1u;
4608 unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
4609
4610 if (const_offset > (const_offset_range - const_offset_unit)) {
4611 unsigned excess = const_offset - (const_offset % const_offset_range);
4612 offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4613 const_offset -= excess;
4614 }
4615
4616 const_offset /= const_offset_unit;
4617
4618 RegClass rc = RegClass::get(RegType::vgpr, size);
4619 Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4620 Instruction* instr;
4621 if (read2)
4622 instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4623 else
4624 instr = bld.ds(op, Definition(val), offset, m, const_offset);
4625 instr->ds().sync = info.sync;
4626
4627 if (m.isUndefined())
4628 instr->operands.pop_back();
4629
4630 return val;
4631 }
4632
4633 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
4634
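/* Emit an s_load_* or s_buffer_load_*, rounding bytes_needed to a supported dword
 * count (rounding down for insufficiently aligned non-buffer loads).
 */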
4635 Temp
4636 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4637 unsigned align, unsigned const_offset, Temp dst_hint)
4638 {
4639 assert(align >= 4u);
4640
4641 bld.program->has_smem_buffer_or_global_loads = true;
4642
4643 bool buffer = info.resource.id() && info.resource.bytes() == 16;
4644 Temp addr = info.resource;
4645 if (!buffer && !addr.id()) {
4646 addr = offset;
4647 offset = Temp();
4648 }
4649
4650 bytes_needed = MIN2(bytes_needed, 64);
4651 unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4652 unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4653 /* Only round up global loads if the load is aligned so that it won't cross pages */
4654 bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
4655
4656 aco_opcode op;
4657 if (bytes_needed <= 4) {
4658 op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4659 } else if (bytes_needed <= 8) {
4660 op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4661 } else if (bytes_needed <= 16) {
4662 op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4663 } else if (bytes_needed <= 32) {
4664 op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4665 } else {
4666 assert(bytes_needed == 64);
4667 op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4668 }
4669
4670 aco_ptr<Instruction> load{create_instruction(op, Format::SMEM, 2, 1)};
4671 if (buffer) {
4672 if (const_offset)
4673 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4674 Operand::c32(const_offset));
4675 load->operands[0] = Operand(info.resource);
4676 load->operands[1] = Operand(offset);
4677 } else {
4678 load->operands[0] = Operand(addr);
4679 if (offset.id() && const_offset)
4680 load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4681 Operand::c32(const_offset));
4682 else if (offset.id())
4683 load->operands[1] = Operand(offset);
4684 else
4685 load->operands[1] = Operand::c32(const_offset);
4686 }
4687 RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4688 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4689 load->definitions[0] = Definition(val);
4690 load->smem().cache = info.cache;
4691 load->smem().sync = info.sync;
4692 bld.insert(std::move(load));
4693 return val;
4694 }
4695
4696 const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4697
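/* Emit a MUBUF load, choosing the largest buffer_load_* the alignment allows and
 * setting offen/idxen based on the available VGPR offset and buffer index.
 */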
4698 Temp
4699 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4700 unsigned align_, unsigned const_offset, Temp dst_hint)
4701 {
4702 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4703 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4704
4705 if (info.soffset.id()) {
4706 if (soffset.isTemp())
4707 vaddr = bld.copy(bld.def(v1), soffset);
4708 soffset = Operand(info.soffset);
4709 }
4710
4711 if (soffset.isUndefined())
4712 soffset = Operand::zero();
4713
4714 bool offen = !vaddr.isUndefined();
4715 bool idxen = info.idx.id();
4716
4717 if (offen && idxen)
4718 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4719 else if (idxen)
4720 vaddr = Operand(info.idx);
4721
4722 unsigned bytes_size = 0;
4723 aco_opcode op;
4724 if (bytes_needed == 1 || align_ % 2) {
4725 bytes_size = 1;
4726 op = aco_opcode::buffer_load_ubyte;
4727 } else if (bytes_needed == 2 || align_ % 4) {
4728 bytes_size = 2;
4729 op = aco_opcode::buffer_load_ushort;
4730 } else if (bytes_needed <= 4) {
4731 bytes_size = 4;
4732 op = aco_opcode::buffer_load_dword;
4733 } else if (bytes_needed <= 8) {
4734 bytes_size = 8;
4735 op = aco_opcode::buffer_load_dwordx2;
4736 } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4737 bytes_size = 12;
4738 op = aco_opcode::buffer_load_dwordx3;
4739 } else {
4740 bytes_size = 16;
4741 op = aco_opcode::buffer_load_dwordx4;
4742 }
4743 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4744 mubuf->operands[0] = Operand(info.resource);
4745 mubuf->operands[1] = vaddr;
4746 mubuf->operands[2] = soffset;
4747 mubuf->mubuf().offen = offen;
4748 mubuf->mubuf().idxen = idxen;
4749 mubuf->mubuf().cache = info.cache;
4750 mubuf->mubuf().sync = info.sync;
4751 mubuf->mubuf().offset = const_offset;
4752 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4753 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4754 mubuf->definitions[0] = Definition(val);
4755 bld.insert(std::move(mubuf));
4756
4757 return val;
4758 }
4759
4760 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4761
4762 Temp
4763 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4764 unsigned bytes_needed, unsigned align_, unsigned const_offset,
4765 Temp dst_hint)
4766 {
4767 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4768 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4769
4770 if (info.soffset.id()) {
4771 if (soffset.isTemp())
4772 vaddr = bld.copy(bld.def(v1), soffset);
4773 soffset = Operand(info.soffset);
4774 }
4775
4776 if (soffset.isUndefined())
4777 soffset = Operand::zero();
4778
4779 bool offen = !vaddr.isUndefined();
4780 bool idxen = info.idx.id();
4781
4782 if (offen && idxen)
4783 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4784 else if (idxen)
4785 vaddr = Operand(info.idx);
4786
4787 aco_opcode op = aco_opcode::num_opcodes;
4788 if (info.component_size == 2) {
4789 switch (bytes_needed) {
4790 case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4791 case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4792 case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4793 case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4794 default: unreachable("invalid buffer load format size"); break;
4795 }
4796 } else {
4797 assert(info.component_size == 4);
4798 switch (bytes_needed) {
4799 case 4: op = aco_opcode::buffer_load_format_x; break;
4800 case 8: op = aco_opcode::buffer_load_format_xy; break;
4801 case 12: op = aco_opcode::buffer_load_format_xyz; break;
4802 case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4803 default: unreachable("invalid buffer load format size"); break;
4804 }
4805 }
4806
4807 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4808 mubuf->operands[0] = Operand(info.resource);
4809 mubuf->operands[1] = vaddr;
4810 mubuf->operands[2] = soffset;
4811 mubuf->mubuf().offen = offen;
4812 mubuf->mubuf().idxen = idxen;
4813 mubuf->mubuf().cache = info.cache;
4814 mubuf->mubuf().sync = info.sync;
4815 mubuf->mubuf().offset = const_offset;
4816 RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4817 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4818 mubuf->definitions[0] = Definition(val);
4819 bld.insert(std::move(mubuf));
4820
4821 return val;
4822 }
4823
4824 const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, false, true, 4096};
4825
4826 Temp
4827 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4828 unsigned align_, unsigned const_offset, Temp dst_hint)
4829 {
4830 unsigned bytes_size = 0;
4831 aco_opcode op;
4832 if (bytes_needed == 1 || align_ % 2u) {
4833 bytes_size = 1;
4834 op = aco_opcode::scratch_load_ubyte;
4835 } else if (bytes_needed == 2 || align_ % 4u) {
4836 bytes_size = 2;
4837 op = aco_opcode::scratch_load_ushort;
4838 } else if (bytes_needed <= 4) {
4839 bytes_size = 4;
4840 op = aco_opcode::scratch_load_dword;
4841 } else if (bytes_needed <= 8) {
4842 bytes_size = 8;
4843 op = aco_opcode::scratch_load_dwordx2;
4844 } else if (bytes_needed <= 12) {
4845 bytes_size = 12;
4846 op = aco_opcode::scratch_load_dwordx3;
4847 } else {
4848 bytes_size = 16;
4849 op = aco_opcode::scratch_load_dwordx4;
4850 }
4851 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4852 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4853 aco_ptr<Instruction> flat{create_instruction(op, Format::SCRATCH, 2, 1)};
4854 flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4855 flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4856 flat->scratch().sync = info.sync;
4857 flat->scratch().offset = const_offset;
4858 flat->definitions[0] = Definition(val);
4859 bld.insert(std::move(flat));
4860
4861 return val;
4862 }
4863
4864 const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096};
4865 const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048};
4866
4867 Temp
4868 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4869 {
4870 uint32_t desc[4];
4871 ac_build_raw_buffer_descriptor(bld.program->gfx_level, 0, 0xffffffff, desc);
4872
4873 if (addr.type() == RegType::vgpr)
4874 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4875 Operand::c32(desc[2]), Operand::c32(desc[3]));
4876 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(desc[2]),
4877 Operand::c32(desc[3]));
4878 }
4879
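/* Add a 32-bit value to a 64-bit value. Uses VALU adds with carry propagation if
 * either source lives in VGPRs, otherwise SALU adds.
 */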
4880 Temp
4881 add64_32(Builder& bld, Temp src0, Temp src1)
4882 {
4883 Temp src00 = bld.tmp(src0.type(), 1);
4884 Temp src01 = bld.tmp(src0.type(), 1);
4885 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4886
4887 if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4888 Temp dst0 = bld.tmp(v1);
4889 Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4890 Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4891 return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4892 } else {
4893 Temp carry = bld.tmp(s1);
4894 Temp dst0 =
4895 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4896 Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4897 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4898 }
4899 }
4900
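/* Fold as much of the constant offset as the addressing mode allows and put the
 * address/offset into the operand forms expected by MUBUF (GFX6), FLAT (GFX7-8) or
 * GLOBAL (GFX9+) accesses.
 */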
4901 void
4902 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4903 uint32_t* const_offset_inout, Temp* offset_inout)
4904 {
4905 Temp address = *address_inout;
4906 uint64_t const_offset = *const_offset_inout + offset_in;
4907 Temp offset = *offset_inout;
4908
4909 uint64_t max_const_offset_plus_one =
4910 1; /* GFX7/8/9: FLAT loads do not support constant offsets */
4911 if (bld.program->gfx_level >= GFX9)
4912 max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4913 else if (bld.program->gfx_level == GFX6)
4914 max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4915 uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4916 const_offset %= max_const_offset_plus_one;
4917
4918 if (!offset.id()) {
4919 while (unlikely(excess_offset > UINT32_MAX)) {
4920 address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4921 excess_offset -= UINT32_MAX;
4922 }
4923 if (excess_offset)
4924 offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4925 } else {
4926 /* If we add to "offset", we would transform the intended
4927 * "address + u2u64(offset) + u2u64(const_offset)" into
4928 * "address + u2u64(offset + const_offset)", so add to the address.
4929 * This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition,
4930 * but that should be really rare.
4931 */
4932 while (excess_offset) {
4933 uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4934 address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4935 excess_offset -= src2;
4936 }
4937 }
4938
4939 if (bld.program->gfx_level == GFX6) {
4940 /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4941 if (offset.type() != RegType::sgpr) {
4942 address = add64_32(bld, address, offset);
4943 offset = Temp();
4944 }
4945 offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4946 } else if (bld.program->gfx_level <= GFX8) {
4947 /* GFX7,8 (FLAT): VGPR address */
4948 if (offset.id()) {
4949 address = add64_32(bld, address, offset);
4950 offset = Temp();
4951 }
4952 address = as_vgpr(bld, address);
4953 } else {
4954 /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4955 if (address.type() == RegType::vgpr && offset.id()) {
4956 address = add64_32(bld, address, offset);
4957 offset = Temp();
4958 } else if (address.type() == RegType::sgpr && offset.id()) {
4959 offset = as_vgpr(bld, offset);
4960 }
4961 if (address.type() == RegType::sgpr && !offset.id())
4962 offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4963 }
4964
4965 *address_inout = address;
4966 *const_offset_inout = const_offset;
4967 *offset_inout = offset;
4968 }
4969
4970 Temp
4971 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4972 unsigned align_, unsigned const_offset, Temp dst_hint)
4973 {
4974 Temp addr = info.resource;
4975 if (!addr.id()) {
4976 addr = offset;
4977 offset = Temp();
4978 }
4979 lower_global_address(bld, 0, &addr, &const_offset, &offset);
4980
4981 unsigned bytes_size = 0;
4982 bool use_mubuf = bld.program->gfx_level == GFX6;
4983 bool global = bld.program->gfx_level >= GFX9;
4984 aco_opcode op;
4985 if (bytes_needed == 1 || align_ % 2u) {
4986 bytes_size = 1;
4987 op = use_mubuf ? aco_opcode::buffer_load_ubyte
4988 : global ? aco_opcode::global_load_ubyte
4989 : aco_opcode::flat_load_ubyte;
4990 } else if (bytes_needed == 2 || align_ % 4u) {
4991 bytes_size = 2;
4992 op = use_mubuf ? aco_opcode::buffer_load_ushort
4993 : global ? aco_opcode::global_load_ushort
4994 : aco_opcode::flat_load_ushort;
4995 } else if (bytes_needed <= 4) {
4996 bytes_size = 4;
4997 op = use_mubuf ? aco_opcode::buffer_load_dword
4998 : global ? aco_opcode::global_load_dword
4999 : aco_opcode::flat_load_dword;
5000 } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
5001 bytes_size = 8;
5002 op = use_mubuf ? aco_opcode::buffer_load_dwordx2
5003 : global ? aco_opcode::global_load_dwordx2
5004 : aco_opcode::flat_load_dwordx2;
5005 } else if (bytes_needed <= 12 && !use_mubuf) {
5006 bytes_size = 12;
5007 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
5008 } else {
5009 bytes_size = 16;
5010 op = use_mubuf ? aco_opcode::buffer_load_dwordx4
5011 : global ? aco_opcode::global_load_dwordx4
5012 : aco_opcode::flat_load_dwordx4;
5013 }
5014 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5015 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5016 if (use_mubuf) {
5017 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
5018 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
5019 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
5020 mubuf->operands[2] = Operand(offset);
5021 mubuf->mubuf().cache = info.cache;
5022 mubuf->mubuf().offset = const_offset;
5023 mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
5024 mubuf->mubuf().disable_wqm = false;
5025 mubuf->mubuf().sync = info.sync;
5026 mubuf->definitions[0] = Definition(val);
5027 bld.insert(std::move(mubuf));
5028 } else {
5029 aco_ptr<Instruction> flat{
5030 create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
5031 if (addr.regClass() == s2) {
5032 assert(global && offset.id() && offset.type() == RegType::vgpr);
5033 flat->operands[0] = Operand(offset);
5034 flat->operands[1] = Operand(addr);
5035 } else {
5036 assert(addr.type() == RegType::vgpr && !offset.id());
5037 flat->operands[0] = Operand(addr);
5038 flat->operands[1] = Operand(s1);
5039 }
5040 flat->flatlike().cache = info.cache;
5041 flat->flatlike().sync = info.sync;
5042 assert(global || !const_offset);
5043 flat->flatlike().offset = const_offset;
5044 flat->definitions[0] = Definition(val);
5045 bld.insert(std::move(flat));
5046 }
5047
5048 return val;
5049 }
5050
5051 const EmitLoadParameters global_load_params{global_load_callback, true, true, UINT32_MAX};
5052
5053 Temp
5054 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
5055 Temp address, unsigned base_offset, unsigned align)
5056 {
5057 assert(util_is_power_of_two_nonzero(align));
5058
5059 Builder bld(ctx->program, ctx->block);
5060
5061 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
5062 info.align_mul = align;
5063 info.align_offset = 0;
5064 info.sync = memory_sync_info(storage_shared);
5065 info.const_offset = base_offset;
5066 /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses,
5067 * if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order
5068 * to avoid copy-propagation.
5069 */
5070 info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
5071 ctx->program->wave_size == 64 &&
5072 ctx->program->workgroup_size > 64;
5073 emit_load(ctx, bld, info, lds_load_params);
5074
5075 return dst;
5076 }
5077
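/* Split src into count values with the given byte sizes, reusing the recorded vector
 * decomposition (allocated_vec) when possible to avoid redundant splits.
 */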
5078 void
5079 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
5080 Temp src)
5081 {
5082 if (!count)
5083 return;
5084
5085 Builder bld(ctx->program, ctx->block);
5086
5087 /* count == 1 fast path */
5088 if (count == 1) {
5089 if (dst_type == RegType::sgpr)
5090 dst[0] = bld.as_uniform(src);
5091 else
5092 dst[0] = as_vgpr(ctx, src);
5093 return;
5094 }
5095
5096 /* elem_size_bytes is the greatest common divisor which is a power of 2 */
5097 unsigned elem_size_bytes =
5098 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
5099
5100 ASSERTED bool is_subdword = elem_size_bytes < 4;
5101 assert(!is_subdword || dst_type == RegType::vgpr);
5102
5103 for (unsigned i = 0; i < count; i++)
5104 dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
5105
5106 std::vector<Temp> temps;
5107 /* use allocated_vec if possible */
5108 auto it = ctx->allocated_vec.find(src.id());
5109 if (it != ctx->allocated_vec.end()) {
5110 if (!it->second[0].id())
5111 goto split;
5112 unsigned elem_size = it->second[0].bytes();
5113 assert(src.bytes() % elem_size == 0);
5114
5115 for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
5116 if (!it->second[i].id())
5117 goto split;
5118 }
5119 if (elem_size_bytes % elem_size)
5120 goto split;
5121
5122 temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
5123 elem_size_bytes = elem_size;
5124 }
5125
5126 split:
5127 /* split src if necessary */
5128 if (temps.empty()) {
5129 if (is_subdword && src.type() == RegType::sgpr)
5130 src = as_vgpr(ctx, src);
5131 if (dst_type == RegType::sgpr)
5132 src = bld.as_uniform(src);
5133
5134 unsigned num_elems = src.bytes() / elem_size_bytes;
5135 aco_ptr<Instruction> split{
5136 create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
5137 split->operands[0] = Operand(src);
5138 for (unsigned i = 0; i < num_elems; i++) {
5139 temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
5140 split->definitions[i] = Definition(temps.back());
5141 }
5142 bld.insert(std::move(split));
5143 }
5144
5145 unsigned idx = 0;
5146 for (unsigned i = 0; i < count; i++) {
5147 unsigned op_count = dst[i].bytes() / elem_size_bytes;
5148 if (op_count == 1) {
5149 if (dst_type == RegType::sgpr)
5150 dst[i] = bld.as_uniform(temps[idx++]);
5151 else
5152 dst[i] = as_vgpr(ctx, temps[idx++]);
5153 continue;
5154 }
5155
5156 aco_ptr<Instruction> vec{
5157 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
5158 for (unsigned j = 0; j < op_count; j++) {
5159 Temp tmp = temps[idx++];
5160 if (dst_type == RegType::sgpr)
5161 tmp = bld.as_uniform(tmp);
5162 vec->operands[j] = Operand(tmp);
5163 }
5164 vec->definitions[0] = Definition(dst[i]);
5165 bld.insert(std::move(vec));
5166 }
5167 return;
5168 }
5169
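/* Find the next consecutive byte range in todo_mask that is either fully written or
 * fully skipped according to mask, starting at the lowest todo bit. Returns false if
 * the range is skipped.
 */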
5170 bool
5171 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
5172 {
5173 unsigned start_elem = ffs(todo_mask) - 1;
5174 bool skip = !(mask & (1 << start_elem));
5175 if (skip)
5176 mask = ~mask & todo_mask;
5177
5178 mask &= todo_mask;
5179
5180 u_bit_scan_consecutive_range(&mask, start, count);
5181
5182 return !skip;
5183 }
5184
5185 void
5186 advance_write_mask(uint32_t* todo_mask, int start, int count)
5187 {
5188 *todo_mask &= ~u_bit_consecutive(0, count) << start;
5189 }
5190
5191 void
5192 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
5193 unsigned base_offset, unsigned align)
5194 {
5195 assert(util_is_power_of_two_nonzero(align));
5196 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
5197
5198 Builder bld(ctx->program, ctx->block);
5199 bool large_ds_write = ctx->options->gfx_level >= GFX7;
5200 bool usable_write2 = ctx->options->gfx_level >= GFX7;
5201
5202 unsigned write_count = 0;
5203 Temp write_datas[32];
5204 unsigned offsets[32];
5205 unsigned bytes[32];
5206 aco_opcode opcodes[32];
5207
5208 wrmask = util_widen_mask(wrmask, elem_size_bytes);
5209
5210 const unsigned wrmask_bitcnt = util_bitcount(wrmask);
5211 uint32_t todo = u_bit_consecutive(0, data.bytes());
5212
5213 if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
5214 todo = MIN2(todo, wrmask);
5215
5216 while (todo) {
5217 int offset, byte;
5218 if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
5219 offsets[write_count] = offset;
5220 bytes[write_count] = byte;
5221 opcodes[write_count] = aco_opcode::num_opcodes;
5222 write_count++;
5223 advance_write_mask(&todo, offset, byte);
5224 continue;
5225 }
5226
5227 bool aligned2 = offset % 2 == 0 && align % 2 == 0;
5228 bool aligned4 = offset % 4 == 0 && align % 4 == 0;
5229 bool aligned8 = offset % 8 == 0 && align % 8 == 0;
5230 bool aligned16 = offset % 16 == 0 && align % 16 == 0;
5231
5232 // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
5233 aco_opcode op = aco_opcode::num_opcodes;
5234 if (byte >= 16 && aligned16 && large_ds_write) {
5235 op = aco_opcode::ds_write_b128;
5236 byte = 16;
5237 } else if (byte >= 12 && aligned16 && large_ds_write) {
5238 op = aco_opcode::ds_write_b96;
5239 byte = 12;
5240 } else if (byte >= 8 && aligned8) {
5241 op = aco_opcode::ds_write_b64;
5242 byte = 8;
5243 } else if (byte >= 4 && aligned4) {
5244 op = aco_opcode::ds_write_b32;
5245 byte = 4;
5246 } else if (byte >= 2 && aligned2) {
5247 op = aco_opcode::ds_write_b16;
5248 byte = 2;
5249 } else if (byte >= 1) {
5250 op = aco_opcode::ds_write_b8;
5251 byte = 1;
5252 } else {
5253 assert(false);
5254 }
5255
5256 offsets[write_count] = offset;
5257 bytes[write_count] = byte;
5258 opcodes[write_count] = op;
5259 write_count++;
5260 advance_write_mask(&todo, offset, byte);
5261 }
5262
5263 Operand m = load_lds_size_m0(bld);
5264
5265 split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
5266
5267 for (unsigned i = 0; i < write_count; i++) {
5268 aco_opcode op = opcodes[i];
5269 if (op == aco_opcode::num_opcodes)
5270 continue;
5271
5272 Temp split_data = write_datas[i];
5273
5274 unsigned second = write_count;
5275 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
5276 for (second = i + 1; second < write_count; second++) {
5277 if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
5278 op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
5279 opcodes[second] = aco_opcode::num_opcodes;
5280 break;
5281 }
5282 }
5283 }
5284
5285 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
5286 unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
5287
5288 unsigned inline_offset = base_offset + offsets[i];
5289 unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
5290 Temp address_offset = address;
5291 if (inline_offset > max_offset) {
5292 address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
5293 inline_offset = offsets[i];
5294 }
5295
5296 /* offsets[i] shouldn't be large enough for this to happen */
5297 assert(inline_offset <= max_offset);
5298
5299 Instruction* instr;
5300 if (write2) {
5301 Temp second_data = write_datas[second];
5302 inline_offset /= split_data.bytes();
5303 instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
5304 inline_offset + write2_off);
5305 } else {
5306 instr = bld.ds(op, address_offset, split_data, m, inline_offset);
5307 }
5308 instr->ds().sync = memory_sync_info(storage_shared);
5309
5310 if (m.isUndefined())
5311 instr->operands.pop_back();
5312 }
5313 }
5314
5315 aco_opcode
5316 get_buffer_store_op(unsigned bytes)
5317 {
5318 switch (bytes) {
5319 case 1: return aco_opcode::buffer_store_byte;
5320 case 2: return aco_opcode::buffer_store_short;
5321 case 4: return aco_opcode::buffer_store_dword;
5322 case 8: return aco_opcode::buffer_store_dwordx2;
5323 case 12: return aco_opcode::buffer_store_dwordx3;
5324 case 16: return aco_opcode::buffer_store_dwordx4;
5325 }
5326 unreachable("Unexpected store size");
5327 return aco_opcode::num_opcodes;
5328 }
5329
5330 void
5331 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
5332 Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
5333 Temp* write_datas, unsigned* offsets)
5334 {
5335 unsigned write_count_with_skips = 0;
5336 bool skips[16];
5337 unsigned bytes[16];
5338
5339 /* determine how to split the data */
5340 unsigned todo = u_bit_consecutive(0, data.bytes());
5341 while (todo) {
5342 int offset, byte;
5343 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
5344 offsets[write_count_with_skips] = offset;
5345 if (skips[write_count_with_skips]) {
5346 bytes[write_count_with_skips] = byte;
5347 advance_write_mask(&todo, offset, byte);
5348 write_count_with_skips++;
5349 continue;
5350 }
5351
5352 /* the only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, and a store can't be
5353 * larger than swizzle_element_size */
5354 byte = MIN2(byte, swizzle_element_size);
5355 if (byte % 4)
5356 byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
5357
5358 /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5359 if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5360 byte = 8;
5361
5362 /* dword or larger stores have to be dword-aligned */
5363 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5364 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5365 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5366 if (!dword_aligned)
5367 byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5368
5369 bytes[write_count_with_skips] = byte;
5370 advance_write_mask(&todo, offset, byte);
5371 write_count_with_skips++;
5372 }
5373
5374 /* actually split data */
5375 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5376
5377 /* remove skips */
5378 for (unsigned i = 0; i < write_count_with_skips; i++) {
5379 if (skips[i])
5380 continue;
5381 write_datas[*write_count] = write_datas[i];
5382 offsets[*write_count] = offsets[i];
5383 (*write_count)++;
5384 }
5385 }
5386
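/* Create a vector from cnt array elements of elem_size_bytes each, substituting zero
 * for elements without an id. Optionally splits the result for later extraction.
 */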
5387 Temp
5388 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5389 unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5390 {
5391 Builder bld(ctx->program, ctx->block);
5392 unsigned dword_size = elem_size_bytes / 4;
5393
5394 if (!dst.id())
5395 dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5396
5397 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5398 aco_ptr<Instruction> instr{
5399 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5400 instr->definitions[0] = Definition(dst);
5401
5402 for (unsigned i = 0; i < cnt; ++i) {
5403 if (arr[i].id()) {
5404 assert(arr[i].size() == dword_size);
5405 allocated_vec[i] = arr[i];
5406 instr->operands[i] = Operand(arr[i]);
5407 } else {
5408 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5409 Operand::zero(dword_size == 2 ? 8 : 4));
5410 allocated_vec[i] = zero;
5411 instr->operands[i] = Operand(zero);
5412 }
5413 }
5414
5415 bld.insert(std::move(instr));
5416
5417 if (split_cnt)
5418 emit_split_vector(ctx, dst, split_cnt);
5419 else
5420 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5421
5422 return dst;
5423 }
5424
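/* VMEM immediate offsets are limited to 12 bits (< 4096); move any excess into
 * voffset and return the remaining constant offset.
 */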
5425 inline unsigned
5426 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5427 {
5428 if (const_offset >= 4096) {
5429 unsigned excess_const_offset = const_offset / 4096u * 4096u;
5430 const_offset %= 4096u;
5431
5432 if (!voffset.id())
5433 voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5434 else if (unlikely(voffset.regClass() == s1))
5435 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5436 Operand::c32(excess_const_offset), Operand(voffset));
5437 else if (likely(voffset.regClass() == v1))
5438 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5439 else
5440 unreachable("Unsupported register class of voffset");
5441 }
5442
5443 return const_offset;
5444 }
5445
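/* Extract the wave id within the threadgroup from bits 24..27 of merged_wave_info. */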
5446 Temp
5447 wave_id_in_threadgroup(isel_context* ctx)
5448 {
5449 Builder bld(ctx->program, ctx->block);
5450 return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
5451 get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(24u | (4u << 16)));
5452 }
5453
5454 Temp
5455 thread_id_in_threadgroup(isel_context* ctx)
5456 {
5457 /* tid_in_tg = wave_id * wave_size + tid_in_wave */
5458
5459 Builder bld(ctx->program, ctx->block);
5460 Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
5461
5462 if (ctx->program->workgroup_size <= ctx->program->wave_size)
5463 return tid_in_wave;
5464
5465 Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
5466 Temp num_pre_threads =
5467 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
5468 Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
5469 return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
5470 }
5471
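/* Record each written output component in ctx->outputs, indexed by semantic location
 * and component. Returns false if the offset is not a constant zero.
 */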
5472 bool
5473 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5474 {
5475 unsigned write_mask = nir_intrinsic_write_mask(instr);
5476 unsigned component = nir_intrinsic_component(instr);
5477 nir_src offset = *nir_get_io_offset_src(instr);
5478
5479 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5480 return false;
5481
5482 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5483
5484 if (instr->src[0].ssa->bit_size == 64)
5485 write_mask = util_widen_mask(write_mask, 2);
5486
5487 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5488
5489 /* Use semantic location as index. radv already uses it as the intrinsic base
5490 * but radeonsi does not. We need LS output and TCS input indices to match
5491 * each other, so use the semantic location explicitly. The TCS epilog also
5492 * indexes tess factor temps by semantic location directly.
5493 */
5494 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5495 unsigned base = sem.location;
5496 if (ctx->stage == fragment_fs) {
5497 /* The color result is a legacy slot which never appears together with the
5498 * data result. Here we just use the data slot for it to simplify
5499 * code handling for both of them.
5500 */
5501 if (base == FRAG_RESULT_COLOR)
5502 base = FRAG_RESULT_DATA0;
5503
5504 /* The second output of dual-source blending just uses the data1 slot for simplicity,
5505 * because dual-source blending does not support multiple render targets.
5506 */
5507 base += sem.dual_source_blend_index;
5508 }
5509 unsigned idx = base * 4u + component;
5510
5511 for (unsigned i = 0; i < 8; ++i) {
5512 if (write_mask & (1 << i)) {
5513 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5514 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5515 }
5516 idx++;
5517 }
5518
5519 if (ctx->stage == fragment_fs && ctx->program->info.ps.has_epilog && base >= FRAG_RESULT_DATA0) {
5520 unsigned index = base - FRAG_RESULT_DATA0;
5521
5522 if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5523 ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5524 } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5525 ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5526 } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5527 ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5528 }
5529 }
5530
5531 return true;
5532 }
5533
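/* Fast path for merged VS+TCS: reads a TCS per-vertex input directly from the
 * temporaries the VS outputs were stored to, provided the vertex index is the
 * invocation id and the offset is constant. Returns false if this path cannot
 * be used.
 */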
5534 bool
5535 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5536 {
5537 /* Only TCS per-vertex inputs are supported by this function.
5538 * Per-vertex inputs only match between the VS and TCS invocation ids when
5539 * the number of invocations is the same.
5540 */
5541 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5542 return false;
5543
5544 nir_src* off_src = nir_get_io_offset_src(instr);
5545 nir_src* vertex_index_src = nir_get_io_arrayed_index_src(instr);
5546 nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
5547 bool can_use_temps =
5548 nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
5549 nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
5550
5551 if (!can_use_temps)
5552 return false;
5553
5554 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5555
5556 unsigned idx =
5557 sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5558 Temp* src = &ctx->inputs.temps[idx];
5559 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5560
5561 return true;
5562 }
5563
5564 void
5565 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5566 {
5567 /* LS passes outputs to TCS via temps if they have the same in/out patch size. */
5568 bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5569 ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5570
5571 bool ps_need_output = ctx->stage == fragment_fs;
5572
5573 if (ls_need_output || ps_need_output) {
5574 bool stored_to_temps = store_output_to_temps(ctx, instr);
5575 if (!stored_to_temps) {
5576 isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5577 abort();
5578 }
5579 } else {
5580 unreachable("Shader stage not implemented");
5581 }
5582 }
5583
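/* Returns true if the current exec mask may differ from the top-level exec
 * mask, i.e. inside a loop, inside a divergent if, or after a divergent
 * discard.
 */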
5584 bool
5585 in_exec_divergent_or_in_loop(isel_context* ctx)
5586 {
5587 return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5588 ctx->cf_info.had_divergent_discard;
5589 }
5590
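/* GFX11+ interpolation: lds_param_load fetches the per-vertex parameter and
 * v_interp_p10/p2 combine it with the i/j coordinates. In divergent control
 * flow this is instead deferred to the p_interp_gfx11 pseudo instruction,
 * which is lowered after instruction selection.
 */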
5591 void
5592 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5593 Temp prim_mask, bool high_16bits)
5594 {
5595 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5596 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5597
5598 Builder bld(ctx->program, ctx->block);
5599
5600 if (in_exec_divergent_or_in_loop(ctx)) {
5601 bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5602 Operand::c32(idx), Operand::c32(component), Operand::c32(high_16bits), coord1,
5603 coord2, bld.m0(prim_mask));
5604 return;
5605 }
5606
5607 Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5608
5609 Temp res;
5610 if (dst.regClass() == v2b) {
5611 Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1,
5612 p, high_16bits ? 0x5 : 0);
5613 bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p, coord2, p10,
5614 high_16bits ? 0x1 : 0);
5615 } else {
5616 Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5617 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
5618 }
5619 /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5620 set_wqm(ctx, true);
5621 }
5622
5623 void
5624 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5625 Temp prim_mask, bool high_16bits)
5626 {
5627 if (ctx->options->gfx_level >= GFX11) {
5628 emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask, high_16bits);
5629 return;
5630 }
5631
5632 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5633 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5634
5635 Builder bld(ctx->program, ctx->block);
5636
5637 if (dst.regClass() == v2b) {
5638 if (ctx->program->dev.has_16bank_lds) {
5639 assert(ctx->options->gfx_level <= GFX8);
5640 Builder::Result interp_p1 =
5641 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5642 bld.m0(prim_mask), idx, component);
5643 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v1), coord1,
5644 bld.m0(prim_mask), interp_p1, idx, component, high_16bits);
5645 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5646 interp_p1, idx, component, high_16bits);
5647 } else {
5648 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5649
5650 if (ctx->options->gfx_level == GFX8)
5651 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5652
5653 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5654 bld.m0(prim_mask), idx, component, high_16bits);
5655 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5656 component, high_16bits);
5657 }
5658 } else {
5659 assert(!high_16bits);
5660 Temp interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5661 bld.m0(prim_mask), idx, component);
5662
5663 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5664 idx, component);
5665 }
5666 }
5667
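/* Loads an input attribute from a single vertex without interpolation:
 * v_interp_mov_f32 selecting P0/P10/P20 before GFX11, or lds_param_load plus
 * a DPP quad-perm broadcast on GFX11+.
 */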
5668 void
5669 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5670 Temp dst, Temp prim_mask, bool high_16bits)
5671 {
5672 Builder bld(ctx->program, ctx->block);
5673 Temp tmp = dst.bytes() == 2 ? bld.tmp(v1) : dst;
5674 if (ctx->options->gfx_level >= GFX11) {
5675 uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5676 if (in_exec_divergent_or_in_loop(ctx)) {
5677 bld.pseudo(aco_opcode::p_interp_gfx11, Definition(tmp), Operand(v1.as_linear()),
5678 Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5679 bld.m0(prim_mask));
5680 } else {
5681 Temp p =
5682 bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5683 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(tmp), p, dpp_ctrl);
5684 /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5685 set_wqm(ctx, true);
5686 }
5687 } else {
5688 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(tmp), Operand::c32((vertex_id + 2) % 3),
5689 bld.m0(prim_mask), idx, component);
5690 }
5691
5692 if (dst.id() != tmp.id())
5693 emit_extract_vector(ctx, tmp, high_16bits, dst);
5694 }
5695
5696 void
5697 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
5698 {
5699 Builder bld(ctx->program, ctx->block);
5700
5701 aco_ptr<Instruction> vec(
5702 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
5703 for (unsigned i = 0; i < num_components; i++) {
5704 if (ctx->args->frag_pos[i].used)
5705 vec->operands[i] = Operand(get_arg(ctx, ctx->args->frag_pos[i]));
5706 else
5707 vec->operands[i] = Operand(v1);
5708 }
5709 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
5710 assert(num_components == 4);
5711 vec->operands[3] =
5712 bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->frag_pos[3]));
5713 }
5714
5715 for (Operand& op : vec->operands)
5716 op = op.isUndefined() ? Operand::zero() : op;
5717
5718 vec->definitions[0] = Definition(dst);
5719 ctx->block->instructions.emplace_back(std::move(vec));
5720 emit_split_vector(ctx, dst, num_components);
5721 return;
5722 }
5723
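/* Converts the hw VRS rate from the ancillary VGPR (x rate in bits [2:3],
 * y rate in bits [4:5]) into the API fragment shading rate encoding used
 * here: 4 (bit 2) for 2x horizontal, 1 (bit 0) for 2x vertical.
 */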
5724 void
5725 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
5726 {
5727 Builder bld(ctx->program, ctx->block);
5728 Temp cond;
5729
5730 /* VRS Rate X = Ancillary[2:3]
5731 * VRS Rate Y = Ancillary[4:5]
5732 */
5733 Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5734 Operand::c32(2u), Operand::c32(2u));
5735 Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5736 Operand::c32(4u), Operand::c32(2u));
5737
5738 /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
5739 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
5740 x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5741 bld.copy(bld.def(v1), Operand::c32(4u)), cond);
5742
5743 /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
5744 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
5745 y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5746 bld.copy(bld.def(v1), Operand::c32(1u)), cond);
5747
5748 bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
5749 }
5750
5751 void
5752 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5753 {
5754 Temp dst = get_ssa_temp(ctx, &instr->def);
5755 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5756 unsigned idx = nir_intrinsic_base(instr);
5757 unsigned component = nir_intrinsic_component(instr);
5758 bool high_16bits = nir_intrinsic_io_semantics(instr).high_16bits;
5759 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5760
5761 assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5762
5763 if (instr->def.num_components == 1) {
5764 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask, high_16bits);
5765 } else {
5766 aco_ptr<Instruction> vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
5767 instr->def.num_components, 1));
5768 for (unsigned i = 0; i < instr->def.num_components; i++) {
5769 Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5770 emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask, high_16bits);
5771 vec->operands[i] = Operand(tmp);
5772 }
5773 vec->definitions[0] = Definition(dst);
5774 ctx->block->instructions.emplace_back(std::move(vec));
5775 }
5776 }
5777
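/* emit_load callback that emits a single typed buffer load (MTBUF). The
 * fetched component count is clamped to what the vertex format and the
 * alignment allow, so the emitted load may be smaller than bytes_needed.
 */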
5778 Temp
5779 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5780 unsigned alignment, unsigned const_offset, Temp dst_hint)
5781 {
5782 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5783 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5784
5785 if (info.soffset.id()) {
5786 if (soffset.isTemp())
5787 vaddr = bld.copy(bld.def(v1), soffset);
5788 soffset = Operand(info.soffset);
5789 }
5790
5791 if (soffset.isUndefined())
5792 soffset = Operand::zero();
5793
5794 const bool offen = !vaddr.isUndefined();
5795 const bool idxen = info.idx.id();
5796
5797 if (offen && idxen)
5798 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5799 else if (idxen)
5800 vaddr = Operand(info.idx);
5801
5802 /* Determine number of fetched components.
5803 * Note, ACO IR works with GFX6-8 nfmt + dfmt fields, these are later converted for GFX10+.
5804 */
5805 const struct ac_vtx_format_info* vtx_info =
5806 ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5807 /* The number of channels in the format determines the memory range. */
5808 const unsigned max_components = vtx_info->num_channels;
5809 /* Calculate maximum number of components loaded according to alignment. */
5810 unsigned max_fetched_components = bytes_needed / info.component_size;
5811 max_fetched_components =
5812 ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5813 alignment, max_fetched_components);
5814 const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5815 /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5816 * If a larger format is selected, it's still OK to load a smaller amount from it.
5817 */
5818 bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5819 unsigned bytes_size = 0;
5820 const unsigned bit_size = info.component_size * 8;
5821 aco_opcode op = aco_opcode::num_opcodes;
5822
5823 if (bytes_needed == 2) {
5824 bytes_size = 2;
5825 op = aco_opcode::tbuffer_load_format_d16_x;
5826 } else if (bytes_needed <= 4) {
5827 bytes_size = 4;
5828 if (bit_size == 16)
5829 op = aco_opcode::tbuffer_load_format_d16_xy;
5830 else
5831 op = aco_opcode::tbuffer_load_format_x;
5832 } else if (bytes_needed <= 6) {
5833 bytes_size = 6;
5834 if (bit_size == 16)
5835 op = aco_opcode::tbuffer_load_format_d16_xyz;
5836 else
5837 op = aco_opcode::tbuffer_load_format_xy;
5838 } else if (bytes_needed <= 8) {
5839 bytes_size = 8;
5840 if (bit_size == 16)
5841 op = aco_opcode::tbuffer_load_format_d16_xyzw;
5842 else
5843 op = aco_opcode::tbuffer_load_format_xy;
5844 } else if (bytes_needed <= 12) {
5845 bytes_size = 12;
5846 op = aco_opcode::tbuffer_load_format_xyz;
5847 } else {
5848 bytes_size = 16;
5849 op = aco_opcode::tbuffer_load_format_xyzw;
5850 }
5851
5852 /* Abort when a suitable opcode wasn't found so we don't compile buggy shaders. */
5853 if (op == aco_opcode::num_opcodes) {
5854 aco_err(bld.program, "unsupported bit size for typed buffer load");
5855 abort();
5856 }
5857
5858 aco_ptr<Instruction> mtbuf{create_instruction(op, Format::MTBUF, 3, 1)};
5859 mtbuf->operands[0] = Operand(info.resource);
5860 mtbuf->operands[1] = vaddr;
5861 mtbuf->operands[2] = soffset;
5862 mtbuf->mtbuf().offen = offen;
5863 mtbuf->mtbuf().idxen = idxen;
5864 mtbuf->mtbuf().cache = info.cache;
5865 mtbuf->mtbuf().sync = info.sync;
5866 mtbuf->mtbuf().offset = const_offset;
5867 mtbuf->mtbuf().dfmt = fetch_fmt & 0xf;
5868 mtbuf->mtbuf().nfmt = fetch_fmt >> 4;
5869 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5870 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5871 mtbuf->definitions[0] = Definition(val);
5872 bld.insert(std::move(mtbuf));
5873
5874 return val;
5875 }
5876
5877 const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, false, true, 4096};
5878
5879 void
5880 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5881 {
5882 Builder bld(ctx->program, ctx->block);
5883 Temp dst = get_ssa_temp(ctx, &instr->def);
5884 nir_src offset = *nir_get_io_offset_src(instr);
5885
5886 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5887 isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5888
5889 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5890
5891 unsigned idx = nir_intrinsic_base(instr);
5892 unsigned component = nir_intrinsic_component(instr);
5893 bool high_16bits = nir_intrinsic_io_semantics(instr).high_16bits;
5894 unsigned vertex_id = 0; /* P0 */
5895
5896 if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5897 vertex_id = nir_src_as_uint(instr->src[0]);
5898
5899 if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5900 emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask, high_16bits);
5901 } else {
5902 unsigned num_components = instr->def.num_components;
5903 if (instr->def.bit_size == 64)
5904 num_components *= 2;
5905 aco_ptr<Instruction> vec{
5906 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5907 for (unsigned i = 0; i < num_components; i++) {
5908 unsigned chan_component = (component + i) % 4;
5909 unsigned chan_idx = idx + (component + i) / 4;
5910 vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5911 emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5912 prim_mask, high_16bits);
5913 }
5914 vec->definitions[0] = Definition(dst);
5915 bld.insert(std::move(vec));
5916 }
5917 }
5918
5919 void
5920 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5921 {
5922 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5923
5924 Builder bld(ctx->program, ctx->block);
5925 Temp dst = get_ssa_temp(ctx, &instr->def);
5926
5927 if (load_input_from_temps(ctx, instr, dst))
5928 return;
5929
5930 unreachable("LDS-based TCS input should have been lowered in NIR.");
5931 }
5932
5933 void
5934 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5935 {
5936 switch (ctx->shader->info.stage) {
5937 case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5938 default: unreachable("Unimplemented shader stage");
5939 }
5940 }
5941
5942 void
5943 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5944 {
5945 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5946
5947 Builder bld(ctx->program, ctx->block);
5948 Temp dst = get_ssa_temp(ctx, &instr->def);
5949
5950 Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5951 Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5952 Operand tes_w = Operand::zero();
5953
5954 if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
5955 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5956 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5957 tes_w = Operand(tmp);
5958 }
5959
5960 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5961 emit_split_vector(ctx, tess_coord, 3);
5962 }
5963
5964 ac_hw_cache_flags
5965 get_cache_flags(isel_context* ctx, unsigned access)
5966 {
5967 return ac_get_hw_cache_flags(ctx->program->gfx_level, (gl_access_qualifier)access);
5968 }
5969
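/* Cache flags for atomics: when the previous value is needed, GFX12+ uses the
 * atomic-return temporal hint, older generations set GLC.
 */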
5970 ac_hw_cache_flags
5971 get_atomic_cache_flags(isel_context* ctx, bool return_previous)
5972 {
5973 ac_hw_cache_flags cache = get_cache_flags(ctx, ACCESS_TYPE_ATOMIC);
5974 if (return_previous && ctx->program->gfx_level >= GFX12)
5975 cache.gfx12.temporal_hint |= gfx12_atomic_return;
5976 else if (return_previous)
5977 cache.value |= ac_glc;
5978 return cache;
5979 }
5980
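/* Emits a buffer load, using SMEM when the destination is uniform and the
 * access is reorderable (and not coherent/volatile before GFX8), otherwise
 * MUBUF.
 */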
5981 void
5982 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5983 Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
5984 unsigned access = ACCESS_CAN_REORDER, memory_sync_info sync = memory_sync_info())
5985 {
5986 Builder bld(ctx->program, ctx->block);
5987
5988 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
5989
5990 bool use_smem = dst.type() != RegType::vgpr && (ctx->options->gfx_level >= GFX8 || !glc) &&
5991 (access & ACCESS_CAN_REORDER);
5992 if (use_smem)
5993 offset = bld.as_uniform(offset);
5994 else {
5995 /* GFX6-7 are affected by a hw bug that prevents address clamping from
5996 * working correctly when the SGPR offset is used.
5997 */
5998 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5999 offset = as_vgpr(ctx, offset);
6000 }
6001
6002 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
6003 info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD | (use_smem ? ACCESS_TYPE_SMEM : 0));
6004 info.sync = sync;
6005 info.align_mul = align_mul;
6006 info.align_offset = align_offset;
6007 if (use_smem)
6008 emit_load(ctx, bld, info, smem_load_params);
6009 else
6010 emit_load(ctx, bld, info, mubuf_load_params);
6011 }
6012
6013 void
6014 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
6015 {
6016 Temp dst = get_ssa_temp(ctx, &instr->def);
6017 Builder bld(ctx->program, ctx->block);
6018 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6019
6020 unsigned size = instr->def.bit_size / 8;
6021 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6022 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
6023 }
6024
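/* Loads from the shader's embedded constant data: builds a buffer descriptor
 * whose base is p_constaddr of the constant data section and whose num_records
 * is clamped to constant_data_size, then goes through the regular buffer load
 * path.
 */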
6025 void
6026 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
6027 {
6028 Temp dst = get_ssa_temp(ctx, &instr->def);
6029
6030 Builder bld(ctx->program, ctx->block);
6031
6032 uint32_t desc[4];
6033 ac_build_raw_buffer_descriptor(ctx->options->gfx_level, 0, 0, desc);
6034
6035 unsigned base = nir_intrinsic_base(instr);
6036 unsigned range = nir_intrinsic_range(instr);
6037
6038 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
6039 if (base && offset.type() == RegType::sgpr)
6040 offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
6041 Operand::c32(base));
6042 else if (base && offset.type() == RegType::vgpr)
6043 offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
6044
6045 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6046 bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
6047 Operand::c32(ctx->constant_data_offset)),
6048 Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
6049 Operand::c32(desc[3]));
6050 unsigned size = instr->def.bit_size / 8;
6051 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, nir_intrinsic_align_mul(instr),
6052 nir_intrinsic_align_offset(instr));
6053 }
6054
6055 /* Packs multiple Temps of different sizes into a vector of v1 Temps.
6056 * The byte count of each input Temp must be a multiple of 2.
6057 */
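/* E.g. three v2b Temps a, b, c become two v1 Temps: {pack(a, b), pack(c, undef)}. */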
6058 static std::vector<Temp>
6059 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
6060 {
6061 Builder bld(ctx->program, ctx->block);
6062 std::vector<Temp> packed;
6063 Temp low = Temp();
6064 for (Temp tmp : unpacked) {
6065 assert(tmp.bytes() % 2 == 0);
6066 unsigned byte_idx = 0;
6067 while (byte_idx < tmp.bytes()) {
6068 if (low != Temp()) {
6069 Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
6070 Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
6071 low = Temp();
6072 packed.push_back(dword);
6073 byte_idx += 2;
6074 } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
6075 packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
6076 byte_idx += 4;
6077 } else {
6078 low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
6079 byte_idx += 2;
6080 }
6081 }
6082 }
6083 if (low != Temp()) {
6084 Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
6085 packed.push_back(dword);
6086 }
6087 return packed;
6088 }
6089
6090 static bool
6091 should_declare_array(ac_image_dim dim)
6092 {
6093 return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
6094 dim == ac_image_2darraymsaa;
6095 }
6096
6097 static int
6098 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
6099 {
6100 switch (dim) {
6101 case GLSL_SAMPLER_DIM_BUF: return 1;
6102 case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
6103 case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
6104 case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
6105 case GLSL_SAMPLER_DIM_3D:
6106 case GLSL_SAMPLER_DIM_CUBE: return 3;
6107 case GLSL_SAMPLER_DIM_RECT:
6108 case GLSL_SAMPLER_DIM_SUBPASS: return 2;
6109 case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
6110 default: break;
6111 }
6112 return 0;
6113 }
6114
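/* Builds a MIMG instruction. Coordinates are kept as separate VADDR operands
 * up to the target's NSA limit; any remaining coordinates are packed into one
 * trailing vector operand.
 */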
6115 static MIMG_instruction*
6116 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
6117 Operand vdata = Operand(v1))
6118 {
6119 bool is_vsample = !samp.isUndefined() || op == aco_opcode::image_msaa_load;
6120
6121 size_t nsa_size = bld.program->dev.max_nsa_vgprs;
6122 if (!is_vsample && bld.program->gfx_level >= GFX12)
6123 nsa_size++; /* VIMAGE can encode one more VADDR */
6124 nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
6125
6126 const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
6127 if (strict_wqm)
6128 nsa_size = coords.size();
6129
6130 for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
6131 if (!coords[i].id())
6132 continue;
6133
6134 coords[i] = as_vgpr(bld, coords[i]);
6135 }
6136
6137 if (nsa_size < coords.size()) {
6138 Temp coord = coords[nsa_size];
6139 if (coords.size() - nsa_size > 1) {
6140 aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
6141 coords.size() - nsa_size, 1)};
6142
6143 unsigned coord_size = 0;
6144 for (unsigned i = nsa_size; i < coords.size(); i++) {
6145 vec->operands[i - nsa_size] = Operand(coords[i]);
6146 coord_size += coords[i].size();
6147 }
6148
6149 coord = bld.tmp(RegType::vgpr, coord_size);
6150 vec->definitions[0] = Definition(coord);
6151 bld.insert(std::move(vec));
6152 } else {
6153 coord = as_vgpr(bld, coord);
6154 }
6155
6156 coords[nsa_size] = coord;
6157 coords.resize(nsa_size + 1);
6158 }
6159
6160 bool has_dst = dst.id() != 0;
6161
6162 aco_ptr<Instruction> mimg{create_instruction(op, Format::MIMG, 3 + coords.size(), has_dst)};
6163 if (has_dst)
6164 mimg->definitions[0] = Definition(dst);
6165 mimg->operands[0] = Operand(rsrc);
6166 mimg->operands[1] = samp;
6167 mimg->operands[2] = vdata;
6168 for (unsigned i = 0; i < coords.size(); i++)
6169 mimg->operands[3 + i] = Operand(coords[i]);
6170 mimg->mimg().strict_wqm = strict_wqm;
6171
6172 return &bld.insert(std::move(mimg))->mimg();
6173 }
6174
6175 void
6176 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6177 {
6178 Builder bld(ctx->program, ctx->block);
6179 Temp dst = get_ssa_temp(ctx, &instr->def);
6180 Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6181 Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6182 Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6183 Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6184 Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6185 Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6186
6187 /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
6188 * There are five smaller vector groups:
6189 * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
6190 * These directly match the NIR intrinsic sources.
6191 */
6192 std::vector<Temp> args = {
6193 node, tmax, origin, dir, inv_dir,
6194 };
6195
6196 if (bld.program->gfx_level == GFX10_3) {
6197 std::vector<Temp> scalar_args;
6198 for (Temp tmp : args) {
6199 for (unsigned i = 0; i < tmp.size(); i++)
6200 scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
6201 }
6202 args = std::move(scalar_args);
6203 }
6204
6205 MIMG_instruction* mimg =
6206 emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
6207 mimg->dim = ac_image_1d;
6208 mimg->dmask = 0xf;
6209 mimg->unrm = true;
6210 mimg->r128 = true;
6211
6212 emit_split_vector(ctx, dst, instr->def.num_components);
6213 }
6214
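/* Gathers coordinate, sample index and LOD operands for an image intrinsic,
 * applying the GFX9 1D and 2D-view-of-3D workarounds, and packs 16-bit
 * coordinates into full dwords.
 */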
6215 static std::vector<Temp>
6216 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6217 {
6218
6219 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6220 bool a16 = instr->src[1].ssa->bit_size == 16;
6221 RegClass rc = a16 ? v2b : v1;
6222 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6223 bool is_array = nir_intrinsic_image_array(instr);
6224 ASSERTED bool add_frag_pos =
6225 (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6226 assert(!add_frag_pos && "Input attachments should be lowered.");
6227 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6228 bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6229 int count = image_type_to_components_count(dim, is_array);
6230 std::vector<Temp> coords;
6231 Builder bld(ctx->program, ctx->block);
6232
6233 if (gfx9_1d) {
6234 coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
6235 coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
6236 if (is_array)
6237 coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
6238 } else {
6239 for (int i = 0; i < count; i++)
6240 coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
6241 }
6242
6243 bool has_lod = false;
6244 Temp lod;
6245
6246 if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
6247 instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
6248 instr->intrinsic == nir_intrinsic_bindless_image_store) {
6249 int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
6250 assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
6251 has_lod =
6252 !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;
6253
6254 if (has_lod)
6255 lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
6256 }
6257
6258 if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
6259 /* The hw can't bind a slice of a 3D image as a 2D image, because it
6260 * ignores BASE_ARRAY if the target is 3D. The workaround is to read
6261 * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
6262 */
6263 assert(ctx->options->gfx_level == GFX9);
6264 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6265 Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
6266 /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
6267 Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
6268 Operand::c32(13u));
6269
6270 if (has_lod) {
6271 /* If there's a lod parameter, it matters whether the image is 3d or 2d
6272 * because the hw reads either the fourth or the third component as lod.
6273 * So detect 3d images and place the lod at the third component otherwise.
6274 * For non-3D descriptors we effectively add the lod twice to the coords,
6275 * but the hw only reads the first one; the second is ignored.
6276 */
6277 Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
6278 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
6279 Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
6280 Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
6281 Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
6282 first_layer =
6283 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
6284 }
6285
6286 if (a16)
6287 coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
6288 else
6289 coords.emplace_back(first_layer);
6290 }
6291
6292 if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6293 assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
6294 coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
6295 }
6296
6297 if (has_lod)
6298 coords.emplace_back(lod);
6299
6300 return emit_pack_v1(ctx, coords);
6301 }
6302
6303 memory_sync_info
6304 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6305 {
6306 /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6307 if (semantics & semantic_atomicrmw)
6308 return memory_sync_info(storage, semantics);
6309
6310 unsigned access = nir_intrinsic_access(instr);
6311
6312 if (access & ACCESS_VOLATILE)
6313 semantics |= semantic_volatile;
6314 if (access & ACCESS_CAN_REORDER)
6315 semantics |= semantic_can_reorder | semantic_private;
6316
6317 return memory_sync_info(storage, semantics);
6318 }
6319
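/* For TFE/sparse loads the destination is pre-initialized to zero, since the
 * hw may leave some components (e.g. the residency dword) unwritten. This
 * builds the zero vector and returns it as the extra data operand.
 */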
6320 Operand
6321 emit_tfe_init(Builder& bld, Temp dst)
6322 {
6323 Temp tmp = bld.tmp(dst.regClass());
6324
6325 aco_ptr<Instruction> vec{
6326 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6327 for (unsigned i = 0; i < dst.size(); i++)
6328 vec->operands[i] = Operand::zero();
6329 vec->definitions[0] = Definition(tmp);
6330 /* Since this is fixed to an instruction's definition register, any CSE will
6331 * just create copies. Copying costs about the same as zero-initialization,
6332 * but these copies can break up clauses.
6333 */
6334 vec->definitions[0].setNoCSE(true);
6335 bld.insert(std::move(vec));
6336
6337 return Operand(tmp);
6338 }
6339
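/* Image and buffer-image loads: the dmask is trimmed to the components that
 * are actually read (64-bit formats use the xy/zw halves of R64), and the
 * result is expanded back into the NIR destination layout at the end.
 */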
6340 void
6341 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6342 {
6343 Builder bld(ctx->program, ctx->block);
6344 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6345 bool is_array = nir_intrinsic_image_array(instr);
6346 bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
6347 Temp dst = get_ssa_temp(ctx, &instr->def);
6348
6349 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6350
6351 unsigned result_size = instr->def.num_components - is_sparse;
6352 unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
6353 expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6354 if (dim == GLSL_SAMPLER_DIM_BUF)
6355 expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6356 unsigned dmask = expand_mask;
6357 if (instr->def.bit_size == 64) {
6358 expand_mask &= 0x9;
6359 /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6360 dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6361 }
6362 if (is_sparse)
6363 expand_mask |= 1 << result_size;
6364
6365 bool d16 = instr->def.bit_size == 16;
6366 assert(!d16 || !is_sparse);
6367
6368 unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;
6369
6370 Temp tmp;
6371 if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
6372 tmp = dst;
6373 else
6374 tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));
6375
6376 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6377
6378 if (dim == GLSL_SAMPLER_DIM_BUF) {
6379 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6380
6381 aco_opcode opcode;
6382 if (!d16) {
6383 switch (util_bitcount(dmask)) {
6384 case 1: opcode = aco_opcode::buffer_load_format_x; break;
6385 case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6386 case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6387 case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6388 default: unreachable(">4 channel buffer image load");
6389 }
6390 } else {
6391 switch (util_bitcount(dmask)) {
6392 case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
6393 case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
6394 case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
6395 case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
6396 default: unreachable(">4 channel buffer image load");
6397 }
6398 }
6399 aco_ptr<Instruction> load{create_instruction(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6400 load->operands[0] = Operand(resource);
6401 load->operands[1] = Operand(vindex);
6402 load->operands[2] = Operand::c32(0);
6403 load->definitions[0] = Definition(tmp);
6404 load->mubuf().idxen = true;
6405 load->mubuf().cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
6406 load->mubuf().sync = sync;
6407 load->mubuf().tfe = is_sparse;
6408 if (load->mubuf().tfe)
6409 load->operands[3] = emit_tfe_init(bld, tmp);
6410 ctx->block->instructions.emplace_back(std::move(load));
6411 } else {
6412 std::vector<Temp> coords = get_image_coords(ctx, instr);
6413
6414 aco_opcode opcode;
6415 if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6416 opcode = aco_opcode::image_load;
6417 } else {
6418 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6419 opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6420 }
6421
6422 Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6423 MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
6424 load->cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
6425 load->a16 = instr->src[1].ssa->bit_size == 16;
6426 load->d16 = d16;
6427 load->dmask = dmask;
6428 load->unrm = true;
6429 load->tfe = is_sparse;
6430
6431 if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6432 load->dim = is_array ? ac_image_2darray : ac_image_2d;
6433 load->da = is_array;
6434 load->sync = memory_sync_info();
6435 } else {
6436 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6437 load->dim = sdim;
6438 load->da = should_declare_array(sdim);
6439 load->sync = sync;
6440 }
6441 }
6442
6443 if (is_sparse && instr->def.bit_size == 64) {
6444 /* The result components are 64-bit but the sparse residency code is
6445 * 32-bit. So add a zero to the end so expand_vector() works correctly.
6446 */
6447 tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6448 Operand::zero());
6449 }
6450
6451 expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
6452 }
6453
6454 void
6455 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6456 {
6457 Builder bld(ctx->program, ctx->block);
6458 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6459 bool is_array = nir_intrinsic_image_array(instr);
6460 Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6461 bool d16 = instr->src[3].ssa->bit_size == 16;
6462
6463 /* only R64_UINT and R64_SINT supported */
6464 if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6465 data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6466 data = as_vgpr(ctx, data);
6467
6468 uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6469
6470 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6471 unsigned access = nir_intrinsic_access(instr);
6472 ac_hw_cache_flags cache =
6473 get_cache_flags(ctx, access | ACCESS_TYPE_STORE | ACCESS_MAY_STORE_SUBDWORD);
6474
6475 uint32_t dmask = BITFIELD_MASK(num_components);
6476 if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6477 for (uint32_t i = 0; i < instr->num_components; i++) {
6478 /* components not in dmask receive:
6479 * GFX6-11.5: zero
6480 * GFX12+: first component in dmask
6481 */
6482 nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6483 if (nir_scalar_is_undef(comp)) {
6484 dmask &= ~BITFIELD_BIT(i);
6485 } else if (ctx->options->gfx_level <= GFX11_5) {
6486 if (nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0)
6487 dmask &= ~BITFIELD_BIT(i);
6488 } else {
6489 unsigned first = dim == GLSL_SAMPLER_DIM_BUF ? 0 : ffs(dmask) - 1;
6490 if (i != first && nir_scalar_equal(nir_scalar_resolved(instr->src[3].ssa, first), comp))
6491 dmask &= ~BITFIELD_BIT(i);
6492 }
6493 }
6494
6495 /* dmask cannot be 0, at least one vgpr is always read */
6496 if (dmask == 0)
6497 dmask = 1;
6498 /* buffer store only supports consecutive components. */
6499 if (dim == GLSL_SAMPLER_DIM_BUF)
6500 dmask = BITFIELD_MASK(util_last_bit(dmask));
6501
6502 if (dmask != BITFIELD_MASK(num_components)) {
6503 uint32_t dmask_count = util_bitcount(dmask);
6504 RegClass rc = d16 ? v2b : v1;
6505 if (dmask_count == 1) {
6506 data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6507 } else {
6508 aco_ptr<Instruction> vec{
6509 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6510 uint32_t index = 0;
6511 u_foreach_bit (bit, dmask) {
6512 vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6513 }
6514 data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6515 vec->definitions[0] = Definition(data);
6516 bld.insert(std::move(vec));
6517 }
6518 }
6519 }
6520
6521 if (dim == GLSL_SAMPLER_DIM_BUF) {
6522 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6523 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6524 aco_opcode opcode;
6525 if (!d16) {
6526 switch (dmask) {
6527 case 0x1: opcode = aco_opcode::buffer_store_format_x; break;
6528 case 0x3: opcode = aco_opcode::buffer_store_format_xy; break;
6529 case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break;
6530 case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break;
6531 default: unreachable(">4 channel buffer image store");
6532 }
6533 } else {
6534 switch (dmask) {
6535 case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6536 case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6537 case 0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6538 case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6539 default: unreachable(">4 channel buffer image store");
6540 }
6541 }
6542 aco_ptr<Instruction> store{create_instruction(opcode, Format::MUBUF, 4, 0)};
6543 store->operands[0] = Operand(rsrc);
6544 store->operands[1] = Operand(vindex);
6545 store->operands[2] = Operand::c32(0);
6546 store->operands[3] = Operand(data);
6547 store->mubuf().idxen = true;
6548 store->mubuf().cache = cache;
6549 store->mubuf().disable_wqm = true;
6550 store->mubuf().sync = sync;
6551 ctx->program->needs_exact = true;
6552 ctx->block->instructions.emplace_back(std::move(store));
6553 return;
6554 }
6555
6556 assert(data.type() == RegType::vgpr);
6557 std::vector<Temp> coords = get_image_coords(ctx, instr);
6558 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6559
6560 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6561 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6562
6563 MIMG_instruction* store =
6564 emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
6565 store->cache = cache;
6566 store->a16 = instr->src[1].ssa->bit_size == 16;
6567 store->d16 = d16;
6568 store->dmask = dmask;
6569 store->unrm = true;
6570 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6571 store->dim = sdim;
6572 store->da = should_declare_array(sdim);
6573 store->disable_wqm = true;
6574 store->sync = sync;
6575 ctx->program->needs_exact = true;
6576 return;
6577 }
6578
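/* Maps a NIR atomic op to its 32-bit buffer, 64-bit buffer and image atomic
 * opcodes. Ops without a 64-bit or image form (e.g. fadd) map those entries
 * to num_opcodes.
 */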
6579 void
6580 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6581 aco_opcode* image_op)
6582 {
6583 switch (op) {
6584 case nir_atomic_op_iadd:
6585 *buf_op = aco_opcode::buffer_atomic_add;
6586 *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6587 *image_op = aco_opcode::image_atomic_add;
6588 break;
6589 case nir_atomic_op_umin:
6590 *buf_op = aco_opcode::buffer_atomic_umin;
6591 *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6592 *image_op = aco_opcode::image_atomic_umin;
6593 break;
6594 case nir_atomic_op_imin:
6595 *buf_op = aco_opcode::buffer_atomic_smin;
6596 *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6597 *image_op = aco_opcode::image_atomic_smin;
6598 break;
6599 case nir_atomic_op_umax:
6600 *buf_op = aco_opcode::buffer_atomic_umax;
6601 *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6602 *image_op = aco_opcode::image_atomic_umax;
6603 break;
6604 case nir_atomic_op_imax:
6605 *buf_op = aco_opcode::buffer_atomic_smax;
6606 *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6607 *image_op = aco_opcode::image_atomic_smax;
6608 break;
6609 case nir_atomic_op_iand:
6610 *buf_op = aco_opcode::buffer_atomic_and;
6611 *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6612 *image_op = aco_opcode::image_atomic_and;
6613 break;
6614 case nir_atomic_op_ior:
6615 *buf_op = aco_opcode::buffer_atomic_or;
6616 *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6617 *image_op = aco_opcode::image_atomic_or;
6618 break;
6619 case nir_atomic_op_ixor:
6620 *buf_op = aco_opcode::buffer_atomic_xor;
6621 *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6622 *image_op = aco_opcode::image_atomic_xor;
6623 break;
6624 case nir_atomic_op_xchg:
6625 *buf_op = aco_opcode::buffer_atomic_swap;
6626 *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6627 *image_op = aco_opcode::image_atomic_swap;
6628 break;
6629 case nir_atomic_op_cmpxchg:
6630 *buf_op = aco_opcode::buffer_atomic_cmpswap;
6631 *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6632 *image_op = aco_opcode::image_atomic_cmpswap;
6633 break;
6634 case nir_atomic_op_inc_wrap:
6635 *buf_op = aco_opcode::buffer_atomic_inc;
6636 *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6637 *image_op = aco_opcode::image_atomic_inc;
6638 break;
6639 case nir_atomic_op_dec_wrap:
6640 *buf_op = aco_opcode::buffer_atomic_dec;
6641 *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6642 *image_op = aco_opcode::image_atomic_dec;
6643 break;
6644 case nir_atomic_op_fadd:
6645 *buf_op = aco_opcode::buffer_atomic_add_f32;
6646 *buf_op64 = aco_opcode::num_opcodes;
6647 *image_op = aco_opcode::num_opcodes;
6648 break;
6649 case nir_atomic_op_fmin:
6650 *buf_op = aco_opcode::buffer_atomic_fmin;
6651 *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6652 *image_op = aco_opcode::image_atomic_fmin;
6653 break;
6654 case nir_atomic_op_fmax:
6655 *buf_op = aco_opcode::buffer_atomic_fmax;
6656 *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6657 *image_op = aco_opcode::image_atomic_fmax;
6658 break;
6659 default: unreachable("unsupported atomic operation");
6660 }
6661 }
6662
6663 void
6664 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6665 {
6666 bool return_previous = !nir_def_is_unused(&instr->def);
6667 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6668 bool is_array = nir_intrinsic_image_array(instr);
6669 Builder bld(ctx->program, ctx->block);
6670
6671 const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6672 const bool cmpswap = op == nir_atomic_op_cmpxchg;
6673
6674 aco_opcode buf_op, buf_op64, image_op;
6675 translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6676
6677 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6678 bool is_64bit = data.bytes() == 8;
6679 assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6680
6681 if (cmpswap)
6682 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6683 get_ssa_temp(ctx, instr->src[4].ssa), data);
6684
6685 Temp dst = get_ssa_temp(ctx, &instr->def);
6686 memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6687
6688 if (dim == GLSL_SAMPLER_DIM_BUF) {
6689 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6690 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6691 // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6692 // implemented.");
6693 aco_ptr<Instruction> mubuf{create_instruction(is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4,
6694 return_previous ? 1 : 0)};
6695 mubuf->operands[0] = Operand(resource);
6696 mubuf->operands[1] = Operand(vindex);
6697 mubuf->operands[2] = Operand::c32(0);
6698 mubuf->operands[3] = Operand(data);
6699 Definition def =
6700 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6701 if (return_previous)
6702 mubuf->definitions[0] = def;
6703 mubuf->mubuf().offset = 0;
6704 mubuf->mubuf().idxen = true;
6705 mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6706 mubuf->mubuf().disable_wqm = true;
6707 mubuf->mubuf().sync = sync;
6708 ctx->program->needs_exact = true;
6709 ctx->block->instructions.emplace_back(std::move(mubuf));
6710 if (return_previous && cmpswap)
6711 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6712 return;
6713 }
6714
6715 std::vector<Temp> coords = get_image_coords(ctx, instr);
6716 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6717 Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6718 MIMG_instruction* mimg =
6719 emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
6720 mimg->cache = get_atomic_cache_flags(ctx, return_previous);
6721 mimg->dmask = (1 << data.size()) - 1;
6722 mimg->a16 = instr->src[1].ssa->bit_size == 16;
6723 mimg->unrm = true;
6724 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6725 mimg->dim = sdim;
6726 mimg->da = should_declare_array(sdim);
6727 mimg->disable_wqm = true;
6728 mimg->sync = sync;
6729 ctx->program->needs_exact = true;
6730 if (return_previous && cmpswap)
6731 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6732 return;
6733 }
6734
6735 void
6736 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6737 {
6738 Builder bld(ctx->program, ctx->block);
6739 unsigned num_components = instr->num_components;
6740
6741 Temp dst = get_ssa_temp(ctx, &instr->def);
6742 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6743
6744 unsigned access = nir_intrinsic_access(instr);
6745 unsigned size = instr->def.bit_size / 8;
6746
6747 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6748 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), access,
6749 get_memory_sync_info(instr, storage_buffer, 0));
6750 }
6751
6752 void
6753 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6754 {
6755 Builder bld(ctx->program, ctx->block);
6756 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6757 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6758 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6759 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6760
6761 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6762
6763 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6764
6765 unsigned write_count = 0;
6766 Temp write_datas[32];
6767 unsigned offsets[32];
6768 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6769 write_datas, offsets);
6770
6771 /* GFX6-7 are affected by a hw bug that prevents address clamping from
6772 * working correctly when the SGPR offset is used.
6773 */
6774 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6775 offset = as_vgpr(ctx, offset);
6776
6777 for (unsigned i = 0; i < write_count; i++) {
6778 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6779 unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE;
6780 if (write_datas[i].bytes() < 4)
6781 access |= ACCESS_MAY_STORE_SUBDWORD;
6782
6783 aco_ptr<Instruction> store{create_instruction(op, Format::MUBUF, 4, 0)};
6784 store->operands[0] = Operand(rsrc);
6785 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6786 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6787 store->operands[3] = Operand(write_datas[i]);
6788 store->mubuf().offset = offsets[i];
6789 store->mubuf().offen = (offset.type() == RegType::vgpr);
6790 store->mubuf().cache = get_cache_flags(ctx, access);
6791 store->mubuf().disable_wqm = true;
6792 store->mubuf().sync = sync;
6793 ctx->program->needs_exact = true;
6794 ctx->block->instructions.emplace_back(std::move(store));
6795 }
6796 }
6797
6798 void
6799 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6800 {
6801 Builder bld(ctx->program, ctx->block);
6802 bool return_previous = !nir_def_is_unused(&instr->def);
6803 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6804
6805 const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6806 const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6807
6808 aco_opcode op32, op64, image_op;
6809 translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6810
6811 if (cmpswap)
6812 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6813 get_ssa_temp(ctx, instr->src[3].ssa), data);
6814
6815 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6816 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6817 Temp dst = get_ssa_temp(ctx, &instr->def);
6818
6819 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6820 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6821 mubuf->operands[0] = Operand(rsrc);
6822 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6823 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6824 mubuf->operands[3] = Operand(data);
6825 Definition def =
6826 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6827 if (return_previous)
6828 mubuf->definitions[0] = def;
6829 mubuf->mubuf().offset = 0;
6830 mubuf->mubuf().offen = (offset.type() == RegType::vgpr);
6831 mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6832 mubuf->mubuf().disable_wqm = true;
6833 mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6834 ctx->program->needs_exact = true;
6835 ctx->block->instructions.emplace_back(std::move(mubuf));
6836 if (return_previous && cmpswap)
6837 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6838 }
6839
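/* Splits a global load/store intrinsic into base address, constant offset
 * (nir_intrinsic_base) and an optional variable offset taken from the last
 * source (omitted when that source is a constant zero).
 */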
6840 void
6841 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6842 Temp* offset)
6843 {
6844 bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6845 *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6846
6847 *const_offset = nir_intrinsic_base(intrin);
6848
6849 unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6850 nir_src offset_src = intrin->src[num_src - 1];
6851 if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6852 *offset = get_ssa_temp(ctx, offset_src.ssa);
6853 else
6854 *offset = Temp();
6855 }
6856
6857 void
6858 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6859 {
6860 Builder bld(ctx->program, ctx->block);
6861 unsigned num_components = instr->num_components;
6862 unsigned component_size = instr->def.bit_size / 8;
6863
6864 Temp addr, offset;
6865 uint32_t const_offset;
6866 parse_global(ctx, instr, &addr, &const_offset, &offset);
6867
6868 LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6869 component_size};
6870 if (offset.id()) {
6871 info.resource = addr;
6872 info.offset = Operand(offset);
6873 }
6874 info.const_offset = const_offset;
6875 info.align_mul = nir_intrinsic_align_mul(instr);
6876 info.align_offset = nir_intrinsic_align_offset(instr);
6877 info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6878
6879 /* Don't expand global loads when they use MUBUF or SMEM.
6880 * Global loads don't have the bounds checking that buffer loads have, which is
6881 * what would make such expansion safe.
6882 */
6883 unsigned align = nir_intrinsic_align(instr);
6884 bool byte_align_for_smem_mubuf =
6885 can_use_byte_align_for_global_load(num_components, component_size, align, false);
6886
6887 unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD;
6888 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6889
6890 /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6891 * it's safe to use SMEM */
6892 bool can_use_smem = (access & ACCESS_NON_WRITEABLE) && byte_align_for_smem_mubuf;
6893 if (info.dst.type() == RegType::vgpr || (ctx->options->gfx_level < GFX8 && glc) ||
6894 !can_use_smem) {
6895 EmitLoadParameters params = global_load_params;
6896 params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf;
6897 info.cache = get_cache_flags(ctx, access);
6898 emit_load(ctx, bld, info, params);
6899 } else {
6900 if (info.resource.id())
6901 info.resource = bld.as_uniform(info.resource);
6902 info.offset = Operand(bld.as_uniform(info.offset));
6903 info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_SMEM);
6904 emit_load(ctx, bld, info, smem_load_params);
6905 }
6906 }
6907
6908 void
6909 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6910 {
6911 Builder bld(ctx->program, ctx->block);
6912 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6913 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6914
6915 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6916 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6917
6918 unsigned write_count = 0;
6919 Temp write_datas[32];
6920 unsigned offsets[32];
6921 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6922 write_datas, offsets);
6923
6924 Temp addr, offset;
6925 uint32_t const_offset;
6926 parse_global(ctx, instr, &addr, &const_offset, &offset);
6927
6928 for (unsigned i = 0; i < write_count; i++) {
6929 Temp write_address = addr;
6930 uint32_t write_const_offset = const_offset;
6931 Temp write_offset = offset;
6932 lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6933
6934 unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE;
6935 if (write_datas[i].bytes() < 4)
6936 access |= ACCESS_MAY_STORE_SUBDWORD;
6937
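   /* GFX9+ uses GLOBAL instructions (optional SGPR base and immediate offset), GFX7-8 use
    * FLAT without an immediate offset, and GFX6 has neither and falls back to MUBUF with addr64. */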
6938 if (ctx->options->gfx_level >= GFX7) {
6939 bool global = ctx->options->gfx_level >= GFX9;
6940 aco_opcode op;
6941 switch (write_datas[i].bytes()) {
6942 case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6943 case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6944 case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6945 case 8:
6946 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6947 break;
6948 case 12:
6949 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6950 break;
6951 case 16:
6952 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6953 break;
6954 default: unreachable("store_global not implemented for this size.");
6955 }
6956
6957 aco_ptr<Instruction> flat{
6958 create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
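      /* GLOBAL with an SGPR address pair uses the SADDR form: the base lives in SGPRs and the
       * per-lane offset in a VGPR. Otherwise the full 64-bit address is in a VGPR pair. */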
6959 if (write_address.regClass() == s2) {
6960 assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6961 flat->operands[0] = Operand(write_offset);
6962 flat->operands[1] = Operand(write_address);
6963 } else {
6964 assert(write_address.type() == RegType::vgpr && !write_offset.id());
6965 flat->operands[0] = Operand(write_address);
6966 flat->operands[1] = Operand(s1);
6967 }
6968 flat->operands[2] = Operand(write_datas[i]);
6969 flat->flatlike().cache = get_cache_flags(ctx, access);
6970 assert(global || !write_const_offset);
6971 flat->flatlike().offset = write_const_offset;
6972 flat->flatlike().disable_wqm = true;
6973 flat->flatlike().sync = sync;
6974 ctx->program->needs_exact = true;
6975 ctx->block->instructions.emplace_back(std::move(flat));
6976 } else {
6977 assert(ctx->options->gfx_level == GFX6);
6978
6979 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6980
6981 Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6982
6983 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, 0)};
6984 mubuf->operands[0] = Operand(rsrc);
6985 mubuf->operands[1] =
6986 write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6987 mubuf->operands[2] = Operand(write_offset);
6988 mubuf->operands[3] = Operand(write_datas[i]);
6989 mubuf->mubuf().cache = get_cache_flags(ctx, access);
6990 mubuf->mubuf().offset = write_const_offset;
6991 mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr;
6992 mubuf->mubuf().disable_wqm = true;
6993 mubuf->mubuf().sync = sync;
6994 ctx->program->needs_exact = true;
6995 ctx->block->instructions.emplace_back(std::move(mubuf));
6996 }
6997 }
6998 }
6999
7000 void
7001 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7002 {
7003 Builder bld(ctx->program, ctx->block);
7004 bool return_previous = !nir_def_is_unused(&instr->def);
7005 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7006
7007 const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
7008 const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
7009
7010 if (cmpswap)
7011 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
7012 get_ssa_temp(ctx, instr->src[2].ssa), data);
7013
7014 Temp dst = get_ssa_temp(ctx, &instr->def);
7015
7016 aco_opcode op32, op64;
7017
7018 Temp addr, offset;
7019 uint32_t const_offset;
7020 parse_global(ctx, instr, &addr, &const_offset, &offset);
7021 lower_global_address(bld, 0, &addr, &const_offset, &offset);
7022
7023 if (ctx->options->gfx_level >= GFX7) {
7024 bool global = ctx->options->gfx_level >= GFX9;
7025 switch (nir_op) {
7026 case nir_atomic_op_iadd:
7027 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
7028 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
7029 break;
7030 case nir_atomic_op_imin:
7031 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
7032 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
7033 break;
7034 case nir_atomic_op_umin:
7035 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
7036 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
7037 break;
7038 case nir_atomic_op_imax:
7039 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
7040 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
7041 break;
7042 case nir_atomic_op_umax:
7043 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
7044 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
7045 break;
7046 case nir_atomic_op_iand:
7047 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
7048 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
7049 break;
7050 case nir_atomic_op_ior:
7051 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
7052 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
7053 break;
7054 case nir_atomic_op_ixor:
7055 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
7056 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
7057 break;
7058 case nir_atomic_op_xchg:
7059 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
7060 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
7061 break;
7062 case nir_atomic_op_cmpxchg:
7063 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
7064 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
7065 break;
7066 case nir_atomic_op_fadd:
7067 op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
7068 op64 = aco_opcode::num_opcodes;
7069 break;
7070 case nir_atomic_op_fmin:
7071 op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
7072 op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
7073 break;
7074 case nir_atomic_op_fmax:
7075 op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
7076 op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
7077 break;
7078 case nir_atomic_op_ordered_add_gfx12_amd:
7079 assert(ctx->options->gfx_level >= GFX12 && instr->def.bit_size == 64);
7080 op32 = aco_opcode::num_opcodes;
7081 op64 = aco_opcode::global_atomic_ordered_add_b64;
7082 break;
7083 default: unreachable("unsupported atomic operation");
7084 }
7085
7086 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
7087 aco_ptr<Instruction> flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3,
7088 return_previous ? 1 : 0)};
7089 if (addr.regClass() == s2) {
7090 assert(global && offset.id() && offset.type() == RegType::vgpr);
7091 flat->operands[0] = Operand(offset);
7092 flat->operands[1] = Operand(addr);
7093 } else {
7094 assert(addr.type() == RegType::vgpr && !offset.id());
7095 flat->operands[0] = Operand(addr);
7096 flat->operands[1] = Operand(s1);
7097 }
7098 flat->operands[2] = Operand(data);
7099 if (return_previous)
7100 flat->definitions[0] = Definition(dst);
7101 flat->flatlike().cache = get_atomic_cache_flags(ctx, return_previous);
7102 assert(global || !const_offset);
7103 flat->flatlike().offset = const_offset;
7104 flat->flatlike().disable_wqm = true;
7105 flat->flatlike().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
7106 ctx->program->needs_exact = true;
7107 ctx->block->instructions.emplace_back(std::move(flat));
7108 } else {
7109 assert(ctx->options->gfx_level == GFX6);
7110
7111 UNUSED aco_opcode image_op;
7112 translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
7113
7114 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
7115
7116 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
7117
7118 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
7119 mubuf->operands[0] = Operand(rsrc);
7120 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
7121 mubuf->operands[2] = Operand(offset);
7122 mubuf->operands[3] = Operand(data);
7123 Definition def =
7124 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
7125 if (return_previous)
7126 mubuf->definitions[0] = def;
7127 mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
7128 mubuf->mubuf().offset = const_offset;
7129 mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
7130 mubuf->mubuf().disable_wqm = true;
7131 mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
7132 ctx->program->needs_exact = true;
7133 ctx->block->instructions.emplace_back(std::move(mubuf));
7134 if (return_previous && cmpswap)
7135 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
7136 }
7137 }
7138
7139 unsigned
7140 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
7141 {
7142 unsigned storage = storage_none;
7143
7144 if (mem_mode & nir_var_shader_out)
7145 storage |= storage_vmem_output;
7146 if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
7147 storage |= storage_buffer;
7148 if (mem_mode & nir_var_mem_task_payload)
7149 storage |= storage_task_payload;
7150 if (mem_mode & nir_var_mem_shared)
7151 storage |= storage_shared;
7152 if (mem_mode & nir_var_image)
7153 storage |= storage_image;
7154
7155 return storage;
7156 }
7157
7158 void
7159 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7160 {
7161 Builder bld(ctx->program, ctx->block);
7162
7163 /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7164 bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7165 bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7166 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
7167 bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
7168 bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7169
7170 Temp dst = get_ssa_temp(ctx, &intrin->def);
7171 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7172 Temp v_offset =
7173 v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7174 Temp s_offset =
7175 s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7176 Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
7177
7178 ac_hw_cache_flags cache = get_cache_flags(ctx, nir_intrinsic_access(intrin) | ACCESS_TYPE_LOAD);
7179
7180 unsigned const_offset = nir_intrinsic_base(intrin);
7181 unsigned elem_size_bytes = intrin->def.bit_size / 8u;
7182 unsigned num_components = intrin->def.num_components;
7183
7184 nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7185 memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
7186
7187 LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
7188 info.idx = idx;
7189 info.cache = cache;
7190 info.soffset = s_offset;
7191 info.const_offset = const_offset;
7192 info.sync = sync;
7193
7194 if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
7195 const pipe_format format = nir_intrinsic_format(intrin);
7196 const struct ac_vtx_format_info* vtx_info =
7197 ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
7198 const struct util_format_description* f = util_format_description(format);
7199 const unsigned align_mul = nir_intrinsic_align_mul(intrin);
7200 const unsigned align_offset = nir_intrinsic_align_offset(intrin);
7201
7202 /* Avoid splitting:
7203 * - non-array formats because that would result in incorrect code
7204 * - when element size is same as component size (to reduce instruction count)
7205 */
7206 const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
7207
7208 info.align_mul = align_mul;
7209 info.align_offset = align_offset;
7210 info.format = format;
7211 info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
7212 info.split_by_component_stride = false;
7213
7214 emit_load(ctx, bld, info, mtbuf_load_params);
7215 } else {
7216 assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
7217
7218 if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
7219 assert(!swizzled);
7220
7221 emit_load(ctx, bld, info, mubuf_load_format_params);
7222 } else {
7223 const unsigned swizzle_element_size =
7224 swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
7225
7226 info.component_stride = swizzle_element_size;
7227 info.swizzle_component_size = swizzle_element_size ? 4 : 0;
7228 info.align_mul = MIN2(elem_size_bytes, 4);
7229 info.align_offset = 0;
7230
7231 emit_load(ctx, bld, info, mubuf_load_params);
7232 }
7233 }
7234 }
7235
7236 void
7237 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7238 {
7239 Builder bld(ctx->program, ctx->block);
7240
7241 /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7242 bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7243 bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7244 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
7245 bool offen = !nir_src_is_const(intrin->src[2]) || nir_src_as_uint(intrin->src[2]);
7246
7247 Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7248 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
7249 Temp v_offset = offen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa)) : Temp();
7250 Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
7251 Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
7252
7253 unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7254 assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
7255 elem_size_bytes == 8);
7256
7257 unsigned write_mask = nir_intrinsic_write_mask(intrin);
7258 write_mask = util_widen_mask(write_mask, elem_size_bytes);
7259
7260 nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7261 /* GS outputs are only written once. */
7262 const bool written_once =
7263 mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
7264 memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
7265 written_once ? semantic_can_reorder : semantic_none);
7266
7267 unsigned write_count = 0;
7268 Temp write_datas[32];
7269 unsigned offsets[32];
7270 split_buffer_store(ctx, NULL, false, RegType::vgpr, store_src, write_mask,
7271 swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
7272 write_datas, offsets);
7273
7274 for (unsigned i = 0; i < write_count; i++) {
7275 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7276 Temp write_voffset = v_offset;
7277 unsigned const_offset = resolve_excess_vmem_const_offset(
7278 bld, write_voffset, offsets[i] + nir_intrinsic_base(intrin));
7279
7280 Operand vaddr_op(v1);
7281 if (offen && idxen)
7282 vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, write_voffset);
7283 else if (offen)
7284 vaddr_op = Operand(write_voffset);
7285 else if (idxen)
7286 vaddr_op = Operand(idx);
7287
7288 unsigned access = nir_intrinsic_access(intrin);
7289 if (write_datas[i].bytes() < 4)
7290 access |= ACCESS_MAY_STORE_SUBDWORD;
7291 ac_hw_cache_flags cache = get_cache_flags(ctx, access | ACCESS_TYPE_STORE);
7292
7293 Instruction* mubuf = bld.mubuf(op, Operand(descriptor), vaddr_op, s_offset,
7294 Operand(write_datas[i]), const_offset, offen, idxen,
7295 /* addr64 */ false, /* disable_wqm */ false, cache)
7296 .instr;
7297 mubuf->mubuf().sync = sync;
7298 }
7299 }
7300
7301 void
7302 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7303 {
7304 Builder bld(ctx->program, ctx->block);
7305 Temp dst = get_ssa_temp(ctx, &instr->def);
7306 Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7307 Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7308
7309 /* If the base address is 32-bit, extend it to 64-bit using the known high 32 bits. */
7310 if (base.bytes() == 4) {
7311 base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
7312 Operand::c32(ctx->options->address32_hi));
7313 }
7314
7315 aco_opcode opcode = aco_opcode::s_load_dword;
7316 unsigned size = 1;
7317
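   /* SMEM only loads power-of-two dword counts, so round up and extract the low part
    * when the destination is smaller. */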
7318 assert(dst.bytes() <= 64);
7319
7320 if (dst.bytes() > 32) {
7321 opcode = aco_opcode::s_load_dwordx16;
7322 size = 16;
7323 } else if (dst.bytes() > 16) {
7324 opcode = aco_opcode::s_load_dwordx8;
7325 size = 8;
7326 } else if (dst.bytes() > 8) {
7327 opcode = aco_opcode::s_load_dwordx4;
7328 size = 4;
7329 } else if (dst.bytes() > 4) {
7330 opcode = aco_opcode::s_load_dwordx2;
7331 size = 2;
7332 }
7333
7334 if (dst.size() != size) {
7335 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
7336 bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
7337 } else {
7338 bld.smem(opcode, Definition(dst), base, offset);
7339 }
7340 emit_split_vector(ctx, dst, instr->def.num_components);
7341 }
7342
7343 sync_scope
7344 translate_nir_scope(mesa_scope scope)
7345 {
7346 switch (scope) {
7347 case SCOPE_NONE:
7348 case SCOPE_INVOCATION: return scope_invocation;
7349 case SCOPE_SUBGROUP: return scope_subgroup;
7350 case SCOPE_WORKGROUP: return scope_workgroup;
7351 case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7352 case SCOPE_DEVICE: return scope_device;
7353 case SCOPE_SHADER_CALL: return scope_invocation;
7354 }
7355 unreachable("invalid scope");
7356 }
7357
7358 void
7359 emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7360 {
7361 Builder bld(ctx->program, ctx->block);
7362
7363 unsigned storage_allowed = storage_buffer | storage_image;
7364 unsigned semantics = 0;
7365 sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7366 sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7367
7368 /* We use shared storage for the following:
7369 * - compute shaders expose it in their API
7370 * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7371 * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7372 * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7373 */
7374 bool shared_storage_used =
7375 ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
7376 ctx->stage.hw == AC_HW_HULL_SHADER ||
7377 (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
7378 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7379
7380 if (shared_storage_used)
7381 storage_allowed |= storage_shared;
7382
7383 /* Task payload: Task Shader output, Mesh Shader input */
7384 if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7385 storage_allowed |= storage_task_payload;
7386
7387 /* Allow VMEM output for all stages that can have outputs. */
7388 if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
7389 ctx->stage.has(SWStage::TS))
7390 storage_allowed |= storage_vmem_output;
7391
7392 /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7393 * They are allowed in CS, TCS, and in any NGG shader.
7394 */
7395 ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
7396 ctx->stage.hw == AC_HW_HULL_SHADER ||
7397 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7398
7399 unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7400 unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
7401 storage &= storage_allowed;
7402
7403 unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
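   /* Conservatively treat both acquire and release as a full acquire-release. */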
7404 if (nir_semantics & NIR_MEMORY_ACQUIRE)
7405 semantics |= semantic_acquire | semantic_release;
7406 if (nir_semantics & NIR_MEMORY_RELEASE)
7407 semantics |= semantic_acquire | semantic_release;
7408
7409 assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7410 assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7411
7412 bld.barrier(aco_opcode::p_barrier,
7413 memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7414 exec_scope);
7415 }
7416
7417 void
7418 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7419 {
7420 // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
7421 Temp dst = get_ssa_temp(ctx, &instr->def);
7422 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7423 Builder bld(ctx->program, ctx->block);
7424
7425 unsigned elem_size_bytes = instr->def.bit_size / 8;
7426 unsigned num_components = instr->def.num_components;
7427 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7428 load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7429 }
7430
7431 void
7432 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7433 {
7434 unsigned writemask = nir_intrinsic_write_mask(instr);
7435 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7436 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7437 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7438
7439 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7440 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7441 }
7442
7443 void
7444 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7445 {
7446 unsigned offset = nir_intrinsic_base(instr);
7447 Builder bld(ctx->program, ctx->block);
7448 Operand m = load_lds_size_m0(bld);
7449 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7450 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7451
7452 unsigned num_operands = 3;
7453 aco_opcode op32, op64, op32_rtn, op64_rtn;
7454 switch (nir_intrinsic_atomic_op(instr)) {
7455 case nir_atomic_op_iadd:
7456 op32 = aco_opcode::ds_add_u32;
7457 op64 = aco_opcode::ds_add_u64;
7458 op32_rtn = aco_opcode::ds_add_rtn_u32;
7459 op64_rtn = aco_opcode::ds_add_rtn_u64;
7460 break;
7461 case nir_atomic_op_imin:
7462 op32 = aco_opcode::ds_min_i32;
7463 op64 = aco_opcode::ds_min_i64;
7464 op32_rtn = aco_opcode::ds_min_rtn_i32;
7465 op64_rtn = aco_opcode::ds_min_rtn_i64;
7466 break;
7467 case nir_atomic_op_umin:
7468 op32 = aco_opcode::ds_min_u32;
7469 op64 = aco_opcode::ds_min_u64;
7470 op32_rtn = aco_opcode::ds_min_rtn_u32;
7471 op64_rtn = aco_opcode::ds_min_rtn_u64;
7472 break;
7473 case nir_atomic_op_imax:
7474 op32 = aco_opcode::ds_max_i32;
7475 op64 = aco_opcode::ds_max_i64;
7476 op32_rtn = aco_opcode::ds_max_rtn_i32;
7477 op64_rtn = aco_opcode::ds_max_rtn_i64;
7478 break;
7479 case nir_atomic_op_umax:
7480 op32 = aco_opcode::ds_max_u32;
7481 op64 = aco_opcode::ds_max_u64;
7482 op32_rtn = aco_opcode::ds_max_rtn_u32;
7483 op64_rtn = aco_opcode::ds_max_rtn_u64;
7484 break;
7485 case nir_atomic_op_iand:
7486 op32 = aco_opcode::ds_and_b32;
7487 op64 = aco_opcode::ds_and_b64;
7488 op32_rtn = aco_opcode::ds_and_rtn_b32;
7489 op64_rtn = aco_opcode::ds_and_rtn_b64;
7490 break;
7491 case nir_atomic_op_ior:
7492 op32 = aco_opcode::ds_or_b32;
7493 op64 = aco_opcode::ds_or_b64;
7494 op32_rtn = aco_opcode::ds_or_rtn_b32;
7495 op64_rtn = aco_opcode::ds_or_rtn_b64;
7496 break;
7497 case nir_atomic_op_ixor:
7498 op32 = aco_opcode::ds_xor_b32;
7499 op64 = aco_opcode::ds_xor_b64;
7500 op32_rtn = aco_opcode::ds_xor_rtn_b32;
7501 op64_rtn = aco_opcode::ds_xor_rtn_b64;
7502 break;
7503 case nir_atomic_op_xchg:
7504 op32 = aco_opcode::ds_write_b32;
7505 op64 = aco_opcode::ds_write_b64;
7506 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7507 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7508 break;
7509 case nir_atomic_op_cmpxchg:
7510 op32 = aco_opcode::ds_cmpst_b32;
7511 op64 = aco_opcode::ds_cmpst_b64;
7512 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7513 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7514 num_operands = 4;
7515 break;
7516 case nir_atomic_op_fadd:
7517 op32 = aco_opcode::ds_add_f32;
7518 op32_rtn = aco_opcode::ds_add_rtn_f32;
7519 op64 = aco_opcode::num_opcodes;
7520 op64_rtn = aco_opcode::num_opcodes;
7521 break;
7522 case nir_atomic_op_fmin:
7523 op32 = aco_opcode::ds_min_f32;
7524 op32_rtn = aco_opcode::ds_min_rtn_f32;
7525 op64 = aco_opcode::ds_min_f64;
7526 op64_rtn = aco_opcode::ds_min_rtn_f64;
7527 break;
7528 case nir_atomic_op_fmax:
7529 op32 = aco_opcode::ds_max_f32;
7530 op32_rtn = aco_opcode::ds_max_rtn_f32;
7531 op64 = aco_opcode::ds_max_f64;
7532 op64_rtn = aco_opcode::ds_max_rtn_f64;
7533 break;
7534 default: unreachable("Unhandled shared atomic intrinsic");
7535 }
7536
7537 bool return_previous = !nir_def_is_unused(&instr->def);
7538
7539 aco_opcode op;
7540 if (data.size() == 1) {
7541 assert(instr->def.bit_size == 32);
7542 op = return_previous ? op32_rtn : op32;
7543 } else {
7544 assert(instr->def.bit_size == 64);
7545 op = return_previous ? op64_rtn : op64;
7546 }
7547
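   /* The DS offset field is only 16 bits; fold larger offsets into the address. */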
7548 if (offset > 65535) {
7549 address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7550 offset = 0;
7551 }
7552
7553 aco_ptr<Instruction> ds;
7554 ds.reset(create_instruction(op, Format::DS, num_operands, return_previous ? 1 : 0));
7555 ds->operands[0] = Operand(address);
7556 ds->operands[1] = Operand(data);
7557 if (num_operands == 4) {
7558 Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7559 ds->operands[2] = Operand(data2);
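      /* GFX11 expects the data and compare operands of the LDS cmpswap in the opposite order. */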
7560 if (bld.program->gfx_level >= GFX11)
7561 std::swap(ds->operands[1], ds->operands[2]);
7562 }
7563 ds->operands[num_operands - 1] = m;
7564 ds->ds().offset0 = offset;
7565 if (return_previous)
7566 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
7567 ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7568
7569 if (m.isUndefined())
7570 ds->operands.pop_back();
7571
7572 ctx->block->instructions.emplace_back(std::move(ds));
7573 }
7574
7575 void
7576 visit_shared_append(isel_context* ctx, nir_intrinsic_instr* instr)
7577 {
7578 Builder bld(ctx->program, ctx->block);
7579 unsigned address = nir_intrinsic_base(instr);
7580 assert(address <= 65535 && (address % 4 == 0));
7581
7582 aco_opcode op;
7583 switch (instr->intrinsic) {
7584 case nir_intrinsic_shared_append_amd: op = aco_opcode::ds_append; break;
7585 case nir_intrinsic_shared_consume_amd: op = aco_opcode::ds_consume; break;
7586 default: unreachable("not shared_append/consume");
7587 }
7588
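   /* ds_append/ds_consume atomically adjust the counter at the given LDS address by the
    * number of active lanes and return a wave-uniform result, hence the p_as_uniform copy. */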
7589 Temp tmp = bld.tmp(v1);
7590 Instruction* ds;
7591 Operand m = load_lds_size_m0(bld);
7592 if (m.isUndefined())
7593 ds = bld.ds(op, Definition(tmp), address);
7594 else
7595 ds = bld.ds(op, Definition(tmp), m, address);
7596 ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7597
7598 bld.pseudo(aco_opcode::p_as_uniform, Definition(get_ssa_temp(ctx, &instr->def)), tmp);
7599 }
7600
7601 void
7602 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7603 {
7604 bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
7605 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7606 Builder bld(ctx->program, ctx->block);
7607
7608 assert(bld.program->gfx_level >= GFX7);
7609
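   /* load/store_shared2_amd map to ds_read2/ds_write2 (the st64 variants scale the two 8-bit
    * offsets by 64), accessing two 32- or 64-bit values at independent offsets. */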
7610 bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
7611 uint8_t offset0 = nir_intrinsic_offset0(instr);
7612 uint8_t offset1 = nir_intrinsic_offset1(instr);
7613 bool st64 = nir_intrinsic_st64(instr);
7614
7615 Operand m = load_lds_size_m0(bld);
7616 Instruction* ds;
7617 if (is_store) {
7618 aco_opcode op = st64
7619 ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
7620 : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
7621 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7622 RegClass comp_rc = is64bit ? v2 : v1;
7623 Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7624 Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7625 ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
7626 } else {
7627 Temp dst = get_ssa_temp(ctx, &instr->def);
7628 Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
7629 aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
7630 : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
7631 ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
7632 }
7633 ds->ds().sync = memory_sync_info(storage_shared);
7634 if (m.isUndefined())
7635 ds->operands.pop_back();
7636
7637 if (!is_store) {
7638 Temp dst = get_ssa_temp(ctx, &instr->def);
7639 if (dst.type() == RegType::sgpr) {
7640 emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7641 Temp comp[4];
7642 /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
7643 for (unsigned i = 0; i < dst.size(); i++)
7644 comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7645 if (is64bit) {
7646 Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
7647 Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
7648 ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7649 ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7650 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
7651 ctx->allocated_vec[dst.id()] = {comp0, comp1};
7652 } else {
7653 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
7654 }
7655 }
7656
7657 emit_split_vector(ctx, dst, 2);
7658 }
7659 }
7660
7661 Temp
7662 get_scratch_resource(isel_context* ctx)
7663 {
7664 Builder bld(ctx->program, ctx->block);
7665 Temp scratch_addr = ctx->program->private_segment_buffer;
7666 if (!scratch_addr.bytes()) {
7667 Temp addr_lo =
7668 bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
7669 Temp addr_hi =
7670 bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
7671 scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
7672 } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
7673 scratch_addr =
7674 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7675 }
7676
7677 struct ac_buffer_state ac_state = {0};
7678 uint32_t desc[4];
7679
7680 ac_state.size = 0xffffffff;
7681 ac_state.format = PIPE_FORMAT_R32_FLOAT;
7682 for (int i = 0; i < 4; i++)
7683 ac_state.swizzle[i] = PIPE_SWIZZLE_0;
7684 /* Older generations need an element size of 4 bytes; the field was removed in GFX9. */
7685 ac_state.element_size = ctx->program->gfx_level <= GFX8 ? 1u : 0u;
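   /* Swizzle scratch per lane: add_tid makes the hardware add the thread ID and
    * index_stride matches the wave size (64 or 32 lanes). */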
7686 ac_state.index_stride = ctx->program->wave_size == 64 ? 3u : 2u;
7687 ac_state.add_tid = true;
7688 ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
7689
7690 ac_build_buffer_descriptor(ctx->program->gfx_level, &ac_state, desc);
7691
7692 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(desc[2]),
7693 Operand::c32(desc[3]));
7694 }
7695
7696 void
7697 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7698 {
7699 Builder bld(ctx->program, ctx->block);
7700 Temp dst = get_ssa_temp(ctx, &instr->def);
7701
7702 LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
7703 info.align_mul = nir_intrinsic_align_mul(instr);
7704 info.align_offset = nir_intrinsic_align_offset(instr);
7705 info.cache = get_cache_flags(ctx, ACCESS_TYPE_LOAD | ACCESS_IS_SWIZZLED_AMD);
7706 info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7707 info.sync = memory_sync_info(storage_scratch, semantic_private);
7708 if (ctx->program->gfx_level >= GFX9) {
7709 if (nir_src_is_const(instr->src[0])) {
7710 uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7711 info.offset =
7712 bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
7713 info.const_offset = nir_src_as_uint(instr->src[0]) % max;
7714 } else {
7715 info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7716 }
7717 EmitLoadParameters params = scratch_flat_load_params;
7718 params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7719 emit_load(ctx, bld, info, params);
7720 } else {
7721 info.resource = get_scratch_resource(ctx);
7722 info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7723 info.soffset = ctx->program->scratch_offset;
7724 emit_load(ctx, bld, info, scratch_mubuf_load_params);
7725 }
7726 }
7727
7728 void
7729 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7730 {
7731 Builder bld(ctx->program, ctx->block);
7732 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7733 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7734
7735 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7736 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7737
7738 unsigned write_count = 0;
7739 Temp write_datas[32];
7740 unsigned offsets[32];
7741 unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7742 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7743 &write_count, write_datas, offsets);
7744
7745 if (ctx->program->gfx_level >= GFX9) {
7746 uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7747 offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
7748 uint32_t base_const_offset =
7749 nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
7750
7751 for (unsigned i = 0; i < write_count; i++) {
7752 aco_opcode op;
7753 switch (write_datas[i].bytes()) {
7754 case 1: op = aco_opcode::scratch_store_byte; break;
7755 case 2: op = aco_opcode::scratch_store_short; break;
7756 case 4: op = aco_opcode::scratch_store_dword; break;
7757 case 8: op = aco_opcode::scratch_store_dwordx2; break;
7758 case 12: op = aco_opcode::scratch_store_dwordx3; break;
7759 case 16: op = aco_opcode::scratch_store_dwordx4; break;
7760 default: unreachable("Unexpected store size");
7761 }
7762
7763 uint32_t const_offset = base_const_offset + offsets[i];
7764 assert(const_offset < max || offset.id() == 0);
7765
7766 Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
7767 Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
7768 if (offset.id() == 0)
7769 saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
7770
7771 bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
7772 memory_sync_info(storage_scratch, semantic_private));
7773 }
7774 } else {
7775 Temp rsrc = get_scratch_resource(ctx);
7776 offset = as_vgpr(ctx, offset);
7777 for (unsigned i = 0; i < write_count; i++) {
7778 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7779 Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7780 write_datas[i], offsets[i], true);
7781 mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7782 unsigned access = ACCESS_TYPE_STORE | ACCESS_IS_SWIZZLED_AMD |
7783 (write_datas[i].bytes() < 4 ? ACCESS_MAY_STORE_SUBDWORD : 0);
7784 mubuf->mubuf().cache = get_cache_flags(ctx, access);
7785 }
7786 }
7787 }
7788
7789 ReduceOp
7790 get_reduce_op(nir_op op, unsigned bit_size)
7791 {
7792 switch (op) {
7793 #define CASEI(name) \
7794 case nir_op_##name: \
7795 return (bit_size == 32) ? name##32 \
7796 : (bit_size == 16) ? name##16 \
7797 : (bit_size == 8) ? name##8 \
7798 : name##64;
7799 #define CASEF(name) \
7800 case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7801 CASEI(iadd)
7802 CASEI(imul)
7803 CASEI(imin)
7804 CASEI(umin)
7805 CASEI(imax)
7806 CASEI(umax)
7807 CASEI(iand)
7808 CASEI(ior)
7809 CASEI(ixor)
7810 CASEF(fadd)
7811 CASEF(fmul)
7812 CASEF(fmin)
7813 CASEF(fmax)
7814 default: unreachable("unknown reduction op");
7815 #undef CASEI
7816 #undef CASEF
7817 }
7818 }
7819
7820 void
7821 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7822 {
7823 Builder bld(ctx->program, ctx->block);
7824 Definition dst(get_ssa_temp(ctx, &instr->def));
7825 assert(dst.regClass().type() != RegType::vgpr);
7826 if (src.regClass().type() == RegType::vgpr)
7827 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7828 else
7829 bld.copy(dst, src);
7830 }
7831
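/* For a subgroup-uniform source, addition-style reductions become a multiply:
 * iadd -> value * count, ixor -> value * (count & 1), fadd -> value * float(count),
 * where count is the number of participating lanes (or the lane prefix for scans).
 */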
7832 void
7833 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7834 {
7835 Builder bld(ctx->program, ctx->block);
7836 Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7837
7838 if (op == nir_op_fadd) {
7839 src_tmp = as_vgpr(ctx, src_tmp);
7840 Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
7841 : dst.getTemp();
7842
7843 if (src.ssa->bit_size == 16) {
7844 count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7845 bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7846 } else {
7847 assert(src.ssa->bit_size == 32);
7848 count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7849 bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7850 }
7851
7852 if (tmp != dst.getTemp())
7853 bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7854
7855 return;
7856 }
7857
7858 if (dst.regClass() == s1)
7859 src_tmp = bld.as_uniform(src_tmp);
7860
7861 if (op == nir_op_ixor && count.type() == RegType::sgpr)
7862 count =
7863 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7864 else if (op == nir_op_ixor)
7865 count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7866
7867 assert(dst.getTemp().type() == count.type());
7868
7869 if (nir_src_is_const(src)) {
7870 uint32_t imm = nir_src_as_uint(src);
7871 if (imm == 1 && dst.bytes() <= 2)
7872 bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7873 else if (imm == 1)
7874 bld.copy(dst, count);
7875 else if (imm == 0)
7876 bld.copy(dst, Operand::zero(dst.bytes()));
7877 else if (count.type() == RegType::vgpr)
7878 bld.v_mul_imm(dst, count, imm, true, true);
7879 else if (imm == 0xffffffff)
7880 bld.sop2(aco_opcode::s_sub_i32, dst, bld.def(s1, scc), Operand::zero(), count);
7881 else if (util_is_power_of_two_or_zero(imm))
7882 bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), count,
7883 Operand::c32(ffs(imm) - 1u));
7884 else
7885 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7886 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
7887 bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7888 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
7889 bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7890 } else if (dst.getTemp().type() == RegType::vgpr) {
7891 bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7892 } else {
7893 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7894 }
7895 }
7896
7897 bool
7898 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7899 {
7900 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7901 if (op == nir_op_imul || op == nir_op_fmul)
7902 return false;
7903
7904 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7905 Builder bld(ctx->program, ctx->block);
7906 Definition dst(get_ssa_temp(ctx, &instr->def));
7907 unsigned bit_size = instr->src[0].ssa->bit_size;
7908 if (bit_size > 32)
7909 return false;
7910
7911 Temp thread_count =
7912 bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7913 set_wqm(ctx);
7914
7915 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7916 } else {
7917 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7918 }
7919
7920 return true;
7921 }
7922
7923 bool
7924 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7925 {
7926 Builder bld(ctx->program, ctx->block);
7927 Definition dst(get_ssa_temp(ctx, &instr->def));
7928 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7929 bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7930
7931 if (op == nir_op_imul || op == nir_op_fmul)
7932 return false;
7933
7934 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7935 if (instr->src[0].ssa->bit_size > 32)
7936 return false;
7937
7938 Temp packed_tid;
7939 if (inc)
7940 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7941 else
7942 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7943 set_wqm(ctx);
7944
7945 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7946 return true;
7947 }
7948
7949 assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7950 op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7951
7952 if (inc) {
7953 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7954 return true;
7955 }
7956
7957 /* Copy the source and write the reduction operation identity to the first lane. */
7958 Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7959 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7960 ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7961 if (dst.bytes() == 8) {
7962 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7963 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7964 uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7965 uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7966
7967 lo =
7968 bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
7969 hi =
7970 bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
7971 bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7972 } else {
7973 uint32_t identity = get_reduction_identity(reduce_op, 0);
7974 bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
7975 as_vgpr(ctx, src));
7976 }
7977
7978 set_wqm(ctx);
7979 return true;
7980 }
7981
7982 Temp
7983 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7984 Definition dst, Temp src)
7985 {
7986 assert(src.bytes() <= 8);
7987 assert(src.type() == RegType::vgpr);
7988
7989 Builder bld(ctx->program, ctx->block);
7990
7991 unsigned num_defs = 0;
7992 Definition defs[5];
7993 defs[num_defs++] = dst;
7994 defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7995
7996 /* scalar identity temporary */
7997 bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
7998 aco_op != aco_opcode::p_reduce;
7999 if (aco_op == aco_opcode::p_exclusive_scan) {
8000 need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
8001 op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
8002 op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
8003 op == fmul64);
8004 }
8005 if (need_sitmp)
8006 defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
8007
8008 /* scc clobber */
8009 defs[num_defs++] = bld.def(s1, scc);
8010
8011 /* vcc clobber */
8012 bool clobber_vcc = false;
8013 if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
8014 clobber_vcc = true;
8015 if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
8016 clobber_vcc = true;
8017 if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
8018 clobber_vcc = true;
8019
8020 if (clobber_vcc)
8021 defs[num_defs++] = bld.def(bld.lm, vcc);
8022
8023 Instruction* reduce = create_instruction(aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
8024 reduce->operands[0] = Operand(src);
8025 /* setup_reduce_temp will update these undef operands if needed */
8026 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
8027 reduce->operands[2] = Operand(v1.as_linear());
8028 std::copy(defs, defs + num_defs, reduce->definitions.begin());
8029
8030 reduce->reduction().reduce_op = op;
8031 reduce->reduction().cluster_size = cluster_size;
8032 bld.insert(std::move(reduce));
8033
8034 return dst.getTemp();
8035 }
8036
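/* Turn an inclusive scan into an exclusive one by removing each lane's own contribution:
 * subtract the source for additions, xor it out for xors.
 */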
8037 Temp
8038 inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Definition dst, Temp src)
8039 {
8040 Builder bld(ctx->program, ctx->block);
8041
8042 Temp scan = emit_reduction_instr(ctx, aco_opcode::p_inclusive_scan, op, ctx->program->wave_size,
8043 bld.def(dst.regClass()), src);
8044
8045 switch (op) {
8046 case iadd8:
8047 case iadd16:
8048 case iadd32: return bld.vsub32(dst, scan, src);
8049 case ixor64:
8050 case iadd64: {
8051 Temp src00 = bld.tmp(v1);
8052 Temp src01 = bld.tmp(v1);
8053 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
8054 Temp src10 = bld.tmp(v1);
8055 Temp src11 = bld.tmp(v1);
8056 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);
8057
8058 Temp lower = bld.tmp(v1);
8059 Temp upper = bld.tmp(v1);
8060 if (op == iadd64) {
8061 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
8062 bld.vsub32(Definition(upper), src01, src11, false, borrow);
8063 } else {
8064 bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
8065 bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
8066 }
8067 return bld.pseudo(aco_opcode::p_create_vector, dst, lower, upper);
8068 }
8069 case ixor8:
8070 case ixor16:
8071 case ixor32: return bld.vop2(aco_opcode::v_xor_b32, dst, scan, src);
8072 default: unreachable("Unsupported op");
8073 }
8074 }
8075
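/* Rotate lanes within a cluster by a constant using a single lane-shuffle instruction
 * (quad-perm DPP, DPP8, row rotate, ds_swizzle, permlane64 or wavefront DPP) if possible.
 * Returns false when no single-instruction path exists for this cluster size/GPU.
 */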
8076 bool
8077 emit_rotate_by_constant(isel_context* ctx, Temp& dst, Temp src, unsigned cluster_size,
8078 uint64_t delta)
8079 {
8080 Builder bld(ctx->program, ctx->block);
8081 RegClass rc = src.regClass();
8082 dst = Temp(0, rc);
8083 delta %= cluster_size;
8084
8085 if (delta == 0) {
8086 dst = bld.copy(bld.def(rc), src);
8087 } else if (delta * 2 == cluster_size && cluster_size <= 32) {
8088 dst = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, delta), true);
8089 } else if (cluster_size == 4) {
8090 unsigned res[4];
8091 for (unsigned i = 0; i < 4; i++)
8092 res[i] = (i + delta) & 0x3;
8093 uint32_t dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
8094 if (ctx->program->gfx_level >= GFX8)
8095 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_ctrl);
8096 else
8097 dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl);
8098 } else if (cluster_size == 8 && ctx->program->gfx_level >= GFX10) {
8099 uint32_t lane_sel = 0;
8100 for (unsigned i = 0; i < 8; i++)
8101 lane_sel |= ((i + delta) & 0x7) << (i * 3);
8102 dst = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(rc), src, lane_sel);
8103 } else if (cluster_size == 16 && ctx->program->gfx_level >= GFX8) {
8104 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_row_rr(16 - delta));
8105 } else if (cluster_size <= 32 && ctx->program->gfx_level >= GFX9) {
8106 uint32_t ctrl = ds_pattern_rotate(delta, ~(cluster_size - 1) & 0x1f);
8107 dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, ctrl);
8108 } else if (cluster_size == 64) {
8109 bool has_wf_dpp = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX10;
8110 if (delta == 32 && ctx->program->gfx_level >= GFX11) {
8111 dst = bld.vop1(aco_opcode::v_permlane64_b32, bld.def(rc), src);
8112 } else if (delta == 1 && has_wf_dpp) {
8113 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rl1);
8114 } else if (delta == 63 && has_wf_dpp) {
8115 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rr1);
8116 }
8117 }
8118
8119 return dst.id() != 0;
8120 }
8121
8122 void
8123 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
8124 {
8125 Builder bld(ctx->program, ctx->block);
8126 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
8127 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
8128
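   /* Compute per-quad derivatives of the barycentrics: quad lane 0 holds the top-left
    * value, lanes 1 and 2 provide the x and y neighbors. */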
8129 Temp ddx_1, ddx_2, ddy_1, ddy_2;
8130 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
8131 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
8132 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
8133
8134 /* Build DD X/Y */
8135 if (ctx->program->gfx_level >= GFX8) {
8136 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
8137 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
8138 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
8139 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
8140 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
8141 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
8142 } else {
8143 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
8144 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
8145 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
8146 ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
8147 ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1);
8148
8149 Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
8150 ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
8151 ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2);
8152 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
8153 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
8154 }
8155
8156 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
8157 aco_opcode mad =
8158 ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
8159 Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
8160 Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
8161 tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
8162 tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
8163 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp1, tmp2);
8164 set_wqm(ctx, true);
8165 return;
8166 }
8167
8168 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
8169 Temp lanecount_to_mask(isel_context* ctx, Temp count);
8170 void pops_await_overlapped_waves(isel_context* ctx);
8171
8172 Temp
8173 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, enum glsl_interp_mode interp)
8174 {
8175 bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
8176 if (intrin == nir_intrinsic_load_barycentric_pixel ||
8177 intrin == nir_intrinsic_load_barycentric_at_offset) {
8178 return get_arg(ctx, linear ? ctx->args->linear_center : ctx->args->persp_center);
8179 } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
8180 return get_arg(ctx, linear ? ctx->args->linear_centroid : ctx->args->persp_centroid);
8181 } else {
8182 assert(intrin == nir_intrinsic_load_barycentric_sample);
8183 return get_arg(ctx, linear ? ctx->args->linear_sample : ctx->args->persp_sample);
8184 }
8185 }
8186
8187 void
8188 ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
8189 unsigned wave_done, unsigned* offset0, unsigned* offset1)
8190 {
8191 unsigned ordered_count_index = index_operand & 0x3f;
8192 unsigned count_dword = (index_operand >> 24) & 0xf;
8193
8194 assert(ctx->options->gfx_level >= GFX10);
8195 assert(count_dword >= 1 && count_dword <= 4);
8196
8197 *offset0 = ordered_count_index << 2;
8198 *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);
8199
8200 if (ctx->options->gfx_level < GFX11)
8201 *offset1 |= 3 /* GS shader type */ << 2;
8202 }
8203
8204 struct aco_export_mrt {
8205 Operand out[4];
8206 unsigned enabled_channels;
8207 unsigned target;
8208 bool compr;
8209 };
8210
8211 static void
8212 create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
8213 const struct aco_export_mrt* mrt1)
8214 {
8215 Builder bld(ctx->program, ctx->block);
8216
8217 aco_ptr<Instruction> exp{
8218 create_instruction(aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
8219 for (unsigned i = 0; i < 4; i++) {
8220 exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
8221 exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
8222 }
8223
8224 RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
8225 exp->definitions[0] = bld.def(type); /* mrt0 */
8226 exp->definitions[1] = bld.def(type); /* mrt1 */
8227 exp->definitions[2] = bld.def(bld.lm);
8228 exp->definitions[3] = bld.def(bld.lm);
8229 exp->definitions[4] = bld.def(bld.lm, vcc);
8230 exp->definitions[5] = bld.def(s1, scc);
8231 ctx->block->instructions.emplace_back(std::move(exp));
8232
8233 ctx->program->has_color_exports = true;
8234 }
8235
8236 static void
8237 visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
8238 {
8239 aco_opcode opcode = aco_opcode::num_opcodes;
8240 unsigned signed_mask = 0;
8241 bool clamp = false;
8242
8243 switch (instr->src[0].ssa->bit_size) {
8244 case 16:
8245 switch (instr->def.bit_size) {
8246 case 32: opcode = aco_opcode::v_wmma_f32_16x16x16_f16; break;
8247 case 16: opcode = aco_opcode::v_wmma_f16_16x16x16_f16; break;
8248 }
8249 break;
8250 case 8:
8251 opcode = aco_opcode::v_wmma_i32_16x16x16_iu8;
8252 signed_mask = nir_intrinsic_cmat_signed_mask(instr);
8253 clamp = nir_intrinsic_saturate(instr);
8254 break;
8255 }
8256
8257 if (opcode == aco_opcode::num_opcodes)
8258 unreachable("visit_cmat_muladd: invalid bit size combination");
8259
8260 Builder bld(ctx->program, ctx->block);
8261
8262 Temp dst = get_ssa_temp(ctx, &instr->def);
8263 Operand A(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
8264 Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
8265 Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
8266
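/* For the iu8 WMMA opcode, neg_lo[0]/neg_lo[1] carry the signedness of the
 * A/B operands (taken from the NIR cmat_signed_mask) and clamp requests
 * saturation; for the f16 variants both stay at their defaults. */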
8267 VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
8268 vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
8269 vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
8270 vop3p.clamp = clamp;
8271
8272 emit_split_vector(ctx, dst, instr->def.num_components);
8273 }
8274
8275 void
8276 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
8277 {
8278 Builder bld(ctx->program, ctx->block);
8279 switch (instr->intrinsic) {
8280 case nir_intrinsic_load_barycentric_sample:
8281 case nir_intrinsic_load_barycentric_pixel:
8282 case nir_intrinsic_load_barycentric_centroid: {
8283 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
8284 Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
8285 assert(bary.size() == 2);
8286 Temp dst = get_ssa_temp(ctx, &instr->def);
8287 bld.copy(Definition(dst), bary);
8288 emit_split_vector(ctx, dst, 2);
8289 break;
8290 }
8291 case nir_intrinsic_load_barycentric_model: {
8292 Temp model = get_arg(ctx, ctx->args->pull_model);
8293 assert(model.size() == 3);
8294 Temp dst = get_ssa_temp(ctx, &instr->def);
8295 bld.copy(Definition(dst), model);
8296 emit_split_vector(ctx, dst, 3);
8297 break;
8298 }
8299 case nir_intrinsic_load_barycentric_at_offset: {
8300 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8301 RegClass rc = RegClass(offset.type(), 1);
8302 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8303 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8304 Temp bary = get_interp_param(ctx, instr->intrinsic,
8305 (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8306 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->def), bary, pos1, pos2);
8307 break;
8308 }
8309 case nir_intrinsic_load_front_face: {
8310 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8311 Operand::zero(), get_arg(ctx, ctx->args->front_face));
8312 break;
8313 }
8314 case nir_intrinsic_load_view_index: {
8315 Temp dst = get_ssa_temp(ctx, &instr->def);
8316 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->view_index)));
8317 break;
8318 }
8319 case nir_intrinsic_load_frag_coord: {
8320 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->def), 4);
8321 break;
8322 }
8323 case nir_intrinsic_load_frag_shading_rate:
8324 emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->def));
8325 break;
8326 case nir_intrinsic_load_sample_pos: {
8327 Temp posx = get_arg(ctx, ctx->args->frag_pos[0]);
8328 Temp posy = get_arg(ctx, ctx->args->frag_pos[1]);
8329 bld.pseudo(
8330 aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->def)),
8331 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8332 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8333 break;
8334 }
8335 case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8336 case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8337 case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8338 case nir_intrinsic_load_input:
8339 case nir_intrinsic_load_per_primitive_input:
8340 case nir_intrinsic_load_input_vertex:
8341 if (ctx->program->stage == fragment_fs)
8342 visit_load_fs_input(ctx, instr);
8343 else
8344 isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
8345 break;
8346 case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8347 case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8348 case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8349 case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8350 case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8351 case nir_intrinsic_shared_atomic:
8352 case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
8353 case nir_intrinsic_shared_append_amd:
8354 case nir_intrinsic_shared_consume_amd: visit_shared_append(ctx, instr); break;
8355 case nir_intrinsic_load_shared2_amd:
8356 case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
8357 case nir_intrinsic_bindless_image_load:
8358 case nir_intrinsic_bindless_image_fragment_mask_load_amd:
8359 case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
8360 case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
8361 case nir_intrinsic_bindless_image_atomic:
8362 case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
8363 case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8364 case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8365 case nir_intrinsic_load_typed_buffer_amd:
8366 case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8367 case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8368 case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
8369 case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
8370 case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
8371 case nir_intrinsic_global_atomic_amd:
8372 case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
8373 case nir_intrinsic_ssbo_atomic:
8374 case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
8375 case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8376 case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8377 case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
8378 case nir_intrinsic_load_num_workgroups: {
8379 Temp dst = get_ssa_temp(ctx, &instr->def);
8380 if (ctx->options->load_grid_size_from_user_sgpr) {
8381 bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
8382 } else {
8383 Temp addr = get_arg(ctx, ctx->args->num_work_groups);
8384 assert(addr.regClass() == s2);
8385 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8386 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
8387 bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
8388 }
8389 emit_split_vector(ctx, dst, 3);
8390 break;
8391 }
8392 case nir_intrinsic_load_local_invocation_id: {
8393 Temp dst = get_ssa_temp(ctx, &instr->def);
8394 if (ctx->options->gfx_level >= GFX11) {
8395 Temp local_ids[3];
8396
8397 /* Thread IDs are packed in VGPR0, 10 bits per component. */
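/* x = bits [9:0], y = bits [19:10], z = bits [29:20]. When the higher
 * dimensions are statically known to be 1, the extract degenerates into a
 * plain copy (for x) or a single right shift (for the highest used
 * component). */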
8398 for (uint32_t i = 0; i < 3; i++) {
8399 if (i == 0 && ctx->shader->info.workgroup_size[1] == 1 &&
8400 ctx->shader->info.workgroup_size[2] == 1 &&
8401 !ctx->shader->info.workgroup_size_variable) {
8402 local_ids[i] = get_arg(ctx, ctx->args->local_invocation_ids);
8403 } else if (i == 2 || (i == 1 && ctx->shader->info.workgroup_size[2] == 1 &&
8404 !ctx->shader->info.workgroup_size_variable)) {
8405 local_ids[i] =
8406 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(i * 10u),
8407 get_arg(ctx, ctx->args->local_invocation_ids));
8408 } else {
8409 local_ids[i] = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
8410 get_arg(ctx, ctx->args->local_invocation_ids),
8411 Operand::c32(i * 10u), Operand::c32(10u));
8412 }
8413 }
8414
8415 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), local_ids[0], local_ids[1],
8416 local_ids[2]);
8417 } else {
8418 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->local_invocation_ids)));
8419 }
8420 emit_split_vector(ctx, dst, 3);
8421 break;
8422 }
8423 case nir_intrinsic_load_workgroup_id: {
8424 Temp dst = get_ssa_temp(ctx, &instr->def);
8425 if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
8426 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), ctx->workgroup_id[0],
8427 ctx->workgroup_id[1], ctx->workgroup_id[2]);
8428 emit_split_vector(ctx, dst, 3);
8429 } else {
8430 isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
8431 }
8432 break;
8433 }
8434 case nir_intrinsic_load_subgroup_id: {
8435 assert(ctx->options->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER);
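/* s_bfe_u32 takes offset | (width << 16) as src1, so this extracts the 5-bit
 * subgroup id from bits [29:25] of ttmp8, which is where GFX12 exposes the
 * wave's index within the workgroup. */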
8436 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8437 ctx->ttmp8, Operand::c32(25 | (5 << 16)));
8438 break;
8439 }
8440 case nir_intrinsic_load_local_invocation_index: {
8441 if (ctx->stage.hw == AC_HW_LOCAL_SHADER || ctx->stage.hw == AC_HW_HULL_SHADER) {
8442 if (ctx->options->gfx_level >= GFX11) {
8443 /* On GFX11, RelAutoIndex is WaveID * WaveSize + ThreadID. */
8444 Temp wave_id =
8445 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8446 get_arg(ctx, ctx->args->tcs_wave_id), Operand::c32(0u | (3u << 16)));
8447
8448 Temp temp = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), wave_id,
8449 Operand::c32(ctx->program->wave_size));
8450 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def), Operand(), Operand(temp));
8451 } else {
8452 bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8453 get_arg(ctx, ctx->args->vs_rel_patch_id));
8454 }
8455 break;
8456 } else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER ||
8457 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
8458 bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), thread_id_in_threadgroup(ctx));
8459 break;
8460 } else if (ctx->program->workgroup_size <= ctx->program->wave_size) {
8461 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8462 break;
8463 }
8464
8465 Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8466
8467 if (ctx->options->gfx_level >= GFX12) {
8468 Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx->ttmp8,
8469 Operand::c32(25 | (5 << 16)));
8470 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
8471 Operand::c32(ctx->program->wave_size == 64 ? 6 : 5), id);
8472 break;
8473 }
8474
8475 /* The tg_size bits [6:11] contain the subgroup id,
8476 * we need this multiplied by the wave size, and then OR the thread id to it.
8477 */
8478 if (ctx->program->wave_size == 64) {
8479 /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just
8480 * feed that to v_or */
8481 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8482 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->tg_size));
8483 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num, id);
8484 } else {
8485 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8486 Temp tg_num =
8487 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8488 get_arg(ctx, ctx->args->tg_size), Operand::c32(0x6u | (0x6u << 16)));
8489 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
8490 Operand::c32(0x5u), id);
8491 }
8492 break;
8493 }
8494 case nir_intrinsic_ddx:
8495 case nir_intrinsic_ddy:
8496 case nir_intrinsic_ddx_fine:
8497 case nir_intrinsic_ddy_fine:
8498 case nir_intrinsic_ddx_coarse:
8499 case nir_intrinsic_ddy_coarse: {
8500 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8501 Temp dst = get_ssa_temp(ctx, &instr->def);
8502
8503 bool only_used_by_abs = true;
8504 nir_foreach_use (use, &instr->def) {
8505 nir_instr* use_instr = nir_src_parent_instr(use);
8506
8507 if (use_instr->type != nir_instr_type_alu ||
8508 nir_instr_as_alu(use_instr)->op != nir_op_fabs)
8509 only_used_by_abs = false;
8510 }
8511
8512 uint16_t dpp_ctrl1, dpp_ctrl2;
8513 if (instr->intrinsic == nir_intrinsic_ddx_fine) {
8514 if (only_used_by_abs) {
8515 dpp_ctrl1 = dpp_quad_perm(1, 0, 3, 2);
8516 dpp_ctrl2 = dpp_quad_perm(0, 1, 2, 3);
8517 } else {
8518 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
8519 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
8520 }
8521 } else if (instr->intrinsic == nir_intrinsic_ddy_fine) {
8522 if (only_used_by_abs) {
8523 dpp_ctrl1 = dpp_quad_perm(2, 3, 0, 1);
8524 dpp_ctrl2 = dpp_quad_perm(0, 1, 2, 3);
8525 } else {
8526 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
8527 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
8528 }
8529 } else {
8530 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
8531 if (instr->intrinsic == nir_intrinsic_ddx ||
8532 instr->intrinsic == nir_intrinsic_ddx_coarse)
8533 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
8534 else
8535 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
8536 }
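/* These controls turn the derivative into a quad-local difference: dpp_ctrl1
 * selects the lane that gets subtracted and dpp_ctrl2 selects which lane's
 * difference each lane reads, e.g. coarse ddx = lane1 - lane0 replicated
 * across the quad. When every use is wrapped in fabs, a plain neighbour swap
 * already yields the derivative up to sign, so dpp_ctrl2 can stay the
 * identity and the second permute is skipped below. */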
8537
8538 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
8539 assert(instr->def.num_components == 2);
8540
8541 /* identity swizzle to opsel */
8542 unsigned opsel_lo = 0b00;
8543 unsigned opsel_hi = 0b11;
8544
8545 Temp tl = src;
8546 if (nir_src_is_divergent(instr->src[0]))
8547 tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
8548
8549 Builder::Result sub =
8550 bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), src, tl, opsel_lo, opsel_hi);
8551 sub->valu().neg_lo[1] = true;
8552 sub->valu().neg_hi[1] = true;
8553
8554 if (nir_src_is_divergent(instr->src[0]) && dpp_ctrl2 != dpp_quad_perm(0, 1, 2, 3))
8555 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), sub, dpp_ctrl2);
8556 else
8557 bld.copy(Definition(dst), sub);
8558 emit_split_vector(ctx, dst, 2);
8559 } else {
8560 aco_opcode subrev =
8561 instr->def.bit_size == 16 ? aco_opcode::v_subrev_f16 : aco_opcode::v_subrev_f32;
8562 bool use_interp = dpp_ctrl1 == dpp_quad_perm(0, 0, 0, 0) && instr->def.bit_size == 32 &&
8563 ctx->program->gfx_level >= GFX11_5;
8564 if (!nir_src_is_divergent(instr->src[0])) {
8565 bld.vop2(subrev, Definition(dst), src, src);
8566 } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(1, 1, 1, 1)) {
8567 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), src,
8568 Operand::c32(0x3f800000), src)
8569 ->valu()
8570 .neg[2] = true;
8571 } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(2, 2, 2, 2)) {
8572 Builder::Result tmp = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1),
8573 Operand::c32(0), Operand::c32(0), src);
8574 tmp->valu().neg = 0x6;
8575 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), src,
8576 Operand::c32(0x3f800000), tmp);
8577 } else if (ctx->program->gfx_level >= GFX8 && dpp_ctrl2 == dpp_quad_perm(0, 1, 2, 3)) {
8578 bld.vop2_dpp(subrev, Definition(dst), src, src, dpp_ctrl1);
8579 } else if (ctx->program->gfx_level >= GFX8) {
8580 Temp tmp = bld.vop2_dpp(subrev, bld.def(v1), src, src, dpp_ctrl1);
8581 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), tmp, dpp_ctrl2);
8582 } else {
8583 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
8584 Temp tr = src;
8585 if (dpp_ctrl2 != dpp_quad_perm(0, 1, 2, 3))
8586 tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
8587 bld.vop2(subrev, Definition(dst), tl, tr);
8588 }
8589 }
8590 set_wqm(ctx, true);
8591 break;
8592 }
8593
8594 case nir_intrinsic_load_subgroup_invocation: {
8595 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8596 break;
8597 }
8598 case nir_intrinsic_ballot_relaxed:
8599 case nir_intrinsic_ballot: {
8600 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8601 Temp dst = get_ssa_temp(ctx, &instr->def);
8602
8603 if (instr->src[0].ssa->bit_size == 1) {
8604 assert(src.regClass() == bld.lm);
8605 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8606 src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8607 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8608 src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8609 } else {
8610 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8611 }
8612
8613 /* Make sure that all inactive lanes return zero.
8614 * Value-numbering might remove the comparison above */
8615 Definition def = dst.size() == bld.lm.size() ? Definition(dst) : bld.def(bld.lm);
8616 if (instr->intrinsic == nir_intrinsic_ballot_relaxed)
8617 src = bld.copy(def, src);
8618 else
8619 src = bld.sop2(Builder::s_and, def, bld.def(s1, scc), src, Operand(exec, bld.lm));
8620 if (dst.size() != bld.lm.size()) {
8621 /* Wave32 with ballot size set to 64 */
8622 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8623 }
8624
8625 set_wqm(ctx);
8626 break;
8627 }
8628 case nir_intrinsic_inverse_ballot: {
8629 Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8630 Temp dst = get_ssa_temp(ctx, &instr->def);
8631
8632 assert(dst.size() == bld.lm.size());
8633 if (src.size() > dst.size()) {
8634 emit_extract_vector(ctx, src, 0, dst);
8635 } else if (src.size() < dst.size()) {
8636 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8637 } else {
8638 bld.copy(Definition(dst), src);
8639 }
8640 break;
8641 }
8642 case nir_intrinsic_shuffle:
8643 case nir_intrinsic_read_invocation: {
8644 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8645 assert(instr->def.bit_size != 1);
8646 if (!nir_src_is_divergent(instr->src[0])) {
8647 emit_uniform_subgroup(ctx, instr, src);
8648 } else {
8649 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8650 if (instr->intrinsic == nir_intrinsic_read_invocation ||
8651 !nir_src_is_divergent(instr->src[1]))
8652 tid = bld.as_uniform(tid);
8653 Temp dst = get_ssa_temp(ctx, &instr->def);
8654
8655 src = as_vgpr(ctx, src);
8656
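/* The emit_bpermute helper handles the cross-lane gather (ds_bpermute, with a
 * wave64-safe fallback where needed) and always produces a full dword, hence
 * the p_split_vector truncation below for sub-dword sources. */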
8657 if (src.regClass() == v1b || src.regClass() == v2b) {
8658 Temp tmp = bld.tmp(v1);
8659 tmp = emit_bpermute(ctx, bld, tid, src);
8660 if (dst.type() == RegType::vgpr)
8661 bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8662 bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8663 else
8664 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8665 } else if (src.regClass() == v1) {
8666 Temp tmp = emit_bpermute(ctx, bld, tid, src);
8667 bld.copy(Definition(dst), tmp);
8668 } else if (src.regClass() == v2) {
8669 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8670 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8671 lo = emit_bpermute(ctx, bld, tid, lo);
8672 hi = emit_bpermute(ctx, bld, tid, hi);
8673 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8674 emit_split_vector(ctx, dst, 2);
8675 } else {
8676 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8677 }
8678 set_wqm(ctx);
8679 }
8680 break;
8681 }
8682 case nir_intrinsic_rotate: {
8683 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8684 Temp delta = get_ssa_temp(ctx, instr->src[1].ssa);
8685 Temp dst = get_ssa_temp(ctx, &instr->def);
8686 assert(instr->def.bit_size > 1 && instr->def.bit_size <= 32);
8687
8688 if (!nir_src_is_divergent(instr->src[0])) {
8689 emit_uniform_subgroup(ctx, instr, src);
8690 break;
8691 }
8692
8693 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
8694 cluster_size = util_next_power_of_two(
8695 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8696
8697 if (cluster_size == 1) {
8698 bld.copy(Definition(dst), src);
8699 break;
8700 }
8701
8702 delta = bld.as_uniform(delta);
8703 src = as_vgpr(ctx, src);
8704
8705 Temp tmp;
8706 if (nir_src_is_const(instr->src[1]) &&
8707 emit_rotate_by_constant(ctx, tmp, src, cluster_size, nir_src_as_uint(instr->src[1]))) {
8708 } else if (cluster_size == 2) {
8709 Temp noswap =
8710 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), delta, Operand::c32(0));
8711 noswap = bool_to_vector_condition(ctx, noswap);
8712 Temp swapped = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, 0x1), true);
8713 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(src.regClass()), swapped, src, noswap);
8714 } else if (ctx->program->gfx_level >= GFX10 && cluster_size <= 16) {
8715 if (cluster_size == 4) /* shift mask already does this for 8/16. */
8716 delta = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), delta,
8717 Operand::c32(0x3));
8718 delta =
8719 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), delta, Operand::c32(2));
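/* v_permlane16 takes two dwords of 4-bit source-lane selectors (lo for lanes
 * 0-7, hi for lanes 8-15). Rotating a cluster is therefore a nibble-wise
 * rotate of the identity pattern 0x76543210/0xfedcba98, which is why delta is
 * scaled by 4 above so the scalar shifts move whole nibbles. E.g. for
 * cluster_size == 4 and delta == 1, 0x3210 becomes selectors 1,2,3,0 (0x0321)
 * for the first quad. */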
8720
8721 Temp lo = bld.copy(bld.def(s1), Operand::c32(cluster_size == 4 ? 0x32103210 : 0x76543210));
8722 Temp hi;
8723
8724 if (cluster_size <= 8) {
8725 Temp shr = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8726 if (cluster_size == 4) {
8727 Temp lotolohi = bld.copy(bld.def(s1), Operand::c32(0x4444));
8728 Temp lohi =
8729 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, lotolohi);
8730 lo = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), shr, lohi);
8731 } else {
8732 delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
8733 Operand::c32(32), delta);
8734 Temp shl =
8735 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8736 lo = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, shl);
8737 }
8738 Temp lotohi = bld.copy(bld.def(s1), Operand::c32(0x88888888));
8739 hi = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), lo, lotohi);
8740 } else {
8741 hi = bld.copy(bld.def(s1), Operand::c32(0xfedcba98));
8742
8743 Temp lohi = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
8744
8745 Temp shr = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8746 delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(64),
8747 delta);
8748 Temp shl = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8749
8750 lohi = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), shr, shl);
8751 lo = bld.tmp(s1);
8752 hi = bld.tmp(s1);
8753 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), lohi);
8754 }
8755
8756 Builder::Result ret =
8757 bld.vop3(aco_opcode::v_permlane16_b32, bld.def(src.regClass()), src, lo, hi);
8758 ret->valu().opsel[0] = true; /* set FETCH_INACTIVE */
8759 ret->valu().opsel[1] = true; /* set BOUND_CTRL */
8760 tmp = ret;
8761 } else {
8762 /* Fallback to ds_bpermute if we can't find a special instruction. */
8763 Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
8764 Temp src_lane = bld.vadd32(bld.def(v1), tid, delta);
8765
8766 if (ctx->program->gfx_level >= GFX10 && cluster_size == 32) {
8767 /* ds_bpermute is restricted to 32 lanes on GFX10+. */
8768 Temp index_x4 =
8769 bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), src_lane);
8770 tmp = bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, src);
8771 } else {
8772 /* Technically, full wave rotate doesn't need this, but it breaks the pseudo ops. */
8773 src_lane = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), Operand::c32(cluster_size - 1),
8774 src_lane, tid);
8775 tmp = emit_bpermute(ctx, bld, src_lane, src);
8776 }
8777 }
8778
8779 tmp = emit_extract_vector(ctx, tmp, 0, dst.regClass());
8780 bld.copy(Definition(dst), tmp);
8781 set_wqm(ctx);
8782 break;
8783 }
8784 case nir_intrinsic_load_sample_id: {
8785 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8786 get_arg(ctx, ctx->args->ancillary), Operand::c32(8u), Operand::c32(4u));
8787 break;
8788 }
8789 case nir_intrinsic_read_first_invocation: {
8790 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8791 Temp dst = get_ssa_temp(ctx, &instr->def);
8792 if (instr->def.bit_size == 1) {
8793 assert(src.regClass() == bld.lm);
8794 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8795 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8796 bool_to_vector_condition(ctx, tmp, dst);
8797 } else {
8798 emit_readfirstlane(ctx, src, dst);
8799 }
8800 set_wqm(ctx);
8801 break;
8802 }
8803 case nir_intrinsic_as_uniform: {
8804 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8805 Temp dst = get_ssa_temp(ctx, &instr->def);
8806 if (src.type() == RegType::vgpr)
8807 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
8808 else
8809 bld.copy(Definition(dst), src);
8810 break;
8811 }
8812 case nir_intrinsic_vote_all: {
8813 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8814 Temp dst = get_ssa_temp(ctx, &instr->def);
8815 assert(src.regClass() == bld.lm);
8816 assert(dst.regClass() == bld.lm);
8817
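/* vote_all(x) is computed as !any(!x): negate the ballot, AND it with exec
 * (the scc result of the AND says whether any active lane disagrees), turn
 * that into a lane mask and invert it again. */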
8818 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8819 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8820 .def(1)
8821 .getTemp();
8822 Temp cond = bool_to_vector_condition(ctx, tmp);
8823 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8824 set_wqm(ctx);
8825 break;
8826 }
8827 case nir_intrinsic_vote_any: {
8828 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8829 Temp dst = get_ssa_temp(ctx, &instr->def);
8830 assert(src.regClass() == bld.lm);
8831 assert(dst.regClass() == bld.lm);
8832
8833 Temp tmp = bool_to_scalar_condition(ctx, src);
8834 bool_to_vector_condition(ctx, tmp, dst);
8835 set_wqm(ctx);
8836 break;
8837 }
8838 case nir_intrinsic_quad_vote_any: {
8839 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8840 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8841 bld.sop1(Builder::s_wqm, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8842 set_wqm(ctx);
8843 break;
8844 }
8845 case nir_intrinsic_quad_vote_all: {
8846 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8847 src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8848 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8849 src = bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), src);
8850 bld.sop1(Builder::s_not, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8851 set_wqm(ctx);
8852 break;
8853 }
8854 case nir_intrinsic_reduce:
8855 case nir_intrinsic_inclusive_scan:
8856 case nir_intrinsic_exclusive_scan: {
8857 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8858 Temp dst = get_ssa_temp(ctx, &instr->def);
8859 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8860 unsigned cluster_size =
8861 instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8862 cluster_size = util_next_power_of_two(
8863 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8864 const unsigned bit_size = instr->src[0].ssa->bit_size;
8865 assert(bit_size != 1);
8866
8867 if (!nir_src_is_divergent(instr->src[0])) {
8868 /* We use divergence analysis to assign the regclass, so check if it's
8869 * working as expected */
8870 ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8871 if (instr->intrinsic == nir_intrinsic_inclusive_scan ||
8872 cluster_size != ctx->program->wave_size)
8873 expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor ||
8874 op == nir_op_imul || op == nir_op_fmul;
8875 assert(instr->def.divergent == expected_divergent);
8876
8877 if (instr->intrinsic == nir_intrinsic_reduce) {
8878 if (!instr->def.divergent && emit_uniform_reduce(ctx, instr))
8879 break;
8880 } else if (emit_uniform_scan(ctx, instr)) {
8881 break;
8882 }
8883 }
8884
8885 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8886 ReduceOp reduce_op = get_reduce_op(op, bit_size);
8887
8888 aco_opcode aco_op;
8889 switch (instr->intrinsic) {
8890 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8891 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8892 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8893 default: unreachable("unknown reduce intrinsic");
8894 }
8895
8896 /* Avoid whole wave shift. */
8897 const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
8898 (op == nir_op_iadd || op == nir_op_ixor) &&
8899 dst.type() == RegType::vgpr;
8900 if (use_inclusive_for_exclusive)
8901 inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
8902 else
8903 emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
8904
8905 set_wqm(ctx);
8906 break;
8907 }
8908 case nir_intrinsic_dpp16_shift_amd: {
8909 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8910 Temp dst = get_ssa_temp(ctx, &instr->def);
8911 int delta = nir_intrinsic_base(instr);
8912 assert(delta >= -15 && delta <= 15 && delta != 0);
8913 assert(instr->def.bit_size != 1 && instr->def.bit_size < 64);
8914 assert(ctx->options->gfx_level >= GFX8);
8915
8916 uint16_t dpp_ctrl = delta < 0 ? dpp_row_sr(-delta) : dpp_row_sl(delta);
8917 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), src, dpp_ctrl);
8918
8919 set_wqm(ctx);
8920 break;
8921 }
8922 case nir_intrinsic_quad_broadcast:
8923 case nir_intrinsic_quad_swap_horizontal:
8924 case nir_intrinsic_quad_swap_vertical:
8925 case nir_intrinsic_quad_swap_diagonal:
8926 case nir_intrinsic_quad_swizzle_amd: {
8927 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8928
8929 if (!instr->def.divergent) {
8930 emit_uniform_subgroup(ctx, instr, src);
8931 break;
8932 }
8933
8934 /* Quad broadcast lane. */
8935 unsigned lane = 0;
8936 /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8937 bool bool_use_valu = instr->def.bit_size == 1;
8938
8939 uint16_t dpp_ctrl = 0;
8940
8941 bool allow_fi = true;
8942 switch (instr->intrinsic) {
8943 case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8944 case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8945 case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8946 case nir_intrinsic_quad_swizzle_amd:
8947 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
8948 allow_fi &= nir_intrinsic_fetch_inactive(instr);
8949 break;
8950 case nir_intrinsic_quad_broadcast:
8951 lane = nir_src_as_const_value(instr->src[1])->u32;
8952 dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8953 bool_use_valu = false;
8954 break;
8955 default: break;
8956 }
8957
8958 Temp dst = get_ssa_temp(ctx, &instr->def);
8959
8960 /* Setup source. */
8961 if (bool_use_valu)
8962 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8963 Operand::c32(-1), src);
8964 else if (instr->def.bit_size != 1)
8965 src = as_vgpr(ctx, src);
8966
8967 if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8968 /* Special case for quad broadcast using SALU only. */
8969 assert(src.regClass() == bld.lm && dst.regClass() == bld.lm);
8970
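/* One bit per quad: e.g. lane == 2 gives half_mask = 0x44444444, selecting
 * bit 2 of every quad. ANDing the exec-masked ballot with this keeps only
 * each quad's broadcast lane, and s_wqm then replicates that bit to all four
 * lanes of the quad. */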
8971 uint32_t half_mask = 0x11111111u << lane;
8972 Operand mask_tmp = bld.lm.bytes() == 4
8973 ? Operand::c32(half_mask)
8974 : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8975 Operand::c32(half_mask), Operand::c32(half_mask));
8976
8977 src =
8978 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8979 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8980 bld.sop1(Builder::s_wqm, Definition(dst), bld.def(s1, scc), src);
8981 } else if (instr->def.bit_size <= 32 || bool_use_valu) {
8982 unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8983 Definition def = (excess_bytes || bool_use_valu) ? bld.def(v1) : Definition(dst);
8984
8985 if (ctx->program->gfx_level >= GFX8)
8986 bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl, 0xf, 0xf, true, allow_fi);
8987 else
8988 bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8989
8990 if (excess_bytes)
8991 bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8992 bld.def(RegClass::get(dst.type(), excess_bytes)), def.getTemp());
8993 if (bool_use_valu)
8994 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), def.getTemp());
8995 } else if (instr->def.bit_size == 64) {
8996 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8997 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8998
8999 if (ctx->program->gfx_level >= GFX8) {
9000 lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl, 0xf, 0xf, true,
9001 allow_fi);
9002 hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl, 0xf, 0xf, true,
9003 allow_fi);
9004 } else {
9005 lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
9006 hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
9007 }
9008
9009 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
9010 emit_split_vector(ctx, dst, 2);
9011 } else {
9012 isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
9013 }
9014
9015 set_wqm(ctx);
9016 break;
9017 }
9018 case nir_intrinsic_masked_swizzle_amd: {
9019 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9020 if (!instr->def.divergent) {
9021 emit_uniform_subgroup(ctx, instr, src);
9022 break;
9023 }
9024 Temp dst = get_ssa_temp(ctx, &instr->def);
9025 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
9026 bool allow_fi = nir_intrinsic_fetch_inactive(instr);
9027
9028 if (instr->def.bit_size != 1)
9029 src = as_vgpr(ctx, src);
9030
9031 if (instr->def.bit_size == 1) {
9032 assert(src.regClass() == bld.lm);
9033 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
9034 Operand::c32(-1), src);
9035 src = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
9036 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), src);
9037 } else if (dst.regClass() == v1b) {
9038 Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
9039 emit_extract_vector(ctx, tmp, 0, dst);
9040 } else if (dst.regClass() == v2b) {
9041 Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
9042 emit_extract_vector(ctx, tmp, 0, dst);
9043 } else if (dst.regClass() == v1) {
9044 bld.copy(Definition(dst), emit_masked_swizzle(ctx, bld, src, mask, allow_fi));
9045 } else if (dst.regClass() == v2) {
9046 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
9047 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
9048 lo = emit_masked_swizzle(ctx, bld, lo, mask, allow_fi);
9049 hi = emit_masked_swizzle(ctx, bld, hi, mask, allow_fi);
9050 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
9051 emit_split_vector(ctx, dst, 2);
9052 } else {
9053 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
9054 }
9055 set_wqm(ctx);
9056 break;
9057 }
9058 case nir_intrinsic_write_invocation_amd: {
9059 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9060 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
9061 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
9062 Temp dst = get_ssa_temp(ctx, &instr->def);
9063 if (dst.regClass() == v1) {
9064 /* src2 is ignored for writelane. RA assigns the same reg for dst */
9065 bld.writelane(Definition(dst), val, lane, src);
9066 } else if (dst.regClass() == v2) {
9067 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
9068 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
9069 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
9070 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
9071 Temp lo = bld.writelane(bld.def(v1), val_lo, lane, src_lo);
9072 Temp hi = bld.writelane(bld.def(v1), val_hi, lane, src_hi);
9073 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
9074 emit_split_vector(ctx, dst, 2);
9075 } else {
9076 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
9077 }
9078 break;
9079 }
9080 case nir_intrinsic_mbcnt_amd: {
9081 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9082 Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
9083 Temp dst = get_ssa_temp(ctx, &instr->def);
9084 /* Fit 64-bit mask for wave32 */
9085 src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
9086 emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
9087 set_wqm(ctx);
9088 break;
9089 }
9090 case nir_intrinsic_lane_permute_16_amd: {
9091 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9092 Temp dst = get_ssa_temp(ctx, &instr->def);
9093 assert(ctx->program->gfx_level >= GFX10);
9094
9095 if (src.regClass() == s1) {
9096 bld.copy(Definition(dst), src);
9097 } else if (dst.regClass() == v1 && src.regClass() == v1) {
9098 bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
9099 bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
9100 bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
9101 } else {
9102 isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
9103 }
9104 break;
9105 }
9106 case nir_intrinsic_load_helper_invocation:
9107 case nir_intrinsic_is_helper_invocation: {
9108 /* load_helper() after demote() gets lowered to is_helper().
9109 * Otherwise, these two behave the same. */
9110 Temp dst = get_ssa_temp(ctx, &instr->def);
9111 bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
9112 ctx->program->needs_exact = true;
9113 break;
9114 }
9115 case nir_intrinsic_demote:
9116 case nir_intrinsic_demote_if: {
9117 Operand cond = Operand::c32(-1u);
9118 if (instr->intrinsic == nir_intrinsic_demote_if) {
9119 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9120 assert(src.regClass() == bld.lm);
9121 cond =
9122 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
9123 }
9124
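/* Demote differs from terminate below: demoted lanes become helper
 * invocations that keep running for derivative/quad operations, whereas
 * p_discard_if terminates the lanes outright. */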
9125 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
9126
9127 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
9128 ctx->cf_info.exec.potentially_empty_discard = true;
9129
9130 ctx->block->kind |= block_kind_uses_discard;
9131 ctx->program->needs_exact = true;
9132
9133 /* Enable WQM in order to prevent helper lanes from getting terminated. */
9134 if (ctx->shader->info.maximally_reconverges)
9135 ctx->program->needs_wqm = true;
9136
9137 break;
9138 }
9139 case nir_intrinsic_terminate:
9140 case nir_intrinsic_terminate_if: {
9141 Operand cond = Operand::c32(-1u);
9142 if (instr->intrinsic == nir_intrinsic_terminate_if) {
9143 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9144 assert(src.regClass() == bld.lm);
9145 cond =
9146 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
9147
9148 ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
9149 }
9150
9151 bld.pseudo(aco_opcode::p_discard_if, cond);
9152
9153 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
9154 ctx->cf_info.exec.potentially_empty_discard = true;
9155 ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
9156 ctx->block->kind |= block_kind_uses_discard;
9157 ctx->program->needs_exact = true;
9158 break;
9159 }
9160 case nir_intrinsic_first_invocation: {
9161 bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
9162 Operand(exec, bld.lm));
9163 set_wqm(ctx);
9164 break;
9165 }
9166 case nir_intrinsic_last_invocation: {
9167 Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
9168 bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
9169 Operand::c32(ctx->program->wave_size - 1u), flbit);
9170 set_wqm(ctx);
9171 break;
9172 }
9173 case nir_intrinsic_elect: {
9174 /* p_elect is lowered in aco_insert_exec_mask.
9175 * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
9176 * two p_elect with different exec masks as the same.
9177 */
9178 bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
9179 Operand(exec, bld.lm));
9180 set_wqm(ctx);
9181 break;
9182 }
9183 case nir_intrinsic_shader_clock: {
9184 Temp dst = get_ssa_temp(ctx, &instr->def);
9185 if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
9186 ctx->options->gfx_level >= GFX12) {
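/* Read the 64-bit cycle counter as hi, lo, hi. If the two hi halves match,
 * lo did not wrap between the reads and the pair is consistent; otherwise lo
 * is replaced with 0, which is close enough right after a carry into hi. */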
9187 Temp hi0 = bld.tmp(s1);
9188 Temp hi1 = bld.tmp(s1);
9189 Temp lo = bld.tmp(s1);
9190 bld.pseudo(aco_opcode::p_shader_cycles_hi_lo_hi, Definition(hi0), Definition(lo), Definition(hi1));
9191 Temp hi_eq = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), hi0, hi1);
9192 lo = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), lo, Operand::zero(), bld.scc(hi_eq));
9193 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi1);
9194 } else if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
9195 ctx->options->gfx_level >= GFX10_3) {
9196 /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
9197 Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
9198 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
9199 } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
9200 ctx->options->gfx_level >= GFX11) {
9201 bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
9202 Operand::c32(sendmsg_rtn_get_realtime));
9203 } else {
9204 aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
9205 ? aco_opcode::s_memrealtime
9206 : aco_opcode::s_memtime;
9207 bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
9208 }
9209 emit_split_vector(ctx, dst, 2);
9210 break;
9211 }
9212 case nir_intrinsic_load_vertex_id_zero_base: {
9213 Temp dst = get_ssa_temp(ctx, &instr->def);
9214 bld.copy(Definition(dst), get_arg(ctx, ctx->args->vertex_id));
9215 break;
9216 }
9217 case nir_intrinsic_load_first_vertex: {
9218 Temp dst = get_ssa_temp(ctx, &instr->def);
9219 bld.copy(Definition(dst), get_arg(ctx, ctx->args->base_vertex));
9220 break;
9221 }
9222 case nir_intrinsic_load_base_instance: {
9223 Temp dst = get_ssa_temp(ctx, &instr->def);
9224 bld.copy(Definition(dst), get_arg(ctx, ctx->args->start_instance));
9225 break;
9226 }
9227 case nir_intrinsic_load_instance_id: {
9228 Temp dst = get_ssa_temp(ctx, &instr->def);
9229 bld.copy(Definition(dst), get_arg(ctx, ctx->args->instance_id));
9230 break;
9231 }
9232 case nir_intrinsic_load_draw_id: {
9233 Temp dst = get_ssa_temp(ctx, &instr->def);
9234 bld.copy(Definition(dst), get_arg(ctx, ctx->args->draw_id));
9235 break;
9236 }
9237 case nir_intrinsic_load_invocation_id: {
9238 Temp dst = get_ssa_temp(ctx, &instr->def);
9239
9240 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
9241 if (ctx->options->gfx_level >= GFX12)
9242 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
9243 get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand::c32(27u),
9244 Operand::c32(5u));
9245 else if (ctx->options->gfx_level >= GFX10)
9246 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
9247 get_arg(ctx, ctx->args->gs_invocation_id));
9248 else
9249 bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_invocation_id));
9250 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
9251 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->tcs_rel_ids),
9252 Operand::c32(8u), Operand::c32(5u));
9253 } else {
9254 unreachable("Unsupported stage for load_invocation_id");
9255 }
9256
9257 break;
9258 }
9259 case nir_intrinsic_load_primitive_id: {
9260 Temp dst = get_ssa_temp(ctx, &instr->def);
9261
9262 switch (ctx->shader->info.stage) {
9263 case MESA_SHADER_GEOMETRY:
9264 bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
9265 break;
9266 case MESA_SHADER_TESS_CTRL:
9267 bld.copy(Definition(dst), get_arg(ctx, ctx->args->tcs_patch_id));
9268 break;
9269 case MESA_SHADER_TESS_EVAL:
9270 bld.copy(Definition(dst), get_arg(ctx, ctx->args->tes_patch_id));
9271 break;
9272 default:
9273 if (ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && !ctx->stage.has(SWStage::GS)) {
9274 /* In case of NGG, the GS threads always have the primitive ID
9275 * even if there is no SW GS. */
9276 bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
9277 break;
9278 } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
9279 bld.copy(Definition(dst), get_arg(ctx, ctx->args->vs_prim_id));
9280 break;
9281 }
9282 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
9283 }
9284
9285 break;
9286 }
9287 case nir_intrinsic_sendmsg_amd: {
9288 unsigned imm = nir_intrinsic_base(instr);
9289 Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9290 bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), imm);
9291 break;
9292 }
9293 case nir_intrinsic_load_gs_wave_id_amd: {
9294 Temp dst = get_ssa_temp(ctx, &instr->def);
9295 if (ctx->args->merged_wave_info.used)
9296 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
9297 get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), Operand::c32(8u),
9298 Operand::zero());
9299 else if (ctx->args->gs_wave_id.used)
9300 bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_wave_id));
9301 else
9302 unreachable("Shader doesn't have GS wave ID.");
9303 break;
9304 }
9305 case nir_intrinsic_is_subgroup_invocation_lt_amd: {
9306 Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9307 bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src));
9308 break;
9309 }
9310 case nir_intrinsic_gds_atomic_add_amd: {
9311 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
9312 Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
9313 Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
9314 Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
9315 bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
9316 true);
9317 break;
9318 }
9319 case nir_intrinsic_load_sbt_base_amd: {
9320 Temp dst = get_ssa_temp(ctx, &instr->def);
9321 Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
9322 assert(addr.regClass() == s2);
9323 bld.copy(Definition(dst), Operand(addr));
9324 break;
9325 }
9326 case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
9327 case nir_intrinsic_load_resume_shader_address_amd: {
9328 bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
9329 bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
9330 break;
9331 }
9332 case nir_intrinsic_overwrite_vs_arguments_amd: {
9333 ctx->arg_temps[ctx->args->vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9334 ctx->arg_temps[ctx->args->instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9335 break;
9336 }
9337 case nir_intrinsic_overwrite_tes_arguments_amd: {
9338 ctx->arg_temps[ctx->args->tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9339 ctx->arg_temps[ctx->args->tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9340 ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9341 ctx->arg_temps[ctx->args->tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[2].ssa);
9342 break;
9343 }
9344 case nir_intrinsic_load_scalar_arg_amd:
9345 case nir_intrinsic_load_vector_arg_amd: {
9346 assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
9347 Temp dst = get_ssa_temp(ctx, &instr->def);
9348 Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
9349 assert(src.id());
9350 assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
9351 : RegType::vgpr));
9352 bld.copy(Definition(dst), src);
9353 emit_split_vector(ctx, dst, dst.size());
9354 break;
9355 }
9356 case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: {
9357 Temp dst = get_ssa_temp(ctx, &instr->def);
9358 Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
9359 Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
9360
9361 Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
9362 unsigned offset0, offset1;
9363 Instruction* ds_instr;
9364 Operand m;
9365
9366 /* Lock a GDS mutex. */
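/* ds_ordered_count serializes waves by the ordered_id passed in m0: this
 * first call (wave_release = wave_done = 0) waits for the wave's turn, and
 * the matching call at the end with both bits set hands the ordered section
 * to the next wave. */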
9367 ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
9368 m = bld.m0(bld.as_uniform(ordered_id));
9369 ds_instr =
9370 bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
9371 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
9372
9373 aco_ptr<Instruction> vec{
9374 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
9375 unsigned write_mask = nir_intrinsic_write_mask(instr);
9376
9377 for (unsigned i = 0; i < instr->num_components; i++) {
9378 if (write_mask & (1 << i)) {
9379 Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9380
9381 ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9382 i * 4, 0u, true);
9383 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9384
9385 vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
9386 } else {
9387 vec->operands[i] = Operand::zero();
9388 }
9389 }
9390
9391 vec->definitions[0] = Definition(dst);
9392 ctx->block->instructions.emplace_back(std::move(vec));
9393
9394 /* Unlock a GDS mutex. */
9395 ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
9396 m = bld.m0(bld.as_uniform(ordered_id));
9397 ds_instr =
9398 bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
9399 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
9400
9401 emit_split_vector(ctx, dst, instr->num_components);
9402 break;
9403 }
9404 case nir_intrinsic_xfb_counter_sub_gfx11_amd: {
9405 unsigned write_mask = nir_intrinsic_write_mask(instr);
9406 Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
9407
9408 u_foreach_bit (i, write_mask) {
9409 Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9410 Instruction* ds_instr;
9411
9412 ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9413 i * 4, 0u, true);
9414 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9415 }
9416 break;
9417 }
9418 case nir_intrinsic_export_amd:
9419 case nir_intrinsic_export_row_amd: {
9420 unsigned flags = nir_intrinsic_flags(instr);
9421 unsigned target = nir_intrinsic_base(instr);
9422 unsigned write_mask = nir_intrinsic_write_mask(instr);
9423
9424 /* Mark vertex export block. */
9425 if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
9426 ctx->block->kind |= block_kind_export_end;
9427
9428 if (target < V_008DFC_SQ_EXP_MRTZ)
9429 ctx->program->has_color_exports = true;
9430
9431 const bool row_en = instr->intrinsic == nir_intrinsic_export_row_amd;
9432
9433 aco_ptr<Instruction> exp{create_instruction(aco_opcode::exp, Format::EXP, 4 + row_en, 0)};
9434
9435 exp->exp().dest = target;
9436 exp->exp().enabled_mask = write_mask;
9437 exp->exp().compressed = flags & AC_EXP_FLAG_COMPRESSED;
9438
9439 /* ACO may reorder position/mrt export instructions, then mark done for last
9440 * export instruction. So don't respect the nir AC_EXP_FLAG_DONE for position/mrt
9441 * exports here and leave it to ACO.
9442 */
9443 if (target == V_008DFC_SQ_EXP_PRIM)
9444 exp->exp().done = flags & AC_EXP_FLAG_DONE;
9445 else
9446 exp->exp().done = false;
9447
9448 /* ACO may reorder mrt export instructions, then mark valid mask for last
9449 * export instruction. So don't respect the nir AC_EXP_FLAG_VALID_MASK for mrt
9450 * exports here and leave it to ACO.
9451 */
9452 if (target > V_008DFC_SQ_EXP_NULL)
9453 exp->exp().valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
9454 else
9455 exp->exp().valid_mask = false;
9456
9457 exp->exp().row_en = row_en;
9458
9459 /* Compressed export uses two bits for a channel. */
9460 uint32_t channel_mask = exp->exp().compressed
9461 ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0)
9462 : write_mask;
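/* channel_mask selects which export operands actually carry data: with
 * compression each operand packs two 16-bit channels, so e.g. write_mask 0xf
 * collapses to operands 0 and 1 (channel_mask 0x3). */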
9463
9464 Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
9465 for (unsigned i = 0; i < 4; i++) {
9466 exp->operands[i] = channel_mask & BITFIELD_BIT(i)
9467 ? Operand(emit_extract_vector(ctx, value, i, v1))
9468 : Operand(v1);
9469 }
9470
9471 if (row_en) {
9472 Temp row = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
9473 /* Hack to prevent the RA from moving the source into m0 and then back to a normal SGPR. */
9474 row = bld.copy(bld.def(s1, m0), row);
9475 exp->operands[4] = bld.m0(row);
9476 }
9477
9478 ctx->block->instructions.emplace_back(std::move(exp));
9479 break;
9480 }
9481 case nir_intrinsic_export_dual_src_blend_amd: {
9482 Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
9483 Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
9484 unsigned write_mask = nir_intrinsic_write_mask(instr);
9485
9486 struct aco_export_mrt mrt0, mrt1;
9487 for (unsigned i = 0; i < 4; i++) {
9488 mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
9489 : Operand(v1);
9490
9491 mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
9492 : Operand(v1);
9493 }
9494 mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
9495
9496 create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
9497
9498 ctx->block->kind |= block_kind_export_end;
9499 break;
9500 }
9501 case nir_intrinsic_strict_wqm_coord_amd: {
9502 Temp dst = get_ssa_temp(ctx, &instr->def);
9503 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9504 unsigned begin_size = nir_intrinsic_base(instr);
9505
9506 unsigned num_src = 1;
9507 auto it = ctx->allocated_vec.find(src.id());
9508 if (it != ctx->allocated_vec.end())
9509 num_src = src.bytes() / it->second[0].bytes();
9510
9511 aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO,
9512 num_src + !!begin_size, 1)};
9513
9514 if (begin_size)
9515 vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
9516 for (unsigned i = 0; i < num_src; i++) {
9517 Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
9518 vec->operands[i + !!begin_size] = Operand(comp);
9519 }
9520
9521 vec->definitions[0] = Definition(dst);
9522 ctx->block->instructions.emplace_back(std::move(vec));
9523 break;
9524 }
9525 case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
9526 Temp dst = get_ssa_temp(ctx, &instr->def);
9527 bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9528 Operand::c32(aco_symbol_lds_ngg_scratch_base));
9529 break;
9530 }
9531 case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
9532 Temp dst = get_ssa_temp(ctx, &instr->def);
9533 bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9534 Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
9535 break;
9536 }
9537 case nir_intrinsic_store_scalar_arg_amd: {
9538 BITSET_SET(ctx->output_args, nir_intrinsic_base(instr));
9539 ctx->arg_temps[nir_intrinsic_base(instr)] =
9540 bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9541 break;
9542 }
9543 case nir_intrinsic_store_vector_arg_amd: {
9544 BITSET_SET(ctx->output_args, nir_intrinsic_base(instr));
9545 ctx->arg_temps[nir_intrinsic_base(instr)] =
9546 as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9547 break;
9548 }
9549 case nir_intrinsic_begin_invocation_interlock: {
9550 pops_await_overlapped_waves(ctx);
9551 break;
9552 }
9553 case nir_intrinsic_end_invocation_interlock: {
9554 if (ctx->options->gfx_level < GFX11)
9555 bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
9556 break;
9557 }
9558 case nir_intrinsic_cmat_muladd_amd: visit_cmat_muladd(ctx, instr); break;
9559 case nir_intrinsic_nop_amd: bld.sopp(aco_opcode::s_nop, nir_intrinsic_base(instr)); break;
9560 case nir_intrinsic_sleep_amd: bld.sopp(aco_opcode::s_sleep, nir_intrinsic_base(instr)); break;
9561 case nir_intrinsic_unit_test_amd:
9562 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(nir_intrinsic_base(instr)),
9563 get_ssa_temp(ctx, instr->src[0].ssa));
9564 break;
9565 case nir_intrinsic_unit_test_uniform_amd:
9566 case nir_intrinsic_unit_test_divergent_amd:
9567 bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)),
9568 Operand::c32(nir_intrinsic_base(instr)));
9569 break;
9570 default:
9571 isel_err(&instr->instr, "Unimplemented intrinsic instr");
9572 abort();
9573
9574 break;
9575 }
9576 }
9577
9578 void
9579 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9580 {
9581 if (vec->parent_instr->type != nir_instr_type_alu)
9582 return;
9583 nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9584 if (vec_instr->op != nir_op_vec(vec->num_components))
9585 return;
9586
9587 for (unsigned i = 0; i < vec->num_components; i++) {
9588 cv[i] =
9589 vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9590 }
9591 }
9592
9593 void
9594 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9595 {
9596 assert(instr->op != nir_texop_samples_identical);
9597
9598 Builder bld(ctx->program, ctx->block);
9599 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9600 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9601 has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9602 Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9603 offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9604 coord = Temp(), wqm_coord = Temp();
9605 std::vector<Temp> coords;
9606 std::vector<Temp> derivs;
9607 nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9608
9609 for (unsigned i = 0; i < instr->num_srcs; i++) {
9610 switch (instr->src[i].src_type) {
9611 case nir_tex_src_texture_handle:
9612 resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9613 break;
9614 case nir_tex_src_sampler_handle:
9615 sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9616 break;
9617 default: break;
9618 }
9619 }
9620
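/* GFX8 and earlier need a workaround for gather4 on integer formats: the coordinates
 * are offset by -0.5 texel and, for cube maps, the descriptor's number format is
 * patched and the result converted back to an integer afterwards
 * (see lower_gather4_integer() in ac_nir_to_llvm.c). */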
9621 bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9622 (instr->dest_type & (nir_type_int | nir_type_uint));
9623 bool tg4_integer_cube_workaround =
9624 tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9625
9626 bool a16 = false, g16 = false;
9627
9628 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9629 if (coord_idx > 0)
9630 a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9631
9632 int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9633 if (ddx_idx > 0)
9634 g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9635
9636 for (unsigned i = 0; i < instr->num_srcs; i++) {
9637 switch (instr->src[i].src_type) {
9638 case nir_tex_src_coord: {
9639 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9640 coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9641 break;
9642 }
9643 case nir_tex_src_backend1: {
9644 assert(instr->src[i].src.ssa->bit_size == 32);
9645 wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9646 has_wqm_coord = true;
9647 break;
9648 }
9649 case nir_tex_src_bias:
9650 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9651 /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9652 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9653 has_bias = true;
9654 break;
9655 case nir_tex_src_lod: {
9656 if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9657 level_zero = true;
9658 } else {
9659 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9660 lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9661 has_lod = true;
9662 }
9663 break;
9664 }
9665 case nir_tex_src_min_lod:
9666 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9667 clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9668 has_clamped_lod = true;
9669 break;
9670 case nir_tex_src_comparator:
9671 if (instr->is_shadow) {
9672 assert(instr->src[i].src.ssa->bit_size == 32);
9673 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9674 has_compare = true;
9675 }
9676 break;
9677 case nir_tex_src_offset:
9678 case nir_tex_src_backend2:
9679 assert(instr->src[i].src.ssa->bit_size == 32);
9680 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9681 get_const_vec(instr->src[i].src.ssa, const_offset);
9682 has_offset = true;
9683 break;
9684 case nir_tex_src_ddx:
9685 assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9686 ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9687 has_ddx = true;
9688 break;
9689 case nir_tex_src_ddy:
9690 assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9691 ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9692 has_ddy = true;
9693 break;
9694 case nir_tex_src_ms_index:
9695 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9696 sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9697 has_sample_index = true;
9698 break;
9699 case nir_tex_src_texture_offset:
9700 case nir_tex_src_sampler_offset:
9701 default: break;
9702 }
9703 }
9704
9705 if (has_wqm_coord) {
9706 assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9707 instr->op == nir_texop_lod);
9708 assert(wqm_coord.regClass().is_linear_vgpr());
9709 assert(!a16 && !g16);
9710 }
9711
9712 if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9713 level_zero = true;
9714
9715 if (has_offset) {
9716 assert(instr->op != nir_texop_txf);
9717
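/* Pack the constant and dynamic texel offsets into a single dword,
 * 6 bits per component, as the *_o image opcodes expect. */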
9718 aco_ptr<Instruction> tmp_instr;
9719 Temp acc, pack = Temp();
9720
9721 uint32_t pack_const = 0;
9722 for (unsigned i = 0; i < offset.size(); i++) {
9723 if (!const_offset[i])
9724 continue;
9725 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9726 }
9727
9728 if (offset.type() == RegType::sgpr) {
9729 for (unsigned i = 0; i < offset.size(); i++) {
9730 if (const_offset[i])
9731 continue;
9732
9733 acc = emit_extract_vector(ctx, offset, i, s1);
9734 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9735 Operand::c32(0x3Fu));
9736
9737 if (i) {
9738 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9739 Operand::c32(8u * i));
9740 }
9741
9742 if (pack == Temp()) {
9743 pack = acc;
9744 } else {
9745 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9746 }
9747 }
9748
9749 if (pack_const && pack != Temp())
9750 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9751 Operand::c32(pack_const), pack);
9752 } else {
9753 for (unsigned i = 0; i < offset.size(); i++) {
9754 if (const_offset[i])
9755 continue;
9756
9757 acc = emit_extract_vector(ctx, offset, i, v1);
9758 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9759
9760 if (i) {
9761 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9762 }
9763
9764 if (pack == Temp()) {
9765 pack = acc;
9766 } else {
9767 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9768 }
9769 }
9770
9771 if (pack_const && pack != Temp())
9772 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9773 }
9774 if (pack == Temp())
9775 offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9776 else
9777 offset = pack;
9778 }
9779
9780 std::vector<Temp> unpacked_coord;
9781 if (coord != Temp())
9782 unpacked_coord.push_back(coord);
9783 if (has_sample_index)
9784 unpacked_coord.push_back(sample_index);
9785 if (has_lod)
9786 unpacked_coord.push_back(lod);
9787 if (has_clamped_lod)
9788 unpacked_coord.push_back(clamped_lod);
9789
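/* Pack the address components into full dwords (16-bit components are packed in pairs). */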
9790 coords = emit_pack_v1(ctx, unpacked_coord);
9791
9792 /* pack derivatives */
9793 if (has_ddx || has_ddy) {
9794 assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9795 std::array<Temp, 2> ddxddy = {ddx, ddy};
9796 for (Temp tmp : ddxddy) {
9797 if (tmp == Temp())
9798 continue;
9799 std::vector<Temp> unpacked = {tmp};
9800 for (Temp derv : emit_pack_v1(ctx, unpacked))
9801 derivs.push_back(derv);
9802 }
9803 has_derivs = true;
9804 }
9805
9806 unsigned dim = 0;
9807 bool da = false;
9808 if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9809 dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9810 da = should_declare_array((ac_image_dim)dim);
9811 }
9812
9813 /* Build tex instruction */
9814 unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9815 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9816 dmask = u_bit_consecutive(0, util_last_bit(dmask));
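/* Sparse fetches return an extra residency dword, represented here by dmask bit 4. */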
9817 if (instr->is_sparse)
9818 dmask = MAX2(dmask, 1) | 0x10;
9819 bool d16 = instr->def.bit_size == 16;
9820 Temp dst = get_ssa_temp(ctx, &instr->def);
9821 Temp tmp_dst = dst;
9822
9823 /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9824 if (instr->op == nir_texop_tg4) {
9825 assert(instr->def.num_components == (4 + instr->is_sparse));
9826 if (instr->is_shadow)
9827 dmask = 1;
9828 else
9829 dmask = 1 << instr->component;
9830 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9831 tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9832 } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9833 tmp_dst = bld.tmp(v1);
9834 } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9835 unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9836 tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9837 }
9838
9839 Temp tg4_compare_cube_wa64 = Temp();
9840
9841 if (tg4_integer_workarounds) {
9842 Temp half_texel[2];
9843 if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
9844 half_texel[0] = half_texel[1] = bld.copy(bld.def(v1), Operand::c32(0xbf000000 /*-0.5*/));
9845 } else {
9846 Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9847 Temp size = bld.tmp(v2);
9848 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9849 Operand(s4), std::vector<Temp>{tg4_lod});
9850 tex->dim = dim;
9851 tex->dmask = 0x3;
9852 tex->da = da;
9853 emit_split_vector(ctx, size, size.size());
9854
9855 for (unsigned i = 0; i < 2; i++) {
9856 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9857 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9858 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9859 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9860 Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9861 }
9862
9863 if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9864 /* In Vulkan, whether the sampler uses unnormalized
9865 * coordinates or not is a dynamic property of the
9866 * sampler. Hence, to figure out whether or not we
9867 * need to divide by the texture size, we need to test
9868 * the sampler at runtime. This tests the bit set by
9869 * radv_init_sampler().
9870 */
9871 unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9872 Temp dword0 = emit_extract_vector(ctx, sampler, 0, s1);
9873 Temp not_needed =
9874 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), dword0, Operand::c32(bit_idx));
9875
9876 not_needed = bool_to_vector_condition(ctx, not_needed);
9877 half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9878 Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9879 half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9880 Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9881 }
9882 }
9883
9884 Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9885 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9886
9887 if (tg4_integer_cube_workaround) {
9888 /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9889 Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9890 aco_ptr<Instruction> split{
9891 create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9892 split->operands[0] = Operand(resource);
9893 for (unsigned i = 0; i < resource.size(); i++) {
9894 desc[i] = bld.tmp(s1);
9895 split->definitions[i] = Definition(desc[i]);
9896 }
9897 ctx->block->instructions.emplace_back(std::move(split));
9898
9899 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9900 Operand::c32(20u | (6u << 16)));
9901 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9902 Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9903
9904 Temp nfmt;
9905 if (instr->dest_type & nir_type_uint) {
9906 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9907 Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9908 Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9909 } else {
9910 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9911 Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9912 Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9913 }
9914 tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9915 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9916
9917 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9918 Operand::c32(26u));
9919
9920 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9921 Operand::c32(C_008F14_NUM_FORMAT));
9922 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9923
9924 aco_ptr<Instruction> vec{
9925 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9926 for (unsigned i = 0; i < resource.size(); i++)
9927 vec->operands[i] = Operand(desc[i]);
9928 resource = bld.tmp(resource.regClass());
9929 vec->definitions[0] = Definition(resource);
9930 ctx->block->instructions.emplace_back(std::move(vec));
9931
9932 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9933 tg4_compare_cube_wa64);
9934 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9935 tg4_compare_cube_wa64);
9936 }
9937 coords[0] = new_coords[0];
9938 coords[1] = new_coords[1];
9939 }
9940
9941 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9942 // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9943 // ac_build_buffer_load_format_gfx9_safe()
9944
9945 assert(coords.size() == 1);
9946 aco_opcode op;
9947 if (d16) {
9948 switch (util_last_bit(dmask & 0xf)) {
9949 case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9950 case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9951 case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9952 case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9953 default: unreachable("Tex instruction loads more than 4 components.");
9954 }
9955 } else {
9956 switch (util_last_bit(dmask & 0xf)) {
9957 case 1: op = aco_opcode::buffer_load_format_x; break;
9958 case 2: op = aco_opcode::buffer_load_format_xy; break;
9959 case 3: op = aco_opcode::buffer_load_format_xyz; break;
9960 case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9961 default: unreachable("Tex instruction loads more than 4 components.");
9962 }
9963 }
9964
9965 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9966 mubuf->operands[0] = Operand(resource);
9967 mubuf->operands[1] = Operand(coords[0]);
9968 mubuf->operands[2] = Operand::c32(0);
9969 mubuf->definitions[0] = Definition(tmp_dst);
9970 mubuf->mubuf().idxen = true;
9971 mubuf->mubuf().tfe = instr->is_sparse;
9972 if (mubuf->mubuf().tfe)
9973 mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9974 ctx->block->instructions.emplace_back(std::move(mubuf));
9975
9976 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9977 return;
9978 }
9979
9980 /* gather MIMG address components */
9981 std::vector<Temp> args;
9982 if (has_wqm_coord) {
9983 args.emplace_back(wqm_coord);
9984 if (!(ctx->block->kind & block_kind_top_level))
9985 ctx->unended_linear_vgprs.push_back(wqm_coord);
9986 }
9987 if (has_offset)
9988 args.emplace_back(offset);
9989 if (has_bias)
9990 args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9991 if (has_compare)
9992 args.emplace_back(compare);
9993 if (has_derivs)
9994 args.insert(args.end(), derivs.begin(), derivs.end());
9995
9996 args.insert(args.end(), coords.begin(), coords.end());
9997
9998 if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9999 instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
10000 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
10001 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
10002 ? aco_opcode::image_load
10003 : aco_opcode::image_load_mip;
10004 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
10005 MIMG_instruction* tex = emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, vdata);
10006 if (instr->op == nir_texop_fragment_mask_fetch_amd)
10007 tex->dim = da ? ac_image_2darray : ac_image_2d;
10008 else
10009 tex->dim = dim;
10010 tex->dmask = dmask & 0xf;
10011 tex->unrm = true;
10012 tex->da = da;
10013 tex->tfe = instr->is_sparse;
10014 tex->d16 = d16;
10015 tex->a16 = a16;
10016
10017 if (instr->op == nir_texop_fragment_mask_fetch_amd) {
10018 /* Use 0x76543210 if the image doesn't have FMASK. */
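/* 0x76543210 is the identity mapping: sample i reads fragment i. */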
10019 assert(dmask == 1 && dst.bytes() == 4);
10020 assert(dst.id() != tmp_dst.id());
10021
10022 if (dst.regClass() == s1) {
10023 Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
10024 emit_extract_vector(ctx, resource, 1, s1));
10025 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
10026 Operand::c32(0x76543210), bld.scc(is_not_null));
10027 } else {
10028 Temp is_not_null = bld.tmp(bld.lm);
10029 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
10030 emit_extract_vector(ctx, resource, 1, s1));
10031 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
10032 bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
10033 }
10034 } else {
10035 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
10036 }
10037 return;
10038 }
10039
10040 bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
10041
10042 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
10043 aco_opcode opcode = aco_opcode::image_sample;
10044 if (has_offset) { /* image_sample_*_o */
10045 if (has_clamped_lod) {
10046 if (has_compare) {
10047 opcode = aco_opcode::image_sample_c_cl_o;
10048 if (separate_g16)
10049 opcode = aco_opcode::image_sample_c_d_cl_o_g16;
10050 else if (has_derivs)
10051 opcode = aco_opcode::image_sample_c_d_cl_o;
10052 if (has_bias)
10053 opcode = aco_opcode::image_sample_c_b_cl_o;
10054 } else {
10055 opcode = aco_opcode::image_sample_cl_o;
10056 if (separate_g16)
10057 opcode = aco_opcode::image_sample_d_cl_o_g16;
10058 else if (has_derivs)
10059 opcode = aco_opcode::image_sample_d_cl_o;
10060 if (has_bias)
10061 opcode = aco_opcode::image_sample_b_cl_o;
10062 }
10063 } else if (has_compare) {
10064 opcode = aco_opcode::image_sample_c_o;
10065 if (separate_g16)
10066 opcode = aco_opcode::image_sample_c_d_o_g16;
10067 else if (has_derivs)
10068 opcode = aco_opcode::image_sample_c_d_o;
10069 if (has_bias)
10070 opcode = aco_opcode::image_sample_c_b_o;
10071 if (level_zero)
10072 opcode = aco_opcode::image_sample_c_lz_o;
10073 if (has_lod)
10074 opcode = aco_opcode::image_sample_c_l_o;
10075 } else {
10076 opcode = aco_opcode::image_sample_o;
10077 if (separate_g16)
10078 opcode = aco_opcode::image_sample_d_o_g16;
10079 else if (has_derivs)
10080 opcode = aco_opcode::image_sample_d_o;
10081 if (has_bias)
10082 opcode = aco_opcode::image_sample_b_o;
10083 if (level_zero)
10084 opcode = aco_opcode::image_sample_lz_o;
10085 if (has_lod)
10086 opcode = aco_opcode::image_sample_l_o;
10087 }
10088 } else if (has_clamped_lod) { /* image_sample_*_cl */
10089 if (has_compare) {
10090 opcode = aco_opcode::image_sample_c_cl;
10091 if (separate_g16)
10092 opcode = aco_opcode::image_sample_c_d_cl_g16;
10093 else if (has_derivs)
10094 opcode = aco_opcode::image_sample_c_d_cl;
10095 if (has_bias)
10096 opcode = aco_opcode::image_sample_c_b_cl;
10097 } else {
10098 opcode = aco_opcode::image_sample_cl;
10099 if (separate_g16)
10100 opcode = aco_opcode::image_sample_d_cl_g16;
10101 else if (has_derivs)
10102 opcode = aco_opcode::image_sample_d_cl;
10103 if (has_bias)
10104 opcode = aco_opcode::image_sample_b_cl;
10105 }
10106 } else { /* no offset */
10107 if (has_compare) {
10108 opcode = aco_opcode::image_sample_c;
10109 if (separate_g16)
10110 opcode = aco_opcode::image_sample_c_d_g16;
10111 else if (has_derivs)
10112 opcode = aco_opcode::image_sample_c_d;
10113 if (has_bias)
10114 opcode = aco_opcode::image_sample_c_b;
10115 if (level_zero)
10116 opcode = aco_opcode::image_sample_c_lz;
10117 if (has_lod)
10118 opcode = aco_opcode::image_sample_c_l;
10119 } else {
10120 opcode = aco_opcode::image_sample;
10121 if (separate_g16)
10122 opcode = aco_opcode::image_sample_d_g16;
10123 else if (has_derivs)
10124 opcode = aco_opcode::image_sample_d;
10125 if (has_bias)
10126 opcode = aco_opcode::image_sample_b;
10127 if (level_zero)
10128 opcode = aco_opcode::image_sample_lz;
10129 if (has_lod)
10130 opcode = aco_opcode::image_sample_l;
10131 }
10132 }
10133
10134 if (instr->op == nir_texop_tg4) {
10135 /* GFX11 supports implicit LOD, but the extension is unsupported. */
10136 assert(level_zero || ctx->options->gfx_level < GFX11);
10137
10138 if (has_offset) { /* image_gather4_*_o */
10139 if (has_compare) {
10140 opcode = aco_opcode::image_gather4_c_o;
10141 if (level_zero)
10142 opcode = aco_opcode::image_gather4_c_lz_o;
10143 if (has_lod)
10144 opcode = aco_opcode::image_gather4_c_l_o;
10145 if (has_bias)
10146 opcode = aco_opcode::image_gather4_c_b_o;
10147 } else {
10148 opcode = aco_opcode::image_gather4_o;
10149 if (level_zero)
10150 opcode = aco_opcode::image_gather4_lz_o;
10151 if (has_lod)
10152 opcode = aco_opcode::image_gather4_l_o;
10153 if (has_bias)
10154 opcode = aco_opcode::image_gather4_b_o;
10155 }
10156 } else {
10157 if (has_compare) {
10158 opcode = aco_opcode::image_gather4_c;
10159 if (level_zero)
10160 opcode = aco_opcode::image_gather4_c_lz;
10161 if (has_lod)
10162 opcode = aco_opcode::image_gather4_c_l;
10163 if (has_bias)
10164 opcode = aco_opcode::image_gather4_c_b;
10165 } else {
10166 opcode = aco_opcode::image_gather4;
10167 if (level_zero)
10168 opcode = aco_opcode::image_gather4_lz;
10169 if (has_lod)
10170 opcode = aco_opcode::image_gather4_l;
10171 if (has_bias)
10172 opcode = aco_opcode::image_gather4_b;
10173 }
10174 }
10175 } else if (instr->op == nir_texop_lod) {
10176 opcode = aco_opcode::image_get_lod;
10177 }
10178
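/* Without explicit derivatives or LOD, the sample instruction computes derivatives from
 * neighbouring lanes, so the whole quad must be enabled (WQM). */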
10179 bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
10180 !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
10181 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
10182
10183 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
10184 MIMG_instruction* tex = emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
10185 tex->dim = dim;
10186 tex->dmask = dmask & 0xf;
10187 tex->da = da;
10188 tex->unrm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
10189 tex->tfe = instr->is_sparse;
10190 tex->d16 = d16;
10191 tex->a16 = a16;
10192 if (implicit_derivs)
10193 set_wqm(ctx, true);
10194
10195 if (tg4_integer_cube_workaround) {
10196 assert(tmp_dst.id() != dst.id());
10197 assert(tmp_dst.size() == dst.size());
10198
10199 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
10200 Temp val[4];
10201 for (unsigned i = 0; i < 4; i++) {
10202 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
10203 Temp cvt_val;
10204 if (instr->dest_type & nir_type_uint)
10205 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
10206 else
10207 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
10208 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
10209 tg4_compare_cube_wa64);
10210 }
10211
10212 Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
10213 if (instr->is_sparse)
10214 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
10215 val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
10216 else
10217 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
10218 val[3]);
10219 }
10220 unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
10221 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
10222 }
10223
10224 Operand
10225 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc)
10226 {
10227 Temp tmp = get_ssa_temp(ctx, ssa);
10228 if (ssa->parent_instr->type == nir_instr_type_undef) {
10229 return Operand(rc);
10230 } else if (ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
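/* Booleans are lane masks: a constant true becomes an all-ones mask. */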
10231 bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
10232 return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
10233 } else {
10234 return Operand(tmp);
10235 }
10236 }
10237
10238 void
10239 visit_phi(isel_context* ctx, nir_phi_instr* instr)
10240 {
10241 Temp dst = get_ssa_temp(ctx, &instr->def);
10242 assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
10243 aco_opcode opcode = instr->def.bit_size == 1 ? aco_opcode::p_boolean_phi : aco_opcode::p_phi;
10244
10245 /* we want a sorted list of sources, since the predecessor list is also sorted */
10246 std::map<unsigned, nir_def*> phi_src;
10247 nir_foreach_phi_src (src, instr)
10248 phi_src[src->pred->index] = src->src.ssa;
10249
10250 Instruction* phi = create_instruction(opcode, Format::PSEUDO, phi_src.size(), 1);
10251 unsigned i = 0;
10252 for (std::pair<unsigned, nir_def*> src : phi_src)
10253 phi->operands[i++] = get_phi_operand(ctx, src.second, dst.regClass());
10254 phi->definitions[0] = Definition(dst);
10255 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
10256 }
10257
10258 void
10259 visit_undef(isel_context* ctx, nir_undef_instr* instr)
10260 {
10261 Temp dst = get_ssa_temp(ctx, &instr->def);
10262
10263 assert(dst.type() == RegType::sgpr);
10264
10265 if (dst.size() == 1) {
10266 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
10267 } else {
10268 aco_ptr<Instruction> vec{
10269 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
10270 for (unsigned i = 0; i < dst.size(); i++)
10271 vec->operands[i] = Operand::zero();
10272 vec->definitions[0] = Definition(dst);
10273 ctx->block->instructions.emplace_back(std::move(vec));
10274 }
10275 }
10276
10277 void
10278 begin_loop(isel_context* ctx, loop_context* lc)
10279 {
10280 // TODO: we might want to wrap the loop around a branch if exec.potentially_empty=true
10281 append_logical_end(ctx->block);
10282 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
10283 Builder bld(ctx->program, ctx->block);
10284 bld.branch(aco_opcode::p_branch, bld.def(s2));
10285 unsigned loop_preheader_idx = ctx->block->index;
10286
10287 lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
10288
10289 ctx->program->next_loop_depth++;
10290
10291 Block* loop_header = ctx->program->create_and_insert_block();
10292 loop_header->kind |= block_kind_loop_header;
10293 add_edge(loop_preheader_idx, loop_header);
10294 ctx->block = loop_header;
10295
10296 append_logical_start(ctx->block);
10297
10298 lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
10299 lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
10300 lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
10301 lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
10302 lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
10303 }
10304
10305 void
10306 update_exec_info(isel_context* ctx)
10307 {
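/* Clear the potentially-empty-exec flags once control flow has left the region
 * (loop nest depth or divergent if) in which the discard, break or continue occurred. */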
10308 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10309 ctx->cf_info.exec.potentially_empty_discard = false;
10310
10311 ctx->cf_info.exec.potentially_empty_break &=
10312 ctx->block->loop_nest_depth >= ctx->cf_info.exec.potentially_empty_break_depth;
10313 ctx->cf_info.exec.potentially_empty_continue &=
10314 ctx->block->loop_nest_depth >= ctx->cf_info.exec.potentially_empty_continue_depth;
10315
10316 if (ctx->block->loop_nest_depth == ctx->cf_info.exec.potentially_empty_break_depth &&
10317 !ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.parent_loop.has_divergent_continue) {
10318 ctx->cf_info.exec.potentially_empty_break = false;
10319 }
10320 if (ctx->block->loop_nest_depth == ctx->cf_info.exec.potentially_empty_continue_depth &&
10321 !ctx->cf_info.parent_if.is_divergent) {
10322 ctx->cf_info.exec.potentially_empty_continue = false;
10323 }
10324
10325 if (!ctx->cf_info.exec.potentially_empty_break)
10326 ctx->cf_info.exec.potentially_empty_break_depth = UINT16_MAX;
10327 if (!ctx->cf_info.exec.potentially_empty_continue)
10328 ctx->cf_info.exec.potentially_empty_continue_depth = UINT16_MAX;
10329 }
10330
10331 void
10332 end_loop(isel_context* ctx, loop_context* lc)
10333 {
10334 // TODO: what if a loop ends with an unconditional or uniformly branched continue
10335 // and this branch is never taken?
10336 if (!ctx->cf_info.has_branch) {
10337 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10338 Builder bld(ctx->program, ctx->block);
10339 append_logical_end(ctx->block);
10340
10341 /* No need to check exec.potentially_empty_break/continue originating inside the loop. In the
10342 * only case where it's possible at this point (divergent break after divergent continue), we
10343 * should continue anyway. */
10344 if (ctx->cf_info.exec.potentially_empty_discard ||
10345 (ctx->cf_info.exec.potentially_empty_break &&
10346 ctx->cf_info.exec.potentially_empty_break_depth < ctx->block->loop_nest_depth) ||
10347 (ctx->cf_info.exec.potentially_empty_continue &&
10348 ctx->cf_info.exec.potentially_empty_continue_depth < ctx->block->loop_nest_depth)) {
10349 /* Discards can result in code running with an empty exec mask.
10350 * This would result in divergent breaks never being taken. As a
10351 * workaround, break the loop when the loop mask is empty instead of
10352 * always continuing. */
10353 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
10354 unsigned block_idx = ctx->block->index;
10355
10356 /* create helper blocks to avoid critical edges */
10357 Block* break_block = ctx->program->create_and_insert_block();
10358 break_block->kind = block_kind_uniform;
10359 bld.reset(break_block);
10360 bld.branch(aco_opcode::p_branch, bld.def(s2));
10361 add_linear_edge(block_idx, break_block);
10362 add_linear_edge(break_block->index, &lc->loop_exit);
10363
10364 Block* continue_block = ctx->program->create_and_insert_block();
10365 continue_block->kind = block_kind_uniform;
10366 bld.reset(continue_block);
10367 bld.branch(aco_opcode::p_branch, bld.def(s2));
10368 add_linear_edge(block_idx, continue_block);
10369 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10370
10371 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10372 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10373 ctx->block = &ctx->program->blocks[block_idx];
10374 } else {
10375 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10376 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10377 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10378 else
10379 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10380 }
10381
10382 bld.reset(ctx->block);
10383 bld.branch(aco_opcode::p_branch, bld.def(s2));
10384 }
10385
10386 ctx->cf_info.has_branch = false;
10387 ctx->program->next_loop_depth--;
10388
10389 /* emit loop successor block */
10390 ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10391 append_logical_start(ctx->block);
10392
10393 ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10394 ctx->cf_info.parent_loop.exit = lc->exit_old;
10395 ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10396 ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10397 ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10398 update_exec_info(ctx);
10399 }
10400
10401 void
10402 emit_loop_jump(isel_context* ctx, bool is_break)
10403 {
10404 Builder bld(ctx->program, ctx->block);
10405 Block* logical_target;
10406 append_logical_end(ctx->block);
10407 unsigned idx = ctx->block->index;
10408
10409 if (is_break) {
10410 logical_target = ctx->cf_info.parent_loop.exit;
10411 add_logical_edge(idx, logical_target);
10412 ctx->block->kind |= block_kind_break;
10413
10414 if (!ctx->cf_info.parent_if.is_divergent &&
10415 !ctx->cf_info.parent_loop.has_divergent_continue) {
10416 /* uniform break - directly jump out of the loop */
10417 ctx->block->kind |= block_kind_uniform;
10418 ctx->cf_info.has_branch = true;
10419 bld.branch(aco_opcode::p_branch, bld.def(s2));
10420 add_linear_edge(idx, logical_target);
10421 return;
10422 }
10423 ctx->cf_info.parent_loop.has_divergent_branch = true;
10424
10425 if (!ctx->cf_info.exec.potentially_empty_break) {
10426 ctx->cf_info.exec.potentially_empty_break = true;
10427 ctx->cf_info.exec.potentially_empty_break_depth = ctx->block->loop_nest_depth;
10428 }
10429 } else {
10430 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10431 add_logical_edge(idx, logical_target);
10432 ctx->block->kind |= block_kind_continue;
10433
10434 if (!ctx->cf_info.parent_if.is_divergent) {
10435 /* uniform continue - directly jump to the loop header */
10436 ctx->block->kind |= block_kind_uniform;
10437 ctx->cf_info.has_branch = true;
10438 bld.branch(aco_opcode::p_branch, bld.def(s2));
10439 add_linear_edge(idx, logical_target);
10440 return;
10441 }
10442
10443 /* for potential uniform breaks after this continue,
10444 we must ensure that they are handled correctly */
10445 ctx->cf_info.parent_loop.has_divergent_continue = true;
10446 ctx->cf_info.parent_loop.has_divergent_branch = true;
10447
10448 if (!ctx->cf_info.exec.potentially_empty_continue) {
10449 ctx->cf_info.exec.potentially_empty_continue = true;
10450 ctx->cf_info.exec.potentially_empty_continue_depth = ctx->block->loop_nest_depth;
10451 }
10452 }
10453
10454 /* remove critical edges from linear CFG */
10455 bld.branch(aco_opcode::p_branch, bld.def(s2));
10456 Block* break_block = ctx->program->create_and_insert_block();
10457 break_block->kind |= block_kind_uniform;
10458 add_linear_edge(idx, break_block);
10459 /* the loop_header pointer might be invalidated by this point */
10460 if (!is_break)
10461 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10462 add_linear_edge(break_block->index, logical_target);
10463 bld.reset(break_block);
10464 bld.branch(aco_opcode::p_branch, bld.def(s2));
10465
10466 Block* continue_block = ctx->program->create_and_insert_block();
10467 add_linear_edge(idx, continue_block);
10468 append_logical_start(continue_block);
10469 ctx->block = continue_block;
10470 }
10471
10472 void
10473 emit_loop_break(isel_context* ctx)
10474 {
10475 emit_loop_jump(ctx, true);
10476 }
10477
10478 void
10479 emit_loop_continue(isel_context* ctx)
10480 {
10481 emit_loop_jump(ctx, false);
10482 }
10483
10484 void
10485 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10486 {
10487 switch (instr->type) {
10488 case nir_jump_break: emit_loop_break(ctx); break;
10489 case nir_jump_continue: emit_loop_continue(ctx); break;
10490 default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10491 }
10492 }
10493
10494 void
10495 visit_block(isel_context* ctx, nir_block* block)
10496 {
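/* End linear VGPRs that were started inside divergent control flow once a
 * top-level block is reached again. */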
10497 if (ctx->block->kind & block_kind_top_level) {
10498 Builder bld(ctx->program, ctx->block);
10499 for (Temp tmp : ctx->unended_linear_vgprs) {
10500 bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
10501 }
10502 ctx->unended_linear_vgprs.clear();
10503 }
10504
10505 ctx->block->instructions.reserve(ctx->block->instructions.size() +
10506 exec_list_length(&block->instr_list) * 2);
10507 nir_foreach_instr (instr, block) {
10508 switch (instr->type) {
10509 case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10510 case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10511 case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10512 case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10513 case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10514 case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10515 case nir_instr_type_deref: break;
10516 case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10517 default: isel_err(instr, "Unknown NIR instr type");
10518 }
10519 }
10520 }
10521
10522 static bool
10523 all_uses_inside_loop(nir_def* def, nir_block* block_before_loop, nir_block* block_after_loop)
10524 {
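/* Returns true if every use of def (including if-conditions) lies between the blocks
 * before and after the loop; phis in the block directly after the loop are also allowed. */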
10525 nir_foreach_use_including_if (use, def) {
10526 if (nir_src_is_if(use)) {
10527 nir_block* branch_block =
10528 nir_cf_node_as_block(nir_cf_node_prev(&nir_src_parent_if(use)->cf_node));
10529 if (branch_block->index <= block_before_loop->index || branch_block->index >= block_after_loop->index)
10530 return false;
10531 } else {
10532 nir_instr* instr = nir_src_parent_instr(use);
10533 if ((instr->block->index <= block_before_loop->index || instr->block->index >= block_after_loop->index) &&
10534 !(instr->type == nir_instr_type_phi && instr->block == block_after_loop)) {
10535 return false;
10536 }
10537 }
10538 }
10539
10540 return true;
10541 }
10542
10543 Temp
10544 rename_temp(const std::map<unsigned, unsigned>& renames, Temp tmp)
10545 {
10546 auto it = renames.find(tmp.id());
10547 if (it != renames.end())
10548 return Temp(it->second, tmp.regClass());
10549 return tmp;
10550 }
10551
10552 static void
10553 lcssa_workaround(isel_context* ctx, nir_loop* loop)
10554 {
10555 nir_block* block_before_loop = nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node));
10556 nir_block* block_after_loop = nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node));
10557
10558 std::map<unsigned, unsigned> renames;
10559 nir_foreach_block_in_cf_node (block, &loop->cf_node) {
10560 /* These values are reachable from the loop exit even when continue_or_break is used. We
10561 * shouldn't create phis with undef operands in case the contents are important even if exec
10562 * is zero (for example, memory access addresses). */
10563 if (nir_block_dominates(block, nir_loop_last_block(loop)))
10564 continue;
10565
10566 /* Definitions in this block are not reachable from the loop exit, and so all uses are inside
10567 * the loop. */
10568 if (!nir_block_dominates(block, block_after_loop))
10569 continue;
10570
10571 nir_foreach_instr (instr, block) {
10572 nir_def* def = nir_instr_def(instr);
10573 if (!def)
10574 continue;
10575
10576 Temp tmp = get_ssa_temp(ctx, def);
10577 if (!tmp.is_linear() || all_uses_inside_loop(def, block_before_loop, block_after_loop))
10578 continue;
10579
10580 Temp new_tmp = ctx->program->allocateTmp(tmp.regClass());
10581 aco_ptr<Instruction> phi(create_instruction(aco_opcode::p_phi, Format::PSEUDO,
10582 ctx->block->logical_preds.size(), 1));
10583 for (unsigned i = 0; i < ctx->block->logical_preds.size(); i++)
10584 phi->operands[i] = Operand(new_tmp);
10585 phi->definitions[0] = Definition(tmp);
10586 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
10587
10588 renames.emplace(tmp.id(), new_tmp.id());
10589 }
10590 }
10591
10592 if (renames.empty())
10593 return;
10594
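/* Rename definitions and uses inside the loop so that the original temps are only
 * written by the newly created phis. */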
10595 for (unsigned i = ctx->block->index - 1;
10596 ctx->program->blocks[i].loop_nest_depth > ctx->block->loop_nest_depth; i--) {
10597 for (aco_ptr<Instruction>& instr : ctx->program->blocks[i].instructions) {
10598 for (Definition& def : instr->definitions) {
10599 if (def.isTemp())
10600 def.setTemp(rename_temp(renames, def.getTemp()));
10601 }
10602 for (Operand& op : instr->operands) {
10603 if (op.isTemp())
10604 op.setTemp(rename_temp(renames, op.getTemp()));
10605 }
10606 }
10607 }
10608 }
10609
10610 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10611 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10612 static void end_uniform_if(isel_context* ctx, if_context* ic);
10613
10614 static void
10615 visit_loop(isel_context* ctx, nir_loop* loop)
10616 {
10617 assert(!nir_loop_has_continue_construct(loop));
10618 loop_context lc;
10619 begin_loop(ctx, &lc);
10620
10621 visit_cf_list(ctx, &loop->body);
10622
10623 end_loop(ctx, &lc);
10624
10625 /* Create extra LCSSA phis for continue_or_break */
10626 if (ctx->block->linear_preds.size() > ctx->block->logical_preds.size())
10627 lcssa_workaround(ctx, loop);
10628 }
10629
10630 static void
10631 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10632 nir_selection_control sel_ctrl = nir_selection_control_none)
10633 {
10634 ic->cond = cond;
10635
10636 append_logical_end(ctx->block);
10637 ctx->block->kind |= block_kind_branch;
10638
10639 /* branch to linear then block */
10640 assert(cond.regClass() == ctx->program->lane_mask);
10641 aco_ptr<Instruction> branch;
10642 branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 1));
10643 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10644 branch->operands[0] = Operand(cond);
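/* With the divergent_always_taken hint, at least one lane enters the then-side, so the
 * skip branch can only be taken if exec might already be empty. */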
10645 bool never_taken =
10646 sel_ctrl == nir_selection_control_divergent_always_taken &&
10647 !(ctx->cf_info.exec.potentially_empty_discard || ctx->cf_info.exec.potentially_empty_break ||
10648 ctx->cf_info.exec.potentially_empty_continue);
10649 branch->branch().rarely_taken = sel_ctrl == nir_selection_control_flatten || never_taken;
10650 branch->branch().never_taken = never_taken;
10651 ctx->block->instructions.push_back(std::move(branch));
10652
10653 ic->BB_if_idx = ctx->block->index;
10654 ic->BB_invert = Block();
10655 /* Invert blocks are intentionally not marked as top level because they
10656 * are not part of the logical cfg. */
10657 ic->BB_invert.kind |= block_kind_invert;
10658 ic->BB_endif = Block();
10659 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10660
10661 ic->exec_old = ctx->cf_info.exec;
10662 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10663 ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10664 ctx->cf_info.parent_if.is_divergent = true;
10665
10666 /* divergent branches use cbranch_execz */
10667 ctx->cf_info.exec = exec_info();
10668
10669 /** emit logical then block */
10670 ctx->program->next_divergent_if_logical_depth++;
10671 Block* BB_then_logical = ctx->program->create_and_insert_block();
10672 add_edge(ic->BB_if_idx, BB_then_logical);
10673 ctx->block = BB_then_logical;
10674 append_logical_start(BB_then_logical);
10675 }
10676
10677 static void
10678 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10679 nir_selection_control sel_ctrl = nir_selection_control_none)
10680 {
10681 Block* BB_then_logical = ctx->block;
10682 append_logical_end(BB_then_logical);
10683 /* branch from logical then block to invert block */
10684 aco_ptr<Instruction> branch;
10685 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10686 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10687 BB_then_logical->instructions.emplace_back(std::move(branch));
10688 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10689 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10690 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10691 BB_then_logical->kind |= block_kind_uniform;
10692 assert(!ctx->cf_info.has_branch);
10693 ctx->cf_info.parent_loop.has_divergent_branch = false;
10694 ctx->program->next_divergent_if_logical_depth--;
10695
10696 /** emit linear then block */
10697 Block* BB_then_linear = ctx->program->create_and_insert_block();
10698 BB_then_linear->kind |= block_kind_uniform;
10699 add_linear_edge(ic->BB_if_idx, BB_then_linear);
10700 /* branch from linear then block to invert block */
10701 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10702 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10703 BB_then_linear->instructions.emplace_back(std::move(branch));
10704 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10705
10706 /** emit invert merge block */
10707 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10708 ic->invert_idx = ctx->block->index;
10709
10710 /* branch to linear else block (skip else) */
10711 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10712 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10713 bool never_taken =
10714 sel_ctrl == nir_selection_control_divergent_always_taken &&
10715 !(ctx->cf_info.exec.potentially_empty_discard || ctx->cf_info.exec.potentially_empty_break ||
10716 ctx->cf_info.exec.potentially_empty_continue);
10717 branch->branch().rarely_taken = sel_ctrl == nir_selection_control_flatten || never_taken;
10718 branch->branch().never_taken = never_taken;
10719 ctx->block->instructions.push_back(std::move(branch));
10720
10721 ic->exec_old.combine(ctx->cf_info.exec);
10722 /* divergent branches use cbranch_execz */
10723 ctx->cf_info.exec = exec_info();
10724
10725 ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10726 ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10727
10728 /** emit logical else block */
10729 ctx->program->next_divergent_if_logical_depth++;
10730 Block* BB_else_logical = ctx->program->create_and_insert_block();
10731 add_logical_edge(ic->BB_if_idx, BB_else_logical);
10732 add_linear_edge(ic->invert_idx, BB_else_logical);
10733 ctx->block = BB_else_logical;
10734 append_logical_start(BB_else_logical);
10735 }
10736
10737 static void
10738 end_divergent_if(isel_context* ctx, if_context* ic)
10739 {
10740 Block* BB_else_logical = ctx->block;
10741 append_logical_end(BB_else_logical);
10742
10743 /* branch from logical else block to endif block */
10744 aco_ptr<Instruction> branch;
10745 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10746 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10747 BB_else_logical->instructions.emplace_back(std::move(branch));
10748 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10749 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10750 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10751 BB_else_logical->kind |= block_kind_uniform;
10752 ctx->program->next_divergent_if_logical_depth--;
10753
10754 assert(!ctx->cf_info.has_branch);
10755 ctx->cf_info.parent_loop.has_divergent_branch = false;
10756
10757 /** emit linear else block */
10758 Block* BB_else_linear = ctx->program->create_and_insert_block();
10759 BB_else_linear->kind |= block_kind_uniform;
10760 add_linear_edge(ic->invert_idx, BB_else_linear);
10761
10762 /* branch from linear else block to endif block */
10763 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10764 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10765 BB_else_linear->instructions.emplace_back(std::move(branch));
10766 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10767
10768 /** emit endif merge block */
10769 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10770 append_logical_start(ctx->block);
10771
10772 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10773 ctx->cf_info.exec.combine(ic->exec_old);
10774 update_exec_info(ctx);
10775 ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10776
10777 /* We shouldn't create unreachable blocks. */
10778 assert(!ctx->block->logical_preds.empty());
10779 }
10780
10781 static void
10782 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10783 {
10784 assert(cond.regClass() == s1);
10785
10786 append_logical_end(ctx->block);
10787 ctx->block->kind |= block_kind_uniform;
10788
10789 aco_ptr<Instruction> branch;
10790 aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10791 branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10792 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10793 branch->operands[0] = Operand(cond);
10794 branch->operands[0].setFixed(scc);
10795 ctx->block->instructions.emplace_back(std::move(branch));
10796
10797 ic->BB_if_idx = ctx->block->index;
10798 ic->BB_endif = Block();
10799 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10800
10801 ctx->cf_info.has_branch = false;
10802 ctx->cf_info.parent_loop.has_divergent_branch = false;
10803
10804 ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10805 ic->has_divergent_continue_old = ctx->cf_info.parent_loop.has_divergent_continue;
10806
10807 /** emit then block */
10808 ctx->program->next_uniform_if_depth++;
10809 Block* BB_then = ctx->program->create_and_insert_block();
10810 add_edge(ic->BB_if_idx, BB_then);
10811 append_logical_start(BB_then);
10812 ctx->block = BB_then;
10813 }
10814
10815 static void
10816 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10817 {
10818 Block* BB_then = ctx->block;
10819
10820 if (!ctx->cf_info.has_branch) {
10821 append_logical_end(BB_then);
10822 /* branch from then block to endif block */
10823 aco_ptr<Instruction> branch;
10824 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10825 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10826 BB_then->instructions.emplace_back(std::move(branch));
10827 add_linear_edge(BB_then->index, &ic->BB_endif);
10828 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10829 add_logical_edge(BB_then->index, &ic->BB_endif);
10830 BB_then->kind |= block_kind_uniform;
10831 }
10832
10833 ctx->cf_info.has_branch = false;
10834 ctx->cf_info.parent_loop.has_divergent_branch = false;
10835
10836 ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10837 ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10838
10839 ic->has_divergent_continue_then = ctx->cf_info.parent_loop.has_divergent_continue;
10840 ctx->cf_info.parent_loop.has_divergent_continue = ic->has_divergent_continue_old;
10841
10842 /** emit else block */
10843 Block* BB_else = ctx->program->create_and_insert_block();
10844 add_edge(ic->BB_if_idx, BB_else);
10845 append_logical_start(BB_else);
10846 ctx->block = BB_else;
10847 }
10848
10849 static void
10850 end_uniform_if(isel_context* ctx, if_context* ic)
10851 {
10852 Block* BB_else = ctx->block;
10853
10854 if (!ctx->cf_info.has_branch) {
10855 append_logical_end(BB_else);
10856 /* branch from then block to endif block */
10857 aco_ptr<Instruction> branch;
10858 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10859 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10860 BB_else->instructions.emplace_back(std::move(branch));
10861 add_linear_edge(BB_else->index, &ic->BB_endif);
10862 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10863 add_logical_edge(BB_else->index, &ic->BB_endif);
10864 BB_else->kind |= block_kind_uniform;
10865 }
10866
10867 ctx->cf_info.has_branch = false;
10868 ctx->cf_info.parent_loop.has_divergent_branch = false;
10869 ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10870 ctx->cf_info.parent_loop.has_divergent_continue |= ic->has_divergent_continue_then;
10871
10872 /** emit endif merge block */
10873 ctx->program->next_uniform_if_depth--;
10874 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10875 append_logical_start(ctx->block);
10876
10877 /* We shouldn't create unreachable blocks. */
10878 assert(!ctx->block->logical_preds.empty());
10879 }
10880
10881 static void
10882 visit_if(isel_context* ctx, nir_if* if_stmt)
10883 {
10884 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10885 Builder bld(ctx->program, ctx->block);
10886 aco_ptr<Instruction> branch;
10887 if_context ic;
10888
10889 if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10890 /**
10891 * Uniform conditionals are represented in the following way*) :
10892 *
10893 * The linear and logical CFG:
10894 * BB_IF
10895 * / \
10896 * BB_THEN (logical) BB_ELSE (logical)
10897 * \ /
10898 * BB_ENDIF
10899 *
10900 * *) Exceptions may be due to break and continue statements within loops
10901 * If a break/continue happens within uniform control flow, it branches
10902 * to the loop exit/entry block. Otherwise, it branches to the next
10903 * merge block.
10904 **/
10905
10906 assert(cond.regClass() == ctx->program->lane_mask);
10907 cond = bool_to_scalar_condition(ctx, cond);
10908
10909 begin_uniform_if_then(ctx, &ic, cond);
10910 visit_cf_list(ctx, &if_stmt->then_list);
10911
10912 begin_uniform_if_else(ctx, &ic);
10913 visit_cf_list(ctx, &if_stmt->else_list);
10914
10915 end_uniform_if(ctx, &ic);
10916 } else { /* non-uniform condition */
10917 /**
10918 * To maintain a logical and linear CFG without critical edges,
10919 * non-uniform conditionals are represented in the following way*) :
10920 *
10921 * The linear CFG:
10922 * BB_IF
10923 * / \
10924 * BB_THEN (logical) BB_THEN (linear)
10925 * \ /
10926 * BB_INVERT (linear)
10927 * / \
10928 * BB_ELSE (logical) BB_ELSE (linear)
10929 * \ /
10930 * BB_ENDIF
10931 *
10932 * The logical CFG:
10933 * BB_IF
10934 * / \
10935 * BB_THEN (logical) BB_ELSE (logical)
10936 * \ /
10937 * BB_ENDIF
10938 *
10939 * *) Exceptions may be due to break and continue statements within loops
10940 **/
10941
10942 begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10943 visit_cf_list(ctx, &if_stmt->then_list);
10944
10945 begin_divergent_if_else(ctx, &ic, if_stmt->control);
10946 visit_cf_list(ctx, &if_stmt->else_list);
10947
10948 end_divergent_if(ctx, &ic);
10949 }
10950 }
10951
10952 static void
10953 visit_cf_list(isel_context* ctx, struct exec_list* list)
10954 {
10955 foreach_list_typed (nir_cf_node, node, node, list) {
10956 switch (node->type) {
10957 case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10958 case nir_cf_node_if: visit_if(ctx, nir_cf_node_as_if(node)); break;
10959 case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10960 default: unreachable("unimplemented cf list type");
10961 }
10962 }
10963 }
10964
10965 static void
10966 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10967 {
10968 Builder bld(ctx->program, ctx->block);
10969
10970 bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10971 mrt->enabled_channels, mrt->target, mrt->compr);
10972
10973 ctx->program->has_color_exports = true;
10974 }
10975
10976 static bool
10977 export_fs_mrt_color(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
10978 unsigned slot, struct aco_export_mrt* mrt)
10979 {
10980 unsigned col_format = (info->spi_shader_col_format >> (slot * 4)) & 0xf;
10981
10982 if (col_format == V_028714_SPI_SHADER_ZERO)
10983 return false;
10984
10985 Builder bld(ctx->program, ctx->block);
10986 Operand values[4];
10987
10988 for (unsigned i = 0; i < 4; ++i) {
10989 values[i] = Operand(colors[i]);
10990 }
10991
10992 unsigned enabled_channels = 0;
10993 aco_opcode compr_op = aco_opcode::num_opcodes;
10994 bool compr = false;
10995 bool is_16bit = colors[0].regClass() == v2b;
10996 bool is_int8 = (info->color_is_int8 >> slot) & 1;
10997 bool is_int10 = (info->color_is_int10 >> slot) & 1;
10998 bool enable_mrt_output_nan_fixup = (ctx->options->enable_mrt_output_nan_fixup >> slot) & 1;
10999
11000 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
11001 if (enable_mrt_output_nan_fixup && !is_16bit &&
11002 (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
11003 col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
11004 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
11005 for (unsigned i = 0; i < 4; i++) {
11006 Temp is_not_nan =
11007 bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
11008 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
11009 is_not_nan);
11010 }
11011 }
11012
11013 switch (col_format) {
11014 case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
11015
11016 case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
11017
11018 case V_028714_SPI_SHADER_32_AR:
11019 if (ctx->options->gfx_level >= GFX10) {
11020 /* Special case: on GFX10, the outputs are different for 32_AR */
11021 enabled_channels = 0x3;
11022 values[1] = values[3];
11023 values[3] = Operand(v1);
11024 } else {
11025 enabled_channels = 0x9;
11026 }
11027 break;
11028
11029 case V_028714_SPI_SHADER_FP16_ABGR:
11030 for (int i = 0; i < 2; i++) {
11031 if (is_16bit) {
11032 values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), values[i * 2],
11033 values[i * 2 + 1]);
11034 } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
11035 values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), values[i * 2],
11036 values[i * 2 + 1]);
11037 } else {
11038 values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), values[i * 2],
11039 values[i * 2 + 1]);
11040 }
11041 }
11042 values[2] = Operand(v1);
11043 values[3] = Operand(v1);
11044 enabled_channels = 0xf;
11045 compr = true;
11046 break;
11047
11048 case V_028714_SPI_SHADER_UNORM16_ABGR:
11049 if (is_16bit && ctx->options->gfx_level >= GFX9) {
11050 compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
11051 } else {
11052 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
11053 }
11054 break;
11055
11056 case V_028714_SPI_SHADER_SNORM16_ABGR:
11057 if (is_16bit && ctx->options->gfx_level >= GFX9) {
11058 compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
11059 } else {
11060 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
11061 }
11062 break;
11063
11064 case V_028714_SPI_SHADER_UINT16_ABGR:
11065 compr_op = aco_opcode::v_cvt_pk_u16_u32;
11066 if (is_int8 || is_int10) {
11067 /* clamp */
11068 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
11069
11070 for (unsigned i = 0; i < 4; i++) {
11071 uint32_t max = i == 3 && is_int10 ? 3 : max_rgb;
11072
11073 values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
11074 }
11075 } else if (is_16bit) {
11076 for (unsigned i = 0; i < 4; i++) {
11077 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
11078 values[i] = Operand(tmp);
11079 }
11080 }
11081 break;
11082
11083 case V_028714_SPI_SHADER_SINT16_ABGR:
11084 compr_op = aco_opcode::v_cvt_pk_i16_i32;
11085 if (is_int8 || is_int10) {
11086 /* clamp */
11087 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
11088 uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
11089
11090 for (unsigned i = 0; i < 4; i++) {
11091 uint32_t max = i == 3 && is_int10 ? 1 : max_rgb;
11092 uint32_t min = i == 3 && is_int10 ? -2u : min_rgb;
11093
11094 values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
11095 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
11096 }
11097 } else if (is_16bit) {
11098 for (unsigned i = 0; i < 4; i++) {
11099 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
11100 values[i] = Operand(tmp);
11101 }
11102 }
11103 break;
11104
11105 case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
11106
11107 case V_028714_SPI_SHADER_ZERO:
11108 default: return false;
11109 }
11110
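   /* For the packed formats selected above, combine each pair of 32-bit channel values into one
    * dword (2x16) and export the two packed dwords with the COMPR flag set. */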
11111 if (compr_op != aco_opcode::num_opcodes) {
11112 values[0] = bld.vop3(compr_op, bld.def(v1), values[0], values[1]);
11113 values[1] = bld.vop3(compr_op, bld.def(v1), values[2], values[3]);
11114 values[2] = Operand(v1);
11115 values[3] = Operand(v1);
11116 enabled_channels = 0xf;
11117 compr = true;
11118 } else if (!compr) {
11119 for (int i = 0; i < 4; i++)
11120 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
11121 }
11122
11123 if (ctx->program->gfx_level >= GFX11) {
11124       /* GFX11 doesn't use COMPR for exports; when the export would have been
11125        * compressed, a channel mask of 0x3 is used instead.
11126        */
11127 enabled_channels = compr ? 0x3 : enabled_channels;
11128 compr = false;
11129 }
11130
11131 for (unsigned i = 0; i < 4; i++)
11132 mrt->out[i] = values[i];
11133 mrt->target = V_008DFC_SQ_EXP_MRT;
11134 mrt->enabled_channels = enabled_channels;
11135 mrt->compr = compr;
11136
11137 return true;
11138 }
11139
11140 static void
11141 export_fs_mrtz(isel_context* ctx, Temp depth, Temp stencil, Temp samplemask, Temp alpha)
11142 {
11143 Builder bld(ctx->program, ctx->block);
11144 unsigned enabled_channels = 0;
11145 bool compr = false;
11146 Operand values[4];
11147
11148 for (unsigned i = 0; i < 4; ++i) {
11149 values[i] = Operand(v1);
11150 }
11151
11152    /* Both stencil and sample mask only need 16 bits. */
11153 if (!depth.id() && !alpha.id() && (stencil.id() || samplemask.id())) {
11154 compr = ctx->program->gfx_level < GFX11; /* COMPR flag */
11155
11156 if (stencil.id()) {
11157 /* Stencil should be in X[23:16]. */
11158 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), stencil);
11159 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3;
11160 }
11161
11162 if (samplemask.id()) {
11163 /* SampleMask should be in Y[15:0]. */
11164 values[1] = Operand(samplemask);
11165 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc;
11166 }
11167 } else {
11168 if (depth.id()) {
11169 values[0] = Operand(depth);
11170 enabled_channels |= 0x1;
11171 }
11172
11173 if (stencil.id()) {
11174 values[1] = Operand(stencil);
11175 enabled_channels |= 0x2;
11176 }
11177
11178 if (samplemask.id()) {
11179 values[2] = Operand(samplemask);
11180 enabled_channels |= 0x4;
11181 }
11182
11183 if (alpha.id()) {
11184 assert(ctx->program->gfx_level >= GFX11);
11185 values[3] = Operand(alpha);
11186 enabled_channels |= 0x8;
11187 }
11188 }
11189
11190    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
11191     * writemask component.
11192 */
11193 if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND &&
11194 ctx->options->family != CHIP_HAINAN) {
11195 enabled_channels |= 0x1;
11196 }
11197
11198 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
11199 V_008DFC_SQ_EXP_MRTZ, compr);
11200 }
11201
11202 static void
11203 create_fs_null_export(isel_context* ctx)
11204 {
11205 /* FS must always have exports.
11206 * So when there are none, we need to add a null export.
11207 */
11208
11209 Builder bld(ctx->program, ctx->block);
11210 /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */
11211 unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
11212 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
11213 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
11214
11215 ctx->program->has_color_exports = true;
11216 }
11217
11218 static void
11219 create_fs_jump_to_epilog(isel_context* ctx)
11220 {
11221 Builder bld(ctx->program, ctx->block);
11222 std::vector<Operand> exports;
11223 unsigned vgpr = 256; /* VGPR 0 */
11224
11225 if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
11226 exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u], PhysReg{vgpr++}));
11227
11228 if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
11229 exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u], PhysReg{vgpr++}));
11230
11231 if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11232 exports.emplace_back(
11233 Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u], PhysReg{vgpr++}));
11234
11235 PhysReg exports_start(vgpr);
11236
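   /* Color outputs are passed to the epilog as groups of four consecutive VGPRs per MRT slot,
    * starting right after the optional depth/stencil/sample-mask VGPRs assigned above. */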
11237 for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
11238 unsigned color_index = slot - FRAG_RESULT_DATA0;
11239 unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
11240 unsigned write_mask = ctx->outputs.mask[slot];
11241
11242 if (!write_mask)
11243 continue;
11244
11245 PhysReg color_start(exports_start.reg() + color_index * 4);
11246
11247 for (unsigned i = 0; i < 4; i++) {
11248 if (!(write_mask & BITFIELD_BIT(i))) {
11249 exports.emplace_back(Operand(v1));
11250 continue;
11251 }
11252
11253 PhysReg chan_reg = color_start.advance(i * 4u);
11254 Operand chan(ctx->outputs.temps[slot * 4u + i]);
11255
11256 if (color_type == ACO_TYPE_FLOAT16) {
11257 chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
11258 } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
11259 bool sign_ext = color_type == ACO_TYPE_INT16;
11260 Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
11261 chan = Operand(tmp);
11262 }
11263
11264 chan.setFixed(chan_reg);
11265 exports.emplace_back(chan);
11266 }
11267 }
11268
11269 Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.epilog_pc));
11270
11271 aco_ptr<Instruction> jump{
11272 create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + exports.size(), 0)};
11273 jump->operands[0] = Operand(continue_pc);
11274 for (unsigned i = 0; i < exports.size(); i++) {
11275 jump->operands[i + 1] = exports[i];
11276 }
11277 ctx->block->instructions.emplace_back(std::move(jump));
11278 }
11279
11280 PhysReg
11281 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
11282 {
11283 assert(arg.used);
11284 enum ac_arg_regfile file = args->args[arg.arg_index].file;
11285 unsigned reg = args->args[arg.arg_index].offset;
11286 return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
11287 }
11288
11289 static Operand
11290 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
11291 {
11292 return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
11293 }
11294
11295 static void
11296 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
11297 {
11298 struct ac_arg arg;
11299 arg.used = true;
11300
11301 for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
11302 regs.emplace_back(get_arg_for_end(ctx, arg));
11303 }
11304
11305 static void
11306 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
11307 {
11308 aco_ptr<Instruction> end{
11309 create_instruction(aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
11310
11311 for (unsigned i = 0; i < regs.size(); i++)
11312 end->operands[i] = regs[i];
11313
11314 ctx->block->instructions.emplace_back(std::move(end));
11315
11316 ctx->block->kind |= block_kind_end_with_regs;
11317 }
11318
11319 static void
11320 create_fs_end_for_epilog(isel_context* ctx)
11321 {
11322 Builder bld(ctx->program, ctx->block);
11323
11324 std::vector<Operand> regs;
11325
11326 regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.ps.alpha_reference));
11327
11328 unsigned vgpr = 256;
11329
11330 for (unsigned slot = FRAG_RESULT_DATA0; slot <= FRAG_RESULT_DATA7; slot++) {
11331 unsigned index = slot - FRAG_RESULT_DATA0;
11332 unsigned type = (ctx->output_color_types >> (index * 2)) & 0x3;
11333 unsigned write_mask = ctx->outputs.mask[slot];
11334
11335 if (!write_mask)
11336 continue;
11337
11338 if (type == ACO_TYPE_ANY32) {
11339 u_foreach_bit (i, write_mask) {
11340 regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11341 }
11342 } else {
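         /* 16-bit outputs: pack each pair of written 16-bit channels into a single 32-bit VGPR
          * before handing them to the epilog. */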
11343 for (unsigned i = 0; i < 2; i++) {
11344 unsigned mask = (write_mask >> (i * 2)) & 0x3;
11345 if (!mask)
11346 continue;
11347
11348 unsigned chan = slot * 4 + i * 2;
11349 Operand lo = mask & 0x1 ? Operand(ctx->outputs.temps[chan]) : Operand(v2b);
11350 Operand hi = mask & 0x2 ? Operand(ctx->outputs.temps[chan + 1]) : Operand(v2b);
11351
11352 Temp dst = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi);
11353 regs.emplace_back(Operand(dst, PhysReg{vgpr + i}));
11354 }
11355 }
11356 vgpr += 4;
11357 }
11358
11359 if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
11360 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4], PhysReg{vgpr++}));
11361
11362 if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
11363 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4], PhysReg{vgpr++}));
11364
11365 if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11366 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4], PhysReg{vgpr++}));
11367
11368 build_end_with_regs(ctx, regs);
11369
11370 /* Exit WQM mode finally. */
11371 ctx->program->needs_exact = true;
11372 }
11373
11374 Instruction*
11375 add_startpgm(struct isel_context* ctx)
11376 {
11377 unsigned def_count = 0;
11378 for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11379 if (ctx->args->args[i].skip)
11380 continue;
11381 unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
11382 if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
11383 def_count += ctx->args->args[i].size;
11384 else
11385 def_count++;
11386 }
11387
11388 if (ctx->stage.hw == AC_HW_COMPUTE_SHADER && ctx->program->gfx_level >= GFX12)
11389 def_count += 3;
11390
11391 Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
11392 ctx->block->instructions.emplace_back(startpgm);
11393 for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
11394 if (ctx->args->args[i].skip)
11395 continue;
11396
11397 enum ac_arg_regfile file = ctx->args->args[i].file;
11398 unsigned size = ctx->args->args[i].size;
11399 unsigned reg = ctx->args->args[i].offset;
11400 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11401
11402 if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
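         /* SGPR arguments that are not aligned to their natural size can't be defined as one
          * contiguous register class, so define each dword separately and reassemble them. */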
11403 Temp elems[16];
11404 for (unsigned j = 0; j < size; j++) {
11405 elems[j] = ctx->program->allocateTmp(s1);
11406 startpgm->definitions[arg++] = Definition(elems[j].id(), PhysReg{reg + j}, s1);
11407 }
11408 ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
11409 } else {
11410 Temp dst = ctx->program->allocateTmp(type);
11411 Definition def(dst);
11412 def.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11413 ctx->arg_temps[i] = dst;
11414 startpgm->definitions[arg++] = def;
11415
11416 if (ctx->args->args[i].pending_vmem) {
11417 assert(file == AC_ARG_VGPR);
11418 ctx->program->args_pending_vmem.push_back(def);
11419 }
11420 }
11421 }
11422
11423 if (ctx->program->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
11424 Temp idx = ctx->program->allocateTmp(s1);
11425 Temp idy = ctx->program->allocateTmp(s1);
11426 ctx->ttmp8 = ctx->program->allocateTmp(s1);
11427 startpgm->definitions[def_count - 3] = Definition(idx);
11428 startpgm->definitions[def_count - 3].setFixed(PhysReg(108 + 9 /*ttmp9*/));
11429 startpgm->definitions[def_count - 2] = Definition(ctx->ttmp8);
11430 startpgm->definitions[def_count - 2].setFixed(PhysReg(108 + 8 /*ttmp8*/));
11431 startpgm->definitions[def_count - 1] = Definition(idy);
11432 startpgm->definitions[def_count - 1].setFixed(PhysReg(108 + 7 /*ttmp7*/));
11433 ctx->workgroup_id[0] = Operand(idx);
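      /* This assumes that, when the Z dimension is used, ttmp7 packs workgroup ID Y in
       * bits [15:0] and Z in bits [31:16]; otherwise it holds only Y. */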
11434 if (ctx->args->workgroup_ids[2].used) {
11435 Builder bld(ctx->program, ctx->block);
11436 ctx->workgroup_id[1] =
11437 bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::zero(),
11438 Operand::c32(16u), Operand::zero());
11439 ctx->workgroup_id[2] =
11440 bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::c32(1u),
11441 Operand::c32(16u), Operand::zero());
11442 } else {
11443 ctx->workgroup_id[1] = Operand(idy);
11444 ctx->workgroup_id[2] = Operand::zero();
11445 }
11446 } else if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
11447 const struct ac_arg* ids = ctx->args->workgroup_ids;
11448 for (unsigned i = 0; i < 3; i++)
11449 ctx->workgroup_id[i] = ids[i].used ? Operand(get_arg(ctx, ids[i])) : Operand::zero();
11450 }
11451
11452 /* epilog has no scratch */
11453 if (ctx->args->scratch_offset.used) {
11454 if (ctx->program->gfx_level < GFX9) {
11455 /* Stash these in the program so that they can be accessed later when
11456 * handling spilling.
11457 */
11458 if (ctx->args->ring_offsets.used)
11459 ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11460
11461 ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
11462 } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
11463 /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
11464 */
11465 Operand scratch_addr = ctx->args->ring_offsets.used
11466 ? Operand(get_arg(ctx, ctx->args->ring_offsets))
11467 : Operand(s2);
11468
11469 Builder bld(ctx->program, ctx->block);
11470 bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
11471 get_arg(ctx, ctx->args->scratch_offset));
11472 }
11473 }
11474
11475 return startpgm;
11476 }
11477
11478 void
11479 fix_ls_vgpr_init_bug(isel_context* ctx)
11480 {
11481 Builder bld(ctx->program, ctx->block);
11482 constexpr unsigned hs_idx = 1u;
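   /* The s_bfe_u32 immediate encodes (width << 16) | offset: (8 << 16) | 8 extracts the HS thread
    * count from bits [15:8] of merged_wave_info, and the SCC result is set iff it is non-zero. */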
11483 Builder::Result hs_thread_count =
11484 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11485 get_arg(ctx, ctx->args->merged_wave_info), Operand::c32((8u << 16) | (hs_idx * 8u)));
11486 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11487
11488 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11489
11490 Temp instance_id =
11491 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->vertex_id),
11492 get_arg(ctx, ctx->args->instance_id), ls_has_nonzero_hs_threads);
11493 Temp vs_rel_patch_id =
11494 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11495 get_arg(ctx, ctx->args->vs_rel_patch_id), ls_has_nonzero_hs_threads);
11496 Temp vertex_id =
11497 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_patch_id),
11498 get_arg(ctx, ctx->args->vertex_id), ls_has_nonzero_hs_threads);
11499
11500 ctx->arg_temps[ctx->args->instance_id.arg_index] = instance_id;
11501 ctx->arg_temps[ctx->args->vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11502 ctx->arg_temps[ctx->args->vertex_id.arg_index] = vertex_id;
11503 }
11504
11505 void
11506 split_arguments(isel_context* ctx, Instruction* startpgm)
11507 {
11508 /* Split all arguments except for the first (ring_offsets) and the last
11509 * (exec) so that the dead channels don't stay live throughout the program.
11510 */
11511 for (int i = 1; i < startpgm->definitions.size(); i++) {
11512 if (startpgm->definitions[i].regClass().size() > 1) {
11513 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11514 startpgm->definitions[i].regClass().size());
11515 }
11516 }
11517 }
11518
11519 void
11520 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11521 {
11522 Program* program = ctx->program;
11523
11524 unsigned float_controls = shader->info.float_controls_execution_mode;
11525
11526 program->next_fp_mode.must_flush_denorms32 =
11527 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11528 program->next_fp_mode.must_flush_denorms16_64 =
11529 float_controls &
11530 (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11531
11532 program->next_fp_mode.care_about_round32 =
11533 float_controls &
11534 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11535
11536 program->next_fp_mode.care_about_round16_64 =
11537 float_controls &
11538 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11539 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11540
11541 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11542 * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11543 if (program->next_fp_mode.must_flush_denorms16_64)
11544 program->next_fp_mode.denorm16_64 = 0;
11545 else
11546 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11547
11548 /* preserving fp32 denorms is expensive, so only do it if asked */
11549 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11550 program->next_fp_mode.denorm32 = fp_denorm_keep;
11551 else
11552 program->next_fp_mode.denorm32 = 0;
11553
11554 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11555 program->next_fp_mode.round32 = fp_round_tz;
11556 else
11557 program->next_fp_mode.round32 = fp_round_ne;
11558
11559 if (float_controls &
11560 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11561 program->next_fp_mode.round16_64 = fp_round_tz;
11562 else
11563 program->next_fp_mode.round16_64 = fp_round_ne;
11564
11565 ctx->block->fp_mode = program->next_fp_mode;
11566 }
11567
11568 void
11569 cleanup_cfg(Program* program)
11570 {
11571 /* create linear_succs/logical_succs */
11572 for (Block& BB : program->blocks) {
11573 for (unsigned idx : BB.linear_preds)
11574 program->blocks[idx].linear_succs.emplace_back(BB.index);
11575 for (unsigned idx : BB.logical_preds)
11576 program->blocks[idx].logical_succs.emplace_back(BB.index);
11577 }
11578 }
11579
11580 void
11581 finish_program(isel_context* ctx)
11582 {
11583 cleanup_cfg(ctx->program);
11584
11585 /* Insert a single p_end_wqm instruction after the last derivative calculation */
11586 if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm && ctx->program->needs_exact) {
11587 /* Find the next BB at top-level CFG */
11588 while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
11589 ctx->wqm_block_idx++;
11590 ctx->wqm_instruction_idx = 0;
11591 }
11592
11593 std::vector<aco_ptr<Instruction>>* instrs =
11594 &ctx->program->blocks[ctx->wqm_block_idx].instructions;
11595 auto it = instrs->begin() + ctx->wqm_instruction_idx;
11596
11597       /* Delay the transition to Exact to help optimizations and scheduling */
11598 while (it != instrs->end()) {
11599 aco_ptr<Instruction>& instr = *it;
11600 /* End WQM before: */
11601 if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP() ||
11602 instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
11603 instr->opcode == aco_opcode::p_jump_to_epilog ||
11604 instr->opcode == aco_opcode::p_logical_start)
11605 break;
11606
11607 ++it;
11608
11609 /* End WQM after: */
11610 if (instr->opcode == aco_opcode::p_logical_end ||
11611 instr->opcode == aco_opcode::p_discard_if ||
11612 instr->opcode == aco_opcode::p_demote_to_helper ||
11613 instr->opcode == aco_opcode::p_end_with_regs)
11614 break;
11615 }
11616
11617 Builder bld(ctx->program);
11618 bld.reset(instrs, it);
11619 bld.pseudo(aco_opcode::p_end_wqm);
11620 }
11621 }
11622
11623 Temp
11624 lanecount_to_mask(isel_context* ctx, Temp count)
11625 {
11626 assert(count.regClass() == s1);
11627
11628 Builder bld(ctx->program, ctx->block);
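   /* s_bfm_b64 count, 0 produces a mask with the low `count` bits set; the size operand is taken
    * modulo 64, so a count of exactly 64 yields 0 and is handled specially below. */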
11629 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11630 Temp cond;
11631
11632 if (ctx->program->wave_size == 64) {
11633 /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11634 Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11635 Operand::c32(6u /* log2(64) */));
11636 cond =
11637 bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11638 } else {
11639       /* We use s_bfm_b64 (not _b32) because it handles a count of 32, but we then need to
11640        * extract the lower half of the register */
11641 cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11642 }
11643
11644 return cond;
11645 }
11646
11647 Temp
11648 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11649 {
11650 Builder bld(ctx->program, ctx->block);
11651
11652    /* lanecount_to_mask() only cares about s0.u[6:0], so we need neither s_bfe nor s_and here */
11653 Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info)
11654 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11655 get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u));
11656
11657 return lanecount_to_mask(ctx, count);
11658 }
11659
11660 static void
11661 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11662 {
11663 unsigned src_count = 0;
11664 for (unsigned i = 0; i < ctx.args->arg_count; i++)
11665 src_count += !!BITSET_TEST(ctx.output_args, i);
11666
11667 Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11668 ctx.block->instructions.emplace_back(ret);
11669
11670 src_count = 0;
11671 for (unsigned i = 0; i < ctx.args->arg_count; i++) {
11672 if (!BITSET_TEST(ctx.output_args, i))
11673 continue;
11674
11675 enum ac_arg_regfile file = ctx.args->args[i].file;
11676 unsigned size = ctx.args->args[i].size;
11677 unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11678 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11679 Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11680 : Operand(PhysReg{reg}, type);
11681 ret->operands[src_count] = op;
11682 src_count++;
11683 }
11684
11685 Builder bld(ctx.program, ctx.block);
11686 bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11687 }
11688
11689 void
11690 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11691 const struct ac_shader_args* args)
11692 {
11693 for (unsigned i = 0; i < shader_count; i++) {
11694 if (i) {
11695 ctx.block = ctx.program->create_and_insert_block();
11696 ctx.block->kind = block_kind_top_level | block_kind_resume;
11697 }
11698
11699 nir_shader* nir = shaders[i];
11700 init_context(&ctx, nir);
11701 setup_fp_mode(&ctx, nir);
11702
11703 Instruction* startpgm = add_startpgm(&ctx);
11704 append_logical_start(ctx.block);
11705 split_arguments(&ctx, startpgm);
11706 visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11707 append_logical_end(ctx.block);
11708 ctx.block->kind |= block_kind_uniform;
11709
11710 /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11711 * shader without shader calls.
11712 */
11713 if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11714 insert_rt_jump_next(ctx, args);
11715
11716 cleanup_context(&ctx);
11717 }
11718
11719 ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11720 finish_program(&ctx);
11721 }
11722
11723 void
11724 pops_await_overlapped_waves(isel_context* ctx)
11725 {
11726 ctx->program->has_pops_overlapped_waves_wait = true;
11727
11728 Builder bld(ctx->program, ctx->block);
11729
11730 if (ctx->program->gfx_level >= GFX11) {
11731 /* GFX11+ - waiting for the export from the overlapped waves.
11732 * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11733 */
11734 bld.sopp(aco_opcode::s_wait_event,
11735 ctx->program->gfx_level >= GFX12 ? wait_event_imm_wait_export_ready_gfx12 : 0);
11736 return;
11737 }
11738
11739 /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11740
11741 const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11742
11743 /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11744 const Temp did_overlap =
11745 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11746 if_context did_overlap_if_context;
11747 begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11748 bld.reset(ctx->block);
11749
11750 /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11751 if (ctx->program->gfx_level >= GFX10) {
11752 /* 2 packer ID bits on GFX10-10.3. */
11753 const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11754 collision, Operand::c32(0x2001c));
11755 /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11756 const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11757 bld.def(s1, scc), packer_id, Operand::c32(1));
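      /* The s_setreg immediate encodes ((size - 1) << 11) | (offset << 6) | hw_reg_id;
       * here size=3, offset=0, writing hardware register 25 (POPS_PACKER). */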
11758 bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
11759 } else {
11760 /* 1 packer ID bit on GFX9. */
11761 const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11762 collision, Operand::c32(0x1001c));
11763 /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11764 * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11765 */
11766 const Temp packer_id_hwreg_bits =
11767 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11768 bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11769 }
11770
11771 Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11772 collision, Operand::c32(0xa0010));
11773 if (ctx->program->gfx_level < GFX10) {
11774 /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11775 * actual wave ID by 1 in case of wraparound.
11776 */
11777 const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11778 collision, Operand::c32(0x3ff));
11779 const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11780 aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11781 newest_overlapped_wave_id =
11782 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11783 newest_overlapped_wave_id_wrapped);
11784 }
11785
11786 /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11787 * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11788 * no more than 1023 values behind the current wave ID.
11789 * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11790 * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11791 * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11792 * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11793 * `a - (b + 1)` is `a + ~b`.
11794 * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11795 * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11796 * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11797 * expected.
11798 */
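   /* Worked example: if the current 10-bit wave ID is 5, wave_id_offset below is ~5, so the
    * remapped IDs become 6 -> 0, 7 -> 1, ..., 1023 -> 1017, 0 -> 0xFFFFFFFA, ..., 5 -> 0xFFFFFFFF,
    * i.e. older waves compare as smaller unsigned values and the current wave is the largest. */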
11799 const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11800 collision, Operand::c32(0x3ff));
11801 newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11802 newest_overlapped_wave_id, wave_id_offset);
11803
11804 /* Await the overlapped waves. */
11805
11806 loop_context wait_loop_context;
11807 begin_loop(ctx, &wait_loop_context);
11808 bld.reset(ctx->block);
11809
11810 const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11811 bld.def(s1, scc), wave_id_offset);
11812 /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11813 * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11814 * exited the ordered section.
11815 */
11816 const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11817 newest_overlapped_wave_id, exiting_wave_id);
11818 if_context newest_overlapped_wave_exited_if_context;
11819 begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11820 newest_overlapped_wave_exited);
11821 emit_loop_break(ctx);
11822 begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11823 end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11824 bld.reset(ctx->block);
11825
11826 /* Sleep before rechecking to let overlapped waves run for some time. */
11827 bld.sopp(aco_opcode::s_sleep, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11828
11829 end_loop(ctx, &wait_loop_context);
11830 bld.reset(ctx->block);
11831
11832 /* Indicate the wait has been done to subsequent compilation stages. */
11833 bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11834
11835 begin_uniform_if_else(ctx, &did_overlap_if_context);
11836 end_uniform_if(ctx, &did_overlap_if_context);
11837 bld.reset(ctx->block);
11838 }
11839
11840 static void
11841 create_merged_jump_to_epilog(isel_context* ctx)
11842 {
11843 Builder bld(ctx->program, ctx->block);
11844 std::vector<Operand> regs;
11845
11846 for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11847 if (!ctx->args->args[i].preserved)
11848 continue;
11849
11850 const enum ac_arg_regfile file = ctx->args->args[i].file;
11851 const unsigned reg = ctx->args->args[i].offset;
11852
11853 Operand op(ctx->arg_temps[i]);
11854 op.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11855 regs.emplace_back(op);
11856 }
11857
11858 Temp continue_pc =
11859 convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11860
11861 aco_ptr<Instruction> jump{
11862 create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11863 jump->operands[0] = Operand(continue_pc);
11864 for (unsigned i = 0; i < regs.size(); i++) {
11865 jump->operands[i + 1] = regs[i];
11866 }
11867 ctx->block->instructions.emplace_back(std::move(jump));
11868 }
11869
11870 static void
11871 create_end_for_merged_shader(isel_context* ctx)
11872 {
11873 std::vector<Operand> regs;
11874
11875 unsigned max_args;
11876 if (ctx->stage.sw == SWStage::VS) {
11877 assert(ctx->args->vertex_id.used);
11878 max_args = ctx->args->vertex_id.arg_index;
11879 } else {
11880 assert(ctx->stage.sw == SWStage::TES);
11881 assert(ctx->args->tes_u.used);
11882 max_args = ctx->args->tes_u.arg_index;
11883 }
11884
11885 struct ac_arg arg;
11886 arg.used = true;
11887
11888 for (arg.arg_index = 0; arg.arg_index < max_args; arg.arg_index++)
11889 regs.emplace_back(get_arg_for_end(ctx, arg));
11890
11891 build_end_with_regs(ctx, regs);
11892 }
11893
11894 void
11895 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_endpgm,
11896 const bool need_barrier, if_context* ic_merged_wave_info,
11897 const bool check_merged_wave_info, const bool endif_merged_wave_info)
11898 {
11899 init_context(&ctx, nir);
11900 setup_fp_mode(&ctx, nir);
11901
11902 Program* program = ctx.program;
11903
11904 if (need_startpgm) {
11905 /* Needs to be after init_context() for FS. */
11906 Instruction* startpgm = add_startpgm(&ctx);
11907
11908 if (!program->info.vs.has_prolog &&
11909 (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11910 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, 0x3u);
11911 }
11912
11913 append_logical_start(ctx.block);
11914
11915 if (ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs &&
11916 !program->info.vs.has_prolog)
11917 fix_ls_vgpr_init_bug(&ctx);
11918
11919 split_arguments(&ctx, startpgm);
11920 }
11921
11922 if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11923 !program->stage.has(SWStage::GS)) {
11924 /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11925 * s_sendmsg(GS_ALLOC_REQ).
11926 */
11927 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, 0u);
11928 }
11929
11930 if (check_merged_wave_info) {
11931 const unsigned i =
11932 nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11933 const Temp cond = merged_wave_info_to_mask(&ctx, i);
11934 begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11935 }
11936
11937 if (need_barrier) {
11938 const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11939 program->wave_size % nir->info.tess.tcs_vertices_out == 0
11940 ? scope_subgroup
11941 : scope_workgroup;
11942
11943 Builder(ctx.program, ctx.block)
11944 .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11945 scope);
11946 }
11947
11948 nir_function_impl* func = nir_shader_get_entrypoint(nir);
11949 visit_cf_list(&ctx, &func->body);
11950
11951 if (ctx.program->info.ps.has_epilog) {
11952 if (ctx.stage == fragment_fs) {
11953 if (ctx.options->is_opengl)
11954 create_fs_end_for_epilog(&ctx);
11955 else
11956 create_fs_jump_to_epilog(&ctx);
11957
11958 /* FS epilogs always have at least one color/null export. */
11959 ctx.program->has_color_exports = true;
11960 }
11961 }
11962
11963 if (endif_merged_wave_info) {
11964 begin_divergent_if_else(&ctx, ic_merged_wave_info);
11965 end_divergent_if(&ctx, ic_merged_wave_info);
11966 }
11967
11968 bool is_first_stage_of_merged_shader = false;
11969
11970 if (ctx.program->info.merged_shader_compiled_separately &&
11971 (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11972 assert(program->gfx_level >= GFX9);
11973 if (ctx.options->is_opengl)
11974 create_end_for_merged_shader(&ctx);
11975 else
11976 create_merged_jump_to_epilog(&ctx);
11977
11978 is_first_stage_of_merged_shader = true;
11979 }
11980
11981 cleanup_context(&ctx);
11982
11983 if (need_endpgm) {
11984 program->config->float_mode = program->blocks[0].fp_mode.val;
11985
11986 append_logical_end(ctx.block);
11987 ctx.block->kind |= block_kind_uniform;
11988
11989 if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) ||
11990 (nir->info.stage == MESA_SHADER_TESS_CTRL && program->gfx_level >= GFX9)) {
11991 Builder(program, ctx.block).sopp(aco_opcode::s_endpgm);
11992 }
11993
11994 finish_program(&ctx);
11995 }
11996 }
11997
11998 void
11999 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
12000 {
12001 if_context ic_merged_wave_info;
12002 const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
12003
12004 for (unsigned i = 0; i < shader_count; i++) {
12005 nir_shader* nir = shaders[i];
12006
12007 /* We always need to insert p_startpgm at the beginning of the first shader. */
12008 const bool need_startpgm = i == 0;
12009
12010 /* Need to handle program end for last shader stage. */
12011 const bool need_endpgm = i == shader_count - 1;
12012
12013 /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
12014 nir_function_impl* func = nir_shader_get_entrypoint(nir);
12015 const bool empty_shader =
12016 nir_cf_list_is_empty_block(&func->body) &&
12017 ((nir->info.stage == MESA_SHADER_VERTEX &&
12018 (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
12019 (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
12020
12021 /* See if we need to emit a check of the merged wave info SGPR. */
12022 const bool check_merged_wave_info =
12023 ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
12024 const bool endif_merged_wave_info =
12025 ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
12026
12027 /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
12028 const bool tcs_skip_barrier =
12029 ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
12030
12031 /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
12032 const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
12033
12034 select_shader(ctx, nir, need_startpgm, need_endpgm, need_barrier, &ic_merged_wave_info,
12035 check_merged_wave_info, endif_merged_wave_info);
12036
12037 if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
12038 /* Special handling when TCS input and output patch size is the same.
12039 * Outputs of the previous stage are inputs to the next stage.
12040 */
12041 ctx.inputs = ctx.outputs;
12042 ctx.outputs = shader_io_state();
12043 }
12044 }
12045 }
12046
12047 void
12048 emit_polygon_stipple(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12049 {
12050 Builder bld(ctx->program, ctx->block);
12051
12052 /* Use the fixed-point gl_FragCoord input.
12053 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
12054 * per coordinate to get the repeating effect.
12055 */
12056 Temp pos_fixed_pt = get_arg(ctx, ctx->args->pos_fixed_pt);
12057 Temp addr0 = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1f), pos_fixed_pt);
12058 Temp addr1 = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), pos_fixed_pt, Operand::c32(16u),
12059 Operand::c32(5u));
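   /* pos_fixed_pt packs the fixed-point x coordinate in the low 16 bits and y in the high 16 bits,
    * so addr0 = x & 31 selects the bit within a stipple row and addr1 = y & 31 selects the row. */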
12060
12061 /* Load the buffer descriptor. */
12062 Temp list = get_arg(ctx, finfo->internal_bindings);
12063 list = convert_pointer_to_64_bit(ctx, list);
12064 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
12065 Operand::c32(finfo->poly_stipple_buf_offset));
12066
12067 /* The stipple pattern is 32x32, each row has 32 bits. */
12068 Temp offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr1);
12069 Temp row = bld.mubuf(aco_opcode::buffer_load_dword, bld.def(v1), desc, offset, Operand::c32(0u),
12070 0, true);
12071 Temp bit = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), row, addr0, Operand::c32(1u));
12072 Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), bit);
12073 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
12074
12075 ctx->block->kind |= block_kind_uses_discard;
12076 ctx->program->needs_exact = true;
12077 }
12078
12079 void
12080 overwrite_interp_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12081 {
12082 Builder bld(ctx->program, ctx->block);
12083
12084 if (finfo->bc_optimize_for_persp || finfo->bc_optimize_for_linear) {
12085 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
12086 * The hw doesn't compute CENTROID if the whole wave only
12087 * contains fully-covered quads.
12088 */
12089 Temp bc_optimize = get_arg(ctx, ctx->args->prim_mask);
12090
12091 /* enabled when bit 31 is set */
12092 Temp cond =
12093 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), bc_optimize, Operand::c32(31u));
12094
12095       /* widen the 1-bit SCC result to the wave-sized mask used by v_cndmask */
12096 cond = bool_to_vector_condition(ctx, cond);
12097
12098 if (finfo->bc_optimize_for_persp) {
12099 Temp center = get_arg(ctx, ctx->args->persp_center);
12100 Temp centroid = get_arg(ctx, ctx->args->persp_centroid);
12101
12102 Temp dst = bld.tmp(v2);
12103 select_vec2(ctx, dst, cond, center, centroid);
12104 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = dst;
12105 }
12106
12107 if (finfo->bc_optimize_for_linear) {
12108 Temp center = get_arg(ctx, ctx->args->linear_center);
12109 Temp centroid = get_arg(ctx, ctx->args->linear_centroid);
12110
12111 Temp dst = bld.tmp(v2);
12112 select_vec2(ctx, dst, cond, center, centroid);
12113 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = dst;
12114 }
12115 }
12116
12117 if (finfo->force_persp_sample_interp) {
12118 Temp persp_sample = get_arg(ctx, ctx->args->persp_sample);
12119 ctx->arg_temps[ctx->args->persp_center.arg_index] = persp_sample;
12120 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_sample;
12121 }
12122
12123 if (finfo->force_linear_sample_interp) {
12124 Temp linear_sample = get_arg(ctx, ctx->args->linear_sample);
12125 ctx->arg_temps[ctx->args->linear_center.arg_index] = linear_sample;
12126 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_sample;
12127 }
12128
12129 if (finfo->force_persp_center_interp) {
12130 Temp persp_center = get_arg(ctx, ctx->args->persp_center);
12131 ctx->arg_temps[ctx->args->persp_sample.arg_index] = persp_center;
12132 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_center;
12133 }
12134
12135 if (finfo->force_linear_center_interp) {
12136 Temp linear_center = get_arg(ctx, ctx->args->linear_center);
12137 ctx->arg_temps[ctx->args->linear_sample.arg_index] = linear_center;
12138 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_center;
12139 }
12140 }
12141
12142 void
12143 overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12144 {
12145 Builder bld(ctx->program, ctx->block);
12146
12147 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
12148 * says:
12149 *
12150 * "When per-sample shading is active due to the use of a fragment
12151 * input qualified by sample or due to the use of the gl_SampleID
12152 * or gl_SamplePosition variables, only the bit for the current
12153 * sample is set in gl_SampleMaskIn. When state specifies multiple
12154 * fragment shader invocations for a given fragment, the sample
12155 * mask for any single fragment shader invocation may specify a
12156 * subset of the covered samples for the fragment. In this case,
12157 * the bit corresponding to each covered sample will be set in
12158 * exactly one fragment shader invocation."
12159 *
12160 * The samplemask loaded by hardware is always the coverage of the
12161 * entire pixel/fragment, so mask bits out based on the sample ID.
12162 */
12163 if (finfo->samplemask_log_ps_iter) {
12164 Temp ancillary = get_arg(ctx, ctx->args->ancillary);
12165 Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u),
12166 Operand::c32(4u));
12167 Temp samplemask = get_arg(ctx, ctx->args->sample_coverage);
12168
12169 uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
12170 Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
12171
12172 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask);
12173 samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask);
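      /* Each covered sample must be claimed by exactly one invocation: shift the repeating
       * per-invocation iteration mask to this invocation's sample ID and AND it with the
       * full-pixel coverage. */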
12174
12175 ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask;
12176 }
12177 }
12178
12179 Temp
12180 get_interp_color(isel_context* ctx, int interp_vgpr, unsigned attr_index, unsigned comp)
12181 {
12182 Builder bld(ctx->program, ctx->block);
12183
12184 Temp dst = bld.tmp(v1);
12185
12186 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
12187
12188 if (interp_vgpr != -1) {
12189 /* interp args are all 2 vgprs */
12190 int arg_index = ctx->args->persp_sample.arg_index + interp_vgpr / 2;
12191 Temp interp_ij = ctx->arg_temps[arg_index];
12192
12193 emit_interp_instr(ctx, attr_index, comp, interp_ij, dst, prim_mask, false);
12194 } else {
12195 emit_interp_mov_instr(ctx, attr_index, comp, 0, dst, prim_mask, false);
12196 }
12197
12198 return dst;
12199 }
12200
12201 void
12202 interpolate_color_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo,
12203 std::vector<Operand>& regs)
12204 {
12205 if (!finfo->colors_read)
12206 return;
12207
12208 Builder bld(ctx->program, ctx->block);
12209
12210 unsigned vgpr = 256 + ctx->args->num_vgprs_used;
12211
12212 if (finfo->color_two_side) {
12213 Temp face = get_arg(ctx, ctx->args->front_face);
12214 Temp is_face_positive =
12215 bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), face);
12216
12217 u_foreach_bit (i, finfo->colors_read) {
12218 unsigned color_index = i / 4;
12219 unsigned front_index = finfo->color_attr_index[color_index];
12220 int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
12221
12222 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
12223 * otherwise it's at offset "num_inputs".
12224 */
12225 unsigned back_index = finfo->num_interp_inputs;
12226 if (color_index == 1 && finfo->colors_read & 0xf)
12227 back_index++;
12228
12229 Temp front = get_interp_color(ctx, interp_vgpr, front_index, i % 4);
12230 Temp back = get_interp_color(ctx, interp_vgpr, back_index, i % 4);
12231
12232 Temp color =
12233 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), back, front, is_face_positive);
12234
12235 regs.emplace_back(Operand(color, PhysReg{vgpr++}));
12236 }
12237 } else {
12238 u_foreach_bit (i, finfo->colors_read) {
12239 unsigned color_index = i / 4;
12240 unsigned attr_index = finfo->color_attr_index[color_index];
12241 int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
12242 Temp color = get_interp_color(ctx, interp_vgpr, attr_index, i % 4);
12243
12244 regs.emplace_back(Operand(color, PhysReg{vgpr++}));
12245 }
12246 }
12247 }
12248
12249 void
12250 emit_clamp_alpha_test(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
12251 unsigned color_index)
12252 {
12253 Builder bld(ctx->program, ctx->block);
12254
12255 if (info->clamp_color) {
12256 for (unsigned i = 0; i < 4; i++) {
12257 if (colors[i].regClass() == v2b) {
12258 colors[i] = bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
12259 Operand::c16(0x3c00), colors[i]);
12260 } else {
12261 assert(colors[i].regClass() == v1);
12262 colors[i] = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
12263 Operand::c32(0x3f800000u), colors[i]);
12264 }
12265 }
12266 }
12267
12268 if (info->alpha_to_one) {
12269 if (colors[3].regClass() == v2b)
12270 colors[3] = bld.copy(bld.def(v2b), Operand::c16(0x3c00));
12271 else
12272 colors[3] = bld.copy(bld.def(v1), Operand::c32(0x3f800000u));
12273 }
12274
12275 if (color_index == 0 && info->alpha_func != COMPARE_FUNC_ALWAYS) {
12276 Operand cond = Operand::c32(-1u);
12277 if (info->alpha_func != COMPARE_FUNC_NEVER) {
12278 aco_opcode opcode = aco_opcode::num_opcodes;
12279
12280 switch (info->alpha_func) {
12281 case COMPARE_FUNC_LESS: opcode = aco_opcode::v_cmp_ngt_f32; break;
12282 case COMPARE_FUNC_EQUAL: opcode = aco_opcode::v_cmp_neq_f32; break;
12283 case COMPARE_FUNC_LEQUAL: opcode = aco_opcode::v_cmp_nge_f32; break;
12284 case COMPARE_FUNC_GREATER: opcode = aco_opcode::v_cmp_nlt_f32; break;
12285 case COMPARE_FUNC_NOTEQUAL: opcode = aco_opcode::v_cmp_nlg_f32; break;
12286 case COMPARE_FUNC_GEQUAL: opcode = aco_opcode::v_cmp_nle_f32; break;
12287 default: unreachable("invalid alpha func");
12288 }
12289
12290 Temp ref = get_arg(ctx, info->alpha_reference);
12291
12292 Temp alpha = colors[3].regClass() == v2b
12293 ? bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), colors[3])
12294 : colors[3];
12295
12296 /* true if not pass */
12297 cond = bld.vopc(opcode, bld.def(bld.lm), ref, alpha);
12298 }
12299
12300 bld.pseudo(aco_opcode::p_discard_if, cond);
12301 ctx->block->kind |= block_kind_uses_discard;
12302 ctx->program->needs_exact = true;
12303 }
12304 }
12305
12306 } /* end namespace */
12307
12308 void
12309 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
12310 ac_shader_config* config, const struct aco_compiler_options* options,
12311 const struct aco_shader_info* info, const struct ac_shader_args* args)
12312 {
12313 isel_context ctx =
12314 setup_isel_context(program, shader_count, shaders, config, options, info, args);
12315
12316 if (ctx.stage == raytracing_cs)
12317 return select_program_rt(ctx, shader_count, shaders, args);
12318
12319 if (shader_count >= 2) {
12320 select_program_merged(ctx, shader_count, shaders);
12321 } else {
12322 bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
12323 if_context ic_merged_wave_info;
12324
12325 /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
12326 if (ctx.program->info.merged_shader_compiled_separately) {
12327 assert(ctx.program->gfx_level >= GFX9);
12328 if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
12329 check_merged_wave_info = endif_merged_wave_info = true;
12330 } else {
12331 const bool ngg_gs =
12332 ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
12333 assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
12334 check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
12335 need_barrier = !ngg_gs;
12336 }
12337 }
12338
12339 select_shader(ctx, shaders[0], true, true, need_barrier, &ic_merged_wave_info,
12340 check_merged_wave_info, endif_merged_wave_info);
12341 }
12342 }
12343
12344 void
12345 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
12346 const struct aco_compiler_options* options,
12347 const struct aco_shader_info* info, const struct ac_shader_args* args)
12348 {
12349 assert(options->gfx_level == GFX8);
12350
12351 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12352 config);
12353
12354 isel_context ctx = {};
12355 ctx.program = program;
12356 ctx.args = args;
12357 ctx.options = options;
12358 ctx.stage = program->stage;
12359
12360 ctx.block = ctx.program->create_and_insert_block();
12361 ctx.block->kind = block_kind_top_level;
12362
12363 program->workgroup_size = 1; /* XXX */
12364
12365 add_startpgm(&ctx);
12366 append_logical_start(ctx.block);
12367
12368 Builder bld(ctx.program, ctx.block);
12369
12370 /* Load the buffer descriptor from TMA. */
12371 bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
12372 Operand::zero());
12373
12374 ac_hw_cache_flags cache_glc;
12375 cache_glc.value = ac_glc;
12376
12377 /* Store TTMP0-TTMP1. */
12378 bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
12379 Operand(PhysReg{ttmp0}, s2), memory_sync_info(), cache_glc);
12380
12381 uint32_t hw_regs_idx[] = {
12382 2, /* HW_REG_STATUS */
12383 3, /* HW_REG_TRAP_STS */
12384 4, /* HW_REG_HW_ID */
12385 7, /* HW_REG_IB_STS */
12386 };
12387
12388 /* Store some hardware registers. */
12389 for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
12390 /* "((size - 1) << 11) | register" */
12391 bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
12392 ((20 - 1) << 11) | hw_regs_idx[i]);
12393
12394 bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
12395 Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(),
12396 cache_glc);
12397 }
12398
12399 program->config->float_mode = program->blocks[0].fp_mode.val;
12400
12401 append_logical_end(ctx.block);
12402 ctx.block->kind |= block_kind_uniform;
12403 bld.sopp(aco_opcode::s_endpgm);
12404
12405 finish_program(&ctx);
12406 }
12407
12408 Operand
12409 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
12410 {
12411 enum ac_arg_regfile file = args->args[arg.arg_index].file;
12412 unsigned size = args->args[arg.arg_index].size;
12413 RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
12414 return Operand(get_arg_reg(args, arg), rc);
12415 }
12416
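/* Loads up to `max` vertex-buffer descriptors (4 dwords each) starting at index `start` into
 * consecutive SGPRs at `dest`, using the widest s_load possible. Returns how many descriptors
 * were loaded, limited by the SGPR limit.
 */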
12417 unsigned
12418 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
12419 {
12420 unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
12421 for (unsigned i = 0; i < count;) {
12422 unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
12423
12424 if (size == 4)
12425 bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
12426 Operand::c32((start + i) * 16u));
12427 else if (size == 2)
12428 bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
12429 Operand::c32((start + i) * 16u));
12430 else
12431 bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
12432 Operand::c32((start + i) * 16u));
12433
12434 dest = dest.advance(size * 16u);
12435 i += size;
12436 }
12437
12438 return count;
12439 }
12440
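/* Waits for all outstanding SMEM loads (s_wait_kmcnt on GFX12+, lgkmcnt otherwise). */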
12441 void
12442 wait_for_smem_loads(Builder& bld)
12443 {
12444 if (bld.program->gfx_level >= GFX12) {
12445 bld.sopp(aco_opcode::s_wait_kmcnt, 0);
12446 } else {
12447 wait_imm lgkm_imm;
12448 lgkm_imm.lgkm = 0;
12449 bld.sopp(aco_opcode::s_waitcnt, lgkm_imm.pack(bld.program->gfx_level));
12450 }
12451 }
12452
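/* Waits for all outstanding VMEM loads (s_wait_loadcnt on GFX12+, vmcnt otherwise). */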
12453 void
12454 wait_for_vmem_loads(Builder& bld)
12455 {
12456 if (bld.program->gfx_level >= GFX12) {
12457 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
12458 } else {
12459 wait_imm vm_imm;
12460 vm_imm.vm = 0;
12461 bld.sopp(aco_opcode::s_waitcnt, vm_imm.pack(bld.program->gfx_level));
12462 }
12463 }
12464
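/* Computes the instance index for an attribute with a non-trivial instance divisor using
 * driver-precomputed fast-division constants loaded from pinfo->inputs at offset 8 + index * 8:
 * dword 0 packs a first shift amount (byte 0), an addend (byte 1) and a final shift amount
 * (byte 2); dword 1 holds the multiplier. The result is
 * start_instance + ((((instance_id >> byte0) + byte1) * multiplier) >> 32) >> byte2.
 */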
12465 Operand
12466 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
12467 const struct aco_vs_prolog_info* pinfo, unsigned index,
12468 Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
12469 PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
12470 {
12471 bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
12472 get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
12473
12474 wait_for_smem_loads(bld);
12475
12476 Definition fetch_index_def(tmp_vgpr0, v1);
12477 Operand fetch_index(tmp_vgpr0, v1);
12478
12479 Operand div_info(tmp_sgpr, s1);
12480 if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
12481 /* use SDWA */
12482 if (bld.program->gfx_level < GFX9) {
12483 bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
12484 div_info = Operand(tmp_vgpr1, v1);
12485 }
12486
12487 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12488
12489 Instruction* instr;
12490 if (bld.program->gfx_level >= GFX9)
12491 instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
12492 else
12493 instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
12494 div_info, fetch_index)
12495 .instr;
12496 instr->sdwa().sel[0] = SubdwordSel::ubyte1;
12497
12498 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
12499 fetch_index);
12500
12501 instr =
12502 bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
12503 instr->sdwa().sel[0] = SubdwordSel::ubyte2;
12504 } else {
12505 Operand tmp_op(tmp_vgpr1, v1);
12506 Definition tmp_def(tmp_vgpr1, v1);
12507
12508 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12509
12510 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
12511 bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
12512
12513 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
12514 Operand(tmp_sgpr.advance(4), s1));
12515
12516 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
12517 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
12518 }
12519
12520 bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
12521
12522 return fetch_index;
12523 }
12524
12525 void
12526 select_rt_prolog(Program* program, ac_shader_config* config,
12527 const struct aco_compiler_options* options, const struct aco_shader_info* info,
12528 const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
12529 {
12530 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12531 config);
12532 Block* block = program->create_and_insert_block();
12533 block->kind = block_kind_top_level;
12534 program->workgroup_size = info->workgroup_size;
12535 program->wave_size = info->workgroup_size;
12536 calc_min_waves(program);
12537 Builder bld(program, block);
12538 block->instructions.reserve(32);
12539 unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
12540 unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
12541
12542 /* Inputs:
12543 * Ring offsets: s[0-1]
12544 * Indirect descriptor sets: s[2]
12545 * Push constants pointer: s[3]
12546 * SBT descriptors: s[4-5]
12547 * Traversal shader address: s[6-7]
12548 * Ray launch size address: s[8-9]
12549 * Dynamic callable stack base: s[10]
12550 * Workgroup IDs (xyz): s[11], s[12], s[13]
12551 * Scratch offset: s[14]
12552 * Local invocation IDs: v[0-2]
12553 */
12554 PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
12555 PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
12556 PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
12557 PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
12558 PhysReg in_wg_id_x;
12559 PhysReg in_wg_id_y;
12560 PhysReg in_wg_id_z;
12561 PhysReg in_scratch_offset;
12562 if (options->gfx_level < GFX12) {
12563 in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
12564 in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
12565 in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
12566 } else {
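/* On GFX12, workgroup IDs are read from ttmp registers: X from ttmp9, while ttmp7 packs
 * Y in its low 16 bits and Z in its high 16 bits (unpacked when computing launch IDs below).
 */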
12567 in_wg_id_x = PhysReg(108 + 9 /*ttmp9*/);
12568 in_wg_id_y = PhysReg(108 + 7 /*ttmp7*/);
12569 }
12570 if (options->gfx_level < GFX11)
12571 in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
12572 PhysReg in_local_ids[2] = {
12573 get_arg_reg(in_args, in_args->local_invocation_ids),
12574 get_arg_reg(in_args, in_args->local_invocation_ids).advance(4),
12575 };
12576
12577 /* Outputs:
12578 * Callee shader PC: s[0-1]
12579 * Indirect descriptor sets: s[2]
12580 * Push constants pointer: s[3]
12581 * SBT descriptors: s[4-5]
12582 * Traversal shader address: s[6-7]
12583 * Ray launch sizes (xyz): s[8], s[9], s[10]
12584 * Scratch offset (<GFX9 only): s[11]
12585 * Ring offsets (<GFX9 only): s[12-13]
12586 * Ray launch IDs: v[0-2]
12587 * Stack pointer: v[3]
12588 * Shader VA: v[4-5]
12589 * Shader Record Ptr: v[6-7]
12590 */
12591 PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12592 PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]);
12593 PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]);
12594 PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]);
12595 PhysReg out_launch_ids[3];
12596 for (unsigned i = 0; i < 3; i++)
12597 out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]);
12598 PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12599 PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12600
12601 /* Temporaries: */
12602 num_sgprs = align(num_sgprs, 2);
12603 PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
12604 num_sgprs += 2;
12605 PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
12606 num_sgprs += 2;
12607 PhysReg tmp_wg_id_x_times_size = PhysReg{num_sgprs};
12608 num_sgprs++;
12609
12610 PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++};
12611
12612 /* Confirm some assumptions about register aliasing */
12613 assert(in_ring_offsets == out_uniform_shader_addr);
12614 assert(get_arg_reg(in_args, in_args->push_constants) ==
12615 get_arg_reg(out_args, out_args->push_constants));
12616 assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12617 get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12618 assert(in_launch_size_addr == out_launch_size_x);
12619 assert(in_stack_base == out_launch_size_z);
12620 assert(in_local_ids[0] == out_launch_ids[0]);
12621
12622 /* <GFX9 reads in_scratch_offset at the end of the prolog to write out the scratch_offset
12623 * arg. Make sure no other outputs have overwritten it by then.
12624 */
12625 assert(options->gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used);
12626
12627 /* load raygen sbt */
12628 bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12629 Operand::c32(0u));
12630
12631 /* init scratch */
12632 if (options->gfx_level < GFX9) {
12633 /* copy ring offsets to a temporary location */
12634 bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12635 Operand(in_ring_offsets, s2));
12636 } else if (options->gfx_level < GFX11) {
12637 hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12638 Operand(in_scratch_offset, s1));
12639 }
12640
12641 /* set stack ptr */
12642 bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12643
12644 /* load raygen address */
12645 bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12646 Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12647
12648 /* load ray launch sizes */
12649 bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12650 Operand(in_launch_size_addr, s2), Operand::c32(8u));
12651 bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12652 Operand(in_launch_size_addr, s2), Operand::c32(0u));
12653
12654 /* calculate ray launch ids */
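/* Each workgroup covers an 8x4 (wave32) or 8x8 (wave64) tile of the launch grid:
 * launch_id.x = wg_id.x * 8 + local_id.x, launch_id.y = wg_id.y * (4 or 8) + local_id.y.
 */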
12655 if (options->gfx_level >= GFX11) {
12656 /* Thread IDs are packed in VGPR0, 10 bits per component. */
12657 bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12658 Operand::c32(10u), Operand::c32(3u));
12659 bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12660 Operand(in_local_ids[0], v1));
12661 }
12662 /* Do this backwards to reduce some RAW hazards on GFX11+ */
12663 if (options->gfx_level >= GFX12) {
12664 bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
12665 Operand(in_wg_id_y, s1));
12666 bld.vop3(aco_opcode::v_mad_u32_u16, Definition(out_launch_ids[1], v1),
12667 Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8),
12668 Operand(in_local_ids[1], v1));
12669 } else {
12670 bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12671 bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1),
12672 Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8),
12673 Operand(in_local_ids[1], v1));
12674 }
12675 bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12676 Operand::c32(8), Operand(in_local_ids[0], v1));
12677
12678 /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
12679 if (options->gfx_level < GFX9) {
12680 bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12681 Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12682 } else {
12683 bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12684 Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12685 }
12686 bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12687 Operand(tmp_raygen_sbt.advance(4), s1));
12688
12689 /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs.
12690 * Calculating the 1D launch ID is: id = local_invocation_index + (wg_id.x * wg_size).
12691 * tmp_wg_id_x_times_size now holds wg_id.x * wg_size.
12692 */
12693 bld.sop2(aco_opcode::s_lshl_b32, Definition(tmp_wg_id_x_times_size, s1), Definition(scc, s1),
12694 Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6));
12695
12696 /* Calculate and add local_invocation_index */
12697 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u),
12698 Operand(tmp_wg_id_x_times_size, s1));
12699 if (program->wave_size == 64) {
12700 if (program->gfx_level <= GFX7)
12701 bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1),
12702 Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12703 else
12704 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(tmp_invocation_idx, v1),
12705 Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12706 }
12707
12708 /* Make fixup operations a no-op if this is not a converted 2D dispatch. */
12709 bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1),
12710 Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1));
12711 bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
12712 Operand::c32_or_c64(-1u, program->wave_size == 64),
12713 Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
12714 bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
12715 Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
12716 bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
12717 Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
12718
12719 if (options->gfx_level < GFX9) {
12720 /* write scratch/ring offsets to outputs, if needed */
12721 bld.sop1(aco_opcode::s_mov_b32,
12722 Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12723 Operand(in_scratch_offset, s1));
12724 bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12725 Operand(tmp_ring_offsets, s2));
12726 }
12727
12728 /* jump to raygen */
12729 bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12730
12731 program->config->float_mode = program->blocks[0].fp_mode.val;
12732 program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12733 program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12734 }
12735
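/* Allocates `size` consecutive VGPRs and returns the first one. *num is the running VGPR count;
 * if `offset` is given (possibly negative), the allocation is placed at *num + *offset, which lets
 * callers overlap temporaries with previously reserved registers until the top is reached again.
 */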
12736 PhysReg
12737 get_next_vgpr(unsigned size, unsigned* num, int* offset = NULL)
12738 {
12739 unsigned reg = *num + (offset ? *offset : 0);
12740 if (reg + size >= *num) {
12741 *num = reg + size;
12742 if (offset)
12743 *offset = 0;
12744 } else if (offset) {
12745 *offset += size;
12746 }
12747 return PhysReg(256 + reg);
12748 }
12749
12750 struct UnalignedVsAttribLoad {
12751 /* dst/scratch are PhysReg converted to unsigned */
12752 unsigned dst;
12753 unsigned scratch;
12754 bool d16;
12755 const struct ac_vtx_format_info* vtx_info;
12756 };
12757
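/* Byte-wise loads for unaligned attributes are queued in current_loads and assembled/converted in
 * batches after a single VMEM wait (see convert_current_unaligned_vs_attribs), so scratch VGPRs
 * can be reused between batches.
 */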
12758 struct UnalignedVsAttribLoadState {
12759 unsigned max_vgprs;
12760 unsigned initial_num_vgprs;
12761 unsigned* num_vgprs;
12762 unsigned overflow_num_vgprs;
12763 aco::small_vec<UnalignedVsAttribLoad, 16> current_loads;
12764 };
12765
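/* Merges the previously loaded bytes into a dword at `dst` and decodes the hw vertex format in
 * ALU: unpack the channels (10_11_11, 2_10_10_10, 8, 16), sign-extend where needed, convert to
 * float for FLOAT, SCALED and NORM formats, and apply the NORM scale and clamp.
 */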
12766 void
12767 convert_unaligned_vs_attrib(Builder& bld, UnalignedVsAttribLoad load)
12768 {
12769 PhysReg dst(load.dst);
12770 PhysReg scratch(load.scratch);
12771 const struct ac_vtx_format_info* vtx_info = load.vtx_info;
12772 unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12773 unsigned nfmt = vtx_info->hw_format[0] >> 4;
12774
12775 unsigned size = vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size;
12776 if (load.d16) {
12777 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(dst, v1), Operand(scratch, v1),
12778 Operand::c32(8), Operand(dst, v1));
12779 } else {
12780 for (unsigned i = 1; i < size; i++) {
12781 PhysReg byte_reg = scratch.advance(i * 4 - 4);
12782 if (bld.program->gfx_level >= GFX9) {
12783 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(dst, v1), Operand(byte_reg, v1),
12784 Operand::c32(i * 8), Operand(dst, v1));
12785 } else {
12786 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(byte_reg, v1), Operand::c32(i * 8),
12787 Operand(byte_reg, v1));
12788 bld.vop2(aco_opcode::v_or_b32, Definition(dst, v1), Operand(dst, v1),
12789 Operand(byte_reg, v1));
12790 }
12791 }
12792 }
12793
12794 unsigned num_channels = vtx_info->chan_byte_size ? 1 : vtx_info->num_channels;
12795 PhysReg chan[4] = {dst, dst.advance(4), dst.advance(8), dst.advance(12)};
12796
12797 if (dfmt == V_008F0C_BUF_DATA_FORMAT_10_11_11) {
12798 bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[2], v1), Operand(dst, v1), Operand::c32(22),
12799 Operand::c32(10));
12800 bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[1], v1), Operand(dst, v1), Operand::c32(11),
12801 Operand::c32(11));
12802 bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[0], v1), Operand(dst, v1), Operand::c32(0),
12803 Operand::c32(11));
12804 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[2], v1), Operand::c32(5),
12805 Operand(chan[2], v1));
12806 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[1], v1), Operand::c32(4),
12807 Operand(chan[1], v1));
12808 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[0], v1), Operand::c32(4),
12809 Operand(chan[0], v1));
12810 } else if (dfmt == V_008F0C_BUF_DATA_FORMAT_2_10_10_10) {
12811 aco_opcode bfe = aco_opcode::v_bfe_u32;
12812 switch (nfmt) {
12813 case V_008F0C_BUF_NUM_FORMAT_SNORM:
12814 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
12815 case V_008F0C_BUF_NUM_FORMAT_SINT: bfe = aco_opcode::v_bfe_i32; break;
12816 default: break;
12817 }
12818
12819 bool swapxz = G_008F0C_DST_SEL_X(vtx_info->dst_sel) != V_008F0C_SQ_SEL_X;
12820 bld.vop3(bfe, Definition(chan[3], v1), Operand(dst, v1), Operand::c32(30), Operand::c32(2));
12821 bld.vop3(bfe, Definition(chan[2], v1), Operand(dst, v1), Operand::c32(swapxz ? 0 : 20),
12822 Operand::c32(10));
12823 bld.vop3(bfe, Definition(chan[1], v1), Operand(dst, v1), Operand::c32(10), Operand::c32(10));
12824 bld.vop3(bfe, Definition(chan[0], v1), Operand(dst, v1), Operand::c32(swapxz ? 20 : 0),
12825 Operand::c32(10));
12826 } else if (dfmt == V_008F0C_BUF_DATA_FORMAT_8 || dfmt == V_008F0C_BUF_DATA_FORMAT_16) {
12827 unsigned bits = dfmt == V_008F0C_BUF_DATA_FORMAT_8 ? 8 : 16;
12828 switch (nfmt) {
12829 case V_008F0C_BUF_NUM_FORMAT_SNORM:
12830 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
12831 case V_008F0C_BUF_NUM_FORMAT_SINT:
12832 bld.vop3(aco_opcode::v_bfe_i32, Definition(dst, v1), Operand(dst, v1), Operand::c32(0),
12833 Operand::c32(bits));
12834 break;
12835 default: break;
12836 }
12837 }
12838
12839 if (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT &&
12840 (dfmt == V_008F0C_BUF_DATA_FORMAT_16 || dfmt == V_008F0C_BUF_DATA_FORMAT_10_11_11)) {
12841 for (unsigned i = 0; i < num_channels; i++)
12842 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(chan[i], v1), Operand(chan[i], v1));
12843 } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED || nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
12844 for (unsigned i = 0; i < num_channels; i++)
12845 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(chan[i], v1), Operand(chan[i], v1));
12846 } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_SSCALED || nfmt == V_008F0C_BUF_NUM_FORMAT_SNORM) {
12847 for (unsigned i = 0; i < num_channels; i++)
12848 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(chan[i], v1), Operand(chan[i], v1));
12849 }
12850
12851 std::array<unsigned, 4> chan_max;
12852 switch (dfmt) {
12853 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: chan_max = {1023, 1023, 1023, 3}; break;
12854 case V_008F0C_BUF_DATA_FORMAT_8: chan_max = {255, 255, 255, 255}; break;
12855 case V_008F0C_BUF_DATA_FORMAT_16: chan_max = {65535, 65535, 65535, 65535}; break;
12856 }
12857
12858 if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
12859 for (unsigned i = 0; i < num_channels; i++)
12860 bld.vop2(aco_opcode::v_mul_f32, Definition(chan[i], v1),
12861 Operand::c32(fui(1.0 / chan_max[i])), Operand(chan[i], v1));
12862 } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_SNORM) {
12863 for (unsigned i = 0; i < num_channels; i++) {
12864 bld.vop2(aco_opcode::v_mul_f32, Definition(chan[i], v1),
12865 Operand::c32(fui(1.0 / (chan_max[i] >> 1))), Operand(chan[i], v1));
12866 bld.vop2(aco_opcode::v_max_f32, Definition(chan[i], v1), Operand::c32(0xbf800000),
12867 Operand(chan[i], v1));
12868 }
12869 }
12870 }
12871
12872 void
12873 convert_current_unaligned_vs_attribs(Builder& bld, UnalignedVsAttribLoadState* state)
12874 {
12875 if (state->current_loads.empty())
12876 return;
12877
12878 wait_for_vmem_loads(bld);
12879
12880 for (UnalignedVsAttribLoad load : state->current_loads)
12881 convert_unaligned_vs_attrib(bld, load);
12882 state->current_loads.clear();
12883
12884 state->overflow_num_vgprs = state->initial_num_vgprs;
12885 state->num_vgprs = &state->overflow_num_vgprs;
12886 }
12887
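/* Emits byte-wise loads for one unaligned attribute channel. On GFX9+ without SRAM ECC, d16
 * loads place bytes 0/2 in the low/high halves of dst and bytes 1/3 in scratch, so a single
 * v_lshl_or_b32 merges them later; otherwise each byte after the first gets its own scratch VGPR.
 * The conversion itself is deferred (see convert_current_unaligned_vs_attribs).
 */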
12888 void
12889 load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index, uint32_t offset,
12890 const struct ac_vtx_format_info* vtx_info,
12891 UnalignedVsAttribLoadState* state)
12892 {
12893 unsigned size = vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size;
12894
12895 UnalignedVsAttribLoad load;
12896 load.dst = dst;
12897 load.vtx_info = vtx_info;
12898 load.d16 = bld.program->gfx_level >= GFX9 && !bld.program->dev.sram_ecc_enabled && size == 4;
12899
12900 unsigned num_scratch_vgprs = load.d16 ? 1 : (size - 1);
12901 if (!vtx_info->chan_byte_size) {
12902 /* When chan_byte_size==0, we're loading the entire attribute, so we can use the last 3
12903 * components of the destination.
12904 */
12905 assert(num_scratch_vgprs <= 3);
12906 load.scratch = dst.advance(4);
12907 } else {
12908 if (*state->num_vgprs + num_scratch_vgprs > state->max_vgprs)
12909 convert_current_unaligned_vs_attribs(bld, state);
12910
12911 load.scratch = get_next_vgpr(num_scratch_vgprs, state->num_vgprs, NULL);
12912 }
12913
12914 PhysReg scratch(load.scratch);
12915 if (load.d16) {
12916 bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(dst, v1), desc, index,
12917 Operand::c32(0u), offset, false, true);
12918 bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(dst, v1), desc, index,
12919 Operand::c32(0u), offset + 2, false, true);
12920 bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(scratch, v1), desc, index,
12921 Operand::c32(0u), offset + 1, false, true);
12922 bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(scratch, v1), desc, index,
12923 Operand::c32(0u), offset + 3, false, true);
12924 } else {
12925 for (unsigned i = 0; i < size; i++) {
12926 Definition def(i ? scratch.advance(i * 4 - 4) : dst, v1);
12927 bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(offset + i), 0,
12928 false, true);
12929 }
12930 }
12931
12932 state->current_loads.push_back(load);
12933 }
12934
12935 void
12936 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12937 const struct aco_compiler_options* options, const struct aco_shader_info* info,
12938 const struct ac_shader_args* args)
12939 {
12940 assert(pinfo->num_attributes > 0);
12941
12942 /* This should be enough for any shader/stage. */
12943 unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12944
12945 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12946 config);
12947 program->dev.vgpr_limit = 256;
12948
12949 Block* block = program->create_and_insert_block();
12950 block->kind = block_kind_top_level;
12951
12952 program->workgroup_size = 64;
12953 calc_min_waves(program);
12954
12955 Builder bld(program, block);
12956
12957 block->instructions.reserve(16 + pinfo->num_attributes * 4);
12958
12959 /* Besides performance, this also works around the FeatureRequiredExportPriority issue on
12960 * GFX11.5. */
12961 bld.sopp(aco_opcode::s_setprio, 3);
12962
12963 uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12964 bool has_nontrivial_divisors = pinfo->nontrivial_divisors;
12965
12966 /* choose sgprs */
12967 PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12968 PhysReg prolog_input = vertex_buffers.advance(8);
12969 PhysReg desc(
12970 align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12971
12972 Operand start_instance = get_arg_fixed(args, args->start_instance);
12973 Operand instance_id = get_arg_fixed(args, args->instance_id);
12974
12975 bool needs_instance_index =
12976 pinfo->instance_rate_inputs &
12977 ~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
12978 bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
12979 bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
12980 bool needs_tmp_vgpr0 = has_nontrivial_divisors;
12981 bool needs_tmp_vgpr1 = has_nontrivial_divisors &&
12982 (program->gfx_level <= GFX8 || program->gfx_level >= GFX11);
12983
12984 int vgpr_offset = pinfo->misaligned_mask & (1u << (pinfo->num_attributes - 1)) ? 0 : -4;
12985
12986 unsigned num_vgprs = args->num_vgprs_used;
12987 PhysReg attributes_start = get_next_vgpr(pinfo->num_attributes * 4, &num_vgprs);
12988 PhysReg vertex_index, instance_index, start_instance_vgpr, nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1;
12989 if (needs_vertex_index)
12990 vertex_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12991 if (needs_instance_index)
12992 instance_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12993 if (needs_start_instance)
12994 start_instance_vgpr = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12995 if (needs_tmp_vgpr0)
12996 nontrivial_tmp_vgpr0 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12997 if (needs_tmp_vgpr1)
12998 nontrivial_tmp_vgpr1 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12999
13000 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
13001 get_arg_fixed(args, args->vertex_buffers));
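/* s_movk_i32 sign-extends a 16-bit immediate, so it can only provide address32_hi values in the
 * sign-extendable range; otherwise fall back to a full 32-bit move.
 */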
13002 if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
13003 bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
13004 options->address32_hi & 0xFFFF);
13005 } else {
13006 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
13007 Operand::c32((unsigned)options->address32_hi));
13008 }
13009
13010 const struct ac_vtx_format_info* vtx_info_table =
13011 ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
13012
13013 UnalignedVsAttribLoadState unaligned_state;
13014 unaligned_state.max_vgprs = MAX2(84, num_vgprs + 8);
13015 unaligned_state.initial_num_vgprs = num_vgprs;
13016 unaligned_state.num_vgprs = &num_vgprs;
13017
13018 unsigned num_sgprs = 0;
13019 for (unsigned loc = 0; loc < pinfo->num_attributes;) {
13020 unsigned num_descs =
13021 load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
13022 num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
13023
13024 if (loc == 0) {
13025 /* perform setup while we load the descriptors */
13026 if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
13027 Operand count = get_arg_fixed(args, args->merged_wave_info);
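/* exec = bitmask of the first (merged_wave_info & 0x3f) lanes. s_bfm only uses the low 6 bits of
 * the count, so a count of 64 yields 0 and is handled via bit 6 below.
 */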
13028 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
13029 if (program->wave_size == 64) {
13030 bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
13031 Operand::c32(6u /* log2(64) */));
13032 bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
13033 Operand(exec, s2), Operand(scc, s1));
13034 }
13035 }
13036
13037 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
13038 if (info->hw_stage == AC_HW_HULL_SHADER && options->has_ls_vgpr_init_bug) {
13039 /* We don't want load_vb_descs() to write vcc. */
13040 assert(program->dev.sgpr_limit <= vcc.reg());
13041
13042 bld.sop2(aco_opcode::s_bfe_u32, Definition(vcc, s1), Definition(scc, s1),
13043 get_arg_fixed(args, args->merged_wave_info), Operand::c32((8u << 16) | 8u));
13044 bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm), Operand::c32(-1), Operand::zero(),
13045 Operand(scc, s1));
13046
13047 /* These copies are ordered so that vertex_id=tcs_patch_id doesn't overwrite vertex_id
13048 * before instance_id=vertex_id. */
13049 ac_arg src_args[] = {args->vertex_id, args->tcs_rel_ids, args->tcs_patch_id};
13050 ac_arg dst_args[] = {args->instance_id, args->vs_rel_patch_id, args->vertex_id};
13051 for (unsigned i = 0; i < 3; i++) {
13052 bld.vop2(aco_opcode::v_cndmask_b32, Definition(get_arg_reg(args, dst_args[i]), v1),
13053 get_arg_fixed(args, src_args[i]), get_arg_fixed(args, dst_args[i]),
13054 Operand(vcc, bld.lm));
13055 }
13056 }
13057
13058 if (needs_vertex_index)
13059 bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
13060 get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
13061 if (needs_instance_index)
13062 bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
13063 Operand(s2), true);
13064 if (needs_start_instance)
13065 bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
13066 }
13067
13068 wait_for_smem_loads(bld);
13069
13070 for (unsigned i = 0; i < num_descs;) {
13071 PhysReg dest(attributes_start.reg() + loc * 4u);
13072
13073 /* calculate index */
13074 Operand fetch_index = Operand(vertex_index, v1);
13075 if (pinfo->instance_rate_inputs & (1u << loc)) {
13076 if (!(pinfo->zero_divisors & (1u << loc))) {
13077 fetch_index = instance_id;
13078 if (pinfo->nontrivial_divisors & (1u << loc)) {
13079 unsigned index = util_bitcount(pinfo->nontrivial_divisors & BITFIELD_MASK(loc));
13080 fetch_index = calc_nontrivial_instance_id(
13081 bld, args, pinfo, index, instance_id, start_instance, prolog_input,
13082 nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
13083 } else {
13084 fetch_index = Operand(instance_index, v1);
13085 }
13086 } else {
13087 fetch_index = Operand(start_instance_vgpr, v1);
13088 }
13089 }
13090
13091 /* perform load */
13092 PhysReg cur_desc = desc.advance(i * 16);
13093 if ((pinfo->misaligned_mask & (1u << loc))) {
13094 const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
13095
13096 assert(vtx_info->has_hw_format & 0x1);
13097 unsigned dfmt = vtx_info->hw_format[0] & 0xf;
13098 unsigned nfmt = vtx_info->hw_format[0] >> 4;
13099
13100 for (unsigned j = 0; j < (vtx_info->chan_byte_size ? vtx_info->num_channels : 1); j++) {
13101 bool post_shuffle = pinfo->post_shuffle & (1u << loc);
13102 unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
13103
13104 if ((pinfo->unaligned_mask & (1u << loc)) && vtx_info->chan_byte_size <= 4)
13105 load_unaligned_vs_attrib(bld, dest.advance(j * 4u), Operand(cur_desc, s4),
13106 fetch_index, offset, vtx_info, &unaligned_state);
13107 else if (vtx_info->chan_byte_size == 8)
13108 bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
13109 Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
13110 fetch_index, Operand::c32(offset), dfmt, nfmt, 0, false, true);
13111 else
13112 bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
13113 Operand(cur_desc, s4), fetch_index, Operand::c32(offset), dfmt, nfmt,
13114 0, false, true);
13115 }
13116
13117 unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
13118 loc += slots;
13119 i += slots;
13120 } else {
13121 bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
13122 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, true);
13123 loc++;
13124 i++;
13125 }
13126 }
13127 }
13128
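/* Attributes loaded channel-by-channel don't get the hardware's default components, so write the
 * missing ones manually: 0 for y/z and 1 (integer or float, depending on nfmt) for w.
 */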
13129 uint32_t constant_mask = pinfo->misaligned_mask;
13130 while (constant_mask) {
13131 unsigned loc = u_bit_scan(&constant_mask);
13132 const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
13133
13134 /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
13135 * For 64-bit data types, no default attribute values are provided. Input variables must
13136 * not use more components than provided by the attribute.
13137 */
13138 if (vtx_info->chan_byte_size == 8) {
13139 if (vtx_info->num_channels > 2)
13140 u_bit_scan(&constant_mask);
13141 continue;
13142 }
13143
13144 assert(vtx_info->has_hw_format & 0x1);
13145 unsigned nfmt = vtx_info->hw_format[0] >> 4;
13146
13147 uint32_t one = nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
13148 ? 1u
13149 : 0x3f800000u;
13150 PhysReg dest(attributes_start.reg() + loc * 4u);
13151 for (unsigned j = vtx_info->num_channels; j < 4; j++) {
13152 bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
13153 Operand::c32(j == 3 ? one : 0u));
13154 }
13155 }
13156
13157 convert_current_unaligned_vs_attribs(bld, &unaligned_state);
13158
13159 if (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)
13160 wait_for_vmem_loads(bld);
13161
13162 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
13163 * so we may need to fix it up. */
13164 u_foreach_bit (loc, (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)) {
13165 PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
13166
13167 unsigned alpha_adjust = (pinfo->alpha_adjust_lo >> loc) & 0x1;
13168 alpha_adjust |= ((pinfo->alpha_adjust_hi >> loc) & 0x1) << 1;
13169
13170 if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
13171 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
13172
13173 /* For the integer-like cases, do a natural sign extension.
13174 *
13175 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
13176 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
13177 * exponent.
13178 */
13179 unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
13180 bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
13181 Operand::c32(offset), Operand::c32(2u));
13182
13183 /* Convert back to the right type. */
13184 if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
13185 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
13186 bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
13187 Operand(alpha, v1));
13188 } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
13189 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
13190 }
13191 }
13192
13193 block->kind |= block_kind_uniform;
13194
13195 /* continue on to the main shader */
13196 Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
13197 if (has_nontrivial_divisors) {
13198 bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
13199 get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
13200 wait_for_smem_loads(bld);
13201 continue_pc = Operand(prolog_input, s2);
13202 }
13203
13204 bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
13205
13206 program->config->float_mode = program->blocks[0].fp_mode.val;
13207 /* addition on GFX6-8 requires a carry-out (we use VCC) */
13208 program->needs_vcc = program->gfx_level <= GFX8;
13209 program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
13210 program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
13211 }
13212
13213 void
13214 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
13215 const struct aco_compiler_options* options, const struct aco_shader_info* info,
13216 const struct ac_shader_args* args)
13217 {
13218 const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
13219 isel_context ctx =
13220 setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13221
13222 ctx.block->fp_mode = program->next_fp_mode;
13223
13224 add_startpgm(&ctx);
13225 append_logical_start(ctx.block);
13226
13227 Builder bld(ctx.program, ctx.block);
13228
13229 Temp colors[MAX_DRAW_BUFFERS][4];
13230 for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13231 if (!einfo->colors[i].used)
13232 continue;
13233
13234 Temp color = get_arg(&ctx, einfo->colors[i]);
13235 unsigned col_types = (einfo->color_types >> (i * 2)) & 0x3;
13236
13237 emit_split_vector(&ctx, color, col_types == ACO_TYPE_ANY32 ? 4 : 8);
13238 for (unsigned c = 0; c < 4; ++c) {
13239 colors[i][c] = emit_extract_vector(&ctx, color, c, col_types == ACO_TYPE_ANY32 ? v1 : v2b);
13240 }
13241
13242 emit_clamp_alpha_test(&ctx, einfo, colors[i], i);
13243 }
13244
13245 bool has_mrtz_depth = einfo->depth.used;
13246 bool has_mrtz_stencil = einfo->stencil.used;
13247 bool has_mrtz_samplemask = einfo->samplemask.used;
13248 bool has_mrtz_alpha = einfo->alpha_to_coverage_via_mrtz && einfo->colors[0].used;
13249 bool has_mrtz_export =
13250 has_mrtz_depth || has_mrtz_stencil || has_mrtz_samplemask || has_mrtz_alpha;
13251 if (has_mrtz_export) {
13252 Temp depth = has_mrtz_depth ? get_arg(&ctx, einfo->depth) : Temp();
13253 Temp stencil = has_mrtz_stencil ? get_arg(&ctx, einfo->stencil) : Temp();
13254 Temp samplemask = has_mrtz_samplemask ? get_arg(&ctx, einfo->samplemask) : Temp();
13255 Temp alpha = has_mrtz_alpha ? colors[0][3] : Temp();
13256
13257 export_fs_mrtz(&ctx, depth, stencil, samplemask, alpha);
13258 }
13259
13260 /* Export all color render targets */
13261 struct aco_export_mrt mrts[MAX_DRAW_BUFFERS];
13262 unsigned mrt_num = 0;
13263
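/* With broadcast_last_cbuf, color output 0 is replicated to color buffers 0..broadcast_last_cbuf;
 * otherwise color_map maps each hardware MRT slot to the written color output (0xff = unused).
 */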
13264 if (einfo->broadcast_last_cbuf) {
13265 for (unsigned i = 0; i <= einfo->broadcast_last_cbuf; i++) {
13266 struct aco_export_mrt* mrt = &mrts[mrt_num];
13267 if (export_fs_mrt_color(&ctx, einfo, colors[0], i, mrt))
13268 mrt->target += mrt_num++;
13269 }
13270 } else {
13271 for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13272 struct aco_export_mrt* mrt = &mrts[mrt_num];
13273 const uint8_t cb_idx = einfo->color_map[i];
13274
13275 if (cb_idx == 0xff || !einfo->colors[cb_idx].used)
13276 continue;
13277
13278 if (export_fs_mrt_color(&ctx, einfo, colors[cb_idx], i, mrt)) {
13279 mrt->target += mrt_num++;
13280 }
13281 }
13282 }
13283
13284 if (mrt_num) {
13285 if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
13286 assert(mrt_num == 2);
13287 create_fs_dual_src_export_gfx11(&ctx, &mrts[0], &mrts[1]);
13288 } else {
13289 for (unsigned i = 0; i < mrt_num; i++)
13290 export_mrt(&ctx, &mrts[i]);
13291 }
13292 } else if (!has_mrtz_export && !einfo->skip_null_export) {
13293 create_fs_null_export(&ctx);
13294 }
13295
13296 program->config->float_mode = program->blocks[0].fp_mode.val;
13297
13298 append_logical_end(ctx.block);
13299 ctx.block->kind |= block_kind_export_end;
13300 bld.reset(ctx.block);
13301 bld.sopp(aco_opcode::s_endpgm);
13302
13303 finish_program(&ctx);
13304 }
13305
13306 void
13307 select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
13308 const struct aco_compiler_options* options, const struct aco_shader_info* info,
13309 const struct ac_shader_args* args)
13310 {
13311 const struct aco_ps_prolog_info* finfo = (const struct aco_ps_prolog_info*)pinfo;
13312 isel_context ctx =
13313 setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13314
13315 ctx.block->fp_mode = program->next_fp_mode;
13316
13317 add_startpgm(&ctx);
13318 append_logical_start(ctx.block);
13319
13320 if (finfo->poly_stipple)
13321 emit_polygon_stipple(&ctx, finfo);
13322
13323 overwrite_interp_args(&ctx, finfo);
13324
13325 overwrite_samplemask_arg(&ctx, finfo);
13326
13327 std::vector<Operand> regs;
13328 passthrough_all_args(&ctx, regs);
13329
13330 interpolate_color_args(&ctx, finfo, regs);
13331
13332 program->config->float_mode = program->blocks[0].fp_mode.val;
13333
13334 append_logical_end(ctx.block);
13335
13336 build_end_with_regs(&ctx, regs);
13337
13338 /* Compute all end args in WQM mode if the main part requires it. */
13339 if (finfo->needs_wqm)
13340 set_wqm(&ctx, true);
13341
13342 /* Finally, exit WQM mode. */
13343 program->needs_exact = true;
13344
13345 finish_program(&ctx);
13346 }
13347
13348 } // namespace aco
13349