/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_instruction_selection.h"

#include "aco_builder.h"
#include "aco_interface.h"
#include "aco_ir.h"

#include "common/ac_descriptors.h"
#include "common/ac_gpu_info.h"
#include "common/ac_nir.h"
#include "common/sid.h"

#include "util/fast_idiv_by_const.h"
#include "util/memstream.h"

#include <array>
#include <functional>
#include <map>
#include <numeric>
#include <stack>
#include <utility>
#include <vector>

namespace aco {
namespace {

#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)

static void
_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
          const char* msg)
{
   char* out;
   size_t outsize;
   struct u_memstream mem;
   u_memstream_open(&mem, &out, &outsize);
   FILE* const memf = u_memstream_get(&mem);

   fprintf(memf, "%s: ", msg);
   nir_print_instr(instr, memf);
   u_memstream_close(&mem);

   _aco_err(ctx->program, file, line, out);
   free(out);
}

struct if_context {
   Temp cond;

   bool divergent_old;
   bool had_divergent_discard_old;
   bool had_divergent_discard_then;
   bool has_divergent_continue_old;
   bool has_divergent_continue_then;
   struct exec_info exec_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   Block BB_invert;
   Block BB_endif;
};

struct loop_context {
   Block loop_exit;

   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;
};

static void visit_cf_list(struct isel_context* ctx, struct exec_list* list);

static void
add_logical_edge(unsigned pred_idx, Block* succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}

static void
add_linear_edge(unsigned pred_idx, Block* succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void
add_edge(unsigned pred_idx, Block* succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void
append_logical_start(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void
append_logical_end(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}

Temp
get_ssa_temp(struct isel_context* ctx, nir_def* def)
{
   uint32_t id = ctx->first_temp_id + def->index;
   return Temp(id, ctx->program->temp_rc[id]);
}

static Builder
create_alu_builder(isel_context* ctx, nir_alu_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   bld.is_sz_preserve = nir_alu_instr_is_signed_zero_preserve(instr);
   bld.is_inf_preserve = nir_alu_instr_is_inf_preserve(instr);
   bld.is_nan_preserve = nir_alu_instr_is_nan_preserve(instr);
   return bld;
}

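/* Computes, for each lane, base plus the number of set bits of mask at positions
 * strictly below the lane's own ID (an exclusive prefix popcount across the wave).
 * E.g. with mask = exec and base = 0, lane 5 receives the number of active lanes
 * among lanes 0..4. */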
Temp
emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
{
   Builder bld(ctx->program, ctx->block);
   assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
   assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());

   if (ctx->program->wave_size == 32) {
      Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
      return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
   }

   Operand mask_lo = Operand::c32(-1u);
   Operand mask_hi = Operand::c32(-1u);

   if (mask.isTemp()) {
      RegClass rc = RegClass(mask.regClass().type(), 1);
      Builder::Result mask_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
      mask_lo = Operand(mask_split.def(0).getTemp());
      mask_hi = Operand(mask_split.def(1).getTemp());
   } else if (mask.physReg() == exec) {
      mask_lo = Operand(exec_lo, s1);
      mask_hi = Operand(exec_hi, s1);
   }

   Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);

   if (ctx->program->gfx_level <= GFX7)
      return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
   else
      return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
}

inline void
set_wqm(isel_context* ctx, bool enable_helpers = false)
{
   if (ctx->program->stage == fragment_fs) {
      ctx->wqm_block_idx = ctx->block->index;
      ctx->wqm_instruction_idx = ctx->block->instructions.size();
      if (ctx->shader)
         enable_helpers |= ctx->shader->info.fs.require_full_quads;
      ctx->program->needs_wqm |= enable_helpers;
   }
}

static Temp
emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
    * of multiple binaries, because the VGPR use is not known when choosing
    * which registers to use for the shared VGPRs.
    */
   const bool avoid_shared_vgprs =
      ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
      ctx->program->wave_size == 64 &&
      (ctx->program->info.ps.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
       ctx->program->info.vs.has_prolog || ctx->stage == raytracing_cs);

   if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
      /* GFX6-7: there is no bpermute instruction */
      return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
                        bld.def(bld.lm, vcc), index, data);
   } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {

      /* GFX10 wave64 mode: emulate full-wave bpermute */
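      /* ds_bpermute_b32 only moves data within a 32-lane half on GFX10 wave64, so
       * the pseudo-instructions below combine two half-wave permutes; same_half
       * records, per lane, whether the requested source lane lives in the same
       * half as the reading lane. */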
      Temp index_is_lo =
         bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
      Builder::Result index_is_lo_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
                                     index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                                     index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);

      if (ctx->options->gfx_level <= GFX10_3) {
         /* We need one pair of shared VGPRs. Note that these have twice the
          * allocation granularity of normal VGPRs.
          */
         ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;

         return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
                           bld.def(s1, scc), index_x4, data, same_half);
      } else {
         return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
                           bld.def(s1, scc), Operand(v1.as_linear()), index_x4, data, same_half);
      }
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
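      /* ds_bpermute_b32 addresses lanes in bytes (lane = index >> 2), hence the
       * shift by 2 below. */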
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}

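/* Applies a ds_swizzle-style lane swizzle: within each group of 32 lanes, lane x
 * reads lane ((x & and_mask) | or_mask) ^ xor_mask, with the masks packed as
 * mask = xor_mask[4:0] << 10 | or_mask[4:0] << 5 | and_mask[4:0]. On GFX8+ the
 * swizzle is lowered to cheaper DPP/permlane variants where a pattern matches. */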
static Temp
emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask, bool allow_fi)
{
   if (ctx->options->gfx_level >= GFX8) {
      unsigned and_mask = mask & 0x1f;
      unsigned or_mask = (mask >> 5) & 0x1f;
      unsigned xor_mask = (mask >> 10) & 0x1f;

      /* Eliminate or_mask. */
      and_mask &= ~or_mask;
      xor_mask ^= or_mask;
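      /* Exact because ((x & a) | o) ^ k == (x & (a & ~o)) ^ (k ^ o): the OR forces
       * the o bits to 1, which XOR-ing k with o reproduces. */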

      uint16_t dpp_ctrl = 0xffff;

      /* DPP16 before DPP8 before v_permlane(x)16_b32
       * because DPP16 supports modifiers and v_permlane
       * can't be folded into valu instructions.
       */
      if ((and_mask & 0x1c) == 0x1c && xor_mask < 4) {
         unsigned res[4];
         for (unsigned i = 0; i < 4; i++)
            res[i] = ((i & and_mask) ^ xor_mask);
         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      } else if (and_mask == 0x1f && xor_mask == 8) {
         dpp_ctrl = dpp_row_rr(8);
      } else if (and_mask == 0x1f && xor_mask == 0xf) {
         dpp_ctrl = dpp_row_mirror;
      } else if (and_mask == 0x1f && xor_mask == 0x7) {
         dpp_ctrl = dpp_row_half_mirror;
      } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && xor_mask < 0x10) {
         dpp_ctrl = dpp_row_share(xor_mask);
      } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) {
         dpp_ctrl = dpp_row_xmask(xor_mask);
      } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) {
         uint32_t lane_sel = 0;
         for (unsigned i = 0; i < 8; i++)
            lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3);
         return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel, allow_fi);
      } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) {
         uint64_t lane_mask = 0;
         for (unsigned i = 0; i < 16; i++)
            lane_mask |= uint64_t((i & and_mask) ^ (xor_mask & 0xf)) << i * 4;
         aco_opcode opcode =
            xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
         Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
         Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
         Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
         ret->valu().opsel[0] = allow_fi; /* set FETCH_INACTIVE */
         ret->valu().opsel[1] = true;     /* set BOUND_CTRL */
         return ret;
      }

      if (dpp_ctrl != 0xffff)
         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl, 0xf, 0xf, true,
                             allow_fi);
   }

   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}

Temp
as_vgpr(Builder& bld, Temp val)
{
   if (val.type() == RegType::sgpr)
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   assert(val.type() == RegType::vgpr);
   return val;
}

Temp
as_vgpr(isel_context* ctx, Temp val)
{
   Builder bld(ctx->program, ctx->block);
   return as_vgpr(bld, val);
}

void
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
}

Temp
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}

void
emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Instruction> split{
      create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = ctx->program->allocateTmp(rc);
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}

/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
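/* E.g. num_components = 4, mask = 0b0101 places the two source components at dst
 * positions 0 and 2. */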
void
expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
              bool zero_padding = false)
{
   assert(vec_src.type() == RegType::vgpr);
   Builder bld(ctx->program, ctx->block);

   if (dst.type() == RegType::sgpr && num_components > dst.size()) {
      Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
      expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
      ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
      return;
   }

   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_bytes = dst.bytes() / num_components;
   RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
   RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
   assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;

   Temp padding = Temp(0, dst_rc);
   if (zero_padding)
      padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));

   aco_ptr<Instruction> vec{
      create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
         elems[i] = src;
      } else {
         vec->operands[i] = Operand::zero(component_bytes);
         elems[i] = padding;
      }
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}

/* adjust misaligned small bit size loads */
void
byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand::c32(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
                       Operand::c32(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 3 || vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi;
      if (vec.size() == 3) {
         /* this can happen if we use VMEM for a uniform load */
         hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      } else {
         hi = bld.tmp(s2);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
         hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
      }
      if (select != Temp())
         hi =
            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}

void
byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
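      /* v_alignbyte_b32 extracts one dword from the 64-bit concatenation
       * {src0, src1} starting at byte offset src2, i.e. a funnel shift right by
       * 8 * offset bits per output dword. */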
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand::zero();
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   if (dst.type() == RegType::vgpr) {
      /* if dst is vgpr - split the src and create a shrunk version according to the offset. */
      num_components = dst.bytes() / component_size;
      aco_ptr<Instruction> create_vec{
         create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   } else if (skip) {
      /* if dst is sgpr - split the src, but move the original to sgpr. */
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}

Temp
get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
{
   RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
   Temp tmp = get_ssa_temp(ctx, def);
   if (tmp.bytes() != rc.bytes())
      return emit_extract_vector(ctx, tmp, 0, rc);
   else
      return tmp;
}

Temp
bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
                   bld.scc(val));
}

Temp
bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
   return dst;
}

/**
 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 * src_bits and dst_bits are truncated.
 *
 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 * bit is indicated by src_bits in this case.
 *
 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
 */
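/* E.g. convert_int(ctx, bld, v, 8, 32, true) sign-extends the low byte of v to a
 * full dword, while convert_int(ctx, bld, v, 16, 64, false) zero-extends the low
 * 16 bits into a 64-bit pair. */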
Temp
convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
            bool sign_extend, Temp dst = Temp())
{
   assert(!(sign_extend && dst_bits < src_bits) &&
          "Shrinking integers is not supported for signed inputs");

   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
   assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);

   if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
      /* Copy the raw value, leaving an undefined value in the upper bits for
       * the caller to handle appropriately */
      return bld.copy(Definition(dst), src);
   } else if (dst.bytes() < src.bytes()) {
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
   }

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
                 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
   } else {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
                 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
   }

   if (dst_bits == 64) {
      if (sign_extend && dst.regClass() == s2) {
         Temp high =
            bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (sign_extend && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
      }
   }

   return dst;
}

enum sgpr_extract_mode {
   sgpr_extract_sext,
   sgpr_extract_zext,
   sgpr_extract_undef,
};

Temp
extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
{
   Temp vec = get_ssa_temp(ctx, src->src.ssa);
   unsigned src_size = src->src.ssa->bit_size;
   unsigned swizzle = src->swizzle[0];

   if (vec.size() > 1) {
      assert(src_size == 16);
      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
      swizzle = swizzle & 1;
   }

   Builder bld(ctx->program, ctx->block);
   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;

   if (mode == sgpr_extract_undef && swizzle == 0)
      bld.copy(Definition(tmp), vec);
   else
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
                 Operand::c32(swizzle), Operand::c32(src_size),
                 Operand::c32((mode == sgpr_extract_sext)));

   if (dst.regClass() == s2)
      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);

   return dst;
}

Temp
get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
{
   if (src.src.ssa->num_components == 1 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = src.src.ssa->bit_size / 8u;
   bool identity_swizzle = true;

   for (unsigned i = 0; identity_swizzle && i < size; i++) {
      if (src.swizzle[i] != i)
         identity_swizzle = false;
   }
   if (identity_swizzle)
      return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));

   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
                                           sgpr_extract_undef);
   }

   bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
   if (as_uniform)
      vec = as_vgpr(ctx, vec);

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
                                    : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Instruction> vec_instr{
         create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return as_uniform ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
   }
}

Temp
get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
{
   /* returns v2b or v1 for vop3p usage.
    * The source expects exactly two 16-bit components
    * which are within the same dword.
    */
   assert(src.src.ssa->bit_size == 16);
   assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);

   Temp tmp = get_ssa_temp(ctx, src.src.ssa);
   if (tmp.size() == 1)
      return tmp;

   /* the size is larger than 1 dword: check the swizzle */
   unsigned dword = src.swizzle[0] >> 1;

   /* extract a full dword if possible */
   if (tmp.bytes() >= (dword + 1) * 4) {
      /* if the source is split into components, use p_create_vector */
      auto it = ctx->allocated_vec.find(tmp.id());
      if (it != ctx->allocated_vec.end()) {
         unsigned index = dword << 1;
         Builder bld(ctx->program, ctx->block);
         if (it->second[index].regClass() == v2b)
            return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
                              it->second[index + 1]);
      }
      return emit_extract_vector(ctx, tmp, dword, v1);
   } else {
      /* This must be a swizzled access to %a.zz where %a is v6b */
      assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
      assert(tmp.regClass() == v6b && dword == 1);
      return emit_extract_vector(ctx, tmp, dword * 2, v2b);
   }
}

uint32_t
get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
{
   nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
   return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
}

Temp
convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr && !non_uniform)
      ptr = bld.as_uniform(ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
                     Operand::c32((unsigned)ctx->options->address32_hi));
}

void
emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                      bool writes_scc, uint8_t uses_ub = 0)
{
   Builder bld = create_alu_builder(ctx, instr);
   bld.is_nuw = instr->no_unsigned_wrap;

   Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])),
                          Operand(get_alu_src(ctx, instr->src[1]))};
   u_foreach_bit (i, uses_ub) {
      uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
      if (src_ub <= 0xffff)
         operands[i].set16bit(true);
      else if (src_ub <= 0xffffff)
         operands[i].set24bit(true);
   }

   if (writes_scc)
      bld.sop2(op, Definition(dst), bld.def(s1, scc), operands[0], operands[1]);
   else
      bld.sop2(op, Definition(dst), operands[0], operands[1]);
}

void
emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
                      bool commutative, bool swap_srcs = false, bool flush_denorms = false,
                      bool nuw = false, uint8_t uses_ub = 0)
{
   Builder bld = create_alu_builder(ctx, instr);
   bld.is_nuw = nuw;

   Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])),
                          Operand(get_alu_src(ctx, instr->src[1]))};
   u_foreach_bit (i, uses_ub) {
      uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
      if (src_ub <= 0xffff)
         operands[i].set16bit(true);
      else if (src_ub <= 0xffffff)
         operands[i].set24bit(true);
   }

   if (swap_srcs)
      std::swap(operands[0], operands[1]);

   if (operands[1].isOfType(RegType::sgpr)) {
      if (commutative && operands[0].isOfType(RegType::vgpr)) {
         std::swap(operands[0], operands[1]);
      } else {
         operands[1] = bld.copy(bld.def(RegType::vgpr, operands[1].size()), operands[1]);
      }
   }

   if (flush_denorms && ctx->program->gfx_level < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(opc, bld.def(dst.regClass()), operands[0], operands[1]);
      if (dst.bytes() == 2)
         bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
      else
         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
   } else {
      bld.vop2(opc, Definition(dst), operands[0], operands[1]);
   }
}

void
emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld = create_alu_builder(ctx, instr);

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}

void
emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
{
   assert(num_sources == 2 || num_sources == 3);
   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
   bool has_sgpr = false;
   for (unsigned i = 0; i < num_sources; i++) {
      src[i] = get_alu_src(ctx, instr->src[(swap_srcs && i < 2) ? 1 - i : i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld = create_alu_builder(ctx, instr);
   if (flush_denorms && ctx->program->gfx_level < GFX9) {
      Temp tmp;
      if (num_sources == 3)
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
      else
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
      if (dst.size() == 1)
         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
      else
         bld.vop3(aco_opcode::v_mul_f64_e64, Definition(dst), Operand::c64(0x3FF0000000000000),
                  tmp);
   } else if (num_sources == 3) {
      bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
   } else {
      bld.vop3(op, Definition(dst), src[0], src[1]);
   }
}

Builder::Result
emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool swap_srcs = false)
{
   Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
   Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   assert(instr->def.num_components == 2);

   /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
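   /* Bit 0 of each opsel field selects the half-dword for src0, bit 1 for src1;
    * e.g. if src0 reads .x and src1 reads .y for the low result half,
    * opsel_lo = 0b10. */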
   unsigned opsel_lo =
      (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
   unsigned opsel_hi =
      (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);

   Builder bld = create_alu_builder(ctx, instr);
   Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
   emit_split_vector(ctx, dst, 2);
   return res;
}

void
emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
                      unsigned neg_lo = 0)
{
   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
   bool has_sgpr = false;
   for (unsigned i = 0; i < 3; i++) {
      src[i] = get_alu_src(ctx, instr->src[i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld = create_alu_builder(ctx, instr);
   VALU_instruction& vop3p =
      bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
   vop3p.clamp = clamp;
   vop3p.neg_lo = neg_lo;
}

void
emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld = create_alu_builder(ctx, instr);
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}

void
emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         op = get_vcmp_swapped(op);
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld = create_alu_builder(ctx, instr);
   bld.vopc(op, Definition(dst), src0, src1);
}

void
emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld = create_alu_builder(ctx, instr);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}

void
emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
                aco_opcode v32_op, aco_opcode v64_op, aco_opcode s16_op = aco_opcode::num_opcodes,
                aco_opcode s32_op = aco_opcode::num_opcodes,
                aco_opcode s64_op = aco_opcode::num_opcodes)
{
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? s32_op
                                                             : s16_op;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? v32_op
                                                             : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
                   get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
                   get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}

void
emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
                   Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}

void
select_vec2(isel_context* ctx, Temp dst, Temp cond, Temp then, Temp els)
{
   Builder bld(ctx->program, ctx->block);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
}

void
emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         select_vec2(ctx, dst, cond, then, els);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      return;
   }

   if (instr->def.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) &&
                els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op =
            dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools (dst = s0 ? s1 : s2)
    * as dst = (s0 & s1) | (~s0 & s2) */
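   /* Per lane: if the cond bit is set, the AND keeps the then bit while the ANDN2
    * contributes 0; otherwise the ANDN2 keeps the else bit. The OR merges both,
    * giving a bitwise mux over the lane masks. */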
   assert(instr->def.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.copy(Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}

void
emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode vop,
               aco_opcode sop, uint32_t undo)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      if (dst.regClass() == v1)
         bld.vop1(vop, dst, val);
      else if (ctx->options->gfx_level >= GFX12)
         bld.vop3(sop, dst, val);
      else
         bld.pseudo(aco_opcode::p_as_uniform, dst, bld.vop1(vop, bld.def(v1), val));
      return;
   }

   /* multiply by 16777216 to handle denormals */
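   /* 0x4b800000 is 16777216.0f (2^24): scaling a single-precision denormal by 2^24
    * yields a normal number, and `undo` rescales the result afterwards. The
    * v_cmp_class mask 1u << 4 (negative denormal) combined with the neg+abs
    * source modifiers tests whether |val| is denormal. */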
   Temp scale, unscale;
   if (val.regClass() == v1) {
      val = as_vgpr(bld, val);
      Temp is_denormal = bld.tmp(bld.lm);
      VALU_instruction& valu = bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal),
                                            val, Operand::c32(1u << 4))
                                  ->valu();
      valu.neg[0] = true;
      valu.abs[0] = true;
      scale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000),
                           bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), is_denormal);
      unscale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000),
                             bld.copy(bld.def(s1), Operand::c32(undo)), is_denormal);
   } else {
      Temp abs = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), val,
                          bld.copy(bld.def(s1), Operand::c32(0x7fffffff)));
      Temp denorm_cmp = bld.copy(bld.def(s1), Operand::c32(0x00800000));
      Temp is_denormal = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, denorm_cmp);
      scale = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
                       bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), Operand::c32(0x3f800000),
                       bld.scc(is_denormal));
      unscale =
         bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(undo)),
                  Operand::c32(0x3f800000), bld.scc(is_denormal));
   }

   if (dst.regClass() == v1) {
      Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), scale, as_vgpr(bld, val));
      scaled = bld.vop1(vop, bld.def(v1), scaled);
      bld.vop2(aco_opcode::v_mul_f32, dst, unscale, scaled);
   } else {
      assert(ctx->options->gfx_level >= GFX11_5);
      Temp scaled = bld.sop2(aco_opcode::s_mul_f32, bld.def(s1), scale, val);
      if (ctx->options->gfx_level >= GFX12)
         scaled = bld.vop3(sop, bld.def(s1), scaled);
      else
         scaled = bld.as_uniform(bld.vop1(vop, bld.def(v1), scaled));
      bld.sop2(aco_opcode::s_mul_f32, dst, unscale, scaled);
   }
}

void
emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, aco_opcode::v_s_rcp_f32, 0x4b800000u);
}

void
emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, aco_opcode::v_s_rsq_f32, 0x45800000u);
}

void
emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, aco_opcode::v_s_sqrt_f32,
                  0x39800000u);
}

void
emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, aco_opcode::v_s_log_f32, 0xc1c00000u);
}
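/* For reference, the undo constants encode: 0x4b800000 = 2^24 (rcp: 1/(x*2^24)
 * needs a further *2^24), 0x45800000 = 2^12 (rsq), 0x39800000 = 2^-12 (sqrt),
 * and 0xc1c00000 = -24.0f (log2). */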

Temp
emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->gfx_level >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
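   /* The idea: for unbiased exponent e in [0, 51], clearing the low (52 - e)
    * mantissa bits truncates toward zero; e < 0 means |x| < 1.0, so the result is
    * a signed zero; e > 51 means x is already integral (or Inf/NaN) and is
    * returned unchanged. */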
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent =
      bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
                                Operand::c32(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
              fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 =
      bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
                          bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}

Temp
emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->gfx_level >= GFX7)
      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);

   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
    * lowered at NIR level for precision reasons). */
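   /* floor(x) = x - min(fract(x), min_val), where min_val (0x3FEFFFFFFFFFFFFF) is
    * the largest double below 1.0; clamping fract keeps the subtraction from
    * overshooting, and NaN inputs select the source itself so NaN propagates. */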
   Temp src0 = as_vgpr(ctx, val);

   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
                             Operand::c32(0x3fefffffu));

   Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64_e64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src0, v);
   add->valu().neg[1] = true;

   return add->definitions[0].getTemp();
}

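/* Saturating 32-bit unsigned add. Before GFX8 there is no clamp bit on the
 * VALU add, so the carry-out selects 0xffffffff instead; from GFX8 on the
 * clamp bit saturates directly (GFX9+ additionally has a carry-less
 * v_add_u32). usub32_sat below is the mirror image, clamping to 0 on
 * borrow. */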
Temp
uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
{
   if (bld.program->gfx_level < GFX8) {
      Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
      return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
                          add.def(1).getTemp());
   }

   Builder::Result add(NULL);
   if (bld.program->gfx_level >= GFX9) {
      add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
   } else {
      add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
   }
   add->valu().clamp = 1;
   return dst.getTemp();
}

Temp
usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
{
   if (bld.program->gfx_level < GFX8) {
      Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
      return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
                          sub.def(1).getTemp());
   }

   Builder::Result sub(NULL);
   if (bld.program->gfx_level >= GFX9) {
      sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
   } else {
      sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
   }
   sub->valu().clamp = 1;
   return dst.getTemp();
}

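/* Convert two f32 components into a packed pair of f16 with round-toward-zero
 * semantics. An SGPR destination uses the scalar pack instruction; VGPR
 * destinations use v_cvt_pkrtz_f16_f32, in its VOP3 encoding on GFX8/GFX9. */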
void
emit_vec2_f2f16(isel_context* ctx, nir_alu_instr* instr, Temp dst)
{
   Builder bld = create_alu_builder(ctx, instr);
   Temp src = get_ssa_temp(ctx, instr->src[0].src.ssa);
   RegClass rc = RegClass(src.regClass().type(), instr->src[0].src.ssa->bit_size / 32);
   Temp src0 = emit_extract_vector(ctx, src, instr->src[0].swizzle[0], rc);
   Temp src1 = emit_extract_vector(ctx, src, instr->src[0].swizzle[1], rc);

   if (dst.regClass() == s1) {
      bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src0, src1);
   } else {
      src1 = as_vgpr(ctx, src1);
      if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
         bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src0, src1);
      else
         bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
      emit_split_vector(ctx, dst, 2);
   }
}

void
visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
{
   Builder bld = create_alu_builder(ctx, instr);
   Temp dst = get_ssa_temp(ctx, &instr->def);
   switch (instr->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec5:
   case nir_op_vec8:
   case nir_op_vec16: {
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      unsigned num = instr->def.num_components;
      for (unsigned i = 0; i < num; ++i)
         elems[i] = get_alu_src(ctx, instr->src[i]);

      if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
         aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
                                                     instr->def.num_components, 1)};
         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u);
         for (unsigned i = 0; i < num; ++i) {
            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
               elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
            vec->operands[i] = Operand{elems[i]};
         }
         vec->definitions[0] = Definition(dst);
         ctx->block->instructions.emplace_back(std::move(vec));
         ctx->allocated_vec.emplace(dst.id(), elems);
      } else {
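         /* Sub-dword components with an SGPR destination are packed into
          * 32-bit words with shift/or; constant components are folded into
          * const_vals and or'd in at the end, and on GFX9+ two 16-bit halves
          * are combined with s_pack_ll_b32_b16 instead. For example
          * (illustrative, not from the original source), vec2(x, 0x3c00) with
          * 16-bit components becomes s_pack_ll_b32_b16(x, 0x3c00) on GFX9+
          * and (x & 0xffff) | 0x3c000000 via s_and/s_or otherwise. */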
         bool use_s_pack = ctx->program->gfx_level >= GFX9;
         Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));

         std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
         uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
         for (unsigned i = 0; i < num; i++) {
            unsigned packed_size = use_s_pack ? 16 : 32;
            unsigned idx = i * instr->def.bit_size / packed_size;
            unsigned offset = i * instr->def.bit_size % packed_size;
            if (nir_src_is_const(instr->src[i].src)) {
               const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
               continue;
            }
            if (nir_src_is_undef(instr->src[i].src))
               continue;

            if (offset != packed_size - instr->def.bit_size)
               elems[i] =
                  bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);

            if (offset)
               elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
                                   Operand::c32(offset));

            if (packed[idx].id())
               packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
                                      packed[idx]);
            else
               packed[idx] = elems[i];
         }

         if (use_s_pack) {
            for (unsigned i = 0; i < dst.size(); i++) {
               bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();

               if (packed[i * 2].id() && packed[i * 2 + 1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
                                       packed[i * 2 + 1]);
               else if (packed[i * 2 + 1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
                                       Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
               else if (packed[i * 2].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
                                       Operand::c32(const_vals[i * 2 + 1]));
               else
                  packed[i] = Temp(); /* Both constants, so reset the entry */

               if (same)
                  const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
               else
                  const_vals[i] = 0;
            }
         }

         for (unsigned i = 0; i < dst.size(); i++) {
            if (const_vals[i] && packed[i].id())
               packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                                    Operand::c32(const_vals[i]), packed[i]);
            else if (!packed[i].id())
               packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
         }

         if (dst.size() == 1)
            bld.copy(Definition(dst), packed[0]);
         else {
            aco_ptr<Instruction> vec{
               create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
            vec->definitions[0] = Definition(dst);
            for (unsigned i = 0; i < dst.size(); ++i)
               vec->operands[i] = Operand(packed[i]);
            bld.insert(std::move(vec));
         }
      }
      break;
   }
   case nir_op_mov: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
         /* use size() instead of bytes() for 8/16-bit */
         assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
      } else {
         assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
         bld.copy(Definition(dst), src);
      }
      break;
   }
   case nir_op_inot: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
      } else if (dst.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
      } else if (dst.type() == RegType::sgpr) {
         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iabs: {
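      /* iabs(x) is selected as max(x, 0 - x) on VALU (SALU has s_abs_i32).
       * In the packed 16-bit path the component swizzle is carried in the
       * opsel_lo/opsel_hi bits, which steer which 16-bit half of each
       * operand feeds each result half. */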
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Temp src = get_alu_src_vop3p(ctx, instr->src[0]);

         unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
         unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;

         Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
                              src, opsel_lo, opsel_hi);
         bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
         emit_split_vector(ctx, dst, 2);
         break;
      }
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
      } else if (dst.regClass() == v1) {
         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
                  bld.vsub32(bld.def(v1), Operand::zero(), src));
      } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         bld.vop3(
            aco_opcode::v_max_i16_e64, Definition(dst), src,
            bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
      } else if (dst.regClass() == v2b) {
         src = as_vgpr(ctx, src);
         bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
                  bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isign: {
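      /* isign(x) clamps x to the range [-1, 1]: s_max/s_min on SGPRs and
       * v_med3_i32(-1, x, 1) on VALU. The 64-bit variants instead combine
       * the sign-extended high word (0 or -1) with an x != 0 test. */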
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         Temp tmp =
            bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
         bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
      } else if (dst.regClass() == s2) {
         Temp neg =
            bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
         Temp neqz;
         if (ctx->program->gfx_level >= GFX8)
            neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
         else
            neqz =
               bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
                  .def(1)
                  .getTemp();
         /* SCC gets zero-extended to 64 bit */
         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
      } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
         bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
      } else if (dst.regClass() == v2b) {
         src = as_vgpr(ctx, src);
         bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
                  bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
      } else if (dst.regClass() == v2) {
         Temp upper = emit_extract_vector(ctx, src, 1, v1);
         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imax: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umax: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imin: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umin: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ior: {
      if (instr->def.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iand: {
      if (instr->def.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_and, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ixor: {
      if (instr->def.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ushr: {
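      /* The VALU *rev shift opcodes take the shift amount as the first
       * operand, so the NIR sources are passed in swapped order here (and
       * the emit_* helpers are invoked with their swap flag for the same
       * reason). */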
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
         bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
                  get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ishl: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
                               false, 1);
      } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
         bld.vop3(aco_opcode::v_lshlrev_b64_e64, Definition(dst), get_alu_src(ctx, instr->src[1]),
                  get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ishr: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
         bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
                  get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_find_lsb: {
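      /* find_lsb maps to the ff1/ffbl instructions, which return -1 when no
       * bit is set. The 64-bit VGPR path combines the halves as
       * min(ffbl(lo), ffbl(hi) | 32): ffbl(hi) is at most 31 when a bit is
       * found, so the OR acts as +32, and if lo == 0 its all-ones ffbl
       * result loses the min unless hi is also zero, in which case
       * 0xffffffff | 32 keeps the result at -1. */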
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1) {
         bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
      } else if (src.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
      } else if (src.regClass() == s2) {
         bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
      } else if (src.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi);
         hi = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32u), hi);
         bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ufind_msb:
   case nir_op_ifind_msb: {
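      /* The flbit/ffbh instructions count from the MSB side, so the result
       * is flipped with (bits - 1) - msb_rev. When no bit is found the
       * hardware returns -1; the subtraction then borrows, and the
       * carry-out selects -1 as the final result. */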
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1 || src.regClass() == s2) {
         aco_opcode op = src.regClass() == s2
                            ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
                                                             : aco_opcode::s_flbit_i32_i64)
                            : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
                                                             : aco_opcode::s_flbit_i32);
         Temp msb_rev = bld.sop1(op, bld.def(s1), src);

         Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                                        Operand::c32(src.size() * 32u - 1u), msb_rev);
         Temp msb = sub.def(0).getTemp();
         Temp carry = sub.def(1).getTemp();

         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
                  bld.scc(carry));
      } else if (src.regClass() == v1) {
         aco_opcode op =
            instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
         Temp msb_rev = bld.tmp(v1);
         emit_vop1_instruction(ctx, instr, op, msb_rev);
         Temp msb = bld.tmp(v1);
         Temp carry =
            bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
      } else if (src.regClass() == v2) {
         aco_opcode op =
            instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;

         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);

         lo = bld.vop1(op, bld.def(v1), lo);
         lo = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32), lo);
         hi = bld.vop1(op, bld.def(v1), hi);
         Temp msb_rev = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), lo, hi);

         Temp msb = bld.tmp(v1);
         Temp carry =
            bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ufind_msb_rev:
   case nir_op_ifind_msb_rev: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1) {
         aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
                                                           : aco_opcode::s_flbit_i32;
         bld.sop1(op, Definition(dst), src);
      } else if (src.regClass() == v1) {
         aco_opcode op =
            instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
         emit_vop1_instruction(ctx, instr, op, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_bitfield_reverse: {
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v1) {
         bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iadd: {
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
         break;
      } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
         break;
      } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
         break;
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
         if (instr->no_unsigned_wrap)
            bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
         else
            bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
         break;
      }

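      /* 64-bit addition: split both sources and add the halves with a carry
       * chain (s_add_u32/s_addc_u32 on SALU, vadd32 with carry-out/carry-in
       * on VALU). */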
      assert(src0.size() == 2 && src1.size() == 2);
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         Temp dst0 =
            bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
                              bld.scc(carry));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         Temp dst0 = bld.tmp(v1);
         Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
         Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_uadd_sat: {
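      /* Unsigned saturating add. SALU has no clamp bit, so the carry-out
       * selects 0xffffffff via s_cselect; on VALU the clamp bit saturates
       * directly. The 64-bit paths run the usual carry chain and saturate
       * on the final carry-out. */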
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
         add_instr->valu().clamp = 1;
         break;
      }
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
                  bld.scc(carry));
         break;
      } else if (dst.regClass() == v2b) {
         Instruction* add_instr;
         if (ctx->program->gfx_level >= GFX10) {
            add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
         } else {
            if (src1.type() == RegType::sgpr)
               std::swap(src0, src1);
            add_instr =
               bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
         }
         add_instr->valu().clamp = 1;
         break;
      } else if (dst.regClass() == v1) {
         uadd32_sat(bld, Definition(dst), src0, src1);
         break;
      }

      assert(src0.size() == 2 && src1.size() == 2);

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(src0.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(src1.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         Temp carry0 = bld.tmp(s1);
         Temp carry1 = bld.tmp(s1);

         Temp no_sat0 =
            bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
         Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
                                 src01, src11, bld.scc(carry0));

         Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);

         bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
                  bld.scc(carry1));
      } else if (dst.regClass() == v2) {
         Temp no_sat0 = bld.tmp(v1);
         Temp dst0 = bld.tmp(v1);
         Temp dst1 = bld.tmp(v1);

         Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
         Temp carry1;

         if (ctx->program->gfx_level >= GFX8) {
            carry1 = bld.tmp(bld.lm);
            bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
                         as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
               ->valu()
               .clamp = 1;
         } else {
            Temp no_sat1 = bld.tmp(v1);
            carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
            bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
                         carry1);
         }

         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
                      carry1);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iadd_sat: {
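      /* Signed saturating add. The scalar path computes the saturation
       * bound as INT32_MAX + (src1 < 0): e.g. (illustrative) for a negative
       * src1 this wraps to 0x80000000 == INT32_MIN. The signed-overflow
       * flag from s_add_i32 then selects the bound. The VALU paths use
       * v_add_i16/v_add_i32 with the clamp bit instead. */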
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
         add_instr->valu().clamp = 1;
         break;
      }
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
         Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
                               Operand::c32(INT32_MAX), cond);
         Temp overflow = bld.tmp(s1);
         Temp add =
            bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
         break;
      }

      src1 = as_vgpr(ctx, src1);

      if (dst.regClass() == v2b) {
         Instruction* add_instr =
            bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
         add_instr->valu().clamp = 1;
      } else if (dst.regClass() == v1) {
         Instruction* add_instr =
            bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
         add_instr->valu().clamp = 1;
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_uadd_carry: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      }
      if (dst.regClass() == v1) {
         Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
                      carry);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
                          bld.scc(carry))
                    .def(1)
                    .getTemp();
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
      } else if (dst.regClass() == v2) {
         Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
         carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
         carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                              Operand::c32(1u), carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isub: {
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
         break;
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v1) {
         bld.vsub32(Definition(dst), src0, src1);
         break;
      } else if (dst.bytes() <= 2) {
         if (ctx->program->gfx_level >= GFX10)
            bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
         else if (src1.type() == RegType::sgpr)
            bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
         else if (ctx->program->gfx_level >= GFX8)
            bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
         else
            bld.vsub32(Definition(dst), src0, src1);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp borrow = bld.tmp(s1);
         Temp dst0 =
            bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
                              bld.scc(borrow));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         Temp lower = bld.tmp(v1);
         Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
         Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_usub_borrow: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      } else if (dst.regClass() == v1) {
         Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
                      borrow);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp borrow = bld.tmp(s1);
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
         borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
                           bld.scc(borrow))
                     .def(1)
                     .getTemp();
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
      } else if (dst.regClass() == v2) {
         Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
         borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
         borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                               Operand::c32(1u), borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_usub_sat: {
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
         sub_instr->valu().clamp = 1;
         break;
      }
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
         break;
      } else if (dst.regClass() == v2b) {
         Instruction* sub_instr;
         if (ctx->program->gfx_level >= GFX10) {
            sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
         } else {
            aco_opcode op = aco_opcode::v_sub_u16;
            if (src1.type() == RegType::sgpr) {
               std::swap(src0, src1);
               op = aco_opcode::v_subrev_u16;
            }
            sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
         }
         sub_instr->valu().clamp = 1;
         break;
      } else if (dst.regClass() == v1) {
         usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
         break;
      }

      assert(src0.size() == 2 && src1.size() == 2);
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(src0.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(src1.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         Temp carry0 = bld.tmp(s1);
         Temp carry1 = bld.tmp(s1);

         Temp no_sat0 =
            bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
         Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
                                 src01, src11, bld.scc(carry0));

         Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);

         bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
                  bld.scc(carry1));
      } else if (dst.regClass() == v2) {
         Temp no_sat0 = bld.tmp(v1);
         Temp dst0 = bld.tmp(v1);
         Temp dst1 = bld.tmp(v1);

         Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
         Temp carry1;

         if (ctx->program->gfx_level >= GFX8) {
            carry1 = bld.tmp(bld.lm);
            bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
                         as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
               ->valu()
               .clamp = 1;
         } else {
            Temp no_sat1 = bld.tmp(v1);
            carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
            bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
                         carry1);
         }

         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
                      carry1);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isub_sat: {
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
         sub_instr->valu().clamp = 1;
         break;
      }
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
         Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
                               Operand::c32(INT32_MAX), cond);
         Temp overflow = bld.tmp(s1);
         Temp sub =
            bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
         break;
      }

      src1 = as_vgpr(ctx, src1);

      if (dst.regClass() == v2b) {
         Instruction* sub_instr =
            bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
         sub_instr->valu().clamp = 1;
      } else if (dst.regClass() == v1) {
         Instruction* sub_instr =
            bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
         sub_instr->valu().clamp = 1;
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imul: {
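      /* 32-bit VGPR multiply: the generic v_mul_lo_u32 is comparatively
       * expensive, so when the NIR upper-bound analysis proves both sources
       * fit in 24 bits the faster v_mul_u32_u24 is used, and known-constant
       * sources go through the v_mul_imm helper (which, presumably, can
       * expand to cheaper shift/add sequences). */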
      if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
      } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
      } else if (dst.type() == RegType::vgpr) {
         uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
         uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);

         if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
            bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
            emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
                                  true /* commutative */, false, false, nuw_16bit, 0x3);
         } else if (nir_src_is_const(instr->src[0].src)) {
            bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
                          nir_src_as_uint(instr->src[0].src), false);
         } else if (nir_src_is_const(instr->src[1].src)) {
            bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
                          nir_src_as_uint(instr->src[1].src), false);
         } else {
            emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
         }
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umul_high: {
      if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
      } else if (dst.bytes() == 4) {
         uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
         uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);

         Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
         if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
            emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
         } else {
            emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
         }

         if (dst.regClass() == s1)
            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imul_high: {
      if (dst.regClass() == v1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
      } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
      } else if (dst.regClass() == s1) {
         Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
                             as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fmul: {
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64_e64, dst);
      } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f16, dst, false);
      } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f32, dst, false);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fmulz: {
      if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fadd: {
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64_e64, dst);
      } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f16, dst, false);
      } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f32, dst, false);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fsub: {
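      /* fsub is selected as a plain subtract where an opcode exists
       * (v_subrev when the non-VGPR source has to come first); the packed
       * f16 and f64 paths instead emit an add with the negate modifier set
       * on the second source. */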
2362       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2363          Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2364          VALU_instruction& sub = add->valu();
2365          sub.neg_lo[1] = true;
2366          sub.neg_hi[1] = true;
2367          break;
2368       }
2369 
2370       Temp src0 = get_alu_src(ctx, instr->src[0]);
2371       Temp src1 = get_alu_src(ctx, instr->src[1]);
2372       if (dst.regClass() == v2b) {
2373          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2374             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2375          else
2376             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2377       } else if (dst.regClass() == v1) {
2378          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2379             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2380          else
2381             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2382       } else if (dst.regClass() == v2) {
2383          Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), as_vgpr(ctx, src0),
2384                                      as_vgpr(ctx, src1));
2385          add->valu().neg[1] = true;
2386       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2387          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f16, dst, false);
2388       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2389          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f32, dst, false);
2390       } else {
2391          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2392       }
2393       break;
2394    }
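    /* A sketch of the v_sub/v_subrev selection above, assuming the usual VOP2
     * constraint that only operand 0 may hold an SGPR or constant: when src0
     * is the VGPR and src1 is not, the operands are swapped so src1 can take
     * slot 0, and v_subrev (which computes src1 - src0) restores the intended
     * difference:
     *
     *    if (src1 is VGPR || src0 is not VGPR)
     *       dst = v_sub(src0, src1);      // src0 - src1
     *    else
     *       dst = v_subrev(src1, src0);   // also src0 - src1
     */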
2395    case nir_op_ffma: {
2396       if (dst.regClass() == v2b) {
2397          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2398       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2399          assert(instr->def.num_components == 2);
2400 
2401          Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2402          Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2403          Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2404 
2405          /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
2406          unsigned opsel_lo = 0, opsel_hi = 0;
2407          for (unsigned i = 0; i < 3; i++) {
2408             opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2409             opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2410          }
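             /* opsel_lo/opsel_hi are 3-bit masks: bit i selects whether operand i
              * contributes its low (0) or high (1) 16-bit half to the low/high
              * lane of the packed result, so a .y swizzle on source i sets bit i. */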
2411 
2412          bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2413          emit_split_vector(ctx, dst, 2);
2414       } else if (dst.regClass() == v1) {
2415          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2416                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2417       } else if (dst.regClass() == v2) {
2418          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2419       } else if (dst.regClass() == s1) {
2420          Temp src0 = get_alu_src(ctx, instr->src[0]);
2421          Temp src1 = get_alu_src(ctx, instr->src[1]);
2422          Temp src2 = get_alu_src(ctx, instr->src[2]);
2423          aco_opcode op =
2424             instr->def.bit_size == 16 ? aco_opcode::s_fmac_f16 : aco_opcode::s_fmac_f32;
2425          bld.sop2(op, Definition(dst), src0, src1, src2);
2426       } else {
2427          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2428       }
2429       break;
2430    }
2431    case nir_op_ffmaz: {
2432       if (dst.regClass() == v1) {
2433          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2434                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2435       } else {
2436          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2437       }
2438       break;
2439    }
2440    case nir_op_fmax: {
2441       if (dst.regClass() == v2b) {
2442          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true, false,
2443                                ctx->block->fp_mode.must_flush_denorms16_64);
2444       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2445          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2446       } else if (dst.regClass() == v1) {
2447          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2448                                ctx->block->fp_mode.must_flush_denorms32);
2449       } else if (dst.regClass() == v2) {
2450          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64_e64, dst,
2451                                 ctx->block->fp_mode.must_flush_denorms16_64);
2452       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2453          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f16, dst, false);
2454       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2455          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f32, dst, false);
2456       } else {
2457          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2458       }
2459       break;
2460    }
2461    case nir_op_fmin: {
2462       if (dst.regClass() == v2b) {
2463          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true, false,
2464                                ctx->block->fp_mode.must_flush_denorms16_64);
2465       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2466          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2467       } else if (dst.regClass() == v1) {
2468          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2469                                ctx->block->fp_mode.must_flush_denorms32);
2470       } else if (dst.regClass() == v2) {
2471          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64_e64, dst,
2472                                 ctx->block->fp_mode.must_flush_denorms16_64);
2473       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2474          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f16, dst, false);
2475       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2476          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f32, dst, false);
2477       } else {
2478          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2479       }
2480       break;
2481    }
2482    case nir_op_sdot_4x8_iadd: {
2483       if (ctx->options->gfx_level >= GFX11)
2484          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2485       else
2486          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2487       break;
2488    }
2489    case nir_op_sdot_4x8_iadd_sat: {
2490       if (ctx->options->gfx_level >= GFX11)
2491          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2492       else
2493          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2494       break;
2495    }
2496    case nir_op_sudot_4x8_iadd: {
2497       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2498       break;
2499    }
2500    case nir_op_sudot_4x8_iadd_sat: {
2501       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2502       break;
2503    }
2504    case nir_op_udot_4x8_uadd: {
2505       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2506       break;
2507    }
2508    case nir_op_udot_4x8_uadd_sat: {
2509       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2510       break;
2511    }
2512    case nir_op_sdot_2x16_iadd: {
2513       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2514       break;
2515    }
2516    case nir_op_sdot_2x16_iadd_sat: {
2517       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2518       break;
2519    }
2520    case nir_op_udot_2x16_uadd: {
2521       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2522       break;
2523    }
2524    case nir_op_udot_2x16_uadd_sat: {
2525       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2526       break;
2527    }
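    /* A hedged reading of the dot-product cases above, inferred from the call
     * sites rather than from emit_idot_instruction itself: the boolean selects
     * the saturating variant, and the trailing mask on v_dot4_i32_iu8 appears
     * to mark which packed sources are signed (0x3 = both signed for sdot,
     * 0x1 = only the first for sudot) on GFX11+. */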
2528    case nir_op_cube_amd: {
2529       Temp in = get_alu_src(ctx, instr->src[0], 3);
2530       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2531                      emit_extract_vector(ctx, in, 2, v1)};
2532       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2533       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2534       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2535       Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2536       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2537       break;
2538    }
2539    case nir_op_bcsel: {
2540       emit_bcsel(ctx, instr, dst);
2541       break;
2542    }
2543    case nir_op_frsq: {
2544       if (instr->def.bit_size == 16) {
2545          if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2546             bld.vop3(aco_opcode::v_s_rsq_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2547          else
2548             emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2549       } else if (instr->def.bit_size == 32) {
2550          emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2551       } else if (instr->def.bit_size == 64) {
2552          /* Lowered at NIR level for precision reasons. */
2553          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2554       } else {
2555          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2556       }
2557       break;
2558    }
2559    case nir_op_fneg: {
2560       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2561          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2562          Instruction* vop3p =
2563             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2564                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2565          vop3p->valu().neg_lo[0] = true;
2566          vop3p->valu().neg_hi[0] = true;
2567          emit_split_vector(ctx, dst, 2);
2568          break;
2569       }
2570       Temp src = get_alu_src(ctx, instr->src[0]);
2571       if (dst.regClass() == v2b) {
2572          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2573       } else if (dst.regClass() == v1) {
2574          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2575                   as_vgpr(ctx, src));
2576       } else if (dst.regClass() == v2) {
2577          if (ctx->block->fp_mode.must_flush_denorms16_64)
2578             src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2579                            as_vgpr(ctx, src));
2580          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2581          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2582          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2583          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2584       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2585          bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0xbc00u), src);
2586       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2587          bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0xbf800000u), src);
2588       } else {
2589          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2590       }
2591       break;
2592    }
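    /* The f64 path above avoids a full double multiply: IEEE-754 negation only
     * flips the sign bit, so XORing the high dword suffices. In scalar terms:
     *
     *    hi ^= 0x80000000u;   // flips bit 63 of the 64-bit value
     *
     * The preceding multiply by 1.0 exists only to flush denormals when the
     * FP mode requires it. */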
2593    case nir_op_fabs: {
2594       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2595          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2596          Instruction* vop3p =
2597             bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2598                       instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2599                .instr;
2600          vop3p->valu().neg_lo[1] = true;
2601          vop3p->valu().neg_hi[1] = true;
2602          emit_split_vector(ctx, dst, 2);
2603          break;
2604       }
2605       Temp src = get_alu_src(ctx, instr->src[0]);
2606       if (dst.regClass() == v2b) {
2607          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2608                                          Operand::c16(0x3c00), as_vgpr(ctx, src))
2609                                .instr;
2610          mul->valu().abs[1] = true;
2611       } else if (dst.regClass() == v1) {
2612          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2613                                          Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2614                                .instr;
2615          mul->valu().abs[1] = true;
2616       } else if (dst.regClass() == v2) {
2617          if (ctx->block->fp_mode.must_flush_denorms16_64)
2618             src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2619                            as_vgpr(ctx, src));
2620          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2621          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2622          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2623          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2624       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2625          Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fff));
2626          if (ctx->block->fp_mode.denorm16_64 == fp_denorm_keep) {
2627             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src);
2628          } else {
2629             Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src);
2630             bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
2631          }
2632       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2633          Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff));
2634          if (ctx->block->fp_mode.denorm32 == fp_denorm_keep) {
2635             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src);
2636          } else {
2637             Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src);
2638             bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0x3f800000), tmp);
2639          }
2640       } else {
2641          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2642       }
2643       break;
2644    }
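    /* Same idea as fneg: f64 fabs just clears the sign bit of the high dword
     * (hi &= 0x7FFFFFFFu), the 16/32-bit VALU paths reuse a multiply by 1.0
     * with the input abs modifier set, and the SALU paths mask off the sign
     * bit, multiplying by 1.0 afterwards only when denormals must be flushed. */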
2645    case nir_op_fsat: {
2646       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2647          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2648          Instruction* vop3p =
2649             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2650                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2651          vop3p->valu().clamp = true;
2652          emit_split_vector(ctx, dst, 2);
2653          break;
2654       }
2655       Temp src = get_alu_src(ctx, instr->src[0]);
2656       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
2657          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2658                   src);
2659       } else if (dst.regClass() == v2b) {
2660          bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src)
2661             ->valu()
2662             .clamp = true;
2663       } else if (dst.regClass() == v1) {
2664          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2665                   Operand::c32(0x3f800000u), src);
2666          /* Apparently it is not necessary to flush denorms if this instruction is used with
2667           * these operands. */
2668          // TODO: confirm that this holds under all circumstances
2669       } else if (dst.regClass() == v2) {
2670          Instruction* add =
2671             bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src, Operand::zero());
2672          add->valu().clamp = true;
2673       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2674          Temp low = bld.sop2(aco_opcode::s_max_f16, bld.def(s1), src, Operand::c16(0));
2675          bld.sop2(aco_opcode::s_min_f16, Definition(dst), low, Operand::c16(0x3C00));
2676       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2677          Temp low = bld.sop2(aco_opcode::s_max_f32, bld.def(s1), src, Operand::c32(0));
2678          bld.sop2(aco_opcode::s_min_f32, Definition(dst), low, Operand::c32(0x3f800000));
2679       } else {
2680          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2681       }
2682       break;
2683    }
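    /* The v_med3 forms above work because med3(0.0, 1.0, x) returns the median
     * of its operands, which for ordered bounds is exactly clamp(x, 0.0, 1.0);
     * the remaining paths use the clamp output modifier or an explicit
     * max/min pair instead. */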
2684    case nir_op_flog2: {
2685       if (instr->def.bit_size == 16) {
2686          if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2687             bld.vop3(aco_opcode::v_s_log_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2688          else
2689             emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2690       } else if (instr->def.bit_size == 32) {
2691          emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2692       } else {
2693          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2694       }
2695       break;
2696    }
2697    case nir_op_frcp: {
2698       if (instr->def.bit_size == 16) {
2699          if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2700             bld.vop3(aco_opcode::v_s_rcp_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2701          else
2702             emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2703       } else if (instr->def.bit_size == 32) {
2704          emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2705       } else if (instr->def.bit_size == 64) {
2706          /* Lowered at NIR level for precision reasons. */
2707          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2708       } else {
2709          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2710       }
2711       break;
2712    }
2713    case nir_op_fexp2: {
2714       if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX12) {
2715          aco_opcode opcode =
2716             instr->def.bit_size == 16 ? aco_opcode::v_s_exp_f16 : aco_opcode::v_s_exp_f32;
2717          bld.vop3(opcode, Definition(dst), get_alu_src(ctx, instr->src[0]));
2718       } else if (instr->def.bit_size == 16) {
2719          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2720       } else if (instr->def.bit_size == 32) {
2721          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2722       } else {
2723          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2724       }
2725       break;
2726    }
2727    case nir_op_fsqrt: {
2728       if (instr->def.bit_size == 16) {
2729          if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2730             bld.vop3(aco_opcode::v_s_sqrt_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2731          else
2732             emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2733       } else if (instr->def.bit_size == 32) {
2734          emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2735       } else if (instr->def.bit_size == 64) {
2736          /* Lowered at NIR level for precision reasons. */
2737          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2738       } else {
2739          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2740       }
2741       break;
2742    }
2743    case nir_op_ffract: {
2744       if (dst.regClass() == v2b) {
2745          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2746       } else if (dst.regClass() == v1) {
2747          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2748       } else if (dst.regClass() == v2) {
2749          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2750       } else if (dst.regClass() == s1) {
2751          Temp src = get_alu_src(ctx, instr->src[0]);
2752          aco_opcode op =
2753             instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32;
2754          Temp floor = bld.sop1(op, bld.def(s1), src);
2755          op = instr->def.bit_size == 16 ? aco_opcode::s_sub_f16 : aco_opcode::s_sub_f32;
2756          bld.sop2(op, Definition(dst), src, floor);
2757       } else {
2758          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2759       }
2760       break;
2761    }
2762    case nir_op_ffloor: {
2763       if (dst.regClass() == v2b) {
2764          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2765       } else if (dst.regClass() == v1) {
2766          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2767       } else if (dst.regClass() == v2) {
2768          Temp src = get_alu_src(ctx, instr->src[0]);
2769          emit_floor_f64(ctx, bld, Definition(dst), src);
2770       } else if (dst.regClass() == s1) {
2771          Temp src = get_alu_src(ctx, instr->src[0]);
2772          aco_opcode op =
2773             instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32;
2774          bld.sop1(op, Definition(dst), src);
2775       } else {
2776          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2777       }
2778       break;
2779    }
2780    case nir_op_fceil: {
2781       if (dst.regClass() == v2b) {
2782          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2783       } else if (dst.regClass() == v1) {
2784          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2785       } else if (dst.regClass() == v2) {
2786          if (ctx->options->gfx_level >= GFX7) {
2787             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2788          } else {
2789             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2790             /* trunc = trunc(src0)
2791              * if (src0 > 0.0 && src0 != trunc)
2792              *    trunc += 1.0
2793              */
2794             Temp src0 = get_alu_src(ctx, instr->src[0]);
2795             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2796             Temp tmp0 =
2797                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2798             Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2799             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
2800             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2801                                 bld.copy(bld.def(v1), Operand::zero()),
2802                                 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2803             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2804                              bld.copy(bld.def(v1), Operand::zero()), add);
2805             bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), trunc, add);
2806          }
2807       } else if (dst.regClass() == s1) {
2808          Temp src = get_alu_src(ctx, instr->src[0]);
2809          aco_opcode op =
2810             instr->def.bit_size == 16 ? aco_opcode::s_ceil_f16 : aco_opcode::s_ceil_f32;
2811          bld.sop1(op, Definition(dst), src);
2812       } else {
2813          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2814       }
2815       break;
2816    }
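    /* In the GFX6 lowering above, the increment is assembled as a double whose
     * low dword is zero and whose high dword is either 0 (keep trunc) or
     * 0x3ff00000 (the high word of 1.0), so a single v_cndmask on the high
     * half chooses between adding 0.0 and adding 1.0. */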
2817    case nir_op_ftrunc: {
2818       if (dst.regClass() == v2b) {
2819          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2820       } else if (dst.regClass() == v1) {
2821          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2822       } else if (dst.regClass() == v2) {
2823          Temp src = get_alu_src(ctx, instr->src[0]);
2824          emit_trunc_f64(ctx, bld, Definition(dst), src);
2825       } else if (dst.regClass() == s1) {
2826          Temp src = get_alu_src(ctx, instr->src[0]);
2827          aco_opcode op =
2828             instr->def.bit_size == 16 ? aco_opcode::s_trunc_f16 : aco_opcode::s_trunc_f32;
2829          bld.sop1(op, Definition(dst), src);
2830       } else {
2831          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2832       }
2833       break;
2834    }
2835    case nir_op_fround_even: {
2836       if (dst.regClass() == v2b) {
2837          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2838       } else if (dst.regClass() == v1) {
2839          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2840       } else if (dst.regClass() == v2) {
2841          if (ctx->options->gfx_level >= GFX7) {
2842             emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2843          } else {
2844             /* GFX6 doesn't support V_RNDNE_F64, lower it. */
2845             Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2846             Temp src0 = get_alu_src(ctx, instr->src[0]);
2847             bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2848 
2849             Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2850                                     bld.copy(bld.def(s1), Operand::c32(-2u)));
2851             Temp bfi =
2852                bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2853                         bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2854             Temp tmp =
2855                bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), src0,
2856                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2857             Instruction* sub =
2858                bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), tmp,
2859                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2860             sub->valu().neg[1] = true;
2861             tmp = sub->definitions[0].getTemp();
2862 
2863             Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2864                                 Operand::c32(0x432fffffu));
2865             Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2866             vop3->valu().abs[0] = true;
2867             Temp cond = vop3->definitions[0].getTemp();
2868 
2869             Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2870             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2871             Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2872                                      as_vgpr(ctx, src0_lo), cond);
2873             Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2874                                      as_vgpr(ctx, src0_hi), cond);
2875 
2876             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2877          }
2878       } else if (dst.regClass() == s1) {
2879          Temp src = get_alu_src(ctx, instr->src[0]);
2880          aco_opcode op =
2881             instr->def.bit_size == 16 ? aco_opcode::s_rndne_f16 : aco_opcode::s_rndne_f32;
2882          bld.sop1(op, Definition(dst), src);
2883       } else {
2884          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2885       }
2886       break;
2887    }
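    /* The GFX6 lowering above is the classic 2^52 rounding trick: 0x43300000
     * is the high word of 2^52 and v_bfi copies the sign of src onto it, so
     * (src + copysign(2^52, src)) - copysign(2^52, src) pushes the fraction
     * bits out of the mantissa and rounds to nearest-even. Inputs with
     * |src| > 0x432fffff'ffffffff (just below 2^52) are already integral and
     * are passed through unchanged. */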
2888    case nir_op_fsin_amd:
2889    case nir_op_fcos_amd: {
2890       if (instr->def.bit_size == 16 || instr->def.bit_size == 32) {
2891          bool is_sin = instr->op == nir_op_fsin_amd;
2892          aco_opcode opcode, fract;
2893          RegClass rc;
2894          if (instr->def.bit_size == 16) {
2895             opcode = is_sin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2896             fract = aco_opcode::v_fract_f16;
2897             rc = v2b;
2898          } else {
2899             opcode = is_sin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2900             fract = aco_opcode::v_fract_f32;
2901             rc = v1;
2902          }
2903 
2904          Temp src = get_alu_src(ctx, instr->src[0]);
2905          /* before GFX9, v_sin and v_cos had a valid input domain of [-256, +256] */
2906          if (ctx->options->gfx_level < GFX9)
2907             src = bld.vop1(fract, bld.def(rc), src);
2908 
2909          if (dst.regClass() == rc) {
2910             bld.vop1(opcode, Definition(dst), src);
2911          } else {
2912             Temp tmp = bld.vop1(opcode, bld.def(rc), src);
2913             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2914          }
2915       } else {
2916          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2917       }
2918       break;
2919    }
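    /* fsin_amd/fcos_amd presumably take their input in revolutions (already
     * multiplied by 1/(2*pi) during NIR lowering, as the _amd suffix hints);
     * the extra v_fract before GFX9 only folds the argument into the
     * hardware's limited [-256, +256] domain. */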
2920    case nir_op_ldexp: {
2921       if (dst.regClass() == v2b) {
2922          emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2923       } else if (dst.regClass() == v1) {
2924          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2925       } else if (dst.regClass() == v2) {
2926          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2927       } else {
2928          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2929       }
2930       break;
2931    }
2932    case nir_op_frexp_sig: {
2933       if (dst.regClass() == v2b) {
2934          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2935       } else if (dst.regClass() == v1) {
2936          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2937       } else if (dst.regClass() == v2) {
2938          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2939       } else {
2940          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2941       }
2942       break;
2943    }
2944    case nir_op_frexp_exp: {
2945       if (instr->src[0].src.ssa->bit_size == 16) {
2946          Temp src = get_alu_src(ctx, instr->src[0]);
2947          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2948          tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2949          convert_int(ctx, bld, tmp, 8, 32, true, dst);
2950       } else if (instr->src[0].src.ssa->bit_size == 32) {
2951          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2952       } else if (instr->src[0].src.ssa->bit_size == 64) {
2953          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2954       } else {
2955          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2956       }
2957       break;
2958    }
2959    case nir_op_fsign: {
2960       Temp src = get_alu_src(ctx, instr->src[0]);
2961       if (dst.regClass() == v2b) {
2962          /* replace negative zero with positive zero */
2963          src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), as_vgpr(ctx, src));
2964          if (ctx->program->gfx_level >= GFX9) {
2965             src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src,
2966                            Operand::c16(1u));
2967             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2968          } else {
2969             src = convert_int(ctx, bld, src, 16, 32, true);
2970             src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src,
2971                            Operand::c32(1u));
2972             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2973          }
2974       } else if (dst.regClass() == v1) {
2975          /* A legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers
2976           * become the correctly signed Inf. After that, we only need to clamp between
2977           * -1.0 and +1.0. */
2978          Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
2979          src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, as_vgpr(ctx, src));
2980          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
2981                   Operand::c32(0xbf800000));
2982       } else if (dst.regClass() == v2) {
2983          src = as_vgpr(ctx, src);
2984          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2985          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2986          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2987                                    emit_extract_vector(ctx, src, 1, v1), cond);
2988 
2989          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2990          tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2991          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2992 
2993          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2994       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2995          Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f16, bld.def(s1, scc), Operand::c16(0), src);
2996          src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3c00), src,
2997                         bld.scc(cond));
2998          cond = bld.sopc(aco_opcode::s_cmp_ge_f16, bld.def(s1, scc), src, Operand::c16(0));
2999          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbc00),
3000                   bld.scc(cond));
3001       } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
3002          Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f32, bld.def(s1, scc), Operand::c32(0), src);
3003          src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3f800000), src,
3004                         bld.scc(cond));
3005          cond = bld.sopc(aco_opcode::s_cmp_ge_f32, bld.def(s1, scc), src, Operand::c32(0));
3006          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbf800000),
3007                   bld.scc(cond));
3008       } else {
3009          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3010       }
3011       break;
3012    }
3013    case nir_op_f2f16:
3014    case nir_op_f2f16_rtne: {
3015       assert(instr->src[0].src.ssa->bit_size == 32);
3016       if (instr->def.num_components == 2) {
3017          /* Vectorizing f2f16 is only possible with rtz. */
3018          assert(instr->op != nir_op_f2f16_rtne);
3019          assert(ctx->block->fp_mode.round16_64 == fp_round_tz ||
3020                 !ctx->block->fp_mode.care_about_round16_64);
3021          emit_vec2_f2f16(ctx, instr, dst);
3022          break;
3023       }
3024       Temp src = get_alu_src(ctx, instr->src[0]);
3025       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) {
3026          /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
3027           * keep value numbering and the scheduler simpler.
3028           */
3029          if (dst.regClass() == v2b)
3030             bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src);
3031          else
3032             bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, Definition(dst), src);
3033       } else {
3034          if (dst.regClass() == v2b)
3035             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3036          else
3037             bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3038       }
3039       break;
3040    }
3041    case nir_op_f2f16_rtz: {
3042       assert(instr->src[0].src.ssa->bit_size == 32);
3043       if (instr->def.num_components == 2) {
3044          emit_vec2_f2f16(ctx, instr, dst);
3045          break;
3046       }
3047       Temp src = get_alu_src(ctx, instr->src[0]);
3048       if (ctx->block->fp_mode.round16_64 == fp_round_tz) {
3049          if (dst.regClass() == v2b)
3050             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3051          else
3052             bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3053       } else if (dst.regClass() == s1) {
3054          bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src, Operand::zero());
3055       } else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) {
3056          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
3057       } else {
3058          bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
3059       }
3060       break;
3061    }
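    /* A note on the rtz paths above: v_cvt_pkrtz_f16_f32 rounds toward zero
     * regardless of the current round mode, so when the mode is already rtz a
     * plain conversion suffices. The e64 form on GFX8/GFX9 presumably exists
     * to place the constant zero in the second operand slot (VOP2 accepts
     * constants only in operand 0); elsewhere the source is duplicated to
     * stay in the VOP2 encoding. */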
3062    case nir_op_f2f32: {
3063       if (dst.regClass() == s1) {
3064          assert(instr->src[0].src.ssa->bit_size == 16);
3065          Temp src = get_alu_src(ctx, instr->src[0]);
3066          bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src);
3067       } else if (instr->src[0].src.ssa->bit_size == 16) {
3068          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
3069       } else if (instr->src[0].src.ssa->bit_size == 64) {
3070          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
3071       } else {
3072          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3073       }
3074       break;
3075    }
3076    case nir_op_f2f64: {
3077       assert(instr->src[0].src.ssa->bit_size == 32);
3078       Temp src = get_alu_src(ctx, instr->src[0]);
3079       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
3080       break;
3081    }
3082    case nir_op_i2f16: {
3083       Temp src = get_alu_src(ctx, instr->src[0]);
3084       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3085       if (dst.regClass() == v2b) {
3086          if (input_size <= 16) {
3087             /* Expand integer to the size expected by the int→float converter used below */
3088             unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3089             if (input_size != target_size) {
3090                src = convert_int(ctx, bld, src, input_size, target_size, true);
3091             }
3092          }
3093 
3094          if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
3095             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
3096          } else {
3097             /* Large 32-bit inputs need to return +-inf/FLOAT_MAX.
3098              *
3099              * This is also the fallback path taken on GFX7 and earlier, which
3100              * do not support direct f16⟷i16 conversions.
3101              */
3102             src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
3103             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3104          }
3105       } else if (dst.regClass() == s1) {
3106          if (input_size <= 16) {
3107             src = convert_int(ctx, bld, src, input_size, 32, true);
3108          }
3109          src = bld.sop1(aco_opcode::s_cvt_f32_i32, bld.def(s1), src);
3110          bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3111       } else {
3112          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3113       }
3114       break;
3115    }
3116    case nir_op_i2f32: {
3117       assert(dst.size() == 1);
3118       Temp src = get_alu_src(ctx, instr->src[0]);
3119       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3120       if (input_size <= 32) {
3121          if (input_size <= 16) {
3122             /* Sign-extend to 32 bits */
3123             src = convert_int(ctx, bld, src, input_size, 32, true);
3124          }
3125          if (dst.regClass() == v1)
3126             bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
3127          else
3128             bld.sop1(aco_opcode::s_cvt_f32_i32, Definition(dst), src);
3129       } else {
3130          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3131       }
3132       break;
3133    }
3134    case nir_op_i2f64: {
3135       if (instr->src[0].src.ssa->bit_size <= 32) {
3136          Temp src = get_alu_src(ctx, instr->src[0]);
3137          if (instr->src[0].src.ssa->bit_size <= 16)
3138             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
3139          bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
3140       } else {
3141          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3142       }
3143       break;
3144    }
3145    case nir_op_u2f16: {
3146       Temp src = get_alu_src(ctx, instr->src[0]);
3147       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3148       if (dst.regClass() == v2b) {
3149          if (input_size <= 16) {
3150             /* Expand integer to the size expected by the uint→float converter used below */
3151             unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3152             if (input_size != target_size) {
3153                src = convert_int(ctx, bld, src, input_size, target_size, false);
3154             }
3155          }
3156 
3157          if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
3158             bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
3159          } else {
3160             /* Large 32-bit inputs need to return inf/FLOAT_MAX.
3161              *
3162              * This is also the fallback path taken on GFX7 and earlier, which
3163              * do not support direct f16⟷u16 conversions.
3164              */
3165             src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
3166             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3167          }
3168       } else if (dst.regClass() == s1) {
3169          if (input_size <= 16) {
3170             src = convert_int(ctx, bld, src, input_size, 32, false);
3171          }
3172          src = bld.sop1(aco_opcode::s_cvt_f32_u32, bld.def(s1), src);
3173          bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3174       } else {
3175          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3176       }
3177       break;
3178    }
3179    case nir_op_u2f32: {
3180       assert(dst.size() == 1);
3181       Temp src = get_alu_src(ctx, instr->src[0]);
3182       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3183       if (input_size == 8 && dst.regClass() == v1) {
3184          bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3185       } else if (input_size <= 32) {
3186          if (input_size <= 16)
3187             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3188          if (dst.regClass() == v1)
3189             bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3190          else
3191             bld.sop1(aco_opcode::s_cvt_f32_u32, Definition(dst), src);
3192       } else {
3193          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3194       }
3195       break;
3196    }
3197    case nir_op_u2f64: {
3198       if (instr->src[0].src.ssa->bit_size <= 32) {
3199          Temp src = get_alu_src(ctx, instr->src[0]);
3200          if (instr->src[0].src.ssa->bit_size <= 16)
3201             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3202          bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3203       } else {
3204          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3205       }
3206       break;
3207    }
3208    case nir_op_f2i8:
3209    case nir_op_f2i16: {
3210       if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3211           ctx->program->gfx_level >= GFX11_5) {
3212          Temp src = get_alu_src(ctx, instr->src[0]);
3213          Temp tmp = bld.as_uniform(src);
3214          if (instr->src[0].src.ssa->bit_size == 16)
3215             tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3216          bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp);
3217       } else if (instr->src[0].src.ssa->bit_size == 16) {
3218          if (ctx->program->gfx_level >= GFX8) {
3219             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3220          } else {
3221             /* GFX7 and earlier do not support direct f16⟷i16 conversions */
3222             Temp tmp = bld.tmp(v1);
3223             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3224             tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3225             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3226                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3227             if (dst.type() == RegType::sgpr) {
3228                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3229             }
3230          }
3231       } else if (instr->src[0].src.ssa->bit_size == 32) {
3232          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3233       } else {
3234          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3235       }
3236       break;
3237    }
3238    case nir_op_f2u8:
3239    case nir_op_f2u16: {
3240       if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3241           ctx->program->gfx_level >= GFX11_5) {
3242          Temp src = get_alu_src(ctx, instr->src[0]);
3243          Temp tmp = bld.as_uniform(src);
3244          if (instr->src[0].src.ssa->bit_size == 16)
3245             tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3246          bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp);
3247       } else if (instr->src[0].src.ssa->bit_size == 16) {
3248          if (ctx->program->gfx_level >= GFX8) {
3249             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3250          } else {
3251             /* GFX7 and earlier do not support direct f16⟷u16 conversions */
3252             Temp tmp = bld.tmp(v1);
3253             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3254             tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3255             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3256                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3257             if (dst.type() == RegType::sgpr) {
3258                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3259             }
3260          }
3261       } else if (instr->src[0].src.ssa->bit_size == 32) {
3262          if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX11)
3263             bld.vop3(aco_opcode::p_v_cvt_pk_u8_f32, Definition(dst),
3264                      get_alu_src(ctx, instr->src[0]));
3265          else
3266             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3267       } else {
3268          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3269       }
3270       break;
3271    }
3272    case nir_op_f2i32: {
3273       Temp src = get_alu_src(ctx, instr->src[0]);
3274       if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3275           ctx->program->gfx_level >= GFX11_5) {
3276          Temp tmp = bld.as_uniform(src);
3277          if (instr->src[0].src.ssa->bit_size == 16)
3278             tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3279          bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp);
3280       } else if (instr->src[0].src.ssa->bit_size == 16) {
3281          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3282          if (dst.type() == RegType::vgpr) {
3283             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3284          } else {
3285             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3286                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3287          }
3288       } else if (instr->src[0].src.ssa->bit_size == 32) {
3289          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3290       } else if (instr->src[0].src.ssa->bit_size == 64) {
3291          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3292       } else {
3293          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3294       }
3295       break;
3296    }
3297    case nir_op_f2u32: {
3298       Temp src = get_alu_src(ctx, instr->src[0]);
3299       if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3300           ctx->program->gfx_level >= GFX11_5) {
3301          Temp tmp = bld.as_uniform(src);
3302          if (instr->src[0].src.ssa->bit_size == 16)
3303             tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3304          bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp);
3305       } else if (instr->src[0].src.ssa->bit_size == 16) {
3306          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3307          if (dst.type() == RegType::vgpr) {
3308             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3309          } else {
3310             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3311                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3312          }
3313       } else if (instr->src[0].src.ssa->bit_size == 32) {
3314          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3315       } else if (instr->src[0].src.ssa->bit_size == 64) {
3316          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3317       } else {
3318          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3319       }
3320       break;
3321    }
3322    case nir_op_b2f16: {
3323       Temp src = get_alu_src(ctx, instr->src[0]);
3324       assert(src.regClass() == bld.lm);
3325 
3326       if (dst.regClass() == s1) {
3327          src = bool_to_scalar_condition(ctx, src);
3328          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3329       } else if (dst.regClass() == v2b) {
3330          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3331          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3332       } else {
3333          unreachable("Wrong destination register class for nir_op_b2f16.");
3334       }
3335       break;
3336    }
3337    case nir_op_b2f32: {
3338       Temp src = get_alu_src(ctx, instr->src[0]);
3339       assert(src.regClass() == bld.lm);
3340 
3341       if (dst.regClass() == s1) {
3342          src = bool_to_scalar_condition(ctx, src);
3343          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3344       } else if (dst.regClass() == v1) {
3345          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3346                       Operand::c32(0x3f800000u), src);
3347       } else {
3348          unreachable("Wrong destination register class for nir_op_b2f32.");
3349       }
3350       break;
3351    }
3352    case nir_op_b2f64: {
3353       Temp src = get_alu_src(ctx, instr->src[0]);
3354       assert(src.regClass() == bld.lm);
3355 
3356       if (dst.regClass() == s2) {
3357          src = bool_to_scalar_condition(ctx, src);
3358          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3359                   Operand::zero(), bld.scc(src));
3360       } else if (dst.regClass() == v2) {
3361          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3362          Temp upper =
3363             bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3364          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3365       } else {
3366          unreachable("Wrong destination register class for nir_op_b2f64.");
3367       }
3368       break;
3369    }
3370    case nir_op_i2i8:
3371    case nir_op_i2i16:
3372    case nir_op_i2i32: {
3373       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3374          /* no need to do the extract in get_alu_src() */
3375          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3376                                      ? sgpr_extract_sext
3377                                      : sgpr_extract_undef;
3378          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3379       } else {
3380          const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3381          const unsigned output_bitsize = instr->def.bit_size;
3382          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3383                      output_bitsize > input_bitsize, dst);
3384       }
3385       break;
3386    }
3387    case nir_op_u2u8:
3388    case nir_op_u2u16:
3389    case nir_op_u2u32: {
3390       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3391          /* no need to do the extract in get_alu_src() */
3392          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3393                                      ? sgpr_extract_zext
3394                                      : sgpr_extract_undef;
3395          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3396       } else {
3397          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3398                      instr->def.bit_size, false, dst);
3399       }
3400       break;
3401    }
3402    case nir_op_b2b32:
3403    case nir_op_b2i8:
3404    case nir_op_b2i16:
3405    case nir_op_b2i32: {
3406       Temp src = get_alu_src(ctx, instr->src[0]);
3407       assert(src.regClass() == bld.lm);
3408 
3409       if (dst.regClass() == s1) {
3410          bool_to_scalar_condition(ctx, src, dst);
3411       } else if (dst.type() == RegType::vgpr) {
3412          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3413                       src);
3414       } else {
3415          unreachable("Invalid register class for b2i32");
3416       }
3417       break;
3418    }
3419    case nir_op_b2b1: {
3420       Temp src = get_alu_src(ctx, instr->src[0]);
3421       assert(dst.regClass() == bld.lm);
3422 
3423       if (src.type() == RegType::vgpr) {
3424          assert(src.regClass() == v1 || src.regClass() == v2);
3425          assert(dst.regClass() == bld.lm);
3426          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3427                   Definition(dst), Operand::zero(), src);
3428       } else {
3429          assert(src.regClass() == s1 || src.regClass() == s2);
3430          Temp tmp;
3431          if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
3432             tmp =
3433                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3434                   .def(1)
3435                   .getTemp();
3436          } else {
3437             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3438                            bld.scc(bld.def(s1)), Operand::zero(), src);
3439          }
3440          bool_to_vector_condition(ctx, tmp, dst);
3441       }
3442       break;
3443    }
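    /* Booleans are lane masks here (s1 in wave32, s2 in wave64), so b2b1 is an
     * integer compare against zero: divergent sources use v_cmp_lg, uniform
     * ones use s_cmp_lg and then broadcast the resulting scc bit back into a
     * full lane mask. The GFX7 path reuses the scc definition of s_or_b64,
     * apparently because a 64-bit scalar compare is unavailable there. */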
3444    case nir_op_unpack_64_2x32:
3445    case nir_op_unpack_32_2x16:
3446    case nir_op_unpack_64_4x16:
3447    case nir_op_unpack_32_4x8:
3448       bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3449       emit_split_vector(
3450          ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3451       break;
3452    case nir_op_pack_64_2x32_split: {
3453       Temp src0 = get_alu_src(ctx, instr->src[0]);
3454       Temp src1 = get_alu_src(ctx, instr->src[1]);
3455 
3456       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3457       break;
3458    }
3459    case nir_op_unpack_64_2x32_split_x:
3460       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3461                  get_alu_src(ctx, instr->src[0]));
3462       break;
3463    case nir_op_unpack_64_2x32_split_y:
3464       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3465                  get_alu_src(ctx, instr->src[0]));
3466       break;
3467    case nir_op_unpack_32_2x16_split_x:
3468       if (dst.type() == RegType::vgpr) {
3469          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3470                     get_alu_src(ctx, instr->src[0]));
3471       } else {
3472          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3473       }
3474       break;
3475    case nir_op_unpack_32_2x16_split_y:
3476       if (dst.type() == RegType::vgpr) {
3477          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3478                     get_alu_src(ctx, instr->src[0]));
3479       } else {
3480          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3481                     get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3482                     Operand::zero());
3483       }
3484       break;
3485    case nir_op_pack_32_2x16_split: {
3486       Temp src0 = get_alu_src(ctx, instr->src[0]);
3487       Temp src1 = get_alu_src(ctx, instr->src[1]);
3488       if (dst.regClass() == v1) {
3489          src0 = emit_extract_vector(ctx, src0, 0, v2b);
3490          src1 = emit_extract_vector(ctx, src1, 0, v2b);
3491          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3492       } else if (ctx->program->gfx_level >= GFX9) {
3493          bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(dst), src0, src1);
3494       } else {
3495          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3496                          Operand::c32(0xFFFFu));
3497          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3498                          Operand::c32(16u));
3499          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3500       }
3501       break;
3502    }
3503    case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3504    case nir_op_pack_half_2x16_rtz_split:
3505    case nir_op_pack_half_2x16_split: {
3506       if (dst.regClass() == v1) {
3507          if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3508             emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3509          else
3510             emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3511       } else if (dst.regClass() == s1) {
3512          emit_sop2_instruction(ctx, instr, aco_opcode::s_cvt_pk_rtz_f16_f32, dst, false);
3513       } else {
3514          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3515       }
3516       break;
3517    }
3518    case nir_op_pack_unorm_2x16:
3519    case nir_op_pack_snorm_2x16: {
3520       unsigned bit_size = instr->src[0].src.ssa->bit_size;
3521       /* Only support 16 and 32bit. */
3522       /* Only 16-bit and 32-bit sources are supported. */
3523 
3524       RegClass src_rc = bit_size == 32 ? v1 : v2b;
3525       Temp src = get_alu_src(ctx, instr->src[0], 2);
3526       Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3527       Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
3528 
3529       /* Work around for pre-GFX9 GPU which don't have fp16 pknorm instruction. */
3530       /* Workaround for pre-GFX9 GPUs, which don't have fp16 pknorm instructions. */
3531          src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3532          src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
3533          bit_size = 32;
3534       }
3535 
3536       aco_opcode opcode;
3537       if (bit_size == 32) {
3538          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3539                                                       : aco_opcode::v_cvt_pknorm_i16_f32;
3540       } else {
3541          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3542                                                       : aco_opcode::v_cvt_pknorm_i16_f16;
3543       }
3544       bld.vop3(opcode, Definition(dst), src0, src1);
3545       break;
3546    }
3547    case nir_op_pack_uint_2x16:
3548    case nir_op_pack_sint_2x16: {
3549       Temp src = get_alu_src(ctx, instr->src[0], 2);
3550       Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3551       Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3552       aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3553                                                              : aco_opcode::v_cvt_pk_i16_i32;
3554       bld.vop3(opcode, Definition(dst), src0, src1);
3555       break;
3556    }
3557    case nir_op_unpack_half_2x16_split_x: {
3558       Temp src = get_alu_src(ctx, instr->src[0]);
3559       if (dst.regClass() == s1) {
3560          bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src);
3561          break;
3562       }
3563       if (src.regClass() == v1)
3564          src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3565       if (dst.regClass() == v1) {
3566          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3567       } else {
3568          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3569       }
3570       break;
3571    }
3572    case nir_op_unpack_half_2x16_split_y: {
3573       Temp src = get_alu_src(ctx, instr->src[0]);
3574       if (dst.regClass() == s1) {
3575          bld.sop1(aco_opcode::s_cvt_hi_f32_f16, Definition(dst), src);
3576          break;
3577       }
3578       if (src.regClass() == s1)
3579          src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3580                           Operand::c32(1u), Operand::c32(16u), Operand::zero());
3581       else
3582          src =
3583             bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3584       if (dst.regClass() == v1) {
3585          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3586       } else {
3587          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3588       }
3589       break;
3590    }
3591    case nir_op_msad_4x8: {
3592       assert(dst.regClass() == v1);
3593       emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true);
3594       break;
3595    }
3596    case nir_op_mqsad_4x8: {
3597       assert(dst.regClass() == v4);
3598       Temp ref = get_alu_src(ctx, instr->src[0]);
3599       Temp src = get_alu_src(ctx, instr->src[1], 2);
3600       Temp accum = get_alu_src(ctx, instr->src[2], 4);
3601       bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src), as_vgpr(ctx, ref),
3602                as_vgpr(ctx, accum));
3603       emit_split_vector(ctx, dst, 4);
3604       break;
3605    }
3606    case nir_op_shfr: {
3607       if (dst.regClass() == s1) {
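              /* Funnel-shift right: concatenate {src0:src1} with src0 in the high
               * dword, shift right by amount & 0x1f, and keep the low dword. */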
3608          Temp src = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
3609                                get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
3610 
3611          Temp amount;
3612          if (nir_src_is_const(instr->src[2].src)) {
3613             amount = bld.copy(bld.def(s1), Operand::c32(nir_src_as_uint(instr->src[2].src) & 0x1f));
3614          } else {
3615             amount = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3616                               get_alu_src(ctx, instr->src[2]), Operand::c32(0x1f));
3617          }
3618 
3619          Temp res = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), src, amount);
3620          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), res, Operand::zero());
3621       } else if (dst.regClass() == v1) {
3622          emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbit_b32, dst, false, 3u);
3623       } else {
3624          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3625       }
3626       break;
3627    }
3628    case nir_op_fquantize2f16: {
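           /* Quantize via an f32->f16->f32 round trip. f16 denormal results must be
            * flushed to zero: if the HW mode already flushes f16 denormals, the
            * conversion alone suffices; otherwise detect the denormal/zero class and
            * flush manually, preserving the sign when required. */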
3629       Temp src = get_alu_src(ctx, instr->src[0]);
3630       if (dst.regClass() == v1) {
3631          Temp f16;
3632          if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3633             f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src);
3634          else
3635             f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
3636 
3637          if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
3638             bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16);
3639             break;
3640          }
3641 
3642          Temp denorm_zero;
3643          Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3644          if (ctx->program->gfx_level >= GFX8) {
3645             /* value is negative/positive denormal value/zero */
3646             Instruction* tmp0 =
3647                bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, Operand::c32(0x30));
3648             tmp0->valu().abs[0] = true;
3649             tmp0->valu().neg[0] = true;
3650             denorm_zero = tmp0->definitions[0].getTemp();
3651          } else {
3652             /* 0x38800000 is the smallest normal half-float value (2^-14) as a 32-bit float,
3653              * so compare the result and flush to 0 if it's smaller.
3654              */
3655             Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3656             Instruction* tmp0 =
3657                bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3658             tmp0->valu().abs[0] = true;
3659             denorm_zero = tmp0->definitions[0].getTemp();
3660          }
3661          if (nir_alu_instr_is_signed_zero_preserve(instr)) {
3662             Temp copysign_0 =
3663                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3664             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), f32, copysign_0, denorm_zero);
3665          } else {
3666             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), f32, Operand::zero(),
3667                          denorm_zero);
3668          }
3669       } else if (dst.regClass() == s1) {
3670          Temp f16;
3671          if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3672             f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src);
3673          else
3674             f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src);
3675 
3676          if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
3677             bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16);
3678          } else {
3679             Temp f32 = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), f16);
3680             Temp abs_mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff));
3681             Temp abs =
3682                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask);
3683             Operand sign;
3684             if (nir_alu_instr_is_signed_zero_preserve(instr)) {
3685                sign =
3686                   bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask);
3687             } else {
3688                sign = Operand::c32(0);
3689             }
3690             Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3691             Temp denorm_zero = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, smallest);
3692             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), sign, f32, bld.scc(denorm_zero));
3693          }
3694       } else {
3695          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3696       }
3697       break;
3698    }
3699    case nir_op_bfm: {
3700       Temp bits = get_alu_src(ctx, instr->src[0]);
3701       Temp offset = get_alu_src(ctx, instr->src[1]);
3702 
3703       if (dst.regClass() == s1) {
3704          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3705       } else if (dst.regClass() == v1) {
3706          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3707       } else {
3708          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3709       }
3710       break;
3711    }
3712    case nir_op_bitfield_select: {
3713 
3714       /* dst = (insert & bitmask) | (base & ~bitmask) */
3715       if (dst.regClass() == s1) {
3716          Temp bitmask = get_alu_src(ctx, instr->src[0]);
3717          Temp insert = get_alu_src(ctx, instr->src[1]);
3718          Temp base = get_alu_src(ctx, instr->src[2]);
3719          aco_ptr<Instruction> sop2;
3720          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3721          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3722          Operand lhs;
3723          if (const_insert && const_bitmask) {
3724             lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3725          } else {
3726             insert =
3727                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3728             lhs = Operand(insert);
3729          }
3730 
3731          Operand rhs;
3732          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3733          if (const_base && const_bitmask) {
3734             rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3735          } else {
3736             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3737             rhs = Operand(base);
3738          }
3739 
3740          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3741 
3742       } else if (dst.regClass() == v1) {
3743          emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3744       } else {
3745          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3746       }
3747       break;
3748    }
3749    case nir_op_ubfe:
3750    case nir_op_ibfe: {
3751       if (dst.bytes() != 4)
3752          unreachable("Unsupported BFE bit size");
3753 
3754       if (dst.type() == RegType::sgpr) {
3755          Temp base = get_alu_src(ctx, instr->src[0]);
3756 
3757          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3758          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3759          aco_opcode opcode =
3760             instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3761          if (const_offset && const_bits) {
3762             uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3763             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3764             break;
3765          }
3766 
3767          Temp offset = get_alu_src(ctx, instr->src[1]);
3768          Temp bits = get_alu_src(ctx, instr->src[2]);
3769 
3770          if (ctx->program->gfx_level >= GFX9) {
3771             Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3772                                          : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3773                                                     bld.def(s1, scc), bits, Operand::c32(0x1fu));
3774             Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3775             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3776          } else if (instr->op == nir_op_ubfe) {
3777             Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3778             Temp masked =
3779                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3780             bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3781          } else {
3782             Operand bits_op = const_bits
3783                                  ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3784                                  : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3785                                             bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3786                                                      bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3787                                             Operand::c32(16u));
3788             Operand offset_op = const_offset
3789                                    ? Operand::c32(const_offset->u32 & 0x1fu)
3790                                    : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3791                                               offset, Operand::c32(0x1fu));
3792 
3793             Temp extract =
3794                bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3795             bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3796          }
3797 
3798       } else {
3799          aco_opcode opcode =
3800             instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3801          emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3802       }
3803       break;
3804    }
3805    case nir_op_extract_u8:
3806    case nir_op_extract_i8:
3807    case nir_op_extract_u16:
3808    case nir_op_extract_i16: {
3809       bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3810       unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3811       uint32_t bits = comp == 4 ? 8 : 16;
3812       unsigned index = nir_src_as_uint(instr->src[1].src);
3813       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3814          assert(index == 0);
3815          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3816       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3817          Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3818          unsigned swizzle = instr->src[0].swizzle[0];
3819          if (vec.size() > 1) {
3820             vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3821             swizzle = swizzle & 1;
3822          }
3823          index += swizzle * instr->def.bit_size / bits;
3824          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3825                     Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3826       } else {
3827          Temp src = get_alu_src(ctx, instr->src[0]);
3828          Definition def(dst);
3829          if (dst.bytes() == 8) {
3830             src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3831             index %= comp;
3832             def = bld.def(src.type(), 1);
3833          }
3834          assert(def.bytes() <= 4);
3835          if (def.regClass() == s1) {
3836             bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3837                        Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3838          } else {
3839             src = emit_extract_vector(ctx, src, 0, def.regClass());
3840             bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3841                        Operand::c32(bits), Operand::c32(is_signed));
3842          }
3843          if (dst.size() == 2)
3844             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3845                        Operand::zero());
3846       }
3847       break;
3848    }
3849    case nir_op_insert_u8:
3850    case nir_op_insert_u16: {
3851       unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3852       uint32_t bits = comp == 4 ? 8 : 16;
3853       unsigned index = nir_src_as_uint(instr->src[1].src);
3854       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3855          assert(index == 0);
3856          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3857       } else {
3858          Temp src = get_alu_src(ctx, instr->src[0]);
3859          Definition def(dst);
3860          bool swap = false;
3861          if (dst.bytes() == 8) {
3862             src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3863             swap = index >= comp;
3864             index %= comp;
3865             def = bld.def(src.type(), 1);
3866          }
3867          if (def.regClass() == s1) {
3868             bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3869                        Operand::c32(index), Operand::c32(bits));
3870          } else {
3871             src = emit_extract_vector(ctx, src, 0, def.regClass());
3872             bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3873                        Operand::c32(bits));
3874          }
3875          if (dst.size() == 2 && swap)
3876             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3877                        def.getTemp());
3878          else if (dst.size() == 2)
3879             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3880                        Operand::zero());
3881       }
3882       break;
3883    }
3884    case nir_op_bit_count: {
3885       Temp src = get_alu_src(ctx, instr->src[0]);
3886       if (src.regClass() == s1) {
3887          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3888       } else if (src.regClass() == v1) {
3889          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3890       } else if (src.regClass() == v2) {
3891          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3892                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3893                            emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3894       } else if (src.regClass() == s2) {
3895          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3896       } else {
3897          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3898       }
3899       break;
3900    }
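        /* Floating-point comparisons: GFX11.5 adds scalar (SALU) float compares.
         * Passing num_opcodes for a scalar slot means no SALU form is available,
         * so emit_comparison falls back to the VALU opcode. */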
3901    case nir_op_flt: {
3902       emit_comparison(
3903          ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3904          aco_opcode::v_cmp_lt_f64,
3905          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f16 : aco_opcode::num_opcodes,
3906          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f32 : aco_opcode::num_opcodes);
3907       break;
3908    }
3909    case nir_op_fge: {
3910       emit_comparison(
3911          ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3912          aco_opcode::v_cmp_ge_f64,
3913          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f16 : aco_opcode::num_opcodes,
3914          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f32 : aco_opcode::num_opcodes);
3915       break;
3916    }
3917    case nir_op_fltu: {
3918       emit_comparison(
3919          ctx, instr, dst, aco_opcode::v_cmp_nge_f16, aco_opcode::v_cmp_nge_f32,
3920          aco_opcode::v_cmp_nge_f64,
3921          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f16 : aco_opcode::num_opcodes,
3922          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f32 : aco_opcode::num_opcodes);
3923       break;
3924    }
3925    case nir_op_fgeu: {
3926       emit_comparison(
3927          ctx, instr, dst, aco_opcode::v_cmp_nlt_f16, aco_opcode::v_cmp_nlt_f32,
3928          aco_opcode::v_cmp_nlt_f64,
3929          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f16 : aco_opcode::num_opcodes,
3930          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f32 : aco_opcode::num_opcodes);
3931       break;
3932    }
3933    case nir_op_feq: {
3934       emit_comparison(
3935          ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3936          aco_opcode::v_cmp_eq_f64,
3937          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f16 : aco_opcode::num_opcodes,
3938          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f32 : aco_opcode::num_opcodes);
3939       break;
3940    }
3941    case nir_op_fneu: {
3942       emit_comparison(
3943          ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3944          aco_opcode::v_cmp_neq_f64,
3945          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f16 : aco_opcode::num_opcodes,
3946          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f32 : aco_opcode::num_opcodes);
3947       break;
3948    }
3949    case nir_op_fequ: {
3950       emit_comparison(
3951          ctx, instr, dst, aco_opcode::v_cmp_nlg_f16, aco_opcode::v_cmp_nlg_f32,
3952          aco_opcode::v_cmp_nlg_f64,
3953          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f16 : aco_opcode::num_opcodes,
3954          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f32 : aco_opcode::num_opcodes);
3955       break;
3956    }
3957    case nir_op_fneo: {
3958       emit_comparison(
3959          ctx, instr, dst, aco_opcode::v_cmp_lg_f16, aco_opcode::v_cmp_lg_f32,
3960          aco_opcode::v_cmp_lg_f64,
3961          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f16 : aco_opcode::num_opcodes,
3962          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f32 : aco_opcode::num_opcodes);
3963       break;
3964    }
3965    case nir_op_funord: {
3966       emit_comparison(
3967          ctx, instr, dst, aco_opcode::v_cmp_u_f16, aco_opcode::v_cmp_u_f32, aco_opcode::v_cmp_u_f64,
3968          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f16 : aco_opcode::num_opcodes,
3969          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f32 : aco_opcode::num_opcodes);
3970       break;
3971    }
3972    case nir_op_ford: {
3973       emit_comparison(
3974          ctx, instr, dst, aco_opcode::v_cmp_o_f16, aco_opcode::v_cmp_o_f32, aco_opcode::v_cmp_o_f64,
3975          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f16 : aco_opcode::num_opcodes,
3976          ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f32 : aco_opcode::num_opcodes);
3977       break;
3978    }
3979    case nir_op_ilt: {
3980       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3981                       aco_opcode::v_cmp_lt_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_i32);
3982       break;
3983    }
3984    case nir_op_ige: {
3985       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3986                       aco_opcode::v_cmp_ge_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_i32);
3987       break;
3988    }
3989    case nir_op_ieq: {
3990       if (instr->src[0].src.ssa->bit_size == 1)
3991          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3992       else
3993          emit_comparison(
3994             ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3995             aco_opcode::v_cmp_eq_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_eq_i32,
3996             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3997       break;
3998    }
3999    case nir_op_ine: {
4000       if (instr->src[0].src.ssa->bit_size == 1)
4001          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
4002       else
4003          emit_comparison(
4004             ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
4005             aco_opcode::v_cmp_lg_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lg_i32,
4006             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
4007       break;
4008    }
4009    case nir_op_ult: {
4010       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
4011                       aco_opcode::v_cmp_lt_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_u32);
4012       break;
4013    }
4014    case nir_op_uge: {
4015       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
4016                       aco_opcode::v_cmp_ge_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_u32);
4017       break;
4018    }
4019    case nir_op_bitz:
4020    case nir_op_bitnz: {
4021       assert(instr->src[0].src.ssa->bit_size != 1);
4022       bool test0 = instr->op == nir_op_bitz;
4023       Temp src0 = get_alu_src(ctx, instr->src[0]);
4024       Temp src1 = get_alu_src(ctx, instr->src[1]);
4025       bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
4026       if (!use_valu) {
4027          aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
4028                                                                : aco_opcode::s_bitcmp1_b32;
4029          if (test0)
4030             op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
4031                                                        : aco_opcode::s_bitcmp0_b32;
4032          emit_sopc_instruction(ctx, instr, op, dst);
4033          break;
4034       }
4035 
4036       /* We do not have a VALU version of s_bitcmp.
4037        * But if the second source is constant, we can use
4038        * v_cmp_class_f32's LUT to check the bit.
4039        * The LUT only has 10 entries, so extract a higher byte if we have to.
4040        * For sign bits comparision with 0 is better because v_cmp_class
4041        * For sign bits, comparison with 0 is better because v_cmp_class
4042        */
4043       if (nir_src_is_const(instr->src[1].src)) {
4044          uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
4045          bit &= instr->src[0].src.ssa->bit_size - 1;
4046          src0 = as_vgpr(ctx, src0);
4047 
4048          if (src0.regClass() == v2) {
4049             src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
4050             bit &= 31;
4051          }
4052 
4053          if (bit == 31) {
4054             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
4055                      Operand::c32(0), src0);
4056             break;
4057          }
4058 
4059          if (bit == 15 && ctx->program->gfx_level >= GFX8) {
4060             bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
4061                      Operand::c32(0), src0);
4062             break;
4063          }
4064 
4065          /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
4066          const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
4067          const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
4068          const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
4069          if (use_opsel) {
4070             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
4071                               Operand::c32(16), Operand::c32(0));
4072             bit &= 0xf;
4073          }
4074 
4075          /* If we can use sdwa the extract is free, while test0's s_not is not. */
4076          if (bit == 7 && test0 && can_sdwa) {
4077             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
4078                               Operand::c32(8), Operand::c32(1));
4079             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
4080                      Operand::c32(0), src0);
4081             break;
4082          }
4083 
4084          if (bit > max_bit) {
4085             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
4086                               Operand::c32(8), Operand::c32(0));
4087             bit &= 0x7;
4088          }
4089 
4090          /* denorm and snan/qnan inputs are preserved under all float control modes. */
4091          static const struct {
4092             uint32_t fp32;
4093             uint32_t fp16;
4094             bool negate;
4095          } float_lut[10] = {
4096             {0x7f800001, 0x7c01, false}, /* snan */
4097             {~0u, ~0u, false},           /* qnan */
4098             {0xff800000, 0xfc00, false}, /* -inf */
4099             {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
4100             {1, 1, true},                /* -denormal */
4101             {0, 0, true},                /* -0.0 */
4102             {0, 0, false},               /* +0.0 */
4103             {1, 1, false},               /* +denormal */
4104             {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
4105             {0x7f800000, 0x7c00, false}, /* +inf */
4106          };
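              /* v_cmp_class tests whether the class of its first source is enabled in
               * the second source's bit mask. float_lut[i] is a value of float class i,
               * so comparing float_lut[bit] against src0 tests bit 'bit' of src0. The
               * 'negate' entries use the VOP3 neg modifier to form the negative value. */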
4107 
4108          Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
4109          /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */
4110          const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
4111                                (ctx->program->gfx_level >= GFX11 && use_opsel);
4112          const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
4113          const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
4114 
4115          VALU_instruction& res =
4116             bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
4117          if (float_lut[bit].negate) {
4118             res.format = asVOP3(res.format);
4119             res.neg[0] = true;
4120          }
4121 
4122          if (test0)
4123             bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
4124 
4125          break;
4126       }
4127 
4128       Temp res;
4129       aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
4130       if (instr->src[0].src.ssa->bit_size == 16) {
4131          op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
4132          if (ctx->program->gfx_level < GFX10)
4133             res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
4134          else
4135             res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
4136 
4137          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
4138       } else if (instr->src[0].src.ssa->bit_size == 32) {
4139          res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
4140       } else if (instr->src[0].src.ssa->bit_size == 64) {
4141          if (ctx->program->gfx_level < GFX8)
4142             res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
4143          else
4144             res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
4145 
4146          res = emit_extract_vector(ctx, res, 0, v1);
4147          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
4148       } else {
4149          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
4150       }
4151       bld.vopc(op, Definition(dst), Operand::c32(0), res);
4152       break;
4153    }
4154    default: isel_err(&instr->instr, "Unknown NIR ALU instr");
4155    }
4156 }
4157 
4158 void
4159 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
4160 {
4161    Temp dst = get_ssa_temp(ctx, &instr->def);
4162 
4163    // TODO: we really want to have the resulting type, as this would allow for 64-bit literals,
4164    // which otherwise get truncated (losing the lsb if double and the msb if int);
4165    // for now, we only use s_mov_b64 with 64bit inline constants
4166    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
4167    assert(dst.type() == RegType::sgpr);
4168 
4169    Builder bld(ctx->program, ctx->block);
4170 
4171    if (instr->def.bit_size == 1) {
4172       assert(dst.regClass() == bld.lm);
4173       int val = instr->value[0].b ? -1 : 0;
4174       Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
4175       bld.copy(Definition(dst), op);
4176    } else if (instr->def.bit_size == 8) {
4177       bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
4178    } else if (instr->def.bit_size == 16) {
4179       /* sign-extend to use s_movk_i32 instead of a literal */
4180       bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
4181    } else if (dst.size() == 1) {
4182       bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
4183    } else {
4184       assert(dst.size() != 1);
4185       aco_ptr<Instruction> vec{
4186          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4187       if (instr->def.bit_size == 64)
4188          for (unsigned i = 0; i < dst.size(); i++)
4189             vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
4190       else {
4191          for (unsigned i = 0; i < dst.size(); i++)
4192             vec->operands[i] = Operand::c32(instr->value[i].u32);
4193       }
4194       vec->definitions[0] = Definition(dst);
4195       ctx->block->instructions.emplace_back(std::move(vec));
4196    }
4197 }
4198 
4199 Temp
4200 emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
4201 {
4202    Builder bld(ctx->program, ctx->block);
4203 
4204    if (src.regClass().type() == RegType::sgpr) {
4205       bld.copy(Definition(dst), src);
4206    } else if (src.size() == 1) {
4207       bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
4208    } else {
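           /* Multi-dword value: split it into dwords, read the first lane of each,
            * and recombine the scalar results into dst. */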
4209       aco_ptr<Instruction> split{
4210          create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())};
4211       split->operands[0] = Operand(src);
4212 
4213       for (unsigned i = 0; i < src.size(); i++) {
4214          split->definitions[i] =
4215             bld.def(RegClass::get(RegType::vgpr, MIN2(src.bytes() - i * 4, 4)));
4216       }
4217 
4218       Instruction* split_raw = split.get();
4219       ctx->block->instructions.emplace_back(std::move(split));
4220 
4221       aco_ptr<Instruction> vec{
4222          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)};
4223       vec->definitions[0] = Definition(dst);
4224       for (unsigned i = 0; i < src.size(); i++) {
4225          vec->operands[i] = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1),
4226                                      split_raw->definitions[i].getTemp());
4227       }
4228 
4229       ctx->block->instructions.emplace_back(std::move(vec));
4230       if (src.bytes() % 4 == 0)
4231          emit_split_vector(ctx, dst, src.size());
4232    }
4233 
4234    return dst;
4235 }
4236 
4237 bool
4238 can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size,
4239                                    unsigned align_, bool support_12_byte)
4240 {
4241    /* Only use byte-align for 8/16-bit loads if we won't have to increase its size and won't have
4242     * to use unsupported load sizes.
4243     */
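        /* E.g. a 6-byte load with align 2 rounds up to the same 8-byte load either
         * way, but a 4-byte load with align 2 would grow from 4 to 8 bytes. */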
4244    assert(util_is_power_of_two_nonzero(align_));
4245    if (align_ < 4) {
4246       assert(component_size < 4);
4247       unsigned load_size = num_components * component_size;
4248       uint32_t new_size = align(load_size + (4 - align_), 4);
4249       return new_size == align(load_size, 4) && (new_size != 12 || support_12_byte);
4250    }
4251    return true;
4252 }
4253 
4254 struct LoadEmitInfo {
4255    Operand offset;
4256    Temp dst;
4257    unsigned num_components;
4258    unsigned component_size;
4259    Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
4260    Temp idx = Temp(0, v1);      /* buffer index */
4261    unsigned component_stride = 0;
4262    unsigned const_offset = 0;
4263    unsigned align_mul = 0;
4264    unsigned align_offset = 0;
4265    pipe_format format;
4266 
4267    ac_hw_cache_flags cache = {{0, 0, 0, 0, 0}};
4268    bool split_by_component_stride = true;
4269    bool readfirstlane_for_uniform = false;
4270    unsigned swizzle_component_size = 0;
4271    memory_sync_info sync;
4272    Temp soffset = Temp(0, s1);
4273 };
4274 
4275 struct EmitLoadParameters {
4276    using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
4277                              unsigned bytes_needed, unsigned align, unsigned const_offset,
4278                              Temp dst_hint);
4279 
4280    Callback callback;
4281    bool byte_align_loads;
4282    bool supports_8bit_16bit_loads;
4283    unsigned max_const_offset_plus_one;
4284 };
4285 
4286 void
4287 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
4288           const EmitLoadParameters& params)
4289 {
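        /* Emit the load in pieces via params.callback. Each piece may be enlarged
         * or realigned to satisfy the callback's constraints; the results are then
         * split and recombined into the components of info.dst below. */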
4290    unsigned load_size = info.num_components * info.component_size;
4291    unsigned component_size = info.component_size;
4292 
4293    unsigned num_vals = 0;
4294    Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
4295 
4296    unsigned const_offset = info.const_offset;
4297 
4298    const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
4299    unsigned align_offset = info.align_offset % align_mul;
4300 
4301    unsigned bytes_read = 0;
4302    while (bytes_read < load_size) {
4303       unsigned bytes_needed = load_size - bytes_read;
4304 
4305       /* pad unaligned loads; byte_align is the misalignment in bytes, or -1 if it is only known at run time */
4306       int byte_align = 0;
4307       if (params.byte_align_loads) {
4308          byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
4309       }
4310 
4311       if (byte_align) {
4312          if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
4313              !params.supports_8bit_16bit_loads) {
4314             if (info.component_stride) {
4315                assert(params.supports_8bit_16bit_loads && "unimplemented");
4316                bytes_needed = 2;
4317                byte_align = 0;
4318             } else {
4319                bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
4320                bytes_needed = align(bytes_needed, 4);
4321             }
4322          } else {
4323             byte_align = 0;
4324          }
4325       }
4326 
4327       if (info.split_by_component_stride) {
4328          if (info.swizzle_component_size)
4329             bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
4330          if (info.component_stride)
4331             bytes_needed = MIN2(bytes_needed, info.component_size);
4332       }
4333 
4334       bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
4335 
4336       /* reduce constant offset */
4337       Operand offset = info.offset;
4338       unsigned reduced_const_offset = const_offset;
4339       bool remove_const_offset_completely = need_to_align_offset;
4340       if (const_offset &&
4341           (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
4342          unsigned to_add = const_offset;
4343          if (remove_const_offset_completely) {
4344             reduced_const_offset = 0;
4345          } else {
4346             to_add =
4347                const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4348             reduced_const_offset %= params.max_const_offset_plus_one;
4349          }
4350          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4351          if (offset.isConstant()) {
4352             offset = Operand::c32(offset.constantValue() + to_add);
4353          } else if (offset.isUndefined()) {
4354             offset = Operand::c32(to_add);
4355          } else if (offset_tmp.regClass() == s1) {
4356             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4357                               Operand::c32(to_add));
4358          } else if (offset_tmp.regClass() == v1) {
4359             offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4360          } else {
4361             Temp lo = bld.tmp(offset_tmp.type(), 1);
4362             Temp hi = bld.tmp(offset_tmp.type(), 1);
4363             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4364 
4365             if (offset_tmp.regClass() == s2) {
4366                Temp carry = bld.tmp(s1);
4367                lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4368                              Operand::c32(to_add));
4369                hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4370                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4371             } else {
4372                Temp new_lo = bld.tmp(v1);
4373                Temp carry =
4374                   bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4375                hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4376                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4377             }
4378          }
4379       }
4380 
4381       /* align offset down if needed */
4382       Operand aligned_offset = offset;
4383       unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4384       if (need_to_align_offset) {
4385          align = 4;
4386          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4387          if (offset.isConstant()) {
4388             aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
4389          } else if (offset.isUndefined()) {
4390             aligned_offset = Operand::zero();
4391          } else if (offset_tmp.regClass() == s1) {
4392             aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
4393                                       Operand::c32(0xfffffffcu), offset_tmp);
4394          } else if (offset_tmp.regClass() == s2) {
4395             aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
4396                                       Operand::c64(0xfffffffffffffffcllu), offset_tmp);
4397          } else if (offset_tmp.regClass() == v1) {
4398             aligned_offset =
4399                bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
4400          } else if (offset_tmp.regClass() == v2) {
4401             Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
4402             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4403             lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
4404             aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
4405          }
4406       }
4407       Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp()
4408                                 : aligned_offset.isConstant()
4409                                    ? bld.copy(bld.def(s1), aligned_offset)
4410                                    : Temp(0, s1);
4411 
4412       Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
4413                                  reduced_const_offset, byte_align ? Temp() : info.dst);
4414 
4415       /* the callback wrote directly to dst */
4416       if (val == info.dst) {
4417          assert(num_vals == 0);
4418          emit_split_vector(ctx, info.dst, info.num_components);
4419          return;
4420       }
4421 
4422       /* shift result right if needed */
4423       if (params.byte_align_loads && info.component_size < 4) {
4424          Operand byte_align_off = Operand::c32(byte_align);
4425          if (byte_align == -1) {
4426             if (offset.isConstant())
4427                byte_align_off = Operand::c32(offset.constantValue() % 4u);
4428             else if (offset.isUndefined())
4429                byte_align_off = Operand::zero();
4430             else if (offset.size() == 2)
4431                byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
4432                                                             RegClass(offset.getTemp().type(), 1)));
4433             else
4434                byte_align_off = offset;
4435          }
4436 
4437          assert(val.bytes() >= load_size && "unimplemented");
4438          if (val.type() == RegType::sgpr)
4439             byte_align_scalar(ctx, val, byte_align_off, info.dst);
4440          else
4441             byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
4442          return;
4443       }
4444 
4445       /* add result to list and advance */
4446       if (info.component_stride) {
4447          assert(val.bytes() % info.component_size == 0);
4448          unsigned num_loaded_components = val.bytes() / info.component_size;
4449          unsigned advance_bytes = info.component_stride * num_loaded_components;
4450          const_offset += advance_bytes;
4451          align_offset = (align_offset + advance_bytes) % align_mul;
4452       } else {
4453          const_offset += val.bytes();
4454          align_offset = (align_offset + val.bytes()) % align_mul;
4455       }
4456       bytes_read += val.bytes();
4457       vals[num_vals++] = val;
4458    }
4459 
4460    /* create array of components */
4461    unsigned components_split = 0;
4462    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4463    bool has_vgprs = false;
4464    for (unsigned i = 0; i < num_vals;) {
4465       Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4466       unsigned num_tmps = 0;
4467       unsigned tmp_size = 0;
4468       RegType reg_type = RegType::sgpr;
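           /* Group consecutive results until their combined size is a whole
            * multiple of the component size, so the group splits into components. */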
4469       while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4470          if (vals[i].type() == RegType::vgpr)
4471             reg_type = RegType::vgpr;
4472          tmp_size += vals[i].bytes();
4473          tmp[num_tmps++] = vals[i++];
4474       }
4475       if (num_tmps > 1) {
4476          aco_ptr<Instruction> vec{
4477             create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4478          for (unsigned j = 0; j < num_tmps; j++)
4479             vec->operands[j] = Operand(tmp[j]);
4480          tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4481          vec->definitions[0] = Definition(tmp[0]);
4482          bld.insert(std::move(vec));
4483       }
4484 
4485       if (tmp[0].bytes() % component_size) {
4486          /* trim tmp[0] */
4487          assert(i == num_vals);
4488          RegClass new_rc =
4489             RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4490          tmp[0] =
4491             bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4492       }
4493 
4494       RegClass elem_rc = RegClass::get(reg_type, component_size);
4495 
4496       unsigned start = components_split;
4497 
4498       if (tmp_size == elem_rc.bytes()) {
4499          allocated_vec[components_split++] = tmp[0];
4500       } else {
4501          assert(tmp_size % elem_rc.bytes() == 0);
4502          aco_ptr<Instruction> split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO,
4503                                                        1, tmp_size / elem_rc.bytes())};
4504          for (auto& def : split->definitions) {
4505             Temp component = bld.tmp(elem_rc);
4506             allocated_vec[components_split++] = component;
4507             def = Definition(component);
4508          }
4509          split->operands[0] = Operand(tmp[0]);
4510          bld.insert(std::move(split));
4511       }
4512 
4513       /* try to p_as_uniform early so we can create more optimizable code and
4514        * also update allocated_vec */
4515       for (unsigned j = start; j < components_split; j++) {
4516          if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) {
4517             if (info.readfirstlane_for_uniform) {
4518                allocated_vec[j] = emit_readfirstlane(
4519                   ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size())));
4520             } else {
4521                allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4522             }
4523          }
4524          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4525       }
4526    }
4527 
4528    /* concatenate components and p_as_uniform() result if needed */
4529    if (info.dst.type() == RegType::vgpr || !has_vgprs)
4530       ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4531 
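        /* If dst is larger than all components combined, pad the final vector
         * with an undefined operand covering the leftover bytes. */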
4532    int padding_bytes =
4533       MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4534 
4535    aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
4536                                                info.num_components + !!padding_bytes, 1)};
4537    for (unsigned i = 0; i < info.num_components; i++)
4538       vec->operands[i] = Operand(allocated_vec[i]);
4539    if (padding_bytes)
4540       vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4541    if (info.dst.type() == RegType::sgpr && has_vgprs) {
4542       Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4543       vec->definitions[0] = Definition(tmp);
4544       bld.insert(std::move(vec));
4545       if (info.readfirstlane_for_uniform)
4546          emit_readfirstlane(ctx, tmp, info.dst);
4547       else
4548          bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4549    } else {
4550       vec->definitions[0] = Definition(info.dst);
4551       bld.insert(std::move(vec));
4552    }
4553 }
4554 
4555 Operand
4556 load_lds_size_m0(Builder& bld)
4557 {
4558    /* m0 does not need to be initialized on GFX9+ */
4559    if (bld.program->gfx_level >= GFX9)
4560       return Operand(s1);
4561 
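        /* On GFX8 and earlier, m0 bounds LDS addressing for DS instructions,
         * so program it to the maximum addressable range. */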
4562    return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4563 }
4564 
4565 Temp
4566 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4567                   unsigned align, unsigned const_offset, Temp dst_hint)
4568 {
4569    offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4570 
4571    Operand m = load_lds_size_m0(bld);
4572 
4573    bool large_ds_read = bld.program->gfx_level >= GFX7;
4574    bool usable_read2 = bld.program->gfx_level >= GFX7;
4575 
4576    bool read2 = false;
4577    unsigned size = 0;
4578    aco_opcode op;
4579    if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4580       size = 16;
4581       op = aco_opcode::ds_read_b128;
4582    } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4583       size = 16;
4584       read2 = true;
4585       op = aco_opcode::ds_read2_b64;
4586    } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4587       size = 12;
4588       op = aco_opcode::ds_read_b96;
4589    } else if (bytes_needed >= 8 && align % 8 == 0) {
4590       size = 8;
4591       op = aco_opcode::ds_read_b64;
4592    } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4593       size = 8;
4594       read2 = true;
4595       op = aco_opcode::ds_read2_b32;
4596    } else if (bytes_needed >= 4 && align % 4 == 0) {
4597       size = 4;
4598       op = aco_opcode::ds_read_b32;
4599    } else if (bytes_needed >= 2 && align % 2 == 0) {
4600       size = 2;
4601       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4602    } else {
4603       size = 1;
4604       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4605    }
4606 
4607    unsigned const_offset_unit = read2 ? size / 2u : 1u;
4608    unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
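        /* ds_read2* encodes two 8-bit offsets in units of the element size, while
         * a plain ds_read* has a single 16-bit byte offset; any excess constant
         * offset is folded into the VGPR address below. */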
4609 
4610    if (const_offset > (const_offset_range - const_offset_unit)) {
4611       unsigned excess = const_offset - (const_offset % const_offset_range);
4612       offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4613       const_offset -= excess;
4614    }
4615 
4616    const_offset /= const_offset_unit;
4617 
4618    RegClass rc = RegClass::get(RegType::vgpr, size);
4619    Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4620    Instruction* instr;
4621    if (read2)
4622       instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4623    else
4624       instr = bld.ds(op, Definition(val), offset, m, const_offset);
4625    instr->ds().sync = info.sync;
4626 
4627    if (m.isUndefined())
4628       instr->operands.pop_back();
4629 
4630    return val;
4631 }
4632 
4633 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
4634 
4635 Temp
4636 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4637                    unsigned align, unsigned const_offset, Temp dst_hint)
4638 {
4639    assert(align >= 4u);
4640 
4641    bld.program->has_smem_buffer_or_global_loads = true;
4642 
4643    bool buffer = info.resource.id() && info.resource.bytes() == 16;
4644    Temp addr = info.resource;
4645    if (!buffer && !addr.id()) {
4646       addr = offset;
4647       offset = Temp();
4648    }
4649 
4650    bytes_needed = MIN2(bytes_needed, 64);
4651    unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4652    unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4653    /* Only round up global loads if the access is aligned so that it won't cross pages */
4654    bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
4655 
4656    aco_opcode op;
4657    if (bytes_needed <= 4) {
4658       op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4659    } else if (bytes_needed <= 8) {
4660       op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4661    } else if (bytes_needed <= 16) {
4662       op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4663    } else if (bytes_needed <= 32) {
4664       op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4665    } else {
4666       assert(bytes_needed == 64);
4667       op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4668    }
4669 
4670    aco_ptr<Instruction> load{create_instruction(op, Format::SMEM, 2, 1)};
4671    if (buffer) {
4672       if (const_offset)
4673          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4674                            Operand::c32(const_offset));
4675       load->operands[0] = Operand(info.resource);
4676       load->operands[1] = Operand(offset);
4677    } else {
4678       load->operands[0] = Operand(addr);
4679       if (offset.id() && const_offset)
4680          load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4681                                       Operand::c32(const_offset));
4682       else if (offset.id())
4683          load->operands[1] = Operand(offset);
4684       else
4685          load->operands[1] = Operand::c32(const_offset);
4686    }
4687    RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4688    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4689    load->definitions[0] = Definition(val);
4690    load->smem().cache = info.cache;
4691    load->smem().sync = info.sync;
4692    bld.insert(std::move(load));
4693    return val;
4694 }
4695 
4696 const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4697 
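/* emit_load() callback for untyped VMEM buffer loads: assembles the vaddr
 * (index and/or offset) and soffset operands, then selects the widest
 * buffer_load variant the offset alignment permits, from sub-dword loads for
 * small or unaligned accesses up to dwordx4.
 */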
4698 Temp
4699 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4700                     unsigned align_, unsigned const_offset, Temp dst_hint)
4701 {
4702    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4703    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4704 
4705    if (info.soffset.id()) {
4706       if (soffset.isTemp())
4707          vaddr = bld.copy(bld.def(v1), soffset);
4708       soffset = Operand(info.soffset);
4709    }
4710 
4711    if (soffset.isUndefined())
4712       soffset = Operand::zero();
4713 
4714    bool offen = !vaddr.isUndefined();
4715    bool idxen = info.idx.id();
4716 
4717    if (offen && idxen)
4718       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4719    else if (idxen)
4720       vaddr = Operand(info.idx);
4721 
4722    unsigned bytes_size = 0;
4723    aco_opcode op;
4724    if (bytes_needed == 1 || align_ % 2) {
4725       bytes_size = 1;
4726       op = aco_opcode::buffer_load_ubyte;
4727    } else if (bytes_needed == 2 || align_ % 4) {
4728       bytes_size = 2;
4729       op = aco_opcode::buffer_load_ushort;
4730    } else if (bytes_needed <= 4) {
4731       bytes_size = 4;
4732       op = aco_opcode::buffer_load_dword;
4733    } else if (bytes_needed <= 8) {
4734       bytes_size = 8;
4735       op = aco_opcode::buffer_load_dwordx2;
4736    } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4737       bytes_size = 12;
4738       op = aco_opcode::buffer_load_dwordx3;
4739    } else {
4740       bytes_size = 16;
4741       op = aco_opcode::buffer_load_dwordx4;
4742    }
4743    aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4744    mubuf->operands[0] = Operand(info.resource);
4745    mubuf->operands[1] = vaddr;
4746    mubuf->operands[2] = soffset;
4747    mubuf->mubuf().offen = offen;
4748    mubuf->mubuf().idxen = idxen;
4749    mubuf->mubuf().cache = info.cache;
4750    mubuf->mubuf().sync = info.sync;
4751    mubuf->mubuf().offset = const_offset;
4752    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4753    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4754    mubuf->definitions[0] = Definition(val);
4755    bld.insert(std::move(mubuf));
4756 
4757    return val;
4758 }
4759 
4760 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4761 
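/* emit_load() callback for typed buffer_load_format loads: like
 * mubuf_load_callback, but picks a buffer_load_format_{d16_}x..xyzw opcode
 * based on info.component_size and the number of bytes needed.
 */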
4762 Temp
4763 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4764                            unsigned bytes_needed, unsigned align_, unsigned const_offset,
4765                            Temp dst_hint)
4766 {
4767    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4768    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4769 
4770    if (info.soffset.id()) {
4771       if (soffset.isTemp())
4772          vaddr = bld.copy(bld.def(v1), soffset);
4773       soffset = Operand(info.soffset);
4774    }
4775 
4776    if (soffset.isUndefined())
4777       soffset = Operand::zero();
4778 
4779    bool offen = !vaddr.isUndefined();
4780    bool idxen = info.idx.id();
4781 
4782    if (offen && idxen)
4783       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4784    else if (idxen)
4785       vaddr = Operand(info.idx);
4786 
4787    aco_opcode op = aco_opcode::num_opcodes;
4788    if (info.component_size == 2) {
4789       switch (bytes_needed) {
4790       case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4791       case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4792       case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4793       case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4794       default: unreachable("invalid buffer load format size"); break;
4795       }
4796    } else {
4797       assert(info.component_size == 4);
4798       switch (bytes_needed) {
4799       case 4: op = aco_opcode::buffer_load_format_x; break;
4800       case 8: op = aco_opcode::buffer_load_format_xy; break;
4801       case 12: op = aco_opcode::buffer_load_format_xyz; break;
4802       case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4803       default: unreachable("invalid buffer load format size"); break;
4804       }
4805    }
4806 
4807    aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4808    mubuf->operands[0] = Operand(info.resource);
4809    mubuf->operands[1] = vaddr;
4810    mubuf->operands[2] = soffset;
4811    mubuf->mubuf().offen = offen;
4812    mubuf->mubuf().idxen = idxen;
4813    mubuf->mubuf().cache = info.cache;
4814    mubuf->mubuf().sync = info.sync;
4815    mubuf->mubuf().offset = const_offset;
4816    RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4817    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4818    mubuf->definitions[0] = Definition(val);
4819    bld.insert(std::move(mubuf));
4820 
4821    return val;
4822 }
4823 
4824 const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, false, true, 4096};
4825 
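/* emit_load() callback for scratch: emits scratch_load_* with either a VGPR
 * offset or an SGPR (saddr) offset operand, depending on the offset's
 * register type.
 */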
4826 Temp
4827 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4828                       unsigned align_, unsigned const_offset, Temp dst_hint)
4829 {
4830    unsigned bytes_size = 0;
4831    aco_opcode op;
4832    if (bytes_needed == 1 || align_ % 2u) {
4833       bytes_size = 1;
4834       op = aco_opcode::scratch_load_ubyte;
4835    } else if (bytes_needed == 2 || align_ % 4u) {
4836       bytes_size = 2;
4837       op = aco_opcode::scratch_load_ushort;
4838    } else if (bytes_needed <= 4) {
4839       bytes_size = 4;
4840       op = aco_opcode::scratch_load_dword;
4841    } else if (bytes_needed <= 8) {
4842       bytes_size = 8;
4843       op = aco_opcode::scratch_load_dwordx2;
4844    } else if (bytes_needed <= 12) {
4845       bytes_size = 12;
4846       op = aco_opcode::scratch_load_dwordx3;
4847    } else {
4848       bytes_size = 16;
4849       op = aco_opcode::scratch_load_dwordx4;
4850    }
4851    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4852    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4853    aco_ptr<Instruction> flat{create_instruction(op, Format::SCRATCH, 2, 1)};
4854    flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4855    flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4856    flat->scratch().sync = info.sync;
4857    flat->scratch().offset = const_offset;
4858    flat->definitions[0] = Definition(val);
4859    bld.insert(std::move(flat));
4860 
4861    return val;
4862 }
4863 
4864 const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096};
4865 const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048};
4866 
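/* GFX6 has no FLAT/GLOBAL instructions, so global access goes through MUBUF.
 * Build a buffer resource covering the whole address range: an SGPR address
 * becomes the descriptor's 64-bit base, while for a VGPR address the base is
 * zero and the address is supplied through the 64-bit addr64 VGPR operand.
 */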
4867 Temp
4868 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4869 {
4870    uint32_t desc[4];
4871    ac_build_raw_buffer_descriptor(bld.program->gfx_level, 0, 0xffffffff, desc);
4872 
4873    if (addr.type() == RegType::vgpr)
4874       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4875                         Operand::c32(desc[2]), Operand::c32(desc[3]));
4876    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(desc[2]),
4877                      Operand::c32(desc[3]));
4878 }
4879 
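/* Adds a 32-bit value to a 64-bit value, propagating the carry into the high
 * half. Uses VALU adds if either source is in VGPRs, SALU adds otherwise.
 */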
4880 Temp
4881 add64_32(Builder& bld, Temp src0, Temp src1)
4882 {
4883    Temp src00 = bld.tmp(src0.type(), 1);
4884    Temp src01 = bld.tmp(src0.type(), 1);
4885    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4886 
4887    if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4888       Temp dst0 = bld.tmp(v1);
4889       Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4890       Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4891       return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4892    } else {
4893       Temp carry = bld.tmp(s1);
4894       Temp dst0 =
4895          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4896       Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4897       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4898    }
4899 }
4900 
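/* Splits a global address plus constant offset into an address/offset/
 * const_offset combination the target can actually encode: const_offset is
 * reduced to what the instruction's immediate field supports (none at all on
 * GFX7/8 FLAT), and the register types are arranged for MUBUF (GFX6),
 * FLAT (GFX7/8) or GLOBAL (GFX9+) addressing.
 */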
4901 void
4902 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4903                      uint32_t* const_offset_inout, Temp* offset_inout)
4904 {
4905    Temp address = *address_inout;
4906    uint64_t const_offset = *const_offset_inout + offset_in;
4907    Temp offset = *offset_inout;
4908 
4909    uint64_t max_const_offset_plus_one =
4910       1; /* GFX7/8/9: FLAT loads do not support constant offsets */
4911    if (bld.program->gfx_level >= GFX9)
4912       max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4913    else if (bld.program->gfx_level == GFX6)
4914       max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4915    uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4916    const_offset %= max_const_offset_plus_one;
4917 
4918    if (!offset.id()) {
4919       while (unlikely(excess_offset > UINT32_MAX)) {
4920          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4921          excess_offset -= UINT32_MAX;
4922       }
4923       if (excess_offset)
4924          offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4925    } else {
4926       /* If we add to "offset", we would transform the intended
4927        * "address + u2u64(offset) + u2u64(const_offset)" into
4928        * "address + u2u64(offset + const_offset)", so add to the address instead.
4929        * This could be more efficient when excess_offset > UINT32_MAX by doing a
4930        * full 64-bit addition, but that case should be really rare.
4931        */
4932       while (excess_offset) {
4933          uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4934          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4935          excess_offset -= src2;
4936       }
4937    }
4938 
4939    if (bld.program->gfx_level == GFX6) {
4940       /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4941       if (offset.type() != RegType::sgpr) {
4942          address = add64_32(bld, address, offset);
4943          offset = Temp();
4944       }
4945       offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4946    } else if (bld.program->gfx_level <= GFX8) {
4947       /* GFX7,8 (FLAT): VGPR address */
4948       if (offset.id()) {
4949          address = add64_32(bld, address, offset);
4950          offset = Temp();
4951       }
4952       address = as_vgpr(bld, address);
4953    } else {
4954       /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4955       if (address.type() == RegType::vgpr && offset.id()) {
4956          address = add64_32(bld, address, offset);
4957          offset = Temp();
4958       } else if (address.type() == RegType::sgpr && offset.id()) {
4959          offset = as_vgpr(bld, offset);
4960       }
4961       if (address.type() == RegType::sgpr && !offset.id())
4962          offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4963    }
4964 
4965    *address_inout = address;
4966    *const_offset_inout = const_offset;
4967    *offset_inout = offset;
4968 }
4969 
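/* emit_load() callback for global memory: lowers the address, then emits a
 * MUBUF (GFX6), FLAT (GFX7/8) or GLOBAL (GFX9+) load of up to 16 bytes.
 */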
4970 Temp
4971 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4972                      unsigned align_, unsigned const_offset, Temp dst_hint)
4973 {
4974    Temp addr = info.resource;
4975    if (!addr.id()) {
4976       addr = offset;
4977       offset = Temp();
4978    }
4979    lower_global_address(bld, 0, &addr, &const_offset, &offset);
4980 
4981    unsigned bytes_size = 0;
4982    bool use_mubuf = bld.program->gfx_level == GFX6;
4983    bool global = bld.program->gfx_level >= GFX9;
4984    aco_opcode op;
4985    if (bytes_needed == 1 || align_ % 2u) {
4986       bytes_size = 1;
4987       op = use_mubuf ? aco_opcode::buffer_load_ubyte
4988            : global  ? aco_opcode::global_load_ubyte
4989                      : aco_opcode::flat_load_ubyte;
4990    } else if (bytes_needed == 2 || align_ % 4u) {
4991       bytes_size = 2;
4992       op = use_mubuf ? aco_opcode::buffer_load_ushort
4993            : global  ? aco_opcode::global_load_ushort
4994                      : aco_opcode::flat_load_ushort;
4995    } else if (bytes_needed <= 4) {
4996       bytes_size = 4;
4997       op = use_mubuf ? aco_opcode::buffer_load_dword
4998            : global  ? aco_opcode::global_load_dword
4999                      : aco_opcode::flat_load_dword;
5000    } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
5001       bytes_size = 8;
5002       op = use_mubuf ? aco_opcode::buffer_load_dwordx2
5003            : global  ? aco_opcode::global_load_dwordx2
5004                      : aco_opcode::flat_load_dwordx2;
5005    } else if (bytes_needed <= 12 && !use_mubuf) {
5006       bytes_size = 12;
5007       op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
5008    } else {
5009       bytes_size = 16;
5010       op = use_mubuf ? aco_opcode::buffer_load_dwordx4
5011            : global  ? aco_opcode::global_load_dwordx4
5012                      : aco_opcode::flat_load_dwordx4;
5013    }
5014    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5015    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5016    if (use_mubuf) {
5017       aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
5018       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
5019       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
5020       mubuf->operands[2] = Operand(offset);
5021       mubuf->mubuf().cache = info.cache;
5022       mubuf->mubuf().offset = const_offset;
5023       mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
5024       mubuf->mubuf().disable_wqm = false;
5025       mubuf->mubuf().sync = info.sync;
5026       mubuf->definitions[0] = Definition(val);
5027       bld.insert(std::move(mubuf));
5028    } else {
5029       aco_ptr<Instruction> flat{
5030          create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
5031       if (addr.regClass() == s2) {
5032          assert(global && offset.id() && offset.type() == RegType::vgpr);
5033          flat->operands[0] = Operand(offset);
5034          flat->operands[1] = Operand(addr);
5035       } else {
5036          assert(addr.type() == RegType::vgpr && !offset.id());
5037          flat->operands[0] = Operand(addr);
5038          flat->operands[1] = Operand(s1);
5039       }
5040       flat->flatlike().cache = info.cache;
5041       flat->flatlike().sync = info.sync;
5042       assert(global || !const_offset);
5043       flat->flatlike().offset = const_offset;
5044       flat->definitions[0] = Definition(val);
5045       bld.insert(std::move(flat));
5046    }
5047 
5048    return val;
5049 }
5050 
5051 const EmitLoadParameters global_load_params{global_load_callback, true, true, UINT32_MAX};
5052 
5053 Temp
5054 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
5055          Temp address, unsigned base_offset, unsigned align)
5056 {
5057    assert(util_is_power_of_two_nonzero(align));
5058 
5059    Builder bld(ctx->program, ctx->block);
5060 
5061    LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
5062    info.align_mul = align;
5063    info.align_offset = 0;
5064    info.sync = memory_sync_info(storage_shared);
5065    info.const_offset = base_offset;
5066    /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses,
5067     * if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order
5068     * to avoid copy-propagation.
5069     */
5070    info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
5071                                     ctx->program->wave_size == 64 &&
5072                                     ctx->program->workgroup_size > 64;
5073    emit_load(ctx, bld, info, lds_load_params);
5074 
5075    return dst;
5076 }
5077 
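/* Splits src into count pieces of bytes[i] bytes each, written to dst[i].
 * Reuses the components recorded in ctx->allocated_vec when they line up,
 * otherwise splits src with p_split_vector, recombining elements with
 * p_create_vector for pieces that span multiple elements.
 */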
5078 void
5079 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
5080                  Temp src)
5081 {
5082    if (!count)
5083       return;
5084 
5085    Builder bld(ctx->program, ctx->block);
5086 
5087    /* count == 1 fast path */
5088    if (count == 1) {
5089       if (dst_type == RegType::sgpr)
5090          dst[0] = bld.as_uniform(src);
5091       else
5092          dst[0] = as_vgpr(ctx, src);
5093       return;
5094    }
5095 
5096    /* elem_size_bytes is the greatest common divisor which is a power of 2 */
5097    unsigned elem_size_bytes =
5098       1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
5099 
5100    ASSERTED bool is_subdword = elem_size_bytes < 4;
5101    assert(!is_subdword || dst_type == RegType::vgpr);
5102 
5103    for (unsigned i = 0; i < count; i++)
5104       dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
5105 
5106    std::vector<Temp> temps;
5107    /* use allocated_vec if possible */
5108    auto it = ctx->allocated_vec.find(src.id());
5109    if (it != ctx->allocated_vec.end()) {
5110       if (!it->second[0].id())
5111          goto split;
5112       unsigned elem_size = it->second[0].bytes();
5113       assert(src.bytes() % elem_size == 0);
5114 
5115       for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
5116          if (!it->second[i].id())
5117             goto split;
5118       }
5119       if (elem_size_bytes % elem_size)
5120          goto split;
5121 
5122       temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
5123       elem_size_bytes = elem_size;
5124    }
5125 
5126 split:
5127    /* split src if necessary */
5128    if (temps.empty()) {
5129       if (is_subdword && src.type() == RegType::sgpr)
5130          src = as_vgpr(ctx, src);
5131       if (dst_type == RegType::sgpr)
5132          src = bld.as_uniform(src);
5133 
5134       unsigned num_elems = src.bytes() / elem_size_bytes;
5135       aco_ptr<Instruction> split{
5136          create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
5137       split->operands[0] = Operand(src);
5138       for (unsigned i = 0; i < num_elems; i++) {
5139          temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
5140          split->definitions[i] = Definition(temps.back());
5141       }
5142       bld.insert(std::move(split));
5143    }
5144 
5145    unsigned idx = 0;
5146    for (unsigned i = 0; i < count; i++) {
5147       unsigned op_count = dst[i].bytes() / elem_size_bytes;
5148       if (op_count == 1) {
5149          if (dst_type == RegType::sgpr)
5150             dst[i] = bld.as_uniform(temps[idx++]);
5151          else
5152             dst[i] = as_vgpr(ctx, temps[idx++]);
5153          continue;
5154       }
5155 
5156       aco_ptr<Instruction> vec{
5157          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
5158       for (unsigned j = 0; j < op_count; j++) {
5159          Temp tmp = temps[idx++];
5160          if (dst_type == RegType::sgpr)
5161             tmp = bld.as_uniform(tmp);
5162          vec->operands[j] = Operand(tmp);
5163       }
5164       vec->definitions[0] = Definition(dst[i]);
5165       bld.insert(std::move(vec));
5166    }
5167    return;
5168 }
5169 
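/* Finds the next run of consecutive bytes in todo_mask that are either all
 * written or all skipped according to mask. Stores the run in *start/*count
 * and returns true if those bytes are written, false if they are skipped.
 */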
5170 bool
5171 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
5172 {
5173    unsigned start_elem = ffs(todo_mask) - 1;
5174    bool skip = !(mask & (1 << start_elem));
5175    if (skip)
5176       mask = ~mask & todo_mask;
5177 
5178    mask &= todo_mask;
5179 
5180    u_bit_scan_consecutive_range(&mask, start, count);
5181 
5182    return !skip;
5183 }
5184 
5185 void
5186 advance_write_mask(uint32_t* todo_mask, int start, int count)
5187 {
5188    *todo_mask &= ~u_bit_consecutive(0, count) << start;
5189 }
5190 
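/* Stores data to LDS at address + base_offset, honoring wrmask: the write
 * mask is split into contiguous runs, the widest ds_write* each run's
 * alignment allows is chosen, and matching b32/b64 writes are merged into
 * ds_write2 pairs where possible.
 */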
5191 void
5192 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
5193           unsigned base_offset, unsigned align)
5194 {
5195    assert(util_is_power_of_two_nonzero(align));
5196    assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
5197 
5198    Builder bld(ctx->program, ctx->block);
5199    bool large_ds_write = ctx->options->gfx_level >= GFX7;
5200    bool usable_write2 = ctx->options->gfx_level >= GFX7;
5201 
5202    unsigned write_count = 0;
5203    Temp write_datas[32];
5204    unsigned offsets[32];
5205    unsigned bytes[32];
5206    aco_opcode opcodes[32];
5207 
5208    wrmask = util_widen_mask(wrmask, elem_size_bytes);
5209 
5210    const unsigned wrmask_bitcnt = util_bitcount(wrmask);
5211    uint32_t todo = u_bit_consecutive(0, data.bytes());
5212 
5213    if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
5214       todo = MIN2(todo, wrmask);
5215 
5216    while (todo) {
5217       int offset, byte;
5218       if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
5219          offsets[write_count] = offset;
5220          bytes[write_count] = byte;
5221          opcodes[write_count] = aco_opcode::num_opcodes;
5222          write_count++;
5223          advance_write_mask(&todo, offset, byte);
5224          continue;
5225       }
5226 
5227       bool aligned2 = offset % 2 == 0 && align % 2 == 0;
5228       bool aligned4 = offset % 4 == 0 && align % 4 == 0;
5229       bool aligned8 = offset % 8 == 0 && align % 8 == 0;
5230       bool aligned16 = offset % 16 == 0 && align % 16 == 0;
5231 
5232       // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
5233       aco_opcode op = aco_opcode::num_opcodes;
5234       if (byte >= 16 && aligned16 && large_ds_write) {
5235          op = aco_opcode::ds_write_b128;
5236          byte = 16;
5237       } else if (byte >= 12 && aligned16 && large_ds_write) {
5238          op = aco_opcode::ds_write_b96;
5239          byte = 12;
5240       } else if (byte >= 8 && aligned8) {
5241          op = aco_opcode::ds_write_b64;
5242          byte = 8;
5243       } else if (byte >= 4 && aligned4) {
5244          op = aco_opcode::ds_write_b32;
5245          byte = 4;
5246       } else if (byte >= 2 && aligned2) {
5247          op = aco_opcode::ds_write_b16;
5248          byte = 2;
5249       } else if (byte >= 1) {
5250          op = aco_opcode::ds_write_b8;
5251          byte = 1;
5252       } else {
5253          assert(false);
5254       }
5255 
5256       offsets[write_count] = offset;
5257       bytes[write_count] = byte;
5258       opcodes[write_count] = op;
5259       write_count++;
5260       advance_write_mask(&todo, offset, byte);
5261    }
5262 
5263    Operand m = load_lds_size_m0(bld);
5264 
5265    split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
5266 
5267    for (unsigned i = 0; i < write_count; i++) {
5268       aco_opcode op = opcodes[i];
5269       if (op == aco_opcode::num_opcodes)
5270          continue;
5271 
5272       Temp split_data = write_datas[i];
5273 
5274       unsigned second = write_count;
5275       if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
5276          for (second = i + 1; second < write_count; second++) {
5277             if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
5278                op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
5279                opcodes[second] = aco_opcode::num_opcodes;
5280                break;
5281             }
5282          }
5283       }
5284 
5285       bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
5286       unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
5287 
5288       unsigned inline_offset = base_offset + offsets[i];
5289       unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
5290       Temp address_offset = address;
5291       if (inline_offset > max_offset) {
5292          address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
5293          inline_offset = offsets[i];
5294       }
5295 
5296       /* offsets[i] shouldn't be large enough for this to happen */
5297       assert(inline_offset <= max_offset);
5298 
5299       Instruction* instr;
5300       if (write2) {
5301          Temp second_data = write_datas[second];
5302          inline_offset /= split_data.bytes();
5303          instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
5304                         inline_offset + write2_off);
5305       } else {
5306          instr = bld.ds(op, address_offset, split_data, m, inline_offset);
5307       }
5308       instr->ds().sync = memory_sync_info(storage_shared);
5309 
5310       if (m.isUndefined())
5311          instr->operands.pop_back();
5312    }
5313 }
5314 
5315 aco_opcode
5316 get_buffer_store_op(unsigned bytes)
5317 {
5318    switch (bytes) {
5319    case 1: return aco_opcode::buffer_store_byte;
5320    case 2: return aco_opcode::buffer_store_short;
5321    case 4: return aco_opcode::buffer_store_dword;
5322    case 8: return aco_opcode::buffer_store_dwordx2;
5323    case 12: return aco_opcode::buffer_store_dwordx3;
5324    case 16: return aco_opcode::buffer_store_dwordx4;
5325    }
5326    unreachable("Unexpected store size");
5327    return aco_opcode::num_opcodes;
5328 }
5329 
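/* Splits a buffer store into chunks with supported store sizes (1, 2, 4, 8,
 * 12 or 16 bytes, with 12 unavailable on SMEM and GFX6 VMEM), no larger than
 * swizzle_element_size and narrowed where dword alignment can't be proven.
 */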
5330 void
5331 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
5332                    Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
5333                    Temp* write_datas, unsigned* offsets)
5334 {
5335    unsigned write_count_with_skips = 0;
5336    bool skips[16];
5337    unsigned bytes[16];
5338 
5339    /* determine how to split the data */
5340    unsigned todo = u_bit_consecutive(0, data.bytes());
5341    while (todo) {
5342       int offset, byte;
5343       skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
5344       offsets[write_count_with_skips] = offset;
5345       if (skips[write_count_with_skips]) {
5346          bytes[write_count_with_skips] = byte;
5347          advance_write_mask(&todo, offset, byte);
5348          write_count_with_skips++;
5349          continue;
5350       }
5351 
5352       /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
5353        * larger than swizzle_element_size */
5354       byte = MIN2(byte, swizzle_element_size);
5355       if (byte % 4)
5356          byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
5357 
5358       /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5359       if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5360          byte = 8;
5361 
5362       /* dword or larger stores have to be dword-aligned */
5363       unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5364       unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5365       bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5366       if (!dword_aligned)
5367          byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5368 
5369       bytes[write_count_with_skips] = byte;
5370       advance_write_mask(&todo, offset, byte);
5371       write_count_with_skips++;
5372    }
5373 
5374    /* actually split data */
5375    split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5376 
5377    /* remove skips */
5378    for (unsigned i = 0; i < write_count_with_skips; i++) {
5379       if (skips[i])
5380          continue;
5381       write_datas[*write_count] = write_datas[i];
5382       offsets[*write_count] = offsets[i];
5383       (*write_count)++;
5384    }
5385 }
5386 
5387 Temp
5388 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5389                       unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5390 {
5391    Builder bld(ctx->program, ctx->block);
5392    unsigned dword_size = elem_size_bytes / 4;
5393 
5394    if (!dst.id())
5395       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5396 
5397    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5398    aco_ptr<Instruction> instr{
5399       create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5400    instr->definitions[0] = Definition(dst);
5401 
5402    for (unsigned i = 0; i < cnt; ++i) {
5403       if (arr[i].id()) {
5404          assert(arr[i].size() == dword_size);
5405          allocated_vec[i] = arr[i];
5406          instr->operands[i] = Operand(arr[i]);
5407       } else {
5408          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5409                               Operand::zero(dword_size == 2 ? 8 : 4));
5410          allocated_vec[i] = zero;
5411          instr->operands[i] = Operand(zero);
5412       }
5413    }
5414 
5415    bld.insert(std::move(instr));
5416 
5417    if (split_cnt)
5418       emit_split_vector(ctx, dst, split_cnt);
5419    else
5420       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5421 
5422    return dst;
5423 }
5424 
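/* VMEM buffer instructions only have a 12-bit unsigned immediate offset
 * (0..4095), so fold any excess multiple of 4096 into voffset and return the
 * remaining const_offset.
 */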
5425 inline unsigned
5426 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5427 {
5428    if (const_offset >= 4096) {
5429       unsigned excess_const_offset = const_offset / 4096u * 4096u;
5430       const_offset %= 4096u;
5431 
5432       if (!voffset.id())
5433          voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5434       else if (unlikely(voffset.regClass() == s1))
5435          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5436                             Operand::c32(excess_const_offset), Operand(voffset));
5437       else if (likely(voffset.regClass() == v1))
5438          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5439       else
5440          unreachable("Unsupported register class of voffset");
5441    }
5442 
5443    return const_offset;
5444 }
5445 
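/* Extracts the wave's index within the threadgroup from bits [24:27] of the
 * merged_wave_info argument.
 */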
5446 Temp
5447 wave_id_in_threadgroup(isel_context* ctx)
5448 {
5449    Builder bld(ctx->program, ctx->block);
5450    return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
5451                    get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(24u | (4u << 16)));
5452 }
5453 
5454 Temp
5455 thread_id_in_threadgroup(isel_context* ctx)
5456 {
5457    /* tid_in_tg = wave_id * wave_size + tid_in_wave */
5458 
5459    Builder bld(ctx->program, ctx->block);
5460    Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
5461 
5462    if (ctx->program->workgroup_size <= ctx->program->wave_size)
5463       return tid_in_wave;
5464 
5465    Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
5466    Temp num_pre_threads =
5467       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
5468                Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
5469    return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
5470 }
5471 
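/* Instead of emitting memory stores, records the values of a store_output
 * intrinsic in ctx->outputs.temps, indexed by semantic location * 4 +
 * component. Returns false if the intrinsic's offset isn't a constant zero.
 */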
5472 bool
5473 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5474 {
5475    unsigned write_mask = nir_intrinsic_write_mask(instr);
5476    unsigned component = nir_intrinsic_component(instr);
5477    nir_src offset = *nir_get_io_offset_src(instr);
5478 
5479    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5480       return false;
5481 
5482    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5483 
5484    if (instr->src[0].ssa->bit_size == 64)
5485       write_mask = util_widen_mask(write_mask, 2);
5486 
5487    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5488 
5489    /* Use semantic location as index. radv already uses it as intrinsic base
5490     * but radeonsi does not. We need the LS output and TCS input indices to
5491     * match each other, so we use the semantic location explicitly. This also
5492     * lets the TCS epilog index tess factor temps by semantic location directly.
5493     */
5494    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5495    unsigned base = sem.location;
5496    if (ctx->stage == fragment_fs) {
5497       /* The color result is a legacy slot which never appears together with the
5498        * data result. Here we just use the data slot for it to simplify the
5499        * handling of both.
5500        */
5501       if (base == FRAG_RESULT_COLOR)
5502          base = FRAG_RESULT_DATA0;
5503 
5504       /* The second output of dual-source blend just uses the data1 slot for
5505        * simplicity, because dual-source blend does not support multiple render targets.
5506        */
5507       base += sem.dual_source_blend_index;
5508    }
5509    unsigned idx = base * 4u + component;
5510 
5511    for (unsigned i = 0; i < 8; ++i) {
5512       if (write_mask & (1 << i)) {
5513          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5514          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5515       }
5516       idx++;
5517    }
5518 
5519    if (ctx->stage == fragment_fs && ctx->program->info.ps.has_epilog && base >= FRAG_RESULT_DATA0) {
5520       unsigned index = base - FRAG_RESULT_DATA0;
5521 
5522       if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5523          ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5524       } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5525          ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5526       } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5527          ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5528       }
5529    }
5530 
5531    return true;
5532 }
5533 
5534 bool
5535 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5536 {
5537    /* Only TCS per-vertex inputs are supported by this function.
5538     * Per-vertex inputs only match between the VS and TCS invocation ids when
5539     * the number of invocations is the same.
5540     */
5541    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5542       return false;
5543 
5544    nir_src* off_src = nir_get_io_offset_src(instr);
5545    nir_src* vertex_index_src = nir_get_io_arrayed_index_src(instr);
5546    nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
5547    bool can_use_temps =
5548       nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
5549       nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
5550 
5551    if (!can_use_temps)
5552       return false;
5553 
5554    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5555 
5556    unsigned idx =
5557       sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5558    Temp* src = &ctx->inputs.temps[idx];
5559    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5560 
5561    return true;
5562 }
5563 
5564 void
5565 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5566 {
5567    /* LS passes outputs to TCS through temps if they have the same in/out patch size. */
5568    bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5569                          ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5570 
5571    bool ps_need_output = ctx->stage == fragment_fs;
5572 
5573    if (ls_need_output || ps_need_output) {
5574       bool stored_to_temps = store_output_to_temps(ctx, instr);
5575       if (!stored_to_temps) {
5576          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5577          abort();
5578       }
5579    } else {
5580       unreachable("Shader stage not implemented");
5581    }
5582 }
5583 
5584 bool
5585 in_exec_divergent_or_in_loop(isel_context* ctx)
5586 {
5587    return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5588           ctx->cf_info.had_divergent_discard;
5589 }
5590 
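/* GFX11+ interpolation: the attribute is loaded from LDS with lds_param_load
 * and interpolated with the v_interp_p10/p2 VINTERP instructions. If exec may
 * be divergent (in a loop, a divergent branch, or after a divergent discard),
 * a p_interp_gfx11 pseudo-instruction carrying a linear VGPR scratch operand
 * is emitted instead, to be lowered after instruction selection.
 */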
5591 void
5592 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5593                         Temp prim_mask, bool high_16bits)
5594 {
5595    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5596    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5597 
5598    Builder bld(ctx->program, ctx->block);
5599 
5600    if (in_exec_divergent_or_in_loop(ctx)) {
5601       bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5602                  Operand::c32(idx), Operand::c32(component), Operand::c32(high_16bits), coord1,
5603                  coord2, bld.m0(prim_mask));
5604       return;
5605    }
5606 
5607    Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5608 
5609    Temp res;
5610    if (dst.regClass() == v2b) {
5611       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1,
5612                                    p, high_16bits ? 0x5 : 0);
5613       bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p, coord2, p10,
5614                         high_16bits ? 0x1 : 0);
5615    } else {
5616       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5617       bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
5618    }
5619    /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5620    set_wqm(ctx, true);
5621 }
5622 
5623 void
5624 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5625                   Temp prim_mask, bool high_16bits)
5626 {
5627    if (ctx->options->gfx_level >= GFX11) {
5628       emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask, high_16bits);
5629       return;
5630    }
5631 
5632    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5633    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5634 
5635    Builder bld(ctx->program, ctx->block);
5636 
5637    if (dst.regClass() == v2b) {
5638       if (ctx->program->dev.has_16bank_lds) {
5639          assert(ctx->options->gfx_level <= GFX8);
5640          Builder::Result interp_p1 =
5641             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5642                        bld.m0(prim_mask), idx, component);
5643          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v1), coord1,
5644                                 bld.m0(prim_mask), interp_p1, idx, component, high_16bits);
5645          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5646                     interp_p1, idx, component, high_16bits);
5647       } else {
5648          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5649 
5650          if (ctx->options->gfx_level == GFX8)
5651             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5652 
5653          Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5654                                                 bld.m0(prim_mask), idx, component, high_16bits);
5655          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5656                     component, high_16bits);
5657       }
5658    } else {
5659       assert(!high_16bits);
5660       Temp interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5661                                   bld.m0(prim_mask), idx, component);
5662 
5663       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5664                  idx, component);
5665    }
5666 }
5667 
5668 void
5669 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5670                       Temp dst, Temp prim_mask, bool high_16bits)
5671 {
5672    Builder bld(ctx->program, ctx->block);
5673    Temp tmp = dst.bytes() == 2 ? bld.tmp(v1) : dst;
5674    if (ctx->options->gfx_level >= GFX11) {
5675       uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5676       if (in_exec_divergent_or_in_loop(ctx)) {
5677          bld.pseudo(aco_opcode::p_interp_gfx11, Definition(tmp), Operand(v1.as_linear()),
5678                     Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5679                     bld.m0(prim_mask));
5680       } else {
5681          Temp p =
5682             bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5683          bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(tmp), p, dpp_ctrl);
5684          /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5685          set_wqm(ctx, true);
5686       }
5687    } else {
5688       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(tmp), Operand::c32((vertex_id + 2) % 3),
5689                  bld.m0(prim_mask), idx, component);
5690    }
5691 
5692    if (dst.id() != tmp.id())
5693       emit_extract_vector(ctx, tmp, high_16bits, dst);
5694 }
5695 
5696 void
5697 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
5698 {
5699    Builder bld(ctx->program, ctx->block);
5700 
5701    aco_ptr<Instruction> vec(
5702       create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
5703    for (unsigned i = 0; i < num_components; i++) {
5704       if (ctx->args->frag_pos[i].used)
5705          vec->operands[i] = Operand(get_arg(ctx, ctx->args->frag_pos[i]));
5706       else
5707          vec->operands[i] = Operand(v1);
5708    }
5709    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
5710       assert(num_components == 4);
5711       vec->operands[3] =
5712          bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->frag_pos[3]));
5713    }
5714 
5715    for (Operand& op : vec->operands)
5716       op = op.isUndefined() ? Operand::zero() : op;
5717 
5718    vec->definitions[0] = Definition(dst);
5719    ctx->block->instructions.emplace_back(std::move(vec));
5720    emit_split_vector(ctx, dst, num_components);
5721    return;
5722 }
5723 
5724 void
5725 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
5726 {
5727    Builder bld(ctx->program, ctx->block);
5728    Temp cond;
5729 
5730    /* VRS Rate X = Ancillary[2:3]
5731     * VRS Rate Y = Ancillary[4:5]
5732     */
5733    Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5734                           Operand::c32(2u), Operand::c32(2u));
5735    Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5736                           Operand::c32(4u), Operand::c32(2u));
5737 
5738    /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
5739    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
5740    x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5741                      bld.copy(bld.def(v1), Operand::c32(4u)), cond);
5742 
5743    /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
5744    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
5745    y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5746                      bld.copy(bld.def(v1), Operand::c32(1u)), cond);
5747 
5748    bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
5749 }
5750 
5751 void
5752 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5753 {
5754    Temp dst = get_ssa_temp(ctx, &instr->def);
5755    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5756    unsigned idx = nir_intrinsic_base(instr);
5757    unsigned component = nir_intrinsic_component(instr);
5758    bool high_16bits = nir_intrinsic_io_semantics(instr).high_16bits;
5759    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5760 
5761    assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5762 
5763    if (instr->def.num_components == 1) {
5764       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask, high_16bits);
5765    } else {
5766       aco_ptr<Instruction> vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
5767                                                   instr->def.num_components, 1));
5768       for (unsigned i = 0; i < instr->def.num_components; i++) {
5769          Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5770          emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask, high_16bits);
5771          vec->operands[i] = Operand(tmp);
5772       }
5773       vec->definitions[0] = Definition(dst);
5774       ctx->block->instructions.emplace_back(std::move(vec));
5775    }
5776 }
5777 
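/* emit_load() callback for typed (MTBUF) loads: queries the vertex format to
 * find how many components can be fetched safely given the alignment and
 * constant offset, then picks a tbuffer_load_format opcode along with the
 * matching dfmt/nfmt encoding.
 */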
5778 Temp
5779 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5780                     unsigned alignment, unsigned const_offset, Temp dst_hint)
5781 {
5782    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5783    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5784 
5785    if (info.soffset.id()) {
5786       if (soffset.isTemp())
5787          vaddr = bld.copy(bld.def(v1), soffset);
5788       soffset = Operand(info.soffset);
5789    }
5790 
5791    if (soffset.isUndefined())
5792       soffset = Operand::zero();
5793 
5794    const bool offen = !vaddr.isUndefined();
5795    const bool idxen = info.idx.id();
5796 
5797    if (offen && idxen)
5798       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5799    else if (idxen)
5800       vaddr = Operand(info.idx);
5801 
5802    /* Determine number of fetched components.
5803     * Note that ACO IR works with GFX6-8 nfmt + dfmt fields; these are later converted for GFX10+.
5804     */
5805    const struct ac_vtx_format_info* vtx_info =
5806       ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5807    /* The number of channels in the format determines the memory range. */
5808    const unsigned max_components = vtx_info->num_channels;
5809    /* Calculate maximum number of components loaded according to alignment. */
5810    unsigned max_fetched_components = bytes_needed / info.component_size;
5811    max_fetched_components =
5812       ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5813                              alignment, max_fetched_components);
5814    const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5815    /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5816     * If a larger format is selected, it's still OK to load a smaller amount from it.
5817     */
5818    bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5819    unsigned bytes_size = 0;
5820    const unsigned bit_size = info.component_size * 8;
5821    aco_opcode op = aco_opcode::num_opcodes;
5822 
5823    if (bytes_needed == 2) {
5824       bytes_size = 2;
5825       op = aco_opcode::tbuffer_load_format_d16_x;
5826    } else if (bytes_needed <= 4) {
5827       bytes_size = 4;
5828       if (bit_size == 16)
5829          op = aco_opcode::tbuffer_load_format_d16_xy;
5830       else
5831          op = aco_opcode::tbuffer_load_format_x;
5832    } else if (bytes_needed <= 6) {
5833       bytes_size = 6;
5834       if (bit_size == 16)
5835          op = aco_opcode::tbuffer_load_format_d16_xyz;
5836       else
5837          op = aco_opcode::tbuffer_load_format_xy;
5838    } else if (bytes_needed <= 8) {
5839       bytes_size = 8;
5840       if (bit_size == 16)
5841          op = aco_opcode::tbuffer_load_format_d16_xyzw;
5842       else
5843          op = aco_opcode::tbuffer_load_format_xy;
5844    } else if (bytes_needed <= 12) {
5845       bytes_size = 12;
5846       op = aco_opcode::tbuffer_load_format_xyz;
5847    } else {
5848       bytes_size = 16;
5849       op = aco_opcode::tbuffer_load_format_xyzw;
5850    }
5851 
5852    /* Abort when a suitable opcode wasn't found so we don't compile buggy shaders. */
5853    if (op == aco_opcode::num_opcodes) {
5854       aco_err(bld.program, "unsupported bit size for typed buffer load");
5855       abort();
5856    }
5857 
5858    aco_ptr<Instruction> mtbuf{create_instruction(op, Format::MTBUF, 3, 1)};
5859    mtbuf->operands[0] = Operand(info.resource);
5860    mtbuf->operands[1] = vaddr;
5861    mtbuf->operands[2] = soffset;
5862    mtbuf->mtbuf().offen = offen;
5863    mtbuf->mtbuf().idxen = idxen;
5864    mtbuf->mtbuf().cache = info.cache;
5865    mtbuf->mtbuf().sync = info.sync;
5866    mtbuf->mtbuf().offset = const_offset;
5867    mtbuf->mtbuf().dfmt = fetch_fmt & 0xf;
5868    mtbuf->mtbuf().nfmt = fetch_fmt >> 4;
5869    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5870    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5871    mtbuf->definitions[0] = Definition(val);
5872    bld.insert(std::move(mtbuf));
5873 
5874    return val;
5875 }
5876 
5877 const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, false, true, 4096};
5878 
5879 void
5880 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5881 {
5882    Builder bld(ctx->program, ctx->block);
5883    Temp dst = get_ssa_temp(ctx, &instr->def);
5884    nir_src offset = *nir_get_io_offset_src(instr);
5885 
5886    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5887       isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5888 
5889    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5890 
5891    unsigned idx = nir_intrinsic_base(instr);
5892    unsigned component = nir_intrinsic_component(instr);
5893    bool high_16bits = nir_intrinsic_io_semantics(instr).high_16bits;
5894    unsigned vertex_id = 0; /* P0 */
5895 
5896    if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5897       vertex_id = nir_src_as_uint(instr->src[0]);
5898 
5899    if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5900       emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask, high_16bits);
5901    } else {
5902       unsigned num_components = instr->def.num_components;
5903       if (instr->def.bit_size == 64)
5904          num_components *= 2;
5905       aco_ptr<Instruction> vec{
5906          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5907       for (unsigned i = 0; i < num_components; i++) {
5908          unsigned chan_component = (component + i) % 4;
5909          unsigned chan_idx = idx + (component + i) / 4;
5910          vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5911          emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5912                                prim_mask, high_16bits);
5913       }
5914       vec->definitions[0] = Definition(dst);
5915       bld.insert(std::move(vec));
5916    }
5917 }
5918 
5919 void
5920 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5921 {
5922    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5923 
5924    Builder bld(ctx->program, ctx->block);
5925    Temp dst = get_ssa_temp(ctx, &instr->def);
5926 
5927    if (load_input_from_temps(ctx, instr, dst))
5928       return;
5929 
5930    unreachable("LDS-based TCS input should have been lowered in NIR.");
5931 }
5932 
5933 void
5934 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5935 {
5936    switch (ctx->shader->info.stage) {
5937    case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5938    default: unreachable("Unimplemented shader stage");
5939    }
5940 }
5941 
5942 void
5943 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5944 {
5945    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5946 
5947    Builder bld(ctx->program, ctx->block);
5948    Temp dst = get_ssa_temp(ctx, &instr->def);
5949 
5950    Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5951    Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5952    Operand tes_w = Operand::zero();
5953 
5954    if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
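      /* Barycentric coordinates of a triangle sum to 1, so w = 1.0 - u - v. */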
5955       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5956       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5957       tes_w = Operand(tmp);
5958    }
5959 
5960    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5961    emit_split_vector(ctx, tess_coord, 3);
5962 }
5963 
5964 ac_hw_cache_flags
5965 get_cache_flags(isel_context* ctx, unsigned access)
5966 {
5967    return ac_get_hw_cache_flags(ctx->program->gfx_level, (gl_access_qualifier)access);
5968 }
5969 
5970 ac_hw_cache_flags
5971 get_atomic_cache_flags(isel_context* ctx, bool return_previous)
5972 {
5973    ac_hw_cache_flags cache = get_cache_flags(ctx, ACCESS_TYPE_ATOMIC);
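   /* Atomics only write back the pre-op value when it is requested: via the
    * GLC bit before GFX12, via a temporal hint on GFX12+.
    */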
5974    if (return_previous && ctx->program->gfx_level >= GFX12)
5975       cache.gfx12.temporal_hint |= gfx12_atomic_return;
5976    else if (return_previous)
5977       cache.value |= ac_glc;
5978    return cache;
5979 }
5980 
5981 void
5982 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5983             Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
5984             unsigned access = ACCESS_CAN_REORDER, memory_sync_info sync = memory_sync_info())
5985 {
5986    Builder bld(ctx->program, ctx->block);
5987 
5988    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
5989 
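   /* SMEM is only usable when the result is uniform and the load can be
    * reordered past stores; GFX6-7 additionally lack a GLC bit on SMEM loads.
    */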
5990    bool use_smem = dst.type() != RegType::vgpr && (ctx->options->gfx_level >= GFX8 || !glc) &&
5991                    (access & ACCESS_CAN_REORDER);
5992    if (use_smem)
5993       offset = bld.as_uniform(offset);
5994    else {
5995       /* GFX6-7 are affected by a hw bug that prevents address clamping to
5996        * work correctly when the SGPR offset is used.
5997        */
5998       if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5999          offset = as_vgpr(ctx, offset);
6000    }
6001 
6002    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
6003    info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD | (use_smem ? ACCESS_TYPE_SMEM : 0));
6004    info.sync = sync;
6005    info.align_mul = align_mul;
6006    info.align_offset = align_offset;
6007    if (use_smem)
6008       emit_load(ctx, bld, info, smem_load_params);
6009    else
6010       emit_load(ctx, bld, info, mubuf_load_params);
6011 }
6012 
6013 void
6014 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
6015 {
6016    Temp dst = get_ssa_temp(ctx, &instr->def);
6017    Builder bld(ctx->program, ctx->block);
6018    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6019 
6020    unsigned size = instr->def.bit_size / 8;
6021    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6022                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
6023 }
6024 
6025 void
6026 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
6027 {
6028    Temp dst = get_ssa_temp(ctx, &instr->def);
6029 
6030    Builder bld(ctx->program, ctx->block);
6031 
6032    uint32_t desc[4];
6033    ac_build_raw_buffer_descriptor(ctx->options->gfx_level, 0, 0, desc);
6034 
6035    unsigned base = nir_intrinsic_base(instr);
6036    unsigned range = nir_intrinsic_range(instr);
6037 
6038    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
6039    if (base && offset.type() == RegType::sgpr)
6040       offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
6041                               Operand::c32(base));
6042    else if (base && offset.type() == RegType::vgpr)
6043       offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
6044 
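   /* Build a raw buffer descriptor over the shader's embedded constant data;
    * NUM_RECORDS is clamped to the constant data size for bounds checking.
    */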
6045    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6046                           bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
6047                                      Operand::c32(ctx->constant_data_offset)),
6048                           Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
6049                           Operand::c32(desc[3]));
6050    unsigned size = instr->def.bit_size / 8;
6051    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, nir_intrinsic_align_mul(instr),
6052                nir_intrinsic_align_offset(instr));
6053 }
6054 
6055 /* Packs multiple Temps of different sizes into a vector of v1 Temps.
6056  * The byte count of each input Temp must be a multiple of 2.
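 * For example, {v2b, v1, v2b} packs into two v1 dwords: the first v2b pairs
 * with the low half of the v1, and the high half of the v1 pairs with the
 * trailing v2b.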
6057  */
6058 static std::vector<Temp>
6059 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
6060 {
6061    Builder bld(ctx->program, ctx->block);
6062    std::vector<Temp> packed;
6063    Temp low = Temp();
6064    for (Temp tmp : unpacked) {
6065       assert(tmp.bytes() % 2 == 0);
6066       unsigned byte_idx = 0;
6067       while (byte_idx < tmp.bytes()) {
6068          if (low != Temp()) {
6069             Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
6070             Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
6071             low = Temp();
6072             packed.push_back(dword);
6073             byte_idx += 2;
6074          } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
6075             packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
6076             byte_idx += 4;
6077          } else {
6078             low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
6079             byte_idx += 2;
6080          }
6081       }
6082    }
6083    if (low != Temp()) {
6084       Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
6085       packed.push_back(dword);
6086    }
6087    return packed;
6088 }
6089 
6090 static bool
6091 should_declare_array(ac_image_dim dim)
6092 {
6093    return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
6094           dim == ac_image_2darraymsaa;
6095 }
6096 
6097 static int
6098 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
6099 {
6100    switch (dim) {
6101    case GLSL_SAMPLER_DIM_BUF: return 1;
6102    case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
6103    case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
6104    case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
6105    case GLSL_SAMPLER_DIM_3D:
6106    case GLSL_SAMPLER_DIM_CUBE: return 3;
6107    case GLSL_SAMPLER_DIM_RECT:
6108    case GLSL_SAMPLER_DIM_SUBPASS: return 2;
6109    case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
6110    default: break;
6111    }
6112    return 0;
6113 }
6114 
6115 static MIMG_instruction*
6116 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
6117           Operand vdata = Operand(v1))
6118 {
6119    bool is_vsample = !samp.isUndefined() || op == aco_opcode::image_msaa_load;
6120 
6121    size_t nsa_size = bld.program->dev.max_nsa_vgprs;
6122    if (!is_vsample && bld.program->gfx_level >= GFX12)
6123       nsa_size++; /* VIMAGE can encode one more VADDR */
6124    nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
6125 
6126    const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
6127    if (strict_wqm)
6128       nsa_size = coords.size();
6129 
6130    for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
6131       if (!coords[i].id())
6132          continue;
6133 
6134       coords[i] = as_vgpr(bld, coords[i]);
6135    }
6136 
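   /* Coordinates that don't fit in the NSA encoding are merged into a single
    * contiguous vector and passed as the last VADDR operand.
    */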
6137    if (nsa_size < coords.size()) {
6138       Temp coord = coords[nsa_size];
6139       if (coords.size() - nsa_size > 1) {
6140          aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
6141                                                      coords.size() - nsa_size, 1)};
6142 
6143          unsigned coord_size = 0;
6144          for (unsigned i = nsa_size; i < coords.size(); i++) {
6145             vec->operands[i - nsa_size] = Operand(coords[i]);
6146             coord_size += coords[i].size();
6147          }
6148 
6149          coord = bld.tmp(RegType::vgpr, coord_size);
6150          vec->definitions[0] = Definition(coord);
6151          bld.insert(std::move(vec));
6152       } else {
6153          coord = as_vgpr(bld, coord);
6154       }
6155 
6156       coords[nsa_size] = coord;
6157       coords.resize(nsa_size + 1);
6158    }
6159 
6160    bool has_dst = dst.id() != 0;
6161 
6162    aco_ptr<Instruction> mimg{create_instruction(op, Format::MIMG, 3 + coords.size(), has_dst)};
6163    if (has_dst)
6164       mimg->definitions[0] = Definition(dst);
6165    mimg->operands[0] = Operand(rsrc);
6166    mimg->operands[1] = samp;
6167    mimg->operands[2] = vdata;
6168    for (unsigned i = 0; i < coords.size(); i++)
6169       mimg->operands[3 + i] = Operand(coords[i]);
6170    mimg->mimg().strict_wqm = strict_wqm;
6171 
6172    return &bld.insert(std::move(mimg))->mimg();
6173 }
6174 
6175 void
6176 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6177 {
6178    Builder bld(ctx->program, ctx->block);
6179    Temp dst = get_ssa_temp(ctx, &instr->def);
6180    Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6181    Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6182    Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6183    Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6184    Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6185    Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6186 
6187    /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
6188     * There are five smaller vector groups:
6189     * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
6190     * These directly match the NIR intrinsic sources.
6191     */
6192    std::vector<Temp> args = {
6193       node, tmax, origin, dir, inv_dir,
6194    };
6195 
6196    if (bld.program->gfx_level == GFX10_3) {
6197       std::vector<Temp> scalar_args;
6198       for (Temp tmp : args) {
6199          for (unsigned i = 0; i < tmp.size(); i++)
6200             scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
6201       }
6202       args = std::move(scalar_args);
6203    }
6204 
6205    MIMG_instruction* mimg =
6206       emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
6207    mimg->dim = ac_image_1d;
6208    mimg->dmask = 0xf;
6209    mimg->unrm = true;
6210    mimg->r128 = true;
6211 
6212    emit_split_vector(ctx, dst, instr->def.num_components);
6213 }
6214 
6215 static std::vector<Temp>
6216 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6217 {
6218 
6219    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6220    bool a16 = instr->src[1].ssa->bit_size == 16;
6221    RegClass rc = a16 ? v2b : v1;
6222    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6223    bool is_array = nir_intrinsic_image_array(instr);
6224    ASSERTED bool add_frag_pos =
6225       (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6226    assert(!add_frag_pos && "Input attachments should be lowered.");
6227    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6228    bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6229    int count = image_type_to_components_count(dim, is_array);
6230    std::vector<Temp> coords;
6231    Builder bld(ctx->program, ctx->block);
6232 
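   /* GFX9 addresses 1D images as 2D with a zero Y coordinate, so insert a
    * zero between the X coordinate and the array layer.
    */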
6233    if (gfx9_1d) {
6234       coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
6235       coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
6236       if (is_array)
6237          coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
6238    } else {
6239       for (int i = 0; i < count; i++)
6240          coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
6241    }
6242 
6243    bool has_lod = false;
6244    Temp lod;
6245 
6246    if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
6247        instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
6248        instr->intrinsic == nir_intrinsic_bindless_image_store) {
6249       int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
6250       assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
6251       has_lod =
6252          !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;
6253 
6254       if (has_lod)
6255          lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
6256    }
6257 
6258    if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
6259       /* The hw can't bind a slice of a 3D image as a 2D image, because it
6260        * ignores BASE_ARRAY if the target is 3D. The workaround is to read
6261        * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
6262        */
6263       assert(ctx->options->gfx_level == GFX9);
6264       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6265       Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
6266       /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
6267       Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
6268                                   Operand::c32(13u));
6269 
6270       if (has_lod) {
6271          /* If there's a lod parameter, it matters whether the image is 3d or 2d,
6272           * because the hw reads either the fourth or the third component as lod.
6273           * So detect 3d images and otherwise place the lod in the third component.
6274           * For non-3d descriptors we effectively add the lod to the coords twice;
6275           * the hw only reads the first one and ignores the second.
6276           */
6277          Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
6278          Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
6279                               Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
6280          Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
6281                                    Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
6282          first_layer =
6283             bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
6284       }
6285 
6286       if (a16)
6287          coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
6288       else
6289          coords.emplace_back(first_layer);
6290    }
6291 
6292    if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6293       assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
6294       coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
6295    }
6296 
6297    if (has_lod)
6298       coords.emplace_back(lod);
6299 
6300    return emit_pack_v1(ctx, coords);
6301 }
6302 
6303 memory_sync_info
6304 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6305 {
6306    /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6307    if (semantics & semantic_atomicrmw)
6308       return memory_sync_info(storage, semantics);
6309 
6310    unsigned access = nir_intrinsic_access(instr);
6311 
6312    if (access & ACCESS_VOLATILE)
6313       semantics |= semantic_volatile;
6314    if (access & ACCESS_CAN_REORDER)
6315       semantics |= semantic_can_reorder | semantic_private;
6316 
6317    return memory_sync_info(storage, semantics);
6318 }
6319 
6320 Operand
6321 emit_tfe_init(Builder& bld, Temp dst)
6322 {
6323    Temp tmp = bld.tmp(dst.regClass());
6324 
6325    aco_ptr<Instruction> vec{
6326       create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6327    for (unsigned i = 0; i < dst.size(); i++)
6328       vec->operands[i] = Operand::zero();
6329    vec->definitions[0] = Definition(tmp);
6330    /* Since this is fixed to an instruction's definition register, any CSE will
6331     * just create copies. Copying costs about the same as zero-initialization,
6332     * but these copies can break up clauses.
6333     */
6334    vec->definitions[0].setNoCSE(true);
6335    bld.insert(std::move(vec));
6336 
6337    return Operand(tmp);
6338 }
6339 
6340 void
6341 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6342 {
6343    Builder bld(ctx->program, ctx->block);
6344    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6345    bool is_array = nir_intrinsic_image_array(instr);
6346    bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
6347    Temp dst = get_ssa_temp(ctx, &instr->def);
6348 
6349    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6350 
6351    unsigned result_size = instr->def.num_components - is_sparse;
6352    unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
6353    expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
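   /* Buffer image loads can only return consecutive components, so widen the
    * mask to a contiguous prefix, e.g. 0b1010 -> 0b1111.
    */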
6354    if (dim == GLSL_SAMPLER_DIM_BUF)
6355       expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6356    unsigned dmask = expand_mask;
6357    if (instr->def.bit_size == 64) {
6358       expand_mask &= 0x9;
6359       /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6360       dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6361    }
6362    if (is_sparse)
6363       expand_mask |= 1 << result_size;
6364 
6365    bool d16 = instr->def.bit_size == 16;
6366    assert(!d16 || !is_sparse);
6367 
6368    unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;
6369 
6370    Temp tmp;
6371    if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
6372       tmp = dst;
6373    else
6374       tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));
6375 
6376    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6377 
6378    if (dim == GLSL_SAMPLER_DIM_BUF) {
6379       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6380 
6381       aco_opcode opcode;
6382       if (!d16) {
6383          switch (util_bitcount(dmask)) {
6384          case 1: opcode = aco_opcode::buffer_load_format_x; break;
6385          case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6386          case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6387          case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6388          default: unreachable(">4 channel buffer image load");
6389          }
6390       } else {
6391          switch (util_bitcount(dmask)) {
6392          case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
6393          case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
6394          case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
6395          case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
6396          default: unreachable(">4 channel buffer image load");
6397          }
6398       }
6399       aco_ptr<Instruction> load{create_instruction(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6400       load->operands[0] = Operand(resource);
6401       load->operands[1] = Operand(vindex);
6402       load->operands[2] = Operand::c32(0);
6403       load->definitions[0] = Definition(tmp);
6404       load->mubuf().idxen = true;
6405       load->mubuf().cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
6406       load->mubuf().sync = sync;
6407       load->mubuf().tfe = is_sparse;
6408       if (load->mubuf().tfe)
6409          load->operands[3] = emit_tfe_init(bld, tmp);
6410       ctx->block->instructions.emplace_back(std::move(load));
6411    } else {
6412       std::vector<Temp> coords = get_image_coords(ctx, instr);
6413 
6414       aco_opcode opcode;
6415       if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6416          opcode = aco_opcode::image_load;
6417       } else {
6418          bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6419          opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6420       }
6421 
6422       Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6423       MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
6424       load->cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
6425       load->a16 = instr->src[1].ssa->bit_size == 16;
6426       load->d16 = d16;
6427       load->dmask = dmask;
6428       load->unrm = true;
6429       load->tfe = is_sparse;
6430 
6431       if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6432          load->dim = is_array ? ac_image_2darray : ac_image_2d;
6433          load->da = is_array;
6434          load->sync = memory_sync_info();
6435       } else {
6436          ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6437          load->dim = sdim;
6438          load->da = should_declare_array(sdim);
6439          load->sync = sync;
6440       }
6441    }
6442 
6443    if (is_sparse && instr->def.bit_size == 64) {
6444       /* The result components are 64-bit but the sparse residency code is
6445        * 32-bit. So add a zero to the end so expand_vector() works correctly.
6446        */
6447       tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6448                        Operand::zero());
6449    }
6450 
6451    expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
6452 }
6453 
6454 void
6455 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6456 {
6457    Builder bld(ctx->program, ctx->block);
6458    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6459    bool is_array = nir_intrinsic_image_array(instr);
6460    Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6461    bool d16 = instr->src[3].ssa->bit_size == 16;
6462 
6463    /* only R64_UINT and R64_SINT supported */
6464    if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6465       data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6466    data = as_vgpr(ctx, data);
6467 
6468    uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6469 
6470    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6471    unsigned access = nir_intrinsic_access(instr);
6472    ac_hw_cache_flags cache =
6473       get_cache_flags(ctx, access | ACCESS_TYPE_STORE | ACCESS_MAY_STORE_SUBDWORD);
6474 
6475    uint32_t dmask = BITFIELD_MASK(num_components);
6476    if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6477       for (uint32_t i = 0; i < instr->num_components; i++) {
6478          /* components not in dmask receive:
6479           * GFX6-11.5:  zero
6480           * GFX12+: first component in dmask
6481           */
6482          nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6483          if (nir_scalar_is_undef(comp)) {
6484             dmask &= ~BITFIELD_BIT(i);
6485          } else if (ctx->options->gfx_level <= GFX11_5) {
6486             if (nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0)
6487                dmask &= ~BITFIELD_BIT(i);
6488          } else {
6489             unsigned first = dim == GLSL_SAMPLER_DIM_BUF ? 0 : ffs(dmask) - 1;
6490             if (i != first && nir_scalar_equal(nir_scalar_resolved(instr->src[3].ssa, first), comp))
6491                dmask &= ~BITFIELD_BIT(i);
6492          }
6493       }
6494 
6495       /* dmask cannot be 0, at least one vgpr is always read */
6496       if (dmask == 0)
6497          dmask = 1;
6498       /* buffer store only supports consecutive components. */
6499       if (dim == GLSL_SAMPLER_DIM_BUF)
6500          dmask = BITFIELD_MASK(util_last_bit(dmask));
6501 
6502       if (dmask != BITFIELD_MASK(num_components)) {
6503          uint32_t dmask_count = util_bitcount(dmask);
6504          RegClass rc = d16 ? v2b : v1;
6505          if (dmask_count == 1) {
6506             data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6507          } else {
6508             aco_ptr<Instruction> vec{
6509                create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6510             uint32_t index = 0;
6511             u_foreach_bit (bit, dmask) {
6512                vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6513             }
6514             data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6515             vec->definitions[0] = Definition(data);
6516             bld.insert(std::move(vec));
6517          }
6518       }
6519    }
6520 
6521    if (dim == GLSL_SAMPLER_DIM_BUF) {
6522       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6523       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6524       aco_opcode opcode;
6525       if (!d16) {
6526          switch (dmask) {
6527          case 0x1: opcode = aco_opcode::buffer_store_format_x; break;
6528          case 0x3: opcode = aco_opcode::buffer_store_format_xy; break;
6529          case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break;
6530          case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break;
6531          default: unreachable(">4 channel buffer image store");
6532          }
6533       } else {
6534          switch (dmask) {
6535          case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6536          case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6537          case 0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6538          case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6539          default: unreachable(">4 channel buffer image store");
6540          }
6541       }
6542       aco_ptr<Instruction> store{create_instruction(opcode, Format::MUBUF, 4, 0)};
6543       store->operands[0] = Operand(rsrc);
6544       store->operands[1] = Operand(vindex);
6545       store->operands[2] = Operand::c32(0);
6546       store->operands[3] = Operand(data);
6547       store->mubuf().idxen = true;
6548       store->mubuf().cache = cache;
6549       store->mubuf().disable_wqm = true;
6550       store->mubuf().sync = sync;
6551       ctx->program->needs_exact = true;
6552       ctx->block->instructions.emplace_back(std::move(store));
6553       return;
6554    }
6555 
6556    assert(data.type() == RegType::vgpr);
6557    std::vector<Temp> coords = get_image_coords(ctx, instr);
6558    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6559 
6560    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6561    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6562 
6563    MIMG_instruction* store =
6564       emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
6565    store->cache = cache;
6566    store->a16 = instr->src[1].ssa->bit_size == 16;
6567    store->d16 = d16;
6568    store->dmask = dmask;
6569    store->unrm = true;
6570    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6571    store->dim = sdim;
6572    store->da = should_declare_array(sdim);
6573    store->disable_wqm = true;
6574    store->sync = sync;
6575    ctx->program->needs_exact = true;
6576    return;
6577 }
6578 
6579 void
6580 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6581                                  aco_opcode* image_op)
6582 {
6583    switch (op) {
6584    case nir_atomic_op_iadd:
6585       *buf_op = aco_opcode::buffer_atomic_add;
6586       *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6587       *image_op = aco_opcode::image_atomic_add;
6588       break;
6589    case nir_atomic_op_umin:
6590       *buf_op = aco_opcode::buffer_atomic_umin;
6591       *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6592       *image_op = aco_opcode::image_atomic_umin;
6593       break;
6594    case nir_atomic_op_imin:
6595       *buf_op = aco_opcode::buffer_atomic_smin;
6596       *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6597       *image_op = aco_opcode::image_atomic_smin;
6598       break;
6599    case nir_atomic_op_umax:
6600       *buf_op = aco_opcode::buffer_atomic_umax;
6601       *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6602       *image_op = aco_opcode::image_atomic_umax;
6603       break;
6604    case nir_atomic_op_imax:
6605       *buf_op = aco_opcode::buffer_atomic_smax;
6606       *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6607       *image_op = aco_opcode::image_atomic_smax;
6608       break;
6609    case nir_atomic_op_iand:
6610       *buf_op = aco_opcode::buffer_atomic_and;
6611       *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6612       *image_op = aco_opcode::image_atomic_and;
6613       break;
6614    case nir_atomic_op_ior:
6615       *buf_op = aco_opcode::buffer_atomic_or;
6616       *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6617       *image_op = aco_opcode::image_atomic_or;
6618       break;
6619    case nir_atomic_op_ixor:
6620       *buf_op = aco_opcode::buffer_atomic_xor;
6621       *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6622       *image_op = aco_opcode::image_atomic_xor;
6623       break;
6624    case nir_atomic_op_xchg:
6625       *buf_op = aco_opcode::buffer_atomic_swap;
6626       *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6627       *image_op = aco_opcode::image_atomic_swap;
6628       break;
6629    case nir_atomic_op_cmpxchg:
6630       *buf_op = aco_opcode::buffer_atomic_cmpswap;
6631       *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6632       *image_op = aco_opcode::image_atomic_cmpswap;
6633       break;
6634    case nir_atomic_op_inc_wrap:
6635       *buf_op = aco_opcode::buffer_atomic_inc;
6636       *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6637       *image_op = aco_opcode::image_atomic_inc;
6638       break;
6639    case nir_atomic_op_dec_wrap:
6640       *buf_op = aco_opcode::buffer_atomic_dec;
6641       *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6642       *image_op = aco_opcode::image_atomic_dec;
6643       break;
6644    case nir_atomic_op_fadd:
6645       *buf_op = aco_opcode::buffer_atomic_add_f32;
6646       *buf_op64 = aco_opcode::num_opcodes;
6647       *image_op = aco_opcode::num_opcodes;
6648       break;
6649    case nir_atomic_op_fmin:
6650       *buf_op = aco_opcode::buffer_atomic_fmin;
6651       *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6652       *image_op = aco_opcode::image_atomic_fmin;
6653       break;
6654    case nir_atomic_op_fmax:
6655       *buf_op = aco_opcode::buffer_atomic_fmax;
6656       *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6657       *image_op = aco_opcode::image_atomic_fmax;
6658       break;
6659    default: unreachable("unsupported atomic operation");
6660    }
6661 }
6662 
6663 void
6664 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6665 {
6666    bool return_previous = !nir_def_is_unused(&instr->def);
6667    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6668    bool is_array = nir_intrinsic_image_array(instr);
6669    Builder bld(ctx->program, ctx->block);
6670 
6671    const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6672    const bool cmpswap = op == nir_atomic_op_cmpxchg;
6673 
6674    aco_opcode buf_op, buf_op64, image_op;
6675    translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6676 
6677    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6678    bool is_64bit = data.bytes() == 8;
6679    assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6680 
6681    if (cmpswap)
6682       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6683                         get_ssa_temp(ctx, instr->src[4].ssa), data);
6684 
6685    Temp dst = get_ssa_temp(ctx, &instr->def);
6686    memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6687 
6688    if (dim == GLSL_SAMPLER_DIM_BUF) {
6689       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6690       Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6691       // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6692       // implemented.");
6693       aco_ptr<Instruction> mubuf{create_instruction(is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4,
6694                                                     return_previous ? 1 : 0)};
6695       mubuf->operands[0] = Operand(resource);
6696       mubuf->operands[1] = Operand(vindex);
6697       mubuf->operands[2] = Operand::c32(0);
6698       mubuf->operands[3] = Operand(data);
6699       Definition def =
6700          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6701       if (return_previous)
6702          mubuf->definitions[0] = def;
6703       mubuf->mubuf().offset = 0;
6704       mubuf->mubuf().idxen = true;
6705       mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6706       mubuf->mubuf().disable_wqm = true;
6707       mubuf->mubuf().sync = sync;
6708       ctx->program->needs_exact = true;
6709       ctx->block->instructions.emplace_back(std::move(mubuf));
6710       if (return_previous && cmpswap)
6711          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6712       return;
6713    }
6714 
6715    std::vector<Temp> coords = get_image_coords(ctx, instr);
6716    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6717    Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6718    MIMG_instruction* mimg =
6719       emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
6720    mimg->cache = get_atomic_cache_flags(ctx, return_previous);
6721    mimg->dmask = (1 << data.size()) - 1;
6722    mimg->a16 = instr->src[1].ssa->bit_size == 16;
6723    mimg->unrm = true;
6724    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6725    mimg->dim = sdim;
6726    mimg->da = should_declare_array(sdim);
6727    mimg->disable_wqm = true;
6728    mimg->sync = sync;
6729    ctx->program->needs_exact = true;
6730    if (return_previous && cmpswap)
6731       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6732    return;
6733 }
6734 
6735 void
6736 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6737 {
6738    Builder bld(ctx->program, ctx->block);
6739    unsigned num_components = instr->num_components;
6740 
6741    Temp dst = get_ssa_temp(ctx, &instr->def);
6742    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6743 
6744    unsigned access = nir_intrinsic_access(instr);
6745    unsigned size = instr->def.bit_size / 8;
6746 
6747    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6748                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), access,
6749                get_memory_sync_info(instr, storage_buffer, 0));
6750 }
6751 
6752 void
6753 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6754 {
6755    Builder bld(ctx->program, ctx->block);
6756    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6757    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6758    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6759    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6760 
6761    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6762 
6763    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6764 
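   /* Split the store along the writemask into contiguous chunks, each of
    * which is emitted as its own MUBUF store below.
    */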
6765    unsigned write_count = 0;
6766    Temp write_datas[32];
6767    unsigned offsets[32];
6768    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6769                       write_datas, offsets);
6770 
6771    /* GFX6-7 are affected by a hw bug that prevents address clamping to work
6772     * correctly when the SGPR offset is used.
6773     */
6774    if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6775       offset = as_vgpr(ctx, offset);
6776 
6777    for (unsigned i = 0; i < write_count; i++) {
6778       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6779       unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE;
6780       if (write_datas[i].bytes() < 4)
6781          access |= ACCESS_MAY_STORE_SUBDWORD;
6782 
6783       aco_ptr<Instruction> store{create_instruction(op, Format::MUBUF, 4, 0)};
6784       store->operands[0] = Operand(rsrc);
6785       store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6786       store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6787       store->operands[3] = Operand(write_datas[i]);
6788       store->mubuf().offset = offsets[i];
6789       store->mubuf().offen = (offset.type() == RegType::vgpr);
6790       store->mubuf().cache = get_cache_flags(ctx, access);
6791       store->mubuf().disable_wqm = true;
6792       store->mubuf().sync = sync;
6793       ctx->program->needs_exact = true;
6794       ctx->block->instructions.emplace_back(std::move(store));
6795    }
6796 }
6797 
6798 void
6799 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6800 {
6801    Builder bld(ctx->program, ctx->block);
6802    bool return_previous = !nir_def_is_unused(&instr->def);
6803    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6804 
6805    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6806    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6807 
6808    aco_opcode op32, op64, image_op;
6809    translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6810 
6811    if (cmpswap)
6812       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6813                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6814 
6815    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6816    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6817    Temp dst = get_ssa_temp(ctx, &instr->def);
6818 
6819    aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6820    aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6821    mubuf->operands[0] = Operand(rsrc);
6822    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6823    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6824    mubuf->operands[3] = Operand(data);
6825    Definition def =
6826       return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6827    if (return_previous)
6828       mubuf->definitions[0] = def;
6829    mubuf->mubuf().offset = 0;
6830    mubuf->mubuf().offen = (offset.type() == RegType::vgpr);
6831    mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6832    mubuf->mubuf().disable_wqm = true;
6833    mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6834    ctx->program->needs_exact = true;
6835    ctx->block->instructions.emplace_back(std::move(mubuf));
6836    if (return_previous && cmpswap)
6837       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6838 }
6839 
6840 void
6841 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6842              Temp* offset)
6843 {
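   /* store_global_amd takes the data in src[0] and the address in src[1];
    * the last source of these intrinsics is the optional offset.
    */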
6844    bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6845    *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6846 
6847    *const_offset = nir_intrinsic_base(intrin);
6848 
6849    unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6850    nir_src offset_src = intrin->src[num_src - 1];
6851    if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6852       *offset = get_ssa_temp(ctx, offset_src.ssa);
6853    else
6854       *offset = Temp();
6855 }
6856 
6857 void
6858 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6859 {
6860    Builder bld(ctx->program, ctx->block);
6861    unsigned num_components = instr->num_components;
6862    unsigned component_size = instr->def.bit_size / 8;
6863 
6864    Temp addr, offset;
6865    uint32_t const_offset;
6866    parse_global(ctx, instr, &addr, &const_offset, &offset);
6867 
6868    LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6869                         component_size};
6870    if (offset.id()) {
6871       info.resource = addr;
6872       info.offset = Operand(offset);
6873    }
6874    info.const_offset = const_offset;
6875    info.align_mul = nir_intrinsic_align_mul(instr);
6876    info.align_offset = nir_intrinsic_align_offset(instr);
6877    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6878 
6879    /* Don't expand global loads when they use MUBUF or SMEM.
6880     * Global loads don't have the bounds checking that buffer loads have,
6881     * which is what makes expanding the load safe for buffers.
6882     */
6883    unsigned align = nir_intrinsic_align(instr);
6884    bool byte_align_for_smem_mubuf =
6885       can_use_byte_align_for_global_load(num_components, component_size, align, false);
6886 
6887    unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD;
6888    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6889 
6890    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6891     * it's safe to use SMEM. */
6892    bool can_use_smem = (access & ACCESS_NON_WRITEABLE) && byte_align_for_smem_mubuf;
6893    if (info.dst.type() == RegType::vgpr || (ctx->options->gfx_level < GFX8 && glc) ||
6894        !can_use_smem) {
6895       EmitLoadParameters params = global_load_params;
6896       params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf;
6897       info.cache = get_cache_flags(ctx, access);
6898       emit_load(ctx, bld, info, params);
6899    } else {
6900       if (info.resource.id())
6901          info.resource = bld.as_uniform(info.resource);
6902       info.offset = Operand(bld.as_uniform(info.offset));
6903       info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_SMEM);
6904       emit_load(ctx, bld, info, smem_load_params);
6905    }
6906 }
6907 
6908 void
6909 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6910 {
6911    Builder bld(ctx->program, ctx->block);
6912    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6913    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6914 
6915    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6916    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6917 
6918    unsigned write_count = 0;
6919    Temp write_datas[32];
6920    unsigned offsets[32];
6921    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6922                       write_datas, offsets);
6923 
6924    Temp addr, offset;
6925    uint32_t const_offset;
6926    parse_global(ctx, instr, &addr, &const_offset, &offset);
6927 
6928    for (unsigned i = 0; i < write_count; i++) {
6929       Temp write_address = addr;
6930       uint32_t write_const_offset = const_offset;
6931       Temp write_offset = offset;
6932       lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6933 
6934       unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE;
6935       if (write_datas[i].bytes() < 4)
6936          access |= ACCESS_MAY_STORE_SUBDWORD;
6937 
6938       if (ctx->options->gfx_level >= GFX7) {
6939          bool global = ctx->options->gfx_level >= GFX9;
6940          aco_opcode op;
6941          switch (write_datas[i].bytes()) {
6942          case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6943          case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6944          case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6945          case 8:
6946             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6947             break;
6948          case 12:
6949             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6950             break;
6951          case 16:
6952             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6953             break;
6954          default: unreachable("store_global not implemented for this size.");
6955          }
6956 
6957          aco_ptr<Instruction> flat{
6958             create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6959          if (write_address.regClass() == s2) {
6960             assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6961             flat->operands[0] = Operand(write_offset);
6962             flat->operands[1] = Operand(write_address);
6963          } else {
6964             assert(write_address.type() == RegType::vgpr && !write_offset.id());
6965             flat->operands[0] = Operand(write_address);
6966             flat->operands[1] = Operand(s1);
6967          }
6968          flat->operands[2] = Operand(write_datas[i]);
6969          flat->flatlike().cache = get_cache_flags(ctx, access);
6970          assert(global || !write_const_offset);
6971          flat->flatlike().offset = write_const_offset;
6972          flat->flatlike().disable_wqm = true;
6973          flat->flatlike().sync = sync;
6974          ctx->program->needs_exact = true;
6975          ctx->block->instructions.emplace_back(std::move(flat));
6976       } else {
6977          assert(ctx->options->gfx_level == GFX6);
6978 
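         /* GFX6 has no FLAT instructions; emulate the global store with a
          * MUBUF store using a raw resource and 64-bit addressing (addr64).
          */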
6979          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6980 
6981          Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6982 
6983          aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, 0)};
6984          mubuf->operands[0] = Operand(rsrc);
6985          mubuf->operands[1] =
6986             write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6987          mubuf->operands[2] = Operand(write_offset);
6988          mubuf->operands[3] = Operand(write_datas[i]);
6989          mubuf->mubuf().cache = get_cache_flags(ctx, access);
6990          mubuf->mubuf().offset = write_const_offset;
6991          mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr;
6992          mubuf->mubuf().disable_wqm = true;
6993          mubuf->mubuf().sync = sync;
6994          ctx->program->needs_exact = true;
6995          ctx->block->instructions.emplace_back(std::move(mubuf));
6996       }
6997    }
6998 }
6999 
7000 void
7001 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7002 {
7003    Builder bld(ctx->program, ctx->block);
7004    bool return_previous = !nir_def_is_unused(&instr->def);
7005    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7006 
7007    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
7008    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
7009 
7010    if (cmpswap)
7011       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
7012                         get_ssa_temp(ctx, instr->src[2].ssa), data);
7013 
7014    Temp dst = get_ssa_temp(ctx, &instr->def);
7015 
7016    aco_opcode op32, op64;
7017 
7018    Temp addr, offset;
7019    uint32_t const_offset;
7020    parse_global(ctx, instr, &addr, &const_offset, &offset);
7021    lower_global_address(bld, 0, &addr, &const_offset, &offset);
7022 
7023    if (ctx->options->gfx_level >= GFX7) {
7024       bool global = ctx->options->gfx_level >= GFX9;
7025       switch (nir_op) {
7026       case nir_atomic_op_iadd:
7027          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
7028          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
7029          break;
7030       case nir_atomic_op_imin:
7031          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
7032          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
7033          break;
7034       case nir_atomic_op_umin:
7035          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
7036          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
7037          break;
7038       case nir_atomic_op_imax:
7039          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
7040          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
7041          break;
7042       case nir_atomic_op_umax:
7043          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
7044          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
7045          break;
7046       case nir_atomic_op_iand:
7047          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
7048          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
7049          break;
7050       case nir_atomic_op_ior:
7051          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
7052          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
7053          break;
7054       case nir_atomic_op_ixor:
7055          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
7056          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
7057          break;
7058       case nir_atomic_op_xchg:
7059          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
7060          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
7061          break;
7062       case nir_atomic_op_cmpxchg:
7063          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
7064          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
7065          break;
7066       case nir_atomic_op_fadd:
7067          op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
7068          op64 = aco_opcode::num_opcodes;
7069          break;
7070       case nir_atomic_op_fmin:
7071          op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
7072          op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
7073          break;
7074       case nir_atomic_op_fmax:
7075          op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
7076          op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
7077          break;
7078       case nir_atomic_op_ordered_add_gfx12_amd:
7079          assert(ctx->options->gfx_level >= GFX12 && instr->def.bit_size == 64);
7080          op32 = aco_opcode::num_opcodes;
7081          op64 = aco_opcode::global_atomic_ordered_add_b64;
7082          break;
7083       default: unreachable("unsupported atomic operation");
7084       }
7085 
7086       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
7087       aco_ptr<Instruction> flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3,
7088                                                    return_previous ? 1 : 0)};
7089       if (addr.regClass() == s2) {
7090          assert(global && offset.id() && offset.type() == RegType::vgpr);
7091          flat->operands[0] = Operand(offset);
7092          flat->operands[1] = Operand(addr);
7093       } else {
7094          assert(addr.type() == RegType::vgpr && !offset.id());
7095          flat->operands[0] = Operand(addr);
7096          flat->operands[1] = Operand(s1);
7097       }
7098       flat->operands[2] = Operand(data);
7099       if (return_previous)
7100          flat->definitions[0] = Definition(dst);
7101       flat->flatlike().cache = get_atomic_cache_flags(ctx, return_previous);
7102       assert(global || !const_offset);
7103       flat->flatlike().offset = const_offset;
7104       flat->flatlike().disable_wqm = true;
7105       flat->flatlike().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
7106       ctx->program->needs_exact = true;
7107       ctx->block->instructions.emplace_back(std::move(flat));
7108    } else {
7109       assert(ctx->options->gfx_level == GFX6);
7110 
7111       UNUSED aco_opcode image_op;
7112       translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
7113 
7114       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
7115 
7116       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
7117 
7118       aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
7119       mubuf->operands[0] = Operand(rsrc);
7120       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
7121       mubuf->operands[2] = Operand(offset);
7122       mubuf->operands[3] = Operand(data);
7123       Definition def =
7124          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
7125       if (return_previous)
7126          mubuf->definitions[0] = def;
7127       mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
7128       mubuf->mubuf().offset = const_offset;
7129       mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
7130       mubuf->mubuf().disable_wqm = true;
7131       mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
7132       ctx->program->needs_exact = true;
7133       ctx->block->instructions.emplace_back(std::move(mubuf));
7134       if (return_previous && cmpswap)
7135          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
7136    }
7137 }
7138 
7139 unsigned
7140 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
7141 {
7142    unsigned storage = storage_none;
7143 
7144    if (mem_mode & nir_var_shader_out)
7145       storage |= storage_vmem_output;
7146    if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
7147       storage |= storage_buffer;
7148    if (mem_mode & nir_var_mem_task_payload)
7149       storage |= storage_task_payload;
7150    if (mem_mode & nir_var_mem_shared)
7151       storage |= storage_shared;
7152    if (mem_mode & nir_var_image)
7153       storage |= storage_image;
7154 
7155    return storage;
7156 }
7157 
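/* Lowers nir_intrinsic_load_buffer_amd / nir_intrinsic_load_typed_buffer_amd.
 * Sources are moved into the register classes the VMEM encoding expects,
 * then the load is emitted as MTBUF (typed, carrying a pipe_format) or
 * MUBUF (raw, optionally swizzled).
 */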
7158 void
7159 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7160 {
7161    Builder bld(ctx->program, ctx->block);
7162 
7163    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7164    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7165    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7166                 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
7167    bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
7168    bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7169 
7170    Temp dst = get_ssa_temp(ctx, &intrin->def);
7171    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7172    Temp v_offset =
7173       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7174    Temp s_offset =
7175       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7176    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
7177 
7178    ac_hw_cache_flags cache = get_cache_flags(ctx, nir_intrinsic_access(intrin) | ACCESS_TYPE_LOAD);
7179 
7180    unsigned const_offset = nir_intrinsic_base(intrin);
7181    unsigned elem_size_bytes = intrin->def.bit_size / 8u;
7182    unsigned num_components = intrin->def.num_components;
7183 
7184    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7185    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
7186 
7187    LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
7188    info.idx = idx;
7189    info.cache = cache;
7190    info.soffset = s_offset;
7191    info.const_offset = const_offset;
7192    info.sync = sync;
7193 
7194    if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
7195       const pipe_format format = nir_intrinsic_format(intrin);
7196       const struct ac_vtx_format_info* vtx_info =
7197          ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
7198       const struct util_format_description* f = util_format_description(format);
7199       const unsigned align_mul = nir_intrinsic_align_mul(intrin);
7200       const unsigned align_offset = nir_intrinsic_align_offset(intrin);
7201 
7202       /* Avoid splitting:
7203        * - non-array formats because that would result in incorrect code
7204        * - when the element size equals the component size (to reduce the instruction count)
7205        */
7206       const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
7207 
7208       info.align_mul = align_mul;
7209       info.align_offset = align_offset;
7210       info.format = format;
7211       info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
7212       info.split_by_component_stride = false;
7213 
7214       emit_load(ctx, bld, info, mtbuf_load_params);
7215    } else {
7216       assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
7217 
7218       if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
7219          assert(!swizzled);
7220 
7221          emit_load(ctx, bld, info, mubuf_load_format_params);
7222       } else {
7223          const unsigned swizzle_element_size =
7224             swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
7225 
7226          info.component_stride = swizzle_element_size;
7227          info.swizzle_component_size = swizzle_element_size ? 4 : 0;
7228          info.align_mul = MIN2(elem_size_bytes, 4);
7229          info.align_offset = 0;
7230 
7231          emit_load(ctx, bld, info, mubuf_load_params);
7232       }
7233    }
7234 }
7235 
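/* Lowers nir_intrinsic_store_buffer_amd: the source value is split into
 * chunks the hardware can store with split_buffer_store(), and one MUBUF
 * store is emitted per chunk. When both an index and a voffset are needed,
 * they are packed into a single two-dword vaddr operand.
 */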
7236 void
7237 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7238 {
7239    Builder bld(ctx->program, ctx->block);
7240 
7241    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7242    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7243    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7244                 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
7245    bool offen = !nir_src_is_const(intrin->src[2]) || nir_src_as_uint(intrin->src[2]);
7246 
7247    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7248    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
7249    Temp v_offset = offen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa)) : Temp();
7250    Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
7251    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
7252 
7253    unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7254    assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
7255           elem_size_bytes == 8);
7256 
7257    unsigned write_mask = nir_intrinsic_write_mask(intrin);
7258    write_mask = util_widen_mask(write_mask, elem_size_bytes);
7259 
7260    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7261    /* GS outputs are only written once. */
7262    const bool written_once =
7263       mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
7264    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
7265                          written_once ? semantic_can_reorder : semantic_none);
7266 
7267    unsigned write_count = 0;
7268    Temp write_datas[32];
7269    unsigned offsets[32];
7270    split_buffer_store(ctx, NULL, false, RegType::vgpr, store_src, write_mask,
7271                       swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
7272                       write_datas, offsets);
7273 
7274    for (unsigned i = 0; i < write_count; i++) {
7275       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7276       Temp write_voffset = v_offset;
7277       unsigned const_offset = resolve_excess_vmem_const_offset(
7278          bld, write_voffset, offsets[i] + nir_intrinsic_base(intrin));
7279 
7280       Operand vaddr_op(v1);
7281       if (offen && idxen)
7282          vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, write_voffset);
7283       else if (offen)
7284          vaddr_op = Operand(write_voffset);
7285       else if (idxen)
7286          vaddr_op = Operand(idx);
7287 
7288       unsigned access = nir_intrinsic_access(intrin);
7289       if (write_datas[i].bytes() < 4)
7290          access |= ACCESS_MAY_STORE_SUBDWORD;
7291       ac_hw_cache_flags cache = get_cache_flags(ctx, access | ACCESS_TYPE_STORE);
7292 
7293       Instruction* mubuf = bld.mubuf(op, Operand(descriptor), vaddr_op, s_offset,
7294                                      Operand(write_datas[i]), const_offset, offen, idxen,
7295                                      /* addr64 */ false, /* disable_wqm */ false, cache)
7296                               .instr;
7297       mubuf->mubuf().sync = sync;
7298    }
7299 }
7300 
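/* Scalar memory loads. A 32-bit base address is first widened to 64 bits,
 * then the widest s_load_dwordx{2,4,8,16} that covers the destination is
 * selected, with p_extract_vector trimming the result if the load was
 * over-sized.
 */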
7301 void
7302 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7303 {
7304    Builder bld(ctx->program, ctx->block);
7305    Temp dst = get_ssa_temp(ctx, &instr->def);
7306    Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7307    Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7308 
7309    /* If base address is 32bit, convert to 64bit with the high 32bit part. */
7310    if (base.bytes() == 4) {
7311       base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
7312                         Operand::c32(ctx->options->address32_hi));
7313    }
7314 
7315    aco_opcode opcode = aco_opcode::s_load_dword;
7316    unsigned size = 1;
7317 
7318    assert(dst.bytes() <= 64);
7319 
7320    if (dst.bytes() > 32) {
7321       opcode = aco_opcode::s_load_dwordx16;
7322       size = 16;
7323    } else if (dst.bytes() > 16) {
7324       opcode = aco_opcode::s_load_dwordx8;
7325       size = 8;
7326    } else if (dst.bytes() > 8) {
7327       opcode = aco_opcode::s_load_dwordx4;
7328       size = 4;
7329    } else if (dst.bytes() > 4) {
7330       opcode = aco_opcode::s_load_dwordx2;
7331       size = 2;
7332    }
7333 
7334    if (dst.size() != size) {
7335       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
7336                  bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
7337    } else {
7338       bld.smem(opcode, Definition(dst), base, offset);
7339    }
7340    emit_split_vector(ctx, dst, instr->def.num_components);
7341 }
7342 
7343 sync_scope
7344 translate_nir_scope(mesa_scope scope)
7345 {
7346    switch (scope) {
7347    case SCOPE_NONE:
7348    case SCOPE_INVOCATION: return scope_invocation;
7349    case SCOPE_SUBGROUP: return scope_subgroup;
7350    case SCOPE_WORKGROUP: return scope_workgroup;
7351    case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7352    case SCOPE_DEVICE: return scope_device;
7353    case SCOPE_SHADER_CALL: return scope_invocation;
7354    }
7355    unreachable("invalid scope");
7356 }
7357 
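/* Translates a NIR barrier intrinsic into a p_barrier pseudo instruction.
 * Storage classes that the current hardware stage cannot access are masked
 * out of the memory-sync info before the barrier is emitted.
 */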
7358 void
7359 emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7360 {
7361    Builder bld(ctx->program, ctx->block);
7362 
7363    unsigned storage_allowed = storage_buffer | storage_image;
7364    unsigned semantics = 0;
7365    sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7366    sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7367 
7368    /* We use shared storage for the following:
7369     * - compute shaders expose it in their API
7370     * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7371     * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7372     * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7373     */
7374    bool shared_storage_used =
7375       ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
7376       ctx->stage.hw == AC_HW_HULL_SHADER ||
7377       (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
7378       ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7379 
7380    if (shared_storage_used)
7381       storage_allowed |= storage_shared;
7382 
7383    /* Task payload: Task Shader output, Mesh Shader input */
7384    if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7385       storage_allowed |= storage_task_payload;
7386 
7387    /* Allow VMEM output for all stages that can have outputs. */
7388    if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
7389        ctx->stage.has(SWStage::TS))
7390       storage_allowed |= storage_vmem_output;
7391 
7392    /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7393     * They are allowed in CS, TCS, and in any NGG shader.
7394     */
7395    ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
7396                                            ctx->stage.hw == AC_HW_HULL_SHADER ||
7397                                            ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7398 
7399    unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7400    unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
7401    storage &= storage_allowed;
7402 
7403    unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7404    if (nir_semantics & NIR_MEMORY_ACQUIRE)
7405       semantics |= semantic_acquire | semantic_release;
7406    if (nir_semantics & NIR_MEMORY_RELEASE)
7407       semantics |= semantic_acquire | semantic_release;
7408 
7409    assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7410    assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7411 
7412    bld.barrier(aco_opcode::p_barrier,
7413                memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7414                exec_scope);
7415 }
7416 
7417 void
7418 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7419 {
7420    // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
7421    Temp dst = get_ssa_temp(ctx, &instr->def);
7422    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7423    Builder bld(ctx->program, ctx->block);
7424 
7425    unsigned elem_size_bytes = instr->def.bit_size / 8;
7426    unsigned num_components = instr->def.num_components;
7427    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7428    load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7429 }
7430 
7431 void
7432 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7433 {
7434    unsigned writemask = nir_intrinsic_write_mask(instr);
7435    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7436    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7437    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7438 
7439    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7440    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7441 }
7442 
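/* LDS atomics. The DS opcode is chosen from the NIR atomic op and operand
 * size; the *_rtn_* variants are only used when the result is read.
 * cmpxchg takes an extra compare operand, and its data/compare operands
 * are swapped on GFX11+ to match that generation's operand order.
 */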
7443 void
7444 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7445 {
7446    unsigned offset = nir_intrinsic_base(instr);
7447    Builder bld(ctx->program, ctx->block);
7448    Operand m = load_lds_size_m0(bld);
7449    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7450    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7451 
7452    unsigned num_operands = 3;
7453    aco_opcode op32, op64, op32_rtn, op64_rtn;
7454    switch (nir_intrinsic_atomic_op(instr)) {
7455    case nir_atomic_op_iadd:
7456       op32 = aco_opcode::ds_add_u32;
7457       op64 = aco_opcode::ds_add_u64;
7458       op32_rtn = aco_opcode::ds_add_rtn_u32;
7459       op64_rtn = aco_opcode::ds_add_rtn_u64;
7460       break;
7461    case nir_atomic_op_imin:
7462       op32 = aco_opcode::ds_min_i32;
7463       op64 = aco_opcode::ds_min_i64;
7464       op32_rtn = aco_opcode::ds_min_rtn_i32;
7465       op64_rtn = aco_opcode::ds_min_rtn_i64;
7466       break;
7467    case nir_atomic_op_umin:
7468       op32 = aco_opcode::ds_min_u32;
7469       op64 = aco_opcode::ds_min_u64;
7470       op32_rtn = aco_opcode::ds_min_rtn_u32;
7471       op64_rtn = aco_opcode::ds_min_rtn_u64;
7472       break;
7473    case nir_atomic_op_imax:
7474       op32 = aco_opcode::ds_max_i32;
7475       op64 = aco_opcode::ds_max_i64;
7476       op32_rtn = aco_opcode::ds_max_rtn_i32;
7477       op64_rtn = aco_opcode::ds_max_rtn_i64;
7478       break;
7479    case nir_atomic_op_umax:
7480       op32 = aco_opcode::ds_max_u32;
7481       op64 = aco_opcode::ds_max_u64;
7482       op32_rtn = aco_opcode::ds_max_rtn_u32;
7483       op64_rtn = aco_opcode::ds_max_rtn_u64;
7484       break;
7485    case nir_atomic_op_iand:
7486       op32 = aco_opcode::ds_and_b32;
7487       op64 = aco_opcode::ds_and_b64;
7488       op32_rtn = aco_opcode::ds_and_rtn_b32;
7489       op64_rtn = aco_opcode::ds_and_rtn_b64;
7490       break;
7491    case nir_atomic_op_ior:
7492       op32 = aco_opcode::ds_or_b32;
7493       op64 = aco_opcode::ds_or_b64;
7494       op32_rtn = aco_opcode::ds_or_rtn_b32;
7495       op64_rtn = aco_opcode::ds_or_rtn_b64;
7496       break;
7497    case nir_atomic_op_ixor:
7498       op32 = aco_opcode::ds_xor_b32;
7499       op64 = aco_opcode::ds_xor_b64;
7500       op32_rtn = aco_opcode::ds_xor_rtn_b32;
7501       op64_rtn = aco_opcode::ds_xor_rtn_b64;
7502       break;
7503    case nir_atomic_op_xchg:
7504       op32 = aco_opcode::ds_write_b32;
7505       op64 = aco_opcode::ds_write_b64;
7506       op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7507       op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7508       break;
7509    case nir_atomic_op_cmpxchg:
7510       op32 = aco_opcode::ds_cmpst_b32;
7511       op64 = aco_opcode::ds_cmpst_b64;
7512       op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7513       op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7514       num_operands = 4;
7515       break;
7516    case nir_atomic_op_fadd:
7517       op32 = aco_opcode::ds_add_f32;
7518       op32_rtn = aco_opcode::ds_add_rtn_f32;
7519       op64 = aco_opcode::num_opcodes;
7520       op64_rtn = aco_opcode::num_opcodes;
7521       break;
7522    case nir_atomic_op_fmin:
7523       op32 = aco_opcode::ds_min_f32;
7524       op32_rtn = aco_opcode::ds_min_rtn_f32;
7525       op64 = aco_opcode::ds_min_f64;
7526       op64_rtn = aco_opcode::ds_min_rtn_f64;
7527       break;
7528    case nir_atomic_op_fmax:
7529       op32 = aco_opcode::ds_max_f32;
7530       op32_rtn = aco_opcode::ds_max_rtn_f32;
7531       op64 = aco_opcode::ds_max_f64;
7532       op64_rtn = aco_opcode::ds_max_rtn_f64;
7533       break;
7534    default: unreachable("Unhandled shared atomic intrinsic");
7535    }
7536 
7537    bool return_previous = !nir_def_is_unused(&instr->def);
7538 
7539    aco_opcode op;
7540    if (data.size() == 1) {
7541       assert(instr->def.bit_size == 32);
7542       op = return_previous ? op32_rtn : op32;
7543    } else {
7544       assert(instr->def.bit_size == 64);
7545       op = return_previous ? op64_rtn : op64;
7546    }
7547 
7548    if (offset > 65535) {
7549       address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7550       offset = 0;
7551    }
7552 
7553    aco_ptr<Instruction> ds;
7554    ds.reset(create_instruction(op, Format::DS, num_operands, return_previous ? 1 : 0));
7555    ds->operands[0] = Operand(address);
7556    ds->operands[1] = Operand(data);
7557    if (num_operands == 4) {
7558       Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7559       ds->operands[2] = Operand(data2);
7560       if (bld.program->gfx_level >= GFX11)
7561          std::swap(ds->operands[1], ds->operands[2]);
7562    }
7563    ds->operands[num_operands - 1] = m;
7564    ds->ds().offset0 = offset;
7565    if (return_previous)
7566       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
7567    ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7568 
7569    if (m.isUndefined())
7570       ds->operands.pop_back();
7571 
7572    ctx->block->instructions.emplace_back(std::move(ds));
7573 }
7574 
7575 void
7576 visit_shared_append(isel_context* ctx, nir_intrinsic_instr* instr)
7577 {
7578    Builder bld(ctx->program, ctx->block);
7579    unsigned address = nir_intrinsic_base(instr);
7580    assert(address <= 65535 && (address % 4 == 0));
7581 
7582    aco_opcode op;
7583    switch (instr->intrinsic) {
7584    case nir_intrinsic_shared_append_amd: op = aco_opcode::ds_append; break;
7585    case nir_intrinsic_shared_consume_amd: op = aco_opcode::ds_consume; break;
7586    default: unreachable("not shared_append/consume");
7587    }
7588 
7589    Temp tmp = bld.tmp(v1);
7590    Instruction* ds;
7591    Operand m = load_lds_size_m0(bld);
7592    if (m.isUndefined())
7593       ds = bld.ds(op, Definition(tmp), address);
7594    else
7595       ds = bld.ds(op, Definition(tmp), m, address);
7596    ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7597 
7598    bld.pseudo(aco_opcode::p_as_uniform, Definition(get_ssa_temp(ctx, &instr->def)), tmp);
7599 }
7600 
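/* load_shared2_amd / store_shared2_amd: a single ds_read2/ds_write2
 * (or the st64 variants) accesses two LDS slots given by the two
 * immediate offsets. SGPR destinations are rebuilt from per-dword
 * readfirstlanes for better copy propagation.
 */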
7601 void
7602 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7603 {
7604    bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
7605    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7606    Builder bld(ctx->program, ctx->block);
7607 
7608    assert(bld.program->gfx_level >= GFX7);
7609 
7610    bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
7611    uint8_t offset0 = nir_intrinsic_offset0(instr);
7612    uint8_t offset1 = nir_intrinsic_offset1(instr);
7613    bool st64 = nir_intrinsic_st64(instr);
7614 
7615    Operand m = load_lds_size_m0(bld);
7616    Instruction* ds;
7617    if (is_store) {
7618       aco_opcode op = st64
7619                          ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
7620                          : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
7621       Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7622       RegClass comp_rc = is64bit ? v2 : v1;
7623       Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7624       Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7625       ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
7626    } else {
7627       Temp dst = get_ssa_temp(ctx, &instr->def);
7628       Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
7629       aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
7630                            : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
7631       ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
7632    }
7633    ds->ds().sync = memory_sync_info(storage_shared);
7634    if (m.isUndefined())
7635       ds->operands.pop_back();
7636 
7637    if (!is_store) {
7638       Temp dst = get_ssa_temp(ctx, &instr->def);
7639       if (dst.type() == RegType::sgpr) {
7640          emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7641          Temp comp[4];
7642          /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
7643          for (unsigned i = 0; i < dst.size(); i++)
7644             comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7645          if (is64bit) {
7646             Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
7647             Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
7648             ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7649             ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7650             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
7651             ctx->allocated_vec[dst.id()] = {comp0, comp1};
7652          } else {
7653             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
7654          }
7655       }
7656 
7657       emit_split_vector(ctx, dst, 2);
7658    }
7659 }
7660 
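/* Builds the buffer resource descriptor used for scratch access on the
 * MUBUF path: the base comes from the private segment buffer or, failing
 * that, the scratch_addr symbols, and the descriptor enables swizzled,
 * per-thread (add_tid) addressing.
 */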
7661 Temp
7662 get_scratch_resource(isel_context* ctx)
7663 {
7664    Builder bld(ctx->program, ctx->block);
7665    Temp scratch_addr = ctx->program->private_segment_buffer;
7666    if (!scratch_addr.bytes()) {
7667       Temp addr_lo =
7668          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
7669       Temp addr_hi =
7670          bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
7671       scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
7672    } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
7673       scratch_addr =
7674          bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7675    }
7676 
7677    struct ac_buffer_state ac_state = {0};
7678    uint32_t desc[4];
7679 
7680    ac_state.size = 0xffffffff;
7681    ac_state.format = PIPE_FORMAT_R32_FLOAT;
7682    for (int i = 0; i < 4; i++)
7683       ac_state.swizzle[i] = PIPE_SWIZZLE_0;
7684    /* older generations need element size = 4 bytes. element size removed in GFX9 */
7685    ac_state.element_size = ctx->program->gfx_level <= GFX8 ? 1u : 0u;
7686    ac_state.index_stride = ctx->program->wave_size == 64 ? 3u : 2u;
7687    ac_state.add_tid = true;
7688    ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
7689 
7690    ac_build_buffer_descriptor(ctx->program->gfx_level, &ac_state, desc);
7691 
7692    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(desc[2]),
7693                      Operand::c32(desc[3]));
7694 }
7695 
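/* Scratch loads. On GFX9+ the dedicated scratch (FLAT-family) encoding is
 * used, with constant offsets folded into the immediate as far as
 * scratch_global_offset_max allows; older parts go through MUBUF with the
 * descriptor from get_scratch_resource().
 */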
7696 void
7697 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7698 {
7699    Builder bld(ctx->program, ctx->block);
7700    Temp dst = get_ssa_temp(ctx, &instr->def);
7701 
7702    LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
7703    info.align_mul = nir_intrinsic_align_mul(instr);
7704    info.align_offset = nir_intrinsic_align_offset(instr);
7705    info.cache = get_cache_flags(ctx, ACCESS_TYPE_LOAD | ACCESS_IS_SWIZZLED_AMD);
7706    info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7707    info.sync = memory_sync_info(storage_scratch, semantic_private);
7708    if (ctx->program->gfx_level >= GFX9) {
7709       if (nir_src_is_const(instr->src[0])) {
7710          uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7711          info.offset =
7712             bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
7713          info.const_offset = nir_src_as_uint(instr->src[0]) % max;
7714       } else {
7715          info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7716       }
7717       EmitLoadParameters params = scratch_flat_load_params;
7718       params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7719       emit_load(ctx, bld, info, params);
7720    } else {
7721       info.resource = get_scratch_resource(ctx);
7722       info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7723       info.soffset = ctx->program->scratch_offset;
7724       emit_load(ctx, bld, info, scratch_mubuf_load_params);
7725    }
7726 }
7727 
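/* Scratch stores mirror visit_load_scratch: the data is split into
 * supported store sizes, then either scratch_store_* instructions (GFX9+)
 * or swizzled MUBUF stores (GFX8 and earlier) are emitted per chunk.
 */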
7728 void
7729 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7730 {
7731    Builder bld(ctx->program, ctx->block);
7732    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7733    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7734 
7735    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7736    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7737 
7738    unsigned write_count = 0;
7739    Temp write_datas[32];
7740    unsigned offsets[32];
7741    unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7742    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7743                       &write_count, write_datas, offsets);
7744 
7745    if (ctx->program->gfx_level >= GFX9) {
7746       uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7747       offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
7748       uint32_t base_const_offset =
7749          nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
7750 
7751       for (unsigned i = 0; i < write_count; i++) {
7752          aco_opcode op;
7753          switch (write_datas[i].bytes()) {
7754          case 1: op = aco_opcode::scratch_store_byte; break;
7755          case 2: op = aco_opcode::scratch_store_short; break;
7756          case 4: op = aco_opcode::scratch_store_dword; break;
7757          case 8: op = aco_opcode::scratch_store_dwordx2; break;
7758          case 12: op = aco_opcode::scratch_store_dwordx3; break;
7759          case 16: op = aco_opcode::scratch_store_dwordx4; break;
7760          default: unreachable("Unexpected store size");
7761          }
7762 
7763          uint32_t const_offset = base_const_offset + offsets[i];
7764          assert(const_offset < max || offset.id() == 0);
7765 
7766          Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
7767          Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
7768          if (offset.id() == 0)
7769             saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
7770 
7771          bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
7772                      memory_sync_info(storage_scratch, semantic_private));
7773       }
7774    } else {
7775       Temp rsrc = get_scratch_resource(ctx);
7776       offset = as_vgpr(ctx, offset);
7777       for (unsigned i = 0; i < write_count; i++) {
7778          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7779          Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7780                                         write_datas[i], offsets[i], true);
7781          mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7782          unsigned access = ACCESS_TYPE_STORE | ACCESS_IS_SWIZZLED_AMD |
7783                            (write_datas[i].bytes() < 4 ? ACCESS_MAY_STORE_SUBDWORD : 0);
7784          mubuf->mubuf().cache = get_cache_flags(ctx, access);
7785       }
7786    }
7787 }
7788 
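/* Maps a NIR reduction opcode plus a bit size onto the corresponding
 * ReduceOp value, e.g. nir_op_iadd with a 16-bit source becomes iadd16.
 */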
7789 ReduceOp
7790 get_reduce_op(nir_op op, unsigned bit_size)
7791 {
7792    switch (op) {
7793 #define CASEI(name)                                                                                \
7794    case nir_op_##name:                                                                             \
7795       return (bit_size == 32)   ? name##32                                                         \
7796              : (bit_size == 16) ? name##16                                                         \
7797              : (bit_size == 8)  ? name##8                                                          \
7798                                 : name##64;
7799 #define CASEF(name)                                                                                \
7800    case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7801       CASEI(iadd)
7802       CASEI(imul)
7803       CASEI(imin)
7804       CASEI(umin)
7805       CASEI(imax)
7806       CASEI(umax)
7807       CASEI(iand)
7808       CASEI(ior)
7809       CASEI(ixor)
7810       CASEF(fadd)
7811       CASEF(fmul)
7812       CASEF(fmin)
7813       CASEF(fmax)
7814    default: unreachable("unknown reduction op");
7815 #undef CASEI
7816 #undef CASEF
7817    }
7818 }
7819 
7820 void
7821 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7822 {
7823    Builder bld(ctx->program, ctx->block);
7824    Definition dst(get_ssa_temp(ctx, &instr->def));
7825    assert(dst.regClass().type() != RegType::vgpr);
7826    if (src.regClass().type() == RegType::vgpr)
7827       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7828    else
7829       bld.copy(dst, src);
7830 }
7831 
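/* For a subgroup-uniform source, an iadd/ixor/fadd reduction collapses to
 * src * count, where count is the number of participating invocations:
 * it is reduced mod 2 first for ixor and converted to float for fadd.
 */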
7832 void
7833 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7834 {
7835    Builder bld(ctx->program, ctx->block);
7836    Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7837 
7838    if (op == nir_op_fadd) {
7839       src_tmp = as_vgpr(ctx, src_tmp);
7840       Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
7841                                       : dst.getTemp();
7842 
7843       if (src.ssa->bit_size == 16) {
7844          count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7845          bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7846       } else {
7847          assert(src.ssa->bit_size == 32);
7848          count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7849          bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7850       }
7851 
7852       if (tmp != dst.getTemp())
7853          bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7854 
7855       return;
7856    }
7857 
7858    if (dst.regClass() == s1)
7859       src_tmp = bld.as_uniform(src_tmp);
7860 
7861    if (op == nir_op_ixor && count.type() == RegType::sgpr)
7862       count =
7863          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7864    else if (op == nir_op_ixor)
7865       count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7866 
7867    assert(dst.getTemp().type() == count.type());
7868 
7869    if (nir_src_is_const(src)) {
7870       uint32_t imm = nir_src_as_uint(src);
7871       if (imm == 1 && dst.bytes() <= 2)
7872          bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7873       else if (imm == 1)
7874          bld.copy(dst, count);
7875       else if (imm == 0)
7876          bld.copy(dst, Operand::zero(dst.bytes()));
7877       else if (count.type() == RegType::vgpr)
7878          bld.v_mul_imm(dst, count, imm, true, true);
7879       else if (imm == 0xffffffff)
7880          bld.sop2(aco_opcode::s_sub_i32, dst, bld.def(s1, scc), Operand::zero(), count);
7881       else if (util_is_power_of_two_or_zero(imm))
7882          bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), count,
7883                   Operand::c32(ffs(imm) - 1u));
7884       else
7885          bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7886    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
7887       bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7888    } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
7889       bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7890    } else if (dst.getTemp().type() == RegType::vgpr) {
7891       bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7892    } else {
7893       bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7894    }
7895 }
7896 
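/* Handles subgroup reductions of uniform values without the general
 * reduction pseudo. Returns false (so the caller falls back to the
 * generic path) for imul/fmul and for additions wider than 32 bits.
 */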
7897 bool
7898 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7899 {
7900    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7901    if (op == nir_op_imul || op == nir_op_fmul)
7902       return false;
7903 
7904    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7905       Builder bld(ctx->program, ctx->block);
7906       Definition dst(get_ssa_temp(ctx, &instr->def));
7907       unsigned bit_size = instr->src[0].ssa->bit_size;
7908       if (bit_size > 32)
7909          return false;
7910 
7911       Temp thread_count =
7912          bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7913       set_wqm(ctx);
7914 
7915       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7916    } else {
7917       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7918    }
7919 
7920    return true;
7921 }
7922 
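/* Uniform-source scans. Additions become src * prefix-invocation-count;
 * inclusive min/max/and/or scans of a uniform value are the value itself,
 * and the exclusive forms additionally write the reduction identity into
 * the first active lane.
 */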
7923 bool
7924 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7925 {
7926    Builder bld(ctx->program, ctx->block);
7927    Definition dst(get_ssa_temp(ctx, &instr->def));
7928    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7929    bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7930 
7931    if (op == nir_op_imul || op == nir_op_fmul)
7932       return false;
7933 
7934    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7935       if (instr->src[0].ssa->bit_size > 32)
7936          return false;
7937 
7938       Temp packed_tid;
7939       if (inc)
7940          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7941       else
7942          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7943       set_wqm(ctx);
7944 
7945       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7946       return true;
7947    }
7948 
7949    assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7950           op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7951 
7952    if (inc) {
7953       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7954       return true;
7955    }
7956 
7957    /* Copy the source and write the reduction operation identity to the first lane. */
7958    Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7959    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7960    ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7961    if (dst.bytes() == 8) {
7962       Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7963       bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7964       uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7965       uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7966 
7967       lo =
7968          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
7969       hi =
7970          bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
7971       bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7972    } else {
7973       uint32_t identity = get_reduction_identity(reduce_op, 0);
7974       bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
7975                     as_vgpr(ctx, src));
7976    }
7977 
7978    set_wqm(ctx);
7979    return true;
7980 }
7981 
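/* Emits a PSEUDO_REDUCTION instruction (p_reduce / p_*_scan) with all the
 * definitions the later lowering may need: a save/restore of exec, an
 * optional scalar identity temporary, and scc/vcc clobbers depending on
 * the op and gfx_level. The linear temporary operands start out undefined
 * and are filled in by setup_reduce_temp.
 */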
7982 Temp
7983 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7984                      Definition dst, Temp src)
7985 {
7986    assert(src.bytes() <= 8);
7987    assert(src.type() == RegType::vgpr);
7988 
7989    Builder bld(ctx->program, ctx->block);
7990 
7991    unsigned num_defs = 0;
7992    Definition defs[5];
7993    defs[num_defs++] = dst;
7994    defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7995 
7996    /* scalar identity temporary */
7997    bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
7998                      aco_op != aco_opcode::p_reduce;
7999    if (aco_op == aco_opcode::p_exclusive_scan) {
8000       need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
8001                      op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
8002                      op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
8003                      op == fmul64);
8004    }
8005    if (need_sitmp)
8006       defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
8007 
8008    /* scc clobber */
8009    defs[num_defs++] = bld.def(s1, scc);
8010 
8011    /* vcc clobber */
8012    bool clobber_vcc = false;
8013    if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
8014       clobber_vcc = true;
8015    if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
8016       clobber_vcc = true;
8017    if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
8018       clobber_vcc = true;
8019 
8020    if (clobber_vcc)
8021       defs[num_defs++] = bld.def(bld.lm, vcc);
8022 
8023    Instruction* reduce = create_instruction(aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
8024    reduce->operands[0] = Operand(src);
8025    /* setup_reduce_temp will update these undef operands if needed */
8026    reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
8027    reduce->operands[2] = Operand(v1.as_linear());
8028    std::copy(defs, defs + num_defs, reduce->definitions.begin());
8029 
8030    reduce->reduction().reduce_op = op;
8031    reduce->reduction().cluster_size = cluster_size;
8032    bld.insert(std::move(reduce));
8033 
8034    return dst.getTemp();
8035 }
8036 
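/* Derives an exclusive scan from an inclusive one by removing each lane's
 * own contribution: subtraction for iadd (with borrow propagation for the
 * 64-bit case) and xor for ixor.
 */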
8037 Temp
8038 inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Definition dst, Temp src)
8039 {
8040    Builder bld(ctx->program, ctx->block);
8041 
8042    Temp scan = emit_reduction_instr(ctx, aco_opcode::p_inclusive_scan, op, ctx->program->wave_size,
8043                                     bld.def(dst.regClass()), src);
8044 
8045    switch (op) {
8046    case iadd8:
8047    case iadd16:
8048    case iadd32: return bld.vsub32(dst, scan, src);
8049    case ixor64:
8050    case iadd64: {
8051       Temp src00 = bld.tmp(v1);
8052       Temp src01 = bld.tmp(v1);
8053       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
8054       Temp src10 = bld.tmp(v1);
8055       Temp src11 = bld.tmp(v1);
8056       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);
8057 
8058       Temp lower = bld.tmp(v1);
8059       Temp upper = bld.tmp(v1);
8060       if (op == iadd64) {
8061          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
8062          bld.vsub32(Definition(upper), src01, src11, false, borrow);
8063       } else {
8064          bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
8065          bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
8066       }
8067       return bld.pseudo(aco_opcode::p_create_vector, dst, lower, upper);
8068    }
8069    case ixor8:
8070    case ixor16:
8071    case ixor32: return bld.vop2(aco_opcode::v_xor_b32, dst, scan, src);
8072    default: unreachable("Unsupported op");
8073    }
8074 }
8075 
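/* Tries to lower a rotate-by-constant within a cluster to a single lane
 * shuffle (quad-perm DPP, DPP8, row-rotate, ds_swizzle or
 * v_permlane64_b32, depending on cluster size and gfx_level). Returns
 * false, leaving dst unset, if no single instruction fits.
 */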
8076 bool
8077 emit_rotate_by_constant(isel_context* ctx, Temp& dst, Temp src, unsigned cluster_size,
8078                         uint64_t delta)
8079 {
8080    Builder bld(ctx->program, ctx->block);
8081    RegClass rc = src.regClass();
8082    dst = Temp(0, rc);
8083    delta %= cluster_size;
8084 
8085    if (delta == 0) {
8086       dst = bld.copy(bld.def(rc), src);
8087    } else if (delta * 2 == cluster_size && cluster_size <= 32) {
8088       dst = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, delta), true);
8089    } else if (cluster_size == 4) {
8090       unsigned res[4];
8091       for (unsigned i = 0; i < 4; i++)
8092          res[i] = (i + delta) & 0x3;
8093       uint32_t dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
8094       if (ctx->program->gfx_level >= GFX8)
8095          dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_ctrl);
8096       else
8097          dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl);
8098    } else if (cluster_size == 8 && ctx->program->gfx_level >= GFX10) {
8099       uint32_t lane_sel = 0;
8100       for (unsigned i = 0; i < 8; i++)
8101          lane_sel |= ((i + delta) & 0x7) << (i * 3);
8102       dst = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(rc), src, lane_sel);
8103    } else if (cluster_size == 16 && ctx->program->gfx_level >= GFX8) {
8104       dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_row_rr(16 - delta));
8105    } else if (cluster_size <= 32 && ctx->program->gfx_level >= GFX9) {
8106       uint32_t ctrl = ds_pattern_rotate(delta, ~(cluster_size - 1) & 0x1f);
8107       dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, ctrl);
8108    } else if (cluster_size == 64) {
8109       bool has_wf_dpp = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX10;
8110       if (delta == 32 && ctx->program->gfx_level >= GFX11) {
8111          dst = bld.vop1(aco_opcode::v_permlane64_b32, bld.def(rc), src);
8112       } else if (delta == 1 && has_wf_dpp) {
8113          dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rl1);
8114       } else if (delta == 63 && has_wf_dpp) {
8115          dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rr1);
8116       }
8117    }
8118 
8119    return dst.id() != 0;
8120 }
8121 
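/* Interpolation at an arbitrary offset: the per-quad derivatives of the
 * barycentrics are built with quad-permute DPP (ds_swizzle before GFX8)
 * and the result is evaluated as res_k = p_k + ddx_k * pos1 + ddy_k * pos2
 * using mad/fma. WQM is required so helper lanes hold valid values.
 */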
8122 void
8123 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
8124 {
8125    Builder bld(ctx->program, ctx->block);
8126    Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
8127    Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
8128 
8129    Temp ddx_1, ddx_2, ddy_1, ddy_2;
8130    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
8131    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
8132    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
8133 
8134    /* Build DD X/Y */
8135    if (ctx->program->gfx_level >= GFX8) {
8136       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
8137       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
8138       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
8139       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
8140       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
8141       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
8142    } else {
8143       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
8144       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
8145       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
8146       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
8147       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1);
8148 
8149       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
8150       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
8151       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2);
8152       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
8153       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
8154    }
8155 
8156    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
8157    aco_opcode mad =
8158       ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
8159    Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
8160    Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
8161    tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
8162    tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
8163    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp1, tmp2);
8164    set_wqm(ctx, true);
8165    return;
8166 }
8167 
8168 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
8169 Temp lanecount_to_mask(isel_context* ctx, Temp count);
8170 void pops_await_overlapped_waves(isel_context* ctx);
8171 
8172 Temp
8173 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, enum glsl_interp_mode interp)
8174 {
8175    bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
8176    if (intrin == nir_intrinsic_load_barycentric_pixel ||
8177        intrin == nir_intrinsic_load_barycentric_at_offset) {
8178       return get_arg(ctx, linear ? ctx->args->linear_center : ctx->args->persp_center);
8179    } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
8180       return get_arg(ctx, linear ? ctx->args->linear_centroid : ctx->args->persp_centroid);
8181    } else {
8182       assert(intrin == nir_intrinsic_load_barycentric_sample);
8183       return get_arg(ctx, linear ? ctx->args->linear_sample : ctx->args->persp_sample);
8184    }
8185 }
8186 
8187 void
8188 ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
8189                          unsigned wave_done, unsigned* offset0, unsigned* offset1)
8190 {
8191    unsigned ordered_count_index = index_operand & 0x3f;
8192    unsigned count_dword = (index_operand >> 24) & 0xf;
8193 
8194    assert(ctx->options->gfx_level >= GFX10);
8195    assert(count_dword >= 1 && count_dword <= 4);
8196 
8197    *offset0 = ordered_count_index << 2;
8198    *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);
8199 
8200    if (ctx->options->gfx_level < GFX11)
8201       *offset1 |= 3 /* GS shader type */ << 2;
8202 }
8203 
8204 struct aco_export_mrt {
8205    Operand out[4];
8206    unsigned enabled_channels;
8207    unsigned target;
8208    bool compr;
8209 };
8210 
8211 static void
8212 create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
8213                                 const struct aco_export_mrt* mrt1)
8214 {
8215    Builder bld(ctx->program, ctx->block);
8216 
8217    aco_ptr<Instruction> exp{
8218       create_instruction(aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
8219    for (unsigned i = 0; i < 4; i++) {
8220       exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
8221       exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
8222    }
8223 
8224    RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
8225    exp->definitions[0] = bld.def(type); /* mrt0 */
8226    exp->definitions[1] = bld.def(type); /* mrt1 */
8227    exp->definitions[2] = bld.def(bld.lm);
8228    exp->definitions[3] = bld.def(bld.lm);
8229    exp->definitions[4] = bld.def(bld.lm, vcc);
8230    exp->definitions[5] = bld.def(s1, scc);
8231    ctx->block->instructions.emplace_back(std::move(exp));
8232 
8233    ctx->program->has_color_exports = true;
8234 }
8235 
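/* Cooperative-matrix multiply-add via WMMA: 16-bit inputs select the f32
 * or f16 accumulator variant, 8-bit inputs select the iu8 variant with the
 * signedness of A/B encoded in neg_lo and saturation in clamp.
 */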
8236 static void
8237 visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
8238 {
8239    aco_opcode opcode = aco_opcode::num_opcodes;
8240    unsigned signed_mask = 0;
8241    bool clamp = false;
8242 
8243    switch (instr->src[0].ssa->bit_size) {
8244    case 16:
8245       switch (instr->def.bit_size) {
8246       case 32: opcode = aco_opcode::v_wmma_f32_16x16x16_f16; break;
8247       case 16: opcode = aco_opcode::v_wmma_f16_16x16x16_f16; break;
8248       }
8249       break;
8250    case 8:
8251       opcode = aco_opcode::v_wmma_i32_16x16x16_iu8;
8252       signed_mask = nir_intrinsic_cmat_signed_mask(instr);
8253       clamp = nir_intrinsic_saturate(instr);
8254       break;
8255    }
8256 
8257    if (opcode == aco_opcode::num_opcodes)
8258       unreachable("visit_cmat_muladd: invalid bit size combination");
8259 
8260    Builder bld(ctx->program, ctx->block);
8261 
8262    Temp dst = get_ssa_temp(ctx, &instr->def);
8263    Operand A(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
8264    Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
8265    Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
8266 
8267    VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
8268    vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
8269    vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
8270    vop3p.clamp = clamp;
8271 
8272    emit_split_vector(ctx, dst, instr->def.num_components);
8273 }
8274 
8275 void
8276 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
8277 {
8278    Builder bld(ctx->program, ctx->block);
8279    switch (instr->intrinsic) {
8280    case nir_intrinsic_load_barycentric_sample:
8281    case nir_intrinsic_load_barycentric_pixel:
8282    case nir_intrinsic_load_barycentric_centroid: {
8283       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
8284       Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
8285       assert(bary.size() == 2);
8286       Temp dst = get_ssa_temp(ctx, &instr->def);
8287       bld.copy(Definition(dst), bary);
8288       emit_split_vector(ctx, dst, 2);
8289       break;
8290    }
8291    case nir_intrinsic_load_barycentric_model: {
8292       Temp model = get_arg(ctx, ctx->args->pull_model);
8293       assert(model.size() == 3);
8294       Temp dst = get_ssa_temp(ctx, &instr->def);
8295       bld.copy(Definition(dst), model);
8296       emit_split_vector(ctx, dst, 3);
8297       break;
8298    }
8299    case nir_intrinsic_load_barycentric_at_offset: {
8300       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8301       RegClass rc = RegClass(offset.type(), 1);
8302       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8303       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8304       Temp bary = get_interp_param(ctx, instr->intrinsic,
8305                                    (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8306       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->def), bary, pos1, pos2);
8307       break;
8308    }
8309    case nir_intrinsic_load_front_face: {
8310       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8311                Operand::zero(), get_arg(ctx, ctx->args->front_face));
8312       break;
8313    }
8314    case nir_intrinsic_load_view_index: {
8315       Temp dst = get_ssa_temp(ctx, &instr->def);
8316       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->view_index)));
8317       break;
8318    }
8319    case nir_intrinsic_load_frag_coord: {
8320       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->def), 4);
8321       break;
8322    }
8323    case nir_intrinsic_load_frag_shading_rate:
8324       emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->def));
8325       break;
8326    case nir_intrinsic_load_sample_pos: {
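      /* With per-sample shading, frag_pos holds the absolute sample location, so
       * the sample position within the pixel is just its fractional part. */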
8327       Temp posx = get_arg(ctx, ctx->args->frag_pos[0]);
8328       Temp posy = get_arg(ctx, ctx->args->frag_pos[1]);
8329       bld.pseudo(
8330          aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->def)),
8331          posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8332          posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8333       break;
8334    }
8335    case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8336    case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8337    case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8338    case nir_intrinsic_load_input:
8339    case nir_intrinsic_load_per_primitive_input:
8340    case nir_intrinsic_load_input_vertex:
8341       if (ctx->program->stage == fragment_fs)
8342          visit_load_fs_input(ctx, instr);
8343       else
8344          isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
8345       break;
8346    case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8347    case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8348    case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8349    case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8350    case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8351    case nir_intrinsic_shared_atomic:
8352    case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
8353    case nir_intrinsic_shared_append_amd:
8354    case nir_intrinsic_shared_consume_amd: visit_shared_append(ctx, instr); break;
8355    case nir_intrinsic_load_shared2_amd:
8356    case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
8357    case nir_intrinsic_bindless_image_load:
8358    case nir_intrinsic_bindless_image_fragment_mask_load_amd:
8359    case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
8360    case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
8361    case nir_intrinsic_bindless_image_atomic:
8362    case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
8363    case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8364    case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8365    case nir_intrinsic_load_typed_buffer_amd:
8366    case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8367    case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8368    case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
8369    case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
8370    case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
8371    case nir_intrinsic_global_atomic_amd:
8372    case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
8373    case nir_intrinsic_ssbo_atomic:
8374    case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
8375    case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8376    case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8377    case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
8378    case nir_intrinsic_load_num_workgroups: {
8379       Temp dst = get_ssa_temp(ctx, &instr->def);
8380       if (ctx->options->load_grid_size_from_user_sgpr) {
8381          bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
8382       } else {
8383          Temp addr = get_arg(ctx, ctx->args->num_work_groups);
8384          assert(addr.regClass() == s2);
8385          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8386                     bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
8387                     bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
8388       }
8389       emit_split_vector(ctx, dst, 3);
8390       break;
8391    }
8392    case nir_intrinsic_load_local_invocation_id: {
8393       Temp dst = get_ssa_temp(ctx, &instr->def);
8394       if (ctx->options->gfx_level >= GFX11) {
8395          Temp local_ids[3];
8396 
8397          /* Thread IDs are packed in VGPR0, 10 bits per component. */
8398          for (uint32_t i = 0; i < 3; i++) {
8399             if (i == 0 && ctx->shader->info.workgroup_size[1] == 1 &&
8400                 ctx->shader->info.workgroup_size[2] == 1 &&
8401                 !ctx->shader->info.workgroup_size_variable) {
8402                local_ids[i] = get_arg(ctx, ctx->args->local_invocation_ids);
8403             } else if (i == 2 || (i == 1 && ctx->shader->info.workgroup_size[2] == 1 &&
8404                                   !ctx->shader->info.workgroup_size_variable)) {
8405                local_ids[i] =
8406                   bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(i * 10u),
8407                            get_arg(ctx, ctx->args->local_invocation_ids));
8408             } else {
8409                local_ids[i] = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
8410                                        get_arg(ctx, ctx->args->local_invocation_ids),
8411                                        Operand::c32(i * 10u), Operand::c32(10u));
8412             }
8413          }
8414 
8415          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), local_ids[0], local_ids[1],
8416                     local_ids[2]);
8417       } else {
8418          bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->local_invocation_ids)));
8419       }
8420       emit_split_vector(ctx, dst, 3);
8421       break;
8422    }
8423    case nir_intrinsic_load_workgroup_id: {
8424       Temp dst = get_ssa_temp(ctx, &instr->def);
8425       if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
8426          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), ctx->workgroup_id[0],
8427                     ctx->workgroup_id[1], ctx->workgroup_id[2]);
8428          emit_split_vector(ctx, dst, 3);
8429       } else {
8430          isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
8431       }
8432       break;
8433    }
8434    case nir_intrinsic_load_subgroup_id: {
8435       assert(ctx->options->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER);
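      /* s_bfe_u32 packs the field offset in the low bits of src1 and the field
       * width in bits [22:16]: 25 | (5 << 16) extracts ttmp8 bits [29:25], which
       * hold the wave ID within the workgroup on GFX12. */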
8436       bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8437                ctx->ttmp8, Operand::c32(25 | (5 << 16)));
8438       break;
8439    }
8440    case nir_intrinsic_load_local_invocation_index: {
8441       if (ctx->stage.hw == AC_HW_LOCAL_SHADER || ctx->stage.hw == AC_HW_HULL_SHADER) {
8442          if (ctx->options->gfx_level >= GFX11) {
8443             /* On GFX11, RelAutoIndex is WaveID * WaveSize + ThreadID. */
8444             Temp wave_id =
8445                bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8446                         get_arg(ctx, ctx->args->tcs_wave_id), Operand::c32(0u | (3u << 16)));
8447 
8448             Temp temp = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), wave_id,
8449                                  Operand::c32(ctx->program->wave_size));
8450             emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def), Operand(), Operand(temp));
8451          } else {
8452             bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8453                      get_arg(ctx, ctx->args->vs_rel_patch_id));
8454          }
8455          break;
8456       } else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER ||
8457                  ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
8458          bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), thread_id_in_threadgroup(ctx));
8459          break;
8460       } else if (ctx->program->workgroup_size <= ctx->program->wave_size) {
8461          emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8462          break;
8463       }
8464 
8465       Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8466 
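      /* GFX12: ttmp8 bits [29:25] hold the wave ID (same field as load_subgroup_id
       * above); the invocation index is (wave_id << log2(wave_size)) | lane_id. */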
8467       if (ctx->options->gfx_level >= GFX12) {
8468          Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx->ttmp8,
8469                                 Operand::c32(25 | (5 << 16)));
8470          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
8471                   Operand::c32(ctx->program->wave_size == 64 ? 6 : 5), id);
8472          break;
8473       }
8474 
8475    /* The tg_size bits [6:11] contain the subgroup id. We need this multiplied
8476     * by the wave size, then OR the thread id into it.
8477     */
8478       if (ctx->program->wave_size == 64) {
8479       /* After the s_and, the bits are already multiplied by 64 (left-shifted by 6),
8480        * so we can feed that straight to v_or. */
8481          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8482                                 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->tg_size));
8483          bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num, id);
8484       } else {
8485          /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8486          Temp tg_num =
8487             bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8488                      get_arg(ctx, ctx->args->tg_size), Operand::c32(0x6u | (0x6u << 16)));
8489          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
8490                   Operand::c32(0x5u), id);
8491       }
8492       break;
8493    }
8494    case nir_intrinsic_ddx:
8495    case nir_intrinsic_ddy:
8496    case nir_intrinsic_ddx_fine:
8497    case nir_intrinsic_ddy_fine:
8498    case nir_intrinsic_ddx_coarse:
8499    case nir_intrinsic_ddy_coarse: {
8500       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8501       Temp dst = get_ssa_temp(ctx, &instr->def);
8502 
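      /* If every use of the derivative is wrapped in fabs(), the per-lane sign of
       * the result is irrelevant. This permits the cheaper swizzle pairs below,
       * where each lane computes +/-(a - b) with the correct magnitude. */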
8503       bool only_used_by_abs = true;
8504       nir_foreach_use (use, &instr->def) {
8505          nir_instr* use_instr = nir_src_parent_instr(use);
8506 
8507          if (use_instr->type != nir_instr_type_alu ||
8508              nir_instr_as_alu(use_instr)->op != nir_op_fabs)
8509             only_used_by_abs = false;
8510       }
8511 
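      /* dpp_ctrl1 selects, per lane, the quad lane whose value is subtracted;
       * dpp_ctrl2 then selects whose difference each lane receives. E.g. coarse
       * ddx gives every lane of the quad src[1] - src[0]. */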
8512       uint16_t dpp_ctrl1, dpp_ctrl2;
8513       if (instr->intrinsic == nir_intrinsic_ddx_fine) {
8514          if (only_used_by_abs) {
8515             dpp_ctrl1 = dpp_quad_perm(1, 0, 3, 2);
8516             dpp_ctrl2 = dpp_quad_perm(0, 1, 2, 3);
8517          } else {
8518             dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
8519             dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
8520          }
8521       } else if (instr->intrinsic == nir_intrinsic_ddy_fine) {
8522          if (only_used_by_abs) {
8523             dpp_ctrl1 = dpp_quad_perm(2, 3, 0, 1);
8524             dpp_ctrl2 = dpp_quad_perm(0, 1, 2, 3);
8525          } else {
8526             dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
8527             dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
8528          }
8529       } else {
8530          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
8531          if (instr->intrinsic == nir_intrinsic_ddx ||
8532              instr->intrinsic == nir_intrinsic_ddx_coarse)
8533             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
8534          else
8535             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
8536       }
8537 
8538       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
8539          assert(instr->def.num_components == 2);
8540 
8541          /* Identity swizzle via opsel: low result from low halves, high from high. */
8542          unsigned opsel_lo = 0b00;
8543          unsigned opsel_hi = 0b11;
8544 
8545          Temp tl = src;
8546          if (nir_src_is_divergent(instr->src[0]))
8547             tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
8548 
8549          Builder::Result sub =
8550             bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), src, tl, opsel_lo, opsel_hi);
8551          sub->valu().neg_lo[1] = true;
8552          sub->valu().neg_hi[1] = true;
8553 
8554          if (nir_src_is_divergent(instr->src[0]) && dpp_ctrl2 != dpp_quad_perm(0, 1, 2, 3))
8555             bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), sub, dpp_ctrl2);
8556          else
8557             bld.copy(Definition(dst), sub);
8558          emit_split_vector(ctx, dst, 2);
8559       } else {
8560          aco_opcode subrev =
8561             instr->def.bit_size == 16 ? aco_opcode::v_subrev_f16 : aco_opcode::v_subrev_f32;
8562          bool use_interp = dpp_ctrl1 == dpp_quad_perm(0, 0, 0, 0) && instr->def.bit_size == 32 &&
8563                            ctx->program->gfx_level >= GFX11_5;
8564          if (!nir_src_is_divergent(instr->src[0])) {
8565             bld.vop2(subrev, Definition(dst), src, src);
8566          } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(1, 1, 1, 1)) {
8567             bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), src,
8568                               Operand::c32(0x3f800000), src)
8569                ->valu()
8570                .neg[2] = true;
8571          } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(2, 2, 2, 2)) {
8572             Builder::Result tmp = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1),
8573                                                     Operand::c32(0), Operand::c32(0), src);
8574             tmp->valu().neg = 0x6;
8575             bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), src,
8576                               Operand::c32(0x3f800000), tmp);
8577          } else if (ctx->program->gfx_level >= GFX8 && dpp_ctrl2 == dpp_quad_perm(0, 1, 2, 3)) {
8578             bld.vop2_dpp(subrev, Definition(dst), src, src, dpp_ctrl1);
8579          } else if (ctx->program->gfx_level >= GFX8) {
8580             Temp tmp = bld.vop2_dpp(subrev, bld.def(v1), src, src, dpp_ctrl1);
8581             bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), tmp, dpp_ctrl2);
8582          } else {
8583             Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
8584             Temp tr = src;
8585             if (dpp_ctrl2 != dpp_quad_perm(0, 1, 2, 3))
8586                tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
8587             bld.vop2(subrev, Definition(dst), tl, tr);
8588          }
8589       }
8590       set_wqm(ctx, true);
8591       break;
8592    }
8593 
8594    case nir_intrinsic_load_subgroup_invocation: {
8595       emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8596       break;
8597    }
8598    case nir_intrinsic_ballot_relaxed:
8599    case nir_intrinsic_ballot: {
8600       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8601       Temp dst = get_ssa_temp(ctx, &instr->def);
8602 
8603       if (instr->src[0].ssa->bit_size == 1) {
8604          assert(src.regClass() == bld.lm);
8605       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8606          src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8607       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8608          src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8609       } else {
8610          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8611       }
8612 
8613       /* Make sure that all inactive lanes return zero.
8614        * Value-numbering might remove the comparison above. */
8615       Definition def = dst.size() == bld.lm.size() ? Definition(dst) : bld.def(bld.lm);
8616       if (instr->intrinsic == nir_intrinsic_ballot_relaxed)
8617          src = bld.copy(def, src);
8618       else
8619          src = bld.sop2(Builder::s_and, def, bld.def(s1, scc), src, Operand(exec, bld.lm));
8620       if (dst.size() != bld.lm.size()) {
8621          /* Wave32 with ballot size set to 64 */
8622          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8623       }
8624 
8625       set_wqm(ctx);
8626       break;
8627    }
8628    case nir_intrinsic_inverse_ballot: {
8629       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8630       Temp dst = get_ssa_temp(ctx, &instr->def);
8631 
8632       assert(dst.size() == bld.lm.size());
8633       if (src.size() > dst.size()) {
8634          emit_extract_vector(ctx, src, 0, dst);
8635       } else if (src.size() < dst.size()) {
8636          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8637       } else {
8638          bld.copy(Definition(dst), src);
8639       }
8640       break;
8641    }
8642    case nir_intrinsic_shuffle:
8643    case nir_intrinsic_read_invocation: {
8644       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8645       assert(instr->def.bit_size != 1);
8646       if (!nir_src_is_divergent(instr->src[0])) {
8647          emit_uniform_subgroup(ctx, instr, src);
8648       } else {
8649          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8650          if (instr->intrinsic == nir_intrinsic_read_invocation ||
8651              !nir_src_is_divergent(instr->src[1]))
8652             tid = bld.as_uniform(tid);
8653          Temp dst = get_ssa_temp(ctx, &instr->def);
8654 
8655          src = as_vgpr(ctx, src);
8656 
8657          if (src.regClass() == v1b || src.regClass() == v2b) {
8658             Temp tmp = emit_bpermute(ctx, bld, tid, src);
8660             if (dst.type() == RegType::vgpr)
8661                bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8662                           bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8663             else
8664                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8665          } else if (src.regClass() == v1) {
8666             Temp tmp = emit_bpermute(ctx, bld, tid, src);
8667             bld.copy(Definition(dst), tmp);
8668          } else if (src.regClass() == v2) {
8669             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8670             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8671             lo = emit_bpermute(ctx, bld, tid, lo);
8672             hi = emit_bpermute(ctx, bld, tid, hi);
8673             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8674             emit_split_vector(ctx, dst, 2);
8675          } else {
8676             isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8677          }
8678          set_wqm(ctx);
8679       }
8680       break;
8681    }
8682    case nir_intrinsic_rotate: {
8683       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8684       Temp delta = get_ssa_temp(ctx, instr->src[1].ssa);
8685       Temp dst = get_ssa_temp(ctx, &instr->def);
8686       assert(instr->def.bit_size > 1 && instr->def.bit_size <= 32);
8687 
8688       if (!nir_src_is_divergent(instr->src[0])) {
8689          emit_uniform_subgroup(ctx, instr, src);
8690          break;
8691       }
8692 
8693       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
8694       cluster_size = util_next_power_of_two(
8695          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8696 
8697       if (cluster_size == 1) {
8698          bld.copy(Definition(dst), src);
8699          break;
8700       }
8701 
8702       delta = bld.as_uniform(delta);
8703       src = as_vgpr(ctx, src);
8704 
8705       Temp tmp;
8706       if (nir_src_is_const(instr->src[1]) &&
8707           emit_rotate_by_constant(ctx, tmp, src, cluster_size, nir_src_as_uint(instr->src[1]))) {
8708       } else if (cluster_size == 2) {
8709          Temp noswap =
8710             bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), delta, Operand::c32(0));
8711          noswap = bool_to_vector_condition(ctx, noswap);
8712          Temp swapped = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, 0x1), true);
8713          tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(src.regClass()), swapped, src, noswap);
8714       } else if (ctx->program->gfx_level >= GFX10 && cluster_size <= 16) {
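         /* v_permlane16_b32 reads its lane selects from two SGPRs, 4 bits per lane
          * of a 16-lane row. Rotating the identity pattern 0x76543210 by 4 * delta
          * bits yields the select table for the rotation; smaller clusters then
          * patch the table so lanes stay within their own cluster. */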
8715          if (cluster_size == 4) /* shift mask already does this for 8/16. */
8716             delta = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), delta,
8717                              Operand::c32(0x3));
8718          delta =
8719             bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), delta, Operand::c32(2));
8720 
8721          Temp lo = bld.copy(bld.def(s1), Operand::c32(cluster_size == 4 ? 0x32103210 : 0x76543210));
8722          Temp hi;
8723 
8724          if (cluster_size <= 8) {
8725             Temp shr = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8726             if (cluster_size == 4) {
8727                Temp lotolohi = bld.copy(bld.def(s1), Operand::c32(0x4444));
8728                Temp lohi =
8729                   bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, lotolohi);
8730                lo = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), shr, lohi);
8731             } else {
8732                delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
8733                                 Operand::c32(32), delta);
8734                Temp shl =
8735                   bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8736                lo = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, shl);
8737             }
8738             Temp lotohi = bld.copy(bld.def(s1), Operand::c32(0x88888888));
8739             hi = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), lo, lotohi);
8740          } else {
8741             hi = bld.copy(bld.def(s1), Operand::c32(0xfedcba98));
8742 
8743             Temp lohi = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
8744 
8745             Temp shr = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8746             delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(64),
8747                              delta);
8748             Temp shl = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8749 
8750             lohi = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), shr, shl);
8751             lo = bld.tmp(s1);
8752             hi = bld.tmp(s1);
8753             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), lohi);
8754          }
8755 
8756          Builder::Result ret =
8757             bld.vop3(aco_opcode::v_permlane16_b32, bld.def(src.regClass()), src, lo, hi);
8758          ret->valu().opsel[0] = true; /* set FETCH_INACTIVE */
8759          ret->valu().opsel[1] = true; /* set BOUND_CTRL */
8760          tmp = ret;
8761       } else {
8762          /* Fall back to ds_bpermute if we can't find a specialized instruction. */
8763          Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
8764          Temp src_lane = bld.vadd32(bld.def(v1), tid, delta);
8765 
8766          if (ctx->program->gfx_level >= GFX10 && cluster_size == 32) {
8767             /* ds_bpermute is restricted to 32 lanes on GFX10+. */
8768             Temp index_x4 =
8769                bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), src_lane);
8770             tmp = bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, src);
8771          } else {
8772             /* Technically, a full-wave rotate doesn't need this, but omitting it breaks the pseudo ops. */
8773             src_lane = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), Operand::c32(cluster_size - 1),
8774                                 src_lane, tid);
8775             tmp = emit_bpermute(ctx, bld, src_lane, src);
8776          }
8777       }
8778 
8779       tmp = emit_extract_vector(ctx, tmp, 0, dst.regClass());
8780       bld.copy(Definition(dst), tmp);
8781       set_wqm(ctx);
8782       break;
8783    }
8784    case nir_intrinsic_load_sample_id: {
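      /* The sample ID is stored in bits [11:8] of the ancillary VGPR. */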
8785       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8786                get_arg(ctx, ctx->args->ancillary), Operand::c32(8u), Operand::c32(4u));
8787       break;
8788    }
8789    case nir_intrinsic_read_first_invocation: {
8790       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8791       Temp dst = get_ssa_temp(ctx, &instr->def);
8792       if (instr->def.bit_size == 1) {
8793          assert(src.regClass() == bld.lm);
8794          Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8795                              bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8796          bool_to_vector_condition(ctx, tmp, dst);
8797       } else {
8798          emit_readfirstlane(ctx, src, dst);
8799       }
8800       set_wqm(ctx);
8801       break;
8802    }
8803    case nir_intrinsic_as_uniform: {
8804       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8805       Temp dst = get_ssa_temp(ctx, &instr->def);
8806       if (src.type() == RegType::vgpr)
8807          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
8808       else
8809          bld.copy(Definition(dst), src);
8810       break;
8811    }
8812    case nir_intrinsic_vote_all: {
8813       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8814       Temp dst = get_ssa_temp(ctx, &instr->def);
8815       assert(src.regClass() == bld.lm);
8816       assert(dst.regClass() == bld.lm);
8817 
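      /* vote_all(x) == !any(!x): build the mask of active lanes where src is
       * false, turn its (non-)emptiness into a boolean, and invert it. */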
8818       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8819       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8820                .def(1)
8821                .getTemp();
8822       Temp cond = bool_to_vector_condition(ctx, tmp);
8823       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8824       set_wqm(ctx);
8825       break;
8826    }
8827    case nir_intrinsic_vote_any: {
8828       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8829       Temp dst = get_ssa_temp(ctx, &instr->def);
8830       assert(src.regClass() == bld.lm);
8831       assert(dst.regClass() == bld.lm);
8832 
8833       Temp tmp = bool_to_scalar_condition(ctx, src);
8834       bool_to_vector_condition(ctx, tmp, dst);
8835       set_wqm(ctx);
8836       break;
8837    }
8838    case nir_intrinsic_quad_vote_any: {
8839       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8840       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8841       bld.sop1(Builder::s_wqm, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8842       set_wqm(ctx);
8843       break;
8844    }
8845    case nir_intrinsic_quad_vote_all: {
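      /* Implemented as !quad_vote_any(!src), by De Morgan. */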
8846       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8847       src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8848       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8849       src = bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), src);
8850       bld.sop1(Builder::s_not, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8851       set_wqm(ctx);
8852       break;
8853    }
8854    case nir_intrinsic_reduce:
8855    case nir_intrinsic_inclusive_scan:
8856    case nir_intrinsic_exclusive_scan: {
8857       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8858       Temp dst = get_ssa_temp(ctx, &instr->def);
8859       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8860       unsigned cluster_size =
8861          instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8862       cluster_size = util_next_power_of_two(
8863          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8864       const unsigned bit_size = instr->src[0].ssa->bit_size;
8865       assert(bit_size != 1);
8866 
8867       if (!nir_src_is_divergent(instr->src[0])) {
8868          /* We use divergence analysis to assign the regclass, so check if it's
8869           * working as expected */
8870          ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8871          if (instr->intrinsic == nir_intrinsic_inclusive_scan ||
8872              cluster_size != ctx->program->wave_size)
8873             expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor ||
8874                                  op == nir_op_imul || op == nir_op_fmul;
8875          assert(instr->def.divergent == expected_divergent);
8876 
8877          if (instr->intrinsic == nir_intrinsic_reduce) {
8878             if (!instr->def.divergent && emit_uniform_reduce(ctx, instr))
8879                break;
8880          } else if (emit_uniform_scan(ctx, instr)) {
8881             break;
8882          }
8883       }
8884 
8885       src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8886       ReduceOp reduce_op = get_reduce_op(op, bit_size);
8887 
8888       aco_opcode aco_op;
8889       switch (instr->intrinsic) {
8890       case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8891       case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8892       case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8893       default: unreachable("unknown reduce intrinsic");
8894       }
8895 
8896       /* Avoid a whole-wave shift: for add/xor, the exclusive result is the inclusive one with the lane's own input undone. */
8897       const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
8898                                                (op == nir_op_iadd || op == nir_op_ixor) &&
8899                                                dst.type() == RegType::vgpr;
8900       if (use_inclusive_for_exclusive)
8901          inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
8902       else
8903          emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
8904 
8905       set_wqm(ctx);
8906       break;
8907    }
8908    case nir_intrinsic_dpp16_shift_amd: {
8909       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8910       Temp dst = get_ssa_temp(ctx, &instr->def);
8911       int delta = nir_intrinsic_base(instr);
8912       assert(delta >= -15 && delta <= 15 && delta != 0);
8913       assert(instr->def.bit_size != 1 && instr->def.bit_size < 64);
8914       assert(ctx->options->gfx_level >= GFX8);
8915 
8916       uint16_t dpp_ctrl = delta < 0 ? dpp_row_sr(-delta) : dpp_row_sl(delta);
8917       bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), src, dpp_ctrl);
8918 
8919       set_wqm(ctx);
8920       break;
8921    }
8922    case nir_intrinsic_quad_broadcast:
8923    case nir_intrinsic_quad_swap_horizontal:
8924    case nir_intrinsic_quad_swap_vertical:
8925    case nir_intrinsic_quad_swap_diagonal:
8926    case nir_intrinsic_quad_swizzle_amd: {
8927       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8928 
8929       if (!instr->def.divergent) {
8930          emit_uniform_subgroup(ctx, instr, src);
8931          break;
8932       }
8933 
8934       /* Quad broadcast lane. */
8935       unsigned lane = 0;
8936       /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8937       bool bool_use_valu = instr->def.bit_size == 1;
8938 
8939       uint16_t dpp_ctrl = 0;
8940 
8941       bool allow_fi = true;
8942       switch (instr->intrinsic) {
8943       case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8944       case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8945       case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8946       case nir_intrinsic_quad_swizzle_amd:
8947          dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
8948          allow_fi &= nir_intrinsic_fetch_inactive(instr);
8949          break;
8950       case nir_intrinsic_quad_broadcast:
8951          lane = nir_src_as_const_value(instr->src[1])->u32;
8952          dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8953          bool_use_valu = false;
8954          break;
8955       default: break;
8956       }
8957 
8958       Temp dst = get_ssa_temp(ctx, &instr->def);
8959 
8960       /* Setup source. */
8961       if (bool_use_valu)
8962          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8963                             Operand::c32(-1), src);
8964       else if (instr->def.bit_size != 1)
8965          src = as_vgpr(ctx, src);
8966 
8967       if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8968          /* Special case for quad broadcast using SALU only. */
8969          assert(src.regClass() == bld.lm && dst.regClass() == bld.lm);
8970 
8971          uint32_t half_mask = 0x11111111u << lane;
8972          Operand mask_tmp = bld.lm.bytes() == 4
8973                                ? Operand::c32(half_mask)
8974                                : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8975                                             Operand::c32(half_mask), Operand::c32(half_mask));
8976 
8977          src =
8978             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8979          src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8980          bld.sop1(Builder::s_wqm, Definition(dst), bld.def(s1, scc), src);
8981       } else if (instr->def.bit_size <= 32 || bool_use_valu) {
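         /* DPP moves whole dwords, so sub-dword results are produced in a v1
          * temporary and the excess bytes are split off afterwards. */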
8982          unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8983          Definition def = (excess_bytes || bool_use_valu) ? bld.def(v1) : Definition(dst);
8984 
8985          if (ctx->program->gfx_level >= GFX8)
8986             bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl, 0xf, 0xf, true, allow_fi);
8987          else
8988             bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8989 
8990          if (excess_bytes)
8991             bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8992                        bld.def(RegClass::get(dst.type(), excess_bytes)), def.getTemp());
8993          if (bool_use_valu)
8994             bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), def.getTemp());
8995       } else if (instr->def.bit_size == 64) {
8996          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8997          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8998 
8999          if (ctx->program->gfx_level >= GFX8) {
9000             lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl, 0xf, 0xf, true,
9001                               allow_fi);
9002             hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl, 0xf, 0xf, true,
9003                               allow_fi);
9004          } else {
9005             lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
9006             hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
9007          }
9008 
9009          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
9010          emit_split_vector(ctx, dst, 2);
9011       } else {
9012          isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
9013       }
9014 
9015       set_wqm(ctx);
9016       break;
9017    }
9018    case nir_intrinsic_masked_swizzle_amd: {
9019       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9020       if (!instr->def.divergent) {
9021          emit_uniform_subgroup(ctx, instr, src);
9022          break;
9023       }
9024       Temp dst = get_ssa_temp(ctx, &instr->def);
9025       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
9026       bool allow_fi = nir_intrinsic_fetch_inactive(instr);
9027 
9028       if (instr->def.bit_size != 1)
9029          src = as_vgpr(ctx, src);
9030 
9031       if (instr->def.bit_size == 1) {
9032          assert(src.regClass() == bld.lm);
9033          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
9034                             Operand::c32(-1), src);
9035          src = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
9036          bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), src);
9037       } else if (dst.regClass() == v1b) {
9038          Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
9039          emit_extract_vector(ctx, tmp, 0, dst);
9040       } else if (dst.regClass() == v2b) {
9041          Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
9042          emit_extract_vector(ctx, tmp, 0, dst);
9043       } else if (dst.regClass() == v1) {
9044          bld.copy(Definition(dst), emit_masked_swizzle(ctx, bld, src, mask, allow_fi));
9045       } else if (dst.regClass() == v2) {
9046          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
9047          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
9048          lo = emit_masked_swizzle(ctx, bld, lo, mask, allow_fi);
9049          hi = emit_masked_swizzle(ctx, bld, hi, mask, allow_fi);
9050          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
9051          emit_split_vector(ctx, dst, 2);
9052       } else {
9053          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
9054       }
9055       set_wqm(ctx);
9056       break;
9057    }
9058    case nir_intrinsic_write_invocation_amd: {
9059       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9060       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
9061       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
9062       Temp dst = get_ssa_temp(ctx, &instr->def);
9063       if (dst.regClass() == v1) {
9064          /* writelane itself ignores src2; it only ties the register so that RA assigns dst the same reg. */
9065          bld.writelane(Definition(dst), val, lane, src);
9066       } else if (dst.regClass() == v2) {
9067          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
9068          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
9069          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
9070          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
9071          Temp lo = bld.writelane(bld.def(v1), val_lo, lane, src_lo);
9072          Temp hi = bld.writelane(bld.def(v1), val_hi, lane, src_hi);
9073          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
9074          emit_split_vector(ctx, dst, 2);
9075       } else {
9076          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
9077       }
9078       break;
9079    }
9080    case nir_intrinsic_mbcnt_amd: {
9081       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9082       Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
9083       Temp dst = get_ssa_temp(ctx, &instr->def);
9084       /* The mask may be 64-bit; truncate it to the wave32 lane-mask size. */
9085       src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
9086       emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
9087       set_wqm(ctx);
9088       break;
9089    }
9090    case nir_intrinsic_lane_permute_16_amd: {
9091       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9092       Temp dst = get_ssa_temp(ctx, &instr->def);
9093       assert(ctx->program->gfx_level >= GFX10);
9094 
9095       if (src.regClass() == s1) {
9096          bld.copy(Definition(dst), src);
9097       } else if (dst.regClass() == v1 && src.regClass() == v1) {
9098          bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
9099                   bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
9100                   bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
9101       } else {
9102          isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
9103       }
9104       break;
9105    }
9106    case nir_intrinsic_load_helper_invocation:
9107    case nir_intrinsic_is_helper_invocation: {
9108       /* load_helper() after demote() gets lowered to is_helper().
9109        * Otherwise, the two behave the same. */
9110       Temp dst = get_ssa_temp(ctx, &instr->def);
9111       bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
9112       ctx->program->needs_exact = true;
9113       break;
9114    }
9115    case nir_intrinsic_demote:
9116    case nir_intrinsic_demote_if: {
9117       Operand cond = Operand::c32(-1u);
9118       if (instr->intrinsic == nir_intrinsic_demote_if) {
9119          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9120          assert(src.regClass() == bld.lm);
9121          cond =
9122             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
9123       }
9124 
9125       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
9126 
9127       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
9128          ctx->cf_info.exec.potentially_empty_discard = true;
9129 
9130       ctx->block->kind |= block_kind_uses_discard;
9131       ctx->program->needs_exact = true;
9132 
9133       /* Enable WQM in order to prevent helper lanes from getting terminated. */
9134       if (ctx->shader->info.maximally_reconverges)
9135          ctx->program->needs_wqm = true;
9136 
9137       break;
9138    }
9139    case nir_intrinsic_terminate:
9140    case nir_intrinsic_terminate_if: {
9141       Operand cond = Operand::c32(-1u);
9142       if (instr->intrinsic == nir_intrinsic_terminate_if) {
9143          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9144          assert(src.regClass() == bld.lm);
9145          cond =
9146             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
9147 
9148          ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
9149       }
9150 
9151       bld.pseudo(aco_opcode::p_discard_if, cond);
9152 
9153       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
9154          ctx->cf_info.exec.potentially_empty_discard = true;
9155       ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
9156       ctx->block->kind |= block_kind_uses_discard;
9157       ctx->program->needs_exact = true;
9158       break;
9159    }
9160    case nir_intrinsic_first_invocation: {
9161       bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
9162                Operand(exec, bld.lm));
9163       set_wqm(ctx);
9164       break;
9165    }
9166    case nir_intrinsic_last_invocation: {
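      /* s_flbit_i32 counts the leading zeros of exec, so wave_size - 1 - flbit is
       * the index of the highest active lane. */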
9167       Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
9168       bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
9169                Operand::c32(ctx->program->wave_size - 1u), flbit);
9170       set_wqm(ctx);
9171       break;
9172    }
9173    case nir_intrinsic_elect: {
9174       /* p_elect is lowered in aco_insert_exec_mask.
9175        * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
9176        * two p_elect with different exec masks as the same.
9177        */
9178       bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
9179                  Operand(exec, bld.lm));
9180       set_wqm(ctx);
9181       break;
9182    }
9183    case nir_intrinsic_shader_clock: {
9184       Temp dst = get_ssa_temp(ctx, &instr->def);
9185       if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
9186           ctx->options->gfx_level >= GFX12) {
9187          Temp hi0 = bld.tmp(s1);
9188          Temp hi1 = bld.tmp(s1);
9189          Temp lo = bld.tmp(s1);
9190          bld.pseudo(aco_opcode::p_shader_cycles_hi_lo_hi, Definition(hi0), Definition(lo), Definition(hi1));
9191          Temp hi_eq = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), hi0, hi1);
9192          lo = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), lo, Operand::zero(), bld.scc(hi_eq));
9193          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi1);
9194       } else if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
9195           ctx->options->gfx_level >= GFX10_3) {
9196          /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
9197          Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
9198          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
9199       } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
9200                  ctx->options->gfx_level >= GFX11) {
9201          bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
9202                   Operand::c32(sendmsg_rtn_get_realtime));
9203       } else {
9204          aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
9205                                 ? aco_opcode::s_memrealtime
9206                                 : aco_opcode::s_memtime;
9207          bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
9208       }
9209       emit_split_vector(ctx, dst, 2);
9210       break;
9211    }
9212    case nir_intrinsic_load_vertex_id_zero_base: {
9213       Temp dst = get_ssa_temp(ctx, &instr->def);
9214       bld.copy(Definition(dst), get_arg(ctx, ctx->args->vertex_id));
9215       break;
9216    }
9217    case nir_intrinsic_load_first_vertex: {
9218       Temp dst = get_ssa_temp(ctx, &instr->def);
9219       bld.copy(Definition(dst), get_arg(ctx, ctx->args->base_vertex));
9220       break;
9221    }
9222    case nir_intrinsic_load_base_instance: {
9223       Temp dst = get_ssa_temp(ctx, &instr->def);
9224       bld.copy(Definition(dst), get_arg(ctx, ctx->args->start_instance));
9225       break;
9226    }
9227    case nir_intrinsic_load_instance_id: {
9228       Temp dst = get_ssa_temp(ctx, &instr->def);
9229       bld.copy(Definition(dst), get_arg(ctx, ctx->args->instance_id));
9230       break;
9231    }
9232    case nir_intrinsic_load_draw_id: {
9233       Temp dst = get_ssa_temp(ctx, &instr->def);
9234       bld.copy(Definition(dst), get_arg(ctx, ctx->args->draw_id));
9235       break;
9236    }
9237    case nir_intrinsic_load_invocation_id: {
9238       Temp dst = get_ssa_temp(ctx, &instr->def);
9239 
9240       if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
9241          if (ctx->options->gfx_level >= GFX12)
9242             bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
9243                      get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand::c32(27u),
9244                      Operand::c32(5u));
9245          else if (ctx->options->gfx_level >= GFX10)
9246             bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
9247                          get_arg(ctx, ctx->args->gs_invocation_id));
9248          else
9249             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_invocation_id));
9250       } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
9251          bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->tcs_rel_ids),
9252                   Operand::c32(8u), Operand::c32(5u));
9253       } else {
9254          unreachable("Unsupported stage for load_invocation_id");
9255       }
9256 
9257       break;
9258    }
9259    case nir_intrinsic_load_primitive_id: {
9260       Temp dst = get_ssa_temp(ctx, &instr->def);
9261 
9262       switch (ctx->shader->info.stage) {
9263       case MESA_SHADER_GEOMETRY:
9264          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
9265          break;
9266       case MESA_SHADER_TESS_CTRL:
9267          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tcs_patch_id));
9268          break;
9269       case MESA_SHADER_TESS_EVAL:
9270          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tes_patch_id));
9271          break;
9272       default:
9273          if (ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && !ctx->stage.has(SWStage::GS)) {
9274             /* In case of NGG, the GS threads always have the primitive ID
9275              * even if there is no SW GS. */
9276             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
9277             break;
9278          } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
9279             bld.copy(Definition(dst), get_arg(ctx, ctx->args->vs_prim_id));
9280             break;
9281          }
9282          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
9283       }
9284 
9285       break;
9286    }
9287    case nir_intrinsic_sendmsg_amd: {
9288       unsigned imm = nir_intrinsic_base(instr);
9289       Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9290       bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), imm);
9291       break;
9292    }
9293    case nir_intrinsic_load_gs_wave_id_amd: {
9294       Temp dst = get_ssa_temp(ctx, &instr->def);
9295       if (ctx->args->merged_wave_info.used)
9296          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
9297                     get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), Operand::c32(8u),
9298                     Operand::zero());
9299       else if (ctx->args->gs_wave_id.used)
9300          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_wave_id));
9301       else
9302          unreachable("Shader doesn't have GS wave ID.");
9303       break;
9304    }
9305    case nir_intrinsic_is_subgroup_invocation_lt_amd: {
9306       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9307       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src));
9308       break;
9309    }
9310    case nir_intrinsic_gds_atomic_add_amd: {
9311       Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
9312       Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
9313       Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
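      /* GDS accesses take their base and size window from m0, so the value the
       * caller provides has to be copied there. */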
9314       Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
9315       bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
9316              true);
9317       break;
9318    }
9319    case nir_intrinsic_load_sbt_base_amd: {
9320       Temp dst = get_ssa_temp(ctx, &instr->def);
9321       Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
9322       assert(addr.regClass() == s2);
9323       bld.copy(Definition(dst), Operand(addr));
9324       break;
9325    }
9326    case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
9327    case nir_intrinsic_load_resume_shader_address_amd: {
9328       bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
9329                  bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
9330       break;
9331    }
9332    case nir_intrinsic_overwrite_vs_arguments_amd: {
9333       ctx->arg_temps[ctx->args->vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9334       ctx->arg_temps[ctx->args->instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9335       break;
9336    }
9337    case nir_intrinsic_overwrite_tes_arguments_amd: {
9338       ctx->arg_temps[ctx->args->tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9339       ctx->arg_temps[ctx->args->tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9340       ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9341       ctx->arg_temps[ctx->args->tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[2].ssa);
9342       break;
9343    }
9344    case nir_intrinsic_load_scalar_arg_amd:
9345    case nir_intrinsic_load_vector_arg_amd: {
9346       assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
9347       Temp dst = get_ssa_temp(ctx, &instr->def);
9348       Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
9349       assert(src.id());
9350       assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
9351                                                                                   : RegType::vgpr));
9352       bld.copy(Definition(dst), src);
9353       emit_split_vector(ctx, dst, dst.size());
9354       break;
9355    }
9356    case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: {
9357       Temp dst = get_ssa_temp(ctx, &instr->def);
9358       Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
9359       Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
9360 
9361       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
9362       unsigned offset0, offset1;
9363       Instruction* ds_instr;
9364       Operand m;
9365 
9366       /* Lock a GDS mutex. */
9367       ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
9368       m = bld.m0(bld.as_uniform(ordered_id));
9369       ds_instr =
9370          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
9371       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
9372 
9373       aco_ptr<Instruction> vec{
9374          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
9375       unsigned write_mask = nir_intrinsic_write_mask(instr);
9376 
9377       for (unsigned i = 0; i < instr->num_components; i++) {
9378          if (write_mask & (1 << i)) {
9379             Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9380 
9381             ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9382                               i * 4, 0u, true);
9383             ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9384 
9385             vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
9386          } else {
9387             vec->operands[i] = Operand::zero();
9388          }
9389       }
9390 
9391       vec->definitions[0] = Definition(dst);
9392       ctx->block->instructions.emplace_back(std::move(vec));
9393 
9394       /* Unlock a GDS mutex. */
9395       ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
9396       m = bld.m0(bld.as_uniform(ordered_id));
9397       ds_instr =
9398          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
9399       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
9400 
9401       emit_split_vector(ctx, dst, instr->num_components);
9402       break;
9403    }
9404    case nir_intrinsic_xfb_counter_sub_gfx11_amd: {
9405       unsigned write_mask = nir_intrinsic_write_mask(instr);
9406       Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
9407 
9408       u_foreach_bit (i, write_mask) {
9409          Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9410          Instruction* ds_instr;
9411 
9412          ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9413                            i * 4, 0u, true);
9414          ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9415       }
9416       break;
9417    }
9418    case nir_intrinsic_export_amd:
9419    case nir_intrinsic_export_row_amd: {
9420       unsigned flags = nir_intrinsic_flags(instr);
9421       unsigned target = nir_intrinsic_base(instr);
9422       unsigned write_mask = nir_intrinsic_write_mask(instr);
9423 
9424       /* Mark vertex export block. */
9425       if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
9426          ctx->block->kind |= block_kind_export_end;
9427 
9428       if (target < V_008DFC_SQ_EXP_MRTZ)
9429          ctx->program->has_color_exports = true;
9430 
9431       const bool row_en = instr->intrinsic == nir_intrinsic_export_row_amd;
9432 
9433       aco_ptr<Instruction> exp{create_instruction(aco_opcode::exp, Format::EXP, 4 + row_en, 0)};
9434 
9435       exp->exp().dest = target;
9436       exp->exp().enabled_mask = write_mask;
9437       exp->exp().compressed = flags & AC_EXP_FLAG_COMPRESSED;
9438 
9439       /* ACO may reorder position/mrt export instructions and marks only the last
9440        * one as done, so don't respect the NIR AC_EXP_FLAG_DONE for position/mrt
9441        * exports here; leave it to ACO.
9442        */
9443       if (target == V_008DFC_SQ_EXP_PRIM)
9444          exp->exp().done = flags & AC_EXP_FLAG_DONE;
9445       else
9446          exp->exp().done = false;
9447 
9448       /* ACO may reorder MRT export instructions and sets the valid mask on
9449        * the last export itself, so don't honor the NIR AC_EXP_FLAG_VALID_MASK
9450        * for MRT exports here; leave it to ACO.
9451        */
9452       if (target > V_008DFC_SQ_EXP_NULL)
9453          exp->exp().valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
9454       else
9455          exp->exp().valid_mask = false;
9456 
9457       exp->exp().row_en = row_en;
9458 
9459       /* Compressed export uses two bits for a channel. */
9460       uint32_t channel_mask = exp->exp().compressed
9461                                  ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0)
9462                                  : write_mask;
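      /* Worked example (annotation, not in the original source): for a
       * compressed export each enable bit covers one dword holding two 16-bit
       * channels, so write_mask = 0b1011 gives channel_mask =
       * (0b1011 & 0x3 ? 1 : 0) | (0b1011 & 0xc ? 2 : 0) = 0b11, and only
       * operands 0 and 1 in the loop below carry data.
       */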
9463 
9464       Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
9465       for (unsigned i = 0; i < 4; i++) {
9466          exp->operands[i] = channel_mask & BITFIELD_BIT(i)
9467                                ? Operand(emit_extract_vector(ctx, value, i, v1))
9468                                : Operand(v1);
9469       }
9470 
9471       if (row_en) {
9472          Temp row = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
9473          /* Hack to prevent the RA from moving the source into m0 and then back to a normal SGPR. */
9474          row = bld.copy(bld.def(s1, m0), row);
9475          exp->operands[4] = bld.m0(row);
9476       }
9477 
9478       ctx->block->instructions.emplace_back(std::move(exp));
9479       break;
9480    }
9481    case nir_intrinsic_export_dual_src_blend_amd: {
9482       Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
9483       Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
9484       unsigned write_mask = nir_intrinsic_write_mask(instr);
9485 
9486       struct aco_export_mrt mrt0, mrt1;
9487       for (unsigned i = 0; i < 4; i++) {
9488          mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
9489                                                     : Operand(v1);
9490 
9491          mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
9492                                                     : Operand(v1);
9493       }
9494       mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
9495 
9496       create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
9497 
9498       ctx->block->kind |= block_kind_export_end;
9499       break;
9500    }
9501    case nir_intrinsic_strict_wqm_coord_amd: {
9502       Temp dst = get_ssa_temp(ctx, &instr->def);
9503       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9504       unsigned begin_size = nir_intrinsic_base(instr);
9505 
9506       unsigned num_src = 1;
9507       auto it = ctx->allocated_vec.find(src.id());
9508       if (it != ctx->allocated_vec.end())
9509          num_src = src.bytes() / it->second[0].bytes();
9510 
9511       aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO,
9512                                                   num_src + !!begin_size, 1)};
9513 
9514       if (begin_size)
9515          vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
9516       for (unsigned i = 0; i < num_src; i++) {
9517          Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
9518          vec->operands[i + !!begin_size] = Operand(comp);
9519       }
9520 
9521       vec->definitions[0] = Definition(dst);
9522       ctx->block->instructions.emplace_back(std::move(vec));
9523       break;
9524    }
9525    case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
9526       Temp dst = get_ssa_temp(ctx, &instr->def);
9527       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9528                Operand::c32(aco_symbol_lds_ngg_scratch_base));
9529       break;
9530    }
9531    case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
9532       Temp dst = get_ssa_temp(ctx, &instr->def);
9533       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9534                Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
9535       break;
9536    }
9537    case nir_intrinsic_store_scalar_arg_amd: {
9538       BITSET_SET(ctx->output_args, nir_intrinsic_base(instr));
9539       ctx->arg_temps[nir_intrinsic_base(instr)] =
9540          bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9541       break;
9542    }
9543    case nir_intrinsic_store_vector_arg_amd: {
9544       BITSET_SET(ctx->output_args, nir_intrinsic_base(instr));
9545       ctx->arg_temps[nir_intrinsic_base(instr)] =
9546          as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9547       break;
9548    }
9549    case nir_intrinsic_begin_invocation_interlock: {
9550       pops_await_overlapped_waves(ctx);
9551       break;
9552    }
9553    case nir_intrinsic_end_invocation_interlock: {
9554       if (ctx->options->gfx_level < GFX11)
9555          bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
9556       break;
9557    }
9558    case nir_intrinsic_cmat_muladd_amd: visit_cmat_muladd(ctx, instr); break;
9559    case nir_intrinsic_nop_amd: bld.sopp(aco_opcode::s_nop, nir_intrinsic_base(instr)); break;
9560    case nir_intrinsic_sleep_amd: bld.sopp(aco_opcode::s_sleep, nir_intrinsic_base(instr)); break;
9561    case nir_intrinsic_unit_test_amd:
9562       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(nir_intrinsic_base(instr)),
9563                  get_ssa_temp(ctx, instr->src[0].ssa));
9564       break;
9565    case nir_intrinsic_unit_test_uniform_amd:
9566    case nir_intrinsic_unit_test_divergent_amd:
9567       bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)),
9568                  Operand::c32(nir_intrinsic_base(instr)));
9569       break;
9570    default:
9571       isel_err(&instr->instr, "Unimplemented intrinsic instr");
9572       abort();
9573 
9574       break;
9575    }
9576 }
9577 
9578 void
9579 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9580 {
9581    if (vec->parent_instr->type != nir_instr_type_alu)
9582       return;
9583    nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9584    if (vec_instr->op != nir_op_vec(vec->num_components))
9585       return;
9586 
9587    for (unsigned i = 0; i < vec->num_components; i++) {
9588       cv[i] =
9589          vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9590    }
9591 }
9592 
9593 void
9594 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9595 {
9596    assert(instr->op != nir_texop_samples_identical);
9597 
9598    Builder bld(ctx->program, ctx->block);
9599    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9600         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9601         has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9602    Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9603                            offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9604                            coord = Temp(), wqm_coord = Temp();
9605    std::vector<Temp> coords;
9606    std::vector<Temp> derivs;
9607    nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9608 
9609    for (unsigned i = 0; i < instr->num_srcs; i++) {
9610       switch (instr->src[i].src_type) {
9611       case nir_tex_src_texture_handle:
9612          resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9613          break;
9614       case nir_tex_src_sampler_handle:
9615          sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9616          break;
9617       default: break;
9618       }
9619    }
9620 
9621    bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9622                                   (instr->dest_type & (nir_type_int | nir_type_uint));
9623    bool tg4_integer_cube_workaround =
9624       tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9625 
9626    bool a16 = false, g16 = false;
9627 
9628    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9629    if (coord_idx > 0)
9630       a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9631 
9632    int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9633    if (ddx_idx > 0)
9634       g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9635 
9636    for (unsigned i = 0; i < instr->num_srcs; i++) {
9637       switch (instr->src[i].src_type) {
9638       case nir_tex_src_coord: {
9639          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9640          coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9641          break;
9642       }
9643       case nir_tex_src_backend1: {
9644          assert(instr->src[i].src.ssa->bit_size == 32);
9645          wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9646          has_wqm_coord = true;
9647          break;
9648       }
9649       case nir_tex_src_bias:
9650          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9651          /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9652          bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9653          has_bias = true;
9654          break;
9655       case nir_tex_src_lod: {
9656          if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9657             level_zero = true;
9658          } else {
9659             assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9660             lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9661             has_lod = true;
9662          }
9663          break;
9664       }
9665       case nir_tex_src_min_lod:
9666          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9667          clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9668          has_clamped_lod = true;
9669          break;
9670       case nir_tex_src_comparator:
9671          if (instr->is_shadow) {
9672             assert(instr->src[i].src.ssa->bit_size == 32);
9673             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9674             has_compare = true;
9675          }
9676          break;
9677       case nir_tex_src_offset:
9678       case nir_tex_src_backend2:
9679          assert(instr->src[i].src.ssa->bit_size == 32);
9680          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9681          get_const_vec(instr->src[i].src.ssa, const_offset);
9682          has_offset = true;
9683          break;
9684       case nir_tex_src_ddx:
9685          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9686          ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9687          has_ddx = true;
9688          break;
9689       case nir_tex_src_ddy:
9690          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9691          ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9692          has_ddy = true;
9693          break;
9694       case nir_tex_src_ms_index:
9695          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9696          sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9697          has_sample_index = true;
9698          break;
9699       case nir_tex_src_texture_offset:
9700       case nir_tex_src_sampler_offset:
9701       default: break;
9702       }
9703    }
9704 
9705    if (has_wqm_coord) {
9706       assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9707              instr->op == nir_texop_lod);
9708       assert(wqm_coord.regClass().is_linear_vgpr());
9709       assert(!a16 && !g16);
9710    }
9711 
9712    if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9713       level_zero = true;
9714 
9715    if (has_offset) {
9716       assert(instr->op != nir_texop_txf);
9717 
9718       aco_ptr<Instruction> tmp_instr;
9719       Temp acc, pack = Temp();
9720 
9721       uint32_t pack_const = 0;
9722       for (unsigned i = 0; i < offset.size(); i++) {
9723          if (!const_offset[i])
9724             continue;
9725          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9726       }
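      /* Worked example (annotation, not in the original source): each constant
       * offset component is masked to 6 bits and packed into byte i of a
       * single dword, e.g. constant offsets (1, -2, 3) give pack_const =
       * 0x01 | ((-2 & 0x3F) << 8) | (3 << 16) = 0x00033E01.
       */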
9727 
9728       if (offset.type() == RegType::sgpr) {
9729          for (unsigned i = 0; i < offset.size(); i++) {
9730             if (const_offset[i])
9731                continue;
9732 
9733             acc = emit_extract_vector(ctx, offset, i, s1);
9734             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9735                            Operand::c32(0x3Fu));
9736 
9737             if (i) {
9738                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9739                               Operand::c32(8u * i));
9740             }
9741 
9742             if (pack == Temp()) {
9743                pack = acc;
9744             } else {
9745                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9746             }
9747          }
9748 
9749          if (pack_const && pack != Temp())
9750             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9751                             Operand::c32(pack_const), pack);
9752       } else {
9753          for (unsigned i = 0; i < offset.size(); i++) {
9754             if (const_offset[i])
9755                continue;
9756 
9757             acc = emit_extract_vector(ctx, offset, i, v1);
9758             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9759 
9760             if (i) {
9761                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9762             }
9763 
9764             if (pack == Temp()) {
9765                pack = acc;
9766             } else {
9767                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9768             }
9769          }
9770 
9771          if (pack_const && pack != Temp())
9772             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9773       }
9774       if (pack == Temp())
9775          offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9776       else
9777          offset = pack;
9778    }
9779 
9780    std::vector<Temp> unpacked_coord;
9781    if (coord != Temp())
9782       unpacked_coord.push_back(coord);
9783    if (has_sample_index)
9784       unpacked_coord.push_back(sample_index);
9785    if (has_lod)
9786       unpacked_coord.push_back(lod);
9787    if (has_clamped_lod)
9788       unpacked_coord.push_back(clamped_lod);
9789 
9790    coords = emit_pack_v1(ctx, unpacked_coord);
9791 
9792    /* pack derivatives */
9793    if (has_ddx || has_ddy) {
9794       assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9795       std::array<Temp, 2> ddxddy = {ddx, ddy};
9796       for (Temp tmp : ddxddy) {
9797          if (tmp == Temp())
9798             continue;
9799          std::vector<Temp> unpacked = {tmp};
9800          for (Temp derv : emit_pack_v1(ctx, unpacked))
9801             derivs.push_back(derv);
9802       }
9803       has_derivs = true;
9804    }
9805 
9806    unsigned dim = 0;
9807    bool da = false;
9808    if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9809       dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9810       da = should_declare_array((ac_image_dim)dim);
9811    }
9812 
9813    /* Build tex instruction */
9814    unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9815    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9816       dmask = u_bit_consecutive(0, util_last_bit(dmask));
9817    if (instr->is_sparse)
9818       dmask = MAX2(dmask, 1) | 0x10;
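   /* Annotation (not in the original source): bit 4 (0x10) requests the extra
    * residency dword that sparse (TFE) loads return on top of at least one
    * data channel; it is masked off again (dmask & 0xf) when written to the
    * actual load instruction.
    */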
9819    bool d16 = instr->def.bit_size == 16;
9820    Temp dst = get_ssa_temp(ctx, &instr->def);
9821    Temp tmp_dst = dst;
9822 
9823    /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9824    if (instr->op == nir_texop_tg4) {
9825       assert(instr->def.num_components == (4 + instr->is_sparse));
9826       if (instr->is_shadow)
9827          dmask = 1;
9828       else
9829          dmask = 1 << instr->component;
9830       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9831          tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9832    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9833       tmp_dst = bld.tmp(v1);
9834    } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9835       unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9836       tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9837    }
9838 
9839    Temp tg4_compare_cube_wa64 = Temp();
9840 
9841    if (tg4_integer_workarounds) {
9842       Temp half_texel[2];
9843       if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
9844          half_texel[0] = half_texel[1] = bld.copy(bld.def(v1), Operand::c32(0xbf000000 /*-0.5*/));
9845       } else {
9846          Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9847          Temp size = bld.tmp(v2);
9848          MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9849                                            Operand(s4), std::vector<Temp>{tg4_lod});
9850          tex->dim = dim;
9851          tex->dmask = 0x3;
9852          tex->da = da;
9853          emit_split_vector(ctx, size, size.size());
9854 
9855          for (unsigned i = 0; i < 2; i++) {
9856             half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9857             half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9858             half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9859             half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9860                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9861          }
9862 
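         /* Annotation (not in the original source): the loop above computes
          * half_texel[i] = -0.5 / size[i] (convert, reciprocal, multiply by
          * -0.5), i.e. half a texel in normalized coordinates, which is later
          * added to the gather coordinates as part of the integer tg4
          * workaround for GFX8 and older.
          */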
9863          if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9864             /* In Vulkan, whether the sampler uses unnormalized
9865              * coordinates is a dynamic property of the
9866              * sampler. Hence, to figure out whether we
9867              * need to divide by the texture size, we need to test
9868              * the sampler at runtime. This tests the bit set by
9869              * radv_init_sampler().
9870              */
9871             unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9872             Temp dword0 = emit_extract_vector(ctx, sampler, 0, s1);
9873             Temp not_needed =
9874                bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), dword0, Operand::c32(bit_idx));
9875 
9876             not_needed = bool_to_vector_condition(ctx, not_needed);
9877             half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9878                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9879             half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9880                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9881          }
9882       }
9883 
9884       Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9885                             bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9886 
9887       if (tg4_integer_cube_workaround) {
9888          /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9889          Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9890          aco_ptr<Instruction> split{
9891             create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9892          split->operands[0] = Operand(resource);
9893          for (unsigned i = 0; i < resource.size(); i++) {
9894             desc[i] = bld.tmp(s1);
9895             split->definitions[i] = Definition(desc[i]);
9896          }
9897          ctx->block->instructions.emplace_back(std::move(split));
9898 
9899          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9900                               Operand::c32(20u | (6u << 16)));
9901          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9902                                          Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
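         /* Annotation (not in the original source): s_bfe_u32 takes the field
          * offset in the low bits of its second operand and the field width at
          * bit 16, so 20u | (6u << 16) extracts the 6-bit DATA_FORMAT field
          * (bits 25:20) from descriptor dword 1.
          */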
9903 
9904          Temp nfmt;
9905          if (instr->dest_type & nir_type_uint) {
9906             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9907                             Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9908                             Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9909          } else {
9910             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9911                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9912                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9913          }
9914          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9915          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9916 
9917          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9918                          Operand::c32(26u));
9919 
9920          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9921                             Operand::c32(C_008F14_NUM_FORMAT));
9922          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9923 
9924          aco_ptr<Instruction> vec{
9925             create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9926          for (unsigned i = 0; i < resource.size(); i++)
9927             vec->operands[i] = Operand(desc[i]);
9928          resource = bld.tmp(resource.regClass());
9929          vec->definitions[0] = Definition(resource);
9930          ctx->block->instructions.emplace_back(std::move(vec));
9931 
9932          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9933                                   tg4_compare_cube_wa64);
9934          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9935                                   tg4_compare_cube_wa64);
9936       }
9937       coords[0] = new_coords[0];
9938       coords[1] = new_coords[1];
9939    }
9940 
9941    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9942       // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9943       // ac_build_buffer_load_format_gfx9_safe()
9944 
9945       assert(coords.size() == 1);
9946       aco_opcode op;
9947       if (d16) {
9948          switch (util_last_bit(dmask & 0xf)) {
9949          case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9950          case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9951          case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9952          case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9953          default: unreachable("Tex instruction loads more than 4 components.");
9954          }
9955       } else {
9956          switch (util_last_bit(dmask & 0xf)) {
9957          case 1: op = aco_opcode::buffer_load_format_x; break;
9958          case 2: op = aco_opcode::buffer_load_format_xy; break;
9959          case 3: op = aco_opcode::buffer_load_format_xyz; break;
9960          case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9961          default: unreachable("Tex instruction loads more than 4 components.");
9962          }
9963       }
9964 
9965       aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9966       mubuf->operands[0] = Operand(resource);
9967       mubuf->operands[1] = Operand(coords[0]);
9968       mubuf->operands[2] = Operand::c32(0);
9969       mubuf->definitions[0] = Definition(tmp_dst);
9970       mubuf->mubuf().idxen = true;
9971       mubuf->mubuf().tfe = instr->is_sparse;
9972       if (mubuf->mubuf().tfe)
9973          mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9974       ctx->block->instructions.emplace_back(std::move(mubuf));
9975 
9976       expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9977       return;
9978    }
9979 
9980    /* gather MIMG address components */
9981    std::vector<Temp> args;
9982    if (has_wqm_coord) {
9983       args.emplace_back(wqm_coord);
9984       if (!(ctx->block->kind & block_kind_top_level))
9985          ctx->unended_linear_vgprs.push_back(wqm_coord);
9986    }
9987    if (has_offset)
9988       args.emplace_back(offset);
9989    if (has_bias)
9990       args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9991    if (has_compare)
9992       args.emplace_back(compare);
9993    if (has_derivs)
9994       args.insert(args.end(), derivs.begin(), derivs.end());
9995 
9996    args.insert(args.end(), coords.begin(), coords.end());
9997 
9998    if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9999        instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
10000       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
10001                             instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
10002                          ? aco_opcode::image_load
10003                          : aco_opcode::image_load_mip;
10004       Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
10005       MIMG_instruction* tex = emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, vdata);
10006       if (instr->op == nir_texop_fragment_mask_fetch_amd)
10007          tex->dim = da ? ac_image_2darray : ac_image_2d;
10008       else
10009          tex->dim = dim;
10010       tex->dmask = dmask & 0xf;
10011       tex->unrm = true;
10012       tex->da = da;
10013       tex->tfe = instr->is_sparse;
10014       tex->d16 = d16;
10015       tex->a16 = a16;
10016 
10017       if (instr->op == nir_texop_fragment_mask_fetch_amd) {
10018          /* Use 0x76543210 if the image doesn't have FMASK. */
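         /* Annotation (not in the original source): 0x76543210 is the identity
          * FMASK, one nibble per sample mapping sample i to fragment i; it is
          * selected below when descriptor dword 1 is zero, i.e. no FMASK was
          * allocated.
          */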
10019          assert(dmask == 1 && dst.bytes() == 4);
10020          assert(dst.id() != tmp_dst.id());
10021 
10022          if (dst.regClass() == s1) {
10023             Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
10024                                         emit_extract_vector(ctx, resource, 1, s1));
10025             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
10026                      Operand::c32(0x76543210), bld.scc(is_not_null));
10027          } else {
10028             Temp is_not_null = bld.tmp(bld.lm);
10029             bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
10030                          emit_extract_vector(ctx, resource, 1, s1));
10031             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
10032                      bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
10033          }
10034       } else {
10035          expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
10036       }
10037       return;
10038    }
10039 
10040    bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
10041 
10042    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
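   /* Annotation (not in the original source): the ladder below composes the
    * MIMG opcode from feature suffixes: _c = depth compare, _d = explicit
    * derivatives (_g16 when the derivatives are 16-bit), _b = LOD bias,
    * _l = explicit LOD, _lz = forced level zero, _cl = LOD clamp,
    * _o = texel offsets.
    */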
10043    aco_opcode opcode = aco_opcode::image_sample;
10044    if (has_offset) { /* image_sample_*_o */
10045       if (has_clamped_lod) {
10046          if (has_compare) {
10047             opcode = aco_opcode::image_sample_c_cl_o;
10048             if (separate_g16)
10049                opcode = aco_opcode::image_sample_c_d_cl_o_g16;
10050             else if (has_derivs)
10051                opcode = aco_opcode::image_sample_c_d_cl_o;
10052             if (has_bias)
10053                opcode = aco_opcode::image_sample_c_b_cl_o;
10054          } else {
10055             opcode = aco_opcode::image_sample_cl_o;
10056             if (separate_g16)
10057                opcode = aco_opcode::image_sample_d_cl_o_g16;
10058             else if (has_derivs)
10059                opcode = aco_opcode::image_sample_d_cl_o;
10060             if (has_bias)
10061                opcode = aco_opcode::image_sample_b_cl_o;
10062          }
10063       } else if (has_compare) {
10064          opcode = aco_opcode::image_sample_c_o;
10065          if (separate_g16)
10066             opcode = aco_opcode::image_sample_c_d_o_g16;
10067          else if (has_derivs)
10068             opcode = aco_opcode::image_sample_c_d_o;
10069          if (has_bias)
10070             opcode = aco_opcode::image_sample_c_b_o;
10071          if (level_zero)
10072             opcode = aco_opcode::image_sample_c_lz_o;
10073          if (has_lod)
10074             opcode = aco_opcode::image_sample_c_l_o;
10075       } else {
10076          opcode = aco_opcode::image_sample_o;
10077          if (separate_g16)
10078             opcode = aco_opcode::image_sample_d_o_g16;
10079          else if (has_derivs)
10080             opcode = aco_opcode::image_sample_d_o;
10081          if (has_bias)
10082             opcode = aco_opcode::image_sample_b_o;
10083          if (level_zero)
10084             opcode = aco_opcode::image_sample_lz_o;
10085          if (has_lod)
10086             opcode = aco_opcode::image_sample_l_o;
10087       }
10088    } else if (has_clamped_lod) { /* image_sample_*_cl */
10089       if (has_compare) {
10090          opcode = aco_opcode::image_sample_c_cl;
10091          if (separate_g16)
10092             opcode = aco_opcode::image_sample_c_d_cl_g16;
10093          else if (has_derivs)
10094             opcode = aco_opcode::image_sample_c_d_cl;
10095          if (has_bias)
10096             opcode = aco_opcode::image_sample_c_b_cl;
10097       } else {
10098          opcode = aco_opcode::image_sample_cl;
10099          if (separate_g16)
10100             opcode = aco_opcode::image_sample_d_cl_g16;
10101          else if (has_derivs)
10102             opcode = aco_opcode::image_sample_d_cl;
10103          if (has_bias)
10104             opcode = aco_opcode::image_sample_b_cl;
10105       }
10106    } else { /* no offset */
10107       if (has_compare) {
10108          opcode = aco_opcode::image_sample_c;
10109          if (separate_g16)
10110             opcode = aco_opcode::image_sample_c_d_g16;
10111          else if (has_derivs)
10112             opcode = aco_opcode::image_sample_c_d;
10113          if (has_bias)
10114             opcode = aco_opcode::image_sample_c_b;
10115          if (level_zero)
10116             opcode = aco_opcode::image_sample_c_lz;
10117          if (has_lod)
10118             opcode = aco_opcode::image_sample_c_l;
10119       } else {
10120          opcode = aco_opcode::image_sample;
10121          if (separate_g16)
10122             opcode = aco_opcode::image_sample_d_g16;
10123          else if (has_derivs)
10124             opcode = aco_opcode::image_sample_d;
10125          if (has_bias)
10126             opcode = aco_opcode::image_sample_b;
10127          if (level_zero)
10128             opcode = aco_opcode::image_sample_lz;
10129          if (has_lod)
10130             opcode = aco_opcode::image_sample_l;
10131       }
10132    }
10133 
10134    if (instr->op == nir_texop_tg4) {
10135       /* GFX11 supports implicit LOD, but the extension is unsupported. */
10136       assert(level_zero || ctx->options->gfx_level < GFX11);
10137 
10138       if (has_offset) { /* image_gather4_*_o */
10139          if (has_compare) {
10140             opcode = aco_opcode::image_gather4_c_o;
10141             if (level_zero)
10142                opcode = aco_opcode::image_gather4_c_lz_o;
10143             if (has_lod)
10144                opcode = aco_opcode::image_gather4_c_l_o;
10145             if (has_bias)
10146                opcode = aco_opcode::image_gather4_c_b_o;
10147          } else {
10148             opcode = aco_opcode::image_gather4_o;
10149             if (level_zero)
10150                opcode = aco_opcode::image_gather4_lz_o;
10151             if (has_lod)
10152                opcode = aco_opcode::image_gather4_l_o;
10153             if (has_bias)
10154                opcode = aco_opcode::image_gather4_b_o;
10155          }
10156       } else {
10157          if (has_compare) {
10158             opcode = aco_opcode::image_gather4_c;
10159             if (level_zero)
10160                opcode = aco_opcode::image_gather4_c_lz;
10161             if (has_lod)
10162                opcode = aco_opcode::image_gather4_c_l;
10163             if (has_bias)
10164                opcode = aco_opcode::image_gather4_c_b;
10165          } else {
10166             opcode = aco_opcode::image_gather4;
10167             if (level_zero)
10168                opcode = aco_opcode::image_gather4_lz;
10169             if (has_lod)
10170                opcode = aco_opcode::image_gather4_l;
10171             if (has_bias)
10172                opcode = aco_opcode::image_gather4_b;
10173          }
10174       }
10175    } else if (instr->op == nir_texop_lod) {
10176       opcode = aco_opcode::image_get_lod;
10177    }
10178 
10179    bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
10180                           !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
10181                           instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
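   /* Annotation (not in the original source): sampling with implicit
    * derivatives needs the helper lanes of each quad, hence set_wqm() below
    * when such an instruction is emitted.
    */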
10182 
10183    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
10184    MIMG_instruction* tex = emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
10185    tex->dim = dim;
10186    tex->dmask = dmask & 0xf;
10187    tex->da = da;
10188    tex->unrm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
10189    tex->tfe = instr->is_sparse;
10190    tex->d16 = d16;
10191    tex->a16 = a16;
10192    if (implicit_derivs)
10193       set_wqm(ctx, true);
10194 
10195    if (tg4_integer_cube_workaround) {
10196       assert(tmp_dst.id() != dst.id());
10197       assert(tmp_dst.size() == dst.size());
10198 
10199       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
10200       Temp val[4];
10201       for (unsigned i = 0; i < 4; i++) {
10202          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
10203          Temp cvt_val;
10204          if (instr->dest_type & nir_type_uint)
10205             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
10206          else
10207             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
10208          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
10209                            tg4_compare_cube_wa64);
10210       }
10211 
10212       Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
10213       if (instr->is_sparse)
10214          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
10215                               val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
10216       else
10217          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
10218                               val[3]);
10219    }
10220    unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
10221    expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
10222 }
10223 
10224 Operand
10225 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc)
10226 {
10227    Temp tmp = get_ssa_temp(ctx, ssa);
10228    if (ssa->parent_instr->type == nir_instr_type_undef) {
10229       return Operand(rc);
10230    } else if (ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
10231       bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
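      /* Annotation (not in the original source): 1-bit values are represented
       * as lane masks, so a constant true becomes an all-ones mask whose width
       * (32 or 64 bits) matches the program's lane mask.
       */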
10232       return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
10233    } else {
10234       return Operand(tmp);
10235    }
10236 }
10237 
10238 void
10239 visit_phi(isel_context* ctx, nir_phi_instr* instr)
10240 {
10241    Temp dst = get_ssa_temp(ctx, &instr->def);
10242    assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
10243    aco_opcode opcode = instr->def.bit_size == 1 ? aco_opcode::p_boolean_phi : aco_opcode::p_phi;
10244 
10245    /* we want a sorted list of sources, since the predecessor list is also sorted */
10246    std::map<unsigned, nir_def*> phi_src;
10247    nir_foreach_phi_src (src, instr)
10248       phi_src[src->pred->index] = src->src.ssa;
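   /* Annotation (not in the original source): std::map iterates in ascending
    * key order, so operand i below lines up with the i-th entry of the sorted
    * predecessor list.
    */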
10249 
10250    Instruction* phi = create_instruction(opcode, Format::PSEUDO, phi_src.size(), 1);
10251    unsigned i = 0;
10252    for (std::pair<unsigned, nir_def*> src : phi_src)
10253       phi->operands[i++] = get_phi_operand(ctx, src.second, dst.regClass());
10254    phi->definitions[0] = Definition(dst);
10255    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
10256 }
10257 
10258 void
10259 visit_undef(isel_context* ctx, nir_undef_instr* instr)
10260 {
10261    Temp dst = get_ssa_temp(ctx, &instr->def);
10262 
10263    assert(dst.type() == RegType::sgpr);
10264 
10265    if (dst.size() == 1) {
10266       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
10267    } else {
10268       aco_ptr<Instruction> vec{
10269          create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
10270       for (unsigned i = 0; i < dst.size(); i++)
10271          vec->operands[i] = Operand::zero();
10272       vec->definitions[0] = Definition(dst);
10273       ctx->block->instructions.emplace_back(std::move(vec));
10274    }
10275 }
10276 
10277 void
10278 begin_loop(isel_context* ctx, loop_context* lc)
10279 {
10280    // TODO: we might want to wrap the loop around a branch if exec.potentially_empty=true
10281    append_logical_end(ctx->block);
10282    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
10283    Builder bld(ctx->program, ctx->block);
10284    bld.branch(aco_opcode::p_branch, bld.def(s2));
10285    unsigned loop_preheader_idx = ctx->block->index;
10286 
10287    lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
10288 
10289    ctx->program->next_loop_depth++;
10290 
10291    Block* loop_header = ctx->program->create_and_insert_block();
10292    loop_header->kind |= block_kind_loop_header;
10293    add_edge(loop_preheader_idx, loop_header);
10294    ctx->block = loop_header;
10295 
10296    append_logical_start(ctx->block);
10297 
10298    lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
10299    lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
10300    lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
10301    lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
10302    lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
10303 }
10304 
10305 void
10306 update_exec_info(isel_context* ctx)
10307 {
10308    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10309       ctx->cf_info.exec.potentially_empty_discard = false;
10310 
10311    ctx->cf_info.exec.potentially_empty_break &=
10312       ctx->block->loop_nest_depth >= ctx->cf_info.exec.potentially_empty_break_depth;
10313    ctx->cf_info.exec.potentially_empty_continue &=
10314       ctx->block->loop_nest_depth >= ctx->cf_info.exec.potentially_empty_continue_depth;
10315 
10316    if (ctx->block->loop_nest_depth == ctx->cf_info.exec.potentially_empty_break_depth &&
10317        !ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.parent_loop.has_divergent_continue) {
10318       ctx->cf_info.exec.potentially_empty_break = false;
10319    }
10320    if (ctx->block->loop_nest_depth == ctx->cf_info.exec.potentially_empty_continue_depth &&
10321        !ctx->cf_info.parent_if.is_divergent) {
10322       ctx->cf_info.exec.potentially_empty_continue = false;
10323    }
10324 
10325    if (!ctx->cf_info.exec.potentially_empty_break)
10326       ctx->cf_info.exec.potentially_empty_break_depth = UINT16_MAX;
10327    if (!ctx->cf_info.exec.potentially_empty_continue)
10328       ctx->cf_info.exec.potentially_empty_continue_depth = UINT16_MAX;
10329 }
10330 
10331 void
10332 end_loop(isel_context* ctx, loop_context* lc)
10333 {
10334    // TODO: what if a loop ends with an unconditional or uniformly branched continue
10335    //       and this branch is never taken?
10336    if (!ctx->cf_info.has_branch) {
10337       unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10338       Builder bld(ctx->program, ctx->block);
10339       append_logical_end(ctx->block);
10340 
10341       /* No need to check exec.potentially_empty_break/continue originating inside the loop. In the
10342        * only case where it's possible at this point (divergent break after divergent continue), we
10343        * should continue anyway. */
10344       if (ctx->cf_info.exec.potentially_empty_discard ||
10345           (ctx->cf_info.exec.potentially_empty_break &&
10346            ctx->cf_info.exec.potentially_empty_break_depth < ctx->block->loop_nest_depth) ||
10347           (ctx->cf_info.exec.potentially_empty_continue &&
10348            ctx->cf_info.exec.potentially_empty_continue_depth < ctx->block->loop_nest_depth)) {
10349          /* Discards can result in code running with an empty exec mask.
10350           * This would result in divergent breaks not ever being taken. As a
10351           * workaround, break the loop when the loop mask is empty instead of
10352           * always continuing. */
10353          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
10354          unsigned block_idx = ctx->block->index;
10355 
10356          /* create helper blocks to avoid critical edges */
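         /* Sketch of the linear CFG built here (annotation, not in the
          * original source):
          *
          *        block (continue_or_break)
          *          /                   \
          *    break_block          continue_block
          *          |                   |
          *     loop_exit           loop_header
          */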
10357          Block* break_block = ctx->program->create_and_insert_block();
10358          break_block->kind = block_kind_uniform;
10359          bld.reset(break_block);
10360          bld.branch(aco_opcode::p_branch, bld.def(s2));
10361          add_linear_edge(block_idx, break_block);
10362          add_linear_edge(break_block->index, &lc->loop_exit);
10363 
10364          Block* continue_block = ctx->program->create_and_insert_block();
10365          continue_block->kind = block_kind_uniform;
10366          bld.reset(continue_block);
10367          bld.branch(aco_opcode::p_branch, bld.def(s2));
10368          add_linear_edge(block_idx, continue_block);
10369          add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10370 
10371          if (!ctx->cf_info.parent_loop.has_divergent_branch)
10372             add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10373          ctx->block = &ctx->program->blocks[block_idx];
10374       } else {
10375          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10376          if (!ctx->cf_info.parent_loop.has_divergent_branch)
10377             add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10378          else
10379             add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10380       }
10381 
10382       bld.reset(ctx->block);
10383       bld.branch(aco_opcode::p_branch, bld.def(s2));
10384    }
10385 
10386    ctx->cf_info.has_branch = false;
10387    ctx->program->next_loop_depth--;
10388 
10389    /* emit loop successor block */
10390    ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10391    append_logical_start(ctx->block);
10392 
10393    ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10394    ctx->cf_info.parent_loop.exit = lc->exit_old;
10395    ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10396    ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10397    ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10398    update_exec_info(ctx);
10399 }
10400 
10401 void
10402 emit_loop_jump(isel_context* ctx, bool is_break)
10403 {
10404    Builder bld(ctx->program, ctx->block);
10405    Block* logical_target;
10406    append_logical_end(ctx->block);
10407    unsigned idx = ctx->block->index;
10408 
10409    if (is_break) {
10410       logical_target = ctx->cf_info.parent_loop.exit;
10411       add_logical_edge(idx, logical_target);
10412       ctx->block->kind |= block_kind_break;
10413 
10414       if (!ctx->cf_info.parent_if.is_divergent &&
10415           !ctx->cf_info.parent_loop.has_divergent_continue) {
10416          /* uniform break - directly jump out of the loop */
10417          ctx->block->kind |= block_kind_uniform;
10418          ctx->cf_info.has_branch = true;
10419          bld.branch(aco_opcode::p_branch, bld.def(s2));
10420          add_linear_edge(idx, logical_target);
10421          return;
10422       }
10423       ctx->cf_info.parent_loop.has_divergent_branch = true;
10424 
10425       if (!ctx->cf_info.exec.potentially_empty_break) {
10426          ctx->cf_info.exec.potentially_empty_break = true;
10427          ctx->cf_info.exec.potentially_empty_break_depth = ctx->block->loop_nest_depth;
10428       }
10429    } else {
10430       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10431       add_logical_edge(idx, logical_target);
10432       ctx->block->kind |= block_kind_continue;
10433 
10434       if (!ctx->cf_info.parent_if.is_divergent) {
10435          /* uniform continue - directly jump to the loop header */
10436          ctx->block->kind |= block_kind_uniform;
10437          ctx->cf_info.has_branch = true;
10438          bld.branch(aco_opcode::p_branch, bld.def(s2));
10439          add_linear_edge(idx, logical_target);
10440          return;
10441       }
10442 
10443       /* for potential uniform breaks after this continue,
10444        * we must ensure that they are handled correctly */
10445       ctx->cf_info.parent_loop.has_divergent_continue = true;
10446       ctx->cf_info.parent_loop.has_divergent_branch = true;
10447 
10448       if (!ctx->cf_info.exec.potentially_empty_continue) {
10449          ctx->cf_info.exec.potentially_empty_continue = true;
10450          ctx->cf_info.exec.potentially_empty_continue_depth = ctx->block->loop_nest_depth;
10451       }
10452    }
10453 
10454    /* remove critical edges from linear CFG */
10455    bld.branch(aco_opcode::p_branch, bld.def(s2));
10456    Block* break_block = ctx->program->create_and_insert_block();
10457    break_block->kind |= block_kind_uniform;
10458    add_linear_edge(idx, break_block);
10459    /* the loop_header pointer might be invalidated by this point */
10460    if (!is_break)
10461       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10462    add_linear_edge(break_block->index, logical_target);
10463    bld.reset(break_block);
10464    bld.branch(aco_opcode::p_branch, bld.def(s2));
10465 
10466    Block* continue_block = ctx->program->create_and_insert_block();
10467    add_linear_edge(idx, continue_block);
10468    append_logical_start(continue_block);
10469    ctx->block = continue_block;
10470 }
10471 
10472 void
10473 emit_loop_break(isel_context* ctx)
10474 {
10475    emit_loop_jump(ctx, true);
10476 }
10477 
10478 void
10479 emit_loop_continue(isel_context* ctx)
10480 {
10481    emit_loop_jump(ctx, false);
10482 }
10483 
10484 void
10485 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10486 {
10487    switch (instr->type) {
10488    case nir_jump_break: emit_loop_break(ctx); break;
10489    case nir_jump_continue: emit_loop_continue(ctx); break;
10490    default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10491    }
10492 }
10493 
10494 void
10495 visit_block(isel_context* ctx, nir_block* block)
10496 {
10497    if (ctx->block->kind & block_kind_top_level) {
10498       Builder bld(ctx->program, ctx->block);
10499       for (Temp tmp : ctx->unended_linear_vgprs) {
10500          bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
10501       }
10502       ctx->unended_linear_vgprs.clear();
10503    }
10504 
10505    ctx->block->instructions.reserve(ctx->block->instructions.size() +
10506                                     exec_list_length(&block->instr_list) * 2);
10507    nir_foreach_instr (instr, block) {
10508       switch (instr->type) {
10509       case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10510       case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10511       case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10512       case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10513       case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10514       case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10515       case nir_instr_type_deref: break;
10516       case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10517       default: isel_err(instr, "Unknown NIR instr type");
10518       }
10519    }
10520 }
10521 
10522 static bool
10523 all_uses_inside_loop(nir_def* def, nir_block* block_before_loop, nir_block* block_after_loop)
10524 {
10525    nir_foreach_use_including_if (use, def) {
10526       if (nir_src_is_if(use)) {
10527          nir_block* branch_block =
10528             nir_cf_node_as_block(nir_cf_node_prev(&nir_src_parent_if(use)->cf_node));
10529          if (branch_block->index <= block_before_loop->index || branch_block->index >= block_after_loop->index)
10530             return false;
10531       } else {
10532          nir_instr* instr = nir_src_parent_instr(use);
10533          if ((instr->block->index <= block_before_loop->index || instr->block->index >= block_after_loop->index) &&
10534              !(instr->type == nir_instr_type_phi && instr->block == block_after_loop)) {
10535             return false;
10536          }
10537       }
10538    }
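   /* Annotation (not in the original source): the index comparisons above rely
    * on nir_block indices being assigned in control-flow order, so a use lies
    * outside the loop iff its block's index is at or below the block before
    * the loop or at or above the block after it.
    */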
10539 
10540    return true;
10541 }
10542 
10543 Temp
10544 rename_temp(const std::map<unsigned, unsigned>& renames, Temp tmp)
10545 {
10546    auto it = renames.find(tmp.id());
10547    if (it != renames.end())
10548       return Temp(it->second, tmp.regClass());
10549    return tmp;
10550 }
10551 
10552 static void
10553 lcssa_workaround(isel_context* ctx, nir_loop* loop)
10554 {
10555    nir_block* block_before_loop = nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node));
10556    nir_block* block_after_loop = nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node));
10557 
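   /* Annotation (not in the original source): for every linear temp defined
    * inside the loop but used after it, allocate a fresh temp, rename the
    * in-loop definition and uses to it (second pass below), and define the old
    * id through a phi at the merge block whose operands are the fresh temp, so
    * the value leaves the loop via an LCSSA-style phi even on the
    * continue_or_break path.
    */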
10558    std::map<unsigned, unsigned> renames;
10559    nir_foreach_block_in_cf_node (block, &loop->cf_node) {
10560       /* These values are reachable from the loop exit even when continue_or_break is used. We
10561        * shouldn't create phis with undef operands in case the contents are important even if exec
10562        * is zero (for example, memory access addresses). */
10563       if (nir_block_dominates(block, nir_loop_last_block(loop)))
10564          continue;
10565 
10566       /* Definitions in this block are not reachable from the loop exit, and so all uses are inside
10567        * the loop. */
10568       if (!nir_block_dominates(block, block_after_loop))
10569          continue;
10570 
10571       nir_foreach_instr (instr, block) {
10572          nir_def* def = nir_instr_def(instr);
10573          if (!def)
10574             continue;
10575 
10576          Temp tmp = get_ssa_temp(ctx, def);
10577          if (!tmp.is_linear() || all_uses_inside_loop(def, block_before_loop, block_after_loop))
10578             continue;
10579 
10580          Temp new_tmp = ctx->program->allocateTmp(tmp.regClass());
10581          aco_ptr<Instruction> phi(create_instruction(aco_opcode::p_phi, Format::PSEUDO,
10582                                                      ctx->block->logical_preds.size(), 1));
10583          for (unsigned i = 0; i < ctx->block->logical_preds.size(); i++)
10584             phi->operands[i] = Operand(new_tmp);
10585          phi->definitions[0] = Definition(tmp);
10586          ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
10587 
10588          renames.emplace(tmp.id(), new_tmp.id());
10589       }
10590    }
10591 
10592    if (renames.empty())
10593       return;
10594 
10595    for (unsigned i = ctx->block->index - 1;
10596         ctx->program->blocks[i].loop_nest_depth > ctx->block->loop_nest_depth; i--) {
10597       for (aco_ptr<Instruction>& instr : ctx->program->blocks[i].instructions) {
10598          for (Definition& def : instr->definitions) {
10599             if (def.isTemp())
10600                def.setTemp(rename_temp(renames, def.getTemp()));
10601          }
10602          for (Operand& op : instr->operands) {
10603             if (op.isTemp())
10604                op.setTemp(rename_temp(renames, op.getTemp()));
10605          }
10606       }
10607    }
10608 }
10609 
10610 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10611 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10612 static void end_uniform_if(isel_context* ctx, if_context* ic);
10613 
10614 static void
10615 visit_loop(isel_context* ctx, nir_loop* loop)
10616 {
10617    assert(!nir_loop_has_continue_construct(loop));
10618    loop_context lc;
10619    begin_loop(ctx, &lc);
10620 
10621    visit_cf_list(ctx, &loop->body);
10622 
10623    end_loop(ctx, &lc);
10624 
10625    /* Create extra LCSSA phis for continue_or_break */
10626    if (ctx->block->linear_preds.size() > ctx->block->logical_preds.size())
10627       lcssa_workaround(ctx, loop);
10628 }
10629 
10630 static void
10631 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10632                         nir_selection_control sel_ctrl = nir_selection_control_none)
10633 {
10634    ic->cond = cond;
10635 
10636    append_logical_end(ctx->block);
10637    ctx->block->kind |= block_kind_branch;
10638 
10639    /* branch to linear then block */
10640    assert(cond.regClass() == ctx->program->lane_mask);
10641    aco_ptr<Instruction> branch;
10642    branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 1));
10643    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10644    branch->operands[0] = Operand(cond);
10645    bool never_taken =
10646       sel_ctrl == nir_selection_control_divergent_always_taken &&
10647       !(ctx->cf_info.exec.potentially_empty_discard || ctx->cf_info.exec.potentially_empty_break ||
10648         ctx->cf_info.exec.potentially_empty_continue);
10649    branch->branch().rarely_taken = sel_ctrl == nir_selection_control_flatten || never_taken;
10650    branch->branch().never_taken = never_taken;
10651    ctx->block->instructions.push_back(std::move(branch));
10652 
10653    ic->BB_if_idx = ctx->block->index;
10654    ic->BB_invert = Block();
10655    /* Invert blocks are intentionally not marked as top level because they
10656     * are not part of the logical cfg. */
10657    ic->BB_invert.kind |= block_kind_invert;
10658    ic->BB_endif = Block();
10659    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
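   /* Annotation (not in the original source): the divergent if is built from
    * BB_if, logical/linear then blocks, BB_invert, logical/linear else blocks
    * and BB_endif. BB_if conditionally skips the logical then block, BB_invert
    * conditionally skips the logical else block, and everything merges in
    * BB_endif; only BB_if, the two logical blocks and BB_endif take part in
    * the logical CFG.
    */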
10660 
10661    ic->exec_old = ctx->cf_info.exec;
10662    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10663    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10664    ctx->cf_info.parent_if.is_divergent = true;
10665 
10666    /* divergent branches use cbranch_execz */
10667    ctx->cf_info.exec = exec_info();
10668 
10669    /** emit logical then block */
10670    ctx->program->next_divergent_if_logical_depth++;
10671    Block* BB_then_logical = ctx->program->create_and_insert_block();
10672    add_edge(ic->BB_if_idx, BB_then_logical);
10673    ctx->block = BB_then_logical;
10674    append_logical_start(BB_then_logical);
10675 }
10676 
10677 static void
10678 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10679                         nir_selection_control sel_ctrl = nir_selection_control_none)
10680 {
10681    Block* BB_then_logical = ctx->block;
10682    append_logical_end(BB_then_logical);
10683    /* branch from logical then block to invert block */
10684    aco_ptr<Instruction> branch;
10685    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10686    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10687    BB_then_logical->instructions.emplace_back(std::move(branch));
10688    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10689    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10690       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10691    BB_then_logical->kind |= block_kind_uniform;
10692    assert(!ctx->cf_info.has_branch);
10693    ctx->cf_info.parent_loop.has_divergent_branch = false;
10694    ctx->program->next_divergent_if_logical_depth--;
10695 
10696    /** emit linear then block */
10697    Block* BB_then_linear = ctx->program->create_and_insert_block();
10698    BB_then_linear->kind |= block_kind_uniform;
10699    add_linear_edge(ic->BB_if_idx, BB_then_linear);
10700    /* branch from linear then block to invert block */
10701    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10702    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10703    BB_then_linear->instructions.emplace_back(std::move(branch));
10704    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10705 
10706    /** emit invert merge block */
10707    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10708    ic->invert_idx = ctx->block->index;
10709 
10710    /* branch to linear else block (skip else) */
10711    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10712    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10713    bool never_taken =
10714       sel_ctrl == nir_selection_control_divergent_always_taken &&
10715       !(ctx->cf_info.exec.potentially_empty_discard || ctx->cf_info.exec.potentially_empty_break ||
10716         ctx->cf_info.exec.potentially_empty_continue);
10717    branch->branch().rarely_taken = sel_ctrl == nir_selection_control_flatten || never_taken;
10718    branch->branch().never_taken = never_taken;
10719    ctx->block->instructions.push_back(std::move(branch));
10720 
10721    ic->exec_old.combine(ctx->cf_info.exec);
10722    /* divergent branches use cbranch_execz */
10723    ctx->cf_info.exec = exec_info();
10724 
10725    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10726    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10727 
10728    /** emit logical else block */
10729    ctx->program->next_divergent_if_logical_depth++;
10730    Block* BB_else_logical = ctx->program->create_and_insert_block();
10731    add_logical_edge(ic->BB_if_idx, BB_else_logical);
10732    add_linear_edge(ic->invert_idx, BB_else_logical);
10733    ctx->block = BB_else_logical;
10734    append_logical_start(BB_else_logical);
10735 }
10736 
10737 static void
10738 end_divergent_if(isel_context* ctx, if_context* ic)
10739 {
10740    Block* BB_else_logical = ctx->block;
10741    append_logical_end(BB_else_logical);
10742 
10743    /* branch from logical else block to endif block */
10744    aco_ptr<Instruction> branch;
10745    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10746    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10747    BB_else_logical->instructions.emplace_back(std::move(branch));
10748    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10749    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10750       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10751    BB_else_logical->kind |= block_kind_uniform;
10752    ctx->program->next_divergent_if_logical_depth--;
10753 
10754    assert(!ctx->cf_info.has_branch);
10755    ctx->cf_info.parent_loop.has_divergent_branch = false;
10756 
10757    /** emit linear else block */
10758    Block* BB_else_linear = ctx->program->create_and_insert_block();
10759    BB_else_linear->kind |= block_kind_uniform;
10760    add_linear_edge(ic->invert_idx, BB_else_linear);
10761 
10762    /* branch from linear else block to endif block */
10763    branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10764    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10765    BB_else_linear->instructions.emplace_back(std::move(branch));
10766    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10767 
10768    /** emit endif merge block */
10769    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10770    append_logical_start(ctx->block);
10771 
10772    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10773    ctx->cf_info.exec.combine(ic->exec_old);
10774    update_exec_info(ctx);
10775    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10776 
10777    /* We shouldn't create unreachable blocks. */
10778    assert(!ctx->block->logical_preds.empty());
10779 }
10780 
10781 static void
10782 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10783 {
10784    assert(cond.regClass() == s1);
10785 
10786    append_logical_end(ctx->block);
10787    ctx->block->kind |= block_kind_uniform;
10788 
10789    aco_ptr<Instruction> branch;
10790    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10791    branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10792    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10793    branch->operands[0] = Operand(cond);
10794    branch->operands[0].setFixed(scc);
10795    ctx->block->instructions.emplace_back(std::move(branch));
10796 
10797    ic->BB_if_idx = ctx->block->index;
10798    ic->BB_endif = Block();
10799    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10800 
10801    ctx->cf_info.has_branch = false;
10802    ctx->cf_info.parent_loop.has_divergent_branch = false;
10803 
10804    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10805    ic->has_divergent_continue_old = ctx->cf_info.parent_loop.has_divergent_continue;
10806 
10807    /** emit then block */
10808    ctx->program->next_uniform_if_depth++;
10809    Block* BB_then = ctx->program->create_and_insert_block();
10810    add_edge(ic->BB_if_idx, BB_then);
10811    append_logical_start(BB_then);
10812    ctx->block = BB_then;
10813 }
10814 
10815 static void
10816 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10817 {
10818    Block* BB_then = ctx->block;
10819 
10820    if (!ctx->cf_info.has_branch) {
10821       append_logical_end(BB_then);
10822       /* branch from then block to endif block */
10823       aco_ptr<Instruction> branch;
10824       branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10825       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10826       BB_then->instructions.emplace_back(std::move(branch));
10827       add_linear_edge(BB_then->index, &ic->BB_endif);
10828       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10829          add_logical_edge(BB_then->index, &ic->BB_endif);
10830       BB_then->kind |= block_kind_uniform;
10831    }
10832 
10833    ctx->cf_info.has_branch = false;
10834    ctx->cf_info.parent_loop.has_divergent_branch = false;
10835 
10836    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10837    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10838 
10839    ic->has_divergent_continue_then = ctx->cf_info.parent_loop.has_divergent_continue;
10840    ctx->cf_info.parent_loop.has_divergent_continue = ic->has_divergent_continue_old;
10841 
10842    /** emit else block */
10843    Block* BB_else = ctx->program->create_and_insert_block();
10844    add_edge(ic->BB_if_idx, BB_else);
10845    append_logical_start(BB_else);
10846    ctx->block = BB_else;
10847 }
10848 
10849 static void
10850 end_uniform_if(isel_context* ctx, if_context* ic)
10851 {
10852    Block* BB_else = ctx->block;
10853 
10854    if (!ctx->cf_info.has_branch) {
10855       append_logical_end(BB_else);
10856       /* branch from then block to endif block */
10857       aco_ptr<Instruction> branch;
10858       branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
10859       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10860       BB_else->instructions.emplace_back(std::move(branch));
10861       add_linear_edge(BB_else->index, &ic->BB_endif);
10862       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10863          add_logical_edge(BB_else->index, &ic->BB_endif);
10864       BB_else->kind |= block_kind_uniform;
10865    }
10866 
10867    ctx->cf_info.has_branch = false;
10868    ctx->cf_info.parent_loop.has_divergent_branch = false;
10869    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10870    ctx->cf_info.parent_loop.has_divergent_continue |= ic->has_divergent_continue_then;
10871 
10872    /** emit endif merge block */
10873    ctx->program->next_uniform_if_depth--;
10874    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10875    append_logical_start(ctx->block);
10876 
10877    /* We shouldn't create unreachable blocks. */
10878    assert(!ctx->block->logical_preds.empty());
10879 }
10880 
10881 static void
10882 visit_if(isel_context* ctx, nir_if* if_stmt)
10883 {
10884    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10885    Builder bld(ctx->program, ctx->block);
10886    aco_ptr<Instruction> branch;
10887    if_context ic;
10888 
10889    if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10890       /**
10891        * Uniform conditionals are represented in the following way*) :
10892        *
10893        * The linear and logical CFG:
10894        *                        BB_IF
10895        *                        /    \
10896        *       BB_THEN (logical)      BB_ELSE (logical)
10897        *                        \    /
10898        *                        BB_ENDIF
10899        *
10900        * *) Exceptions may be due to break and continue statements within loops.
10901        *    If a break/continue happens within uniform control flow, it branches
10902        *    to the loop exit/entry block. Otherwise, it branches to the next
10903        *    merge block.
10904        **/
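      /* Illustrative sketch (not emitted verbatim): for a uniform
       * `if (s) { A } else { B }` this builds roughly
       *
       *    BB_IF:    p_cbranch_z(scc = s) -> BB_ELSE
       *    BB_THEN:  A; p_branch -> BB_ENDIF
       *    BB_ELSE:  B; p_branch -> BB_ENDIF
       *    BB_ENDIF: ...
       *
       * with the pseudo branches lowered to s_cbranch_scc0/s_branch later.
       */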
10905 
10906       assert(cond.regClass() == ctx->program->lane_mask);
10907       cond = bool_to_scalar_condition(ctx, cond);
10908 
10909       begin_uniform_if_then(ctx, &ic, cond);
10910       visit_cf_list(ctx, &if_stmt->then_list);
10911 
10912       begin_uniform_if_else(ctx, &ic);
10913       visit_cf_list(ctx, &if_stmt->else_list);
10914 
10915       end_uniform_if(ctx, &ic);
10916    } else { /* non-uniform condition */
10917       /**
10918        * To maintain a logical and linear CFG without critical edges,
10919        * non-uniform conditionals are represented in the following way*) :
10920        *
10921        * The linear CFG:
10922        *                        BB_IF
10923        *                        /    \
10924        *       BB_THEN (logical)      BB_THEN (linear)
10925        *                        \    /
10926        *                        BB_INVERT (linear)
10927        *                        /    \
10928        *       BB_ELSE (logical)      BB_ELSE (linear)
10929        *                        \    /
10930        *                        BB_ENDIF
10931        *
10932        * The logical CFG:
10933        *                        BB_IF
10934        *                        /    \
10935        *       BB_THEN (logical)      BB_ELSE (logical)
10936        *                        \    /
10937        *                        BB_ENDIF
10938        *
10939        * *) Exceptions may be due to break and continue statements within loops.
10940        **/
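      /* Hedged sketch of the eventual lowering (exec handling is inserted
       * later by aco_insert_exec_mask.cpp), roughly:
       *
       *    BB_IF:     old_exec = exec; exec &= cond;  s_cbranch_execz -> skip then
       *    BB_INVERT: exec = old_exec & ~cond;        s_cbranch_execz -> skip else
       *    BB_ENDIF:  exec = old_exec
       *
       * The empty linear THEN/ELSE blocks exist only so that these branches
       * never create critical edges.
       */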
10941 
10942       begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10943       visit_cf_list(ctx, &if_stmt->then_list);
10944 
10945       begin_divergent_if_else(ctx, &ic, if_stmt->control);
10946       visit_cf_list(ctx, &if_stmt->else_list);
10947 
10948       end_divergent_if(ctx, &ic);
10949    }
10950 }
10951 
10952 static void
10953 visit_cf_list(isel_context* ctx, struct exec_list* list)
10954 {
10955    foreach_list_typed (nir_cf_node, node, node, list) {
10956       switch (node->type) {
10957       case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10958       case nir_cf_node_if: visit_if(ctx, nir_cf_node_as_if(node)); break;
10959       case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10960       default: unreachable("unimplemented cf list type");
10961       }
10962    }
10963 }
10964 
10965 static void
10966 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10967 {
10968    Builder bld(ctx->program, ctx->block);
10969 
10970    bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10971            mrt->enabled_channels, mrt->target, mrt->compr);
10972 
10973    ctx->program->has_color_exports = true;
10974 }
10975 
10976 static bool
10977 export_fs_mrt_color(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
10978                     unsigned slot, struct aco_export_mrt* mrt)
10979 {
10980    unsigned col_format = (info->spi_shader_col_format >> (slot * 4)) & 0xf;
10981 
10982    if (col_format == V_028714_SPI_SHADER_ZERO)
10983       return false;
10984 
10985    Builder bld(ctx->program, ctx->block);
10986    Operand values[4];
10987 
10988    for (unsigned i = 0; i < 4; ++i) {
10989       values[i] = Operand(colors[i]);
10990    }
10991 
10992    unsigned enabled_channels = 0;
10993    aco_opcode compr_op = aco_opcode::num_opcodes;
10994    bool compr = false;
10995    bool is_16bit = colors[0].regClass() == v2b;
10996    bool is_int8 = (info->color_is_int8 >> slot) & 1;
10997    bool is_int10 = (info->color_is_int10 >> slot) & 1;
10998    bool enable_mrt_output_nan_fixup = (ctx->options->enable_mrt_output_nan_fixup >> slot) & 1;
10999 
11000    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
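   /* This works because v_cmp_eq_f32(x, x) is false exactly when x is NaN, so
    * the v_cndmask_b32 below selects zero for NaN lanes and passes every
    * ordered value (including infinities) through unchanged. */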
11001    if (enable_mrt_output_nan_fixup && !is_16bit &&
11002        (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
11003         col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
11004         col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
11005       for (unsigned i = 0; i < 4; i++) {
11006          Temp is_not_nan =
11007             bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
11008          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
11009                               is_not_nan);
11010       }
11011    }
11012 
11013    switch (col_format) {
11014    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
11015 
11016    case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
11017 
11018    case V_028714_SPI_SHADER_32_AR:
11019       if (ctx->options->gfx_level >= GFX10) {
11020          /* Special case: on GFX10, the outputs are different for 32_AR */
11021          enabled_channels = 0x3;
11022          values[1] = values[3];
11023          values[3] = Operand(v1);
11024       } else {
11025          enabled_channels = 0x9;
11026       }
11027       break;
11028 
11029    case V_028714_SPI_SHADER_FP16_ABGR:
11030       for (int i = 0; i < 2; i++) {
11031          if (is_16bit) {
11032             values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), values[i * 2],
11033                                    values[i * 2 + 1]);
11034          } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
11035             values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), values[i * 2],
11036                                  values[i * 2 + 1]);
11037          } else {
11038             values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), values[i * 2],
11039                                  values[i * 2 + 1]);
11040          }
11041       }
11042       values[2] = Operand(v1);
11043       values[3] = Operand(v1);
11044       enabled_channels = 0xf;
11045       compr = true;
11046       break;
11047 
11048    case V_028714_SPI_SHADER_UNORM16_ABGR:
11049       if (is_16bit && ctx->options->gfx_level >= GFX9) {
11050          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
11051       } else {
11052          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
11053       }
11054       break;
11055 
11056    case V_028714_SPI_SHADER_SNORM16_ABGR:
11057       if (is_16bit && ctx->options->gfx_level >= GFX9) {
11058          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
11059       } else {
11060          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
11061       }
11062       break;
11063 
11064    case V_028714_SPI_SHADER_UINT16_ABGR:
11065       compr_op = aco_opcode::v_cvt_pk_u16_u32;
11066       if (is_int8 || is_int10) {
11067          /* clamp */
11068          uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
11069 
11070          for (unsigned i = 0; i < 4; i++) {
11071             uint32_t max = i == 3 && is_int10 ? 3 : max_rgb;
11072 
11073             values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
11074          }
11075       } else if (is_16bit) {
11076          for (unsigned i = 0; i < 4; i++) {
11077             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
11078             values[i] = Operand(tmp);
11079          }
11080       }
11081       break;
11082 
11083    case V_028714_SPI_SHADER_SINT16_ABGR:
11084       compr_op = aco_opcode::v_cvt_pk_i16_i32;
11085       if (is_int8 || is_int10) {
11086          /* clamp */
11087          uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
11088          uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
11089 
11090          for (unsigned i = 0; i < 4; i++) {
11091             uint32_t max = i == 3 && is_int10 ? 1 : max_rgb;
11092             uint32_t min = i == 3 && is_int10 ? -2u : min_rgb;
11093 
11094             values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
11095             values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
11096          }
11097       } else if (is_16bit) {
11098          for (unsigned i = 0; i < 4; i++) {
11099             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
11100             values[i] = Operand(tmp);
11101          }
11102       }
11103       break;
11104 
11105    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
11106 
11107    case V_028714_SPI_SHADER_ZERO:
11108    default: return false;
11109    }
11110 
11111    if (compr_op != aco_opcode::num_opcodes) {
11112       values[0] = bld.vop3(compr_op, bld.def(v1), values[0], values[1]);
11113       values[1] = bld.vop3(compr_op, bld.def(v1), values[2], values[3]);
11114       values[2] = Operand(v1);
11115       values[3] = Operand(v1);
11116       enabled_channels = 0xf;
11117       compr = true;
11118    } else if (!compr) {
11119       for (int i = 0; i < 4; i++)
11120          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
11121    }
11122 
11123    if (ctx->program->gfx_level >= GFX11) {
11124       /* GFX11 doesn't use COMPR for exports; the channel mask should be
11125        * 0x3 instead.
11126        */
11127       enabled_channels = compr ? 0x3 : enabled_channels;
11128       compr = false;
11129    }
11130 
11131    for (unsigned i = 0; i < 4; i++)
11132       mrt->out[i] = values[i];
11133    mrt->target = V_008DFC_SQ_EXP_MRT;
11134    mrt->enabled_channels = enabled_channels;
11135    mrt->compr = compr;
11136 
11137    return true;
11138 }
11139 
11140 static void
11141 export_fs_mrtz(isel_context* ctx, Temp depth, Temp stencil, Temp samplemask, Temp alpha)
11142 {
11143    Builder bld(ctx->program, ctx->block);
11144    unsigned enabled_channels = 0;
11145    bool compr = false;
11146    Operand values[4];
11147 
11148    for (unsigned i = 0; i < 4; ++i) {
11149       values[i] = Operand(v1);
11150    }
11151 
11152    /* Both stencil and sample mask only need 16-bits. */
11153    if (!depth.id() && !alpha.id() && (stencil.id() || samplemask.id())) {
11154       compr = ctx->program->gfx_level < GFX11; /* COMPR flag */
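      /* With COMPR, each 32-bit export slot carries two 16-bit values, and the
       * pre-GFX11 channel mask counts those 16-bit halves; hence the 0x3/0xc
       * masks below versus the plain 0x1/0x2 dword masks used on GFX11. */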
11155 
11156       if (stencil.id()) {
11157          /* Stencil should be in X[23:16]. */
11158          values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), stencil);
11159          enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3;
11160       }
11161 
11162       if (samplemask.id()) {
11163          /* SampleMask should be in Y[15:0]. */
11164          values[1] = Operand(samplemask);
11165          enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc;
11166       }
11167    } else {
11168       if (depth.id()) {
11169          values[0] = Operand(depth);
11170          enabled_channels |= 0x1;
11171       }
11172 
11173       if (stencil.id()) {
11174          values[1] = Operand(stencil);
11175          enabled_channels |= 0x2;
11176       }
11177 
11178       if (samplemask.id()) {
11179          values[2] = Operand(samplemask);
11180          enabled_channels |= 0x4;
11181       }
11182 
11183       if (alpha.id()) {
11184          assert(ctx->program->gfx_level >= GFX11);
11185          values[3] = Operand(alpha);
11186          enabled_channels |= 0x8;
11187       }
11188    }
11189 
11190    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
11191     * writemask component.
11192     */
11193    if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND &&
11194        ctx->options->family != CHIP_HAINAN) {
11195       enabled_channels |= 0x1;
11196    }
11197 
11198    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
11199            V_008DFC_SQ_EXP_MRTZ, compr);
11200 }
11201 
11202 static void
11203 create_fs_null_export(isel_context* ctx)
11204 {
11205    /* FS must always have exports.
11206     * So when there are none, we need to add a null export.
11207     */
11208 
11209    Builder bld(ctx->program, ctx->block);
11210    /* GFX11 doesn't support NULL exports; MRT0 should be exported instead. */
11211    unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
11212    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
11213            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
11214 
11215    ctx->program->has_color_exports = true;
11216 }
11217 
11218 static void
11219 create_fs_jump_to_epilog(isel_context* ctx)
11220 {
11221    Builder bld(ctx->program, ctx->block);
11222    std::vector<Operand> exports;
11223    unsigned vgpr = 256; /* VGPR 0 */
11224 
11225    if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
11226       exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u], PhysReg{vgpr++}));
11227 
11228    if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
11229       exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u], PhysReg{vgpr++}));
11230 
11231    if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11232       exports.emplace_back(
11233          Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u], PhysReg{vgpr++}));
11234 
11235    PhysReg exports_start(vgpr);
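   /* Color channels follow a fixed layout after the optional depth/stencil/
    * sample-mask VGPRs: MRT n occupies the four consecutive VGPRs starting at
    * exports_start + 4 * n, e.g. MRT1.z would land in exports_start + 6. */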
11236 
11237    for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
11238       unsigned color_index = slot - FRAG_RESULT_DATA0;
11239       unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
11240       unsigned write_mask = ctx->outputs.mask[slot];
11241 
11242       if (!write_mask)
11243          continue;
11244 
11245       PhysReg color_start(exports_start.reg() + color_index * 4);
11246 
11247       for (unsigned i = 0; i < 4; i++) {
11248          if (!(write_mask & BITFIELD_BIT(i))) {
11249             exports.emplace_back(Operand(v1));
11250             continue;
11251          }
11252 
11253          PhysReg chan_reg = color_start.advance(i * 4u);
11254          Operand chan(ctx->outputs.temps[slot * 4u + i]);
11255 
11256          if (color_type == ACO_TYPE_FLOAT16) {
11257             chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
11258          } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
11259             bool sign_ext = color_type == ACO_TYPE_INT16;
11260             Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
11261             chan = Operand(tmp);
11262          }
11263 
11264          chan.setFixed(chan_reg);
11265          exports.emplace_back(chan);
11266       }
11267    }
11268 
11269    Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.epilog_pc));
11270 
11271    aco_ptr<Instruction> jump{
11272       create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + exports.size(), 0)};
11273    jump->operands[0] = Operand(continue_pc);
11274    for (unsigned i = 0; i < exports.size(); i++) {
11275       jump->operands[i + 1] = exports[i];
11276    }
11277    ctx->block->instructions.emplace_back(std::move(jump));
11278 }
11279 
11280 PhysReg
11281 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
11282 {
11283    assert(arg.used);
11284    enum ac_arg_regfile file = args->args[arg.arg_index].file;
11285    unsigned reg = args->args[arg.arg_index].offset;
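   /* ACO numbers VGPRs after the 256 SGPR encodings, so v0 is PhysReg 256 and,
    * e.g., an AC_ARG_VGPR argument at offset 3 maps to PhysReg 259 (v3). */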
11286    return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
11287 }
11288 
11289 static Operand
11290 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
11291 {
11292    return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
11293 }
11294 
11295 static void
11296 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
11297 {
11298    struct ac_arg arg;
11299    arg.used = true;
11300 
11301    for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
11302       regs.emplace_back(get_arg_for_end(ctx, arg));
11303 }
11304 
11305 static void
11306 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
11307 {
11308    aco_ptr<Instruction> end{
11309       create_instruction(aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
11310 
11311    for (unsigned i = 0; i < regs.size(); i++)
11312       end->operands[i] = regs[i];
11313 
11314    ctx->block->instructions.emplace_back(std::move(end));
11315 
11316    ctx->block->kind |= block_kind_end_with_regs;
11317 }
11318 
11319 static void
11320 create_fs_end_for_epilog(isel_context* ctx)
11321 {
11322    Builder bld(ctx->program, ctx->block);
11323 
11324    std::vector<Operand> regs;
11325 
11326    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.ps.alpha_reference));
11327 
11328    unsigned vgpr = 256;
11329 
11330    for (unsigned slot = FRAG_RESULT_DATA0; slot <= FRAG_RESULT_DATA7; slot++) {
11331       unsigned index = slot - FRAG_RESULT_DATA0;
11332       unsigned type = (ctx->output_color_types >> (index * 2)) & 0x3;
11333       unsigned write_mask = ctx->outputs.mask[slot];
11334 
11335       if (!write_mask)
11336          continue;
11337 
11338       if (type == ACO_TYPE_ANY32) {
11339          u_foreach_bit (i, write_mask) {
11340             regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11341          }
11342       } else {
11343          for (unsigned i = 0; i < 2; i++) {
11344             unsigned mask = (write_mask >> (i * 2)) & 0x3;
11345             if (!mask)
11346                continue;
11347 
11348             unsigned chan = slot * 4 + i * 2;
11349             Operand lo = mask & 0x1 ? Operand(ctx->outputs.temps[chan]) : Operand(v2b);
11350             Operand hi = mask & 0x2 ? Operand(ctx->outputs.temps[chan + 1]) : Operand(v2b);
11351 
11352             Temp dst = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi);
11353             regs.emplace_back(Operand(dst, PhysReg{vgpr + i}));
11354          }
11355       }
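      /* Keep a fixed four-VGPR stride per written attachment, whether the
       * channels above were 32-bit values or packed 16-bit pairs, so the
       * epilog can locate each attachment at a predictable offset. */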
11356       vgpr += 4;
11357    }
11358 
11359    if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
11360       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4], PhysReg{vgpr++}));
11361 
11362    if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
11363       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4], PhysReg{vgpr++}));
11364 
11365    if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11366       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4], PhysReg{vgpr++}));
11367 
11368    build_end_with_regs(ctx, regs);
11369 
11370    /* Finally, exit WQM mode. */
11371    ctx->program->needs_exact = true;
11372 }
11373 
11374 Instruction*
11375 add_startpgm(struct isel_context* ctx)
11376 {
11377    unsigned def_count = 0;
11378    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11379       if (ctx->args->args[i].skip)
11380          continue;
11381       unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
11382       if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
11383          def_count += ctx->args->args[i].size;
11384       else
11385          def_count++;
11386    }
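   /* Illustrative example: a 2-dword SGPR argument at offset 5 fails the
    * alignment check (5 % 2 != 0), so it is counted as two s1 definitions and
    * reassembled below with create_vec_from_array(); an aligned argument gets
    * a single multi-dword definition. */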
11387 
11388    if (ctx->stage.hw == AC_HW_COMPUTE_SHADER && ctx->program->gfx_level >= GFX12)
11389       def_count += 3;
11390 
11391    Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
11392    ctx->block->instructions.emplace_back(startpgm);
11393    for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
11394       if (ctx->args->args[i].skip)
11395          continue;
11396 
11397       enum ac_arg_regfile file = ctx->args->args[i].file;
11398       unsigned size = ctx->args->args[i].size;
11399       unsigned reg = ctx->args->args[i].offset;
11400       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11401 
11402       if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
11403          Temp elems[16];
11404          for (unsigned j = 0; j < size; j++) {
11405             elems[j] = ctx->program->allocateTmp(s1);
11406             startpgm->definitions[arg++] = Definition(elems[j].id(), PhysReg{reg + j}, s1);
11407          }
11408          ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
11409       } else {
11410          Temp dst = ctx->program->allocateTmp(type);
11411          Definition def(dst);
11412          def.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11413          ctx->arg_temps[i] = dst;
11414          startpgm->definitions[arg++] = def;
11415 
11416          if (ctx->args->args[i].pending_vmem) {
11417             assert(file == AC_ARG_VGPR);
11418             ctx->program->args_pending_vmem.push_back(def);
11419          }
11420       }
11421    }
11422 
11423    if (ctx->program->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
11424       Temp idx = ctx->program->allocateTmp(s1);
11425       Temp idy = ctx->program->allocateTmp(s1);
11426       ctx->ttmp8 = ctx->program->allocateTmp(s1);
11427       startpgm->definitions[def_count - 3] = Definition(idx);
11428       startpgm->definitions[def_count - 3].setFixed(PhysReg(108 + 9 /*ttmp9*/));
11429       startpgm->definitions[def_count - 2] = Definition(ctx->ttmp8);
11430       startpgm->definitions[def_count - 2].setFixed(PhysReg(108 + 8 /*ttmp8*/));
11431       startpgm->definitions[def_count - 1] = Definition(idy);
11432       startpgm->definitions[def_count - 1].setFixed(PhysReg(108 + 7 /*ttmp7*/));
11433       ctx->workgroup_id[0] = Operand(idx);
11434       if (ctx->args->workgroup_ids[2].used) {
11435          Builder bld(ctx->program, ctx->block);
11436          ctx->workgroup_id[1] =
11437             bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::zero(),
11438                        Operand::c32(16u), Operand::zero());
11439          ctx->workgroup_id[2] =
11440             bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::c32(1u),
11441                        Operand::c32(16u), Operand::zero());
11442       } else {
11443          ctx->workgroup_id[1] = Operand(idy);
11444          ctx->workgroup_id[2] = Operand::zero();
11445       }
11446    } else if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
11447       const struct ac_arg* ids = ctx->args->workgroup_ids;
11448       for (unsigned i = 0; i < 3; i++)
11449          ctx->workgroup_id[i] = ids[i].used ? Operand(get_arg(ctx, ids[i])) : Operand::zero();
11450    }
11451 
11452    /* epilog has no scratch */
11453    if (ctx->args->scratch_offset.used) {
11454       if (ctx->program->gfx_level < GFX9) {
11455          /* Stash these in the program so that they can be accessed later when
11456           * handling spilling.
11457           */
11458          if (ctx->args->ring_offsets.used)
11459             ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11460 
11461          ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
11462       } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
11463          /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
11464           */
11465          Operand scratch_addr = ctx->args->ring_offsets.used
11466                                    ? Operand(get_arg(ctx, ctx->args->ring_offsets))
11467                                    : Operand(s2);
11468 
11469          Builder bld(ctx->program, ctx->block);
11470          bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
11471                     get_arg(ctx, ctx->args->scratch_offset));
11472       }
11473    }
11474 
11475    return startpgm;
11476 }
11477 
11478 void
11479 fix_ls_vgpr_init_bug(isel_context* ctx)
11480 {
11481    Builder bld(ctx->program, ctx->block);
11482    constexpr unsigned hs_idx = 1u;
11483    Builder::Result hs_thread_count =
11484       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11485                get_arg(ctx, ctx->args->merged_wave_info), Operand::c32((8u << 16) | (hs_idx * 8u)));
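   /* s_bfe_u32 takes the field width in bits [22:16] of src1 and the bit
    * offset in its low bits, so (8u << 16) | 8u extracts merged_wave_info
    * bits [15:8], the HS wave's lane count; the SCC definition is set iff
    * that count is nonzero. */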
11486    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11487 
11488    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11489 
11490    Temp instance_id =
11491       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->vertex_id),
11492                get_arg(ctx, ctx->args->instance_id), ls_has_nonzero_hs_threads);
11493    Temp vs_rel_patch_id =
11494       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11495                get_arg(ctx, ctx->args->vs_rel_patch_id), ls_has_nonzero_hs_threads);
11496    Temp vertex_id =
11497       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_patch_id),
11498                get_arg(ctx, ctx->args->vertex_id), ls_has_nonzero_hs_threads);
11499 
11500    ctx->arg_temps[ctx->args->instance_id.arg_index] = instance_id;
11501    ctx->arg_temps[ctx->args->vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11502    ctx->arg_temps[ctx->args->vertex_id.arg_index] = vertex_id;
11503 }
11504 
11505 void
11506 split_arguments(isel_context* ctx, Instruction* startpgm)
11507 {
11508    /* Split all arguments except for the first (ring_offsets) and the last
11509     * (exec) so that the dead channels don't stay live throughout the program.
11510     */
11511    for (int i = 1; i < startpgm->definitions.size(); i++) {
11512       if (startpgm->definitions[i].regClass().size() > 1) {
11513          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11514                            startpgm->definitions[i].regClass().size());
11515       }
11516    }
11517 }
11518 
11519 void
11520 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11521 {
11522    Program* program = ctx->program;
11523 
11524    unsigned float_controls = shader->info.float_controls_execution_mode;
11525 
11526    program->next_fp_mode.must_flush_denorms32 =
11527       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11528    program->next_fp_mode.must_flush_denorms16_64 =
11529       float_controls &
11530       (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11531 
11532    program->next_fp_mode.care_about_round32 =
11533       float_controls &
11534       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11535 
11536    program->next_fp_mode.care_about_round16_64 =
11537       float_controls &
11538       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11539        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11540 
11541    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11542     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11543    if (program->next_fp_mode.must_flush_denorms16_64)
11544       program->next_fp_mode.denorm16_64 = 0;
11545    else
11546       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11547 
11548    /* preserving fp32 denorms is expensive, so only do it if asked */
11549    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11550       program->next_fp_mode.denorm32 = fp_denorm_keep;
11551    else
11552       program->next_fp_mode.denorm32 = 0;
11553 
11554    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11555       program->next_fp_mode.round32 = fp_round_tz;
11556    else
11557       program->next_fp_mode.round32 = fp_round_ne;
11558 
11559    if (float_controls &
11560        (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11561       program->next_fp_mode.round16_64 = fp_round_tz;
11562    else
11563       program->next_fp_mode.round16_64 = fp_round_ne;
11564 
11565    ctx->block->fp_mode = program->next_fp_mode;
11566 }
11567 
11568 void
11569 cleanup_cfg(Program* program)
11570 {
11571    /* create linear_succs/logical_succs */
11572    for (Block& BB : program->blocks) {
11573       for (unsigned idx : BB.linear_preds)
11574          program->blocks[idx].linear_succs.emplace_back(BB.index);
11575       for (unsigned idx : BB.logical_preds)
11576          program->blocks[idx].logical_succs.emplace_back(BB.index);
11577    }
11578 }
11579 
11580 void
11581 finish_program(isel_context* ctx)
11582 {
11583    cleanup_cfg(ctx->program);
11584 
11585    /* Insert a single p_end_wqm instruction after the last derivative calculation */
11586    if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm && ctx->program->needs_exact) {
11587       /* Find the next BB at top-level CFG */
11588       while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
11589          ctx->wqm_block_idx++;
11590          ctx->wqm_instruction_idx = 0;
11591       }
11592 
11593       std::vector<aco_ptr<Instruction>>* instrs =
11594          &ctx->program->blocks[ctx->wqm_block_idx].instructions;
11595       auto it = instrs->begin() + ctx->wqm_instruction_idx;
11596 
11597       /* Delay the transition to Exact to help optimizations and scheduling */
11598       while (it != instrs->end()) {
11599          aco_ptr<Instruction>& instr = *it;
11600          /* End WQM before: */
11601          if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP() ||
11602              instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
11603              instr->opcode == aco_opcode::p_jump_to_epilog ||
11604              instr->opcode == aco_opcode::p_logical_start)
11605             break;
11606 
11607          ++it;
11608 
11609          /* End WQM after: */
11610          if (instr->opcode == aco_opcode::p_logical_end ||
11611              instr->opcode == aco_opcode::p_discard_if ||
11612              instr->opcode == aco_opcode::p_demote_to_helper ||
11613              instr->opcode == aco_opcode::p_end_with_regs)
11614             break;
11615       }
11616 
11617       Builder bld(ctx->program);
11618       bld.reset(instrs, it);
11619       bld.pseudo(aco_opcode::p_end_wqm);
11620    }
11621 }
11622 
11623 Temp
11624 lanecount_to_mask(isel_context* ctx, Temp count)
11625 {
11626    assert(count.regClass() == s1);
11627 
11628    Builder bld(ctx->program, ctx->block);
11629    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
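   /* s_bfm_b64 yields a mask with the (count & 0x3f) low bits set, e.g.
    * count = 5 gives 0x1f. A count of 64 wraps to an empty mask, which the
    * wave64 special case below repairs with s_cselect. */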
11630    Temp cond;
11631 
11632    if (ctx->program->wave_size == 64) {
11633       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11634       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11635                                 Operand::c32(6u /* log2(64) */));
11636       cond =
11637          bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11638    } else {
11639       /* We use s_bfm_b64 (not _b32) because a count of 32 would wrap to an empty mask with
11640        * _b32; we then need to extract the lower half of the register */
11641       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11642    }
11643 
11644    return cond;
11645 }
11646 
11647 Temp
11648 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11649 {
11650    Builder bld(ctx->program, ctx->block);
11651 
11652    /* lanecount_to_mask() only cares about s0.u[6:0], so we need neither s_bfe nor s_and here */
11653    Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info)
11654                        : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11655                                   get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u));
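   /* merged_wave_info packs one 8-bit lane count per merged stage: byte 0 for
    * the first stage (VS/TES), byte 1 for the second (HS/GS). So i == 1
    * shifts bits [15:8] down to bits [7:0], where lanecount_to_mask() reads
    * them. */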
11656 
11657    return lanecount_to_mask(ctx, count);
11658 }
11659 
11660 static void
11661 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11662 {
11663    unsigned src_count = 0;
11664    for (unsigned i = 0; i < ctx.args->arg_count; i++)
11665       src_count += !!BITSET_TEST(ctx.output_args, i);
11666 
11667    Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11668    ctx.block->instructions.emplace_back(ret);
11669 
11670    src_count = 0;
11671    for (unsigned i = 0; i < ctx.args->arg_count; i++) {
11672       if (!BITSET_TEST(ctx.output_args, i))
11673          continue;
11674 
11675       enum ac_arg_regfile file = ctx.args->args[i].file;
11676       unsigned size = ctx.args->args[i].size;
11677       unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11678       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11679       Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11680                                          : Operand(PhysReg{reg}, type);
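      /* If this shader never materialized the argument as a temp, the
       * register-only operand form presumably just keeps the location
       * reserved so its value survives into the next resume shader. */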
11681       ret->operands[src_count] = op;
11682       src_count++;
11683    }
11684 
11685    Builder bld(ctx.program, ctx.block);
11686    bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11687 }
11688 
11689 void
11690 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11691                   const struct ac_shader_args* args)
11692 {
11693    for (unsigned i = 0; i < shader_count; i++) {
11694       if (i) {
11695          ctx.block = ctx.program->create_and_insert_block();
11696          ctx.block->kind = block_kind_top_level | block_kind_resume;
11697       }
11698 
11699       nir_shader* nir = shaders[i];
11700       init_context(&ctx, nir);
11701       setup_fp_mode(&ctx, nir);
11702 
11703       Instruction* startpgm = add_startpgm(&ctx);
11704       append_logical_start(ctx.block);
11705       split_arguments(&ctx, startpgm);
11706       visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11707       append_logical_end(ctx.block);
11708       ctx.block->kind |= block_kind_uniform;
11709 
11710       /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11711        * shader without shader calls.
11712        */
11713       if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11714          insert_rt_jump_next(ctx, args);
11715 
11716       cleanup_context(&ctx);
11717    }
11718 
11719    ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11720    finish_program(&ctx);
11721 }
11722 
11723 void
11724 pops_await_overlapped_waves(isel_context* ctx)
11725 {
11726    ctx->program->has_pops_overlapped_waves_wait = true;
11727 
11728    Builder bld(ctx->program, ctx->block);
11729 
11730    if (ctx->program->gfx_level >= GFX11) {
11731       /* GFX11+ - waiting for the export from the overlapped waves.
11732        * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11733        */
11734       bld.sopp(aco_opcode::s_wait_event,
11735                ctx->program->gfx_level >= GFX12 ? wait_event_imm_wait_export_ready_gfx12 : 0);
11736       return;
11737    }
11738 
11739    /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11740 
11741    const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11742 
11743    /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11744    const Temp did_overlap =
11745       bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11746    if_context did_overlap_if_context;
11747    begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11748    bld.reset(ctx->block);
11749 
11750    /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11751    if (ctx->program->gfx_level >= GFX10) {
11752       /* 2 packer ID bits on GFX10-10.3. */
11753       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11754                                       collision, Operand::c32(0x2001c));
11755       /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11756       const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11757                                                  bld.def(s1, scc), packer_id, Operand::c32(1));
11758       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
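      /* s_setreg's simm16 encodes hwreg(id, offset, size) as
       * id | (offset << 6) | ((size - 1) << 11), so ((3 - 1) << 11) | 25
       * writes bits [2:0] of hwreg 25 (POPS_PACKER). */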
11759    } else {
11760       /* 1 packer ID bit on GFX9. */
11761       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11762                                       collision, Operand::c32(0x1001c));
11763       /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11764        * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11765        */
11766       const Temp packer_id_hwreg_bits =
11767          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11768       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11769    }
11770 
11771    Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11772                                              collision, Operand::c32(0xa0010));
11773    if (ctx->program->gfx_level < GFX10) {
11774       /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11775        * actual wave ID by 1 in case of wraparound.
11776        */
11777       const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11778                                             collision, Operand::c32(0x3ff));
11779       const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11780          aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11781       newest_overlapped_wave_id =
11782          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11783                   newest_overlapped_wave_id_wrapped);
11784    }
11785 
11786    /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11787     * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11788     * no more than 1023 values behind the current wave ID.
11789     * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11790     * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11791     * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11792     * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11793     * `a - (b + 1)` is `a + ~b`.
11794     * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11795     * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11796     * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11797     * expected.
11798     */
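   /* Worked example: with a current 10-bit wave ID of 5, s_nand below yields
    * wave_id_offset = ~5, and adding it subtracts 6 with wraparound. The
    * oldest possible overlapped wave (10-bit ID 6, i.e. monotonically
    * current - 1023) remaps to 0, while the current wave remaps to
    * UINT32_MAX, so unsigned comparisons order the remapped IDs correctly. */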
11799    const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11800                                         collision, Operand::c32(0x3ff));
11801    newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11802                                         newest_overlapped_wave_id, wave_id_offset);
11803 
11804    /* Await the overlapped waves. */
11805 
11806    loop_context wait_loop_context;
11807    begin_loop(ctx, &wait_loop_context);
11808    bld.reset(ctx->block);
11809 
11810    const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11811                                            bld.def(s1, scc), wave_id_offset);
11812    /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11813     * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11814     * exited the ordered section.
11815     */
11816    const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11817                                                        newest_overlapped_wave_id, exiting_wave_id);
11818    if_context newest_overlapped_wave_exited_if_context;
11819    begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11820                          newest_overlapped_wave_exited);
11821    emit_loop_break(ctx);
11822    begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11823    end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11824    bld.reset(ctx->block);
11825 
11826    /* Sleep before rechecking to let overlapped waves run for some time. */
11827    bld.sopp(aco_opcode::s_sleep, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11828 
11829    end_loop(ctx, &wait_loop_context);
11830    bld.reset(ctx->block);
11831 
11832    /* Indicate the wait has been done to subsequent compilation stages. */
11833    bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11834 
11835    begin_uniform_if_else(ctx, &did_overlap_if_context);
11836    end_uniform_if(ctx, &did_overlap_if_context);
11837    bld.reset(ctx->block);
11838 }
11839 
11840 static void
11841 create_merged_jump_to_epilog(isel_context* ctx)
11842 {
11843    Builder bld(ctx->program, ctx->block);
11844    std::vector<Operand> regs;
11845 
11846    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11847       if (!ctx->args->args[i].preserved)
11848          continue;
11849 
11850       const enum ac_arg_regfile file = ctx->args->args[i].file;
11851       const unsigned reg = ctx->args->args[i].offset;
11852 
11853       Operand op(ctx->arg_temps[i]);
11854       op.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11855       regs.emplace_back(op);
11856    }
11857 
11858    Temp continue_pc =
11859       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11860 
11861    aco_ptr<Instruction> jump{
11862       create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11863    jump->operands[0] = Operand(continue_pc);
11864    for (unsigned i = 0; i < regs.size(); i++) {
11865       jump->operands[i + 1] = regs[i];
11866    }
11867    ctx->block->instructions.emplace_back(std::move(jump));
11868 }
11869 
11870 static void
11871 create_end_for_merged_shader(isel_context* ctx)
11872 {
11873    std::vector<Operand> regs;
11874 
11875    unsigned max_args;
11876    if (ctx->stage.sw == SWStage::VS) {
11877       assert(ctx->args->vertex_id.used);
11878       max_args = ctx->args->vertex_id.arg_index;
11879    } else {
11880       assert(ctx->stage.sw == SWStage::TES);
11881       assert(ctx->args->tes_u.used);
11882       max_args = ctx->args->tes_u.arg_index;
11883    }
11884 
11885    struct ac_arg arg;
11886    arg.used = true;
11887 
11888    for (arg.arg_index = 0; arg.arg_index < max_args; arg.arg_index++)
11889       regs.emplace_back(get_arg_for_end(ctx, arg));
11890 
11891    build_end_with_regs(ctx, regs);
11892 }
11893 
11894 void
11895 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_endpgm,
11896               const bool need_barrier, if_context* ic_merged_wave_info,
11897               const bool check_merged_wave_info, const bool endif_merged_wave_info)
11898 {
11899    init_context(&ctx, nir);
11900    setup_fp_mode(&ctx, nir);
11901 
11902    Program* program = ctx.program;
11903 
11904    if (need_startpgm) {
11905       /* Needs to be after init_context() for FS. */
11906       Instruction* startpgm = add_startpgm(&ctx);
11907 
11908       if (!program->info.vs.has_prolog &&
11909           (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11910          Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, 0x3u);
11911       }
11912 
11913       append_logical_start(ctx.block);
11914 
11915       if (ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs &&
11916           !program->info.vs.has_prolog)
11917          fix_ls_vgpr_init_bug(&ctx);
11918 
11919       split_arguments(&ctx, startpgm);
11920    }
11921 
11922    if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11923        !program->stage.has(SWStage::GS)) {
11924       /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11925        * s_sendmsg(GS_ALLOC_REQ).
11926        */
11927       Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, 0u);
11928    }
11929 
11930    if (check_merged_wave_info) {
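      /* The merged wave info SGPR packs one lane count per byte; byte 0 describes the
       * first stage half (VS/TES) and byte 1 the second, hence the index below.
       */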
11931       const unsigned i =
11932          nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11933       const Temp cond = merged_wave_info_to_mask(&ctx, i);
11934       begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11935    }
11936 
11937    if (need_barrier) {
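      /* When a wave holds a whole number of TCS patches, no patch straddles waves, so a
       * subgroup-scope barrier is sufficient; otherwise the whole workgroup must sync.
       */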
11938       const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11939                                      program->wave_size % nir->info.tess.tcs_vertices_out == 0
11940                                   ? scope_subgroup
11941                                   : scope_workgroup;
11942 
11943       Builder(ctx.program, ctx.block)
11944          .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11945                   scope);
11946    }
11947 
11948    nir_function_impl* func = nir_shader_get_entrypoint(nir);
11949    visit_cf_list(&ctx, &func->body);
11950 
11951    if (ctx.program->info.ps.has_epilog) {
11952       if (ctx.stage == fragment_fs) {
11953          if (ctx.options->is_opengl)
11954             create_fs_end_for_epilog(&ctx);
11955          else
11956             create_fs_jump_to_epilog(&ctx);
11957 
11958          /* FS epilogs always have at least one color/null export. */
11959          ctx.program->has_color_exports = true;
11960       }
11961    }
11962 
11963    if (endif_merged_wave_info) {
11964       begin_divergent_if_else(&ctx, ic_merged_wave_info);
11965       end_divergent_if(&ctx, ic_merged_wave_info);
11966    }
11967 
11968    bool is_first_stage_of_merged_shader = false;
11969 
11970    if (ctx.program->info.merged_shader_compiled_separately &&
11971        (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11972       assert(program->gfx_level >= GFX9);
11973       if (ctx.options->is_opengl)
11974          create_end_for_merged_shader(&ctx);
11975       else
11976          create_merged_jump_to_epilog(&ctx);
11977 
11978       is_first_stage_of_merged_shader = true;
11979    }
11980 
11981    cleanup_context(&ctx);
11982 
11983    if (need_endpgm) {
11984       program->config->float_mode = program->blocks[0].fp_mode.val;
11985 
11986       append_logical_end(ctx.block);
11987       ctx.block->kind |= block_kind_uniform;
11988 
11989       if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) ||
11990           (nir->info.stage == MESA_SHADER_TESS_CTRL && program->gfx_level >= GFX9)) {
11991          Builder(program, ctx.block).sopp(aco_opcode::s_endpgm);
11992       }
11993 
11994       finish_program(&ctx);
11995    }
11996 }
11997 
11998 void
11999 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
12000 {
12001    if_context ic_merged_wave_info;
12002    const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
12003 
12004    for (unsigned i = 0; i < shader_count; i++) {
12005       nir_shader* nir = shaders[i];
12006 
12007       /* We always need to insert p_startpgm at the beginning of the first shader. */
12008       const bool need_startpgm = i == 0;
12009 
12010       /* Need to handle program end for last shader stage. */
12011       const bool need_endpgm = i == shader_count - 1;
12012 
12013       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
12014       nir_function_impl* func = nir_shader_get_entrypoint(nir);
12015       const bool empty_shader =
12016          nir_cf_list_is_empty_block(&func->body) &&
12017          ((nir->info.stage == MESA_SHADER_VERTEX &&
12018            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
12019           (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
12020 
12021       /* See if we need to emit a check of the merged wave info SGPR. */
12022       const bool check_merged_wave_info =
12023          ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
12024       const bool endif_merged_wave_info =
12025          ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
12026 
12027       /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
12028       const bool tcs_skip_barrier =
12029          ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
12030 
12031       /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
12032       const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
12033 
12034       select_shader(ctx, nir, need_startpgm, need_endpgm, need_barrier, &ic_merged_wave_info,
12035                     check_merged_wave_info, endif_merged_wave_info);
12036 
12037       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
12038          /* Special handling when TCS input and output patch size is the same.
12039           * Outputs of the previous stage are inputs to the next stage.
12040           */
12041          ctx.inputs = ctx.outputs;
12042          ctx.outputs = shader_io_state();
12043       }
12044    }
12045 }
12046 
12047 void
12048 emit_polygon_stipple(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12049 {
12050    Builder bld(ctx->program, ctx->block);
12051 
12052    /* Use the fixed-point gl_FragCoord input.
12053     * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
12054     * per coordinate to get the repeating effect.
12055     */
12056    Temp pos_fixed_pt = get_arg(ctx, ctx->args->pos_fixed_pt);
12057    Temp addr0 = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1f), pos_fixed_pt);
12058    Temp addr1 = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), pos_fixed_pt, Operand::c32(16u),
12059                          Operand::c32(5u));
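   /* pos_fixed_pt packs the fixed-point X coordinate in bits [15:0] and Y in bits [31:16],
    * so addr0 = x & 0x1f and addr1 = (y >> 16) & 0x1f address the 32x32 stipple pattern.
    */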
12060 
12061    /* Load the buffer descriptor. */
12062    Temp list = get_arg(ctx, finfo->internal_bindings);
12063    list = convert_pointer_to_64_bit(ctx, list);
12064    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
12065                         Operand::c32(finfo->poly_stipple_buf_offset));
12066 
12067    /* The stipple pattern is 32x32, each row has 32 bits. */
12068    Temp offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr1);
12069    Temp row = bld.mubuf(aco_opcode::buffer_load_dword, bld.def(v1), desc, offset, Operand::c32(0u),
12070                         0, true);
12071    Temp bit = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), row, addr0, Operand::c32(1u));
12072    Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), bit);
12073    bld.pseudo(aco_opcode::p_demote_to_helper, cond);
12074 
12075    ctx->block->kind |= block_kind_uses_discard;
12076    ctx->program->needs_exact = true;
12077 }
12078 
12079 void
12080 overwrite_interp_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12081 {
12082    Builder bld(ctx->program, ctx->block);
12083 
12084    if (finfo->bc_optimize_for_persp || finfo->bc_optimize_for_linear) {
12085       /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
12086        * The hw doesn't compute CENTROID if the whole wave only
12087        * contains fully-covered quads.
12088        */
12089       Temp bc_optimize = get_arg(ctx, ctx->args->prim_mask);
12090 
12091       /* enabled when bit 31 is set */
12092       Temp cond =
12093          bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), bc_optimize, Operand::c32(31u));
12094 
12095       /* scale 1bit scc to wave size bits used by v_cndmask */
12096       cond = bool_to_vector_condition(ctx, cond);
12097 
12098       if (finfo->bc_optimize_for_persp) {
12099          Temp center = get_arg(ctx, ctx->args->persp_center);
12100          Temp centroid = get_arg(ctx, ctx->args->persp_centroid);
12101 
12102          Temp dst = bld.tmp(v2);
12103          select_vec2(ctx, dst, cond, center, centroid);
12104          ctx->arg_temps[ctx->args->persp_centroid.arg_index] = dst;
12105       }
12106 
12107       if (finfo->bc_optimize_for_linear) {
12108          Temp center = get_arg(ctx, ctx->args->linear_center);
12109          Temp centroid = get_arg(ctx, ctx->args->linear_centroid);
12110 
12111          Temp dst = bld.tmp(v2);
12112          select_vec2(ctx, dst, cond, center, centroid);
12113          ctx->arg_temps[ctx->args->linear_centroid.arg_index] = dst;
12114       }
12115    }
12116 
12117    if (finfo->force_persp_sample_interp) {
12118       Temp persp_sample = get_arg(ctx, ctx->args->persp_sample);
12119       ctx->arg_temps[ctx->args->persp_center.arg_index] = persp_sample;
12120       ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_sample;
12121    }
12122 
12123    if (finfo->force_linear_sample_interp) {
12124       Temp linear_sample = get_arg(ctx, ctx->args->linear_sample);
12125       ctx->arg_temps[ctx->args->linear_center.arg_index] = linear_sample;
12126       ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_sample;
12127    }
12128 
12129    if (finfo->force_persp_center_interp) {
12130       Temp persp_center = get_arg(ctx, ctx->args->persp_center);
12131       ctx->arg_temps[ctx->args->persp_sample.arg_index] = persp_center;
12132       ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_center;
12133    }
12134 
12135    if (finfo->force_linear_center_interp) {
12136       Temp linear_center = get_arg(ctx, ctx->args->linear_center);
12137       ctx->arg_temps[ctx->args->linear_sample.arg_index] = linear_center;
12138       ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_center;
12139    }
12140 }
12141 
12142 void
12143 overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12144 {
12145    Builder bld(ctx->program, ctx->block);
12146 
12147    /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
12148     * says:
12149     *
12150     *    "When per-sample shading is active due to the use of a fragment
12151     *     input qualified by sample or due to the use of the gl_SampleID
12152     *     or gl_SamplePosition variables, only the bit for the current
12153     *     sample is set in gl_SampleMaskIn. When state specifies multiple
12154     *     fragment shader invocations for a given fragment, the sample
12155     *     mask for any single fragment shader invocation may specify a
12156     *     subset of the covered samples for the fragment. In this case,
12157     *     the bit corresponding to each covered sample will be set in
12158     *     exactly one fragment shader invocation."
12159     *
12160     * The samplemask loaded by hardware is always the coverage of the
12161     * entire pixel/fragment, so mask bits out based on the sample ID.
12162     */
12163    if (finfo->samplemask_log_ps_iter) {
12164       Temp ancillary = get_arg(ctx, ctx->args->ancillary);
12165       Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u),
12166                                Operand::c32(4u));
12167       Temp samplemask = get_arg(ctx, ctx->args->sample_coverage);
12168 
12169       uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
12170       Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
12171 
12172       Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask);
12173       samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask);
12174 
12175       ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask;
12176    }
12177 }
12178 
12179 Temp
12180 get_interp_color(isel_context* ctx, int interp_vgpr, unsigned attr_index, unsigned comp)
12181 {
12182    Builder bld(ctx->program, ctx->block);
12183 
12184    Temp dst = bld.tmp(v1);
12185 
12186    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
12187 
12188    if (interp_vgpr != -1) {
12189       /* interp args are all 2 vgprs */
12190       int arg_index = ctx->args->persp_sample.arg_index + interp_vgpr / 2;
12191       Temp interp_ij = ctx->arg_temps[arg_index];
12192 
12193       emit_interp_instr(ctx, attr_index, comp, interp_ij, dst, prim_mask, false);
12194    } else {
12195       emit_interp_mov_instr(ctx, attr_index, comp, 0, dst, prim_mask, false);
12196    }
12197 
12198    return dst;
12199 }
12200 
12201 void
12202 interpolate_color_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo,
12203                        std::vector<Operand>& regs)
12204 {
12205    if (!finfo->colors_read)
12206       return;
12207 
12208    Builder bld(ctx->program, ctx->block);
12209 
12210    unsigned vgpr = 256 + ctx->args->num_vgprs_used;
12211 
12212    if (finfo->color_two_side) {
12213       Temp face = get_arg(ctx, ctx->args->front_face);
12214       Temp is_face_positive =
12215          bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), face);
12216 
12217       u_foreach_bit (i, finfo->colors_read) {
12218          unsigned color_index = i / 4;
12219          unsigned front_index = finfo->color_attr_index[color_index];
12220          int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
12221 
12222          /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
12223           * otherwise it's at offset "num_inputs".
12224           */
12225          unsigned back_index = finfo->num_interp_inputs;
12226          if (color_index == 1 && finfo->colors_read & 0xf)
12227             back_index++;
12228 
12229          Temp front = get_interp_color(ctx, interp_vgpr, front_index, i % 4);
12230          Temp back = get_interp_color(ctx, interp_vgpr, back_index, i % 4);
12231 
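         /* v_cndmask_b32 selects src1 for lanes whose condition bit is set, so
          * front-facing lanes (face != 0) receive the front color.
          */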
12232          Temp color =
12233             bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), back, front, is_face_positive);
12234 
12235          regs.emplace_back(Operand(color, PhysReg{vgpr++}));
12236       }
12237    } else {
12238       u_foreach_bit (i, finfo->colors_read) {
12239          unsigned color_index = i / 4;
12240          unsigned attr_index = finfo->color_attr_index[color_index];
12241          int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
12242          Temp color = get_interp_color(ctx, interp_vgpr, attr_index, i % 4);
12243 
12244          regs.emplace_back(Operand(color, PhysReg{vgpr++}));
12245       }
12246    }
12247 }
12248 
12249 void
12250 emit_clamp_alpha_test(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
12251                       unsigned color_index)
12252 {
12253    Builder bld(ctx->program, ctx->block);
12254 
12255    if (info->clamp_color) {
12256       for (unsigned i = 0; i < 4; i++) {
12257          if (colors[i].regClass() == v2b) {
12258             colors[i] = bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
12259                                  Operand::c16(0x3c00), colors[i]);
12260          } else {
12261             assert(colors[i].regClass() == v1);
12262             colors[i] = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
12263                                  Operand::c32(0x3f800000u), colors[i]);
12264          }
12265       }
12266    }
12267 
12268    if (info->alpha_to_one) {
12269       if (colors[3].regClass() == v2b)
12270          colors[3] = bld.copy(bld.def(v2b), Operand::c16(0x3c00));
12271       else
12272          colors[3] = bld.copy(bld.def(v1), Operand::c32(0x3f800000u));
12273    }
12274 
12275    if (color_index == 0 && info->alpha_func != COMPARE_FUNC_ALWAYS) {
12276       Operand cond = Operand::c32(-1u);
12277       if (info->alpha_func != COMPARE_FUNC_NEVER) {
12278          aco_opcode opcode = aco_opcode::num_opcodes;
12279 
12280          switch (info->alpha_func) {
12281          case COMPARE_FUNC_LESS: opcode = aco_opcode::v_cmp_ngt_f32; break;
12282          case COMPARE_FUNC_EQUAL: opcode = aco_opcode::v_cmp_neq_f32; break;
12283          case COMPARE_FUNC_LEQUAL: opcode = aco_opcode::v_cmp_nge_f32; break;
12284          case COMPARE_FUNC_GREATER: opcode = aco_opcode::v_cmp_nlt_f32; break;
12285          case COMPARE_FUNC_NOTEQUAL: opcode = aco_opcode::v_cmp_nlg_f32; break;
12286          case COMPARE_FUNC_GEQUAL: opcode = aco_opcode::v_cmp_nle_f32; break;
12287          default: unreachable("invalid alpha func");
12288          }
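         /* Each opcode is the negated, NaN-true form of the pass condition: e.g. LESS
          * passes when alpha < ref, so we discard when !(ref > alpha), which is exactly
          * v_cmp_ngt_f32(ref, alpha) and also discards NaN alpha values.
          */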
12289 
12290          Temp ref = get_arg(ctx, info->alpha_reference);
12291 
12292          Temp alpha = colors[3].regClass() == v2b
12293                          ? bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), colors[3])
12294                          : colors[3];
12295 
12296          /* true if not pass */
12297          cond = bld.vopc(opcode, bld.def(bld.lm), ref, alpha);
12298       }
12299 
12300       bld.pseudo(aco_opcode::p_discard_if, cond);
12301       ctx->block->kind |= block_kind_uses_discard;
12302       ctx->program->needs_exact = true;
12303    }
12304 }
12305 
12306 } /* end namespace */
12307 
12308 void
12309 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
12310                ac_shader_config* config, const struct aco_compiler_options* options,
12311                const struct aco_shader_info* info, const struct ac_shader_args* args)
12312 {
12313    isel_context ctx =
12314       setup_isel_context(program, shader_count, shaders, config, options, info, args);
12315 
12316    if (ctx.stage == raytracing_cs)
12317       return select_program_rt(ctx, shader_count, shaders, args);
12318 
12319    if (shader_count >= 2) {
12320       select_program_merged(ctx, shader_count, shaders);
12321    } else {
12322       bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
12323       if_context ic_merged_wave_info;
12324 
12325       /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
12326       if (ctx.program->info.merged_shader_compiled_separately) {
12327          assert(ctx.program->gfx_level >= GFX9);
12328          if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
12329             check_merged_wave_info = endif_merged_wave_info = true;
12330          } else {
12331             const bool ngg_gs =
12332                ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
12333             assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
12334             check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
12335             need_barrier = !ngg_gs;
12336          }
12337       }
12338 
12339       select_shader(ctx, shaders[0], true, true, need_barrier, &ic_merged_wave_info,
12340                     check_merged_wave_info, endif_merged_wave_info);
12341    }
12342 }
12343 
12344 void
12345 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
12346                            const struct aco_compiler_options* options,
12347                            const struct aco_shader_info* info, const struct ac_shader_args* args)
12348 {
12349    assert(options->gfx_level == GFX8);
12350 
12351    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12352                 config);
12353 
12354    isel_context ctx = {};
12355    ctx.program = program;
12356    ctx.args = args;
12357    ctx.options = options;
12358    ctx.stage = program->stage;
12359 
12360    ctx.block = ctx.program->create_and_insert_block();
12361    ctx.block->kind = block_kind_top_level;
12362 
12363    program->workgroup_size = 1; /* XXX */
12364 
12365    add_startpgm(&ctx);
12366    append_logical_start(ctx.block);
12367 
12368    Builder bld(ctx.program, ctx.block);
12369 
12370    /* Load the buffer descriptor from TMA. */
12371    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
12372             Operand::zero());
12373 
12374    ac_hw_cache_flags cache_glc;
12375    cache_glc.value = ac_glc;
12376 
12377    /* Store TTMP0-TTMP1. */
12378    bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
12379             Operand(PhysReg{ttmp0}, s2), memory_sync_info(), cache_glc);
12380 
12381    uint32_t hw_regs_idx[] = {
12382       2, /* HW_REG_STATUS */
12383       3, /* HW_REG_TRAP_STS */
12384       4, /* HW_REG_HW_ID */
12385       7, /* HW_REG_IB_STS */
12386    };
12387 
12388    /* Store some hardware registers. */
12389    for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
12390       /* "((size - 1) << 11) | register" */
12391       bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
12392                ((20 - 1) << 11) | hw_regs_idx[i]);
12393 
12394       bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
12395                Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(),
12396                cache_glc);
12397    }
12398 
12399    program->config->float_mode = program->blocks[0].fp_mode.val;
12400 
12401    append_logical_end(ctx.block);
12402    ctx.block->kind |= block_kind_uniform;
12403    bld.sopp(aco_opcode::s_endpgm);
12404 
12405    finish_program(&ctx);
12406 }
12407 
12408 Operand
12409 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
12410 {
12411    enum ac_arg_regfile file = args->args[arg.arg_index].file;
12412    unsigned size = args->args[arg.arg_index].size;
12413    RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
12414    return Operand(get_arg_reg(args, arg), rc);
12415 }
12416 
12417 unsigned
12418 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
12419 {
12420    unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
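   /* Each vertex buffer descriptor is 4 dwords, so batch the fetches as dwordx16/x8/x4
    * loads: the largest power-of-two descriptor count (at most 4) still remaining.
    */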
12421    for (unsigned i = 0; i < count;) {
12422       unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
12423 
12424       if (size == 4)
12425          bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
12426                   Operand::c32((start + i) * 16u));
12427       else if (size == 2)
12428          bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
12429                   Operand::c32((start + i) * 16u));
12430       else
12431          bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
12432                   Operand::c32((start + i) * 16u));
12433 
12434       dest = dest.advance(size * 16u);
12435       i += size;
12436    }
12437 
12438    return count;
12439 }
12440 
12441 void
12442 wait_for_smem_loads(Builder& bld)
12443 {
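   /* GFX12 splits the unified s_waitcnt into per-counter waits; SMEM loads are tracked
    * by kmcnt there and by lgkmcnt on older generations.
    */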
12444    if (bld.program->gfx_level >= GFX12) {
12445       bld.sopp(aco_opcode::s_wait_kmcnt, 0);
12446    } else {
12447       wait_imm lgkm_imm;
12448       lgkm_imm.lgkm = 0;
12449       bld.sopp(aco_opcode::s_waitcnt, lgkm_imm.pack(bld.program->gfx_level));
12450    }
12451 }
12452 
12453 void
12454 wait_for_vmem_loads(Builder& bld)
12455 {
12456    if (bld.program->gfx_level >= GFX12) {
12457       bld.sopp(aco_opcode::s_wait_loadcnt, 0);
12458    } else {
12459       wait_imm vm_imm;
12460       vm_imm.vm = 0;
12461       bld.sopp(aco_opcode::s_waitcnt, vm_imm.pack(bld.program->gfx_level));
12462    }
12463 }
12464 
12465 Operand
12466 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
12467                             const struct aco_vs_prolog_info* pinfo, unsigned index,
12468                             Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
12469                             PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
12470 {
12471    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
12472             get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
12473 
12474    wait_for_smem_loads(bld);
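   /* The two dwords loaded above describe a fast division by the instance divisor,
    * matching the util/fast_idiv_by_const scheme: dword 0 packs pre-shift (byte 0),
    * increment (byte 1) and post-shift (byte 2), dword 1 holds the multiplier, giving
    * index = umulhi((instance_id >> pre_shift) + increment, multiplier) >> post_shift.
    */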
12475 
12476    Definition fetch_index_def(tmp_vgpr0, v1);
12477    Operand fetch_index(tmp_vgpr0, v1);
12478 
12479    Operand div_info(tmp_sgpr, s1);
12480    if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
12481       /* use SDWA */
12482       if (bld.program->gfx_level < GFX9) {
12483          bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
12484          div_info = Operand(tmp_vgpr1, v1);
12485       }
12486 
12487       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12488 
12489       Instruction* instr;
12490       if (bld.program->gfx_level >= GFX9)
12491          instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
12492       else
12493          instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
12494                                div_info, fetch_index)
12495                     .instr;
12496       instr->sdwa().sel[0] = SubdwordSel::ubyte1;
12497 
12498       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
12499                fetch_index);
12500 
12501       instr =
12502          bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
12503       instr->sdwa().sel[0] = SubdwordSel::ubyte2;
12504    } else {
12505       Operand tmp_op(tmp_vgpr1, v1);
12506       Definition tmp_def(tmp_vgpr1, v1);
12507 
12508       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12509 
12510       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
12511       bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
12512 
12513       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
12514                Operand(tmp_sgpr.advance(4), s1));
12515 
12516       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
12517       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
12518    }
12519 
12520    bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
12521 
12522    return fetch_index;
12523 }
12524 
12525 void
12526 select_rt_prolog(Program* program, ac_shader_config* config,
12527                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12528                  const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
12529 {
12530    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12531                 config);
12532    Block* block = program->create_and_insert_block();
12533    block->kind = block_kind_top_level;
12534    program->workgroup_size = info->workgroup_size;
12535    program->wave_size = info->workgroup_size;
12536    calc_min_waves(program);
12537    Builder bld(program, block);
12538    block->instructions.reserve(32);
12539    unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
12540    unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
12541 
12542    /* Inputs:
12543     * Ring offsets:                s[0-1]
12544     * Indirect descriptor sets:    s[2]
12545     * Push constants pointer:      s[3]
12546     * SBT descriptors:             s[4-5]
12547     * Traversal shader address:    s[6-7]
12548     * Ray launch size address:     s[8-9]
12549     * Dynamic callable stack base: s[10]
12550     * Workgroup IDs (xyz):         s[11], s[12], s[13]
12551     * Scratch offset:              s[14]
12552     * Local invocation IDs:        v[0-2]
12553     */
12554    PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
12555    PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
12556    PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
12557    PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
12558    PhysReg in_wg_id_x;
12559    PhysReg in_wg_id_y;
12560    PhysReg in_wg_id_z;
12561    PhysReg in_scratch_offset;
12562    if (options->gfx_level < GFX12) {
12563       in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
12564       in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
12565       in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
12566    } else {
12567       in_wg_id_x = PhysReg(108 + 9 /*ttmp9*/);
12568       in_wg_id_y = PhysReg(108 + 7 /*ttmp7*/);
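      /* On GFX12 the workgroup IDs arrive in ttmp registers rather than SGPR args:
       * ttmp9 holds X, while ttmp7 packs Y in bits [15:0] and Z in bits [31:16]
       * (hence the shift by 16 and the 16-bit multiply when computing launch IDs).
       */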
12569    }
12570    if (options->gfx_level < GFX11)
12571       in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
12572    PhysReg in_local_ids[2] = {
12573       get_arg_reg(in_args, in_args->local_invocation_ids),
12574       get_arg_reg(in_args, in_args->local_invocation_ids).advance(4),
12575    };
12576 
12577    /* Outputs:
12578     * Callee shader PC:            s[0-1]
12579     * Indirect descriptor sets:    s[2]
12580     * Push constants pointer:      s[3]
12581     * SBT descriptors:             s[4-5]
12582     * Traversal shader address:    s[6-7]
12583     * Ray launch sizes (xyz):      s[8], s[9], s[10]
12584     * Scratch offset (<GFX9 only): s[11]
12585     * Ring offsets (<GFX9 only):   s[12-13]
12586     * Ray launch IDs:              v[0-2]
12587     * Stack pointer:               v[3]
12588     * Shader VA:                   v[4-5]
12589     * Shader Record Ptr:           v[6-7]
12590     */
12591    PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12592    PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]);
12593    PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]);
12594    PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]);
12595    PhysReg out_launch_ids[3];
12596    for (unsigned i = 0; i < 3; i++)
12597       out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]);
12598    PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12599    PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12600 
12601    /* Temporaries: */
12602    num_sgprs = align(num_sgprs, 2);
12603    PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
12604    num_sgprs += 2;
12605    PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
12606    num_sgprs += 2;
12607    PhysReg tmp_wg_id_x_times_size = PhysReg{num_sgprs};
12608    num_sgprs++;
12609 
12610    PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++};
12611 
12612    /* Confirm some assumptions about register aliasing */
12613    assert(in_ring_offsets == out_uniform_shader_addr);
12614    assert(get_arg_reg(in_args, in_args->push_constants) ==
12615           get_arg_reg(out_args, out_args->push_constants));
12616    assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12617           get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12618    assert(in_launch_size_addr == out_launch_size_x);
12619    assert(in_stack_base == out_launch_size_z);
12620    assert(in_local_ids[0] == out_launch_ids[0]);
12621 
12622    /* <gfx9 reads in_scratch_offset at the end of the prolog to write out the scratch_offset
12623     * arg. Make sure no other outputs have overwritten it by then.
12624     */
12625    assert(options->gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used);
12626 
12627    /* load raygen sbt */
12628    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12629             Operand::c32(0u));
12630 
12631    /* init scratch */
12632    if (options->gfx_level < GFX9) {
12633       /* copy ring offsets to temporary location */
12634       bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12635                Operand(in_ring_offsets, s2));
12636    } else if (options->gfx_level < GFX11) {
12637       hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12638                       Operand(in_scratch_offset, s1));
12639    }
12640 
12641    /* set stack ptr */
12642    bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12643 
12644    /* load raygen address */
12645    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12646             Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12647 
12648    /* load ray launch sizes */
12649    bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12650             Operand(in_launch_size_addr, s2), Operand::c32(8u));
12651    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12652             Operand(in_launch_size_addr, s2), Operand::c32(0u));
12653 
12654    /* calculate ray launch ids */
12655    if (options->gfx_level >= GFX11) {
12656       /* Thread IDs are packed in VGPR0, 10 bits per component. */
12657       bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12658                Operand::c32(10u), Operand::c32(3u));
12659       bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12660                Operand(in_local_ids[0], v1));
12661    }
12662    /* Do this backwards to reduce some RAW hazards on GFX11+ */
12663    if (options->gfx_level >= GFX12) {
12664       bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
12665                    Operand(in_wg_id_y, s1));
12666       bld.vop3(aco_opcode::v_mad_u32_u16, Definition(out_launch_ids[1], v1),
12667                Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8),
12668                Operand(in_local_ids[1], v1));
12669    } else {
12670       bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12671       bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1),
12672                Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8),
12673                Operand(in_local_ids[1], v1));
12674    }
12675    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12676             Operand::c32(8), Operand(in_local_ids[0], v1));
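   /* The launch IDs assume an 8-thread-wide tile: id.x = wg_id.x * 8 + local_id.x and
    * id.y = wg_id.y * (4 or 8) + local_id.y, matching a 32- or 64-thread workgroup.
    */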
12677 
12678    /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
12679    if (options->gfx_level < GFX9) {
12680       bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12681                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12682    } else {
12683       bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12684                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12685    }
12686    bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12687             Operand(tmp_raygen_sbt.advance(4), s1));
12688 
12689    /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs.
12690     * The 1D launch ID is: id = local_invocation_index + (wg_id.x * wg_size);
12691     * the shift below places wg_id.x * wg_size in tmp_wg_id_x_times_size.
12692     */
12693    bld.sop2(aco_opcode::s_lshl_b32, Definition(tmp_wg_id_x_times_size, s1), Definition(scc, s1),
12694             Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6));
12695 
12696    /* Calculate and add local_invocation_index */
12697    bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u),
12698             Operand(tmp_wg_id_x_times_size, s1));
12699    if (program->wave_size == 64) {
12700       if (program->gfx_level <= GFX7)
12701          bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1),
12702                   Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12703       else
12704          bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(tmp_invocation_idx, v1),
12705                   Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12706    }
12707 
12708    /* Make fixup operations a no-op if this is not a converted 2D dispatch. */
12709    bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1),
12710             Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1));
12711    bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
12712             Operand::c32_or_c64(-1u, program->wave_size == 64),
12713             Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
12714    bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
12715             Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
12716    bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
12717             Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
12718 
12719    if (options->gfx_level < GFX9) {
12720       /* write scratch/ring offsets to outputs, if needed */
12721       bld.sop1(aco_opcode::s_mov_b32,
12722                Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12723                Operand(in_scratch_offset, s1));
12724       bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12725                Operand(tmp_ring_offsets, s2));
12726    }
12727 
12728    /* jump to raygen */
12729    bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12730 
12731    program->config->float_mode = program->blocks[0].fp_mode.val;
12732    program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12733    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12734 }
12735 
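/* Reserve "size" consecutive VGPRs, with *num tracking the high-water mark. With a
 * non-NULL offset, allocation starts at *num + *offset (a negative offset reuses VGPRs
 * below the mark) and the offset creeps forward until an allocation would grow past the
 * mark, at which point the mark is raised and the offset resets to 0.
 */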
12736 PhysReg
12737 get_next_vgpr(unsigned size, unsigned* num, int* offset = NULL)
12738 {
12739    unsigned reg = *num + (offset ? *offset : 0);
12740    if (reg + size >= *num) {
12741       *num = reg + size;
12742       if (offset)
12743          *offset = 0;
12744    } else if (offset) {
12745       *offset += size;
12746    }
12747    return PhysReg(256 + reg);
12748 }
12749 
12750 struct UnalignedVsAttribLoad {
12751    /* dst/scratch are PhysReg converted to unsigned */
12752    unsigned dst;
12753    unsigned scratch;
12754    bool d16;
12755    const struct ac_vtx_format_info* vtx_info;
12756 };
12757 
12758 struct UnalignedVsAttribLoadState {
12759    unsigned max_vgprs;
12760    unsigned initial_num_vgprs;
12761    unsigned* num_vgprs;
12762    unsigned overflow_num_vgprs;
12763    aco::small_vec<UnalignedVsAttribLoad, 16> current_loads;
12764 };
12765 
12766 void
12767 convert_unaligned_vs_attrib(Builder& bld, UnalignedVsAttribLoad load)
12768 {
12769    PhysReg dst(load.dst);
12770    PhysReg scratch(load.scratch);
12771    const struct ac_vtx_format_info* vtx_info = load.vtx_info;
12772    unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12773    unsigned nfmt = vtx_info->hw_format[0] >> 4;
12774 
12775    unsigned size = vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size;
12776    if (load.d16) {
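      /* The d16 loads placed bytes 0 and 2 in dst's two 16-bit halves and bytes 1 and 3
       * in scratch's halves, so (scratch << 8) | dst reassembles the original dword.
       */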
12777       bld.vop3(aco_opcode::v_lshl_or_b32, Definition(dst, v1), Operand(scratch, v1),
12778                Operand::c32(8), Operand(dst, v1));
12779    } else {
12780       for (unsigned i = 1; i < size; i++) {
12781          PhysReg byte_reg = scratch.advance(i * 4 - 4);
12782          if (bld.program->gfx_level >= GFX9) {
12783             bld.vop3(aco_opcode::v_lshl_or_b32, Definition(dst, v1), Operand(byte_reg, v1),
12784                      Operand::c32(i * 8), Operand(dst, v1));
12785          } else {
12786             bld.vop2(aco_opcode::v_lshlrev_b32, Definition(byte_reg, v1), Operand::c32(i * 8),
12787                      Operand(byte_reg, v1));
12788             bld.vop2(aco_opcode::v_or_b32, Definition(dst, v1), Operand(dst, v1),
12789                      Operand(byte_reg, v1));
12790          }
12791       }
12792    }
12793 
12794    unsigned num_channels = vtx_info->chan_byte_size ? 1 : vtx_info->num_channels;
12795    PhysReg chan[4] = {dst, dst.advance(4), dst.advance(8), dst.advance(12)};
12796 
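   /* Unpacked 10- and 11-bit floats share f16's 5-bit exponent; shifting their 5/6-bit
    * mantissas left by 5/4 bits yields valid f16 values, converted to f32 further down.
    */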
12797    if (dfmt == V_008F0C_BUF_DATA_FORMAT_10_11_11) {
12798       bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[2], v1), Operand(dst, v1), Operand::c32(22),
12799                Operand::c32(10));
12800       bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[1], v1), Operand(dst, v1), Operand::c32(11),
12801                Operand::c32(11));
12802       bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[0], v1), Operand(dst, v1), Operand::c32(0),
12803                Operand::c32(11));
12804       bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[2], v1), Operand::c32(5),
12805                Operand(chan[2], v1));
12806       bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[1], v1), Operand::c32(4),
12807                Operand(chan[1], v1));
12808       bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[0], v1), Operand::c32(4),
12809                Operand(chan[0], v1));
12810    } else if (dfmt == V_008F0C_BUF_DATA_FORMAT_2_10_10_10) {
12811       aco_opcode bfe = aco_opcode::v_bfe_u32;
12812       switch (nfmt) {
12813       case V_008F0C_BUF_NUM_FORMAT_SNORM:
12814       case V_008F0C_BUF_NUM_FORMAT_SSCALED:
12815       case V_008F0C_BUF_NUM_FORMAT_SINT: bfe = aco_opcode::v_bfe_i32; break;
12816       default: break;
12817       }
12818 
12819       bool swapxz = G_008F0C_DST_SEL_X(vtx_info->dst_sel) != V_008F0C_SQ_SEL_X;
12820       bld.vop3(bfe, Definition(chan[3], v1), Operand(dst, v1), Operand::c32(30), Operand::c32(2));
12821       bld.vop3(bfe, Definition(chan[2], v1), Operand(dst, v1), Operand::c32(swapxz ? 0 : 20),
12822                Operand::c32(10));
12823       bld.vop3(bfe, Definition(chan[1], v1), Operand(dst, v1), Operand::c32(10), Operand::c32(10));
12824       bld.vop3(bfe, Definition(chan[0], v1), Operand(dst, v1), Operand::c32(swapxz ? 20 : 0),
12825                Operand::c32(10));
12826    } else if (dfmt == V_008F0C_BUF_DATA_FORMAT_8 || dfmt == V_008F0C_BUF_DATA_FORMAT_16) {
12827       unsigned bits = dfmt == V_008F0C_BUF_DATA_FORMAT_8 ? 8 : 16;
12828       switch (nfmt) {
12829       case V_008F0C_BUF_NUM_FORMAT_SNORM:
12830       case V_008F0C_BUF_NUM_FORMAT_SSCALED:
12831       case V_008F0C_BUF_NUM_FORMAT_SINT:
12832          bld.vop3(aco_opcode::v_bfe_i32, Definition(dst, v1), Operand(dst, v1), Operand::c32(0),
12833                   Operand::c32(bits));
12834          break;
12835       default: break;
12836       }
12837    }
12838 
12839    if (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT &&
12840        (dfmt == V_008F0C_BUF_DATA_FORMAT_16 || dfmt == V_008F0C_BUF_DATA_FORMAT_10_11_11)) {
12841       for (unsigned i = 0; i < num_channels; i++)
12842          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(chan[i], v1), Operand(chan[i], v1));
12843    } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED || nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
12844       for (unsigned i = 0; i < num_channels; i++)
12845          bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(chan[i], v1), Operand(chan[i], v1));
12846    } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_SSCALED || nfmt == V_008F0C_BUF_NUM_FORMAT_SNORM) {
12847       for (unsigned i = 0; i < num_channels; i++)
12848          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(chan[i], v1), Operand(chan[i], v1));
12849    }
12850 
12851    std::array<unsigned, 4> chan_max;
12852    switch (dfmt) {
12853    case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: chan_max = {1023, 1023, 1023, 3}; break;
12854    case V_008F0C_BUF_DATA_FORMAT_8: chan_max = {255, 255, 255, 255}; break;
12855    case V_008F0C_BUF_DATA_FORMAT_16: chan_max = {65535, 65535, 65535, 65535}; break;
12856    }
12857 
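   /* UNORM maps [0, max] to [0.0, 1.0] by scaling with 1/max; SNORM scales with
    * 1/(max/2) and clamps against -1.0 (0xbf800000) to map to [-1.0, 1.0].
    */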
12858    if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
12859       for (unsigned i = 0; i < num_channels; i++)
12860          bld.vop2(aco_opcode::v_mul_f32, Definition(chan[i], v1),
12861                   Operand::c32(fui(1.0 / chan_max[i])), Operand(chan[i], v1));
12862    } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_SNORM) {
12863       for (unsigned i = 0; i < num_channels; i++) {
12864          bld.vop2(aco_opcode::v_mul_f32, Definition(chan[i], v1),
12865                   Operand::c32(fui(1.0 / (chan_max[i] >> 1))), Operand(chan[i], v1));
12866          bld.vop2(aco_opcode::v_max_f32, Definition(chan[i], v1), Operand::c32(0xbf800000),
12867                   Operand(chan[i], v1));
12868       }
12869    }
12870 }
12871 
12872 void
12873 convert_current_unaligned_vs_attribs(Builder& bld, UnalignedVsAttribLoadState* state)
12874 {
12875    if (state->current_loads.empty())
12876       return;
12877 
12878    wait_for_vmem_loads(bld);
12879 
12880    for (UnalignedVsAttribLoad load : state->current_loads)
12881       convert_unaligned_vs_attrib(bld, load);
12882    state->current_loads.clear();
12883 
12884    state->overflow_num_vgprs = state->initial_num_vgprs;
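   /* The scratch VGPRs are dead once their loads are converted, so restart the scratch
    * allocation at the initial mark and let subsequent loads reuse them.
    */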
12885    state->num_vgprs = &state->overflow_num_vgprs;
12886 }
12887 
12888 void
12889 load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index, uint32_t offset,
12890                          const struct ac_vtx_format_info* vtx_info,
12891                          UnalignedVsAttribLoadState* state)
12892 {
12893    unsigned size = vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size;
12894 
12895    UnalignedVsAttribLoad load;
12896    load.dst = dst;
12897    load.vtx_info = vtx_info;
12898    load.d16 = bld.program->gfx_level >= GFX9 && !bld.program->dev.sram_ecc_enabled && size == 4;
12899 
12900    unsigned num_scratch_vgprs = load.d16 ? 1 : (size - 1);
12901    if (!vtx_info->chan_byte_size) {
12902       /* When chan_byte_size==0, we're loading the entire attribute, so we can use the last 3
12903        * components of the destination.
12904        */
12905       assert(num_scratch_vgprs <= 3);
12906       load.scratch = dst.advance(4);
12907    } else {
12908       if (*state->num_vgprs + num_scratch_vgprs > state->max_vgprs)
12909          convert_current_unaligned_vs_attribs(bld, state);
12910 
12911       load.scratch = get_next_vgpr(num_scratch_vgprs, state->num_vgprs, NULL);
12912    }
12913 
12914    PhysReg scratch(load.scratch);
12915    if (load.d16) {
12916       bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(dst, v1), desc, index,
12917                 Operand::c32(0u), offset, false, true);
12918       bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(dst, v1), desc, index,
12919                 Operand::c32(0u), offset + 2, false, true);
12920       bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(scratch, v1), desc, index,
12921                 Operand::c32(0u), offset + 1, false, true);
12922       bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(scratch, v1), desc, index,
12923                 Operand::c32(0u), offset + 3, false, true);
12924    } else {
12925       for (unsigned i = 0; i < size; i++) {
12926          Definition def(i ? scratch.advance(i * 4 - 4) : dst, v1);
12927          bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(offset + i), 0,
12928                    false, true);
12929       }
12930    }
12931 
12932    state->current_loads.push_back(load);
12933 }
12934 
12935 void
12936 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12937                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12938                  const struct ac_shader_args* args)
12939 {
12940    assert(pinfo->num_attributes > 0);
12941 
12942    /* This should be enough for any shader/stage. */
12943    unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12944 
12945    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12946                 config);
12947    program->dev.vgpr_limit = 256;
12948 
12949    Block* block = program->create_and_insert_block();
12950    block->kind = block_kind_top_level;
12951 
12952    program->workgroup_size = 64;
12953    calc_min_waves(program);
12954 
12955    Builder bld(program, block);
12956 
12957    block->instructions.reserve(16 + pinfo->num_attributes * 4);
12958 
12959    /* Raise priority for performance, and to work around the GFX11.5
12960     * FeatureRequiredExportPriority issue. */
12961    bld.sopp(aco_opcode::s_setprio, 3);
12962 
12963    uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12964    bool has_nontrivial_divisors = pinfo->nontrivial_divisors;
12965 
12966    /* choose sgprs */
12967    PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12968    PhysReg prolog_input = vertex_buffers.advance(8);
12969    PhysReg desc(
12970       align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12971 
12972    Operand start_instance = get_arg_fixed(args, args->start_instance);
12973    Operand instance_id = get_arg_fixed(args, args->instance_id);
12974 
12975    bool needs_instance_index =
12976       pinfo->instance_rate_inputs &
12977       ~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
12978    bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
12979    bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
12980    bool needs_tmp_vgpr0 = has_nontrivial_divisors;
12981    bool needs_tmp_vgpr1 = has_nontrivial_divisors &&
12982                           (program->gfx_level <= GFX8 || program->gfx_level >= GFX11);
12983 
12984    int vgpr_offset = pinfo->misaligned_mask & (1u << (pinfo->num_attributes - 1)) ? 0 : -4;
12985 
12986    unsigned num_vgprs = args->num_vgprs_used;
12987    PhysReg attributes_start = get_next_vgpr(pinfo->num_attributes * 4, &num_vgprs);
12988    PhysReg vertex_index, instance_index, start_instance_vgpr, nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1;
12989    if (needs_vertex_index)
12990       vertex_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12991    if (needs_instance_index)
12992       instance_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12993    if (needs_start_instance)
12994       start_instance_vgpr = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12995    if (needs_tmp_vgpr0)
12996       nontrivial_tmp_vgpr0 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12997    if (needs_tmp_vgpr1)
12998       nontrivial_tmp_vgpr1 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12999 
13000    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
13001             get_arg_fixed(args, args->vertex_buffers));
13002    if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
13003       bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
13004                options->address32_hi & 0xFFFF);
13005    } else {
13006       bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
13007                Operand::c32((unsigned)options->address32_hi));
13008    }
13009 
13010    const struct ac_vtx_format_info* vtx_info_table =
13011       ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
13012 
13013    UnalignedVsAttribLoadState unaligned_state;
13014    unaligned_state.max_vgprs = MAX2(84, num_vgprs + 8);
13015    unaligned_state.initial_num_vgprs = num_vgprs;
13016    unaligned_state.num_vgprs = &num_vgprs;
13017 
13018    unsigned num_sgprs = 0;
13019    for (unsigned loc = 0; loc < pinfo->num_attributes;) {
13020       unsigned num_descs =
13021          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
13022       num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
13023 
13024       if (loc == 0) {
13025          /* perform setup while we load the descriptors */
13026          if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
13027             Operand count = get_arg_fixed(args, args->merged_wave_info);
13028             bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
13029             if (program->wave_size == 64) {
13030                bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
13031                         Operand::c32(6u /* log2(64) */));
13032                bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
13033                         Operand(exec, s2), Operand(scc, s1));
13034             }
13035          }
13036 
13037          /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
13038          if (info->hw_stage == AC_HW_HULL_SHADER && options->has_ls_vgpr_init_bug) {
13039             /* We don't want load_vb_descs() to write vcc. */
13040             assert(program->dev.sgpr_limit <= vcc.reg());
13041 
13042             bld.sop2(aco_opcode::s_bfe_u32, Definition(vcc, s1), Definition(scc, s1),
13043                      get_arg_fixed(args, args->merged_wave_info), Operand::c32((8u << 16) | 8u));
13044             bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm), Operand::c32(-1), Operand::zero(),
13045                      Operand(scc, s1));
13046 
13047             /* These copies are ordered so that vertex_id=tcs_patch_id doesn't overwrite vertex_id
13048              * before instance_id=vertex_id. */
13049             ac_arg src_args[] = {args->vertex_id, args->tcs_rel_ids, args->tcs_patch_id};
13050             ac_arg dst_args[] = {args->instance_id, args->vs_rel_patch_id, args->vertex_id};
13051             for (unsigned i = 0; i < 3; i++) {
13052                bld.vop2(aco_opcode::v_cndmask_b32, Definition(get_arg_reg(args, dst_args[i]), v1),
13053                         get_arg_fixed(args, src_args[i]), get_arg_fixed(args, dst_args[i]),
13054                         Operand(vcc, bld.lm));
13055             }
13056          }
13057 
13058          if (needs_vertex_index)
13059             bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
13060                        get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
13061          if (needs_instance_index)
13062             bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
13063                        Operand(s2), true);
13064          if (needs_start_instance)
13065             bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
13066       }
13067 
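      /* The descriptors loaded above are consumed by the buffer loads below, so
       * all outstanding SMEM loads must have completed. */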
13068       wait_for_smem_loads(bld);
13069 
13070       for (unsigned i = 0; i < num_descs;) {
13071          PhysReg dest(attributes_start.reg() + loc * 4u);
13072 
13073          /* Calculate the fetch index: per-vertex attributes use the vertex index,
               * while per-instance attributes use start_instance when the divisor is 0,
               * the precomputed instance_index when it is 1, and a nontrivially divided
               * instance id otherwise. */
13074          Operand fetch_index = Operand(vertex_index, v1);
13075          if (pinfo->instance_rate_inputs & (1u << loc)) {
13076             if (!(pinfo->zero_divisors & (1u << loc))) {
13077                fetch_index = instance_id;
13078                if (pinfo->nontrivial_divisors & (1u << loc)) {
13079                   unsigned index = util_bitcount(pinfo->nontrivial_divisors & BITFIELD_MASK(loc));
13080                   fetch_index = calc_nontrivial_instance_id(
13081                      bld, args, pinfo, index, instance_id, start_instance, prolog_input,
13082                      nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
13083                } else {
13084                   fetch_index = Operand(instance_index, v1);
13085                }
13086             } else {
13087                fetch_index = Operand(start_instance_vgpr, v1);
13088             }
13089          }
13090 
13091          /* perform load */
13092          PhysReg cur_desc = desc.advance(i * 16);
13093          if ((pinfo->misaligned_mask & (1u << loc))) {
13094             const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
13095 
13096             assert(vtx_info->has_hw_format & 0x1);
13097             unsigned dfmt = vtx_info->hw_format[0] & 0xf;
13098             unsigned nfmt = vtx_info->hw_format[0] >> 4;
13099 
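            /* Misaligned attributes are fetched one channel at a time; post_shuffle
             * reverses the first three channels (j -> 2 - j) to turn the BGRA
             * component order of swizzled formats into RGBA. */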
13100             for (unsigned j = 0; j < (vtx_info->chan_byte_size ? vtx_info->num_channels : 1); j++) {
13101                bool post_shuffle = pinfo->post_shuffle & (1u << loc);
13102                unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
13103 
13104                if ((pinfo->unaligned_mask & (1u << loc)) && vtx_info->chan_byte_size <= 4)
13105                   load_unaligned_vs_attrib(bld, dest.advance(j * 4u), Operand(cur_desc, s4),
13106                                            fetch_index, offset, vtx_info, &unaligned_state);
13107                else if (vtx_info->chan_byte_size == 8)
13108                   bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
13109                             Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
13110                             fetch_index, Operand::c32(offset), dfmt, nfmt, 0, false, true);
13111                else
13112                   bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
13113                             Operand(cur_desc, s4), fetch_index, Operand::c32(offset), dfmt, nfmt,
13114                             0, false, true);
13115             }
13116 
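            /* 64-bit formats with more than two channels occupy two attribute slots
             * (and two descriptors). */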
13117             unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
13118             loc += slots;
13119             i += slots;
13120          } else {
13121             bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
13122                       Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, true);
13123             loc++;
13124             i++;
13125          }
13126       }
13127    }
13128 
13129    uint32_t constant_mask = pinfo->misaligned_mask;
13130    while (constant_mask) {
13131       unsigned loc = u_bit_scan(&constant_mask);
13132       const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
13133 
13134       /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
13135        * For 64-bit data types, no default attribute values are provided. Input variables must
13136        * not use more components than provided by the attribute.
13137        */
13138       if (vtx_info->chan_byte_size == 8) {
13139          if (vtx_info->num_channels > 2)
13140             u_bit_scan(&constant_mask);
13141          continue;
13142       }
13143 
13144       assert(vtx_info->has_hw_format & 0x1);
13145       unsigned nfmt = vtx_info->hw_format[0] >> 4;
13146 
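      /* Components not provided by the format default to (0, 0, 0, 1): "one" is
       * the integer 1 for UINT/SINT formats and 1.0f (0x3f800000) otherwise. */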
13147       uint32_t one = nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
13148                         ? 1u
13149                         : 0x3f800000u;
13150       PhysReg dest(attributes_start.reg() + loc * 4u);
13151       for (unsigned j = vtx_info->num_channels; j < 4; j++) {
13152          bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
13153                   Operand::c32(j == 3 ? one : 0u));
13154       }
13155    }
13156 
13157    convert_current_unaligned_vs_attribs(bld, &unaligned_state);
13158 
13159    if (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)
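   /* The alpha adjust below reads the fetched attribute values back, so any
    * outstanding VMEM loads must complete first. */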
13160       wait_for_vmem_loads(bld);
13161 
13162    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
13163     * so we may need to fix it up. */
13164    u_foreach_bit (loc, (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)) {
13165       PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
13166 
13167       unsigned alpha_adjust = (pinfo->alpha_adjust_lo >> loc) & 0x1;
13168       alpha_adjust |= ((pinfo->alpha_adjust_hi >> loc) & 0x1) << 1;
13169 
13170       if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
13171          bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
13172 
13173       /* For the integer-like cases, do a natural sign extension.
13174        *
13175        * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
13176        * and happen to contain 0, 1, 2, 3 as the two LSBs of the
13177        * exponent.
13178        */
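      /* E.g. 1/3 ~= 0x3eaaaaab (exponent 0x7d, LSBs 01) and 1.0 = 0x3f800000
       * (exponent 0x7f, LSBs 11), so v_bfe_i32 with offset 23 and width 2
       * recovers the original 2-bit value with sign extension. */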
13179       unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
13180       bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
13181                Operand::c32(offset), Operand::c32(2u));
13182 
13183       /* Convert back to the right type. */
13184       if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
13185          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
13186          bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
13187                   Operand(alpha, v1));
13188       } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
13189          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
13190       }
13191    }
13192 
13193    block->kind |= block_kind_uniform;
13194 
13195    /* continue on to the main shader */
13196    Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
13197    if (has_nontrivial_divisors) {
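      /* With nontrivial divisors the continue PC is not passed directly in SGPRs;
       * load it from the first two dwords at the inputs address instead. */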
13198       bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
13199                get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
13200       wait_for_smem_loads(bld);
13201       continue_pc = Operand(prolog_input, s2);
13202    }
13203 
13204    bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
13205 
13206    program->config->float_mode = program->blocks[0].fp_mode.val;
13207    /* addition on GFX6-8 requires a carry-out (we use VCC) */
13208    program->needs_vcc = program->gfx_level <= GFX8;
13209    program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
13210    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
13211 }
13212 
13213 void
13214 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
13215                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
13216                  const struct ac_shader_args* args)
13217 {
13218    const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
13219    isel_context ctx =
13220       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13221 
13222    ctx.block->fp_mode = program->next_fp_mode;
13223 
13224    add_startpgm(&ctx);
13225    append_logical_start(ctx.block);
13226 
13227    Builder bld(ctx.program, ctx.block);
13228 
13229    Temp colors[MAX_DRAW_BUFFERS][4];
13230    for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13231       if (!einfo->colors[i].used)
13232          continue;
13233 
13234       Temp color = get_arg(&ctx, einfo->colors[i]);
13235       unsigned col_types = (einfo->color_types >> (i * 2)) & 0x3;
13236 
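      /* 32-bit colors are split into 4 dword components, 16-bit colors into
       * 8 subdword (v2b) components. */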
13237       emit_split_vector(&ctx, color, col_types == ACO_TYPE_ANY32 ? 4 : 8);
13238       for (unsigned c = 0; c < 4; ++c) {
13239          colors[i][c] = emit_extract_vector(&ctx, color, c, col_types == ACO_TYPE_ANY32 ? v1 : v2b);
13240       }
13241 
13242       emit_clamp_alpha_test(&ctx, einfo, colors[i], i);
13243    }
13244 
13245    bool has_mrtz_depth = einfo->depth.used;
13246    bool has_mrtz_stencil = einfo->stencil.used;
13247    bool has_mrtz_samplemask = einfo->samplemask.used;
13248    bool has_mrtz_alpha = einfo->alpha_to_coverage_via_mrtz && einfo->colors[0].used;
13249    bool has_mrtz_export =
13250       has_mrtz_depth || has_mrtz_stencil || has_mrtz_samplemask || has_mrtz_alpha;
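   /* Depth, stencil, sample mask and (with alpha-to-coverage via MRTZ) alpha
    * are all written through the single MRTZ export. */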
13251    if (has_mrtz_export) {
13252       Temp depth = has_mrtz_depth ? get_arg(&ctx, einfo->depth) : Temp();
13253       Temp stencil = has_mrtz_stencil ? get_arg(&ctx, einfo->stencil) : Temp();
13254       Temp samplemask = has_mrtz_samplemask ? get_arg(&ctx, einfo->samplemask) : Temp();
13255       Temp alpha = has_mrtz_alpha ? colors[0][3] : Temp();
13256 
13257       export_fs_mrtz(&ctx, depth, stencil, samplemask, alpha);
13258    }
13259 
13260    /* Export all color render targets */
13261    struct aco_export_mrt mrts[MAX_DRAW_BUFFERS];
13262    unsigned mrt_num = 0;
13263 
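   /* Color exports are compacted: each export that is actually emitted gets the
    * next consecutive MRT target via "mrt->target += mrt_num++". */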
13264    if (einfo->broadcast_last_cbuf) {
13265       for (unsigned i = 0; i <= einfo->broadcast_last_cbuf; i++) {
13266          struct aco_export_mrt* mrt = &mrts[mrt_num];
13267          if (export_fs_mrt_color(&ctx, einfo, colors[0], i, mrt))
13268             mrt->target += mrt_num++;
13269       }
13270    } else {
13271       for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13272          struct aco_export_mrt* mrt = &mrts[mrt_num];
13273          const uint8_t cb_idx = einfo->color_map[i];
13274 
13275          if (cb_idx == 0xff || !einfo->colors[cb_idx].used)
13276             continue;
13277 
13278          if (export_fs_mrt_color(&ctx, einfo, colors[cb_idx], i, mrt)) {
13279             mrt->target += mrt_num++;
13280          }
13281       }
13282    }
13283 
13284    if (mrt_num) {
13285       if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
13286          assert(mrt_num == 2);
13287          create_fs_dual_src_export_gfx11(&ctx, &mrts[0], &mrts[1]);
13288       } else {
13289          for (unsigned i = 0; i < mrt_num; i++)
13290             export_mrt(&ctx, &mrts[i]);
13291       }
13292    } else if (!has_mrtz_export && !einfo->skip_null_export) {
13293       create_fs_null_export(&ctx);
13294    }
13295 
13296    program->config->float_mode = program->blocks[0].fp_mode.val;
13297 
13298    append_logical_end(ctx.block);
13299    ctx.block->kind |= block_kind_export_end;
13300    bld.reset(ctx.block);
13301    bld.sopp(aco_opcode::s_endpgm);
13302 
13303    finish_program(&ctx);
13304 }
13305 
13306 void
13307 select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
13308                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
13309                  const struct ac_shader_args* args)
13310 {
13311    const struct aco_ps_prolog_info* finfo = (const struct aco_ps_prolog_info*)pinfo;
13312    isel_context ctx =
13313       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13314 
13315    ctx.block->fp_mode = program->next_fp_mode;
13316 
13317    add_startpgm(&ctx);
13318    append_logical_start(ctx.block);
13319 
13320    if (finfo->poly_stipple)
13321       emit_polygon_stipple(&ctx, finfo);
13322 
13323    overwrite_interp_args(&ctx, finfo);
13324 
13325    overwrite_samplemask_arg(&ctx, finfo);
13326 
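   /* All arguments of the main shader part are passed through unchanged; the
    * interpolated color inputs are appended below. */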
13327    std::vector<Operand> regs;
13328    passthrough_all_args(&ctx, regs);
13329 
13330    interpolate_color_args(&ctx, finfo, regs);
13331 
13332    program->config->float_mode = program->blocks[0].fp_mode.val;
13333 
13334    append_logical_end(ctx.block);
13335 
13336    build_end_with_regs(&ctx, regs);
13337 
13338    /* Compute all end args in WQM mode if the main part requires it. */
13339    if (finfo->needs_wqm)
13340       set_wqm(&ctx, true);
13341 
13342    /* Make sure WQM mode is exited at the end of the prolog. */
13343    program->needs_exact = true;
13344 
13345    finish_program(&ctx);
13346 }
13347 
13348 } // namespace aco
13349