xref: /aosp_15_r20/external/mesa3d/src/amd/compiler/aco_ir.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "aco_ir.h"
8 
9 #include "aco_builder.h"
10 
11 #include "util/u_debug.h"
12 
13 #include "c11/threads.h"
14 
15 namespace aco {
16 
17 thread_local aco::monotonic_buffer_resource* instruction_buffer = nullptr;
18 
19 uint64_t debug_flags = 0;
20 
21 static const struct debug_control aco_debug_options[] = {
22    {"validateir", DEBUG_VALIDATE_IR},
23    {"validatera", DEBUG_VALIDATE_RA},
24    {"validate-livevars", DEBUG_VALIDATE_LIVE_VARS},
25    {"novalidateir", DEBUG_NO_VALIDATE_IR},
26    {"force-waitcnt", DEBUG_FORCE_WAITCNT},
27    {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
28    {"novn", DEBUG_NO_VN},
29    {"noopt", DEBUG_NO_OPT},
30    {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
31    {"nosched-ilp", DEBUG_NO_SCHED_ILP},
32    {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
33    {"perfinfo", DEBUG_PERF_INFO},
34    {"liveinfo", DEBUG_LIVE_INFO},
35    {NULL, 0}};
36 
37 static once_flag init_once_flag = ONCE_FLAG_INIT;
38 
39 static void
init_once()40 init_once()
41 {
42    debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
43 
44 #ifndef NDEBUG
45    /* enable some flags by default on debug builds */
46    debug_flags |= aco::DEBUG_VALIDATE_IR;
47 #endif
48 
49    if (debug_flags & aco::DEBUG_NO_VALIDATE_IR)
50       debug_flags &= ~aco::DEBUG_VALIDATE_IR;
51 }
52 
/* Public one-time initialization entry point; thread-safe, so it may be
 * called concurrently from multiple compiler threads. */
void
init()
{
   call_once(&init_once_flag, init_once);
}
58 
/* Initialize "program" for a fresh compilation.
 *
 * Records the shader stage, target GPU and wave size, then derives the
 * per-generation hardware properties (register counts and allocation
 * granules, LDS sizes, wave limits, feature flags, NSA limits) into
 * program->dev, and resets the floating-point mode defaults.
 */
void
init_program(Program* program, Stage stage, const struct aco_shader_info* info,
             enum amd_gfx_level gfx_level, enum radeon_family family, bool wgp_mode,
             ac_shader_config* config)
{
   /* Point the thread-local instruction allocator at this program's arena. */
   instruction_buffer = &program->m;
   program->stage = stage;
   program->config = config;
   program->info = *info;
   program->gfx_level = gfx_level;
   if (family == CHIP_UNKNOWN) {
      /* Fall back to a representative chip for each generation. */
      switch (gfx_level) {
      case GFX6: program->family = CHIP_TAHITI; break;
      case GFX7: program->family = CHIP_BONAIRE; break;
      case GFX8: program->family = CHIP_POLARIS10; break;
      case GFX9: program->family = CHIP_VEGA10; break;
      case GFX10: program->family = CHIP_NAVI10; break;
      case GFX10_3: program->family = CHIP_NAVI21; break;
      case GFX11: program->family = CHIP_NAVI31; break;
      case GFX12: program->family = CHIP_GFX1200; break;
      default: program->family = CHIP_UNKNOWN; break;
      }
   } else {
      program->family = family;
   }
   program->wave_size = info->wave_size;
   /* Lane masks need one SGPR per 32 lanes. */
   program->lane_mask = program->wave_size == 32 ? s1 : s2;

   program->dev.lds_encoding_granule = gfx_level >= GFX11 && stage == fragment_fs ? 1024
                                       : gfx_level >= GFX7                        ? 512
                                                                                  : 256;
   program->dev.lds_alloc_granule = gfx_level >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;

   /* GFX6: There is 64KB LDS per CU, but a single workgroup can only use 32KB. */
   program->dev.lds_limit = gfx_level >= GFX7 ? 65536 : 32768;

   /* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
   program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;

   program->dev.vgpr_limit = stage == raytracing_cs ? 128 : 256;
   program->dev.physical_vgprs = 256;
   program->dev.vgpr_alloc_granule = 4;

   if (gfx_level >= GFX10) {
      program->dev.physical_sgprs = 128 * 20; /* enough for max waves */
      program->dev.sgpr_alloc_granule = 128;
      program->dev.sgpr_limit =
         108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */

      /* NAVI31/NAVI32/GFX1151 and all of GFX12 have 1.5x the physical VGPRs
       * of the other GFX10+ parts. */
      if (family == CHIP_NAVI31 || family == CHIP_NAVI32 || family == CHIP_GFX1151 ||
          gfx_level >= GFX12) {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1536 : 768;
         program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 24 : 12;
      } else {
         program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
         if (gfx_level >= GFX10_3)
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
         else
            program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
      }
   } else if (program->gfx_level >= GFX8) {
      program->dev.physical_sgprs = 800;
      program->dev.sgpr_alloc_granule = 16;
      program->dev.sgpr_limit = 102;
      if (family == CHIP_TONGA || family == CHIP_ICELAND)
         program->dev.sgpr_alloc_granule = 96; /* workaround hardware bug */
   } else {
      program->dev.physical_sgprs = 512;
      program->dev.sgpr_alloc_granule = 8;
      program->dev.sgpr_limit = 104;
   }

   program->dev.scratch_alloc_granule = gfx_level >= GFX11 ? 256 : 1024;

   /* Wave occupancy limits per SIMD; the Polaris..VegaM range is capped at 8. */
   program->dev.max_waves_per_simd = 10;
   if (program->gfx_level >= GFX10_3)
      program->dev.max_waves_per_simd = 16;
   else if (program->gfx_level == GFX10)
      program->dev.max_waves_per_simd = 20;
   else if (program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM)
      program->dev.max_waves_per_simd = 8;

   program->dev.simd_per_cu = program->gfx_level >= GFX10 ? 2 : 4;

   /* XNACK (page-fault replay) is enabled on these APUs. */
   switch (program->family) {
   /* GFX8 APUs */
   case CHIP_CARRIZO:
   case CHIP_STONEY:
   /* GFX9 APUS */
   case CHIP_RAVEN:
   case CHIP_RAVEN2:
   case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
   default: break;
   }

   program->dev.sram_ecc_enabled = program->family == CHIP_MI100;
   /* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
   program->dev.has_fast_fma32 = program->gfx_level >= GFX9;
   if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
       program->family == CHIP_HAWAII)
      program->dev.has_fast_fma32 = true;
   program->dev.has_mac_legacy32 = program->gfx_level <= GFX7 || program->gfx_level == GFX10;
   program->dev.has_fmac_legacy32 = program->gfx_level >= GFX10_3 && program->gfx_level < GFX12;

   program->dev.fused_mad_mix = program->gfx_level >= GFX10;
   if (program->family == CHIP_VEGA12 || program->family == CHIP_VEGA20 ||
       program->family == CHIP_MI100 || program->family == CHIP_MI200)
      program->dev.fused_mad_mix = true;

   /* Signed offset range of scratch/global memory instructions. */
   if (program->gfx_level >= GFX11) {
      program->dev.scratch_global_offset_min = -4096;
      program->dev.scratch_global_offset_max = 4095;
   } else if (program->gfx_level >= GFX10 || program->gfx_level == GFX8) {
      program->dev.scratch_global_offset_min = -2048;
      program->dev.scratch_global_offset_max = 2047;
   } else if (program->gfx_level == GFX9) {
      /* The minimum is actually -4096, but negative offsets are broken when SADDR is used. */
      program->dev.scratch_global_offset_min = 0;
      program->dev.scratch_global_offset_max = 4095;
   }

   if (program->gfx_level >= GFX12) {
      /* Same as GFX11, except one less for VSAMPLE. */
      program->dev.max_nsa_vgprs = 3;
   } else if (program->gfx_level >= GFX11) {
      /* GFX11 can have only 1 NSA dword. The last VGPR isn't included here because it contains the
       * rest of the address.
       */
      program->dev.max_nsa_vgprs = 4;
   } else if (program->gfx_level >= GFX10_3) {
      /* GFX10.3 can have up to 3 NSA dwords. */
      program->dev.max_nsa_vgprs = 13;
   } else if (program->gfx_level >= GFX10) {
      /* Limit NSA instructions to 1 NSA dword on GFX10 to avoid stability issues. */
      program->dev.max_nsa_vgprs = 5;
   } else {
      program->dev.max_nsa_vgprs = 0;
   }

   program->wgp_mode = wgp_mode;

   program->progress = CompilationProgress::after_isel;

   /* Default floating-point mode until instruction selection overrides it. */
   program->next_fp_mode.must_flush_denorms32 = false;
   program->next_fp_mode.must_flush_denorms16_64 = false;
   program->next_fp_mode.care_about_round32 = false;
   program->next_fp_mode.care_about_round16_64 = false;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.denorm32 = 0;
   program->next_fp_mode.round16_64 = fp_round_ne;
   program->next_fp_mode.round32 = fp_round_ne;
}
211 
212 bool
is_wait_export_ready(amd_gfx_level gfx_level,const Instruction * instr)213 is_wait_export_ready(amd_gfx_level gfx_level, const Instruction* instr)
214 {
215    return instr->opcode == aco_opcode::s_wait_event &&
216           (gfx_level >= GFX12 ? (instr->salu().imm & wait_event_imm_wait_export_ready_gfx12)
217                               : !(instr->salu().imm & wait_event_imm_dont_wait_export_ready_gfx11));
218 }
219 
220 memory_sync_info
get_sync_info(const Instruction * instr)221 get_sync_info(const Instruction* instr)
222 {
223    /* Primitive Ordered Pixel Shading barriers necessary for accesses to memory shared between
224     * overlapping waves in the queue family.
225     */
226    if (instr->opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done ||
227        instr->opcode == aco_opcode::s_wait_event) {
228       return memory_sync_info(storage_buffer | storage_image, semantic_acquire, scope_queuefamily);
229    } else if (instr->opcode == aco_opcode::p_pops_gfx9_ordered_section_done) {
230       return memory_sync_info(storage_buffer | storage_image, semantic_release, scope_queuefamily);
231    }
232 
233    switch (instr->format) {
234    case Format::SMEM: return instr->smem().sync;
235    case Format::MUBUF: return instr->mubuf().sync;
236    case Format::MIMG: return instr->mimg().sync;
237    case Format::MTBUF: return instr->mtbuf().sync;
238    case Format::FLAT:
239    case Format::GLOBAL:
240    case Format::SCRATCH: return instr->flatlike().sync;
241    case Format::DS: return instr->ds().sync;
242    case Format::LDSDIR: return instr->ldsdir().sync;
243    default: return memory_sync_info();
244    }
245 }
246 
/* Returns whether "instr" can be encoded as, or converted to, SDWA.
 * SDWA only exists on GFX8-GFX10.x. With "pre_ra" set, checks that depend
 * on the eventual register assignment (implicit VCC uses) are skipped. */
bool
can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra)
{
   if (!instr->isVALU())
      return false;

   /* No SDWA outside GFX8-GFX10.x; it can't be combined with DPP or VOP3P. */
   if (gfx_level < GFX8 || gfx_level >= GFX11 || instr->isDPP() || instr->isVOP3P())
      return false;

   if (instr->isSDWA())
      return true;

   if (instr->isVOP3()) {
      VALU_instruction& vop3 = instr->valu();
      /* Opcodes which only exist as VOP3 have no SDWA encoding. */
      if (instr->format == Format::VOP3)
         return false;
      /* clamp on a VOPC instruction is only kept when targeting GFX8. */
      if (vop3.clamp && instr->isVOPC() && gfx_level != GFX8)
         return false;
      /* omod in SDWA requires GFX9+. */
      if (vop3.omod && gfx_level < GFX9)
         return false;

      // TODO: return true if we know we will use vcc
      if (!pre_ra && instr->definitions.size() >= 2)
         return false;

      for (unsigned i = 1; i < instr->operands.size(); i++) {
         if (instr->operands[i].isLiteral())
            return false;
         /* Before GFX9, SDWA sources must be VGPRs. */
         if (gfx_level < GFX9 && !instr->operands[i].isOfType(RegType::vgpr))
            return false;
      }
   }

   /* 64-bit definitions are not supported (VOPC lane-mask defs are exempt). */
   if (!instr->definitions.empty() && instr->definitions[0].bytes() > 4 && !instr->isVOPC())
      return false;

   if (!instr->operands.empty()) {
      if (instr->operands[0].isLiteral())
         return false;
      /* Before GFX9, SDWA sources must be VGPRs. */
      if (gfx_level < GFX9 && !instr->operands[0].isOfType(RegType::vgpr))
         return false;
      /* Sub-dword selection only addresses a single dword per source. */
      if (instr->operands[0].bytes() > 4)
         return false;
      if (instr->operands.size() > 1 && instr->operands[1].bytes() > 4)
         return false;
   }

   bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
                 instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;

   /* (f)mac with SDWA is only allowed when targeting GFX8. */
   if (gfx_level != GFX8 && is_mac)
      return false;

   // TODO: return true if we know we will use vcc
   if (!pre_ra && instr->isVOPC() && gfx_level == GFX8)
      return false;
   if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
      return false;

   /* Opcodes with inline constants or other special operand handling. */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
313 
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction>
convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   if (instr->isSDWA())
      return NULL;

   /* Allocate the SDWA replacement and copy operands/definitions across. */
   aco_ptr<Instruction> tmp = std::move(instr);
   Format format = asSDWA(withoutVOP3(tmp->format));
   instr.reset(
      create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
   std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());

   SDWA_instruction& sdwa = instr->sdwa();

   if (tmp->isVOP3()) {
      /* Carry the VOP3 modifiers over; SDWA can encode them too. */
      VALU_instruction& vop3 = tmp->valu();
      sdwa.neg = vop3.neg;
      sdwa.abs = vop3.abs;
      sdwa.omod = vop3.omod;
      sdwa.clamp = vop3.clamp;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      /* SDWA only uses operands 0 and 1. */
      if (i >= 2)
         break;

      /* Default selection: the operand's full width starting at byte 0. */
      sdwa.sel[i] = SubdwordSel(instr->operands[i].bytes(), 0, false);
   }

   sdwa.dst_sel = SubdwordSel(instr->definitions[0].bytes(), 0, false);

   /* Fix SGPR destinations (GFX8) and any carry-out/carry-in to VCC. */
   if (instr->definitions[0].getTemp().type() == RegType::sgpr && gfx_level == GFX8)
      instr->definitions[0].setFixed(vcc);
   if (instr->definitions.size() >= 2)
      instr->definitions[1].setFixed(vcc);
   if (instr->operands.size() >= 3)
      instr->operands[2].setFixed(vcc);

   instr->pass_flags = tmp->pass_flags;

   return tmp;
}
359 
/* Returns whether "instr" can be encoded as, or converted to, DPP8 (when
 * "dpp8" is set) or DPP16. The caller must pass a VALU instruction with
 * at least one operand. */
bool
can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
{
   assert(instr->isVALU() && !instr->operands.empty());

   /* Already DPP: only "usable" if it's the requested variant. */
   if (instr->isDPP())
      return instr->isDPP8() == dpp8;

   if (instr->isSDWA() || instr->isVINTERP_INREG())
      return false;

   /* VOP3/VOP3P with DPP requires GFX11+. */
   if ((instr->format == Format::VOP3 || instr->isVOP3P()) && gfx_level < GFX11)
      return false;

   /* Before GFX11, a lane-mask definition must be in VCC. */
   if ((instr->isVOPC() || instr->definitions.size() > 1) && instr->definitions.back().isFixed() &&
       instr->definitions.back().physReg() != vcc && gfx_level < GFX11)
      return false;

   /* Before GFX11, an SGPR third operand (carry-in/cndmask) must be VCC. */
   if (instr->operands.size() >= 3 && instr->operands[2].isFixed() &&
       instr->operands[2].isOfType(RegType::sgpr) && instr->operands[2].physReg() != vcc &&
       gfx_level < GFX11)
      return false;

   if (instr->isVOP3() && gfx_level < GFX11) {
      const VALU_instruction* vop3 = &instr->valu();
      /* clamp/omod aren't encodable alongside DPP here. */
      if (vop3->clamp || vop3->omod)
         return false;
      /* VOP3 + DPP8 isn't supported before GFX11. */
      if (dpp8)
         return false;
   }

   for (unsigned i = 0; i < instr->operands.size(); i++) {
      if (instr->operands[i].isLiteral())
         return false;
      /* The swizzled sources (operands 0 and 1) must be VGPRs. */
      if (!instr->operands[i].isOfType(RegType::vgpr) && i < 2)
         return false;
   }

   /* According to LLVM, it's unsafe to combine DPP into v_cmpx. */
   if (instr->writes_exec())
      return false;

   /* simpler than listing all VOP3P opcodes which do not support DPP */
   if (instr->isVOP3P()) {
      return instr->opcode == aco_opcode::v_fma_mix_f32 ||
             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
             instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_bf16;
   }

   if (instr->opcode == aco_opcode::v_pk_fmac_f16)
      return gfx_level < GFX11;

   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 &&
          instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
          instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
          instr->opcode != aco_opcode::v_mul_hi_i32 &&
          instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
          instr->opcode != aco_opcode::v_mad_u64_u32 &&
          instr->opcode != aco_opcode::v_mad_i64_i32 &&
          instr->opcode != aco_opcode::v_permlane16_b32 &&
          instr->opcode != aco_opcode::v_permlanex16_b32 &&
          instr->opcode != aco_opcode::v_permlane64_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
          instr->opcode != aco_opcode::v_writelane_b32_e64 &&
          instr->opcode != aco_opcode::p_v_cvt_pk_u8_f32;
}
437 
438 aco_ptr<Instruction>
convert_to_DPP(amd_gfx_level gfx_level,aco_ptr<Instruction> & instr,bool dpp8)439 convert_to_DPP(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, bool dpp8)
440 {
441    if (instr->isDPP())
442       return NULL;
443 
444    aco_ptr<Instruction> tmp = std::move(instr);
445    Format format =
446       (Format)((uint32_t)tmp->format | (uint32_t)(dpp8 ? Format::DPP8 : Format::DPP16));
447    if (dpp8)
448       instr.reset(
449          create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
450    else
451       instr.reset(
452          create_instruction(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
453    std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
454    std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
455 
456    if (dpp8) {
457       DPP8_instruction* dpp = &instr->dpp8();
458       dpp->lane_sel = 0xfac688; /* [0,1,2,3,4,5,6,7] */
459       dpp->fetch_inactive = gfx_level >= GFX10;
460    } else {
461       DPP16_instruction* dpp = &instr->dpp16();
462       dpp->dpp_ctrl = dpp_quad_perm(0, 1, 2, 3);
463       dpp->row_mask = 0xf;
464       dpp->bank_mask = 0xf;
465       dpp->fetch_inactive = gfx_level >= GFX10;
466    }
467 
468    instr->valu().neg = tmp->valu().neg;
469    instr->valu().abs = tmp->valu().abs;
470    instr->valu().omod = tmp->valu().omod;
471    instr->valu().clamp = tmp->valu().clamp;
472    instr->valu().opsel = tmp->valu().opsel;
473    instr->valu().opsel_lo = tmp->valu().opsel_lo;
474    instr->valu().opsel_hi = tmp->valu().opsel_hi;
475 
476    if ((instr->isVOPC() || instr->definitions.size() > 1) && gfx_level < GFX11)
477       instr->definitions.back().setFixed(vcc);
478 
479    if (instr->operands.size() >= 3 && instr->operands[2].isOfType(RegType::sgpr) &&
480        gfx_level < GFX11)
481       instr->operands[2].setFixed(vcc);
482 
483    instr->pass_flags = tmp->pass_flags;
484 
485    /* DPP16 supports input modifiers, so we might no longer need VOP3. */
486    bool remove_vop3 = !dpp8 && !instr->valu().omod && !instr->valu().clamp &&
487                       (instr->isVOP1() || instr->isVOP2() || instr->isVOPC());
488 
489    /* VOPC/add_co/sub_co definition needs VCC without VOP3. */
490    remove_vop3 &= instr->definitions.back().regClass().type() != RegType::sgpr ||
491                   !instr->definitions.back().isFixed() ||
492                   instr->definitions.back().physReg() == vcc;
493 
494    /* addc/subb/cndmask 3rd operand needs VCC without VOP3. */
495    remove_vop3 &= instr->operands.size() < 3 || !instr->operands[2].isFixed() ||
496                   instr->operands[2].isOfType(RegType::vgpr) || instr->operands[2].physReg() == vcc;
497 
498    if (remove_vop3)
499       instr->format = withoutVOP3(instr->format);
500 
501    return tmp;
502 }
503 
504 bool
can_use_input_modifiers(amd_gfx_level gfx_level,aco_opcode op,int idx)505 can_use_input_modifiers(amd_gfx_level gfx_level, aco_opcode op, int idx)
506 {
507    if (op == aco_opcode::v_mov_b32)
508       return gfx_level >= GFX10;
509 
510    if (op == aco_opcode::v_ldexp_f16 || op == aco_opcode::v_ldexp_f32 ||
511        op == aco_opcode::v_ldexp_f64)
512       return idx == 0;
513 
514    return instr_info.can_use_input_modifiers[(int)op];
515 }
516 
/* Returns whether the opsel bit can be set for operand "idx" of opcode
 * "op". idx == -1 refers to the destination. */
bool
can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx)
{
   /* opsel is only GFX9+ */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* These support opsel on every source and the destination. */
   case aco_opcode::v_div_fixup_f16:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_mad_f16:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_mul_lo_u16_e64: return true;
   /* Pack-style results: opsel applies to sources only, not the destination. */
   case aco_opcode::v_pack_b32_f16:
   case aco_opcode::v_cvt_pknorm_i16_f16:
   case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
   /* 32-bit accumulate: only the two 16-bit multiplicands take opsel. */
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16: return idx == -1 || idx == 2;
   case aco_opcode::v_cndmask_b16: return idx != 2;
   case aco_opcode::v_interp_p10_f16_f32_inreg:
   case aco_opcode::v_interp_p10_rtz_f16_f32_inreg: return idx == 0 || idx == 2;
   case aco_opcode::v_interp_p2_f16_f32_inreg:
   case aco_opcode::v_interp_p2_rtz_f16_f32_inreg: return idx == -1 || idx == 0;
   default:
      /* On GFX11+, true16 opcodes use opsel as the high-half selector. */
      return gfx_level >= GFX11 && (get_gfx11_true16_mask(op) & BITFIELD_BIT(idx == -1 ? 3 : idx));
   }
}
572 
573 bool
can_write_m0(const aco_ptr<Instruction> & instr)574 can_write_m0(const aco_ptr<Instruction>& instr)
575 {
576    if (instr->isSALU())
577       return true;
578 
579    /* VALU can't write m0 on any GPU generations. */
580    if (instr->isVALU())
581       return false;
582 
583    switch (instr->opcode) {
584    case aco_opcode::p_parallelcopy:
585    case aco_opcode::p_extract:
586    case aco_opcode::p_insert:
587       /* These pseudo instructions are implemented with SALU when writing m0. */
588       return true;
589    default:
590       /* Assume that no other instructions can write m0. */
591       return false;
592    }
593 }
594 
/* Returns whether opcode "op" writes only the lower 16 bits of its 32-bit
 * destination VGPR, preserving the upper half. */
bool
instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
{
   /* partial register writes are GFX9+, only */
   if (gfx_level < GFX9)
      return false;

   switch (op) {
   /* VOP3 */
   /* Legacy opcodes write the full dword. */
   case aco_opcode::v_mad_legacy_f16:
   case aco_opcode::v_mad_legacy_u16:
   case aco_opcode::v_mad_legacy_i16:
   case aco_opcode::v_fma_legacy_f16:
   case aco_opcode::v_div_fixup_legacy_f16: return false;
   case aco_opcode::v_interp_p2_f16:
   case aco_opcode::v_interp_p2_hi_f16:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   /* VOP2 */
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_madmk_f16: return gfx_level >= GFX9;
   /* These preserve the high half only from GFX10 onwards. */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   /* VOP1 */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::p_v_cvt_f16_f32_rtne:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16: return gfx_level >= GFX10;
   /* all non legacy opsel instructions preserve the high bits */
   default: return can_use_opsel(gfx_level, op, -1);
   }
}
654 
/* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
 * only supports v0-v127.
 * The first three bits are used for operands 0-2, and the 4th bit is used for the destination.
 */
uint8_t
get_gfx11_true16_mask(aco_opcode op)
{
   switch (op) {
   /* One 16-bit source and a 16-bit destination (mask: src0 | dst). */
   case aco_opcode::v_ceil_f16:
   case aco_opcode::v_cos_f16:
   case aco_opcode::v_cvt_f16_i16:
   case aco_opcode::v_cvt_f16_u16:
   case aco_opcode::v_cvt_i16_f16:
   case aco_opcode::v_cvt_u16_f16:
   case aco_opcode::v_cvt_norm_i16_f16:
   case aco_opcode::v_cvt_norm_u16_f16:
   case aco_opcode::v_exp_f16:
   case aco_opcode::v_floor_f16:
   case aco_opcode::v_fract_f16:
   case aco_opcode::v_frexp_exp_i16_f16:
   case aco_opcode::v_frexp_mant_f16:
   case aco_opcode::v_log_f16:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_rcp_f16:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_rsq_f16:
   case aco_opcode::v_sin_f16:
   case aco_opcode::v_sqrt_f16:
   case aco_opcode::v_trunc_f16:
   case aco_opcode::v_swap_b16:
   case aco_opcode::v_mov_b16: return 0x1 | 0x8;
   /* Two 16-bit sources and a 16-bit destination (mask: src0 | src1 | dst). */
   case aco_opcode::v_add_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_ldexp_f16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16: return 0x3 | 0x8;
   /* 16-bit source, 32-bit destination (mask: src0 only). */
   case aco_opcode::v_cvt_f32_f16:
   case aco_opcode::v_cvt_i32_i16:
   case aco_opcode::v_cvt_u32_u16: return 0x1;
   /* Comparisons: two 16-bit sources, lane-mask destination (mask: src0 | src1). */
   case aco_opcode::v_cmp_class_f16:
   case aco_opcode::v_cmp_eq_f16:
   case aco_opcode::v_cmp_eq_i16:
   case aco_opcode::v_cmp_eq_u16:
   case aco_opcode::v_cmp_ge_f16:
   case aco_opcode::v_cmp_ge_i16:
   case aco_opcode::v_cmp_ge_u16:
   case aco_opcode::v_cmp_gt_f16:
   case aco_opcode::v_cmp_gt_i16:
   case aco_opcode::v_cmp_gt_u16:
   case aco_opcode::v_cmp_le_f16:
   case aco_opcode::v_cmp_le_i16:
   case aco_opcode::v_cmp_le_u16:
   case aco_opcode::v_cmp_lg_f16:
   case aco_opcode::v_cmp_lg_i16:
   case aco_opcode::v_cmp_lg_u16:
   case aco_opcode::v_cmp_lt_f16:
   case aco_opcode::v_cmp_lt_i16:
   case aco_opcode::v_cmp_lt_u16:
   case aco_opcode::v_cmp_neq_f16:
   case aco_opcode::v_cmp_nge_f16:
   case aco_opcode::v_cmp_ngt_f16:
   case aco_opcode::v_cmp_nle_f16:
   case aco_opcode::v_cmp_nlg_f16:
   case aco_opcode::v_cmp_nlt_f16:
   case aco_opcode::v_cmp_o_f16:
   case aco_opcode::v_cmp_u_f16:
   case aco_opcode::v_cmpx_class_f16:
   case aco_opcode::v_cmpx_eq_f16:
   case aco_opcode::v_cmpx_eq_i16:
   case aco_opcode::v_cmpx_eq_u16:
   case aco_opcode::v_cmpx_ge_f16:
   case aco_opcode::v_cmpx_ge_i16:
   case aco_opcode::v_cmpx_ge_u16:
   case aco_opcode::v_cmpx_gt_f16:
   case aco_opcode::v_cmpx_gt_i16:
   case aco_opcode::v_cmpx_gt_u16:
   case aco_opcode::v_cmpx_le_f16:
   case aco_opcode::v_cmpx_le_i16:
   case aco_opcode::v_cmpx_le_u16:
   case aco_opcode::v_cmpx_lg_f16:
   case aco_opcode::v_cmpx_lg_i16:
   case aco_opcode::v_cmpx_lg_u16:
   case aco_opcode::v_cmpx_lt_f16:
   case aco_opcode::v_cmpx_lt_i16:
   case aco_opcode::v_cmpx_lt_u16:
   case aco_opcode::v_cmpx_neq_f16:
   case aco_opcode::v_cmpx_nge_f16:
   case aco_opcode::v_cmpx_ngt_f16:
   case aco_opcode::v_cmpx_nle_f16:
   case aco_opcode::v_cmpx_nlg_f16:
   case aco_opcode::v_cmpx_nlt_f16:
   case aco_opcode::v_cmpx_o_f16:
   case aco_opcode::v_cmpx_u_f16: return 0x3;
   /* Only the destination is a true16 register (mask: dst only). */
   case aco_opcode::v_cvt_f16_f32:
   case aco_opcode::v_sat_pk_u8_i16: return 0x8;
   default: return 0x0;
   }
}
761 
762 uint32_t
get_reduction_identity(ReduceOp op,unsigned idx)763 get_reduction_identity(ReduceOp op, unsigned idx)
764 {
765    switch (op) {
766    case iadd8:
767    case iadd16:
768    case iadd32:
769    case iadd64:
770    case fadd16:
771    case fadd32:
772    case fadd64:
773    case ior8:
774    case ior16:
775    case ior32:
776    case ior64:
777    case ixor8:
778    case ixor16:
779    case ixor32:
780    case ixor64:
781    case umax8:
782    case umax16:
783    case umax32:
784    case umax64: return 0;
785    case imul8:
786    case imul16:
787    case imul32:
788    case imul64: return idx ? 0 : 1;
789    case fmul16: return 0x3c00u;                /* 1.0 */
790    case fmul32: return 0x3f800000u;            /* 1.0 */
791    case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
792    case imin8: return INT8_MAX;
793    case imin16: return INT16_MAX;
794    case imin32: return INT32_MAX;
795    case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
796    case imax8: return INT8_MIN;
797    case imax16: return INT16_MIN;
798    case imax32: return INT32_MIN;
799    case imax64: return idx ? 0x80000000u : 0;
800    case umin8:
801    case umin16:
802    case iand8:
803    case iand16: return 0xffffffffu;
804    case umin32:
805    case umin64:
806    case iand32:
807    case iand64: return 0xffffffffu;
808    case fmin16: return 0x7c00u;                /* infinity */
809    case fmin32: return 0x7f800000u;            /* infinity */
810    case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
811    case fmax16: return 0xfc00u;                /* negative infinity */
812    case fmax32: return 0xff800000u;            /* negative infinity */
813    case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
814    default: unreachable("Invalid reduction operation"); break;
815    }
816    return 0;
817 }
818 
819 unsigned
get_operand_size(aco_ptr<Instruction> & instr,unsigned index)820 get_operand_size(aco_ptr<Instruction>& instr, unsigned index)
821 {
822    if (instr->isPseudo())
823       return instr->operands[index].bytes() * 8u;
824    else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
825             instr->opcode == aco_opcode::v_mad_i64_i32)
826       return index == 2 ? 64 : 32;
827    else if (instr->opcode == aco_opcode::v_fma_mix_f32 ||
828             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
829             instr->opcode == aco_opcode::v_fma_mixhi_f16)
830       return instr->valu().opsel_hi[index] ? 16 : 32;
831    else if (instr->opcode == aco_opcode::v_interp_p10_f16_f32_inreg ||
832             instr->opcode == aco_opcode::v_interp_p10_rtz_f16_f32_inreg)
833       return index == 1 ? 32 : 16;
834    else if (instr->opcode == aco_opcode::v_interp_p2_f16_f32_inreg ||
835             instr->opcode == aco_opcode::v_interp_p2_rtz_f16_f32_inreg)
836       return index == 0 ? 16 : 32;
837    else if (instr->isVALU() || instr->isSALU())
838       return instr_info.operand_size[(int)instr->opcode];
839    else
840       return 0;
841 }
842 
/* Returns whether the instruction's behavior depends on the exec mask, i.e.
 * whether it must be executed with a correct exec mask for the program to be
 * valid. Unknown instructions conservatively return true. */
bool
needs_exec_mask(const Instruction* instr)
{
   if (instr->isVALU()) {
      /* The lane-access instructions are the only VALU instructions treated
       * as not needing exec. */
      return instr->opcode != aco_opcode::v_readlane_b32 &&
             instr->opcode != aco_opcode::v_readlane_b32_e64 &&
             instr->opcode != aco_opcode::v_writelane_b32 &&
             instr->opcode != aco_opcode::v_writelane_b32_e64;
   }

   if (instr->isVMEM() || instr->isFlatLike())
      return true;

   /* Scalar-side instructions only depend on exec if they explicitly read it. */
   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
      case aco_opcode::p_create_vector:
      case aco_opcode::p_extract_vector:
      case aco_opcode::p_split_vector:
      case aco_opcode::p_phi:
      case aco_opcode::p_parallelcopy:
         /* These pseudos need exec when they produce any VGPR result, since
          * the lowered copies are per-lane. */
         for (Definition def : instr->definitions) {
            if (def.getTemp().type() == RegType::vgpr)
               return true;
         }
         return instr->reads_exec();
      case aco_opcode::p_spill:
      case aco_opcode::p_reload:
      case aco_opcode::p_end_linear_vgpr:
      case aco_opcode::p_logical_start:
      case aco_opcode::p_logical_end:
      case aco_opcode::p_startpgm:
      case aco_opcode::p_end_wqm:
      case aco_opcode::p_init_scratch: return instr->reads_exec();
      /* Only needs exec when it initializes the linear VGPR (has operands). */
      case aco_opcode::p_start_linear_vgpr: return instr->operands.size();
      default: break;
      }
   }

   return true;
}
886 
/* Related opcodes for a VALU comparison, filled in by get_cmp_info().
 * Entries that do not exist for an opcode are set to aco_opcode::num_opcodes. */
struct CmpInfo {
   aco_opcode swapped; /* same comparison with the source operands swapped */
   aco_opcode inverse; /* comparison computing the logical negation */
   aco_opcode vcmpx;   /* corresponding v_cmpx_* opcode */
};
892 
/* Fills "info" with the swapped/inverse/vcmpx opcodes related to the v_cmp_*
 * opcode "op". Returns false if "op" is not a recognized v_cmp_* comparison
 * (v_cmpx_* opcodes themselves are not recognized). The tables below are
 * generated with X-macros covering float, integer, ordered-test and class
 * comparisons. */
static ALWAYS_INLINE bool
get_cmp_info(aco_opcode op, CmpInfo* info)
{
   info->swapped = aco_opcode::num_opcodes;
   info->inverse = aco_opcode::num_opcodes;
   info->vcmpx = aco_opcode::num_opcodes;
   switch (op) {
      // clang-format off
/* Float comparisons: each CMP2 handles an ordered opcode and its unordered
 * negation (v_cmp_n*) for one bit size. */
#define CMP2(ord, unord, ord_swap, unord_swap, sz)                                                 \
   case aco_opcode::v_cmp_##ord##_f##sz:                                                           \
   case aco_opcode::v_cmp_n##unord##_f##sz:                                                        \
      info->swapped = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord_swap##_f##sz \
                                                      : aco_opcode::v_cmp_n##unord_swap##_f##sz;   \
      info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz \
                                                               : aco_opcode::v_cmp_n##ord##_f##sz; \
      info->vcmpx = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmpx_##ord##_f##sz       \
                                                          : aco_opcode::v_cmpx_n##unord##_f##sz;   \
      return true;
#define CMP(ord, unord, ord_swap, unord_swap)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 16)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 32)                                                      \
   CMP2(ord, unord, ord_swap, unord_swap, 64)
      CMP(lt, /*n*/ge, gt, /*n*/le)
      CMP(eq, /*n*/lg, eq, /*n*/lg)
      CMP(le, /*n*/gt, ge, /*n*/lt)
      CMP(gt, /*n*/le, lt, /*n*/ge)
      CMP(lg, /*n*/eq, lg, /*n*/eq)
      CMP(ge, /*n*/lt, le, /*n*/gt)
#undef CMP
#undef CMP2
/* Ordered/unordered tests: symmetric, so the swapped opcode is the opcode
 * itself, and o/u are each other's inverse. */
#define ORD_TEST(sz)                                                                               \
   case aco_opcode::v_cmp_u_f##sz:                                                                 \
      info->swapped = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_u_f##sz;                                                    \
      return true;                                                                                 \
   case aco_opcode::v_cmp_o_f##sz:                                                                 \
      info->swapped = aco_opcode::v_cmp_o_f##sz;                                                   \
      info->inverse = aco_opcode::v_cmp_u_f##sz;                                                   \
      info->vcmpx = aco_opcode::v_cmpx_o_f##sz;                                                    \
      return true;
      ORD_TEST(16)
      ORD_TEST(32)
      ORD_TEST(64)
#undef ORD_TEST
/* Integer comparisons: CMPI2 handles one opcode for one signedness/size. */
#define CMPI2(op, swap, inv, type, sz)                                                             \
   case aco_opcode::v_cmp_##op##_##type##sz:                                                       \
      info->swapped = aco_opcode::v_cmp_##swap##_##type##sz;                                       \
      info->inverse = aco_opcode::v_cmp_##inv##_##type##sz;                                        \
      info->vcmpx = aco_opcode::v_cmpx_##op##_##type##sz;                                          \
      return true;
#define CMPI(op, swap, inv)                                                                        \
   CMPI2(op, swap, inv, i, 16)                                                                     \
   CMPI2(op, swap, inv, u, 16)                                                                     \
   CMPI2(op, swap, inv, i, 32)                                                                     \
   CMPI2(op, swap, inv, u, 32)                                                                     \
   CMPI2(op, swap, inv, i, 64)                                                                     \
   CMPI2(op, swap, inv, u, 64)
      CMPI(lt, gt, ge)
      CMPI(eq, eq, lg)
      CMPI(le, ge, gt)
      CMPI(gt, lt, le)
      CMPI(lg, lg, eq)
      CMPI(ge, le, lt)
#undef CMPI
#undef CMPI2
/* Class tests only have a vcmpx form; swapped/inverse stay num_opcodes. */
#define CMPCLASS(sz)                                                                               \
   case aco_opcode::v_cmp_class_f##sz:                                                             \
      info->vcmpx = aco_opcode::v_cmpx_class_f##sz;                                                \
      return true;
      CMPCLASS(16)
      CMPCLASS(32)
      CMPCLASS(64)
#undef CMPCLASS
      // clang-format on
   default: return false;
   }
}
971 
972 aco_opcode
get_vcmp_inverse(aco_opcode op)973 get_vcmp_inverse(aco_opcode op)
974 {
975    CmpInfo info;
976    return get_cmp_info(op, &info) ? info.inverse : aco_opcode::num_opcodes;
977 }
978 
979 aco_opcode
get_vcmp_swapped(aco_opcode op)980 get_vcmp_swapped(aco_opcode op)
981 {
982    CmpInfo info;
983    return get_cmp_info(op, &info) ? info.swapped : aco_opcode::num_opcodes;
984 }
985 
986 aco_opcode
get_vcmpx(aco_opcode op)987 get_vcmpx(aco_opcode op)
988 {
989    CmpInfo info;
990    return get_cmp_info(op, &info) ? info.vcmpx : aco_opcode::num_opcodes;
991 }
992 
/* Returns whether "op" is a v_cmpx_* opcode. get_cmp_info() only recognizes
 * plain v_cmp_* opcodes, so for comparison opcodes a failed lookup implies
 * the v_cmpx_* form. NOTE(review): this also returns true for any opcode
 * unknown to get_cmp_info(), so it is presumably only meaningful when called
 * with comparison opcodes — confirm against callers. */
bool
is_cmpx(aco_opcode op)
{
   CmpInfo info;
   return !get_cmp_info(op, &info);
}
999 
/* Returns whether operands idx0 and idx1 of "instr" can be exchanged, and if
 * so stores in "*new_op" the opcode to use after the swap (the same opcode
 * for commutative operations, a swapped comparison for VOPC, or the matching
 * *rev opcode for subtractions). */
bool
can_swap_operands(aco_ptr<Instruction>& instr, aco_opcode* new_op, unsigned idx0, unsigned idx1)
{
   if (idx0 == idx1) {
      *new_op = instr->opcode;
      return true;
   }

   /* Normalize so that idx0 < idx1; the checks below only look at idx1. */
   if (idx0 > idx1)
      std::swap(idx0, idx1);

   if (instr->isDPP())
      return false;

   if (!instr->isVOP3() && !instr->isVOP3P() && !instr->operands[0].isOfType(RegType::vgpr))
      return false;

   /* Comparisons swap by switching to the mirrored comparison opcode. */
   if (instr->isVOPC()) {
      CmpInfo info;
      if (get_cmp_info(instr->opcode, &info) && info.swapped != aco_opcode::num_opcodes) {
         *new_op = info.swapped;
         return true;
      }
   }

   /* opcodes not relevant for DPP or SGPRs optimizations are not included. */
   switch (instr->opcode) {
   case aco_opcode::v_med3_f32: return false; /* order matters for clamp+GFX8+denorm ftz. */
   /* These opcodes keep the same result for any operand order. */
   case aco_opcode::v_add_u32:
   case aco_opcode::v_add_co_u32:
   case aco_opcode::v_add_co_u32_e64:
   case aco_opcode::v_add_i32:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_add3_u32:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_mul_i32_i24:
   case aco_opcode::v_mul_hi_i32_i24:
   case aco_opcode::v_mul_u32_u24:
   case aco_opcode::v_mul_hi_u32_u24:
   case aco_opcode::v_mul_lo_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_or_b32:
   case aco_opcode::v_and_b32:
   case aco_opcode::v_xor_b32:
   case aco_opcode::v_xnor_b32:
   case aco_opcode::v_xor3_b32:
   case aco_opcode::v_or3_b32:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_max3_f32:
   case aco_opcode::v_min3_f32:
   case aco_opcode::v_max3_f16:
   case aco_opcode::v_min3_f16:
   case aco_opcode::v_med3_f16:
   case aco_opcode::v_max3_u32:
   case aco_opcode::v_min3_u32:
   case aco_opcode::v_med3_u32:
   case aco_opcode::v_max3_i32:
   case aco_opcode::v_min3_i32:
   case aco_opcode::v_med3_i32:
   case aco_opcode::v_max3_u16:
   case aco_opcode::v_min3_u16:
   case aco_opcode::v_med3_u16:
   case aco_opcode::v_max3_i16:
   case aco_opcode::v_min3_i16:
   case aco_opcode::v_med3_i16:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_i32:
   case aco_opcode::v_min_i32:
   case aco_opcode::v_max_u32:
   case aco_opcode::v_min_u32:
   case aco_opcode::v_max_i16:
   case aco_opcode::v_min_i16:
   case aco_opcode::v_max_u16:
   case aco_opcode::v_min_u16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64: *new_op = instr->opcode; return true;
   /* Subtractions swap by switching between the sub and subrev forms. */
   case aco_opcode::v_sub_f16: *new_op = aco_opcode::v_subrev_f16; return true;
   case aco_opcode::v_sub_f32: *new_op = aco_opcode::v_subrev_f32; return true;
   case aco_opcode::v_sub_co_u32: *new_op = aco_opcode::v_subrev_co_u32; return true;
   case aco_opcode::v_sub_u16: *new_op = aco_opcode::v_subrev_u16; return true;
   case aco_opcode::v_sub_u32: *new_op = aco_opcode::v_subrev_u32; return true;
   case aco_opcode::v_sub_co_u32_e64: *new_op = aco_opcode::v_subrev_co_u32_e64; return true;
   case aco_opcode::v_subrev_f16: *new_op = aco_opcode::v_sub_f16; return true;
   case aco_opcode::v_subrev_f32: *new_op = aco_opcode::v_sub_f32; return true;
   case aco_opcode::v_subrev_co_u32: *new_op = aco_opcode::v_sub_co_u32; return true;
   case aco_opcode::v_subrev_u16: *new_op = aco_opcode::v_sub_u16; return true;
   case aco_opcode::v_subrev_u32: *new_op = aco_opcode::v_sub_u32; return true;
   case aco_opcode::v_subrev_co_u32_e64: *new_op = aco_opcode::v_sub_co_u32_e64; return true;
   /* These opcodes only commute in their first two operands; operand 2
    * (e.g. the accumulator/addend) cannot take part in a swap. */
   case aco_opcode::v_addc_co_u32:
   case aco_opcode::v_mad_i32_i24:
   case aco_opcode::v_mad_u32_u24:
   case aco_opcode::v_lerp_u8:
   case aco_opcode::v_sad_u8:
   case aco_opcode::v_sad_hi_u8:
   case aco_opcode::v_sad_u16:
   case aco_opcode::v_sad_u32:
   case aco_opcode::v_xad_u32:
   case aco_opcode::v_add_lshl_u32:
   case aco_opcode::v_and_or_b32:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_mad_u32_u16:
   case aco_opcode::v_mad_i32_i16:
   case aco_opcode::v_maxmin_f32:
   case aco_opcode::v_minmax_f32:
   case aco_opcode::v_maxmin_f16:
   case aco_opcode::v_minmax_f16:
   case aco_opcode::v_maxmin_u32:
   case aco_opcode::v_minmax_u32:
   case aco_opcode::v_maxmin_i32:
   case aco_opcode::v_minmax_i32:
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_mac_f32:
   case aco_opcode::v_mac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_mac_f16:
   case aco_opcode::v_dot4c_i32_i8:
   case aco_opcode::v_dot2c_f32_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::v_pk_fmac_f16: {
      if (idx1 == 2)
         return false;
      *new_op = instr->opcode;
      return true;
   }
   case aco_opcode::v_subb_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subbrev_co_u32;
      return true;
   }
   case aco_opcode::v_subbrev_co_u32: {
      if (idx1 == 2)
         return false;
      *new_op = aco_opcode::v_subb_co_u32;
      return true;
   }
   default: return false;
   }
}
1162 
/* Default constructor: every counter starts unset (no wait required). */
wait_imm::wait_imm()
    : exp(unset_counter), lgkm(unset_counter), vm(unset_counter), vs(unset_counter),
      sample(unset_counter), bvh(unset_counter), km(unset_counter)
{}
/* Constructs from explicit vm/exp/lgkm/vs counters; the remaining counters
 * (sample, bvh, km) start unset. */
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
    : exp(exp_), lgkm(lgkm_), vm(vm_), vs(vs_), sample(unset_counter), bvh(unset_counter),
      km(unset_counter)
{}
1171 
/* Packs the vm/lgkm/exp counters into the s_waitcnt immediate encoding of
 * the given generation. Unset counters encode as all-ones in their field
 * (the masking below makes unset_counter saturate the field). */
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
   uint16_t imm = 0;
   assert(exp == unset_counter || exp <= 0x7);
   if (gfx_level >= GFX11) {
      /* GFX11+: vm in bits [15:10], lgkm in [9:4], exp in [2:0]. */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x3f) << 10) | ((lgkm & 0x3f) << 4) | (exp & 0x7);
   } else if (gfx_level >= GFX10) {
      /* GFX10: vm split into [15:14] (high bits) and [3:0] (low bits),
       * lgkm in [13:8], exp in [6:4]. */
      assert(lgkm == unset_counter || lgkm <= 0x3f);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else if (gfx_level >= GFX9) {
      /* GFX9: same as GFX10 but lgkm is only 4 bits wide. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0x3f);
      imm = ((vm & 0x30) << 10) | ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   } else {
      /* Pre-GFX9: vm is only 4 bits, no high vm bits. */
      assert(lgkm == unset_counter || lgkm <= 0xf);
      assert(vm == unset_counter || vm <= 0xf);
      imm = ((lgkm & 0xf) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
   }
   if (gfx_level < GFX9 && vm == wait_imm::unset_counter)
      imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   if (gfx_level < GFX10 && lgkm == wait_imm::unset_counter)
      imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
                        architecture when interpreting the immediate */
   return imm;
}
1202 
1203 wait_imm
max(enum amd_gfx_level gfx_level)1204 wait_imm::max(enum amd_gfx_level gfx_level)
1205 {
1206    wait_imm imm;
1207    imm.vm = gfx_level >= GFX9 ? 63 : 15;
1208    imm.exp = 7;
1209    imm.lgkm = gfx_level >= GFX10 ? 63 : 15;
1210    imm.vs = gfx_level >= GFX10 ? 63 : 0;
1211    imm.sample = gfx_level >= GFX12 ? 63 : 0;
1212    imm.bvh = gfx_level >= GFX12 ? 7 : 0;
1213    imm.km = gfx_level >= GFX12 ? 31 : 0;
1214    return imm;
1215 }
1216 
/* Decodes a wait instruction into this wait_imm, combining each decoded
 * counter with the existing value via min. Returns false if "instr" is not a
 * recognized wait instruction (including s_waitcnt-style SALU instructions
 * whose first operand is not sgpr_null). */
bool
wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr)
{
   if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null))
      return false;

   aco_opcode op = instr->opcode;
   uint16_t packed = instr->salu().imm;

   /* GFX12+ separate-counter waits carry the count directly in the immediate. */
   if (op == aco_opcode::s_wait_loadcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_wait_storecnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_wait_samplecnt) {
      sample = std::min<uint8_t>(sample, packed);
   } else if (op == aco_opcode::s_wait_bvhcnt) {
      bvh = std::min<uint8_t>(bvh, packed);
   } else if (op == aco_opcode::s_wait_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_wait_dscnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_wait_kmcnt) {
      km = std::min<uint8_t>(km, packed);
   } else if (op == aco_opcode::s_wait_loadcnt_dscnt) {
      /* Combined wait: loadcnt in bits [13:8], dscnt in [5:0]; an all-ones
       * field means that counter is not waited on. */
      uint32_t vm2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vm = std::min<uint8_t>(vm, vm2 == 0x3f ? wait_imm::unset_counter : vm2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_wait_storecnt_dscnt) {
      uint32_t vs2 = (packed >> 8) & 0x3f;
      uint32_t ds = packed & 0x3f;
      vs = std::min<uint8_t>(vs, vs2 == 0x3f ? wait_imm::unset_counter : vs2);
      lgkm = std::min<uint8_t>(lgkm, ds == 0x3f ? wait_imm::unset_counter : ds);
   } else if (op == aco_opcode::s_waitcnt_expcnt) {
      exp = std::min<uint8_t>(exp, packed);
   } else if (op == aco_opcode::s_waitcnt_lgkmcnt) {
      lgkm = std::min<uint8_t>(lgkm, packed);
   } else if (op == aco_opcode::s_waitcnt_vmcnt) {
      vm = std::min<uint8_t>(vm, packed);
   } else if (op == aco_opcode::s_waitcnt_vscnt) {
      vs = std::min<uint8_t>(vs, packed);
   } else if (op == aco_opcode::s_waitcnt) {
      /* Legacy combined s_waitcnt: decode the per-generation bit layout
       * (the inverse of wait_imm::pack()). */
      uint8_t vm2, lgkm2, exp2;
      if (gfx_level >= GFX11) {
         vm2 = (packed >> 10) & 0x3f;
         lgkm2 = (packed >> 4) & 0x3f;
         exp2 = packed & 0x7;
      } else {
         vm2 = packed & 0xf;
         if (gfx_level >= GFX9)
            vm2 |= (packed >> 10) & 0x30; /* high vm bits live in [15:14] */

         exp2 = (packed >> 4) & 0x7;

         lgkm2 = (packed >> 8) & 0xf;
         if (gfx_level >= GFX10)
            lgkm2 |= (packed >> 8) & 0x30; /* lgkm widened to 6 bits */
      }

      /* A saturated field means the counter is not waited on. */
      if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf))
         vm2 = wait_imm::unset_counter;
      if (exp2 == 0x7)
         exp2 = wait_imm::unset_counter;
      if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf))
         lgkm2 = wait_imm::unset_counter;

      vm = std::min(vm, vm2);
      exp = std::min(exp, exp2);
      lgkm = std::min(lgkm, lgkm2);
   } else {
      return false;
   }
   return true;
}
1291 
1292 bool
combine(const wait_imm & other)1293 wait_imm::combine(const wait_imm& other)
1294 {
1295    bool changed = false;
1296    for (unsigned i = 0; i < wait_type_num; i++) {
1297       if (other[i] < (*this)[i])
1298          changed = true;
1299       (*this)[i] = std::min((*this)[i], other[i]);
1300    }
1301    return changed;
1302 }
1303 
1304 bool
empty() const1305 wait_imm::empty() const
1306 {
1307    for (unsigned i = 0; i < wait_type_num; i++) {
1308       if ((*this)[i] != unset_counter)
1309          return false;
1310    }
1311    return true;
1312 }
1313 
1314 void
print(FILE * output) const1315 wait_imm::print(FILE* output) const
1316 {
1317    const char* names[wait_type_num];
1318    names[wait_type_exp] = "exp";
1319    names[wait_type_vm] = "vm";
1320    names[wait_type_lgkm] = "lgkm";
1321    names[wait_type_vs] = "vs";
1322    names[wait_type_sample] = "sample";
1323    names[wait_type_bvh] = "bvh";
1324    names[wait_type_km] = "km";
1325    for (unsigned i = 0; i < wait_type_num; i++) {
1326       if ((*this)[i] != unset_counter)
1327          fprintf(output, "%s: %u\n", names[i], (*this)[i]);
1328    }
1329 }
1330 
/* Heuristic deciding whether the memory instructions "a" and "b" should be
 * grouped into the same clause. Both must have the same format and both must
 * either produce definitions or not (i.e. both loads or both stores). */
bool
should_form_clause(const Instruction* a, const Instruction* b)
{
   if (a->definitions.empty() != b->definitions.empty())
      return false;

   if (a->format != b->format)
      return false;

   if (a->operands.empty() || b->operands.empty())
      return false;

   /* Assume loads which don't use descriptors might load from similar addresses. */
   if (a->isFlatLike() || a->accessesLDS())
      return true;
   if (a->isSMEM() && a->operands[0].bytes() == 8 && b->operands[0].bytes() == 8)
      return true;

   /* If they load from the same descriptor, assume they might load from similar
    * addresses.
    */
   if (a->isVMEM() || a->isSMEM())
      return a->operands[0].tempId() == b->operands[0].tempId();

   if (a->isEXP() && b->isEXP())
      return true;

   return false;
}
1360 
1361 int
get_op_fixed_to_def(Instruction * instr)1362 get_op_fixed_to_def(Instruction* instr)
1363 {
1364    if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 ||
1365        instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
1366        instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_mac_legacy_f32 ||
1367        instr->opcode == aco_opcode::v_fmac_legacy_f32 ||
1368        instr->opcode == aco_opcode::v_pk_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 ||
1369        instr->opcode == aco_opcode::v_writelane_b32_e64 ||
1370        instr->opcode == aco_opcode::v_dot4c_i32_i8 || instr->opcode == aco_opcode::s_fmac_f32 ||
1371        instr->opcode == aco_opcode::s_fmac_f16) {
1372       return 2;
1373    } else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32 ||
1374               instr->opcode == aco_opcode::s_cmovk_i32) {
1375       return 0;
1376    } else if (instr->isMUBUF() && instr->definitions.size() == 1 && instr->operands.size() == 4) {
1377       return 3;
1378    } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
1379               !instr->operands[2].isUndefined()) {
1380       return 2;
1381    }
1382    return -1;
1383 }
1384 
1385 uint8_t
get_vmem_type(enum amd_gfx_level gfx_level,Instruction * instr)1386 get_vmem_type(enum amd_gfx_level gfx_level, Instruction* instr)
1387 {
1388    if (instr->opcode == aco_opcode::image_bvh64_intersect_ray)
1389       return vmem_bvh;
1390    else if (gfx_level >= GFX12 && instr->opcode == aco_opcode::image_msaa_load)
1391       return vmem_sampler;
1392    else if (instr->isMIMG() && !instr->operands[1].isUndefined() &&
1393             instr->operands[1].regClass() == s4)
1394       return vmem_sampler;
1395    else if (instr->isVMEM() || instr->isScratch() || instr->isGlobal())
1396       return vmem_nosampler;
1397    return 0;
1398 }
1399 
1400 unsigned
parse_vdst_wait(Instruction * instr)1401 parse_vdst_wait(Instruction* instr)
1402 {
1403    if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP())
1404       return 0;
1405    else if (instr->isLDSDIR())
1406       return instr->ldsdir().wait_vdst;
1407    else if (instr->opcode == aco_opcode::s_waitcnt_depctr)
1408       return (instr->salu().imm >> 12) & 0xf;
1409    else
1410       return 15;
1411 }
1412 
/* Inserts "s_sendmsg sendmsg_dealloc_vgprs" (preceded by an s_nop) before the
 * final s_endpgm of the last block on GFX11+, so the wave can release its
 * VGPRs early. Returns false when the insertion is skipped (pre-GFX11,
 * scratch in use, or the affected GFX11.5 stages). */
bool
dealloc_vgprs(Program* program)
{
   if (program->gfx_level < GFX11)
      return false;

   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is a in-progress scratch
    * store. */
   if (uses_scratch(program))
      return false;

   /* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
    * a wait after exports. There might still be pending VMEM stores for PS parameter exports,
    * except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
    * pending VMEM stores or exports if we insert the sendmsg for these stages. */
   if (program->gfx_level == GFX11_5 && (program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER ||
                                         program->stage.hw == AC_HW_PIXEL_SHADER))
      return false;

   Block& block = program->blocks.back();

   /* don't bother checking if there is a pending VMEM store or export: there almost always is */
   Builder bld(program);
   if (!block.instructions.empty() && block.instructions.back()->opcode == aco_opcode::s_endpgm) {
      bld.reset(&block.instructions, block.instructions.begin() + (block.instructions.size() - 1));
      /* Due to a hazard, an s_nop is needed before "s_sendmsg sendmsg_dealloc_vgprs". */
      bld.sopp(aco_opcode::s_nop, 0);
      bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
   }

   return true;
}
1445 
1446 bool
isTrans() const1447 Instruction::isTrans() const noexcept
1448 {
1449    return instr_info.classes[(int)opcode] == instr_class::valu_transcendental32 ||
1450           instr_info.classes[(int)opcode] == instr_class::valu_double_transcendental ||
1451           instr_info.classes[(int)opcode] == instr_class::valu_pseudo_scalar_trans;
1452 }
1453 
/* Returns the size in bytes of the instruction struct used for the given
 * format, i.e. how much storage create_instruction() reserves before the
 * operand/definition arrays. */
size_t
get_instr_data_size(Format format)
{
   switch (format) {
   case Format::SOP1:
   case Format::SOP2:
   case Format::SOPC:
   case Format::SOPK:
   case Format::SOPP: return sizeof(SALU_instruction);
   case Format::SMEM: return sizeof(SMEM_instruction);
   case Format::PSEUDO: return sizeof(Pseudo_instruction);
   case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction);
   case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction);
   case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction);
   case Format::DS: return sizeof(DS_instruction);
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH: return sizeof(FLAT_instruction);
   case Format::LDSDIR: return sizeof(LDSDIR_instruction);
   case Format::MTBUF: return sizeof(MTBUF_instruction);
   case Format::MUBUF: return sizeof(MUBUF_instruction);
   case Format::MIMG: return sizeof(MIMG_instruction);
   case Format::VOPD: return sizeof(VOPD_instruction);
   case Format::VINTERP_INREG: return sizeof(VINTERP_inreg_instruction);
   case Format::VINTRP: return sizeof(VINTRP_instruction);
   case Format::EXP: return sizeof(Export_instruction);
   default:
      /* Remaining formats are VALU encodings, possibly combined with the
       * DPP16/DPP8/SDWA flag bits. */
      if ((uint16_t)format & (uint16_t)Format::DPP16)
         return sizeof(DPP16_instruction);
      else if ((uint16_t)format & (uint16_t)Format::DPP8)
         return sizeof(DPP8_instruction);
      else if ((uint16_t)format & (uint16_t)Format::SDWA)
         return sizeof(SDWA_instruction);
      else
         return sizeof(VALU_instruction);
   }
}
1491 
/* Allocates a zero-initialized instruction from the thread-local instruction
 * buffer. Memory layout, in one contiguous allocation:
 *    [format-specific instruction struct][operands array][definitions array]
 * The operand/definition spans store offsets relative to their own member
 * address, so the instruction can be copied as a blob. */
Instruction*
create_instruction(aco_opcode opcode, Format format, uint32_t num_operands,
                   uint32_t num_definitions)
{
   size_t size = get_instr_data_size(format);
   size_t total_size = size + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);

   void* data = instruction_buffer->allocate(total_size, alignof(uint32_t));
   memset(data, 0, total_size);
   Instruction* inst = (Instruction*)data;

   inst->opcode = opcode;
   inst->format = format;

   /* The operand array starts right after the instruction struct; the
    * definition array follows the operand array. */
   uint16_t operands_offset = size - offsetof(Instruction, operands);
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
1513 
1514 } // namespace aco
1515