xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_shader.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_shader.h"
8 #include "ac_nir.h"
9 #include "ac_rtld.h"
10 #include "nir.h"
11 #include "nir_builder.h"
12 #include "nir_serialize.h"
13 #include "nir_xfb_info.h"
14 #include "si_pipe.h"
15 #include "si_shader_internal.h"
16 #include "sid.h"
17 #include "tgsi/tgsi_from_mesa.h"
18 #include "util/u_memory.h"
19 #include "util/mesa-sha1.h"
20 #include "util/ralloc.h"
21 #include "util/u_upload_mgr.h"
22 
23 #if LLVM_AVAILABLE
24 #include <llvm/Config/llvm-config.h> /* for LLVM_VERSION_MAJOR */
25 #else
26 #define LLVM_VERSION_MAJOR 0
27 #endif
28 
29 static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
30 
31 static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
32 
33 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
34 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
35 
36 /* Get the number of all interpolated inputs */
si_get_ps_num_interp(struct si_shader * ps)37 unsigned si_get_ps_num_interp(struct si_shader *ps)
38 {
39    unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
40    unsigned num_interp =
41       ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);
42 
43    assert(num_interp <= 32);
44    return MIN2(num_interp, 32);
45 }
46 
47 /** Whether the shader runs as a combination of multiple API shaders */
si_is_multi_part_shader(struct si_shader * shader)48 bool si_is_multi_part_shader(struct si_shader *shader)
49 {
50    if (shader->selector->screen->info.gfx_level <= GFX8 ||
51        shader->selector->stage > MESA_SHADER_GEOMETRY)
52       return false;
53 
54    return shader->key.ge.as_ls || shader->key.ge.as_es ||
55           shader->selector->stage == MESA_SHADER_TESS_CTRL ||
56           shader->selector->stage == MESA_SHADER_GEOMETRY;
57 }
58 
59 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
si_is_merged_shader(struct si_shader * shader)60 bool si_is_merged_shader(struct si_shader *shader)
61 {
62    if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
63       return false;
64 
65    return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
66 }
67 
68 /**
69  * Returns a unique index for a semantic name and index. The index must be
70  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
71  * calculated.
72  */
si_shader_io_get_unique_index(unsigned semantic)73 unsigned si_shader_io_get_unique_index(unsigned semantic)
74 {
75    switch (semantic) {
76    case VARYING_SLOT_POS:
77       return SI_UNIQUE_SLOT_POS;
78    default:
79       if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
80          return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);
81 
82       if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
83          return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);
84 
85       assert(!"invalid generic index");
86       return 0;
87 
88    /* Legacy desktop GL varyings. */
89    case VARYING_SLOT_FOGC:
90       return SI_UNIQUE_SLOT_FOGC;
91    case VARYING_SLOT_COL0:
92       return SI_UNIQUE_SLOT_COL0;
93    case VARYING_SLOT_COL1:
94       return SI_UNIQUE_SLOT_COL1;
95    case VARYING_SLOT_BFC0:
96       return SI_UNIQUE_SLOT_BFC0;
97    case VARYING_SLOT_BFC1:
98       return SI_UNIQUE_SLOT_BFC1;
99    case VARYING_SLOT_TEX0:
100    case VARYING_SLOT_TEX1:
101    case VARYING_SLOT_TEX2:
102    case VARYING_SLOT_TEX3:
103    case VARYING_SLOT_TEX4:
104    case VARYING_SLOT_TEX5:
105    case VARYING_SLOT_TEX6:
106    case VARYING_SLOT_TEX7:
107       return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
108    case VARYING_SLOT_CLIP_VERTEX:
109       return SI_UNIQUE_SLOT_CLIP_VERTEX;
110 
111    /* Varyings present in both GLES and desktop GL. */
112    case VARYING_SLOT_CLIP_DIST0:
113       return SI_UNIQUE_SLOT_CLIP_DIST0;
114    case VARYING_SLOT_CLIP_DIST1:
115       return SI_UNIQUE_SLOT_CLIP_DIST1;
116    case VARYING_SLOT_PSIZ:
117       return SI_UNIQUE_SLOT_PSIZ;
118    case VARYING_SLOT_LAYER:
119       return SI_UNIQUE_SLOT_LAYER;
120    case VARYING_SLOT_VIEWPORT:
121       return SI_UNIQUE_SLOT_VIEWPORT;
122    case VARYING_SLOT_PRIMITIVE_ID:
123       return SI_UNIQUE_SLOT_PRIMITIVE_ID;
124    }
125 }
126 
/* Declare the SGPR arguments carrying streamout (transform feedback) state.
 * The declaration order determines which SGPR each value lands in, so it
 * must not be changed. */
static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader)
{
   struct si_shader_selector *sel = shader->selector;

   if (shader->selector->screen->info.gfx_level >= GFX11) {
      /* NGG streamout. */
      if (sel->stage == MESA_SHADER_TESS_EVAL)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      return;
   }

   /* Streamout SGPRs. */
   if (si_shader_uses_streamout(shader)) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);

      /* A streamout buffer offset is loaded if the stride is non-zero. */
      for (int i = 0; i < 4; i++) {
         if (!sel->info.base.xfb_stride[i])
            continue;

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
      }
   } else if (sel->stage == MESA_SHADER_TESS_EVAL) {
      /* NOTE(review): presumably a placeholder keeping the TES argument
       * layout stable when streamout is unused — confirm against callers. */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }
}
154 
si_get_max_workgroup_size(const struct si_shader * shader)155 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
156 {
157    gl_shader_stage stage = shader->is_gs_copy_shader ?
158       MESA_SHADER_VERTEX : shader->selector->stage;
159 
160    switch (stage) {
161    case MESA_SHADER_VERTEX:
162    case MESA_SHADER_TESS_EVAL:
163       /* Use the largest workgroup size for streamout */
164       if (shader->key.ge.as_ngg)
165          return si_shader_uses_streamout(shader) ? 256 : 128;
166 
167       /* As part of merged shader. */
168       return shader->selector->screen->info.gfx_level >= GFX9 &&
169          (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : 0;
170 
171    case MESA_SHADER_TESS_CTRL:
172       /* Return this so that LLVM doesn't remove s_barrier
173        * instructions on chips where we use s_barrier. */
174       return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0;
175 
176    case MESA_SHADER_GEOMETRY:
177       /* GS can always generate up to 256 vertices. */
178       return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : 0;
179 
180    case MESA_SHADER_COMPUTE:
181       break; /* see below */
182 
183    default:
184       return 0;
185    }
186 
187    /* Compile a variable block size using the maximum variable size. */
188    if (shader->selector->info.base.workgroup_size_variable)
189       return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
190 
191    uint16_t *local_size = shader->selector->info.base.workgroup_size;
192    unsigned max_work_group_size = (uint32_t)local_size[0] *
193                                   (uint32_t)local_size[1] *
194                                   (uint32_t)local_size[2];
195    assert(max_work_group_size);
196    return max_work_group_size;
197 }
198 
declare_const_and_shader_buffers(struct si_shader_args * args,struct si_shader * shader,bool assign_params)199 static void declare_const_and_shader_buffers(struct si_shader_args *args,
200                                              struct si_shader *shader,
201                                              bool assign_params)
202 {
203    enum ac_arg_type const_shader_buf_type;
204 
205    if (shader->selector->info.base.num_ubos == 1 &&
206        shader->selector->info.base.num_ssbos == 0)
207       const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
208    else
209       const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
210 
211    ac_add_arg(
212       &args->ac, AC_ARG_SGPR, 1, const_shader_buf_type,
213       assign_params ? &args->const_and_shader_buffers : &args->other_const_and_shader_buffers);
214 }
215 
/* Declare the SGPR pointing to the combined sampler+image descriptor array.
 * assign_params selects the current-stage field vs. the "other"-stage field
 * used by merged shaders. */
static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              assign_params ? &args->samplers_and_images : &args->other_samplers_and_images);
}
221 
/* Declare the per-stage descriptor pointers: const/shader buffers first,
 * then samplers/images. assign_params is forwarded to both helpers to pick
 * current-stage vs. "other"-stage destination fields. */
static void declare_per_stage_desc_pointers(struct si_shader_args *args,
                                            struct si_shader *shader,
                                            bool assign_params)
{
   declare_const_and_shader_buffers(args, shader, assign_params);
   declare_samplers_and_images(args, assign_params);
}
229 
/* Declare descriptor pointers shared by all stages: the internal binding
 * table and the bindless sampler/image descriptor array. */
static void declare_global_desc_pointers(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &args->bindless_samplers_and_images);
}
236 
declare_vb_descriptor_input_sgprs(struct si_shader_args * args,struct si_shader * shader)237 static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
238                                               struct si_shader *shader)
239 {
240    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);
241 
242    unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs;
243    if (num_vbos_in_user_sgprs) {
244       unsigned user_sgprs = args->ac.num_sgprs_used;
245 
246       if (si_is_merged_shader(shader))
247          user_sgprs -= 8;
248       assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
249 
250       /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
251       for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
252          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
253 
254       assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors));
255       for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
256          ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]);
257    }
258 }
259 
/* Declare the VS input VGPRs (vertex id, instance id, etc.).
 * The layout differs per GFX generation and per merged-stage configuration;
 * the declaration order is significant and must not be changed. */
static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);

   if (shader->selector->screen->info.gfx_level >= GFX12) {
      /* GFX12: only vertex id + instance id. */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else if (shader->key.ge.as_ls) {
      /* VS compiled as LS (first half of merged LS-HS). */
      if (shader->selector->screen->info.gfx_level >= GFX11) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else if (shader->selector->screen->info.gfx_level >= GFX10) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (shader->selector->screen->info.gfx_level >= GFX10) {
      /* GFX10-11 standalone/NGG VS. */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
                 /* user vgpr or PrimID (legacy) */
                 shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else {
      /* GFX6-9. */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }
}
292 
/* Declare the SGPR inputs of the internal blit vertex shader.
 * Three SGPRs (two packed rect corners + depth) are always declared; then,
 * depending on blit_sgprs_amd, either 4 color SGPRs or 6 texcoord SGPRs
 * follow, plus an attribute-ring address SGPR on GFX11+. */
static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args)
{
   bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */
   /* NOTE(review): upstream comment repeats "x1, y1" here; this is
    * presumably the second rect corner (x2, y2) — confirm. */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                  /* i16 x2, y2 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);                /* depth */

   if (shader->selector->info.base.vs.blit_sgprs_amd ==
       SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   } else if (shader->selector->info.base.vs.blit_sgprs_amd ==
              SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   }
}
321 
/* Declare the fixed TES input VGPRs: tess coordinates (u, v), the relative
 * patch id, and the patch id. Declaration order is significant. */
static void declare_tes_input_vgprs(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}
329 
enum
{
   /* Convenient merged shader definitions.
    * Pseudo-stage values used alongside gl_shader_stage; they start at
    * MESA_ALL_SHADER_STAGES so they cannot collide with real stages. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES, /* LS or HS */
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,              /* ES or GS */
};
336 
/* Add an argument while asserting it lands at the expected index (idx),
 * keeping hardware-defined argument layouts (e.g. the PS SI_PARAM_* slots)
 * in sync with the declarations. */
void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}
343 
si_init_shader_args(struct si_shader * shader,struct si_shader_args * args)344 void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
345 {
346    unsigned i, num_returns, num_return_sgprs;
347    unsigned num_prolog_vgprs = 0;
348    struct si_shader_selector *sel = shader->selector;
349    unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : sel->stage;
350    unsigned stage_case = stage;
351 
352    memset(args, 0, sizeof(*args));
353 
354    /* Set MERGED shaders. */
355    if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
356       if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
357          stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
358       else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
359          stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
360    }
361 
362    switch (stage_case) {
363    case MESA_SHADER_VERTEX:
364       declare_global_desc_pointers(args);
365 
366       if (sel->info.base.vs.blit_sgprs_amd) {
367          declare_vs_blit_inputs(shader, args);
368       } else {
369          declare_per_stage_desc_pointers(args, shader, true);
370          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
371 
372          if (shader->is_gs_copy_shader) {
373             declare_streamout_params(args, shader);
374          } else {
375             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
376             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
377             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
378             declare_vb_descriptor_input_sgprs(args, shader);
379 
380             if (shader->key.ge.as_es) {
381                ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
382             } else if (shader->key.ge.as_ls) {
383                /* no extra parameters */
384             } else {
385                declare_streamout_params(args, shader);
386             }
387          }
388       }
389 
390       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
391       if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
392          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
393 
394       /* VGPRs */
395       declare_vs_input_vgprs(args, shader);
396 
397       break;
398 
399    case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
400       declare_global_desc_pointers(args);
401       declare_per_stage_desc_pointers(args, shader, true);
402       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
403       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
404       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
405       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
406       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
407 
408       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
409       if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
410          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
411 
412       /* VGPRs */
413       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
414       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
415       break;
416 
417    case SI_SHADER_MERGED_VERTEX_TESSCTRL:
418       /* Merged stages have 8 system SGPRs at the beginning. */
419       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
420       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_HS */
421       declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_TESS_CTRL);
422       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
423       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
424       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
425       if (sel->screen->info.gfx_level >= GFX11)
426          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
427       else
428          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
429       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
430       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
431 
432       declare_global_desc_pointers(args);
433       declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_VERTEX);
434 
435       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
436       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
437       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
438       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
439       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
440       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
441 
442       /* VGPRs (first TCS, then VS) */
443       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
444       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
445 
446       if (stage == MESA_SHADER_VERTEX) {
447          declare_vs_input_vgprs(args, shader);
448 
449          /* Need to keep LS/HS arg index same for shared args when ACO,
450           * so this is not able to be before shared VGPRs.
451           */
452          declare_vb_descriptor_input_sgprs(args, shader);
453 
454          /* LS return values are inputs to the TCS main shader part. */
455          if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) {
456             for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
457                ac_add_return(&args->ac, AC_ARG_SGPR);
458             for (i = 0; i < 2; i++)
459                ac_add_return(&args->ac, AC_ARG_VGPR);
460 
461             /* VS outputs passed via VGPRs to TCS. */
462             if (shader->key.ge.opt.same_patch_vertices && !sel->info.base.use_aco_amd) {
463                unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
464                for (i = 0; i < num_outputs * 4; i++)
465                   ac_add_return(&args->ac, AC_ARG_VGPR);
466             }
467          }
468       } else {
469          /* TCS inputs are passed via VGPRs from VS. */
470          if (shader->key.ge.opt.same_patch_vertices && !sel->info.base.use_aco_amd) {
471             unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written_before_tes_gs);
472             for (i = 0; i < num_inputs * 4; i++)
473                ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
474          }
475       }
476       break;
477 
478    case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
479       /* Merged stages have 8 system SGPRs at the beginning. */
480       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
481       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_GS */
482       declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_GEOMETRY);
483 
484       if (shader->key.ge.as_ngg)
485          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
486       else
487          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
488 
489       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
490       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
491       if (sel->screen->info.gfx_level >= GFX11)
492          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset);
493       else
494          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
495       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
496       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
497 
498       declare_global_desc_pointers(args);
499       if (stage != MESA_SHADER_VERTEX || !sel->info.base.vs.blit_sgprs_amd) {
500          declare_per_stage_desc_pointers(
501             args, shader, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL));
502       }
503 
504       if (stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd) {
505          declare_vs_blit_inputs(shader, args);
506       } else {
507          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
508 
509          if (stage == MESA_SHADER_VERTEX) {
510             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
511             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
512             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
513          } else if (stage == MESA_SHADER_TESS_EVAL) {
514             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
515             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
516             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
517          } else {
518             /* GS */
519             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
520             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
521             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
522          }
523 
524          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info);
525          if (sel->screen->info.gfx_level >= GFX11)
526             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address);
527          else
528             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
529       }
530 
531       /* VGPRs (first GS, then VS/TES) */
532       if (sel->screen->info.gfx_level >= GFX12) {
533          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
534          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
535          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
536       } else {
537          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
538          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
539          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
540          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
541          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
542       }
543 
544       if (stage == MESA_SHADER_VERTEX) {
545          declare_vs_input_vgprs(args, shader);
546 
547          /* Need to keep ES/GS arg index same for shared args when ACO,
548           * so this is not able to be before shared VGPRs.
549           */
550          if (!sel->info.base.vs.blit_sgprs_amd)
551             declare_vb_descriptor_input_sgprs(args, shader);
552       } else if (stage == MESA_SHADER_TESS_EVAL) {
553          declare_tes_input_vgprs(args);
554       }
555 
556       if (shader->key.ge.as_es && !shader->is_monolithic &&
557           (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) {
558          /* ES return values are inputs to GS. */
559          for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
560             ac_add_return(&args->ac, AC_ARG_SGPR);
561          for (i = 0; i < (sel->screen->info.gfx_level >= GFX12 ? 3 : 5); i++)
562             ac_add_return(&args->ac, AC_ARG_VGPR);
563       }
564       break;
565 
566    case MESA_SHADER_TESS_EVAL:
567       declare_global_desc_pointers(args);
568       declare_per_stage_desc_pointers(args, shader, true);
569       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
570       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
571       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
572 
573       if (shader->key.ge.as_es) {
574          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
575          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
576          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
577       } else {
578          declare_streamout_params(args, shader);
579          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
580       }
581 
582       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
583       if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
584          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
585 
586       /* VGPRs */
587       declare_tes_input_vgprs(args);
588       break;
589 
590    case MESA_SHADER_GEOMETRY:
591       declare_global_desc_pointers(args);
592       declare_per_stage_desc_pointers(args, shader, true);
593       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
594       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);
595 
596       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
597       if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
598          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
599 
600       /* VGPRs */
601       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
602       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
603       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
604       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
605       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
606       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
607       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
608       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
609       break;
610 
611    case MESA_SHADER_FRAGMENT:
612       declare_global_desc_pointers(args);
613       declare_per_stage_desc_pointers(args, shader, true);
614       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference,
615                          SI_PARAM_ALPHA_REF);
616       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask,
617                          SI_PARAM_PRIM_MASK);
618 
619       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample,
620                          SI_PARAM_PERSP_SAMPLE);
621       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center,
622                          SI_PARAM_PERSP_CENTER);
623       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid,
624                          SI_PARAM_PERSP_CENTROID);
625       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
626       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample,
627                          SI_PARAM_LINEAR_SAMPLE);
628       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center,
629                          SI_PARAM_LINEAR_CENTER);
630       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid,
631                          SI_PARAM_LINEAR_CENTROID);
632       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
633       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0],
634                          SI_PARAM_POS_X_FLOAT);
635       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1],
636                          SI_PARAM_POS_Y_FLOAT);
637       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2],
638                          SI_PARAM_POS_Z_FLOAT);
639       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3],
640                          SI_PARAM_POS_W_FLOAT);
641       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face,
642                          SI_PARAM_FRONT_FACE);
643       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary,
644                          SI_PARAM_ANCILLARY);
645       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage,
646                          SI_PARAM_SAMPLE_COVERAGE);
647       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
648                          SI_PARAM_POS_FIXED_PT);
649 
650       if (sel->info.base.use_aco_amd) {
651          ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);
652 
653          /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
654          if (sel->screen->info.gfx_level < GFX11)
655             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
656       }
657 
658       /* Monolithic PS emit prolog and epilog in NIR directly. */
659       if (!shader->is_monolithic) {
660          /* Color inputs from the prolog. */
661          if (shader->selector->info.colors_read) {
662             unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
663 
664             for (i = 0; i < num_color_elements; i++)
665                ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start);
666 
667             num_prolog_vgprs += num_color_elements;
668          }
669 
670          /* Outputs for the epilog. */
671          num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
672          num_returns =
673             num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
674             shader->selector->info.writes_z + shader->selector->info.writes_stencil +
675             shader->ps.writes_samplemask + 1 /* SampleMaskIn */;
676 
677          for (i = 0; i < num_return_sgprs; i++)
678             ac_add_return(&args->ac, AC_ARG_SGPR);
679          for (; i < num_returns; i++)
680             ac_add_return(&args->ac, AC_ARG_VGPR);
681       }
682       break;
683 
684    case MESA_SHADER_COMPUTE:
685       declare_global_desc_pointers(args);
686       declare_per_stage_desc_pointers(args, shader, true);
687       if (shader->selector->info.uses_grid_size)
688          ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
689       if (shader->selector->info.uses_variable_block_size)
690          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size);
691 
692       unsigned cs_user_data_dwords =
693          shader->selector->info.base.cs.user_data_components_amd;
694       if (cs_user_data_dwords) {
695          ac_add_arg(&args->ac, AC_ARG_SGPR, MIN2(cs_user_data_dwords, 4), AC_ARG_INT,
696                     &args->cs_user_data[0]);
697          if (cs_user_data_dwords > 4) {
698             ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords - 4, AC_ARG_INT,
699                        &args->cs_user_data[1]);
700          }
701       }
702 
703       /* Some descriptors can be in user SGPRs. */
704       /* Shader buffers in user SGPRs. */
705       for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
706          while (args->ac.num_sgprs_used % 4 != 0)
707             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
708 
709          ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]);
710       }
711       /* Images in user SGPRs. */
712       for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
713          unsigned num_sgprs = BITSET_TEST(shader->selector->info.base.image_buffers, i) ? 4 : 8;
714 
715          while (args->ac.num_sgprs_used % num_sgprs != 0)
716             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
717 
718          ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]);
719       }
720 
721       /* Hardware SGPRs. */
722       for (i = 0; i < 3; i++) {
723          if (shader->selector->info.uses_block_id[i]) {
724             /* GFX12 loads workgroup IDs into ttmp registers, so they are not input SGPRs, but we
725              * still need to set this to indicate that they are enabled (for ac_nir_to_llvm).
726              */
727             if (sel->screen->info.gfx_level >= GFX12)
728                args->ac.workgroup_ids[i].used = true;
729             else
730                ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
731          }
732       }
733       if (shader->selector->info.uses_tg_size)
734          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);
735 
736       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
737       if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
738          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
739 
740       /* Hardware VGPRs. */
741       /* Thread IDs are packed in VGPR0, 10 bits per component or stored in 3 separate VGPRs */
742       if (sel->screen->info.gfx_level >= GFX11 ||
743           (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200))
744          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids);
745       else
746          ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.local_invocation_ids);
747       break;
748    default:
749       assert(0 && "unimplemented shader");
750       return;
751    }
752 
753    shader->info.num_input_sgprs = args->ac.num_sgprs_used;
754    shader->info.num_input_vgprs = args->ac.num_vgprs_used;
755 
756    assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
757    shader->info.num_input_vgprs -= num_prolog_vgprs;
758 }
759 
/* Return the hardware LDS allocation granularity in bytes for the given
 * GPU generation and shader stage.
 */
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   enum amd_gfx_level level = screen->info.gfx_level;

   if (level >= GFX11 && stage == MESA_SHADER_FRAGMENT)
      return 1024;
   if (level >= GFX7)
      return 512;
   return 256;
}
765 
si_shader_binary_open(struct si_screen * screen,struct si_shader * shader,struct ac_rtld_binary * rtld)766 bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
767                            struct ac_rtld_binary *rtld)
768 {
769    const struct si_shader_selector *sel = shader->selector;
770    const char *part_elfs[5];
771    size_t part_sizes[5];
772    unsigned num_parts = 0;
773 
774 #define add_part(shader_or_part)                                                                   \
775    if (shader_or_part) {                                                                           \
776       part_elfs[num_parts] = (shader_or_part)->binary.code_buffer;                                 \
777       part_sizes[num_parts] = (shader_or_part)->binary.code_size;                                  \
778       num_parts++;                                                                                 \
779    }
780 
781    add_part(shader->prolog);
782    add_part(shader->previous_stage);
783    add_part(shader);
784    add_part(shader->epilog);
785 
786 #undef add_part
787 
788    struct ac_rtld_symbol lds_symbols[2];
789    unsigned num_lds_symbols = 0;
790 
791    if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
792        (sel->stage == MESA_SHADER_GEOMETRY ||
793         (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
794       struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
795       sym->name = "esgs_ring";
796       sym->size = shader->gs_info.esgs_ring_size * 4;
797       sym->align = 64 * 1024;
798    }
799 
800    if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
801       struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
802       sym->name = "ngg_emit";
803       sym->size = shader->ngg.ngg_emit_size * 4;
804       sym->align = 4;
805    }
806 
807    bool ok = ac_rtld_open(
808       rtld, (struct ac_rtld_open_info){.info = &screen->info,
809                                        .options =
810                                           {
811                                              .halt_at_entry = screen->options.halt_shaders,
812                                              .waitcnt_wa = num_parts > 1 &&
813                                                            screen->info.needs_llvm_wait_wa,
814                                           },
815                                        .shader_type = sel->stage,
816                                        .wave_size = shader->wave_size,
817                                        .num_parts = num_parts,
818                                        .elf_ptrs = part_elfs,
819                                        .elf_sizes = part_sizes,
820                                        .num_shared_lds_symbols = num_lds_symbols,
821                                        .shared_lds_symbols = lds_symbols});
822 
823    if (rtld->lds_size > 0) {
824       unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
825       shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
826    }
827 
828    return ok;
829 }
830 
get_shader_binaries(struct si_shader * shader,struct si_shader_binary * bin[4])831 static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
832 {
833    unsigned num_bin = 0;
834 
835    if (shader->prolog)
836       bin[num_bin++] = &shader->prolog->binary;
837 
838    if (shader->previous_stage)
839       bin[num_bin++] = &shader->previous_stage->binary;
840 
841    bin[num_bin++] = &shader->binary;
842 
843    if (shader->epilog)
844       bin[num_bin++] = &shader->epilog->binary;
845 
846    return num_bin;
847 }
848 
849 /* si_get_shader_binary_size should only be called once per shader
850  * and the result should be stored in shader->complete_shader_binary_size.
851  */
si_get_shader_binary_size(struct si_screen * screen,struct si_shader * shader)852 unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
853 {
854    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
855       struct ac_rtld_binary rtld;
856       si_shader_binary_open(screen, shader, &rtld);
857       uint64_t size = rtld.exec_size;
858       ac_rtld_close(&rtld);
859       return size;
860    } else {
861       struct si_shader_binary *bin[4];
862       unsigned num_bin = get_shader_binaries(shader, bin);
863 
864       unsigned size = 0;
865       for (unsigned i = 0; i < num_bin; i++) {
866          assert(bin[i]->type == SI_SHADER_BINARY_RAW);
867          size += bin[i]->exec_size;
868       }
869       return size;
870    }
871 }
872 
si_get_shader_prefetch_size(struct si_shader * shader)873 unsigned si_get_shader_prefetch_size(struct si_shader *shader)
874 {
875    struct si_screen *sscreen = shader->selector->screen;
876    /* This excludes arrays of constants after instructions. */
877    unsigned exec_size =
878       ac_align_shader_binary_for_prefetch(&sscreen->info,
879                                           shader->complete_shader_binary_size);
880 
881    /* INST_PREF_SIZE uses 128B granularity.
882     * - GFX11: max 128 * 63 = 8064
883     * - GFX12: max 128 * 255 = 32640
884     */
885    unsigned max_pref_size = shader->selector->screen->info.gfx_level >= GFX12 ? 255 : 63;
886    unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);
887 
888    return MIN2(max_pref_size, exec_size_gran128);
889 }
890 
si_get_external_symbol(enum amd_gfx_level gfx_level,void * data,const char * name,uint64_t * value)891 bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
892                             uint64_t *value)
893 {
894    uint64_t *scratch_va = data;
895 
896    if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
897       *value = (uint32_t)*scratch_va;
898       return true;
899    }
900    if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
901       /* Enable scratch coalescing. */
902       *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);
903 
904       if (gfx_level >= GFX11)
905          *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
906       else
907          *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
908       return true;
909    }
910 
911    return false;
912 }
913 
/**
 * Prepare GPU memory for a shader-binary upload and return a CPU-writable
 * pointer for the code.
 *
 * Two paths:
 *  - dma_upload: the code is written to a staging buffer first and copied
 *    to VRAM later by post_upload_binary(); *upload_ctx, *staging and
 *    *staging_offset are filled in for that copy.
 *  - direct: the shader BO is CPU-mapped and the returned pointer points
 *    into it (at bo_offset).
 *
 * If bo_offset >= 0, the existing shader->bo is reused at that offset
 * (SQTT uploads whole pipelines contiguously); otherwise a fresh BO is
 * allocated and bo_offset is reset to 0.
 *
 * Returns NULL on allocation or mapping failure.
 */
static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               unsigned binary_size, bool dma_upload,
                               struct si_context **upload_ctx,
                               struct pipe_resource **staging,
                               unsigned *staging_offset,
                               int64_t bo_offset)
{
   /* Pad the allocation so the prefetch past the end of the code stays in bounds. */
   unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);

   if (bo_offset >= 0) {
      /* sqtt needs to upload shaders as a pipeline, where all shaders
       * are contiguous in memory.
       * In this case, bo_offset will be positive and we don't have to
       * realloc a new bo.
       */
      shader->gpu_address = shader->bo->gpu_address + bo_offset;
      dma_upload = false;
   } else {
      /* Drop any previous BO and allocate a new one sized for this binary. */
      si_resource_reference(&shader->bo, NULL);
      shader->bo = si_aligned_buffer_create(
         &sscreen->b,
         SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
         (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
         PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
      if (!shader->bo)
         return NULL;

      shader->gpu_address = shader->bo->gpu_address;
      bo_offset = 0;
   }

   if (dma_upload) {
      /* First upload into a staging buffer. */
      *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);

      void *ret;
      u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
                     staging_offset, staging, &ret);
      if (!ret)
         /* Allocation failed: release the aux context before returning NULL. */
         si_put_aux_context_flush(&sscreen->aux_context.shader_upload);

      return ret;
   } else {
      /* Unsynchronized map is safe: the BO is either brand new or an SQTT
       * pipeline BO written before first use.
       */
      void *ptr = sscreen->ws->buffer_map(sscreen->ws,
         shader->bo->buf, NULL,
         PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
      if (!ptr)
         return NULL;

      return ptr + bo_offset;
   }
}
966 
/**
 * Finish a shader-binary upload started by pre_upload_binary():
 * either CP-DMA-copy the staging buffer into the shader BO (dma_upload)
 * or simply unmap the directly-mapped BO.
 *
 * With DBG(SQTT), also keeps a CPU copy of the uploaded code for later
 * shader-ISA dumps.
 */
static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               void *code, unsigned code_size,
                               unsigned binary_size, bool dma_upload,
                               struct si_context *upload_ctx,
                               struct pipe_resource *staging,
                               unsigned staging_offset)
{
   if (sscreen->debug_flags & DBG(SQTT)) {
      /* Remember the uploaded code.
       * Bug fix: check the malloc result before memcpy'ing into it; on OOM
       * we simply skip keeping the copy (best effort for debugging only).
       */
      shader->binary.uploaded_code = malloc(code_size);
      if (shader->binary.uploaded_code) {
         shader->binary.uploaded_code_size = code_size;
         memcpy(shader->binary.uploaded_code, code, code_size);
      } else {
         shader->binary.uploaded_code_size = 0;
      }
   }

   if (dma_upload) {
      /* Then copy from the staging buffer to VRAM.
       *
       * We can't use the upload copy in si_buffer_transfer_unmap because that might use
       * a compute shader, and we can't use shaders in the code that is responsible for making
       * them available.
       */
      si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
                            binary_size);
      si_barrier_after_simple_buffer_op(upload_ctx, 0, &shader->bo->b.b, staging);
      upload_ctx->barrier_flags |= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2;

#if 0 /* debug: validate whether the copy was successful */
      uint32_t *dst_binary = malloc(binary_size);
      uint32_t *src_binary = (uint32_t*)code;
      pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
      puts("dst_binary == src_binary:");
      for (unsigned i = 0; i < binary_size / 4; i++) {
         printf("   %08x == %08x\n", dst_binary[i], src_binary[i]);
      }
      free(dst_binary);
      exit(0);
#endif

      si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
      pipe_resource_reference(&staging, NULL);
   } else {
      sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
   }
}
1011 
upload_binary_elf(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va,bool dma_upload,int64_t bo_offset)1012 static int upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
1013                              uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
1014 {
1015    struct ac_rtld_binary binary;
1016    if (!si_shader_binary_open(sscreen, shader, &binary))
1017       return -1;
1018 
1019    struct si_context *upload_ctx = NULL;
1020    struct pipe_resource *staging = NULL;
1021    unsigned staging_offset = 0;
1022 
1023    void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
1024                                     &upload_ctx, &staging, &staging_offset,
1025                                     bo_offset);
1026    if (!rx_ptr)
1027       return -1;
1028 
1029    /* Upload. */
1030    struct ac_rtld_upload_info u = {};
1031    u.binary = &binary;
1032    u.get_external_symbol = si_get_external_symbol;
1033    u.cb_data = &scratch_va;
1034    u.rx_va = shader->gpu_address;
1035    u.rx_ptr = rx_ptr;
1036 
1037    int size = ac_rtld_upload(&u);
1038 
1039    post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
1040                       upload_ctx, staging, staging_offset);
1041 
1042    ac_rtld_close(&binary);
1043 
1044    return size;
1045 }
1046 
calculate_needed_lds_size(struct si_screen * sscreen,struct si_shader * shader)1047 static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
1048 {
1049    gl_shader_stage stage =
1050       shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;
1051 
1052    if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
1053        (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
1054       unsigned size_in_dw = shader->gs_info.esgs_ring_size;
1055 
1056       if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
1057          size_in_dw += shader->ngg.ngg_emit_size;
1058 
1059       if (shader->key.ge.as_ngg) {
1060          unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader);
1061          if (scratch_dw_size) {
1062             /* scratch base address needs to be 8 byte aligned */
1063             size_in_dw = ALIGN(size_in_dw, 2);
1064             size_in_dw += scratch_dw_size;
1065          }
1066       }
1067 
1068       shader->config.lds_size =
1069          DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
1070    }
1071 }
1072 
/**
 * Upload a RAW (non-ELF, ACO-produced) shader binary to GPU memory.
 *
 * All parts' executable code is packed first, followed by all parts'
 * constant data; symbol references in the code are patched with
 * si_aco_resolve_symbols() to account for that layout.
 *
 * Returns the total uploaded size in bytes, or -1 on failure.
 */
static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct si_shader_binary *bin[4];
   unsigned num_bin = get_shader_binaries(shader, bin);

   /* Total size (code + data) and executable-only size over all parts. */
   unsigned code_size = 0, exec_size = 0;
   for (unsigned i = 0; i < num_bin; i++) {
      assert(bin[i]->type == SI_SHADER_BINARY_RAW);
      code_size += bin[i]->code_size;
      exec_size += bin[i]->exec_size;
   }

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

   /* Code is packed at the front, data after all code. */
   unsigned exec_offset = 0, data_offset = exec_size;
   for (unsigned i = 0; i < num_bin; i++) {
      memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);

      if (bin[i]->num_symbols) {
         /* Offset needed to add to const data symbol because of inserting other
          * shader part between exec code and const data.
          */
         unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;

         /* Prolog and epilog have no symbols. */
         struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
         assert(sh && bin[i] == &sh->binary);

         si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
                                scratch_va, const_offset);
      }

      exec_offset += bin[i]->exec_size;

      /* Copy this part's trailing constant data (if any) into the data region. */
      unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
      if (data_size) {
         memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
         data_offset += data_size;
      }
   }

   post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   /* RAW binaries have no rtld link step, so compute the LDS size here. */
   calculate_needed_lds_size(sscreen, shader);
   return code_size;
}
1129 
si_shader_binary_upload_at(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va,int64_t bo_offset)1130 int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
1131                                uint64_t scratch_va, int64_t bo_offset)
1132 {
1133    bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
1134                      sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
1135                      bo_offset < 0;
1136 
1137    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
1138       return upload_binary_elf(sscreen, shader, scratch_va, dma_upload, bo_offset);
1139    } else {
1140       assert(shader->binary.type == SI_SHADER_BINARY_RAW);
1141       return upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
1142    }
1143 }
1144 
/* Convenience wrapper: upload a shader binary into a newly allocated BO
 * (bo_offset = -1). Returns the uploaded size in bytes, or -1 on failure.
 */
int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                            uint64_t scratch_va)
{
   const int64_t alloc_new_bo = -1;
   return si_shader_binary_upload_at(sscreen, shader, scratch_va, alloc_new_bo);
}
1150 
print_disassembly(const char * disasm,size_t nbytes,const char * name,FILE * file,struct util_debug_callback * debug)1151 static void print_disassembly(const char *disasm, size_t nbytes,
1152                               const char *name, FILE *file,
1153                               struct util_debug_callback *debug)
1154 {
1155    if (debug && debug->debug_message) {
1156       /* Very long debug messages are cut off, so send the
1157        * disassembly one line at a time. This causes more
1158        * overhead, but on the plus side it simplifies
1159        * parsing of resulting logs.
1160        */
1161       util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
1162 
1163       uint64_t line = 0;
1164       while (line < nbytes) {
1165          int count = nbytes - line;
1166          const char *nl = memchr(disasm + line, '\n', nbytes - line);
1167          if (nl)
1168             count = nl - (disasm + line);
1169 
1170          if (count) {
1171             util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
1172          }
1173 
1174          line += count + 1;
1175       }
1176 
1177       util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
1178    }
1179 
1180    if (file) {
1181       fprintf(file, "Shader %s disassembly:\n", name);
1182       fprintf(file, "%*s", (int)nbytes, disasm);
1183    }
1184 }
1185 
/**
 * Dump a shader binary's disassembly to "file" and/or the debug callback.
 *
 * RAW binaries carry their disassembly string directly; ELF binaries are
 * opened with rtld and the ".AMDGPU.disasm" section is extracted.
 */
static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   if (binary->type == SI_SHADER_BINARY_RAW) {
      print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
      return;
   }

   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->code_buffer,
                                      .elf_sizes = &binary->code_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   /* The size is later truncated to int for printing; reject overflow. */
   if (nbytes > INT_MAX)
      goto out;

   print_disassembly(disasm, nbytes, name, file, debug);

out:
   /* Single cleanup point for the opened rtld binary. */
   ac_rtld_close(&rtld_binary);
}
1222 
/* Estimate how many waves per SIMD this shader allows -- limited by SGPR,
 * VGPR and LDS usage -- and store the result in shader->info.max_simd_waves.
 * Used for shader-db statistics.
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   /* Start from the hardware maximum and clamp by each resource below. */
   max_simd_waves = sscreen->info.max_waves_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment +
                     align(shader->info.num_ps_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
         /* Divide the workgroup's LDS evenly among its waves. */
         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
         lds_per_wave = (conf->lds_size * lds_increment) /
                        DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
      }
      break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* GFX 10.3 internally:
       * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
       * - aligns LDS to 1024
       *
       * For shader-db stats, set num_vgprs that the hw actually uses.
       */
      unsigned num_vgprs = conf->num_vgprs;
      if (sscreen->info.gfx_level >= GFX10_3) {
         unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
         num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
      } else {
         num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
      }

      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
   }

   /* NOTE(review): the /4 presumably converts per-workgroup LDS to per-SIMD
    * (4 SIMDs per CU) -- confirm against the HW docs.
    */
   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}
1291 
/* Emit a one-line "Shader Stats:" debug message with register, code-size,
 * LDS, scratch, wave and output-count statistics, in the format consumed
 * by shader-db.
 */
void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct util_debug_callback *debug)
{
   const struct ac_shader_config *conf = &shader->config;
   /* NOTE(review): indexed by shader->selector->stage; assumes only
    * VS..COMPUTE stages reach this function -- confirm for task/mesh.
    */
   static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};

   if (screen->options.debug_disassembly)
      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
                                 shader->wave_size, debug, "main", NULL);

   unsigned num_ls_outputs = 0;
   unsigned num_hs_outputs = 0;
   unsigned num_es_outputs = 0;
   unsigned num_gs_outputs = 0;
   unsigned num_vs_outputs = 0;
   unsigned num_ps_outputs = 0;

   if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
      /* This doesn't include pos exports because only param exports are interesting
       * for performance and can be optimized.
       */
      if (shader->key.ge.as_ls)
         num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
      else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
         num_hs_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
      else if (shader->key.ge.as_es)
         num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
      else if (shader->gs_copy_shader)
         num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
         num_gs_outputs = shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_VERTEX ||
               shader->selector->stage == MESA_SHADER_TESS_EVAL)
         num_vs_outputs = shader->info.nr_param_exports;
      else
         unreachable("invalid shader key");
   } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      /* Color outputs plus at most one Z/stencil/samplemask export. */
      num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
                       (shader->selector->info.writes_z ||
                        shader->selector->info.writes_stencil ||
                        shader->ps.writes_samplemask);
   }

   util_debug_message(debug, SHADER_INFO,
                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
                      "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
                      "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
                      "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                      num_ls_outputs, num_hs_outputs,
                      util_last_bit64(shader->selector->info.patch_outputs_written),
                      num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
                      shader->selector->info.base.num_inlinable_uniforms,
                      shader->selector->info.has_divergent_loop,
                      stages[shader->selector->stage], shader->wave_size);
}
1351 
/* Return whether a given dump type should be printed for a given stage,
 * based on the screen's debug flags: the stage bit must be set AND at least
 * one of the debug flags associated with the dump type.
 */
bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
                        enum si_shader_dump_type dump_type)
{
   /* For each dump type, the set of debug flags that enables it. */
   static uint64_t dump_type_flags[] = {
      [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
      [SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
      [SI_DUMP_NIR] = DBG(NIR),
      [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
      [SI_DUMP_LLVM_IR] = DBG(LLVM),
      [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
      [SI_DUMP_ACO_IR] = DBG(ACO),
      [SI_DUMP_ASM] = DBG(ASM),
      [SI_DUMP_STATS] = DBG(STATS),
      [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
   };
   assert(dump_type < ARRAY_SIZE(dump_type_flags));

   bool stage_enabled = (sscreen->debug_flags & (1 << stage)) != 0;
   bool type_enabled = (sscreen->debug_flags & dump_type_flags[dump_type]) != 0;

   return stage_enabled && type_enabled;
}
1372 
si_shader_dump_stats(struct si_screen * sscreen,struct si_shader * shader,FILE * file,bool check_debug_option)1373 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1374                                  bool check_debug_option)
1375 {
1376    const struct ac_shader_config *conf = &shader->config;
1377 
1378    if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1379       fprintf(file,
1380               "*** SHADER CONFIG ***\n"
1381               "SPI_PS_INPUT_ADDR = 0x%04x\n"
1382               "SPI_PS_INPUT_ENA  = 0x%04x\n",
1383               conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1384    }
1385 
1386    fprintf(file,
1387            "*** SHADER STATS ***\n"
1388            "SGPRS: %d\n"
1389            "VGPRS: %d\n"
1390            "Spilled SGPRs: %d\n"
1391            "Spilled VGPRs: %d\n"
1392            "Private memory VGPRs: %d\n"
1393            "Code Size: %d bytes\n"
1394            "LDS: %d bytes\n"
1395            "Scratch: %d bytes per wave\n"
1396            "Max Waves: %d\n"
1397            "********************\n\n\n",
1398            conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1399            shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1400            conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
1401            conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1402 }
1403 
si_get_shader_name(const struct si_shader * shader)1404 const char *si_get_shader_name(const struct si_shader *shader)
1405 {
1406    switch (shader->selector->stage) {
1407    case MESA_SHADER_VERTEX:
1408       if (shader->key.ge.as_es)
1409          return "Vertex Shader as ES";
1410       else if (shader->key.ge.as_ls)
1411          return "Vertex Shader as LS";
1412       else if (shader->key.ge.as_ngg)
1413          return "Vertex Shader as ESGS";
1414       else
1415          return "Vertex Shader as VS";
1416    case MESA_SHADER_TESS_CTRL:
1417       return "Tessellation Control Shader";
1418    case MESA_SHADER_TESS_EVAL:
1419       if (shader->key.ge.as_es)
1420          return "Tessellation Evaluation Shader as ES";
1421       else if (shader->key.ge.as_ngg)
1422          return "Tessellation Evaluation Shader as ESGS";
1423       else
1424          return "Tessellation Evaluation Shader as VS";
1425    case MESA_SHADER_GEOMETRY:
1426       if (shader->is_gs_copy_shader)
1427          return "GS Copy Shader as VS";
1428       else
1429          return "Geometry Shader";
1430    case MESA_SHADER_FRAGMENT:
1431       return "Pixel Shader";
1432    case MESA_SHADER_COMPUTE:
1433       return "Compute Shader";
1434    default:
1435       return "Unknown Shader";
1436    }
1437 }
1438 
/* Print everything known about a shader to "file": the shader key, the saved
 * LLVM IR (ddebug dumps only), the disassembly of every binary part, and the
 * statistics block.
 *
 * When check_debug_option is true, each section is gated on the corresponding
 * radeonsi debug flags via si_can_dump_shader(); when false, everything
 * available is printed unconditionally.
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                    struct util_debug_callback *debug, FILE *file, bool check_debug_option)
{
   gl_shader_stage stage = shader->selector->stage;

   if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
      si_dump_shader_key(shader, file);

   if (!check_debug_option && shader->binary.llvm_ir_string) {
      /* This is only used with ddebug. */
      if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
      }

      fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
      fprintf(file, "%s\n", shader->binary.llvm_ir_string);
   }

   if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) {
      fprintf(file, "\n%s:\n", si_get_shader_name(shader));

      /* Dump the parts in execution order: prolog, merged previous stage,
       * main part, epilog.
       */
      if (shader->prolog)
         si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
                                    "prolog", file);
      if (shader->previous_stage)
         si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
                                    shader->wave_size, debug, "previous stage", file);
      si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
                                 file);

      if (shader->epilog)
         si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
                                    "epilog", file);
      fprintf(file, "\n");

      si_shader_dump_stats(sscreen, shader, file, check_debug_option);
   }
}
1478 
si_dump_shader_key_vs(const union si_shader_key * key,FILE * f)1479 static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
1480 {
1481    fprintf(f, "  mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
1482    fprintf(f, "  mono.instance_divisor_is_fetched = %u\n",
1483            key->ge.mono.instance_divisor_is_fetched);
1484    fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1485    fprintf(f, "  mono.vs.fix_fetch = {");
1486    for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1487       union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1488       if (i)
1489          fprintf(f, ", ");
1490       if (!fix.bits)
1491          fprintf(f, "0");
1492       else
1493          fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1494                  fix.u.format);
1495    }
1496    fprintf(f, "}\n");
1497 }
1498 
/* Print the full shader key for debugging: the source blake3 hash, the
 * stage-specific key fields, and the optimization fields shared by the
 * geometry-engine stages (kill_outputs, inlined uniforms, etc.).
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const union si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->stage;

   fprintf(f, "SHADER KEY\n");
   fprintf(f, "  source_blake3 = {");
   _mesa_blake3_print(f, shader->selector->info.base.source_blake3);
   fprintf(f, "}\n");

   /* Stage-specific key fields. */
   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, f);
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ls = %u\n", key->ge.as_ls);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* On GFX9+ the VS is merged into the HS, so its fetch key applies here. */
      if (shader->selector->screen->info.gfx_level >= GFX9)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  opt.tes_prim_mode = %u\n", key->ge.opt.tes_prim_mode);
      fprintf(f, "  opt.tes_reads_tess_factors = %u\n", key->ge.opt.tes_reads_tess_factors);
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
      fprintf(f, "  opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         break;

      /* On GFX9+ the ES is merged into the GS; print the VS fetch key when
       * the merged-in stage is a VS.
       */
      if (shader->selector->screen->info.gfx_level >= GFX9 &&
          key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      fprintf(f, "  prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
      fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
      fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
      fprintf(f, "  prolog.force_persp_sample_interp = %u\n",
              key->ps.part.prolog.force_persp_sample_interp);
      fprintf(f, "  prolog.force_linear_sample_interp = %u\n",
              key->ps.part.prolog.force_linear_sample_interp);
      fprintf(f, "  prolog.force_persp_center_interp = %u\n",
              key->ps.part.prolog.force_persp_center_interp);
      fprintf(f, "  prolog.force_linear_center_interp = %u\n",
              key->ps.part.prolog.force_linear_center_interp);
      fprintf(f, "  prolog.bc_optimize_for_persp = %u\n",
              key->ps.part.prolog.bc_optimize_for_persp);
      fprintf(f, "  prolog.bc_optimize_for_linear = %u\n",
              key->ps.part.prolog.bc_optimize_for_linear);
      fprintf(f, "  prolog.samplemask_log_ps_iter = %u\n",
              key->ps.part.prolog.samplemask_log_ps_iter);
      fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n",
              key->ps.part.epilog.spi_shader_col_format);
      fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
      fprintf(f, "  epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
      fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
      fprintf(f, "  epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
      fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
      fprintf(f, "  epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
      fprintf(f, "  epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
      fprintf(f, "  epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
      fprintf(f, "  epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt);
      fprintf(f, "  epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
      fprintf(f, "  mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
      fprintf(f, "  mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
      fprintf(f, "  mono.interpolate_at_sample_force_center = %u\n",
              key->ps.mono.interpolate_at_sample_force_center);
      fprintf(f, "  mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
      fprintf(f, "  mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
      fprintf(f, "  mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   /* Output-killing optimizations only apply to the last pre-rasterization
    * stage (not LS/ES hardware stages).
    */
   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->ge.as_es && !key->ge.as_ls) {
      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
      fprintf(f, "  opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize);
      fprintf(f, "  opt.kill_layer = 0x%x\n", key->ge.opt.kill_layer);
      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
      fprintf(f, "  opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
      fprintf(f, "  opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout);
      fprintf(f, "  mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout);
   }

   /* prefer_mono lives in different key unions for GE stages vs PS. */
   if (stage <= MESA_SHADER_GEOMETRY)
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
   else
      fprintf(f, "  opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);

   /* Inlined uniform values also live in different key unions. */
   if (stage <= MESA_SHADER_GEOMETRY) {
      if (key->ge.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ge.opt.inline_uniforms,
                 key->ge.opt.inlined_uniform_values[0],
                 key->ge.opt.inlined_uniform_values[1],
                 key->ge.opt.inlined_uniform_values[2],
                 key->ge.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   } else {
      if (key->ps.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ps.opt.inline_uniforms,
                 key->ps.opt.inlined_uniform_values[0],
                 key->ps.opt.inlined_uniform_values[1],
                 key->ps.opt.inlined_uniform_values[2],
                 key->ps.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   }
}
1633 
/* Remove output stores that the shader key marks as dead: unread varyings
 * (kill_outputs), point size, layer, and unused clip distances.  LAYER is
 * additionally always removed as a varying because the FS loads it as a
 * system value instead.  Returns true on progress.
 *
 * TODO: convert to nir_shader_instructions_pass */
static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);
   assert(nir->info.stage <= MESA_SHADER_GEOMETRY);

   /* Early out when no kill flags are set.  LAYER writes still force the
    * pass to run because the LAYER varying is always removed below.
    */
   if (!key->ge.opt.kill_outputs &&
       !key->ge.opt.kill_pointsize &&
       !key->ge.opt.kill_layer &&
       !key->ge.opt.kill_clip_distances &&
       !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Kill varyings that the key says the next stage doesn't read. */
         if (nir_slot_is_varying(sem.location) &&
             key->ge.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(sem.location)))
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

         switch (sem.location) {
         case VARYING_SLOT_PSIZ:
            if (key->ge.opt.kill_pointsize)
               progress |= nir_remove_sysval_output(intr);
            break;

         case VARYING_SLOT_CLIP_VERTEX:
            /* TODO: We should only kill specific clip planes as required by kill_clip_distance,
             * not whole gl_ClipVertex. Lower ClipVertex in NIR.
             */
            if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) ==
                SI_USER_CLIP_PLANE_MASK)
               progress |= nir_remove_sysval_output(intr);
            break;

         case VARYING_SLOT_CLIP_DIST0:
         case VARYING_SLOT_CLIP_DIST1:
            if (key->ge.opt.kill_clip_distances) {
               assert(nir_intrinsic_src_type(intr) == nir_type_float32);
               /* Scalar stores: the killed clip distance is identified by
                * slot * 4 + component.
                */
               unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
                                nir_intrinsic_component(intr);

               if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index))
                  progress |= nir_remove_sysval_output(intr);
            }
            break;

         case VARYING_SLOT_LAYER:
            /* LAYER is never passed to FS. Instead, we load it there as a system value. */
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

            if (key->ge.opt.kill_layer)
               progress |= nir_remove_sysval_output(intr);
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_control_flow);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
1719 
/* Remove PS output components from NIR if they are disabled by spi_shader_col_format. */
static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   /* No indirect indexing allowed. */
   ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
   assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

   unsigned location = nir_intrinsic_io_semantics(intr).location;
   const union si_shader_key *key = _key;

   switch (location) {
   case FRAG_RESULT_DEPTH:
   case FRAG_RESULT_STENCIL:
      /* Depth/stencil exports are never killed here. */
      return false;

   case FRAG_RESULT_SAMPLE_MASK:
      if (key->ps.part.epilog.kill_samplemask) {
         nir_instr_remove(instr);
         return true;
      }
      return false;
   }

   /* Color outputs. */
   unsigned comp_mask = BITFIELD_MASK(intr->num_components);
   assert(nir_intrinsic_component(intr) == 0);
   /* 4 bits of enabled components per color buffer, derived from the
    * epilog's export format.
    */
   unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format);

   /* Preserve alpha if ALPHA_TESTING is enabled. */
   if (key->ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS ||
       key->ps.part.epilog.alpha_to_coverage_via_mrtz)
      cb_shader_mask |= 1 << 3;

   /* If COLOR is broadcasted to multiple color buffers, combine their masks. */
   if (location == FRAG_RESULT_COLOR) {
      for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++)
         cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf;
   }

   /* Extract the 4-bit component mask for this color buffer. */
   unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0;
   unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf;

   /* All stored components enabled: nothing to do. */
   if ((output_mask & comp_mask) == comp_mask)
      return false;

   /* No stored component enabled: drop the whole store. */
   if (!(output_mask & comp_mask)) {
      nir_instr_remove(instr);
      return true;
   }

   /* Fill disabled components with undef. */
   b->cursor = nir_before_instr(instr);
   nir_def *new_value = intr->src[0].ssa;
   nir_def *undef = nir_undef(b, 1, new_value->bit_size);

   unsigned kill_mask = ~output_mask & comp_mask;
   u_foreach_bit(i, kill_mask) {
      new_value = nir_vector_insert_imm(b, new_value, undef, i);
   }

   nir_src_rewrite(&intr->src[0], new_value);
   return true;
}
1790 
si_nir_kill_ps_outputs(nir_shader * nir,const union si_shader_key * key)1791 static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key)
1792 {
1793    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
1794    return nir_shader_instructions_pass(nir, kill_ps_outputs_cb,
1795                                        nir_metadata_control_flow, (void*)key);
1796 }
1797 
clamp_vertex_color_instr(nir_builder * b,nir_intrinsic_instr * intrin,void * state)1798 static bool clamp_vertex_color_instr(nir_builder *b,
1799                                      nir_intrinsic_instr *intrin, void *state)
1800 {
1801    if (intrin->intrinsic != nir_intrinsic_store_output)
1802       return false;
1803 
1804    unsigned location = nir_intrinsic_io_semantics(intrin).location;
1805    if (location != VARYING_SLOT_COL0 && location != VARYING_SLOT_COL1 &&
1806        location != VARYING_SLOT_BFC0 && location != VARYING_SLOT_BFC1)
1807       return false;
1808 
1809    /* no indirect output */
1810    assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
1811    /* only scalar output */
1812    assert(intrin->src[0].ssa->num_components == 1);
1813 
1814    b->cursor = nir_before_instr(&intrin->instr);
1815 
1816    nir_def *color = intrin->src[0].ssa;
1817    nir_def *clamp = nir_load_clamp_vertex_color_amd(b);
1818    nir_def *new_color = nir_bcsel(b, clamp, nir_fsat(b, color), color);
1819    nir_src_rewrite(&intrin->src[0], new_color);
1820 
1821    return true;
1822 }
1823 
si_nir_clamp_vertex_color(nir_shader * nir)1824 static bool si_nir_clamp_vertex_color(nir_shader *nir)
1825 {
1826    uint64_t mask = VARYING_BIT_COL0 | VARYING_BIT_COL1 | VARYING_BIT_BFC0 | VARYING_BIT_BFC1;
1827    if (!(nir->info.outputs_written & mask))
1828       return false;
1829 
1830    return nir_shader_intrinsics_pass(nir, clamp_vertex_color_instr,
1831                                        nir_metadata_control_flow,
1832                                        NULL);
1833 }
1834 
si_map_io_driver_location(unsigned semantic)1835 static unsigned si_map_io_driver_location(unsigned semantic)
1836 {
1837    if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1838        semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1839        semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1840       return ac_shader_io_get_unique_index_patch(semantic);
1841 
1842    return si_shader_io_get_unique_index(semantic);
1843 }
1844 
/* Lower I/O between merged/chained stages to memory (LDS or ring buffers)
 * using the common ac_nir lowering passes: LS/ES outputs, HS inputs/outputs,
 * TES inputs, and GS inputs.  Returns true if any lowering was applied,
 * false if the stage needs none (e.g. a VS compiled as a plain VS).
 */
static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
                               uint64_t tcs_vgpr_only_inputs)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_selector *next_sel = shader->next_shader ? shader->next_shader->selector : sel;
   const union si_shader_key *key = &shader->key;
   /* GFX9+ monolithic VS+TCS: the VS knows the real TCS inputs, so the
    * driver-location remap can be skipped and the TCS input set filtered.
    */
   const bool is_gfx9_mono_tcs = shader->is_monolithic && next_sel->stage == MESA_SHADER_TESS_CTRL &&
                                 sel->screen->info.gfx_level >= GFX9;

   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (key->ge.as_ls) {
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
                    is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                    key->ge.opt.same_patch_vertices,
                    is_gfx9_mono_tcs ? next_sel->info.base.inputs_read : ~0ull,
                    tcs_vgpr_only_inputs);
         return true;
      } else if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
                 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                 key->ge.opt.same_patch_vertices, sel->info.tcs_vgpr_only_inputs);

      /* Used by hs_emit_write_tess_factors() when monolithic shader. */
      nir->info.tess._primitive_mode = key->ge.opt.tes_prim_mode;

      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level,
                 ~0ULL, ~0U, /* no TES inputs filter */
                 shader->wave_size,
                 sel->info.tessfactors_are_def_in_all_invocs);
      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);

      /* A TES feeding a GS also writes its outputs to the ES-GS ring. */
      if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
      }

      return true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
      return true;
   }

   return false;
}
1898 
/* Lower a VS, TES, or GS compiled as NGG to the NGG hardware model via the
 * common ac_nir passes (ac_nir_lower_ngg_nogs / ac_nir_lower_ngg_gs),
 * filling in the lowering options from the shader key and screen info.
 */
static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;
   assert(key->ge.as_ngg);

   /* Clip distances minus the ones killed by the key, plus cull distances. */
   uint8_t clip_cull_dist_mask =
      (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
      sel->info.culldist_mask;

   ac_nir_lower_ngg_options options = {
      .family = sel->screen->info.family,
      .gfx_level = sel->screen->info.gfx_level,
      .max_workgroup_size = si_get_max_workgroup_size(shader),
      .wave_size = shader->wave_size,
      .can_cull = !!key->ge.opt.ngg_culling,
      .disable_streamout = !si_shader_uses_streamout(shader),
      .vs_output_param_offset = shader->info.vs_output_param_offset,
      .has_param_exports = shader->info.nr_param_exports,
      .clip_cull_dist_mask = clip_cull_dist_mask,
      .kill_pointsize = key->ge.opt.kill_pointsize,
      .kill_layer = key->ge.opt.kill_layer,
      .force_vrs = sel->screen->options.vrs2x2,
      .use_gfx12_xfb_intrinsic = true,
   };

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
      /* Per instance inputs, used to remove instance load after culling. */
      unsigned instance_rate_inputs = 0;

      if (nir->info.stage == MESA_SHADER_VERTEX) {
         instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
                                key->ge.mono.instance_divisor_is_fetched;

         /* Manually mark the instance ID used, so the shader can repack it. */
         if (instance_rate_inputs)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
      } else {
         /* Manually mark the primitive ID used, so the shader can repack it. */
         if (key->ge.mono.u.vs_export_prim_id)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
      }

      unsigned clip_plane_enable =
         SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
      unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(shader);

      /* Default to triangles when the primitive type is unknown. */
      options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
      options.early_prim_export = gfx10_ngg_export_prim_early(shader);
      options.passthrough = gfx10_is_ngg_passthrough(shader);
      options.use_edgeflags = gfx10_edgeflags_have_effect(shader);
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11 && !sel->info.base.vs.blit_sgprs_amd;
      options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
      options.instance_rate_inputs = instance_rate_inputs;
      options.user_clip_plane_enable_mask = clip_plane_enable;

      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
   } else {
      assert(nir->info.stage == MESA_SHADER_GEOMETRY);

      options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11;
      options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
      options.has_gs_primitives_query = true;

      /* For monolithic ES/GS to add vscnt wait when GS export pos0. */
      if (key->ge.part.gs.es)
         nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;

      NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
   }

   /* may generate some vector output store */
   NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
1977 
si_deserialize_shader(struct si_shader_selector * sel)1978 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1979 {
1980    struct pipe_screen *screen = &sel->screen->b;
1981    const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1982                                                       pipe_shader_type_from_mesa(sel->stage));
1983 
1984    struct blob_reader blob_reader;
1985    blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1986    return nir_deserialize(NULL, options, &blob_reader);
1987 }
1988 
/* Assign a sequential param-export index to every varying output that is still
 * unassigned, redirect duplicated slots through slot_remap, and rebuild the
 * outputs_written masks from the store_output intrinsics that actually remain
 * after earlier output-killing passes.
 *
 * slot_remap[slot] >= 0 means the slot is a duplicate and must reuse the param
 * offset of slot_remap[slot]; -1 means the slot stands on its own.
 */
static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_binary_info *info = &shader->info;

   uint64_t outputs_written = 0;
   uint32_t outputs_written_16bit = 0;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* 16-bit varyings live in a separate 32-bit written mask. */
         if (sem.location >= VARYING_SLOT_VAR0_16BIT)
            outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
         else
            outputs_written |= BITFIELD64_BIT(sem.location);

         /* Assign the param index if it's unassigned. Only stream-0 outputs
          * (gs_streams low bits == 0) that are real varyings get exports.
          */
         if (nir_slot_is_varying(sem.location) && !sem.no_varying &&
             (sem.gs_streams & 0x3) == 0 &&
             info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
            /* The semantic and the base should be the same as in si_shader_info. */
            assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
            /* It must not be remapped (duplicated). */
            assert(slot_remap[sem.location] == -1);

            info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
         }
      }
   }

   /* Duplicated outputs are redirected here. */
   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (slot_remap[i] >= 0)
         info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]];
   }

   /* The primitive ID export is appended last when the key requests it. */
   if (shader->key.ge.mono.u.vs_export_prim_id) {
      info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
   }

   /* Update outputs written info, we may remove some outputs before. */
   nir->info.outputs_written = outputs_written;
   nir->info.outputs_written_16bit = outputs_written_16bit;
}
2050 
/* Compute shader->info.vs_output_param_offset and nr_param_exports for the
 * last VGT stage: reset the table, let ac_nir_optimize_outputs mark constant
 * and duplicated outputs, then number the remaining ones.
 */
static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader)
{
   /* Start counting exports from zero. */
   shader->info.nr_param_exports = 0;

   /* The table is byte-per-slot, so memset with the default value works. */
   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   /* Remapping table for duplicated outputs: one VS output may feed several
    * FS inputs. -1 means "not a duplicate".
    */
   int8_t remap_table[NUM_TOTAL_VARYING_SLOTS];
   memset(remap_table, -1, sizeof(remap_table));

   /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
   /* TODO: This doesn't affect GS. */
   NIR_PASS_V(nir, ac_nir_optimize_outputs, false, remap_table,
              shader->info.vs_output_param_offset);

   /* Assign the non-constant outputs. */
   /* TODO: Use this for the GS copy shader too. */
   si_nir_assign_param_offsets(nir, shader, remap_table);
}
2075 
si_get_nr_pos_exports(const struct si_shader_selector * sel,const union si_shader_key * key)2076 static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
2077                                       const union si_shader_key *key)
2078 {
2079    const struct si_shader_info *info = &sel->info;
2080 
2081    /* Must have a position export. */
2082    unsigned nr_pos_exports = 1;
2083 
2084    if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
2085        (info->writes_edgeflag && !key->ge.as_ngg) ||
2086        (info->writes_layer && !key->ge.opt.kill_layer) ||
2087        info->writes_viewport_index || sel->screen->options.vrs2x2) {
2088       nr_pos_exports++;
2089    }
2090 
2091    unsigned clipdist_mask =
2092       (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;
2093 
2094    for (int i = 0; i < 2; i++) {
2095       if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
2096          nr_pos_exports++;
2097    }
2098 
2099    return nr_pos_exports;
2100 }
2101 
lower_ps_load_color_intrinsic(nir_builder * b,nir_instr * instr,void * state)2102 static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state)
2103 {
2104    nir_def **colors = (nir_def **)state;
2105 
2106    if (instr->type != nir_instr_type_intrinsic)
2107       return false;
2108 
2109    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2110 
2111    if (intrin->intrinsic != nir_intrinsic_load_color0 &&
2112        intrin->intrinsic != nir_intrinsic_load_color1)
2113       return false;
2114 
2115    unsigned index = intrin->intrinsic == nir_intrinsic_load_color0 ? 0 : 1;
2116    assert(colors[index]);
2117 
2118    nir_def_replace(&intrin->def, colors[index]);
2119    return true;
2120 }
2121 
/* Lower PS color inputs according to the shader key: build the final color
 * values (flat or interpolated, with optional two-side front/back selection)
 * at the top of the shader, then rewrite all load_color0/1 intrinsics to use
 * them. Returns true on progress. "info" must be freshly scanned from this
 * variant's NIR, because uniform inlining/colormask may have removed inputs.
 */
static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
                                        const struct si_shader_info *info)
{
   bool progress = false;
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Build ready to be used colors at the beginning of the shader. */
   nir_def *colors[2] = {0};
   for (int i = 0; i < 2; i++) {
      /* Skip colors with no channels read (4 read bits per color). */
      if (!(info->colors_read & (0xf << (i * 4))))
         continue;

      unsigned color_base = info->color_attr_index[i];
      /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
       * otherwise it's at offset "num_inputs".
       */
      unsigned back_color_base = info->num_inputs;
      if (i == 1 && (info->colors_read & 0xf))
         back_color_base += 1;

      /* INTERP_MODE_COLOR is resolved here by the flatshade state in the key. */
      enum glsl_interp_mode interp_mode = info->color_interpolate[i];
      if (interp_mode == INTERP_MODE_COLOR) {
         interp_mode = key->ps.part.prolog.flatshade_colors ?
            INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
      }

      nir_def *back_color = NULL;
      if (interp_mode == INTERP_MODE_FLAT) {
         /* Flat colors are plain (non-interpolated) input loads. */
         colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                   .base = color_base,
                                   .io_semantics.location = VARYING_SLOT_COL0 + i,
                                   .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                        .base = back_color_base,
                                        .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                        .io_semantics.num_slots = 1);
         }
      } else {
         /* Pick the barycentric intrinsic matching the interpolation location. */
         nir_intrinsic_op op = 0;
         switch (info->color_interpolate_loc[i]) {
         case TGSI_INTERPOLATE_LOC_CENTER:
            op = nir_intrinsic_load_barycentric_pixel;
            break;
         case TGSI_INTERPOLATE_LOC_CENTROID:
            op = nir_intrinsic_load_barycentric_centroid;
            break;
         case TGSI_INTERPOLATE_LOC_SAMPLE:
            op = nir_intrinsic_load_barycentric_sample;
            break;
         default:
            unreachable("invalid color interpolate location");
            break;
         }

         nir_def *barycentric = nir_load_barycentric(b, op, interp_mode);

         colors[i] =
            nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                        .base = color_base,
                                        .io_semantics.location = VARYING_SLOT_COL0 + i,
                                        .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            back_color =
               nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                           .base = back_color_base,
                                           .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                           .io_semantics.num_slots = 1);
         }
      }

      /* Two-side lighting: select front or back color by gl_FrontFacing. */
      if (back_color) {
         nir_def *is_front_face = nir_load_front_face(b, 1);
         colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
      }

      progress = true;
   }

   /* lower nir_load_color0/1 to use the color value. */
   return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
                                       nir_metadata_control_flow,
                                       colors) || progress;
}
2211 
/* Insert polygon-stipple code at the top of a monolithic PS: look up the
 * 32x32 stipple pattern (one 32-bit row per y) in the internal constant
 * buffer using the fixed-point fragment position, and discard the fragment
 * when its bit is 0.
 */
static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Load the buffer descriptor. */
   nir_def *desc =
      si_nir_load_internal_binding(b, args, SI_PS_CONST_POLY_STIPPLE, 4);

   /* Use the fixed-point gl_FragCoord input.
    * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
    * per coordinate to get the repeating effect.
    */
   nir_def *pos_x = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 5);
   nir_def *pos_y = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 5);

   nir_def *zero = nir_imm_int(b, 0);
   /* The stipple pattern is 32x32, each row has 32 bits. */
   /* Row byte offset = y * 4 (one dword per row). */
   nir_def *offset = nir_ishl_imm(b, pos_y, 2);
   nir_def *row = nir_load_buffer_amd(b, 1, 32, desc, offset, zero, zero);
   /* Extract the single bit selected by x from the row. */
   nir_def *bit = nir_ubfe(b, row, pos_x, nir_imm_int(b, 1));

   /* Discard when the stipple bit is 0. */
   nir_def *pass = nir_i2b(b, bit);
   nir_discard_if(b, nir_inot(b, pass));
}
2239 
si_should_clear_lds(struct si_screen * sscreen,const struct nir_shader * shader)2240 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
2241 {
2242    return shader->info.stage == MESA_SHADER_COMPUTE && shader->info.shared_size > 0 && sscreen->options.clear_lds;
2243 }
2244 
/* Produce the NIR for compiling one variant of a shader selector.
 *
 * Starts from the selector's cached NIR or deserializes it from the blob
 * (*free_nir tells the caller whether it owns the result), then applies all
 * key-dependent lowering: output killing, uniform inlining, NGG / legacy-GS /
 * legacy-VS lowering, monolithic-PS prolog/epilog lowering, subgroup and
 * integer lowering, and finally the argument/ABI/resource lowering.
 * Pass ordering here is deliberate — see the inline comments.
 *
 * tcs_vgpr_only_inputs is forwarded to si_lower_io_to_mem; output_info
 * receives legacy-GS output layout for the GS copy shader.
 */
struct nir_shader *si_get_nir_shader(struct si_shader *shader,
                                     struct si_shader_args *args,
                                     bool *free_nir,
                                     uint64_t tcs_vgpr_only_inputs,
                                     ac_nir_gs_output_info *output_info)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   nir_shader *nir;
   *free_nir = false;

   if (sel->nir) {
      nir = sel->nir;
   } else if (sel->nir_binary) {
      /* Deserialized copy: the caller must free it. */
      nir = si_deserialize_shader(sel);
      *free_nir = true;
   } else {
      return NULL;
   }

   bool progress = false;
   bool late_opts = false;

   const char *original_name = NULL;
   if (unlikely(should_print_nir(nir))) {
      /* Modify the shader's name so that each variant gets its own name. */
      original_name = ralloc_strdup(nir, nir->info.name);
      ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key)));

      /* Dummy pass to get the starting point. */
      printf("nir_dummy_pass\n");
      nir_print_shader(nir, stdout);
   }

   /* Kill outputs according to the shader key. */
   if (sel->stage <= MESA_SHADER_GEOMETRY)
      NIR_PASS(progress, nir, si_nir_kill_outputs, key);

   NIR_PASS(progress, nir, ac_nir_lower_tex,
            &(ac_nir_lower_tex_options){
               .gfx_level = sel->screen->info.gfx_level,
               .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
            });

   if (nir->info.uses_resource_info_query)
      NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);

   bool inline_uniforms = false;
   uint32_t *inlined_uniform_values;
   si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type,
                               &inline_uniforms, &inlined_uniform_values);

   if (inline_uniforms) {
      /* Inlining only happens on deserialized (variant-owned) NIR. */
      assert(*free_nir);

      /* Most places use shader information from the default variant, not
       * the optimized variant. These are the things that the driver looks at
       * in optimized variants and the list of things that we need to do.
       *
       * The driver takes into account these things if they suddenly disappear
       * from the shader code:
       * - Register usage and code size decrease (obvious)
       * - Eliminated PS system values are disabled by LLVM
       *   (FragCoord, FrontFace, barycentrics)
       * - VS/TES/GS param exports are eliminated if they are undef.
       *   The param space for eliminated outputs is also not allocated.
       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
       * - TCS output stores are eliminated
       * - Eliminated PS inputs are removed from PS.NUM_INTERP.
       *
       * TODO: These are things the driver ignores in the final shader code
       * and relies on the default shader info.
       * - System values in VS, TCS, TES, GS are not eliminated
       * - uses_discard - if it changed to false
       * - writes_memory - if it changed to false
       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
       *   eliminated
       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
       *   GS outputs are eliminated except for the temporary LDS.
       *   Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
       *   on current states, so we don't care about the shader code.
       *
       * TODO: Merged shaders don't inline uniforms for the first stage.
       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
       * (key == NULL for the first stage here)
       *
       * TODO: Compute shaders don't support inlinable uniforms, because they
       * don't have shader variants.
       *
       * TODO: The driver uses a linear search to find a shader variant. This
       * can be really slow if we get too many variants due to uniform inlining.
       */
      NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
                 inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
      progress = true;
   }

   if (sel->stage == MESA_SHADER_FRAGMENT) {
      /* This uses the epilog key, so only monolithic shaders can call this. */
      if (shader->is_monolithic)
         NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key);

      if (key->ps.mono.poly_line_smoothing)
         NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);

      if (key->ps.mono.point_smoothing)
         NIR_PASS(progress, nir, nir_lower_point_smooth);
   }

   /* This must be before si_nir_lower_resource. */
   if (!sel->screen->info.has_image_opcodes)
      NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);

   /* LLVM does not work well with this, so is handled in llvm backend waterfall. */
   if (sel->info.base.use_aco_amd && sel->info.has_non_uniform_tex_access) {
      nir_lower_non_uniform_access_options options = {
         .types = nir_lower_non_uniform_texture_access,
      };
      NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
   }

   /* The last VGT stage is the one that feeds the rasterizer:
    * VS/TES not compiled as LS/ES, or NGG GS.
    */
   bool is_last_vgt_stage =
      (sel->stage == MESA_SHADER_VERTEX ||
       sel->stage == MESA_SHADER_TESS_EVAL ||
       (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
      !shader->key.ge.as_ls && !shader->key.ge.as_es;

   /* Legacy GS is not last VGT stage because it has GS copy shader. */
   bool is_legacy_gs = sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg;

   if (is_last_vgt_stage || is_legacy_gs)
      NIR_PASS(progress, nir, si_nir_clamp_vertex_color);

   if (progress) {
      si_nir_opts(sel->screen, nir, true);
      late_opts = true;
      progress = false;
   }

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    *
    * The pass crashes if there are dead temps of lowered IO interface types, so remove
    * them first.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);

   /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    */
   progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);

   if (sel->stage == MESA_SHADER_VERTEX)
      NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args);

   progress |= si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs);

   if (is_last_vgt_stage) {
      /* Assign param export indices. */
      si_assign_param_offsets(nir, shader);

      /* Assign num of position exports. */
      shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);

      if (key->ge.as_ngg) {
         /* Lower last VGT NGG shader stage. */
         si_lower_ngg(shader, nir);
      } else if (sel->stage == MESA_SHADER_VERTEX || sel->stage == MESA_SHADER_TESS_EVAL) {
         /* Lower last VGT none-NGG VS/TES shader stage. */
         unsigned clip_cull_mask =
            (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
            sel->info.culldist_mask;

         NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
                    sel->screen->info.gfx_level,
                    clip_cull_mask,
                    shader->info.vs_output_param_offset,
                    shader->info.nr_param_exports,
                    shader->key.ge.mono.u.vs_export_prim_id,
                    !si_shader_uses_streamout(shader),
                    key->ge.opt.kill_pointsize,
                    key->ge.opt.kill_layer,
                    sel->screen->options.vrs2x2);
      }
      progress = true;
   } else if (is_legacy_gs) {
      NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info);
      progress = true;
   } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
      /* Uniform inlining can eliminate PS inputs, and colormask can remove PS outputs,
       * which can also cause the elimination of PS inputs. Remove holes after removed PS inputs
       * by renumbering them. This can only happen with monolithic PS. Colors are unaffected
       * because they are still represented by nir_intrinsic_load_color0/1.
       */
      NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in);

      /* Two-side color selection and interpolation: Get the latest shader info because
       * uniform inlining and colormask can fully eliminate color inputs.
       */
      struct si_shader_info info;
      si_nir_scan_shader(sel->screen, nir, &info);

      if (info.colors_read)
         NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info);

      /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped,
       * which can only occur with monolithic PS.
       */
      shader->info.num_ps_inputs = info.num_inputs;
      shader->info.ps_colors_read = info.colors_read;

      /* The epilog/prolog state is baked into the monolithic PS here. */
      ac_nir_lower_ps_options options = {
         .gfx_level = sel->screen->info.gfx_level,
         .family = sel->screen->info.family,
         .use_aco = sel->info.base.use_aco_amd,
         .uses_discard = si_shader_uses_discard(shader),
         .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
         .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
         .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
         .color_is_int8 = key->ps.part.epilog.color_is_int8,
         .color_is_int10 = key->ps.part.epilog.color_is_int10,
         .clamp_color = key->ps.part.epilog.clamp_color,
         .alpha_to_one = key->ps.part.epilog.alpha_to_one,
         .alpha_func = key->ps.part.epilog.alpha_func,
         .broadcast_last_cbuf = key->ps.part.epilog.last_cbuf,
         .kill_samplemask = key->ps.part.epilog.kill_samplemask,

         .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
         .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
         .force_persp_sample_interp = key->ps.part.prolog.force_persp_sample_interp,
         .force_linear_sample_interp = key->ps.part.prolog.force_linear_sample_interp,
         .force_persp_center_interp = key->ps.part.prolog.force_persp_center_interp,
         .force_linear_center_interp = key->ps.part.prolog.force_linear_center_interp,
         .ps_iter_samples = 1 << key->ps.part.prolog.samplemask_log_ps_iter,
      };

      NIR_PASS_V(nir, ac_nir_lower_ps, &options);

      if (key->ps.part.prolog.poly_stipple)
         NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args);

      progress = true;
   }

   assert(shader->wave_size == 32 || shader->wave_size == 64);

   NIR_PASS(progress, nir, nir_lower_subgroups,
            &(struct nir_lower_subgroups_options) {
               .subgroup_size = shader->wave_size,
               .ballot_bit_size = shader->wave_size,
               .ballot_components = 1,
               .lower_to_scalar = true,
               .lower_subgroup_masks = true,
               .lower_relative_shuffle = true,
               .lower_rotate_to_shuffle = !sel->info.base.use_aco_amd,
               .lower_shuffle_to_32bit = true,
               .lower_vote_eq = true,
               .lower_vote_bool_eq = true,
               .lower_quad_broadcast_dynamic = true,
               .lower_quad_broadcast_dynamic_to_const = sel->screen->info.gfx_level <= GFX7,
               .lower_shuffle_to_swizzle_amd = true,
               .lower_ballot_bit_count_to_mbcnt_amd = true,
               .lower_inverse_ballot = !sel->info.base.use_aco_amd && LLVM_VERSION_MAJOR < 17,
               .lower_boolean_reduce = sel->info.base.use_aco_amd,
               .lower_boolean_shuffle = true,
            });

   NIR_PASS(progress, nir, nir_lower_pack);
   NIR_PASS(progress, nir, nir_lower_int64);
   NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
   NIR_PASS(progress, nir, nir_lower_idiv,
            &(nir_lower_idiv_options){
               .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
            });

   if (si_should_clear_lds(sel->screen, nir)) {
      const unsigned chunk_size = 16; /* max single store size */
      const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
      NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
   }

   NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
            si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
            &args->ac);
   NIR_PASS(progress, nir, si_nir_lower_abi, shader, args);

   if (progress) {
      si_nir_opts(sel->screen, nir, false);
      progress = false;
      late_opts = true;
   }

   NIR_PASS(progress, nir, nir_opt_load_store_vectorize,
            &(nir_load_store_vectorize_options){
               .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_shared | nir_var_mem_global |
                        nir_var_shader_temp,
               .callback = ac_nir_mem_vectorize_callback,
               .cb_data = &sel->screen->info.gfx_level,
               /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
                * the final offset is not.
                */
               .has_shared2_amd = sel->screen->info.gfx_level >= GFX7,
            });
   NIR_PASS(progress, nir, nir_opt_shrink_stores, false);
   NIR_PASS(progress, nir, ac_nir_lower_global_access);
   /* This must be after vectorization because it causes bindings_different_restrict() to fail. */
   NIR_PASS(progress, nir, si_nir_lower_resource, shader, args);

   if (progress) {
      si_nir_opts(sel->screen, nir, false);
      progress = false;
      late_opts = true;
   }

   static const nir_opt_offsets_options offset_options = {
      .uniform_max = 0,
      .buffer_max = ~0,
      .shared_max = ~0,
   };
   NIR_PASS_V(nir, nir_opt_offsets, &offset_options);

   if (late_opts)
      si_nir_late_opts(nir);

   /* aco only accept scalar const, must be done after si_nir_late_opts()
    * which may generate vec const.
    */
   if (sel->info.base.use_aco_amd)
      NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */
   NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);

   if (unlikely(original_name)) {
      /* Restore the selector's name after the variant dump. */
      ralloc_free((void*)nir->info.name);
      nir->info.name = original_name;
   }

   return nir;
}
2592 
/* Re-scan the lowered variant NIR and fold the results into shader->info,
 * since variant-time lowering (e.g. uniform inlining) can change what the
 * final code uses compared to the selector's default info.
 */
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_info scan;
   si_nir_scan_shader(shader->selector->screen, nir, &scan);

   shader->info.uses_vmem_load_other |= scan.uses_vmem_load_other;
   shader->info.uses_vmem_sampler_or_bvh |= scan.uses_vmem_sampler_or_bvh;

   if (nir->info.stage != MESA_SHADER_FRAGMENT)
      return;

   /* Since uniform inlining can remove PS inputs, set the latest info about PS inputs here. */
   shader->info.num_ps_inputs = scan.num_inputs;
   shader->info.ps_colors_read = scan.colors_read;

   /* A non-monolithic PS doesn't know if back colors are enabled, so copy 2 more. */
   unsigned copy_count = MIN2(scan.num_inputs + 2, SI_NUM_INTERP);
   memcpy(shader->info.ps_inputs, scan.input, copy_count * sizeof(scan.input[0]));
}
2611 
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The GS copy shader reads the GS ring output for stream 0 and performs the
 * position/param exports that a hardware VS would do. Param export offsets are
 * assigned here from the GS selector's outputs, the copy-shader NIR is built
 * by ac_nir_create_gs_copy_shader, lowered, compiled (ACO or LLVM), and
 * uploaded. Returns NULL on compile/upload failure.
 */
static struct si_shader *
si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
                               struct si_shader *gs_shader,
                               nir_shader *gs_nir,
                               struct util_debug_callback *debug,
                               ac_nir_gs_output_info *output_info)
{
   struct si_shader *shader;
   struct si_shader_selector *gs_selector = gs_shader->selector;
   struct si_shader_info *gsinfo = &gs_selector->info;
   union si_shader_key *gskey = &gs_shader->key;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;
   shader->wave_size = si_determine_wave_size(sscreen, shader);

   /* Byte-per-slot table, so memset with the default value works. */
   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   /* Assign sequential param export offsets to stream-0 varyings. */
   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
      unsigned semantic = gsinfo->output_semantic[i];

      /* Skip if no channel writes to stream 0.
       * output_streams[i] packs 2 stream bits per channel; a channel writes
       * stream 0 iff its 2-bit field is 0, so skip only when all four
       * channel fields are non-zero.
       */
      if (!nir_slot_is_varying(semantic) ||
          (gsinfo->output_streams[i] & 0x03 &&
           gsinfo->output_streams[i] & 0x0c &&
           gsinfo->output_streams[i] & 0x30 &&
           gsinfo->output_streams[i] & 0xc0))
         continue;

      shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
   }

   shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);

   unsigned clip_cull_mask =
      (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;

   nir_shader *nir =
      ac_nir_create_gs_copy_shader(gs_nir,
                                   sscreen->info.gfx_level,
                                   clip_cull_mask,
                                   shader->info.vs_output_param_offset,
                                   shader->info.nr_param_exports,
                                   !si_shader_uses_streamout(gs_shader),
                                   gskey->ge.opt.kill_pointsize,
                                   gskey->ge.opt.kill_layer,
                                   sscreen->options.vrs2x2,
                                   output_info);

   struct si_shader_args args;
   si_init_shader_args(shader, &args);

   /* The copy shader runs as a hardware VS. */
   NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level, AC_HW_VERTEX_SHADER, &args.ac);
   NIR_PASS_V(nir, si_nir_lower_abi, shader, &args);

   si_nir_opts(gs_selector->screen, nir, false);

   /* aco only accept scalar const */
   if (gsinfo->base.use_aco_amd)
      NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
      fprintf(stderr, "GS Copy Shader:\n");
      nir_print_shader(nir, stderr);
   }

   /* Compile with LLVM unless ACO is requested (or LLVM is unavailable). */
   bool ok =
#if AMD_LLVM_AVAILABLE
      !gs_selector->info.base.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader,
                                                                   &args, debug, nir) :
#endif
      si_aco_compile_shader(shader, &args, nir, debug);

   if (ok) {
      assert(!shader->config.scratch_bytes_per_wave);
      ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
      si_shader_dump(sscreen, shader, debug, stderr, true);
   }
   ralloc_free(nir);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}
2712 
/* Per-slot output tables for a legacy (non-NGG) GS, gathered from
 * si_shader_info into the layout that ac_nir expects.
 *
 * The [64] arrays are indexed by 32-bit varying slot; the [16] arrays are
 * indexed by (slot - VARYING_SLOT_VAR0_16BIT).
 */
struct si_gs_output_info {
   /* Which vertex stream each output component is written to
    * (packed per-component; see the 0x03/0x0c/0x30/0xc0 masks used by
    * the GS copy shader generation above). */
   uint8_t streams[64];
   uint8_t streams_16bit_lo[16];
   uint8_t streams_16bit_hi[16];

   /* Per-slot component write masks, copied from output_usagemask. */
   uint8_t usage_mask[64];
   uint8_t usage_mask_16bit_lo[16];
   uint8_t usage_mask_16bit_hi[16];

   /* Points into the arrays above; this is what is handed to ac_nir. */
   ac_nir_gs_output_info info;
};
2724 
2725 static void
si_init_gs_output_info(struct si_shader_info * info,struct si_gs_output_info * out_info)2726 si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info)
2727 {
2728    for (int i = 0; i < info->num_outputs; i++) {
2729       unsigned slot = info->output_semantic[i];
2730       if (slot < VARYING_SLOT_VAR0_16BIT) {
2731          out_info->streams[slot] = info->output_streams[i];
2732          out_info->usage_mask[slot] = info->output_usagemask[i];
2733       } else {
2734          unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
2735          /* TODO: 16bit need separated fields for lo/hi part. */
2736          out_info->streams_16bit_lo[index] = info->output_streams[i];
2737          out_info->streams_16bit_hi[index] = info->output_streams[i];
2738          out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i];
2739          out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i];
2740       }
2741    }
2742 
2743    ac_nir_gs_output_info *ac_info = &out_info->info;
2744 
2745    ac_info->streams = out_info->streams;
2746    ac_info->streams_16bit_lo = out_info->streams_16bit_lo;
2747    ac_info->streams_16bit_hi = out_info->streams_16bit_hi;
2748 
2749    ac_info->usage_mask = out_info->usage_mask;
2750    ac_info->usage_mask_16bit_lo = out_info->usage_mask_16bit_lo;
2751    ac_info->usage_mask_16bit_hi = out_info->usage_mask_16bit_hi;
2752 
2753    /* TODO: construct 16bit slot per component store type. */
2754    ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
2755 }
2756 
si_fixup_spi_ps_input_config(struct si_shader * shader)2757 static void si_fixup_spi_ps_input_config(struct si_shader *shader)
2758 {
2759    const union si_shader_key *key = &shader->key;
2760 
2761    /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2762    if (key->ps.part.prolog.poly_stipple)
2763       shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2764 
2765    /* Set up the enable bits for per-sample shading if needed. */
2766    if (key->ps.part.prolog.force_persp_sample_interp &&
2767        (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2768         G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2769       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2770       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2771       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2772    }
2773    if (key->ps.part.prolog.force_linear_sample_interp &&
2774        (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2775         G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2776       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2777       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2778       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2779    }
2780    if (key->ps.part.prolog.force_persp_center_interp &&
2781        (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2782         G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2783       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2784       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2785       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2786    }
2787    if (key->ps.part.prolog.force_linear_center_interp &&
2788        (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2789         G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2790       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2791       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2792       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2793    }
2794 
2795    /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
2796    if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2797        !(shader->config.spi_ps_input_ena & 0xf)) {
2798       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2799    }
2800 
2801    /* At least one pair of interpolation weights must be enabled. */
2802    if (!(shader->config.spi_ps_input_ena & 0x7f))
2803       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2804 
2805    /* Samplemask fixup requires the sample ID. */
2806    if (key->ps.part.prolog.samplemask_log_ps_iter)
2807       shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2808 }
2809 
2810 static void
si_set_spi_ps_input_config(struct si_shader * shader)2811 si_set_spi_ps_input_config(struct si_shader *shader)
2812 {
2813    const struct si_shader_selector *sel = shader->selector;
2814    const struct si_shader_info *info = &sel->info;
2815    const union si_shader_key *key = &shader->key;
2816 
2817    /* TODO: This should be determined from the final NIR instead of the input NIR,
2818     * otherwise LLVM will have a performance advantage here because it determines
2819     * VGPR inputs for each shader variant after LLVM optimizations.
2820     */
2821    shader->config.spi_ps_input_ena =
2822       S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) |
2823       S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) |
2824       S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample) |
2825       S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) |
2826       S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) |
2827       S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) |
2828       S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) |
2829       S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) |
2830       S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id);
2831 
2832    uint8_t mask = info->reads_frag_coord_mask | info->reads_sample_pos_mask;
2833    u_foreach_bit(i, mask) {
2834       shader->config.spi_ps_input_ena |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
2835    }
2836 
2837    if (key->ps.part.prolog.color_two_side)
2838       shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
2839 
2840    /* INTERP_MODE_COLOR, same as SMOOTH if flat shading is disabled. */
2841    if (info->uses_interp_color && !key->ps.part.prolog.flatshade_colors) {
2842       shader->config.spi_ps_input_ena |=
2843          S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample_color) |
2844          S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center_color) |
2845          S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid_color);
2846    }
2847 
2848    /* nir_lower_poly_line_smooth use nir_load_sample_mask_in */
2849    if (key->ps.mono.poly_line_smoothing)
2850       shader->config.spi_ps_input_ena |= S_0286CC_SAMPLE_COVERAGE_ENA(1);
2851 
2852    /* nir_lower_point_smooth use nir_load_point_coord_maybe_flipped which is lowered
2853     * to nir_load_barycentric_pixel and nir_load_interpolated_input.
2854     */
2855    if (key->ps.mono.point_smoothing)
2856       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2857 
2858    /* See fetch_framebuffer() for used args when fbfetch output. */
2859    if (info->base.fs.uses_fbfetch_output) {
2860       shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2861 
2862       if (key->ps.mono.fbfetch_layered || key->ps.mono.fbfetch_msaa)
2863          shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2864    }
2865 
2866    if (shader->is_monolithic) {
2867       si_fixup_spi_ps_input_config(shader);
2868       shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena;
2869    } else {
2870       /* Part mode will call si_fixup_spi_ps_input_config() when combining multi
2871        * shader part in si_shader_select_ps_parts().
2872        *
2873        * Reserve register locations for VGPR inputs the PS prolog may need.
2874        */
2875       shader->config.spi_ps_input_addr =
2876          shader->config.spi_ps_input_ena |
2877          SI_SPI_PS_INPUT_ADDR_FOR_PROLOG;
2878    }
2879 }
2880 
2881 static void
debug_message_stderr(void * data,unsigned * id,enum util_debug_type ptype,const char * fmt,va_list args)2882 debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
2883                       const char *fmt, va_list args)
2884 {
2885    vfprintf(stderr, fmt, args);
2886    fprintf(stderr, "\n");
2887 }
2888 
/* Compile the main part of a shader variant: lower the selector's NIR for
 * this key, run it through ACO or LLVM, and fill in shader->config and
 * shader->info.  For legacy (non-NGG) GS this also compiles the GS copy
 * shader.  Returns false on compile failure.
 */
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug)
{
   bool ret = true;
   struct si_shader_selector *sel = shader->selector;

   /* ACO need spi_ps_input in advance to init args and used in compiler. */
   if (sel->stage == MESA_SHADER_FRAGMENT && sel->info.base.use_aco_amd)
      si_set_spi_ps_input_config(shader);

   /* We need this info only when legacy GS.
    * NOTE(review): for other stages legacy_gs_output_info stays uninitialized
    * and only its address is passed below — si_get_nir_shader presumably
    * ignores it then; confirm before relying on it.
    */
   struct si_gs_output_info legacy_gs_output_info;
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      memset(&legacy_gs_output_info, 0, sizeof(legacy_gs_output_info));
      si_init_gs_output_info(&sel->info, &legacy_gs_output_info);
   }

   struct si_shader_args args;
   si_init_shader_args(shader, &args);

   /* si_get_nir_shader sets free_nir to tell us whether we own the NIR. */
   bool free_nir;
   struct nir_shader *nir =
      si_get_nir_shader(shader, &args, &free_nir, 0, &legacy_gs_output_info.info);

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_NIR)) {
      nir_print_shader(nir, stderr);

      if (nir->xfb_info)
         nir_print_xfb_info(nir->xfb_info, stderr);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   si_update_shader_binary_info(shader, nir);

   /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */
   shader->info.uses_instanceid |= sel->info.uses_instanceid;
   /* Scratch is byte-sized; VGPRs are 4 bytes each. */
   shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);

   /* Set the FP ALU behavior. */
   /* By default, we disable denormals for FP32 and enable them for FP16 and FP64
    * for performance and correctness reasons. FP32 denormals can't be enabled because
    * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
    *
    * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
    * can be set simultaneously, which means we are allowed to choose, but not really because
    * some options cause GLCTS failures.
    */
   unsigned float_mode = V_00B028_FP_16_64_DENORMS;

   /* FP32 round-towards-zero only when RTZ is requested and RTE is not. */
   if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
       nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;

   /* Same decision for FP16/FP64, which share one hardware mode. */
   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
                                                    FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
                                                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;

   /* Flush FP16/FP64 denormals only if flushing is requested and preserving is not. */
   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                                                    FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
                                                  FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
      float_mode &= ~V_00B028_FP_16_64_DENORMS;

   /* Pick the backend: LLVM unless ACO was requested (or LLVM isn't built in). */
   ret =
#if AMD_LLVM_AVAILABLE
      !sel->info.base.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &args,
                                                           debug, nir) :
#endif
      si_aco_compile_shader(shader, &args, nir, debug);

   if (!ret)
      goto out;

   shader->config.float_mode = float_mode;

   /* The GS copy shader is compiled next. */
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      shader->gs_copy_shader =
         si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug,
                                        &legacy_gs_output_info.info);
      if (!shader->gs_copy_shader) {
         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
         ret = false;
         goto out;
      }
   }

   /* Compute vs_output_ps_input_cntl for the last pre-rasterization stage
    * (skipped when this shader feeds TCS or GS instead of the rasterizer).
    */
   if ((sel->stage == MESA_SHADER_VERTEX ||
        sel->stage == MESA_SHADER_TESS_EVAL ||
        sel->stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset;

      /* For legacy GS, the params are exported by the GS copy shader. */
      if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
         vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;

      /* We must use the original shader info before the removal of duplicated shader outputs. */
      /* VS and TES should also set primitive ID output if it's used. */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.ge.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[semantic];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (sel->stage == MESA_SHADER_COMPUTE) {
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add/remove the scratch offset to/from input SGPRs. */
   if (!sel->screen->info.has_scratch_base_registers &&
       !si_is_merged_shader(shader)) {
      if (sel->info.base.use_aco_amd) {
         /* When aco scratch_offset arg is added explicitly at the beginning.
          * After compile if no scratch used, reduce the input sgpr count.
          */
         if (!shader->config.scratch_bytes_per_wave)
            shader->info.num_input_sgprs--;
      } else {
         /* scratch_offset arg is added by llvm implicitly */
         if (shader->info.num_input_sgprs)
            shader->info.num_input_sgprs++;
      }
   }

   /* Calculate the number of fragment input VGPRs. */
   if (sel->stage == MESA_SHADER_FRAGMENT) {
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
         &shader->config, &shader->info.num_fragcoord_components);
   }

   si_calculate_max_simd_waves(shader);

   /* Shader-db stats go to stderr when stat dumping is enabled. */
   if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_STATS)) {
      struct util_debug_callback out_stderr = {
         .debug_message = debug_message_stderr,
      };

      si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
   } else {
      si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   }

out:
   if (free_nir)
      ralloc_free(nir);

   return ret;
}
3091 
3092 /**
3093  * Create, compile and return a shader part (prolog or epilog).
3094  *
3095  * \param sscreen  screen
3096  * \param list     list of shader parts of the same category
3097  * \param type     shader type
3098  * \param key      shader part key
3099  * \param prolog   whether the part being requested is a prolog
3100  * \param tm       LLVM target machine
3101  * \param debug    debug callback
3102  * \return         non-NULL on success
3103  */
3104 static struct si_shader_part *
si_get_shader_part(struct si_screen * sscreen,struct si_shader_part ** list,gl_shader_stage stage,bool prolog,union si_shader_part_key * key,struct ac_llvm_compiler * compiler,struct util_debug_callback * debug,const char * name)3105 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
3106                    gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
3107                    struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
3108                    const char *name)
3109 {
3110    struct si_shader_part *result;
3111 
3112    simple_mtx_lock(&sscreen->shader_parts_mutex);
3113 
3114    /* Find existing. */
3115    for (result = *list; result; result = result->next) {
3116       if (memcmp(&result->key, key, sizeof(*key)) == 0) {
3117          simple_mtx_unlock(&sscreen->shader_parts_mutex);
3118          return result;
3119       }
3120    }
3121 
3122    /* Compile a new one. */
3123    result = CALLOC_STRUCT(si_shader_part);
3124    result->key = *key;
3125 
3126    bool ok =
3127 #if AMD_LLVM_AVAILABLE
3128       !(sscreen->use_aco ||
3129         (stage == MESA_SHADER_FRAGMENT &&
3130          ((prolog && key->ps_prolog.use_aco) ||
3131           (!prolog && key->ps_epilog.use_aco)))) ?
3132       si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
3133 #endif
3134       si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);
3135 
3136    if (ok) {
3137       result->next = *list;
3138       *list = result;
3139    } else {
3140       FREE(result);
3141       result = NULL;
3142    }
3143 
3144    simple_mtx_unlock(&sscreen->shader_parts_mutex);
3145    return result;
3146 }
3147 
3148 
3149 /**
3150  * Select and compile (or reuse) TCS parts (epilog).
3151  */
si_shader_select_tcs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3152 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3153                                        struct si_shader *shader, struct util_debug_callback *debug)
3154 {
3155    if (sscreen->info.gfx_level >= GFX9) {
3156       assert(shader->wave_size == 32 || shader->wave_size == 64);
3157       unsigned index = shader->wave_size / 32 - 1;
3158       shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls[index];
3159    }
3160 
3161    return true;
3162 }
3163 
3164 /**
3165  * Select and compile (or reuse) GS parts (prolog).
3166  */
si_shader_select_gs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3167 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3168                                       struct si_shader *shader, struct util_debug_callback *debug)
3169 {
3170    if (sscreen->info.gfx_level >= GFX9) {
3171       if (shader->key.ge.as_ngg) {
3172          assert(shader->wave_size == 32 || shader->wave_size == 64);
3173          unsigned index = shader->wave_size / 32 - 1;
3174          shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es[index];
3175       } else {
3176          shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
3177       }
3178    }
3179 
3180    return true;
3181 }
3182 
3183 /**
3184  * Compute the PS prolog key, which contains all the information needed to
3185  * build the PS prolog function, and set related bits in shader->config.
3186  */
void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_prolog.states = shader->key.ps.part.prolog;
   key->ps_prolog.use_aco = info->base.use_aco_amd;
   key->ps_prolog.wave32 = shader->wave_size == 32;
   key->ps_prolog.colors_read = shader->info.ps_colors_read;
   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
   /* WQM is requested only when helper invocations are needed AND the prolog
    * actually does interpolation-related work. */
   key->ps_prolog.wqm =
      info->base.fs.needs_quad_helper_invocations &&
      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
       key->ps_prolog.states.force_linear_sample_interp ||
       key->ps_prolog.states.force_persp_center_interp ||
       key->ps_prolog.states.force_linear_center_interp ||
       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
   key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components;

   /* NOTE(review): presumably the stipple pattern is fetched with a VMEM load
    * in the prolog — confirm against the prolog builder. */
   if (shader->key.ps.part.prolog.poly_stipple)
      shader->info.uses_vmem_load_other = true;

   if (shader->info.ps_colors_read) {
      uint8_t *color = shader->selector->info.color_attr_index;

      if (shader->key.ps.part.prolog.color_two_side) {
         /* BCOLORs are stored after the last input. */
         key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
      }

      /* Per-color-attachment (COL0/COL1) interpolation setup. */
      for (unsigned i = 0; i < 2; i++) {
         unsigned interp = info->color_interpolate[i];
         unsigned location = info->color_interpolate_loc[i];

         /* Skip colors whose 4-bit read mask is empty. */
         if (!(shader->info.ps_colors_read & (0xf << i * 4)))
            continue;

         key->ps_prolog.color_attr_index[i] = color[i];

         /* Flat shading overrides COLOR-mode interpolation. */
         if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
            interp = INTERP_MODE_FLAT;

         switch (interp) {
         case INTERP_MODE_FLAT:
            /* -1: no interpolation weights needed. */
            key->ps_prolog.color_interp_vgpr_index[i] = -1;
            break;
         case INTERP_MODE_SMOOTH:
         case INTERP_MODE_COLOR:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_persp_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_persp_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* VGPR indices 0/2/4 select the perspective sample/center/centroid
             * weight pairs, and the matching SPI enable bit is set. */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 0;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 2;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 4;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         case INTERP_MODE_NOPERSPECTIVE:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_linear_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_linear_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The VGPR assignment for non-monolithic shaders
             * works because InitialPSInputAddr is set on the
             * main shader and PERSP_PULL_MODEL is never used.
             */
            /* VGPR indices 6/8/10 select the linear sample/center/centroid
             * weight pairs. */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 6;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 8;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 10;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         default:
            assert(0);
         }
      }
   }
}
3293 
3294 /**
3295  * Check whether a PS prolog is required based on the key.
3296  */
si_need_ps_prolog(const union si_shader_part_key * key)3297 bool si_need_ps_prolog(const union si_shader_part_key *key)
3298 {
3299    return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3300           key->ps_prolog.states.force_linear_sample_interp ||
3301           key->ps_prolog.states.force_persp_center_interp ||
3302           key->ps_prolog.states.force_linear_center_interp ||
3303           key->ps_prolog.states.bc_optimize_for_persp ||
3304           key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
3305           key->ps_prolog.states.samplemask_log_ps_iter;
3306 }
3307 
3308 /**
3309  * Compute the PS epilog key, which contains all the information needed to
3310  * build the PS epilog function.
3311  */
si_get_ps_epilog_key(struct si_shader * shader,union si_shader_part_key * key)3312 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
3313 {
3314    struct si_shader_info *info = &shader->selector->info;
3315    memset(key, 0, sizeof(*key));
3316    key->ps_epilog.use_aco = info->base.use_aco_amd;
3317    key->ps_epilog.wave32 = shader->wave_size == 32;
3318    key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
3319    key->ps_epilog.colors_written = info->colors_written;
3320    key->ps_epilog.color_types = info->output_color_types;
3321    key->ps_epilog.writes_z = info->writes_z;
3322    key->ps_epilog.writes_stencil = info->writes_stencil;
3323    key->ps_epilog.writes_samplemask = info->writes_samplemask &&
3324                                       !shader->key.ps.part.epilog.kill_samplemask;
3325    key->ps_epilog.states = shader->key.ps.part.epilog;
3326 }
3327 
3328 /**
3329  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
3330  */
si_shader_select_ps_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3331 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3332                                       struct si_shader *shader, struct util_debug_callback *debug)
3333 {
3334    union si_shader_part_key prolog_key;
3335    union si_shader_part_key epilog_key;
3336 
3337    /* Get the prolog. */
3338    si_get_ps_prolog_key(shader, &prolog_key);
3339 
3340    /* The prolog is a no-op if these aren't set. */
3341    if (si_need_ps_prolog(&prolog_key)) {
3342       shader->prolog =
3343          si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
3344                             compiler, debug, "Fragment Shader Prolog");
3345       if (!shader->prolog)
3346          return false;
3347    }
3348 
3349    /* Get the epilog. */
3350    si_get_ps_epilog_key(shader, &epilog_key);
3351 
3352    shader->epilog =
3353       si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
3354                          compiler, debug, "Fragment Shader Epilog");
3355    if (!shader->epilog)
3356       return false;
3357 
3358    si_fixup_spi_ps_input_config(shader);
3359 
3360    /* Make sure spi_ps_input_addr bits is superset of spi_ps_input_ena. */
3361    unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
3362    unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
3363    assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);
3364 
3365    return true;
3366 }
3367 
/**
 * Workaround for an SPI barrier management bug on Bonaire/Kabini.
 *
 * Currently disabled: tessellation is all offchip and on-chip GS isn't used,
 * so the workaround is not needed. The original code is kept under #if 0
 * (instead of dead code after an unconditional return) so it documents the
 * workaround without tripping unreachable-code warnings; re-enable by
 * removing the early return and the #if 0/#endif pair.
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   return;

#if 0
   /* SPI barrier management bug:
    *   Make sure we have at least 4k of LDS in use to avoid the bug.
    *   It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
#endif
}
3382 
si_fix_resource_usage(struct si_screen * sscreen,struct si_shader * shader)3383 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
3384 {
3385    unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
3386 
3387    shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
3388 
3389    if (shader->selector->stage == MESA_SHADER_COMPUTE &&
3390        si_get_max_workgroup_size(shader) > shader->wave_size) {
3391       si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
3392    }
3393 }
3394 
si_create_shader_variant(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3395 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3396                               struct si_shader *shader, struct util_debug_callback *debug)
3397 {
3398    struct si_shader_selector *sel = shader->selector;
3399    struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size);
3400 
3401    if (sel->stage == MESA_SHADER_FRAGMENT) {
3402       shader->ps.writes_samplemask = sel->info.writes_samplemask &&
3403                                      !shader->key.ps.part.epilog.kill_samplemask;
3404    }
3405 
3406    /* LS, ES, VS are compiled on demand if the main part hasn't been
3407     * compiled for that stage.
3408     *
3409     * GS are compiled on demand if the main part hasn't been compiled
3410     * for the chosen NGG-ness.
3411     *
3412     * Vertex shaders are compiled on demand when a vertex fetch
3413     * workaround must be applied.
3414     */
3415    if (shader->is_monolithic) {
3416       /* Monolithic shader (compiled as a whole, has many variants,
3417        * may take a long time to compile).
3418        */
3419       if (!si_compile_shader(sscreen, compiler, shader, debug))
3420          return false;
3421    } else {
3422       /* The shader consists of several parts:
3423        *
3424        * - the middle part is the user shader, it has 1 variant only
3425        *   and it was compiled during the creation of the shader
3426        *   selector
3427        * - the prolog part is inserted at the beginning
3428        * - the epilog part is inserted at the end
3429        *
3430        * The prolog and epilog have many (but simple) variants.
3431        *
3432        * Starting with gfx9, geometry and tessellation control
3433        * shaders also contain the prolog and user shader parts of
3434        * the previous shader stage.
3435        */
3436 
3437       if (!mainp)
3438          return false;
3439 
3440       /* Copy the compiled shader data over. */
3441       shader->is_binary_shared = true;
3442       shader->binary = mainp->binary;
3443       shader->config = mainp->config;
3444       shader->info = mainp->info;
3445 
3446       /* Select prologs and/or epilogs. */
3447       switch (sel->stage) {
3448       case MESA_SHADER_TESS_CTRL:
3449          if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
3450             return false;
3451          break;
3452       case MESA_SHADER_GEOMETRY:
3453          if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
3454             return false;
3455 
3456          /* Clone the GS copy shader for the shader variant.
3457           * We can't just copy the pointer because we change the pm4 state and
3458           * si_shader_selector::gs_copy_shader must be immutable because it's shared
3459           * by multiple contexts.
3460           */
3461          if (!shader->key.ge.as_ngg) {
3462             assert(mainp->gs_copy_shader);
3463             assert(mainp->gs_copy_shader->bo);
3464             assert(!mainp->gs_copy_shader->previous_stage_sel);
3465             assert(!mainp->gs_copy_shader->scratch_va);
3466 
3467             shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
3468             memcpy(shader->gs_copy_shader, mainp->gs_copy_shader,
3469                    sizeof(*shader->gs_copy_shader));
3470             /* Increase the reference count. */
3471             pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
3472             /* Initialize some fields differently. */
3473             shader->gs_copy_shader->shader_log = NULL;
3474             shader->gs_copy_shader->is_binary_shared = true;
3475             util_queue_fence_init(&shader->gs_copy_shader->ready);
3476          }
3477          break;
3478       case MESA_SHADER_FRAGMENT:
3479          if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
3480             return false;
3481 
3482          /* Make sure we have at least as many VGPRs as there
3483           * are allocated inputs.
3484           */
3485          shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
3486          break;
3487       default:;
3488       }
3489 
3490       assert(shader->wave_size == mainp->wave_size);
3491       assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);
3492 
3493       /* Update SGPR and VGPR counts. */
3494       if (shader->prolog) {
3495          shader->config.num_sgprs =
3496             MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
3497          shader->config.num_vgprs =
3498             MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
3499       }
3500       if (shader->previous_stage) {
3501          shader->config.num_sgprs =
3502             MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
3503          shader->config.num_vgprs =
3504             MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
3505          shader->config.spilled_sgprs =
3506             MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
3507          shader->config.spilled_vgprs =
3508             MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
3509          shader->info.private_mem_vgprs =
3510             MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
3511          shader->config.scratch_bytes_per_wave =
3512             MAX2(shader->config.scratch_bytes_per_wave,
3513                  shader->previous_stage->config.scratch_bytes_per_wave);
3514          shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
3515          shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
3516          shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
3517       }
3518       if (shader->epilog) {
3519          shader->config.num_sgprs =
3520             MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
3521          shader->config.num_vgprs =
3522             MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
3523       }
3524       si_calculate_max_simd_waves(shader);
3525    }
3526 
3527    if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
3528       assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
3529       if (!gfx10_ngg_calculate_subgroup_info(shader)) {
3530          fprintf(stderr, "Failed to compute subgroup info\n");
3531          return false;
3532       }
3533    } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
3534       gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
3535    }
3536 
3537    shader->uses_vs_state_provoking_vertex =
3538       sscreen->use_ngg &&
3539       /* Used to convert triangle strips from GS to triangles. */
3540       ((sel->stage == MESA_SHADER_GEOMETRY &&
3541         util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
3542        (sel->stage == MESA_SHADER_VERTEX &&
3543         /* Used to export PrimitiveID from the correct vertex. */
3544         shader->key.ge.mono.u.vs_export_prim_id));
3545 
3546    shader->uses_gs_state_outprim = sscreen->use_ngg &&
3547                                    /* Only used by streamout and the PrimID export in vertex
3548                                     * shaders. */
3549                                    sel->stage == MESA_SHADER_VERTEX &&
3550                                    (si_shader_uses_streamout(shader) ||
3551                                     shader->uses_vs_state_provoking_vertex);
3552 
3553    if (sel->stage == MESA_SHADER_VERTEX) {
3554       shader->uses_base_instance = sel->info.uses_base_instance ||
3555                                    shader->key.ge.mono.instance_divisor_is_one ||
3556                                    shader->key.ge.mono.instance_divisor_is_fetched;
3557    } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
3558       shader->uses_base_instance = shader->previous_stage_sel &&
3559                                    (shader->previous_stage_sel->info.uses_base_instance ||
3560                                     shader->key.ge.mono.instance_divisor_is_one ||
3561                                     shader->key.ge.mono.instance_divisor_is_fetched);
3562    } else if (sel->stage == MESA_SHADER_GEOMETRY) {
3563       shader->uses_base_instance = shader->previous_stage_sel &&
3564                                    (shader->previous_stage_sel->info.uses_base_instance ||
3565                                     shader->key.ge.mono.instance_divisor_is_one ||
3566                                     shader->key.ge.mono.instance_divisor_is_fetched);
3567    }
3568 
3569    si_fix_resource_usage(sscreen, shader);
3570 
3571    /* Upload. */
3572    bool ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
3573 
3574    shader->complete_shader_binary_size = si_get_shader_binary_size(sscreen, shader);
3575 
3576    si_shader_dump(sscreen, shader, debug, stderr, true);
3577 
3578    if (!ok)
3579       fprintf(stderr, "LLVM failed to upload shader\n");
3580    return ok;
3581 }
3582 
si_shader_binary_clean(struct si_shader_binary * binary)3583 void si_shader_binary_clean(struct si_shader_binary *binary)
3584 {
3585    free((void *)binary->code_buffer);
3586    binary->code_buffer = NULL;
3587 
3588    free(binary->llvm_ir_string);
3589    binary->llvm_ir_string = NULL;
3590 
3591    free((void *)binary->symbols);
3592    binary->symbols = NULL;
3593 
3594    free(binary->uploaded_code);
3595    binary->uploaded_code = NULL;
3596    binary->uploaded_code_size = 0;
3597 }
3598 
/* Free the resources owned by a shader variant: the GPU buffer reference,
 * the binary (unless it is shared with the main shader part), and the log.
 * Does not free the si_shader struct itself.
 */
void si_shader_destroy(struct si_shader *shader)
{
   si_resource_reference(&shader->bo, NULL);

   /* Shared binaries are owned by the main part; freeing them here would
    * be a double free.
    */
   if (!shader->is_binary_shared)
      si_shader_binary_clean(&shader->binary);

   free(shader->shader_log);
}
3608 
si_get_prev_stage_nir_shader(struct si_shader * shader,struct si_shader * prev_shader,struct si_shader_args * args,bool * free_nir)3609 nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
3610                                          struct si_shader *prev_shader,
3611                                          struct si_shader_args *args,
3612                                          bool *free_nir)
3613 {
3614    const struct si_shader_selector *sel = shader->selector;
3615    const union si_shader_key *key = &shader->key;
3616 
3617    if (sel->stage == MESA_SHADER_TESS_CTRL) {
3618       struct si_shader_selector *ls = key->ge.part.tcs.ls;
3619 
3620       prev_shader->selector = ls;
3621       prev_shader->key.ge.as_ls = 1;
3622    } else {
3623       struct si_shader_selector *es = key->ge.part.gs.es;
3624 
3625       prev_shader->selector = es;
3626       prev_shader->key.ge.as_es = 1;
3627       prev_shader->key.ge.as_ngg = key->ge.as_ngg;
3628    }
3629 
3630    prev_shader->next_shader = shader;
3631    prev_shader->key.ge.mono = key->ge.mono;
3632    prev_shader->key.ge.opt = key->ge.opt;
3633    prev_shader->key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
3634    /* kill_outputs was computed based on second shader's outputs so we can't use it to
3635     * kill first shader's outputs.
3636     */
3637    prev_shader->key.ge.opt.kill_outputs = 0;
3638    prev_shader->is_monolithic = true;
3639    prev_shader->wave_size = shader->wave_size;
3640 
3641    si_init_shader_args(prev_shader, args);
3642 
3643    nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir,
3644                                        sel->info.tcs_vgpr_only_inputs, NULL);
3645 
3646    si_update_shader_binary_info(shader, nir);
3647 
3648    shader->info.uses_instanceid |=
3649       prev_shader->selector->info.uses_instanceid || prev_shader->info.uses_instanceid;
3650 
3651    return nir;
3652 }
3653 
si_get_tcs_out_patch_stride(const struct si_shader_info * info)3654 unsigned si_get_tcs_out_patch_stride(const struct si_shader_info *info)
3655 {
3656    unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out;
3657    unsigned vertex_stride = util_last_bit64(info->outputs_written_before_tes_gs) * 4;
3658    unsigned num_patch_outputs = util_last_bit64(info->patch_outputs_written);
3659 
3660    return tcs_out_vertices * vertex_stride + num_patch_outputs * 4;
3661 }
3662 
/* Declare the argument list of the PS prolog: all input SGPRs of the main
 * shader first, then the fixed hardware PS input VGPRs (barycentrics,
 * fragcoord components, front face, ancillary, coverage, fixed-pt position).
 */
void si_get_ps_prolog_args(struct si_shader_args *args,
                           const union si_shader_part_key *key)
{
   memset(args, 0, sizeof(*args));

   const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs;

   /* VLA mirroring the main shader's SGPR arguments.
    * NOTE(review): assumes num_input_sgprs > 0 and is large enough to cover
    * the SI_SGPR_INTERNAL_BINDINGS and SI_PS_NUM_USER_SGPR indices read
    * below — TODO confirm this invariant holds for all prolog keys.
    */
   struct ac_arg input_sgprs[num_input_sgprs];
   for (unsigned i = 0; i < num_input_sgprs; i++)
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i);

   args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS];
   /* Use the absolute location of the input. */
   args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR];

   /* Barycentric coordinate VGPRs (2 components each). */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid);
   /* skip PERSP_PULL_MODEL */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid);
   /* skip LINE_STIPPLE_TEX */

   /* POS_X|Y|Z|W_FLOAT */
   for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);

   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt);
}
3696 
si_get_ps_epilog_args(struct si_shader_args * args,const union si_shader_part_key * key,struct ac_arg colors[MAX_DRAW_BUFFERS],struct ac_arg * depth,struct ac_arg * stencil,struct ac_arg * sample_mask)3697 void si_get_ps_epilog_args(struct si_shader_args *args,
3698                            const union si_shader_part_key *key,
3699                            struct ac_arg colors[MAX_DRAW_BUFFERS],
3700                            struct ac_arg *depth, struct ac_arg *stencil,
3701                            struct ac_arg *sample_mask)
3702 {
3703    memset(args, 0, sizeof(*args));
3704 
3705    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3706    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3707    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3708    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3709    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference);
3710 
3711    u_foreach_bit (i, key->ps_epilog.colors_written) {
3712       ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i);
3713    }
3714 
3715    if (key->ps_epilog.writes_z)
3716       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth);
3717 
3718    if (key->ps_epilog.writes_stencil)
3719       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil);
3720 
3721    if (key->ps_epilog.writes_samplemask)
3722       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask);
3723 }
3724