/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_shader.h"
#include "ac_nir.h"
#include "ac_rtld.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_serialize.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "tgsi/tgsi_from_mesa.h"
#include "util/u_memory.h"
#include "util/mesa-sha1.h"
#include "util/ralloc.h"
#include "util/u_upload_mgr.h"

#if LLVM_AVAILABLE
#include <llvm/Config/llvm-config.h> /* for LLVM_VERSION_MAJOR */
#else
#define LLVM_VERSION_MAJOR 0
#endif

static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";

static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";

static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);

/* Get the total number of interpolated inputs. */
unsigned si_get_ps_num_interp(struct si_shader *ps)
{
   unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
   unsigned num_interp =
      ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);

   assert(num_interp <= 32);
   return MIN2(num_interp, 32);
}

/** Whether the shader runs as a combination of multiple API shaders */
bool si_is_multi_part_shader(struct si_shader *shader)
{
   if (shader->selector->screen->info.gfx_level <= GFX8 ||
       shader->selector->stage > MESA_SHADER_GEOMETRY)
      return false;

   return shader->key.ge.as_ls || shader->key.ge.as_es ||
          shader->selector->stage == MESA_SHADER_TESS_CTRL ||
          shader->selector->stage == MESA_SHADER_GEOMETRY;
}

/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
bool si_is_merged_shader(struct si_shader *shader)
{
   if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
      return false;

   return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic)
{
   switch (semantic) {
   case VARYING_SLOT_POS:
      return SI_UNIQUE_SLOT_POS;
   default:
      if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
         return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);

      if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
         return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);

      assert(!"invalid generic index");
      return 0;

   /* Legacy desktop GL varyings. */
   case VARYING_SLOT_FOGC:
      return SI_UNIQUE_SLOT_FOGC;
   case VARYING_SLOT_COL0:
      return SI_UNIQUE_SLOT_COL0;
   case VARYING_SLOT_COL1:
      return SI_UNIQUE_SLOT_COL1;
   case VARYING_SLOT_BFC0:
      return SI_UNIQUE_SLOT_BFC0;
   case VARYING_SLOT_BFC1:
      return SI_UNIQUE_SLOT_BFC1;
   case VARYING_SLOT_TEX0:
   case VARYING_SLOT_TEX1:
   case VARYING_SLOT_TEX2:
   case VARYING_SLOT_TEX3:
   case VARYING_SLOT_TEX4:
   case VARYING_SLOT_TEX5:
   case VARYING_SLOT_TEX6:
   case VARYING_SLOT_TEX7:
      return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
   case VARYING_SLOT_CLIP_VERTEX:
      return SI_UNIQUE_SLOT_CLIP_VERTEX;

   /* Varyings present in both GLES and desktop GL. */
   case VARYING_SLOT_CLIP_DIST0:
      return SI_UNIQUE_SLOT_CLIP_DIST0;
   case VARYING_SLOT_CLIP_DIST1:
      return SI_UNIQUE_SLOT_CLIP_DIST1;
   case VARYING_SLOT_PSIZ:
      return SI_UNIQUE_SLOT_PSIZ;
   case VARYING_SLOT_LAYER:
      return SI_UNIQUE_SLOT_LAYER;
   case VARYING_SLOT_VIEWPORT:
      return SI_UNIQUE_SLOT_VIEWPORT;
   case VARYING_SLOT_PRIMITIVE_ID:
      return SI_UNIQUE_SLOT_PRIMITIVE_ID;
   }
}

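/* Declare the streamout (transform feedback) SGPR arguments: config, write
 * index, and one buffer offset per buffer with a non-zero stride. On GFX11+,
 * streamout is handled inside NGG shaders, so no streamout SGPRs are needed;
 * the single unused SGPR declared for TES presumably keeps the argument
 * layout consistent with the non-streamout path.
 */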
static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader)
{
   struct si_shader_selector *sel = shader->selector;

   if (shader->selector->screen->info.gfx_level >= GFX11) {
      /* NGG streamout. */
      if (sel->stage == MESA_SHADER_TESS_EVAL)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      return;
   }

   /* Streamout SGPRs. */
   if (si_shader_uses_streamout(shader)) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);

      /* A streamout buffer offset is loaded if the stride is non-zero. */
      for (int i = 0; i < 4; i++) {
         if (!sel->info.base.xfb_stride[i])
            continue;

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
      }
   } else if (sel->stage == MESA_SHADER_TESS_EVAL) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }
}

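/* Return the maximum HW workgroup size in threads for this shader, or 0 if
 * the shader is not dispatched as a workgroup (e.g. a non-merged VS).
 */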
unsigned si_get_max_workgroup_size(const struct si_shader *shader)
{
   gl_shader_stage stage = shader->is_gs_copy_shader ?
      MESA_SHADER_VERTEX : shader->selector->stage;

   switch (stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      /* Use the largest workgroup size for streamout. */
      if (shader->key.ge.as_ngg)
         return si_shader_uses_streamout(shader) ? 256 : 128;

      /* As part of a merged shader. */
      return shader->selector->screen->info.gfx_level >= GFX9 &&
             (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : 0;

   case MESA_SHADER_TESS_CTRL:
      /* Return this so that LLVM doesn't remove s_barrier
       * instructions on chips where we use s_barrier. */
      return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0;

   case MESA_SHADER_GEOMETRY:
      /* GS can always generate up to 256 vertices. */
      return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : 0;

   case MESA_SHADER_COMPUTE:
      break; /* see below */

   default:
      return 0;
   }

   /* Compile a variable block size using the maximum variable size. */
   if (shader->selector->info.base.workgroup_size_variable)
      return SI_MAX_VARIABLE_THREADS_PER_BLOCK;

   uint16_t *local_size = shader->selector->info.base.workgroup_size;
   unsigned max_work_group_size = (uint32_t)local_size[0] *
                                  (uint32_t)local_size[1] *
                                  (uint32_t)local_size[2];
   assert(max_work_group_size);
   return max_work_group_size;
}

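/* If the shader uses exactly one constant buffer and no shader buffers, the
 * user SGPR can point directly at the buffer contents instead of at a
 * descriptor list, which lets loads skip fetching a descriptor first.
 */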
static void declare_const_and_shader_buffers(struct si_shader_args *args,
                                             struct si_shader *shader,
                                             bool assign_params)
{
   enum ac_arg_type const_shader_buf_type;

   if (shader->selector->info.base.num_ubos == 1 &&
       shader->selector->info.base.num_ssbos == 0)
      const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
   else
      const_shader_buf_type = AC_ARG_CONST_DESC_PTR;

   ac_add_arg(
      &args->ac, AC_ARG_SGPR, 1, const_shader_buf_type,
      assign_params ? &args->const_and_shader_buffers : &args->other_const_and_shader_buffers);
}

static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              assign_params ? &args->samplers_and_images : &args->other_samplers_and_images);
}

static void declare_per_stage_desc_pointers(struct si_shader_args *args,
                                            struct si_shader *shader,
                                            bool assign_params)
{
   declare_const_and_shader_buffers(args, shader, assign_params);
   declare_samplers_and_images(args, assign_params);
}

static void declare_global_desc_pointers(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &args->bindless_samplers_and_images);
}

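/* Declare the vertex buffer descriptor list pointer, and optionally inline
 * some vertex buffer descriptors into user SGPRs. Each inlined descriptor
 * must start on a 4-SGPR boundary (a hw requirement), so unused SGPRs are
 * declared as padding up to SI_SGPR_VS_VB_DESCRIPTOR_FIRST.
 */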
static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
                                              struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);

   unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs;
   if (num_vbos_in_user_sgprs) {
      unsigned user_sgprs = args->ac.num_sgprs_used;

      if (si_is_merged_shader(shader))
         user_sgprs -= 8;
      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors));
      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]);
   }
}

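/* Declare the fixed-function VS input VGPRs. The VGPR layout differs per
 * generation: GFX12 only has VertexID and InstanceID, while older chips add
 * rel_patch_id, PrimID (legacy), user, or unused slots depending on whether
 * the VS runs as LS and on the gfx level.
 */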
static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);

   if (shader->selector->screen->info.gfx_level >= GFX12) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else if (shader->key.ge.as_ls) {
      if (shader->selector->screen->info.gfx_level >= GFX11) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else if (shader->selector->screen->info.gfx_level >= GFX10) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (shader->selector->screen->info.gfx_level >= GFX10) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
                 /* user VGPR or PrimID (legacy) */
                 shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }
}

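/* Declare the SGPR inputs of the internal blit VS (u_blitter): packed
 * position and depth, followed by either 4 color SGPRs or 6 texcoord SGPRs,
 * optionally followed by the attribute ring address on GFX11+.
 */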
static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args)
{
   bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x2, y2 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */

   if (shader->selector->info.base.vs.blit_sgprs_amd ==
       SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   } else if (shader->selector->info.base.vs.blit_sgprs_amd ==
              SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   }
}

static void declare_tes_input_vgprs(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}

enum
{
   /* Convenient merged shader definitions. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};

void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}

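/* Declare all input SGPRs and VGPRs of a shader variant in the exact order
 * the hardware and the shader ABI expect them: global descriptor pointers,
 * per-stage user SGPRs, system SGPRs, then system VGPRs. Merged GFX9+ stages
 * are dispatched to the SI_SHADER_MERGED_* cases and start with 8 system
 * SGPRs.
 */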
void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
{
   unsigned i, num_returns, num_return_sgprs;
   unsigned num_prolog_vgprs = 0;
   struct si_shader_selector *sel = shader->selector;
   unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : sel->stage;
   unsigned stage_case = stage;

   memset(args, 0, sizeof(*args));

   /* Set MERGED shaders. */
   if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
      if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
         stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
      else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
         stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
   }

   switch (stage_case) {
   case MESA_SHADER_VERTEX:
      declare_global_desc_pointers(args);

      if (sel->info.base.vs.blit_sgprs_amd) {
         declare_vs_blit_inputs(shader, args);
      } else {
         declare_per_stage_desc_pointers(args, shader, true);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);

         if (shader->is_gs_copy_shader) {
            declare_streamout_params(args, shader);
         } else {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
            declare_vb_descriptor_input_sgprs(args, shader);

            if (shader->key.ge.as_es) {
               ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
            } else if (shader->key.ge.as_ls) {
               /* no extra parameters */
            } else {
               declare_streamout_params(args, shader);
            }
         }
      }

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      declare_vs_input_vgprs(args, shader);

      break;

   case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
      break;

   case SI_SHADER_MERGED_VERTEX_TESSCTRL:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
      /* Gfx11+:  SPI_SHADER_PGM_LO/HI_HS */
      declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_TESS_CTRL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
      if (sel->screen->info.gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_VERTEX);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);

      /* VGPRs (first TCS, then VS) */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);

      if (stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(args, shader);

         /* The LS/HS argument indices of shared arguments must stay the same
          * with ACO, so this can't be declared before the shared VGPRs.
          */
         declare_vb_descriptor_input_sgprs(args, shader);

         /* LS return values are inputs to the TCS main shader part. */
         if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) {
            for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
               ac_add_return(&args->ac, AC_ARG_SGPR);
            for (i = 0; i < 2; i++)
               ac_add_return(&args->ac, AC_ARG_VGPR);

            /* VS outputs passed via VGPRs to TCS. */
            if (shader->key.ge.opt.same_patch_vertices && !sel->info.base.use_aco_amd) {
               unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
               for (i = 0; i < num_outputs * 4; i++)
                  ac_add_return(&args->ac, AC_ARG_VGPR);
            }
         }
      } else {
         /* TCS inputs are passed via VGPRs from VS. */
         if (shader->key.ge.opt.same_patch_vertices && !sel->info.base.use_aco_amd) {
            unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written_before_tes_gs);
            for (i = 0; i < num_inputs * 4; i++)
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
         }
      }
      break;

   case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
      /* Gfx11+:  SPI_SHADER_PGM_LO/HI_GS */
      declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_GEOMETRY);

      if (shader->key.ge.as_ngg)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      if (sel->screen->info.gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(args);
      if (stage != MESA_SHADER_VERTEX || !sel->info.base.vs.blit_sgprs_amd) {
         declare_per_stage_desc_pointers(
            args, shader, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL));
      }

      if (stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd) {
         declare_vs_blit_inputs(shader, args);
      } else {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);

         if (stage == MESA_SHADER_VERTEX) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
         } else if (stage == MESA_SHADER_TESS_EVAL) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
         } else {
            /* GS */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info);
         if (sel->screen->info.gfx_level >= GFX11)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address);
         else
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      }

      /* VGPRs (first GS, then VS/TES) */
      if (sel->screen->info.gfx_level >= GFX12) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
      }

      if (stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(args, shader);

         /* The ES/GS argument indices of shared arguments must stay the same
          * with ACO, so this can't be declared before the shared VGPRs.
          */
         if (!sel->info.base.vs.blit_sgprs_amd)
            declare_vb_descriptor_input_sgprs(args, shader);
      } else if (stage == MESA_SHADER_TESS_EVAL) {
         declare_tes_input_vgprs(args);
      }

      if (shader->key.ge.as_es && !shader->is_monolithic &&
          (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) {
         /* ES return values are inputs to GS. */
         for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (i = 0; i < (sel->screen->info.gfx_level >= GFX12 ? 3 : 5); i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case MESA_SHADER_TESS_EVAL:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);

      if (shader->key.ge.as_es) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
      } else {
         declare_streamout_params(args, shader);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      }

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      declare_tes_input_vgprs(args);
      break;

   case MESA_SHADER_GEOMETRY:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
      break;

   case MESA_SHADER_FRAGMENT:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference,
                         SI_PARAM_ALPHA_REF);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask,
                         SI_PARAM_PRIM_MASK);

      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample,
                         SI_PARAM_PERSP_SAMPLE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center,
                         SI_PARAM_PERSP_CENTER);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid,
                         SI_PARAM_PERSP_CENTROID);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample,
                         SI_PARAM_LINEAR_SAMPLE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center,
                         SI_PARAM_LINEAR_CENTER);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid,
                         SI_PARAM_LINEAR_CENTROID);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0],
                         SI_PARAM_POS_X_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1],
                         SI_PARAM_POS_Y_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2],
                         SI_PARAM_POS_Z_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3],
                         SI_PARAM_POS_W_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face,
                         SI_PARAM_FRONT_FACE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary,
                         SI_PARAM_ANCILLARY);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage,
                         SI_PARAM_SAMPLE_COVERAGE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
                         SI_PARAM_POS_FIXED_PT);

      if (sel->info.base.use_aco_amd) {
         ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);

         /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
         if (sel->screen->info.gfx_level < GFX11)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }

      /* A monolithic PS emits the prolog and epilog directly in NIR. */
      if (!shader->is_monolithic) {
         /* Color inputs from the prolog. */
         if (shader->selector->info.colors_read) {
            unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);

            for (i = 0; i < num_color_elements; i++)
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start);

            num_prolog_vgprs += num_color_elements;
         }

         /* Outputs for the epilog. */
         num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
         num_returns =
            num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
            shader->selector->info.writes_z + shader->selector->info.writes_stencil +
            shader->ps.writes_samplemask + 1 /* SampleMaskIn */;

         for (i = 0; i < num_return_sgprs; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (; i < num_returns; i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case MESA_SHADER_COMPUTE:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      if (shader->selector->info.uses_grid_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
      if (shader->selector->info.uses_variable_block_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size);

      unsigned cs_user_data_dwords =
         shader->selector->info.base.cs.user_data_components_amd;
      if (cs_user_data_dwords) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, MIN2(cs_user_data_dwords, 4), AC_ARG_INT,
                    &args->cs_user_data[0]);
         if (cs_user_data_dwords > 4) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords - 4, AC_ARG_INT,
                       &args->cs_user_data[1]);
         }
      }

      /* Some descriptors can be in user SGPRs. */
      /* Shader buffers in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
         while (args->ac.num_sgprs_used % 4 != 0)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]);
      }
      /* Images in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
         unsigned num_sgprs = BITSET_TEST(shader->selector->info.base.image_buffers, i) ? 4 : 8;

         while (args->ac.num_sgprs_used % num_sgprs != 0)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]);
      }

      /* Hardware SGPRs. */
      for (i = 0; i < 3; i++) {
         if (shader->selector->info.uses_block_id[i]) {
            /* GFX12 loads workgroup IDs into ttmp registers, so they are not input SGPRs, but we
             * still need to set this to indicate that they are enabled (for ac_nir_to_llvm).
             */
            if (sel->screen->info.gfx_level >= GFX12)
               args->ac.workgroup_ids[i].used = true;
            else
               ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
         }
      }
      if (shader->selector->info.uses_tg_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (sel->info.base.use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* Hardware VGPRs. */
      /* Thread IDs are either packed into VGPR0 (10 bits per component) or
       * stored in 3 separate VGPRs. */
      if (sel->screen->info.gfx_level >= GFX11 ||
          (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200))
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids);
      else
         ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.local_invocation_ids);
      break;
   default:
      assert(0 && "unimplemented shader");
      return;
   }

   shader->info.num_input_sgprs = args->ac.num_sgprs_used;
   shader->info.num_input_vgprs = args->ac.num_vgprs_used;

   assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
   shader->info.num_input_vgprs -= num_prolog_vgprs;
}

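/* LDS is allocated in units of this granularity (in bytes); config.lds_size
 * is stored in these units rather than in bytes.
 */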
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   return screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT ? 1024 :
          screen->info.gfx_level >= GFX7 ? 512 : 256;
}

bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                           struct ac_rtld_binary *rtld)
{
   const struct si_shader_selector *sel = shader->selector;
   const char *part_elfs[5];
   size_t part_sizes[5];
   unsigned num_parts = 0;

#define add_part(shader_or_part) \
   if (shader_or_part) { \
      part_elfs[num_parts] = (shader_or_part)->binary.code_buffer; \
      part_sizes[num_parts] = (shader_or_part)->binary.code_size; \
      num_parts++; \
   }

   add_part(shader->prolog);
   add_part(shader->previous_stage);
   add_part(shader);
   add_part(shader->epilog);

#undef add_part

   struct ac_rtld_symbol lds_symbols[2];
   unsigned num_lds_symbols = 0;

   if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
       (sel->stage == MESA_SHADER_GEOMETRY ||
        (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "esgs_ring";
      sym->size = shader->gs_info.esgs_ring_size * 4;
      sym->align = 64 * 1024;
   }

   if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "ngg_emit";
      sym->size = shader->ngg.ngg_emit_size * 4;
      sym->align = 4;
   }

   bool ok = ac_rtld_open(
      rtld, (struct ac_rtld_open_info){.info = &screen->info,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
                                             .waitcnt_wa = num_parts > 1 &&
                                                           screen->info.needs_llvm_wait_wa,
                                          },
                                       .shader_type = sel->stage,
                                       .wave_size = shader->wave_size,
                                       .num_parts = num_parts,
                                       .elf_ptrs = part_elfs,
                                       .elf_sizes = part_sizes,
                                       .num_shared_lds_symbols = num_lds_symbols,
                                       .shared_lds_symbols = lds_symbols});

   if (rtld->lds_size > 0) {
      unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
      shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
   }

   return ok;
}

static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
{
   unsigned num_bin = 0;

   if (shader->prolog)
      bin[num_bin++] = &shader->prolog->binary;

   if (shader->previous_stage)
      bin[num_bin++] = &shader->previous_stage->binary;

   bin[num_bin++] = &shader->binary;

   if (shader->epilog)
      bin[num_bin++] = &shader->epilog->binary;

   return num_bin;
}

/* si_get_shader_binary_size should only be called once per shader
 * and the result should be stored in shader->complete_shader_binary_size.
 */
unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
{
   if (shader->binary.type == SI_SHADER_BINARY_ELF) {
      struct ac_rtld_binary rtld;
      si_shader_binary_open(screen, shader, &rtld);
      uint64_t size = rtld.exec_size;
      ac_rtld_close(&rtld);
      return size;
   } else {
      struct si_shader_binary *bin[4];
      unsigned num_bin = get_shader_binaries(shader, bin);

      unsigned size = 0;
      for (unsigned i = 0; i < num_bin; i++) {
         assert(bin[i]->type == SI_SHADER_BINARY_RAW);
         size += bin[i]->exec_size;
      }
      return size;
   }
}

unsigned si_get_shader_prefetch_size(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   /* This excludes arrays of constants after instructions. */
   unsigned exec_size =
      ac_align_shader_binary_for_prefetch(&sscreen->info,
                                          shader->complete_shader_binary_size);

   /* INST_PREF_SIZE uses 128B granularity.
    * - GFX11: max 128 * 63 = 8064
    * - GFX12: max 128 * 255 = 32640
    */
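   /* Illustrative example: a 5000-byte binary needs DIV_ROUND_UP(5000, 128)
    * = 40 granules, well below both limits, so 40 is returned unclamped.
    */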
   unsigned max_pref_size = shader->selector->screen->info.gfx_level >= GFX12 ? 255 : 63;
   unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);

   return MIN2(max_pref_size, exec_size_gran128);
}

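/* ac_rtld callback: resolve the SCRATCH_RSRC_DWORD0/1 symbols to the two
 * halves of the scratch buffer resource descriptor, with swizzling enabled
 * for scratch coalescing.
 */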
bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
                            uint64_t *value)
{
   uint64_t *scratch_va = data;

   if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
      *value = (uint32_t)*scratch_va;
      return true;
   }
   if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
      /* Enable scratch coalescing. */
      *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);

      if (gfx_level >= GFX11)
         *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
      else
         *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
      return true;
   }

   return false;
}

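/* Allocate (or reuse) the shader BO and return a CPU pointer to write the
 * binary into. There are two paths: a direct CPU map of the BO, or, with
 * dma_upload, a staging allocation that post_upload_binary() later copies
 * to VRAM with CP DMA.
 */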
static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               unsigned binary_size, bool dma_upload,
                               struct si_context **upload_ctx,
                               struct pipe_resource **staging,
                               unsigned *staging_offset,
                               int64_t bo_offset)
{
   unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);

   if (bo_offset >= 0) {
      /* sqtt needs to upload shaders as a pipeline, where all shaders
       * are contiguous in memory.
       * In this case, bo_offset will be non-negative and we don't have to
       * realloc a new bo.
       */
      shader->gpu_address = shader->bo->gpu_address + bo_offset;
      dma_upload = false;
   } else {
      si_resource_reference(&shader->bo, NULL);
      shader->bo = si_aligned_buffer_create(
         &sscreen->b,
         SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
            (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
         PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
      if (!shader->bo)
         return NULL;

      shader->gpu_address = shader->bo->gpu_address;
      bo_offset = 0;
   }

   if (dma_upload) {
      /* First upload into a staging buffer. */
      *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);

      void *ret;
      u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
                     staging_offset, staging, &ret);
      if (!ret)
         si_put_aux_context_flush(&sscreen->aux_context.shader_upload);

      return ret;
   } else {
      void *ptr = sscreen->ws->buffer_map(sscreen->ws,
                                          shader->bo->buf, NULL,
                                          PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
      if (!ptr)
         return NULL;

      return ptr + bo_offset;
   }
}

static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               void *code, unsigned code_size,
                               unsigned binary_size, bool dma_upload,
                               struct si_context *upload_ctx,
                               struct pipe_resource *staging,
                               unsigned staging_offset)
{
   if (sscreen->debug_flags & DBG(SQTT)) {
      /* Remember the uploaded code */
      shader->binary.uploaded_code_size = code_size;
      shader->binary.uploaded_code = malloc(code_size);
      memcpy(shader->binary.uploaded_code, code, code_size);
   }

   if (dma_upload) {
      /* Then copy from the staging buffer to VRAM.
       *
       * We can't use the upload copy in si_buffer_transfer_unmap because that might use
       * a compute shader, and we can't use shaders in the code that is responsible for making
       * them available.
       */
      si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
                            binary_size);
      si_barrier_after_simple_buffer_op(upload_ctx, 0, &shader->bo->b.b, staging);
      upload_ctx->barrier_flags |= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2;

#if 0 /* debug: validate whether the copy was successful */
      uint32_t *dst_binary = malloc(binary_size);
      uint32_t *src_binary = (uint32_t*)code;
      pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
      puts("dst_binary == src_binary:");
      for (unsigned i = 0; i < binary_size / 4; i++) {
         printf(" %08x == %08x\n", dst_binary[i], src_binary[i]);
      }
      free(dst_binary);
      exit(0);
#endif

      si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
      pipe_resource_reference(&staging, NULL);
   } else {
      sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
   }
}

static int upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct ac_rtld_binary binary;
   if (!si_shader_binary_open(sscreen, shader, &binary))
      return -1;

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

   /* Upload. */
   struct ac_rtld_upload_info u = {};
   u.binary = &binary;
   u.get_external_symbol = si_get_external_symbol;
   u.cb_data = &scratch_va;
   u.rx_va = shader->gpu_address;
   u.rx_ptr = rx_ptr;

   int size = ac_rtld_upload(&u);

   post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   ac_rtld_close(&binary);

   return size;
}

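/* Recompute config.lds_size for GFX9+ merged/NGG shaders from the ESGS ring,
 * the NGG emit space, and the NGG scratch area (whose base must be 8-byte
 * aligned).
 */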
static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
{
   gl_shader_stage stage =
      shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;

   if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
       (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
      unsigned size_in_dw = shader->gs_info.esgs_ring_size;

      if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
         size_in_dw += shader->ngg.ngg_emit_size;

      if (shader->key.ge.as_ngg) {
         unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader);
         if (scratch_dw_size) {
            /* The scratch base address needs to be 8-byte aligned. */
            size_in_dw = ALIGN(size_in_dw, 2);
            size_in_dw += scratch_dw_size;
         }
      }

      shader->config.lds_size =
         DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
   }
}

static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct si_shader_binary *bin[4];
   unsigned num_bin = get_shader_binaries(shader, bin);

   unsigned code_size = 0, exec_size = 0;
   for (unsigned i = 0; i < num_bin; i++) {
      assert(bin[i]->type == SI_SHADER_BINARY_RAW);
      code_size += bin[i]->code_size;
      exec_size += bin[i]->exec_size;
   }

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

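   /* Resulting layout: the exec portions of all parts are packed first so
    * that the HW prefetches one contiguous instruction range, followed by
    * the const data of each part:
    *    [exec0][exec1]...[data0][data1]...
    */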
   unsigned exec_offset = 0, data_offset = exec_size;
   for (unsigned i = 0; i < num_bin; i++) {
      memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);

      if (bin[i]->num_symbols) {
         /* The offset that must be added to const data symbols because other
          * shader parts are inserted between the exec code and the const data.
          */
         unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;

         /* Prologs and epilogs have no symbols. */
         struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
         assert(sh && bin[i] == &sh->binary);

         si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
                                scratch_va, const_offset);
      }

      exec_offset += bin[i]->exec_size;

      unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
      if (data_size) {
         memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
         data_offset += data_size;
      }
   }

   post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   calculate_needed_lds_size(sscreen, shader);
   return code_size;
}

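/* Upload the shader binary to a GPU buffer. Returns the uploaded code size,
 * or -1 on failure. A non-negative bo_offset uploads into the existing
 * shader->bo at that offset (used by SQTT to pack whole pipelines into one
 * buffer); otherwise a new BO is allocated.
 */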
int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
                               uint64_t scratch_va, int64_t bo_offset)
{
   bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
                     sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
                     bo_offset < 0;

   if (shader->binary.type == SI_SHADER_BINARY_ELF) {
      return upload_binary_elf(sscreen, shader, scratch_va, dma_upload, bo_offset);
   } else {
      assert(shader->binary.type == SI_SHADER_BINARY_RAW);
      return upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
   }
}

int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                            uint64_t scratch_va)
{
   return si_shader_binary_upload_at(sscreen, shader, scratch_va, -1);
}

static void print_disassembly(const char *disasm, size_t nbytes,
                              const char *name, FILE *file,
                              struct util_debug_callback *debug)
{
   if (debug && debug->debug_message) {
      /* Very long debug messages are cut off, so send the
       * disassembly one line at a time. This causes more
       * overhead, but on the plus side it simplifies
       * parsing of resulting logs.
       */
      util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");

      uint64_t line = 0;
      while (line < nbytes) {
         int count = nbytes - line;
         const char *nl = memchr(disasm + line, '\n', nbytes - line);
         if (nl)
            count = nl - (disasm + line);

         if (count) {
            util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
         }

         line += count + 1;
      }

      util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
   }

   if (file) {
      fprintf(file, "Shader %s disassembly:\n", name);
      fprintf(file, "%*s", (int)nbytes, disasm);
   }
}

static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   if (binary->type == SI_SHADER_BINARY_RAW) {
      print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
      return;
   }

   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->code_buffer,
                                      .elf_sizes = &binary->code_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   if (nbytes > INT_MAX)
      goto out;

   print_disassembly(disasm, nbytes, name, file, debug);

out:
   ac_rtld_close(&rtld_binary);
}

static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   max_simd_waves = sscreen->info.max_waves_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment +
                     align(shader->info.num_ps_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
      lds_per_wave = (conf->lds_size * lds_increment) /
                     DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
   }
   break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* GFX 10.3 internally:
       * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
       * - aligns LDS to 1024
       *
       * For shader-db stats, set num_vgprs that the hw actually uses.
       */
      unsigned num_vgprs = conf->num_vgprs;
      if (sscreen->info.gfx_level >= GFX10_3) {
         unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
         num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
      } else {
         num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
      }

      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
   }

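   /* Illustrative example: with 1024 physical wave64 VGPRs per SIMD, a shader
    * using 96 VGPRs would be limited to 1024 / 96 = 10 waves by the VGPR
    * clamp above; the LDS clamp below can reduce this further.
    */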
1285 unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
1286 if (lds_per_wave)
1287 max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
1288
1289 shader->info.max_simd_waves = max_simd_waves;
1290 }
1291
si_shader_dump_stats_for_shader_db(struct si_screen * screen,struct si_shader * shader,struct util_debug_callback * debug)1292 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1293 struct util_debug_callback *debug)
1294 {
1295 const struct ac_shader_config *conf = &shader->config;
1296 static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};
1297
1298 if (screen->options.debug_disassembly)
1299 si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
1300 shader->wave_size, debug, "main", NULL);
1301
1302 unsigned num_ls_outputs = 0;
1303 unsigned num_hs_outputs = 0;
1304 unsigned num_es_outputs = 0;
1305 unsigned num_gs_outputs = 0;
1306 unsigned num_vs_outputs = 0;
1307 unsigned num_ps_outputs = 0;
1308
1309 if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
1310 /* This doesn't include pos exports because only param exports are interesting
1311 * for performance and can be optimized.
1312 */
      if (shader->key.ge.as_ls)
         num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
      else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
         num_hs_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
      else if (shader->key.ge.as_es)
         num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
      else if (shader->gs_copy_shader)
         num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
         num_gs_outputs = shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_VERTEX ||
               shader->selector->stage == MESA_SHADER_TESS_EVAL)
         num_vs_outputs = shader->info.nr_param_exports;
      else
         unreachable("invalid shader key");
   } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
                       (shader->selector->info.writes_z ||
                        shader->selector->info.writes_stencil ||
                        shader->ps.writes_samplemask);
   }

   util_debug_message(debug, SHADER_INFO,
                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
                      "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
                      "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
                      "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                      num_ls_outputs, num_hs_outputs,
                      util_last_bit64(shader->selector->info.patch_outputs_written),
                      num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
                      shader->selector->info.base.num_inlinable_uniforms,
                      shader->selector->info.has_divergent_loop,
                      stages[shader->selector->stage], shader->wave_size);
}

bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
                        enum si_shader_dump_type dump_type)
{
   static uint64_t filter[] = {
      [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
      [SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
      [SI_DUMP_NIR] = DBG(NIR),
      [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
      [SI_DUMP_LLVM_IR] = DBG(LLVM),
      [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
      [SI_DUMP_ACO_IR] = DBG(ACO),
      [SI_DUMP_ASM] = DBG(ASM),
      [SI_DUMP_STATS] = DBG(STATS),
      [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
   };
   assert(dump_type < ARRAY_SIZE(filter));

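   /* Dumping requires both the per-stage debug flag and the per-dump-type
    * flag from the filter table above to be set.
    */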
   return sscreen->debug_flags & (1 << stage) &&
          sscreen->debug_flags & filter[dump_type];
}

static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
                                 bool check_debug_option)
{
   const struct ac_shader_config *conf = &shader->config;

   if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      fprintf(file,
              "*** SHADER CONFIG ***\n"
              "SPI_PS_INPUT_ADDR = 0x%04x\n"
              "SPI_PS_INPUT_ENA  = 0x%04x\n",
              conf->spi_ps_input_addr, conf->spi_ps_input_ena);
   }

   fprintf(file,
           "*** SHADER STATS ***\n"
           "SGPRS: %d\n"
           "VGPRS: %d\n"
           "Spilled SGPRs: %d\n"
           "Spilled VGPRs: %d\n"
           "Private memory VGPRs: %d\n"
           "Code Size: %d bytes\n"
           "LDS: %d bytes\n"
           "Scratch: %d bytes per wave\n"
           "Max Waves: %d\n"
           "********************\n\n\n",
           conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
           shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
           conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
           conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
}

const char *si_get_shader_name(const struct si_shader *shader)
{
   switch (shader->selector->stage) {
   case MESA_SHADER_VERTEX:
      if (shader->key.ge.as_es)
         return "Vertex Shader as ES";
      else if (shader->key.ge.as_ls)
         return "Vertex Shader as LS";
      else if (shader->key.ge.as_ngg)
         return "Vertex Shader as ESGS";
      else
         return "Vertex Shader as VS";
   case MESA_SHADER_TESS_CTRL:
      return "Tessellation Control Shader";
   case MESA_SHADER_TESS_EVAL:
      if (shader->key.ge.as_es)
         return "Tessellation Evaluation Shader as ES";
      else if (shader->key.ge.as_ngg)
         return "Tessellation Evaluation Shader as ESGS";
      else
         return "Tessellation Evaluation Shader as VS";
   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         return "GS Copy Shader as VS";
      else
         return "Geometry Shader";
   case MESA_SHADER_FRAGMENT:
      return "Pixel Shader";
   case MESA_SHADER_COMPUTE:
      return "Compute Shader";
   default:
      return "Unknown Shader";
   }
}

void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                    struct util_debug_callback *debug, FILE *file, bool check_debug_option)
{
   gl_shader_stage stage = shader->selector->stage;

   if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
      si_dump_shader_key(shader, file);

   if (!check_debug_option && shader->binary.llvm_ir_string) {
      /* This is only used with ddebug. */
      if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
      }

      fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
      fprintf(file, "%s\n", shader->binary.llvm_ir_string);
   }

   if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_ASM)) {
      fprintf(file, "\n%s:\n", si_get_shader_name(shader));

      if (shader->prolog)
         si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size,
                                    debug, "prolog", file);
      if (shader->previous_stage)
         si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
                                    shader->wave_size, debug, "previous stage", file);
      si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
                                 file);

      if (shader->epilog)
         si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size,
                                    debug, "epilog", file);
      fprintf(file, "\n");

      si_shader_dump_stats(sscreen, shader, file, check_debug_option);
   }
}

static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
{
   fprintf(f, "  mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
   fprintf(f, "  mono.instance_divisor_is_fetched = %u\n",
           key->ge.mono.instance_divisor_is_fetched);
   fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
   fprintf(f, "  mono.vs.fix_fetch = {");
   for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
      union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
      if (i)
         fprintf(f, ", ");
      if (!fix.bits)
         fprintf(f, "0");
      else
         fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
                 fix.u.format);
   }
   fprintf(f, "}\n");
}

static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const union si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->stage;

   fprintf(f, "SHADER KEY\n");
   fprintf(f, "  source_blake3 = {");
   _mesa_blake3_print(f, shader->selector->info.base.source_blake3);
   fprintf(f, "}\n");

   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, f);
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ls = %u\n", key->ge.as_ls);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_TESS_CTRL:
      if (shader->selector->screen->info.gfx_level >= GFX9)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  opt.tes_prim_mode = %u\n", key->ge.opt.tes_prim_mode);
      fprintf(f, "  opt.tes_reads_tess_factors = %u\n", key->ge.opt.tes_reads_tess_factors);
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
      fprintf(f, "  opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         break;

      if (shader->selector->screen->info.gfx_level >= GFX9 &&
          key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      fprintf(f, "  prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
      fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
      fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
      fprintf(f, "  prolog.force_persp_sample_interp = %u\n",
              key->ps.part.prolog.force_persp_sample_interp);
      fprintf(f, "  prolog.force_linear_sample_interp = %u\n",
              key->ps.part.prolog.force_linear_sample_interp);
      fprintf(f, "  prolog.force_persp_center_interp = %u\n",
              key->ps.part.prolog.force_persp_center_interp);
      fprintf(f, "  prolog.force_linear_center_interp = %u\n",
              key->ps.part.prolog.force_linear_center_interp);
      fprintf(f, "  prolog.bc_optimize_for_persp = %u\n",
              key->ps.part.prolog.bc_optimize_for_persp);
      fprintf(f, "  prolog.bc_optimize_for_linear = %u\n",
              key->ps.part.prolog.bc_optimize_for_linear);
      fprintf(f, "  prolog.samplemask_log_ps_iter = %u\n",
              key->ps.part.prolog.samplemask_log_ps_iter);
      fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n",
              key->ps.part.epilog.spi_shader_col_format);
      fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
      fprintf(f, "  epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
      fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
      fprintf(f, "  epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
      fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
      fprintf(f, "  epilog.alpha_to_coverage_via_mrtz = %u\n",
              key->ps.part.epilog.alpha_to_coverage_via_mrtz);
      fprintf(f, "  epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
      fprintf(f, "  epilog.dual_src_blend_swizzle = %u\n",
              key->ps.part.epilog.dual_src_blend_swizzle);
      fprintf(f, "  epilog.rbplus_depth_only_opt = %u\n",
              key->ps.part.epilog.rbplus_depth_only_opt);
      fprintf(f, "  epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
      fprintf(f, "  mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
      fprintf(f, "  mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
      fprintf(f, "  mono.interpolate_at_sample_force_center = %u\n",
              key->ps.mono.interpolate_at_sample_force_center);
      fprintf(f, "  mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
      fprintf(f, "  mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
      fprintf(f, "  mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->ge.as_es && !key->ge.as_ls) {
      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
      fprintf(f, "  opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize);
      fprintf(f, "  opt.kill_layer = 0x%x\n", key->ge.opt.kill_layer);
      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
      fprintf(f, "  opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
      fprintf(f, "  opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout);
      fprintf(f, "  mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout);
   }

   if (stage <= MESA_SHADER_GEOMETRY)
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
   else
      fprintf(f, "  opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);

   if (stage <= MESA_SHADER_GEOMETRY) {
      if (key->ge.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ge.opt.inline_uniforms,
                 key->ge.opt.inlined_uniform_values[0],
                 key->ge.opt.inlined_uniform_values[1],
                 key->ge.opt.inlined_uniform_values[2],
                 key->ge.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   } else {
      if (key->ps.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ps.opt.inline_uniforms,
                 key->ps.opt.inlined_uniform_values[0],
                 key->ps.opt.inlined_uniform_values[1],
                 key->ps.opt.inlined_uniform_values[2],
                 key->ps.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   }
}

/* TODO: convert to nir_shader_instructions_pass */
static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);
   assert(nir->info.stage <= MESA_SHADER_GEOMETRY);

   if (!key->ge.opt.kill_outputs &&
       !key->ge.opt.kill_pointsize &&
       !key->ge.opt.kill_layer &&
       !key->ge.opt.kill_clip_distances &&
       !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         if (nir_slot_is_varying(sem.location) &&
             key->ge.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(sem.location)))
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

         switch (sem.location) {
         case VARYING_SLOT_PSIZ:
            if (key->ge.opt.kill_pointsize)
               progress |= nir_remove_sysval_output(intr);
            break;

         case VARYING_SLOT_CLIP_VERTEX:
            /* TODO: We should only kill specific clip planes as required by kill_clip_distance,
             * not whole gl_ClipVertex. Lower ClipVertex in NIR.
             */
            if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) ==
                SI_USER_CLIP_PLANE_MASK)
               progress |= nir_remove_sysval_output(intr);
            break;

         case VARYING_SLOT_CLIP_DIST0:
         case VARYING_SLOT_CLIP_DIST1:
            if (key->ge.opt.kill_clip_distances) {
               assert(nir_intrinsic_src_type(intr) == nir_type_float32);
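               /* Each CLIP_DISTn slot holds 4 distances, so the flat clip
                * distance index is slot * 4 + component.
                */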
               unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
                                nir_intrinsic_component(intr);

               if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index))
                  progress |= nir_remove_sysval_output(intr);
            }
            break;

         case VARYING_SLOT_LAYER:
            /* LAYER is never passed to FS. Instead, we load it there as a system value. */
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

            if (key->ge.opt.kill_layer)
               progress |= nir_remove_sysval_output(intr);
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_control_flow);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

/* Remove PS output components from NIR if they are disabled by spi_shader_col_format. */
static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   /* No indirect indexing allowed. */
   ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
   assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

   unsigned location = nir_intrinsic_io_semantics(intr).location;
   const union si_shader_key *key = _key;

   switch (location) {
   case FRAG_RESULT_DEPTH:
   case FRAG_RESULT_STENCIL:
      return false;

   case FRAG_RESULT_SAMPLE_MASK:
      if (key->ps.part.epilog.kill_samplemask) {
         nir_instr_remove(instr);
         return true;
      }
      return false;
   }

   /* Color outputs. */
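   /* cb_shader_mask packs one 4-bit RGBA write mask per color buffer, derived
    * from spi_shader_col_format (e.g. 0x1 for a buffer exporting only R, 0x0
    * for a disabled buffer), so (cb_shader_mask >> (index * 4)) & 0xf below
    * yields the live components of one output.
    */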
   unsigned comp_mask = BITFIELD_MASK(intr->num_components);
   assert(nir_intrinsic_component(intr) == 0);
   unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format);

   /* Preserve alpha if alpha testing or alpha-to-coverage via MRTZ is enabled. */
   if (key->ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS ||
       key->ps.part.epilog.alpha_to_coverage_via_mrtz)
      cb_shader_mask |= 1 << 3;

   /* If COLOR is broadcast to multiple color buffers, combine their masks. */
   if (location == FRAG_RESULT_COLOR) {
      for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++)
         cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf;
   }

   unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0;
   unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf;

   if ((output_mask & comp_mask) == comp_mask)
      return false;

   if (!(output_mask & comp_mask)) {
      nir_instr_remove(instr);
      return true;
   }

   /* Fill disabled components with undef. */
   b->cursor = nir_before_instr(instr);
   nir_def *new_value = intr->src[0].ssa;
   nir_def *undef = nir_undef(b, 1, new_value->bit_size);

   unsigned kill_mask = ~output_mask & comp_mask;
   u_foreach_bit(i, kill_mask) {
      new_value = nir_vector_insert_imm(b, new_value, undef, i);
   }

   nir_src_rewrite(&intr->src[0], new_value);
   return true;
}

static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key)
{
   assert(nir->info.stage == MESA_SHADER_FRAGMENT);
   return nir_shader_instructions_pass(nir, kill_ps_outputs_cb,
                                       nir_metadata_control_flow, (void*)key);
}

static bool clamp_vertex_color_instr(nir_builder *b,
                                     nir_intrinsic_instr *intrin, void *state)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   unsigned location = nir_intrinsic_io_semantics(intrin).location;
   if (location != VARYING_SLOT_COL0 && location != VARYING_SLOT_COL1 &&
       location != VARYING_SLOT_BFC0 && location != VARYING_SLOT_BFC1)
      return false;

   /* No indirect output. */
   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
   /* Only scalar output. */
   assert(intrin->src[0].ssa->num_components == 1);

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *color = intrin->src[0].ssa;
   nir_def *clamp = nir_load_clamp_vertex_color_amd(b);
   nir_def *new_color = nir_bcsel(b, clamp, nir_fsat(b, color), color);
   nir_src_rewrite(&intrin->src[0], new_color);

   return true;
}

static bool si_nir_clamp_vertex_color(nir_shader *nir)
{
   uint64_t mask = VARYING_BIT_COL0 | VARYING_BIT_COL1 | VARYING_BIT_BFC0 | VARYING_BIT_BFC1;
   if (!(nir->info.outputs_written & mask))
      return false;

   return nir_shader_intrinsics_pass(nir, clamp_vertex_color_instr,
                                     nir_metadata_control_flow,
                                     NULL);
}

static unsigned si_map_io_driver_location(unsigned semantic)
{
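   /* Patch varyings (including tess levels) use a separate per-patch index
    * space; all other varyings use the per-vertex unique slot index.
    */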
   if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
       semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
       semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
      return ac_shader_io_get_unique_index_patch(semantic);

   return si_shader_io_get_unique_index(semantic);
}

static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
                               uint64_t tcs_vgpr_only_inputs)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_selector *next_sel = shader->next_shader ? shader->next_shader->selector : sel;
   const union si_shader_key *key = &shader->key;
   const bool is_gfx9_mono_tcs = shader->is_monolithic && next_sel->stage == MESA_SHADER_TESS_CTRL &&
                                 sel->screen->info.gfx_level >= GFX9;

   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (key->ge.as_ls) {
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
                    is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                    key->ge.opt.same_patch_vertices,
                    is_gfx9_mono_tcs ? next_sel->info.base.inputs_read : ~0ull,
                    tcs_vgpr_only_inputs);
         return true;
      } else if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
                 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                 key->ge.opt.same_patch_vertices, sel->info.tcs_vgpr_only_inputs);

      /* Used by hs_emit_write_tess_factors() when the shader is monolithic. */
      nir->info.tess._primitive_mode = key->ge.opt.tes_prim_mode;

      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level,
                 ~0ULL, ~0U, /* no TES inputs filter */
                 shader->wave_size,
                 sel->info.tessfactors_are_def_in_all_invocs);
      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);

      if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
      }

      return true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
      return true;
   }

   return false;
}

static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;
   assert(key->ge.as_ngg);

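   /* Clip distances disabled by the shader key are dropped from the mask;
    * cull distances are always kept because they always affect culling.
    */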
   uint8_t clip_cull_dist_mask =
      (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
      sel->info.culldist_mask;

   ac_nir_lower_ngg_options options = {
      .family = sel->screen->info.family,
      .gfx_level = sel->screen->info.gfx_level,
      .max_workgroup_size = si_get_max_workgroup_size(shader),
      .wave_size = shader->wave_size,
      .can_cull = !!key->ge.opt.ngg_culling,
      .disable_streamout = !si_shader_uses_streamout(shader),
      .vs_output_param_offset = shader->info.vs_output_param_offset,
      .has_param_exports = shader->info.nr_param_exports,
      .clip_cull_dist_mask = clip_cull_dist_mask,
      .kill_pointsize = key->ge.opt.kill_pointsize,
      .kill_layer = key->ge.opt.kill_layer,
      .force_vrs = sel->screen->options.vrs2x2,
      .use_gfx12_xfb_intrinsic = true,
   };

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
      /* Per-instance inputs, used to remove the instance ID load after culling. */
      unsigned instance_rate_inputs = 0;

      if (nir->info.stage == MESA_SHADER_VERTEX) {
         instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
                                key->ge.mono.instance_divisor_is_fetched;

         /* Manually mark the instance ID used, so the shader can repack it. */
         if (instance_rate_inputs)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
      } else {
         /* Manually mark the primitive ID used, so the shader can repack it. */
         if (key->ge.mono.u.vs_export_prim_id)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
      }

      unsigned clip_plane_enable =
         SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
      unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(shader);

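      /* A count of 0 means the vertex count per primitive isn't known at
       * compile time, so fall back to 3 (triangles).
       */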
      options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
      options.early_prim_export = gfx10_ngg_export_prim_early(shader);
      options.passthrough = gfx10_is_ngg_passthrough(shader);
      options.use_edgeflags = gfx10_edgeflags_have_effect(shader);
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11 && !sel->info.base.vs.blit_sgprs_amd;
      options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
      options.instance_rate_inputs = instance_rate_inputs;
      options.user_clip_plane_enable_mask = clip_plane_enable;

      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
   } else {
      assert(nir->info.stage == MESA_SHADER_GEOMETRY);

      options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11;
      options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
      options.has_gs_primitives_query = true;

      /* For monolithic ES/GS, add a vscnt wait when the GS exports pos0. */
      if (key->ge.part.gs.es)
         nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;

      NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
   }

   /* This may generate some vector output stores. */
   NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}

struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
{
   struct pipe_screen *screen = &sel->screen->b;
   const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
                                                      pipe_shader_type_from_mesa(sel->stage));

   struct blob_reader blob_reader;
   blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
   return nir_deserialize(NULL, options, &blob_reader);
}

static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_binary_info *info = &shader->info;

   uint64_t outputs_written = 0;
   uint32_t outputs_written_16bit = 0;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         if (sem.location >= VARYING_SLOT_VAR0_16BIT)
            outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
         else
            outputs_written |= BITFIELD64_BIT(sem.location);

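         /* Only stream-0 outputs get param exports; gs_streams packs 2 bits
          * per component, so testing bits 0-1 checks component 0's stream.
          */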
         /* Assign the param index if it's unassigned. */
         if (nir_slot_is_varying(sem.location) && !sem.no_varying &&
             (sem.gs_streams & 0x3) == 0 &&
             info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
            /* The semantic and the base should be the same as in si_shader_info. */
            assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
            /* It must not be remapped (duplicated). */
            assert(slot_remap[sem.location] == -1);

            info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
         }
      }
   }

   /* Duplicated outputs are redirected here. */
   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (slot_remap[i] >= 0)
         info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]];
   }

   if (shader->key.ge.mono.u.vs_export_prim_id) {
      info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
   }

   /* Update the outputs-written info; some outputs may have been removed earlier. */
   nir->info.outputs_written = outputs_written;
   nir->info.outputs_written_16bit = outputs_written_16bit;
}

static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader)
{
   /* Initialize this first. */
   shader->info.nr_param_exports = 0;

   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
    * mapped to multiple fragment shader inputs.
    */
   int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
   memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);

   /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
   /* TODO: This doesn't affect GS. */
   NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
              shader->info.vs_output_param_offset);

   /* Assign the non-constant outputs. */
   /* TODO: Use this for the GS copy shader too. */
   si_nir_assign_param_offsets(nir, shader, slot_remap);
}

static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
                                      const union si_shader_key *key)
{
   const struct si_shader_info *info = &sel->info;

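   /* Overview of the count: one export for gl_Position itself, one shared
    * "misc" export if any of point size, edge flags, layer, viewport index
    * or VRS is written, and one per used group of 4 clip/cull distances.
    */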
   /* Must have a position export. */
   unsigned nr_pos_exports = 1;

   if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
       (info->writes_edgeflag && !key->ge.as_ngg) ||
       (info->writes_layer && !key->ge.opt.kill_layer) ||
       info->writes_viewport_index || sel->screen->options.vrs2x2) {
      nr_pos_exports++;
   }

   unsigned clipdist_mask =
      (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;

   for (int i = 0; i < 2; i++) {
      if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
         nr_pos_exports++;
   }

   return nr_pos_exports;
}

static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state)
{
   nir_def **colors = (nir_def **)state;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   if (intrin->intrinsic != nir_intrinsic_load_color0 &&
       intrin->intrinsic != nir_intrinsic_load_color1)
      return false;

   unsigned index = intrin->intrinsic == nir_intrinsic_load_color0 ? 0 : 1;
   assert(colors[index]);

   nir_def_replace(&intrin->def, colors[index]);
   return true;
}

static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
                                        const struct si_shader_info *info)
{
   bool progress = false;
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Build ready-to-use colors at the beginning of the shader. */
   nir_def *colors[2] = {0};
   for (int i = 0; i < 2; i++) {
      if (!(info->colors_read & (0xf << (i * 4))))
         continue;

      unsigned color_base = info->color_attr_index[i];
      /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
       * otherwise it's at offset "num_inputs".
       */
      unsigned back_color_base = info->num_inputs;
      if (i == 1 && (info->colors_read & 0xf))
         back_color_base += 1;

      enum glsl_interp_mode interp_mode = info->color_interpolate[i];
      if (interp_mode == INTERP_MODE_COLOR) {
         interp_mode = key->ps.part.prolog.flatshade_colors ?
            INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
      }

      nir_def *back_color = NULL;
      if (interp_mode == INTERP_MODE_FLAT) {
         colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                    .base = color_base,
                                    .io_semantics.location = VARYING_SLOT_COL0 + i,
                                    .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                        .base = back_color_base,
                                        .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                        .io_semantics.num_slots = 1);
         }
      } else {
         nir_intrinsic_op op = 0;
         switch (info->color_interpolate_loc[i]) {
         case TGSI_INTERPOLATE_LOC_CENTER:
            op = nir_intrinsic_load_barycentric_pixel;
            break;
         case TGSI_INTERPOLATE_LOC_CENTROID:
            op = nir_intrinsic_load_barycentric_centroid;
            break;
         case TGSI_INTERPOLATE_LOC_SAMPLE:
            op = nir_intrinsic_load_barycentric_sample;
            break;
         default:
            unreachable("invalid color interpolate location");
            break;
         }

         nir_def *barycentric = nir_load_barycentric(b, op, interp_mode);

         colors[i] =
            nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                        .base = color_base,
                                        .io_semantics.location = VARYING_SLOT_COL0 + i,
                                        .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            back_color =
               nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                           .base = back_color_base,
                                           .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                           .io_semantics.num_slots = 1);
         }
      }

      if (back_color) {
         nir_def *is_front_face = nir_load_front_face(b, 1);
         colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
      }

      progress = true;
   }

   /* Lower nir_load_color0/1 to use the color values built above. */
   return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
                                       nir_metadata_control_flow,
                                       colors) || progress;
}

static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Load the buffer descriptor. */
   nir_def *desc =
      si_nir_load_internal_binding(b, args, SI_PS_CONST_POLY_STIPPLE, 4);

   /* Use the fixed-point gl_FragCoord input.
    * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
    * per coordinate to get the repeating effect.
    */
   nir_def *pos_x = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 5);
   nir_def *pos_y = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 5);

   nir_def *zero = nir_imm_int(b, 0);
   /* The stipple pattern is 32x32, each row has 32 bits. */
   nir_def *offset = nir_ishl_imm(b, pos_y, 2);
   nir_def *row = nir_load_buffer_amd(b, 1, 32, desc, offset, zero, zero);
   nir_def *bit = nir_ubfe(b, row, pos_x, nir_imm_int(b, 1));

2235
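   /* Example: a fragment at (35, 2) yields pos_x = 3 and pos_y = 2, so the
    * row is loaded from byte offset 8 and bit 3 of it decides whether the
    * fragment survives the discard below.
    */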
   nir_def *pass = nir_i2b(b, bit);
   nir_discard_if(b, nir_inot(b, pass));
}

bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
{
   return shader->info.stage == MESA_SHADER_COMPUTE && shader->info.shared_size > 0 &&
          sscreen->options.clear_lds;
}

struct nir_shader *si_get_nir_shader(struct si_shader *shader,
                                     struct si_shader_args *args,
                                     bool *free_nir,
                                     uint64_t tcs_vgpr_only_inputs,
                                     ac_nir_gs_output_info *output_info)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   nir_shader *nir;
   *free_nir = false;

   if (sel->nir) {
      nir = sel->nir;
   } else if (sel->nir_binary) {
      nir = si_deserialize_shader(sel);
      *free_nir = true;
   } else {
      return NULL;
   }

   bool progress = false;
   bool late_opts = false;

   const char *original_name = NULL;
   if (unlikely(should_print_nir(nir))) {
      /* Modify the shader's name so that each variant gets its own name. */
      original_name = ralloc_strdup(nir, nir->info.name);
      ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key)));

      /* Dummy pass to get the starting point. */
      printf("nir_dummy_pass\n");
      nir_print_shader(nir, stdout);
   }

   /* Kill outputs according to the shader key. */
   if (sel->stage <= MESA_SHADER_GEOMETRY)
      NIR_PASS(progress, nir, si_nir_kill_outputs, key);

   NIR_PASS(progress, nir, ac_nir_lower_tex,
            &(ac_nir_lower_tex_options){
               .gfx_level = sel->screen->info.gfx_level,
               .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
            });

   if (nir->info.uses_resource_info_query)
      NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);

   bool inline_uniforms = false;
   uint32_t *inlined_uniform_values;
   si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type,
                               &inline_uniforms, &inlined_uniform_values);

   if (inline_uniforms) {
      assert(*free_nir);

      /* Most places use shader information from the default variant, not
       * the optimized variant. These are the things that the driver looks at
       * in optimized variants and the list of things that we need to do.
       *
       * The driver takes into account these things if they suddenly disappear
       * from the shader code:
       * - Register usage and code size decrease (obvious)
       * - Eliminated PS system values are disabled by LLVM
       *   (FragCoord, FrontFace, barycentrics)
       * - VS/TES/GS param exports are eliminated if they are undef.
       *   The param space for eliminated outputs is also not allocated.
       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
       * - TCS output stores are eliminated
       * - Eliminated PS inputs are removed from PS.NUM_INTERP.
       *
       * TODO: These are things the driver ignores in the final shader code
       * and relies on the default shader info.
       * - System values in VS, TCS, TES, GS are not eliminated
       * - uses_discard - if it changed to false
       * - writes_memory - if it changed to false
       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
       *   eliminated
       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
       *   GS outputs are eliminated except for the temporary LDS.
       *   Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
       *   on current states, so we don't care about the shader code.
       *
       * TODO: Merged shaders don't inline uniforms for the first stage.
       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
       * (key == NULL for the first stage here)
       *
       * TODO: Compute shaders don't support inlinable uniforms, because they
       * don't have shader variants.
       *
       * TODO: The driver uses a linear search to find a shader variant. This
       * can be really slow if we get too many variants due to uniform inlining.
       */
      NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
                 inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
      progress = true;
   }

   if (sel->stage == MESA_SHADER_FRAGMENT) {
      /* This uses the epilog key, so only monolithic shaders can call this. */
      if (shader->is_monolithic)
         NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key);

      if (key->ps.mono.poly_line_smoothing)
         NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);

      if (key->ps.mono.point_smoothing)
         NIR_PASS(progress, nir, nir_lower_point_smooth);
   }

   /* This must be before si_nir_lower_resource. */
   if (!sel->screen->info.has_image_opcodes)
      NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);

   /* For ACO, lower non-uniform texture access in NIR. The LLVM backend
    * handles it with a waterfall loop instead.
    */
   if (sel->info.base.use_aco_amd && sel->info.has_non_uniform_tex_access) {
      nir_lower_non_uniform_access_options options = {
         .types = nir_lower_non_uniform_texture_access,
      };
      NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
   }

   bool is_last_vgt_stage =
      (sel->stage == MESA_SHADER_VERTEX ||
       sel->stage == MESA_SHADER_TESS_EVAL ||
       (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
      !shader->key.ge.as_ls && !shader->key.ge.as_es;

   /* Legacy GS is not the last VGT stage because it has a GS copy shader. */
   bool is_legacy_gs = sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg;

   if (is_last_vgt_stage || is_legacy_gs)
      NIR_PASS(progress, nir, si_nir_clamp_vertex_color);

   if (progress) {
      si_nir_opts(sel->screen, nir, true);
      late_opts = true;
      progress = false;
   }

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    *
    * The pass crashes if there are dead temps of lowered IO interface types, so remove
    * them first.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);

   /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    */
   progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);

   if (sel->stage == MESA_SHADER_VERTEX)
      NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args);

   progress |= si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs);

   if (is_last_vgt_stage) {
      /* Assign param export indices. */
      si_assign_param_offsets(nir, shader);

      /* Compute the number of position exports. */
      shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);

      if (key->ge.as_ngg) {
         /* Lower the last VGT NGG shader stage. */
         si_lower_ngg(shader, nir);
      } else if (sel->stage == MESA_SHADER_VERTEX || sel->stage == MESA_SHADER_TESS_EVAL) {
         /* Lower the last VGT non-NGG VS/TES shader stage. */
         unsigned clip_cull_mask =
            (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
            sel->info.culldist_mask;

         NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
                    sel->screen->info.gfx_level,
                    clip_cull_mask,
                    shader->info.vs_output_param_offset,
                    shader->info.nr_param_exports,
                    shader->key.ge.mono.u.vs_export_prim_id,
                    !si_shader_uses_streamout(shader),
                    key->ge.opt.kill_pointsize,
                    key->ge.opt.kill_layer,
                    sel->screen->options.vrs2x2);
      }
      progress = true;
   } else if (is_legacy_gs) {
      NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info);
      progress = true;
   } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
      /* Uniform inlining can eliminate PS inputs, and colormask can remove PS outputs,
       * which can also cause the elimination of PS inputs. Remove holes after removed PS inputs
       * by renumbering them. This can only happen with monolithic PS. Colors are unaffected
       * because they are still represented by nir_intrinsic_load_color0/1.
       */
      NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in);

      /* Two-side color selection and interpolation: Get the latest shader info because
       * uniform inlining and colormask can fully eliminate color inputs.
       */
      struct si_shader_info info;
      si_nir_scan_shader(sel->screen, nir, &info);

      if (info.colors_read)
         NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info);

      /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped,
       * which can only occur with monolithic PS.
       */
      shader->info.num_ps_inputs = info.num_inputs;
      shader->info.ps_colors_read = info.colors_read;

      ac_nir_lower_ps_options options = {
         .gfx_level = sel->screen->info.gfx_level,
         .family = sel->screen->info.family,
         .use_aco = sel->info.base.use_aco_amd,
         .uses_discard = si_shader_uses_discard(shader),
         .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
         .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
         .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
         .color_is_int8 = key->ps.part.epilog.color_is_int8,
         .color_is_int10 = key->ps.part.epilog.color_is_int10,
         .clamp_color = key->ps.part.epilog.clamp_color,
         .alpha_to_one = key->ps.part.epilog.alpha_to_one,
         .alpha_func = key->ps.part.epilog.alpha_func,
         .broadcast_last_cbuf = key->ps.part.epilog.last_cbuf,
         .kill_samplemask = key->ps.part.epilog.kill_samplemask,

         .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
         .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
         .force_persp_sample_interp = key->ps.part.prolog.force_persp_sample_interp,
         .force_linear_sample_interp = key->ps.part.prolog.force_linear_sample_interp,
         .force_persp_center_interp = key->ps.part.prolog.force_persp_center_interp,
         .force_linear_center_interp = key->ps.part.prolog.force_linear_center_interp,
         .ps_iter_samples = 1 << key->ps.part.prolog.samplemask_log_ps_iter,
      };

      NIR_PASS_V(nir, ac_nir_lower_ps, &options);

      if (key->ps.part.prolog.poly_stipple)
         NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args);

      progress = true;
   }

   assert(shader->wave_size == 32 || shader->wave_size == 64);

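   /* Ballots are a single wave-sized scalar, so ballot_bit_size matches the
    * wave size asserted above.
    */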
   NIR_PASS(progress, nir, nir_lower_subgroups,
            &(struct nir_lower_subgroups_options) {
               .subgroup_size = shader->wave_size,
               .ballot_bit_size = shader->wave_size,
               .ballot_components = 1,
               .lower_to_scalar = true,
               .lower_subgroup_masks = true,
               .lower_relative_shuffle = true,
               .lower_rotate_to_shuffle = !sel->info.base.use_aco_amd,
               .lower_shuffle_to_32bit = true,
               .lower_vote_eq = true,
               .lower_vote_bool_eq = true,
               .lower_quad_broadcast_dynamic = true,
               .lower_quad_broadcast_dynamic_to_const = sel->screen->info.gfx_level <= GFX7,
               .lower_shuffle_to_swizzle_amd = true,
               .lower_ballot_bit_count_to_mbcnt_amd = true,
               .lower_inverse_ballot = !sel->info.base.use_aco_amd && LLVM_VERSION_MAJOR < 17,
               .lower_boolean_reduce = sel->info.base.use_aco_amd,
               .lower_boolean_shuffle = true,
            });

   NIR_PASS(progress, nir, nir_lower_pack);
   NIR_PASS(progress, nir, nir_lower_int64);
   NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
   NIR_PASS(progress, nir, nir_lower_idiv,
            &(nir_lower_idiv_options){
               .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
            });

   if (si_should_clear_lds(sel->screen, nir)) {
      const unsigned chunk_size = 16; /* max single store size */
      const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
      NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
   }

   NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
            si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
            &args->ac);
   NIR_PASS(progress, nir, si_nir_lower_abi, shader, args);

   if (progress) {
      si_nir_opts(sel->screen, nir, false);
      progress = false;
      late_opts = true;
   }

   NIR_PASS(progress, nir, nir_opt_load_store_vectorize,
            &(nir_load_store_vectorize_options){
               .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_shared | nir_var_mem_global |
                        nir_var_shader_temp,
               .callback = ac_nir_mem_vectorize_callback,
               .cb_data = &sel->screen->info.gfx_level,
               /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
                * the final offset is not.
                */
               .has_shared2_amd = sel->screen->info.gfx_level >= GFX7,
            });
   NIR_PASS(progress, nir, nir_opt_shrink_stores, false);
   NIR_PASS(progress, nir, ac_nir_lower_global_access);
   /* This must be after vectorization because it causes bindings_different_restrict() to fail. */
   NIR_PASS(progress, nir, si_nir_lower_resource, shader, args);

   if (progress) {
      si_nir_opts(sel->screen, nir, false);
      progress = false;
      late_opts = true;
   }

   static const nir_opt_offsets_options offset_options = {
      .uniform_max = 0,
      .buffer_max = ~0,
      .shared_max = ~0,
   };
   NIR_PASS_V(nir, nir_opt_offsets, &offset_options);

   if (late_opts)
      si_nir_late_opts(nir);

   /* ACO only accepts scalar constants, so this must be done after
    * si_nir_late_opts(), which may generate vector constants.
    */
   if (sel->info.base.use_aco_amd)
      NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */
   NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);

   if (unlikely(original_name)) {
      ralloc_free((void*)nir->info.name);
      nir->info.name = original_name;
   }

   return nir;
}

void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_info info;
   si_nir_scan_shader(shader->selector->screen, nir, &info);

   shader->info.uses_vmem_load_other |= info.uses_vmem_load_other;
   shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      /* Since uniform inlining can remove PS inputs, set the latest info about PS inputs here. */
      shader->info.num_ps_inputs = info.num_inputs;
      shader->info.ps_colors_read = info.colors_read;

      /* A non-monolithic PS doesn't know if back colors are enabled, so copy 2 more. */
      unsigned max_interp = MIN2(info.num_inputs + 2, SI_NUM_INTERP);
      memcpy(shader->info.ps_inputs, info.input, max_interp * sizeof(info.input[0]));
   }
}

/* Generate code for the hardware VS shader stage to go with a geometry shader */
static struct si_shader *
si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
                               struct si_shader *gs_shader,
                               nir_shader *gs_nir,
                               struct util_debug_callback *debug,
                               ac_nir_gs_output_info *output_info)
{
   struct si_shader *shader;
   struct si_shader_selector *gs_selector = gs_shader->selector;
   struct si_shader_info *gsinfo = &gs_selector->info;
   union si_shader_key *gskey = &gs_shader->key;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;
   shader->wave_size = si_determine_wave_size(sscreen, shader);

   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
      unsigned semantic = gsinfo->output_semantic[i];

      /* Skip if no channel writes to stream 0. */
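      /* output_streams packs 2 bits per component, so a non-zero field in
       * every component means nothing is written to stream 0.
       */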
2646 if (!nir_slot_is_varying(semantic) ||
2647 (gsinfo->output_streams[i] & 0x03 &&
2648 gsinfo->output_streams[i] & 0x0c &&
2649 gsinfo->output_streams[i] & 0x30 &&
2650 gsinfo->output_streams[i] & 0xc0))
2651 continue;
2652
2653 shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
2654 }
2655
2656 shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);
2657
2658 unsigned clip_cull_mask =
2659 (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;
2660
2661 nir_shader *nir =
2662 ac_nir_create_gs_copy_shader(gs_nir,
2663 sscreen->info.gfx_level,
2664 clip_cull_mask,
2665 shader->info.vs_output_param_offset,
2666 shader->info.nr_param_exports,
2667 !si_shader_uses_streamout(gs_shader),
2668 gskey->ge.opt.kill_pointsize,
2669 gskey->ge.opt.kill_layer,
2670 sscreen->options.vrs2x2,
2671 output_info);
2672
2673 struct si_shader_args args;
2674 si_init_shader_args(shader, &args);
2675
2676 NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level, AC_HW_VERTEX_SHADER, &args.ac);
2677 NIR_PASS_V(nir, si_nir_lower_abi, shader, &args);
2678
2679 si_nir_opts(gs_selector->screen, nir, false);
2680
2681 /* aco only accept scalar const */
2682 if (gsinfo->base.use_aco_amd)
2683 NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
2684
2685 if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
2686 fprintf(stderr, "GS Copy Shader:\n");
2687 nir_print_shader(nir, stderr);
2688 }
2689
2690 bool ok =
2691 #if AMD_LLVM_AVAILABLE
2692 !gs_selector->info.base.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader,
2693 &args, debug, nir) :
2694 #endif
2695 si_aco_compile_shader(shader, &args, nir, debug);
2696
2697 if (ok) {
2698 assert(!shader->config.scratch_bytes_per_wave);
2699 ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
2700 si_shader_dump(sscreen, shader, debug, stderr, true);
2701 }
2702 ralloc_free(nir);
2703
2704 if (!ok) {
2705 FREE(shader);
2706 shader = NULL;
2707 } else {
2708 si_fix_resource_usage(sscreen, shader);
2709 }
2710 return shader;
2711 }
2712
2713 struct si_gs_output_info {
2714 uint8_t streams[64];
2715 uint8_t streams_16bit_lo[16];
2716 uint8_t streams_16bit_hi[16];
2717
2718 uint8_t usage_mask[64];
2719 uint8_t usage_mask_16bit_lo[16];
2720 uint8_t usage_mask_16bit_hi[16];
2721
2722 ac_nir_gs_output_info info;
2723 };
2724
2725 static void
si_init_gs_output_info(struct si_shader_info * info,struct si_gs_output_info * out_info)2726 si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info)
2727 {
2728 for (int i = 0; i < info->num_outputs; i++) {
2729 unsigned slot = info->output_semantic[i];
2730 if (slot < VARYING_SLOT_VAR0_16BIT) {
2731 out_info->streams[slot] = info->output_streams[i];
2732 out_info->usage_mask[slot] = info->output_usagemask[i];
2733 } else {
2734 unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
2735 /* TODO: 16bit need separated fields for lo/hi part. */
2736 out_info->streams_16bit_lo[index] = info->output_streams[i];
2737 out_info->streams_16bit_hi[index] = info->output_streams[i];
2738 out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i];
2739 out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i];
2740 }
2741 }
2742
2743 ac_nir_gs_output_info *ac_info = &out_info->info;
2744
2745 ac_info->streams = out_info->streams;
2746 ac_info->streams_16bit_lo = out_info->streams_16bit_lo;
2747 ac_info->streams_16bit_hi = out_info->streams_16bit_hi;
2748
2749 ac_info->usage_mask = out_info->usage_mask;
2750 ac_info->usage_mask_16bit_lo = out_info->usage_mask_16bit_lo;
2751 ac_info->usage_mask_16bit_hi = out_info->usage_mask_16bit_hi;
2752
2753 /* TODO: construct 16bit slot per component store type. */
2754 ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
2755 }

static void si_fixup_spi_ps_input_config(struct si_shader *shader)
{
   const union si_shader_key *key = &shader->key;

   /* Enable POS_FIXED_PT if polygon stippling is enabled. */
   if (key->ps.part.prolog.poly_stipple)
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);

   /* Set up the enable bits for per-sample shading if needed. */
   if (key->ps.part.prolog.force_persp_sample_interp &&
       (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
   }
   if (key->ps.part.prolog.force_linear_sample_interp &&
       (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
   }
   if (key->ps.part.prolog.force_persp_center_interp &&
       (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
   }
   if (key->ps.part.prolog.force_linear_center_interp &&
       (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
   }

   /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
   if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
       !(shader->config.spi_ps_input_ena & 0xf)) {
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
   }

   /* At least one pair of interpolation weights must be enabled. */
   if (!(shader->config.spi_ps_input_ena & 0x7f))
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);

   /* Samplemask fixup requires the sample ID. */
   if (key->ps.part.prolog.samplemask_log_ps_iter)
      shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
}
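
/* Worked example (illustrative): if the main part enabled only PERSP_CENTER
 * (spi_ps_input_ena = 0x2) and the prolog key sets force_persp_sample_interp,
 * the fixup above clears PERSP_CENTER/PERSP_CENTROID and sets PERSP_SAMPLE,
 * leaving spi_ps_input_ena = 0x1.
 */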

static void
si_set_spi_ps_input_config(struct si_shader *shader)
{
   const struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;
   const union si_shader_key *key = &shader->key;

   /* TODO: This should be determined from the final NIR instead of the input NIR,
    * otherwise LLVM will have a performance advantage here because it determines
    * VGPR inputs for each shader variant after LLVM optimizations.
    */
   shader->config.spi_ps_input_ena =
      S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) |
      S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) |
      S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample) |
      S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) |
      S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) |
      S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) |
      S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) |
      S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) |
      S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id);

   uint8_t mask = info->reads_frag_coord_mask | info->reads_sample_pos_mask;
   u_foreach_bit(i, mask) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
   }

   if (key->ps.part.prolog.color_two_side)
      shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);

   /* INTERP_MODE_COLOR is the same as SMOOTH if flat shading is disabled. */
   if (info->uses_interp_color && !key->ps.part.prolog.flatshade_colors) {
      shader->config.spi_ps_input_ena |=
         S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample_color) |
         S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center_color) |
         S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid_color);
   }

   /* nir_lower_poly_line_smooth uses nir_load_sample_mask_in. */
   if (key->ps.mono.poly_line_smoothing)
      shader->config.spi_ps_input_ena |= S_0286CC_SAMPLE_COVERAGE_ENA(1);

   /* nir_lower_point_smooth uses nir_load_point_coord_maybe_flipped, which is lowered
    * to nir_load_barycentric_pixel and nir_load_interpolated_input.
    */
   if (key->ps.mono.point_smoothing)
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);

   /* See fetch_framebuffer() for the args used when FBFETCH output is read. */
   if (info->base.fs.uses_fbfetch_output) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);

      if (key->ps.mono.fbfetch_layered || key->ps.mono.fbfetch_msaa)
         shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
   }

   if (shader->is_monolithic) {
      si_fixup_spi_ps_input_config(shader);
      shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena;
   } else {
      /* In multi-part mode, si_fixup_spi_ps_input_config() is called when the
       * shader parts are combined in si_shader_select_ps_parts().
       *
       * Reserve register locations for VGPR inputs the PS prolog may need.
       */
      shader->config.spi_ps_input_addr =
         shader->config.spi_ps_input_ena |
         SI_SPI_PS_INPUT_ADDR_FOR_PROLOG;
   }
}
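
/* Example (illustrative): if the main part only needs PERSP_CENTER
 * (spi_ps_input_ena = 0x2) but a prolog variant may later force per-sample
 * shading, ORing in SI_SPI_PS_INPUT_ADDR_FOR_PROLOG keeps the corresponding
 * VGPRs addressable in spi_ps_input_addr even though they aren't enabled yet.
 */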

static void
debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
                     const char *fmt, va_list args)
{
   vfprintf(stderr, fmt, args);
   fprintf(stderr, "\n");
}

bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug)
{
   bool ret = true;
   struct si_shader_selector *sel = shader->selector;

   /* ACO needs spi_ps_input in advance to initialize args; it's also used by the compiler. */
   if (sel->stage == MESA_SHADER_FRAGMENT && sel->info.base.use_aco_amd)
      si_set_spi_ps_input_config(shader);

   /* This info is only needed for legacy (non-NGG) GS. */
   struct si_gs_output_info legacy_gs_output_info;
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      memset(&legacy_gs_output_info, 0, sizeof(legacy_gs_output_info));
      si_init_gs_output_info(&sel->info, &legacy_gs_output_info);
   }

   struct si_shader_args args;
   si_init_shader_args(shader, &args);

   bool free_nir;
   struct nir_shader *nir =
      si_get_nir_shader(shader, &args, &free_nir, 0, &legacy_gs_output_info.info);

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_NIR)) {
      nir_print_shader(nir, stderr);

      if (nir->xfb_info)
         nir_print_xfb_info(nir->xfb_info, stderr);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   si_update_shader_binary_info(shader, nir);

   /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */
   shader->info.uses_instanceid |= sel->info.uses_instanceid;
   shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);

   /* Set the FP ALU behavior.
    * By default, we disable denormals for FP32 and enable them for FP16 and FP64
    * for performance and correctness reasons. FP32 denormals can't be enabled because
    * they break output modifiers and v_mad_f32, and they are very slow on GFX6-7.
    *
    * float_controls_execution_mode defines the set of valid behaviors. Contradictory
    * flags can be set simultaneously, which nominally lets us choose; in practice some
    * choices cause GLCTS failures, so the selection below is constrained.
    */
   unsigned float_mode = V_00B028_FP_16_64_DENORMS;

   if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
       nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
                                                    FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
                                                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                                                    FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
                                                  FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
      float_mode &= ~V_00B028_FP_16_64_DENORMS;
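
   /* Example (illustrative): a shader declaring RTZ rounding for FP32 and
    * flush-to-zero for FP16/FP64 denorms ends up with
    * float_mode = V_00B028_FP_32_ROUND_TOWARDS_ZERO; the FP_16_64_DENORMS
    * bit set as the default above is cleared again.
    */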

   ret =
#if AMD_LLVM_AVAILABLE
      !sel->info.base.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &args,
                                                           debug, nir) :
#endif
      si_aco_compile_shader(shader, &args, nir, debug);

   if (!ret)
      goto out;

   shader->config.float_mode = float_mode;

   /* The GS copy shader is compiled next. */
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      shader->gs_copy_shader =
         si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug,
                                        &legacy_gs_output_info.info);
      if (!shader->gs_copy_shader) {
         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
         ret = false;
         goto out;
      }
   }

   /* Compute vs_output_ps_input_cntl. */
   if ((sel->stage == MESA_SHADER_VERTEX ||
        sel->stage == MESA_SHADER_TESS_EVAL ||
        sel->stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset;

      if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
         vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;

      /* We must use the original shader info before the removal of duplicated shader
       * outputs. VS and TES should also set the primitive ID output if it's used.
       */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.ge.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[semantic];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }
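
         /* Example (illustrative): a varying exported at param offset 5
          * encodes as S_028644_OFFSET(5); an output folded to the constant
          * (0, 0, 0, 1) maps to AC_EXP_PARAM_DEFAULT_VAL_0001, i.e.
          * S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(1).
          */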

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (sel->stage == MESA_SHADER_COMPUTE) {
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
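
      /* Worked example (illustrative): wave64 with 1024 threads per
       * workgroup gives waves_per_tg = 16 and waves_per_simd = 4, so with
       * 256 physical wave64 VGPRs per SIMD each wave may use at most
       * 256 / 4 = 64 VGPRs.
       */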

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add/remove the scratch offset to/from input SGPRs. */
   if (!sel->screen->info.has_scratch_base_registers &&
       !si_is_merged_shader(shader)) {
      if (sel->info.base.use_aco_amd) {
         /* With ACO, the scratch_offset arg is added explicitly at the beginning.
          * If no scratch is used after compilation, reduce the input SGPR count.
          */
         if (!shader->config.scratch_bytes_per_wave)
            shader->info.num_input_sgprs--;
      } else {
         /* The scratch_offset arg is added implicitly by LLVM. */
         if (shader->info.num_input_sgprs)
            shader->info.num_input_sgprs++;
      }
   }

   /* Calculate the number of fragment input VGPRs. */
   if (sel->stage == MESA_SHADER_FRAGMENT) {
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
         &shader->config, &shader->info.num_fragcoord_components);
   }

   si_calculate_max_simd_waves(shader);

   if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_STATS)) {
      struct util_debug_callback out_stderr = {
         .debug_message = debug_message_stderr,
      };

      si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
   } else {
      si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   }

out:
   if (free_nir)
      ralloc_free(nir);

   return ret;
}

/**
 * Create, compile and return a shader part (prolog or epilog).
 *
 * \param sscreen   screen
 * \param list      list of shader parts of the same category
 * \param stage     shader stage
 * \param prolog    whether the part being requested is a prolog
 * \param key       shader part key
 * \param compiler  LLVM compiler
 * \param debug     debug callback
 * \param name      name for dumping
 * \return          non-NULL on success
 */
static struct si_shader_part *
si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
                   gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
                   struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
                   const char *name)
{
   struct si_shader_part *result;

   simple_mtx_lock(&sscreen->shader_parts_mutex);

   /* Find an existing part. */
   for (result = *list; result; result = result->next) {
      if (memcmp(&result->key, key, sizeof(*key)) == 0) {
         simple_mtx_unlock(&sscreen->shader_parts_mutex);
         return result;
      }
   }

   /* Compile a new one. */
   result = CALLOC_STRUCT(si_shader_part);
   result->key = *key;

   bool ok =
#if AMD_LLVM_AVAILABLE
      !(sscreen->use_aco ||
        (stage == MESA_SHADER_FRAGMENT &&
         ((prolog && key->ps_prolog.use_aco) ||
          (!prolog && key->ps_epilog.use_aco)))) ?
         si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
#endif
         si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);

   if (ok) {
      result->next = *list;
      *list = result;
   } else {
      FREE(result);
      result = NULL;
   }

   simple_mtx_unlock(&sscreen->shader_parts_mutex);
   return result;
}
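
/* Usage sketch (illustrative): callers pass the per-screen part list, e.g.
 *   si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT,
 *                      true, &prolog_key, compiler, debug,
 *                      "Fragment Shader Prolog");
 * Lookups and insertions are serialized by shader_parts_mutex, so identical
 * keys always share one compiled part.
 */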

/**
 * Select and compile (or reuse) TCS parts (epilog).
 */
static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                       struct si_shader *shader, struct util_debug_callback *debug)
{
   if (sscreen->info.gfx_level >= GFX9) {
      assert(shader->wave_size == 32 || shader->wave_size == 64);
      unsigned index = shader->wave_size / 32 - 1;
      shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls[index];
   }

   return true;
}

/**
 * Select and compile (or reuse) GS parts (prolog).
 */
static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct util_debug_callback *debug)
{
   if (sscreen->info.gfx_level >= GFX9) {
      if (shader->key.ge.as_ngg) {
         assert(shader->wave_size == 32 || shader->wave_size == 64);
         unsigned index = shader->wave_size / 32 - 1;
         shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es[index];
      } else {
         shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
      }
   }

   return true;
}

/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
 */
void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_prolog.states = shader->key.ps.part.prolog;
   key->ps_prolog.use_aco = info->base.use_aco_amd;
   key->ps_prolog.wave32 = shader->wave_size == 32;
   key->ps_prolog.colors_read = shader->info.ps_colors_read;
   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
   key->ps_prolog.wqm =
      info->base.fs.needs_quad_helper_invocations &&
      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
       key->ps_prolog.states.force_linear_sample_interp ||
       key->ps_prolog.states.force_persp_center_interp ||
       key->ps_prolog.states.force_linear_center_interp ||
       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
   key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components;

   if (shader->key.ps.part.prolog.poly_stipple)
      shader->info.uses_vmem_load_other = true;

   if (shader->info.ps_colors_read) {
      uint8_t *color = shader->selector->info.color_attr_index;

      if (shader->key.ps.part.prolog.color_two_side) {
         /* BCOLORs are stored after the last input. */
         key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
      }

      for (unsigned i = 0; i < 2; i++) {
         unsigned interp = info->color_interpolate[i];
         unsigned location = info->color_interpolate_loc[i];

         if (!(shader->info.ps_colors_read & (0xf << i * 4)))
            continue;

         key->ps_prolog.color_attr_index[i] = color[i];

         if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
            interp = INTERP_MODE_FLAT;

         switch (interp) {
         case INTERP_MODE_FLAT:
            key->ps_prolog.color_interp_vgpr_index[i] = -1;
            break;
         case INTERP_MODE_SMOOTH:
         case INTERP_MODE_COLOR:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_persp_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_persp_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 0;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 2;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 4;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         case INTERP_MODE_NOPERSPECTIVE:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_linear_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_linear_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The VGPR assignment for non-monolithic shaders
             * works because InitialPSInputAddr is set on the
             * main shader and PERSP_PULL_MODEL is never used.
             */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 6;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 8;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 10;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         default:
            assert(0);
         }
      }
   }
}
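
/* Example (illustrative): color_interp_vgpr_index matches the PS input VGPR
 * layout built in si_get_ps_prolog_args(): 0 = persp_sample.xy,
 * 2 = persp_center.xy, 4 = persp_centroid.xy, and 6/8/10 are the linear
 * equivalents; -1 means flat shading (no barycentrics needed).
 */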

/**
 * Check whether a PS prolog is required based on the key.
 */
bool si_need_ps_prolog(const union si_shader_part_key *key)
{
   return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
          key->ps_prolog.states.force_linear_sample_interp ||
          key->ps_prolog.states.force_persp_center_interp ||
          key->ps_prolog.states.force_linear_center_interp ||
          key->ps_prolog.states.bc_optimize_for_persp ||
          key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
          key->ps_prolog.states.samplemask_log_ps_iter;
}

/**
 * Compute the PS epilog key, which contains all the information needed to
 * build the PS epilog function.
 */
void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   struct si_shader_info *info = &shader->selector->info;
   memset(key, 0, sizeof(*key));
   key->ps_epilog.use_aco = info->base.use_aco_amd;
   key->ps_epilog.wave32 = shader->wave_size == 32;
   key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
   key->ps_epilog.colors_written = info->colors_written;
   key->ps_epilog.color_types = info->output_color_types;
   key->ps_epilog.writes_z = info->writes_z;
   key->ps_epilog.writes_stencil = info->writes_stencil;
   key->ps_epilog.writes_samplemask = info->writes_samplemask &&
                                      !shader->key.ps.part.epilog.kill_samplemask;
   key->ps_epilog.states = shader->key.ps.part.epilog;
}

/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct util_debug_callback *debug)
{
   union si_shader_part_key prolog_key;
   union si_shader_part_key epilog_key;

   /* Get the prolog. */
   si_get_ps_prolog_key(shader, &prolog_key);

   /* The prolog is a no-op if these aren't set. */
   if (si_need_ps_prolog(&prolog_key)) {
      shader->prolog =
         si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
                            compiler, debug, "Fragment Shader Prolog");
      if (!shader->prolog)
         return false;
   }

   /* Get the epilog. */
   si_get_ps_epilog_key(shader, &epilog_key);

   shader->epilog =
      si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
                         compiler, debug, "Fragment Shader Epilog");
   if (!shader->epilog)
      return false;

   si_fixup_spi_ps_input_config(shader);

   /* Make sure the spi_ps_input_addr bits are a superset of spi_ps_input_ena. */
   unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
   unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
   assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);

   return true;
}

void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   /* If tessellation is all offchip and on-chip GS isn't used, this
    * workaround is not needed.
    */
   return;

   /* SPI barrier management bug:
    * Make sure we have at least 4k of LDS in use to avoid the bug.
    * It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
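
   /* (Assuming the 512-byte LDS allocation granularity implied by the comment
    * above: 8 units * 512 bytes = 4 KiB.)
    */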
}

static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
{
   unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */

   shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);

   if (shader->selector->stage == MESA_SHADER_COMPUTE &&
       si_get_max_workgroup_size(shader) > shader->wave_size) {
      si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
   }
}

bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                              struct si_shader *shader, struct util_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size);

   if (sel->stage == MESA_SHADER_FRAGMENT) {
      shader->ps.writes_samplemask = sel->info.writes_samplemask &&
                                     !shader->key.ps.part.epilog.kill_samplemask;
   }

   /* LS, ES and VS main parts are compiled on demand if the main part hasn't
    * been compiled for that stage.
    *
    * GS is compiled on demand if the main part hasn't been compiled for the
    * chosen NGG mode.
    *
    * Vertex shaders are compiled on demand when a vertex fetch
    * workaround must be applied.
    */
   if (shader->is_monolithic) {
      /* Monolithic shader (compiled as a whole, has many variants,
       * may take a long time to compile).
       */
      if (!si_compile_shader(sscreen, compiler, shader, debug))
         return false;
   } else {
      /* The shader consists of several parts:
       *
       * - the middle part is the user shader, it has 1 variant only
       *   and it was compiled during the creation of the shader
       *   selector
       * - the prolog part is inserted at the beginning
       * - the epilog part is inserted at the end
       *
       * The prolog and epilog have many (but simple) variants.
       *
       * Starting with gfx9, geometry and tessellation control
       * shaders also contain the prolog and user shader parts of
       * the previous shader stage.
       */

      if (!mainp)
         return false;

      /* Copy the compiled shader data over. */
      shader->is_binary_shared = true;
      shader->binary = mainp->binary;
      shader->config = mainp->config;
      shader->info = mainp->info;

      /* Select prologs and/or epilogs. */
      switch (sel->stage) {
      case MESA_SHADER_TESS_CTRL:
         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_GEOMETRY:
         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
            return false;

         /* Clone the GS copy shader for the shader variant.
          * We can't just copy the pointer because we change the pm4 state and
          * si_shader_selector::gs_copy_shader must be immutable because it's shared
          * by multiple contexts.
          */
         if (!shader->key.ge.as_ngg) {
            assert(mainp->gs_copy_shader);
            assert(mainp->gs_copy_shader->bo);
            assert(!mainp->gs_copy_shader->previous_stage_sel);
            assert(!mainp->gs_copy_shader->scratch_va);

            shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
            memcpy(shader->gs_copy_shader, mainp->gs_copy_shader,
                   sizeof(*shader->gs_copy_shader));
            /* Increase the reference count. */
            pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
            /* Initialize some fields differently. */
            shader->gs_copy_shader->shader_log = NULL;
            shader->gs_copy_shader->is_binary_shared = true;
            util_queue_fence_init(&shader->gs_copy_shader->ready);
         }
         break;
      case MESA_SHADER_FRAGMENT:
         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
            return false;

         /* Make sure we have at least as many VGPRs as there
          * are allocated inputs.
          */
         shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
         break;
      default:;
      }

      assert(shader->wave_size == mainp->wave_size);
      assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);

      /* Update SGPR and VGPR counts. */
      if (shader->prolog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
      }
      if (shader->previous_stage) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
         shader->config.spilled_sgprs =
            MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
         shader->config.spilled_vgprs =
            MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
         shader->info.private_mem_vgprs =
            MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
         shader->config.scratch_bytes_per_wave =
            MAX2(shader->config.scratch_bytes_per_wave,
                 shader->previous_stage->config.scratch_bytes_per_wave);
         shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
         shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
         shader->info.uses_vmem_sampler_or_bvh |=
            shader->previous_stage->info.uses_vmem_sampler_or_bvh;
      }
      if (shader->epilog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
      }
      si_calculate_max_simd_waves(shader);
   }

   if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
      if (!gfx10_ngg_calculate_subgroup_info(shader)) {
         fprintf(stderr, "Failed to compute subgroup info\n");
         return false;
      }
   } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
      gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
   }

   shader->uses_vs_state_provoking_vertex =
      sscreen->use_ngg &&
      /* Used to convert triangle strips from GS to triangles. */
      ((sel->stage == MESA_SHADER_GEOMETRY &&
        util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
       (sel->stage == MESA_SHADER_VERTEX &&
        /* Used to export PrimitiveID from the correct vertex. */
        shader->key.ge.mono.u.vs_export_prim_id));

   shader->uses_gs_state_outprim = sscreen->use_ngg &&
                                   /* Only used by streamout and the PrimID export in vertex
                                    * shaders. */
                                   sel->stage == MESA_SHADER_VERTEX &&
                                   (si_shader_uses_streamout(shader) ||
                                    shader->uses_vs_state_provoking_vertex);

   if (sel->stage == MESA_SHADER_VERTEX) {
      shader->uses_base_instance = sel->info.uses_base_instance ||
                                   shader->key.ge.mono.instance_divisor_is_one ||
                                   shader->key.ge.mono.instance_divisor_is_fetched;
   } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
      shader->uses_base_instance = shader->previous_stage_sel &&
                                   (shader->previous_stage_sel->info.uses_base_instance ||
                                    shader->key.ge.mono.instance_divisor_is_one ||
                                    shader->key.ge.mono.instance_divisor_is_fetched);
   } else if (sel->stage == MESA_SHADER_GEOMETRY) {
      shader->uses_base_instance = shader->previous_stage_sel &&
                                   (shader->previous_stage_sel->info.uses_base_instance ||
                                    shader->key.ge.mono.instance_divisor_is_one ||
                                    shader->key.ge.mono.instance_divisor_is_fetched);
   }

   si_fix_resource_usage(sscreen, shader);

   /* Upload. */
   bool ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;

   shader->complete_shader_binary_size = si_get_shader_binary_size(sscreen, shader);

   si_shader_dump(sscreen, shader, debug, stderr, true);

   if (!ok)
      fprintf(stderr, "LLVM failed to upload shader\n");
   return ok;
}

void si_shader_binary_clean(struct si_shader_binary *binary)
{
   free((void *)binary->code_buffer);
   binary->code_buffer = NULL;

   free(binary->llvm_ir_string);
   binary->llvm_ir_string = NULL;

   free((void *)binary->symbols);
   binary->symbols = NULL;

   free(binary->uploaded_code);
   binary->uploaded_code = NULL;
   binary->uploaded_code_size = 0;
}

void si_shader_destroy(struct si_shader *shader)
{
   si_resource_reference(&shader->bo, NULL);

   if (!shader->is_binary_shared)
      si_shader_binary_clean(&shader->binary);

   free(shader->shader_log);
}

nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
                                         struct si_shader *prev_shader,
                                         struct si_shader_args *args,
                                         bool *free_nir)
{
   const struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   if (sel->stage == MESA_SHADER_TESS_CTRL) {
      struct si_shader_selector *ls = key->ge.part.tcs.ls;

      prev_shader->selector = ls;
      prev_shader->key.ge.as_ls = 1;
   } else {
      struct si_shader_selector *es = key->ge.part.gs.es;

      prev_shader->selector = es;
      prev_shader->key.ge.as_es = 1;
      prev_shader->key.ge.as_ngg = key->ge.as_ngg;
   }

   prev_shader->next_shader = shader;
   prev_shader->key.ge.mono = key->ge.mono;
   prev_shader->key.ge.opt = key->ge.opt;
   prev_shader->key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
   /* kill_outputs was computed from the second shader's outputs, so it can't be
    * used to kill the first shader's outputs.
    */
   prev_shader->key.ge.opt.kill_outputs = 0;
   prev_shader->is_monolithic = true;
   prev_shader->wave_size = shader->wave_size;

   si_init_shader_args(prev_shader, args);

   nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir,
                                       sel->info.tcs_vgpr_only_inputs, NULL);

   si_update_shader_binary_info(shader, nir);

   shader->info.uses_instanceid |=
      prev_shader->selector->info.uses_instanceid || prev_shader->info.uses_instanceid;

   return nir;
}

unsigned si_get_tcs_out_patch_stride(const struct si_shader_info *info)
{
   unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out;
   unsigned vertex_stride = util_last_bit64(info->outputs_written_before_tes_gs) * 4;
   unsigned num_patch_outputs = util_last_bit64(info->patch_outputs_written);

   return tcs_out_vertices * vertex_stride + num_patch_outputs * 4;
}
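
/* Worked example (illustrative): 4 output vertices with the highest written
 * per-vertex slot being 5 (vertex_stride = 5 * 4 = 20 dwords) and 2 patch
 * output slots give a patch stride of 4 * 20 + 2 * 4 = 88 dwords.
 */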

void si_get_ps_prolog_args(struct si_shader_args *args,
                           const union si_shader_part_key *key)
{
   memset(args, 0, sizeof(*args));

   const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs;

   struct ac_arg input_sgprs[num_input_sgprs];
   for (unsigned i = 0; i < num_input_sgprs; i++)
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i);

   args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS];
   /* Use the absolute location of the input. */
   args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR];

   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid);
   /* skip PERSP_PULL_MODEL */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid);
   /* skip LINE_STIPPLE_TEX */

   /* POS_X|Y|Z|W_FLOAT */
   for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);

   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt);
}

void si_get_ps_epilog_args(struct si_shader_args *args,
                           const union si_shader_part_key *key,
                           struct ac_arg colors[MAX_DRAW_BUFFERS],
                           struct ac_arg *depth, struct ac_arg *stencil,
                           struct ac_arg *sample_mask)
{
   memset(args, 0, sizeof(*args));

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference);

   u_foreach_bit (i, key->ps_epilog.colors_written) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i);
   }

   if (key->ps_epilog.writes_z)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth);

   if (key->ps_epilog.writes_stencil)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil);

   if (key->ps_epilog.writes_samplemask)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask);
}
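
/* Example (illustrative): with colors_written = 0x3 and writes_z set, the
 * epilog arguments are 5 SGPRs, colors[0] in VGPRs 0-3, colors[1] in
 * VGPRs 4-7, and depth in VGPR 8.
 */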