/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"

#include "ac_nir.h"
#include "si_pipe.h"
#include "si_query.h"
#include "si_state.h"
#include "si_shader_internal.h"

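/* State shared across this lowering pass: the shader being compiled, its
 * argument layout, and ring descriptors built once in
 * preload_reusable_variables() and reused by lower_intrinsic().
 */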
struct lower_abi_state {
   struct si_shader *shader;
   struct si_shader_args *args;

   nir_def *esgs_ring;
   nir_def *tess_offchip_ring;
   nir_def *gsvs_ring[4];
};

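/* Extract a bitfield (described by its *__SHIFT and *__MASK macros) from the
 * vs_state_bits SGPR argument.
 */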
#define GET_FIELD_NIR(field) \
   ac_nir_unpack_arg(b, &args->ac, args->vs_state_bits, \
                     field##__SHIFT, util_bitcount(field##__MASK))

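/* Load num_components dwords of a descriptor from the driver's internal
 * bindings; each binding slot occupies 16 bytes.
 */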
nir_def *si_nir_load_internal_binding(nir_builder *b, struct si_shader_args *args,
                                      unsigned slot, unsigned num_components)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->internal_bindings);
   return nir_load_smem_amd(b, num_components, addr, nir_imm_int(b, slot * 16));
}

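/* Build the attribute ring descriptor: the base address comes from a shader
 * argument, the remaining dwords from ac_build_attr_ring_descriptor().
 */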
static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
                                     struct si_shader_args *args)
{
   struct si_shader_selector *sel = shader->selector;

   nir_def *attr_address =
      sel->stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd ?
         ac_nir_load_arg_at_offset(b, &args->ac, args->vs_blit_inputs,
                                   sel->info.base.vs.blit_sgprs_amd - 1) :
         ac_nir_load_arg(b, &args->ac, args->gs_attr_address);

   unsigned stride = 16 * si_shader_num_alloc_param_exports(shader);
   uint32_t desc[4];

   ac_build_attr_ring_descriptor(sel->screen->info.gfx_level,
                                 (uint64_t)sel->screen->info.address32_hi << 32,
                                 0xffffffff, stride, desc);

   nir_def *comp[] = {
      attr_address,
      nir_imm_int(b, desc[1]),
      nir_imm_int(b, desc[2]),
      nir_imm_int(b, desc[3]),
   };

   return nir_vec(b, comp, 4);
}

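/* Emulate framebuffer fetch: rebuild the fragment's coordinates from the
 * fixed-point position and ancillary arguments, then load the pixel from
 * color buffer 0 through a bindless image load (resolving FMASK first for
 * MSAA before GFX11, unless FMASK is disabled).
 */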
static nir_def *
fetch_framebuffer(nir_builder *b, struct si_shader_args *args,
                  struct si_shader_selector *sel, union si_shader_key *key)
{
   /* Load the image descriptor. */
   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0_FMASK % 2 == 0);

   nir_def *zero = nir_imm_zero(b, 1, 32);
   nir_def *undef = nir_undef(b, 1, 32);

   unsigned chan = 0;
   nir_def *vec[4] = {undef, undef, undef, undef};

   vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 16);

   if (!key->ps.mono.fbfetch_is_1D)
      vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 16);

   /* Get the current render target layer index. */
   if (key->ps.mono.fbfetch_layered)
      vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 16, 11);

   nir_def *coords = nir_vec(b, vec, 4);

   enum glsl_sampler_dim dim;
   if (key->ps.mono.fbfetch_msaa)
      dim = GLSL_SAMPLER_DIM_MS;
   else if (key->ps.mono.fbfetch_is_1D)
      dim = GLSL_SAMPLER_DIM_1D;
   else
      dim = GLSL_SAMPLER_DIM_2D;

   nir_def *sample_id;
   if (key->ps.mono.fbfetch_msaa) {
      sample_id = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 8, 4);

      if (sel->screen->info.gfx_level < GFX11 &&
          !(sel->screen->debug_flags & DBG(NO_FMASK))) {
         nir_def *desc =
            si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0_FMASK, 8);

         nir_def *fmask =
            nir_bindless_image_fragment_mask_load_amd(
               b, desc, coords,
               .image_dim = dim,
               .image_array = key->ps.mono.fbfetch_layered,
               .access = ACCESS_CAN_REORDER);

         nir_def *offset = nir_ishl_imm(b, sample_id, 2);
         /* 3 for EQAA handling, see lower_image_to_fragment_mask_load() */
         nir_def *width = nir_imm_int(b, 3);
         sample_id = nir_ubfe(b, fmask, offset, width);
      }
   } else {
      sample_id = zero;
   }

   nir_def *desc = si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0, 8);

   return nir_bindless_image_load(b, 4, 32, desc, coords, sample_id, zero,
                                  .image_dim = dim,
                                  .image_array = key->ps.mono.fbfetch_layered,
                                  .access = ACCESS_CAN_REORDER);
}

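/* Build the tess offchip ring descriptor from the tes_offchip_addr argument. */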
static nir_def *build_tess_ring_desc(nir_builder *b, struct si_screen *screen,
                                     struct si_shader_args *args)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->tes_offchip_addr);
   uint32_t desc[4];

   ac_build_raw_buffer_descriptor(screen->info.gfx_level,
                                  (uint64_t)screen->info.address32_hi << 32,
                                  0xffffffff, desc);

   nir_def *comp[4] = {
      addr,
      nir_imm_int(b, desc[1]),
      nir_imm_int(b, desc[2]),
      nir_imm_int(b, desc[3]),
   };

   return nir_vec(b, comp, 4);
}

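/* Load the ESGS ring descriptor. GS uses it as-is; ES additionally enables
 * swizzled (per-thread) addressing in the descriptor.
 */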
static nir_def *build_esgs_ring_desc(nir_builder *b, enum amd_gfx_level gfx_level,
                                     struct si_shader_args *args)
{
   nir_def *desc = si_nir_load_internal_binding(b, args, SI_RING_ESGS, 4);

   if (b->shader->info.stage == MESA_SHADER_GEOMETRY)
      return desc;

   nir_def *vec[4];
   for (int i = 0; i < 4; i++)
      vec[i] = nir_channel(b, desc, i);

   vec[1] = nir_ior_imm(b, vec[1], S_008F04_SWIZZLE_ENABLE_GFX6(1));
   vec[3] = nir_ior_imm(b, vec[3],
                        S_008F0C_ELEMENT_SIZE(1) |
                        S_008F0C_INDEX_STRIDE(3) |
                        S_008F0C_ADD_TID_ENABLE(1));

   /* If MUBUF && ADD_TID_ENABLE, DATA_FORMAT means STRIDE[14:17] on gfx8-9, so set 0. */
   if (gfx_level == GFX8)
      vec[3] = nir_iand_imm(b, vec[3], C_008F0C_DATA_FORMAT);

   return nir_vec(b, vec, 4);
}

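/* Build GSVS ring descriptors: a single descriptor for the GS copy shader,
 * or one swizzled descriptor per vertex stream for a legacy (non-NGG) GS.
 */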
static void build_gsvs_ring_desc(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   if (s->shader->is_gs_copy_shader) {
      s->gsvs_ring[0] = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 4);
   } else if (sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg) {
      nir_def *base_addr = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 2);
      base_addr = nir_pack_64_2x32(b, base_addr);

      /* The conceptual layout of the GSVS ring is
       *   v0c0 .. vLc0 v0c1 .. vLc1 ..
       * but the real memory layout is swizzled across
       * threads:
       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
       *   t16v0c0 ..
       * Override the buffer descriptor accordingly.
       */

      for (unsigned stream = 0; stream < 4; stream++) {
         unsigned num_components = sel->info.num_stream_output_components[stream];
         if (!num_components)
            continue;

         unsigned stride = 4 * num_components * sel->info.base.gs.vertices_out;
         /* Limit on the stride field for <= GFX7. */
         assert(stride < (1 << 14));

         unsigned num_records = s->shader->wave_size;

         const struct ac_buffer_state buffer_state = {
            .size = num_records,
            .format = PIPE_FORMAT_R32_FLOAT,
            .swizzle = {
               PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W,
            },
            .stride = stride,
            .swizzle_enable = true,
            .element_size = 1,
            .index_stride = 1,
            .add_tid = true,
            .gfx10_oob_select = V_008F0C_OOB_SELECT_DISABLED,
         };
         uint32_t tmp_desc[4];

         ac_build_buffer_descriptor(sel->screen->info.gfx_level, &buffer_state, tmp_desc);

         nir_def *desc[4];
         desc[0] = nir_unpack_64_2x32_split_x(b, base_addr);
         desc[1] = nir_ior_imm(b, nir_unpack_64_2x32_split_y(b, base_addr), tmp_desc[1]);
         desc[2] = nir_imm_int(b, tmp_desc[2]);
         desc[3] = nir_imm_int(b, tmp_desc[3]);

         s->gsvs_ring[stream] = nir_vec(b, desc, 4);

         /* next stream's desc addr */
         base_addr = nir_iadd_imm(b, base_addr, stride * num_records);
      }
   }
}

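/* Build the ring descriptors that lowered intrinsics may reference, once at
 * the top of the entrypoint.
 */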
static void preload_reusable_variables(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   b->cursor = nir_before_impl(b->impl);

   if (sel->screen->info.gfx_level <= GFX8 && sel->stage <= MESA_SHADER_GEOMETRY &&
       (key->ge.as_es || sel->stage == MESA_SHADER_GEOMETRY)) {
      s->esgs_ring = build_esgs_ring_desc(b, sel->screen->info.gfx_level, s->args);
   }

   if (sel->stage == MESA_SHADER_TESS_CTRL || sel->stage == MESA_SHADER_TESS_EVAL)
      s->tess_offchip_ring = build_tess_ring_desc(b, sel->screen, s->args);

   build_gsvs_ring_desc(b, s);
}

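/* Number of vertices per primitive: a compile-time constant when
 * gfx10_ngg_get_vertices_per_prim() knows it, otherwise derived from the
 * GS_STATE_OUTPRIM state bits at runtime.
 */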
static nir_def *get_num_vertices_per_prim(nir_builder *b, struct lower_abi_state *s)
{
   struct si_shader_args *args = s->args;
   unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(s->shader);

   if (num_vertices)
      return nir_imm_int(b, num_vertices);
   else
      return nir_iadd_imm(b, GET_FIELD_NIR(GS_STATE_OUTPRIM), 1);
}

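/* Lower one ABI intrinsic to loads of shader arguments, internal bindings,
 * or immediates derived from the shader key. Returns true if the instruction
 * was lowered and removed.
 */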
static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   struct si_shader *shader = s->shader;
   struct si_shader_args *args = s->args;
   struct si_shader_selector *sel = shader->selector;
   union si_shader_key *key = &shader->key;
   gl_shader_stage stage = sel->stage;

   b->cursor = nir_before_instr(instr);

   nir_def *replacement = NULL;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_first_vertex:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      break;
   case nir_intrinsic_load_base_vertex: {
      nir_def *indexed = GET_FIELD_NIR(VS_STATE_INDEXED);
      indexed = nir_i2b(b, indexed);

      nir_def *base_vertex = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      replacement = nir_bcsel(b, indexed, base_vertex, nir_imm_int(b, 0));
      break;
   }
   case nir_intrinsic_load_workgroup_size: {
      assert(sel->info.base.workgroup_size_variable && sel->info.uses_variable_block_size);

      nir_def *block_size = ac_nir_load_arg(b, &args->ac, args->block_size);
      nir_def *comp[] = {
         nir_ubfe_imm(b, block_size, 0, 10),
         nir_ubfe_imm(b, block_size, 10, 10),
         nir_ubfe_imm(b, block_size, 20, 10),
      };
      replacement = nir_vec(b, comp, 3);
      break;
   }
   case nir_intrinsic_load_tess_level_outer_default:
   case nir_intrinsic_load_tess_level_inner_default: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_HS_CONST_DEFAULT_TESS_LEVELS, 4);
      unsigned num_components = intrin->def.num_components;
      unsigned offset =
         intrin->intrinsic == nir_intrinsic_load_tess_level_inner_default ? 16 : 0;
      replacement = nir_load_ubo(b, num_components, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_patch_vertices_in:
      if (stage == MESA_SHADER_TESS_CTRL)
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5);
      else if (stage == MESA_SHADER_TESS_EVAL) {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
      } else
         unreachable("no nir_load_patch_vertices_in");
      replacement = nir_iadd_imm(b, replacement, 1);
      break;
   case nir_intrinsic_load_sample_mask_in:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.sample_coverage);
      break;
   case nir_intrinsic_load_lshs_vertex_stride_amd:
      if (stage == MESA_SHADER_VERTEX) {
         replacement = nir_imm_int(b, si_shader_lshs_vertex_stride(shader));
      } else if (stage == MESA_SHADER_TESS_CTRL) {
         if (sel->screen->info.gfx_level >= GFX9 && shader->is_monolithic) {
            replacement = nir_imm_int(b, si_shader_lshs_vertex_stride(shader));
         } else {
            nir_def *num_ls_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 17, 6);
            nir_def *extra_dw = nir_bcsel(b, nir_ieq_imm(b, num_ls_out, 0), nir_imm_int(b, 0), nir_imm_int(b, 4));
            replacement = nir_iadd_nuw(b, nir_ishl_imm(b, num_ls_out, 4), extra_dw);
         }
      } else {
         unreachable("no nir_load_lshs_vertex_stride_amd");
      }
      break;
   case nir_intrinsic_load_esgs_vertex_stride_amd:
      assert(sel->screen->info.gfx_level >= GFX9);
      if (shader->is_monolithic) {
         replacement = nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4);
      } else {
         nir_def *num_es_outputs = GET_FIELD_NIR(GS_STATE_NUM_ES_OUTPUTS);
         replacement = nir_iadd_imm(b, nir_imul_imm(b, num_es_outputs, 4), 1);
      }
      break;
   case nir_intrinsic_load_tcs_num_patches_amd: {
      nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
      replacement = nir_iadd_imm(b, tmp, 1);
      break;
   }
   case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
      nir_def *per_vtx_out_patch_size = NULL;

      if (stage == MESA_SHADER_TESS_CTRL) {
         const unsigned num_hs_out = util_last_bit64(sel->info.outputs_written_before_tes_gs);
         const unsigned out_vtx_size = num_hs_out * 16;
         const unsigned out_vtx_per_patch = sel->info.base.tess.tcs_vertices_out;
         per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
      } else {
         nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
         nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
         nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
         nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
         per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
      }

      nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
      nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
      replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
      break;
   }
   case nir_intrinsic_load_ring_tess_offchip_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.tess_offchip_offset);
      break;
   case nir_intrinsic_load_ring_es2gs_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.es2gs_offset);
      break;
   case nir_intrinsic_load_clip_half_line_width_amd: {
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      replacement = nir_load_smem_amd(b, 2, addr, nir_imm_int(b, 32));
      break;
   }
   case nir_intrinsic_load_viewport_xy_scale_and_offset: {
      bool prim_is_lines = key->ge.opt.ngg_culling & SI_NGG_CULL_LINES;
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      unsigned offset = prim_is_lines ? 16 : 0;
      replacement = nir_load_smem_amd(b, 4, addr, nir_imm_int(b, offset));
      break;
   }
   case nir_intrinsic_load_num_vertices_per_primitive_amd:
      replacement = get_num_vertices_per_prim(b, s);
      break;
   case nir_intrinsic_load_cull_ccw_amd:
      /* radeonsi embeds the cw/ccw info into the front/back face enabled bits */
      replacement = nir_imm_false(b);
      break;
   case nir_intrinsic_load_cull_any_enabled_amd:
      replacement = nir_imm_bool(b, !!key->ge.opt.ngg_culling);
      break;
   case nir_intrinsic_load_cull_back_face_enabled_amd:
      replacement = nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE);
      break;
   case nir_intrinsic_load_cull_front_face_enabled_amd:
      replacement = nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE);
      break;
   case nir_intrinsic_load_cull_small_prim_precision_amd: {
      nir_def *small_prim_precision =
         key->ge.opt.ngg_culling & SI_NGG_CULL_LINES ?
         GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION_NO_AA) :
         GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION);

      /* Extract the small prim precision. */
      small_prim_precision = nir_ior_imm(b, small_prim_precision, 0x70);
      replacement = nir_ishl_imm(b, small_prim_precision, 23);
      break;
   }
   case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
      unsigned mask = SI_NGG_CULL_LINES | SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT;
      replacement = nir_imm_bool(b, (key->ge.opt.ngg_culling & mask) != SI_NGG_CULL_LINES);
      break;
   }
   case nir_intrinsic_load_provoking_vtx_in_prim_amd:
      replacement = nir_bcsel(b, nir_i2b(b, GET_FIELD_NIR(GS_STATE_PROVOKING_VTX_FIRST)),
                              nir_imm_int(b, 0),
                              nir_iadd_imm(b, get_num_vertices_per_prim(b, s), -1));
      break;
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_PIPELINE_STATS_EMU));
      break;
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_STREAMOUT_QUERY_ENABLED));
      break;
   case nir_intrinsic_load_clamp_vertex_color_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(VS_STATE_CLAMP_VERTEX_COLOR));
      break;
   case nir_intrinsic_load_user_clip_plane: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_VS_CONST_CLIP_PLANES, 4);
      unsigned offset = nir_intrinsic_ucp_id(intrin) * 16;
      replacement = nir_load_ubo(b, 4, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_streamout_buffer_amd: {
      unsigned slot = SI_VS_STREAMOUT_BUF0 + nir_intrinsic_base(intrin);
      replacement = si_nir_load_internal_binding(b, args, slot, 4);
      break;
   }
   case nir_intrinsic_load_xfb_state_address_gfx12_amd: {
      nir_def *address = si_nir_load_internal_binding(b, args, SI_STREAMOUT_STATE_BUF, 1);
      nir_def *address32_hi = nir_imm_int(b, s->shader->selector->screen->info.address32_hi);
      replacement = nir_pack_64_2x32_split(b, address, address32_hi);
      break;
   }
   case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
   case nir_intrinsic_atomic_add_shader_invocation_count_amd: {
      enum pipe_statistics_query_index index =
         intrin->intrinsic == nir_intrinsic_atomic_add_gs_emit_prim_count_amd ?
         PIPE_STAT_QUERY_GS_PRIMITIVES : PIPE_STAT_QUERY_GS_INVOCATIONS;

      /* GFX11 only needs to emulate PIPE_STAT_QUERY_GS_PRIMITIVES because GS culls,
       * which makes the pipeline statistic incorrect.
       */
      assert(sel->screen->info.gfx_level < GFX11 || index == PIPE_STAT_QUERY_GS_PRIMITIVES);

      nir_def *buf =
         si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4);
      unsigned offset = si_query_pipestat_end_dw_offset(sel->screen, index) * 4;

      nir_def *count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_atomic_add_gen_prim_count_amd:
   case nir_intrinsic_atomic_add_xfb_prim_count_amd: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_GS_QUERY_BUF, 4);

      unsigned stream = nir_intrinsic_stream_id(intrin);
      unsigned offset = intrin->intrinsic == nir_intrinsic_atomic_add_gen_prim_count_amd ?
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].generated_primitives) :
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].emitted_primitives);

      nir_def *prim_count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), prim_count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_load_debug_log_desc_amd:
      replacement = si_nir_load_internal_binding(b, args, SI_RING_SHADER_LOG, 4);
      break;
   case nir_intrinsic_load_ring_attr_amd:
      replacement = build_attr_ring_desc(b, shader, args);
      break;
   case nir_intrinsic_load_ring_attr_offset_amd: {
      nir_def *offset = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_attr_offset, 0, 15);
      replacement = nir_ishl_imm(b, offset, 9);
      break;
   }
   case nir_intrinsic_load_ring_gs2vs_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs2vs_offset);
      break;
   case nir_intrinsic_load_streamout_config_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.streamout_config);
      break;
   case nir_intrinsic_load_streamout_write_index_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.streamout_write_index);
      break;
   case nir_intrinsic_load_streamout_offset_amd:
      replacement =
         ac_nir_load_arg(b, &args->ac, args->ac.streamout_offset[nir_intrinsic_base(intrin)]);
      break;
   case nir_intrinsic_load_force_vrs_rates_amd:
      if (sel->screen->info.gfx_level >= GFX11) {
         /* Bits [2:5] = VRS rate
          *
          * The range is [0, 15].
          *
          * If the hw doesn't support VRS 4x4, it will silently use 2x2 instead.
          */
         replacement = nir_imm_int(b, V_0283D0_VRS_SHADING_RATE_4X4 << 2);
      } else {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *   1: 2x coarser shading rate in that direction.
          *   0: normal shading rate
          *   -1: 2x finer shading rate (sample shading, not directional)
          *   -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2
          * at the same time.
          */
         replacement = nir_imm_int(b, (1 << 2) | (1 << 4));
      }
      break;
   case nir_intrinsic_load_barycentric_at_sample: {
      unsigned mode = nir_intrinsic_interp_mode(intrin);

      if (key->ps.mono.interpolate_at_sample_force_center) {
         replacement = nir_load_barycentric_pixel(b, 32, .interp_mode = mode);
      } else {
         nir_def *sample_id = intrin->src[0].ssa;
         /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
         nir_def *offset = nir_ishl_imm(b, sample_id, 3);

         nir_def *buf = si_nir_load_internal_binding(b, args, SI_PS_CONST_SAMPLE_POSITIONS, 4);
         nir_def *sample_pos = nir_load_ubo(b, 2, 32, buf, offset, .range = ~0);

         sample_pos = nir_fadd_imm(b, sample_pos, -0.5);

         replacement = nir_load_barycentric_at_offset(b, 32, sample_pos, .interp_mode = mode);
      }
      break;
   }
   case nir_intrinsic_load_output: {
      nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);

      /* not fbfetch */
      if (!(stage == MESA_SHADER_FRAGMENT && sem.fb_fetch_output))
         return false;

      /* Ignore src0, because KHR_blend_func_extended disallows multiple render targets. */

      replacement = fetch_framebuffer(b, args, sel, key);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_amd: {
      assert(s->tess_offchip_ring);
      nir_def *addr = nir_channel(b, s->tess_offchip_ring, 0);
      addr = nir_iadd_imm(b, addr, sel->screen->hs.tess_offchip_ring_size);
      replacement = nir_vector_insert_imm(b, s->tess_offchip_ring, addr, 0);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.tcs_factor_offset);
      break;
   case nir_intrinsic_load_alpha_reference_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->alpha_reference);
      break;
   case nir_intrinsic_load_front_face:
      if (!key->ps.opt.force_front_face_input)
         return false;
      replacement = nir_imm_bool(b, key->ps.opt.force_front_face_input == 1);
      break;
   case nir_intrinsic_load_barycentric_optimize_amd: {
      nir_def *prim_mask = ac_nir_load_arg(b, &args->ac, args->ac.prim_mask);
      /* enabled when bit 31 is set */
      replacement = nir_ilt_imm(b, prim_mask, 0);
      break;
   }
   case nir_intrinsic_load_layer_id:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary,
                                      16, sel->screen->info.gfx_level >= GFX12 ? 14 : 13);
      break;
   case nir_intrinsic_load_color0:
   case nir_intrinsic_load_color1: {
      uint32_t colors_read = sel->info.colors_read;

      int start, offset;
      if (intrin->intrinsic == nir_intrinsic_load_color0) {
         start = 0;
         offset = 0;
      } else {
         start = 4;
         offset = util_bitcount(colors_read & 0xf);
      }

      nir_def *color[4];
      for (int i = 0; i < 4; i++) {
         if (colors_read & BITFIELD_BIT(start + i)) {
            color[i] = ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++);

            nir_intrinsic_set_flags(nir_instr_as_intrinsic(color[i]->parent_instr),
                                    SI_VECTOR_ARG_IS_COLOR |
                                    SI_VECTOR_ARG_COLOR_COMPONENT(start + i));
         } else {
            color[i] = nir_undef(b, 1, 32);
         }
      }

      replacement = nir_vec(b, color, 4);
      break;
   }
   case nir_intrinsic_load_point_coord_maybe_flipped: {
      nir_def *interp_param =
         nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NONE);

      /* Load point coordinates (x, y) which are written by the hw after the interpolated inputs */
      replacement = nir_load_interpolated_input(b, 2, 32, interp_param, nir_imm_int(b, 0),
                                                .base = si_get_ps_num_interp(shader),
                                                .component = 2,
                                                /* This tells si_nir_scan_shader that it's PARAM_GEN */
                                                .io_semantics.no_varying = 1);
      break;
   }
   case nir_intrinsic_load_poly_line_smooth_enabled:
      replacement = nir_imm_bool(b, key->ps.mono.poly_line_smoothing);
      break;
   case nir_intrinsic_load_gs_vertex_offset_amd: {
      unsigned base = nir_intrinsic_base(intrin);
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[base]);
      break;
   }
   case nir_intrinsic_load_merged_wave_info_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.merged_wave_info);
      break;
   case nir_intrinsic_load_workgroup_num_input_vertices_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 12, 9);
      break;
   case nir_intrinsic_load_workgroup_num_input_primitives_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 22, 9);
      break;
   case nir_intrinsic_load_initial_edgeflags_amd:
      if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES ||
          (shader->selector->stage == MESA_SHADER_VERTEX &&
           shader->selector->info.base.vs.blit_sgprs_amd)) {
         /* Line primitives and blits don't need edge flags. */
         replacement = nir_imm_int(b, 0);
      } else if (shader->selector->stage == MESA_SHADER_VERTEX) {
         if (sel->screen->info.gfx_level >= GFX12) {
            replacement = nir_iand_imm(b, ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[0]),
                                       ac_get_all_edge_flag_bits(sel->screen->info.gfx_level));
         } else {
            /* Use the following trick to extract the edge flags:
             *   extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
             *   shifted = v_mul_u32_u24 extracted, 0x80402u   ; shift the bits: 8->9, 9->19, 10->29
             *   result = v_and_b32 shifted, 0x20080200        ; remove garbage
             */
            nir_def *tmp = ac_nir_load_arg(b, &args->ac, args->ac.gs_invocation_id);
            tmp = nir_iand_imm(b, tmp, 0x700);
            tmp = nir_imul_imm(b, tmp, 0x80402);
            replacement = nir_iand_imm(b, tmp, 0x20080200);
         }
      } else {
         /* Edge flags are always enabled when polygon mode is enabled, so we always have to
          * return valid edge flags if the primitive type is not lines and if we are not blitting
          * because the shader doesn't know when polygon mode is enabled.
          */
         replacement = nir_imm_int(b, ac_get_all_edge_flag_bits(sel->screen->info.gfx_level));
      }
      break;
   case nir_intrinsic_load_packed_passthrough_primitive_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[0]);
      break;
   case nir_intrinsic_load_ordered_id_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 0, 12);
      break;
   case nir_intrinsic_load_ring_esgs_amd:
      assert(s->esgs_ring);
      replacement = s->esgs_ring;
      break;
   case nir_intrinsic_load_tess_rel_patch_id_amd:
      /* LLVM needs to replace the patch id arg itself, so this has to be done in the LLVM backend. */
      if (!sel->info.base.use_aco_amd)
         return false;

      if (stage == MESA_SHADER_TESS_CTRL) {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_rel_ids, 0, 8);
      } else {
         assert(stage == MESA_SHADER_TESS_EVAL);
         replacement = ac_nir_load_arg(b, &args->ac, args->ac.tes_rel_patch_id);
      }
      break;
   case nir_intrinsic_load_ring_tess_offchip_amd:
      assert(s->tess_offchip_ring);
      replacement = s->tess_offchip_ring;
      break;
   case nir_intrinsic_load_tcs_tess_levels_to_tes_amd:
      if (shader->is_monolithic) {
         replacement = nir_imm_bool(b, key->ge.opt.tes_reads_tess_factors);
      } else {
         replacement = nir_ine_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 31, 1), 0);
      }
      break;
   case nir_intrinsic_load_tcs_primitive_mode_amd:
      if (shader->is_monolithic) {
         replacement = nir_imm_int(b, key->ge.opt.tes_prim_mode);
      } else {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 29, 2);
      }
      break;
   case nir_intrinsic_load_ring_gsvs_amd: {
      unsigned stream_id = nir_intrinsic_stream_id(intrin);
      /* Unused nir_load_ring_gsvs_amd may not be eliminated yet. */
      replacement = s->gsvs_ring[stream_id] ?
         s->gsvs_ring[stream_id] : nir_undef(b, 4, 32);
      break;
   }
   case nir_intrinsic_load_user_data_amd: {
      nir_def *low_vec4 = ac_nir_load_arg(b, &args->ac, args->cs_user_data[0]);
      replacement = nir_pad_vector(b, low_vec4, 8);

      if (args->cs_user_data[1].used && intrin->def.num_components > 4) {
         nir_def *high_vec4 = ac_nir_load_arg(b, &args->ac, args->cs_user_data[1]);
         for (unsigned i = 0; i < high_vec4->num_components; i++)
            replacement = nir_vector_insert_imm(b, replacement, nir_channel(b, high_vec4, i), 4 + i);
      }
      break;
   }
   default:
      return false;
   }

   if (replacement)
      nir_def_rewrite_uses(&intrin->def, replacement);

   nir_instr_remove(instr);
   nir_instr_free(instr);

   return true;
}

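/* Clamp the depth comparison value for shadow samplers of upgraded Z16/Z24
 * formats on GFX8-9; see the spec quote below.
 */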
static bool lower_tex(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_tex_instr *tex = nir_instr_as_tex(instr);
   const struct si_shader_selector *sel = s->shader->selector;
   enum amd_gfx_level gfx_level = sel->screen->info.gfx_level;

   b->cursor = nir_before_instr(instr);

   /* Section 8.23.1 (Depth Texture Comparison Mode) of the
    * OpenGL 4.5 spec says:
    *
    *    "If the texture’s internal format indicates a fixed-point
    *     depth texture, then D_t and D_ref are clamped to the
    *     range [0, 1]; otherwise no clamping is performed."
    *
    * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
    * so the depth comparison value isn't clamped for Z16 and
    * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
    * an explicitly clamped 32-bit float format.
    */

   /* LLVM keeps a non-uniform sampler as an index, so this can't be done in NIR. */
   if (tex->is_shadow && gfx_level >= GFX8 && gfx_level <= GFX9 && sel->info.base.use_aco_amd) {
      int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
      int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
      assert(samp_index >= 0 && comp_index >= 0);

      nir_def *sampler = tex->src[samp_index].src.ssa;
      nir_def *compare = tex->src[comp_index].src.ssa;
      /* Must have been lowered to descriptor. */
      assert(sampler->num_components > 1);

      nir_def *upgraded = nir_channel(b, sampler, 3);
      upgraded = nir_i2b(b, nir_ubfe_imm(b, upgraded, 29, 1));

      nir_def *clamped = nir_fsat(b, compare);
      compare = nir_bcsel(b, upgraded, clamped, compare);

      nir_src_rewrite(&tex->src[comp_index].src, compare);
      return true;
   }

   return false;
}

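/* Entry point: lower all radeonsi ABI intrinsics and tex instructions in the
 * shader's entrypoint. Returns true if any instruction was changed.
 */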
bool si_nir_lower_abi(nir_shader *nir, struct si_shader *shader, struct si_shader_args *args)
{
   struct lower_abi_state state = {
      .shader = shader,
      .args = args,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder b = nir_builder_create(impl);

   preload_reusable_variables(&b, &state);

   bool progress = false;
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_intrinsic)
            progress |= lower_intrinsic(&b, instr, &state);
         else if (instr->type == nir_instr_type_tex)
            progress |= lower_tex(&b, instr, &state);
      }
   }

   nir_metadata preserved = progress ?
      nir_metadata_control_flow :
      nir_metadata_all;
   nir_metadata_preserve(impl, preserved);

   return progress;
}