/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"

#include "ac_nir.h"
#include "si_pipe.h"
#include "si_query.h"
#include "si_state.h"
#include "si_shader_internal.h"

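/* NIR lowering of the radeonsi shader ABI: intrinsics that query driver or
 * hardware state are replaced with user SGPR argument loads, internal-binding
 * descriptor loads, or constants derived from the shader key.
 */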
struct lower_abi_state {
   struct si_shader *shader;
   struct si_shader_args *args;

   nir_def *esgs_ring;
   nir_def *tess_offchip_ring;
   nir_def *gsvs_ring[4];
};

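/* Unpack one bitfield from the vs_state_bits user SGPR, using the field's
 * *__SHIFT / *__MASK register definitions. */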
#define GET_FIELD_NIR(field) \
   ac_nir_unpack_arg(b, &args->ac, args->vs_state_bits, \
                     field##__SHIFT, util_bitcount(field##__MASK))

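/* Load num_components dwords of a descriptor from the internal-bindings
 * buffer; each binding slot is 16 bytes wide. */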
nir_def *si_nir_load_internal_binding(nir_builder *b, struct si_shader_args *args,
                                      unsigned slot, unsigned num_components)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->internal_bindings);
   return nir_load_smem_amd(b, num_components, addr, nir_imm_int(b, slot * 16));
}

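/* Build the attribute ring buffer descriptor: the base address comes from an
 * SGPR (the last blit SGPR for blit shaders), the remaining dwords from
 * ac_build_attr_ring_descriptor(). */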
static nir_def *build_attr_ring_desc(nir_builder *b, struct si_shader *shader,
                                     struct si_shader_args *args)
{
   struct si_shader_selector *sel = shader->selector;

   nir_def *attr_address =
      sel->stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd ?
      ac_nir_load_arg_at_offset(b, &args->ac, args->vs_blit_inputs,
                                sel->info.base.vs.blit_sgprs_amd - 1) :
      ac_nir_load_arg(b, &args->ac, args->gs_attr_address);

   unsigned stride = 16 * si_shader_num_alloc_param_exports(shader);
   uint32_t desc[4];

   ac_build_attr_ring_descriptor(sel->screen->info.gfx_level,
                                 (uint64_t)sel->screen->info.address32_hi << 32,
                                 0xffffffff, stride, desc);

   nir_def *comp[] = {
      attr_address,
      nir_imm_int(b, desc[1]),
      nir_imm_int(b, desc[2]),
      nir_imm_int(b, desc[3]),
   };

   return nir_vec(b, comp, 4);
}

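/* Emulate framebuffer fetch: read the current value of color buffer 0 with a
 * bindless image load, resolving the sample index through FMASK where FMASK
 * is in use (pre-GFX11). */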
static nir_def *
fetch_framebuffer(nir_builder *b, struct si_shader_args *args,
                  struct si_shader_selector *sel, union si_shader_key *key)
{
   /* Load the image descriptor. */
   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
   STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0_FMASK % 2 == 0);

   nir_def *zero = nir_imm_zero(b, 1, 32);
   nir_def *undef = nir_undef(b, 1, 32);

   unsigned chan = 0;
   nir_def *vec[4] = {undef, undef, undef, undef};

   vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 16);

   if (!key->ps.mono.fbfetch_is_1D)
      vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 16);

   /* Get the current render target layer index. */
   if (key->ps.mono.fbfetch_layered)
      vec[chan++] = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 16, 11);

   nir_def *coords = nir_vec(b, vec, 4);

   enum glsl_sampler_dim dim;
   if (key->ps.mono.fbfetch_msaa)
      dim = GLSL_SAMPLER_DIM_MS;
   else if (key->ps.mono.fbfetch_is_1D)
      dim = GLSL_SAMPLER_DIM_1D;
   else
      dim = GLSL_SAMPLER_DIM_2D;

   nir_def *sample_id;
   if (key->ps.mono.fbfetch_msaa) {
      sample_id = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary, 8, 4);

      if (sel->screen->info.gfx_level < GFX11 &&
          !(sel->screen->debug_flags & DBG(NO_FMASK))) {
         nir_def *desc =
            si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0_FMASK, 8);

         nir_def *fmask =
            nir_bindless_image_fragment_mask_load_amd(
               b, desc, coords,
               .image_dim = dim,
               .image_array = key->ps.mono.fbfetch_layered,
               .access = ACCESS_CAN_REORDER);

         nir_def *offset = nir_ishl_imm(b, sample_id, 2);
         /* 3 for EQAA handling, see lower_image_to_fragment_mask_load() */
         nir_def *width = nir_imm_int(b, 3);
         sample_id = nir_ubfe(b, fmask, offset, width);
      }
   } else {
      sample_id = zero;
   }

   nir_def *desc = si_nir_load_internal_binding(b, args, SI_PS_IMAGE_COLORBUF0, 8);

   return nir_bindless_image_load(b, 4, 32, desc, coords, sample_id, zero,
                                  .image_dim = dim,
                                  .image_array = key->ps.mono.fbfetch_layered,
                                  .access = ACCESS_CAN_REORDER);
}

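/* Build the tess offchip ring buffer descriptor from the address passed in
 * tes_offchip_addr. */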
static nir_def *build_tess_ring_desc(nir_builder *b, struct si_screen *screen,
                                     struct si_shader_args *args)
{
   nir_def *addr = ac_nir_load_arg(b, &args->ac, args->tes_offchip_addr);
   uint32_t desc[4];

   ac_build_raw_buffer_descriptor(screen->info.gfx_level,
                                  (uint64_t)screen->info.address32_hi << 32,
                                  0xffffffff, desc);

   nir_def *comp[4] = {
      addr,
      nir_imm_int(b, desc[1]),
      nir_imm_int(b, desc[2]),
      nir_imm_int(b, desc[3]),
   };

   return nir_vec(b, comp, 4);
}

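/* Return the ESGS ring descriptor. GS uses it as-is; the ES stage gets a copy
 * patched for swizzled, per-thread interleaved writes. */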
static nir_def *build_esgs_ring_desc(nir_builder *b, enum amd_gfx_level gfx_level,
                                     struct si_shader_args *args)
{
   nir_def *desc = si_nir_load_internal_binding(b, args, SI_RING_ESGS, 4);

   if (b->shader->info.stage == MESA_SHADER_GEOMETRY)
      return desc;

   nir_def *vec[4];
   for (int i = 0; i < 4; i++)
      vec[i] = nir_channel(b, desc, i);

   vec[1] = nir_ior_imm(b, vec[1], S_008F04_SWIZZLE_ENABLE_GFX6(1));
   vec[3] = nir_ior_imm(b, vec[3],
                        S_008F0C_ELEMENT_SIZE(1) |
                        S_008F0C_INDEX_STRIDE(3) |
                        S_008F0C_ADD_TID_ENABLE(1));

   /* If MUBUF && ADD_TID_ENABLE, DATA_FORMAT means STRIDE[14:17] on gfx8-9, so set 0. */
   if (gfx_level == GFX8)
      vec[3] = nir_iand_imm(b, vec[3], C_008F0C_DATA_FORMAT);

   return nir_vec(b, vec, 4);
}

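/* Precompute the GSVS ring descriptors used by the GS copy shader and by
 * legacy (non-NGG) GS, one descriptor per vertex stream with outputs. */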
static void build_gsvs_ring_desc(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   if (s->shader->is_gs_copy_shader) {
      s->gsvs_ring[0] = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 4);
   } else if (sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg) {
      nir_def *base_addr = si_nir_load_internal_binding(b, s->args, SI_RING_GSVS, 2);
      base_addr = nir_pack_64_2x32(b, base_addr);

      /* The conceptual layout of the GSVS ring is
       *   v0c0 .. vLc0 v0c1 .. vLc1 ..
       * but the real memory layout is swizzled across
       * threads:
       *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
       *   t16v0c0 ..
       * Override the buffer descriptor accordingly.
       */

      for (unsigned stream = 0; stream < 4; stream++) {
         unsigned num_components = sel->info.num_stream_output_components[stream];
         if (!num_components)
            continue;

         unsigned stride = 4 * num_components * sel->info.base.gs.vertices_out;
         /* Limit on the stride field for <= GFX7. */
         assert(stride < (1 << 14));

         unsigned num_records = s->shader->wave_size;

         const struct ac_buffer_state buffer_state = {
            .size = num_records,
            .format = PIPE_FORMAT_R32_FLOAT,
            .swizzle = {
               PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W,
            },
            .stride = stride,
            .swizzle_enable = true,
            .element_size = 1,
            .index_stride = 1,
            .add_tid = true,
            .gfx10_oob_select = V_008F0C_OOB_SELECT_DISABLED,
         };
         uint32_t tmp_desc[4];

         ac_build_buffer_descriptor(sel->screen->info.gfx_level, &buffer_state, tmp_desc);

         nir_def *desc[4];
         desc[0] = nir_unpack_64_2x32_split_x(b, base_addr);
         desc[1] = nir_ior_imm(b, nir_unpack_64_2x32_split_y(b, base_addr), tmp_desc[1]);
         desc[2] = nir_imm_int(b, tmp_desc[2]);
         desc[3] = nir_imm_int(b, tmp_desc[3]);

         s->gsvs_ring[stream] = nir_vec(b, desc, 4);

         /* next stream's desc addr */
         base_addr = nir_iadd_imm(b, base_addr, stride * num_records);
      }
   }
}

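/* Build ring descriptors once at the start of the entrypoint so that
 * lower_intrinsic() can reference them from any block. */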
static void preload_reusable_variables(nir_builder *b, struct lower_abi_state *s)
{
   const struct si_shader_selector *sel = s->shader->selector;
   const union si_shader_key *key = &s->shader->key;

   b->cursor = nir_before_impl(b->impl);

   if (sel->screen->info.gfx_level <= GFX8 && sel->stage <= MESA_SHADER_GEOMETRY &&
       (key->ge.as_es || sel->stage == MESA_SHADER_GEOMETRY)) {
      s->esgs_ring = build_esgs_ring_desc(b, sel->screen->info.gfx_level, s->args);
   }

   if (sel->stage == MESA_SHADER_TESS_CTRL || sel->stage == MESA_SHADER_TESS_EVAL)
      s->tess_offchip_ring = build_tess_ring_desc(b, sel->screen, s->args);

   build_gsvs_ring_desc(b, s);
}

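/* Number of vertices per output primitive: an immediate when the primitive
 * type is known at compile time, otherwise GS_STATE_OUTPRIM + 1 from the
 * state bits. */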
static nir_def *get_num_vertices_per_prim(nir_builder *b, struct lower_abi_state *s)
{
   struct si_shader_args *args = s->args;
   unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(s->shader);

   if (num_vertices)
      return nir_imm_int(b, num_vertices);
   else
      return nir_iadd_imm(b, GET_FIELD_NIR(GS_STATE_OUTPRIM), 1);
}

static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   struct si_shader *shader = s->shader;
   struct si_shader_args *args = s->args;
   struct si_shader_selector *sel = shader->selector;
   union si_shader_key *key = &shader->key;
   gl_shader_stage stage = sel->stage;

   b->cursor = nir_before_instr(instr);

   nir_def *replacement = NULL;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_first_vertex:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      break;
   case nir_intrinsic_load_base_vertex: {
      nir_def *indexed = GET_FIELD_NIR(VS_STATE_INDEXED);
      indexed = nir_i2b(b, indexed);

      nir_def *base_vertex = ac_nir_load_arg(b, &args->ac, args->ac.base_vertex);
      replacement = nir_bcsel(b, indexed, base_vertex, nir_imm_int(b, 0));
      break;
   }
   case nir_intrinsic_load_workgroup_size: {
      assert(sel->info.base.workgroup_size_variable && sel->info.uses_variable_block_size);

      nir_def *block_size = ac_nir_load_arg(b, &args->ac, args->block_size);
      nir_def *comp[] = {
         nir_ubfe_imm(b, block_size, 0, 10),
         nir_ubfe_imm(b, block_size, 10, 10),
         nir_ubfe_imm(b, block_size, 20, 10),
      };
      replacement = nir_vec(b, comp, 3);
      break;
   }
   case nir_intrinsic_load_tess_level_outer_default:
   case nir_intrinsic_load_tess_level_inner_default: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_HS_CONST_DEFAULT_TESS_LEVELS, 4);
      unsigned num_components = intrin->def.num_components;
      unsigned offset =
         intrin->intrinsic == nir_intrinsic_load_tess_level_inner_default ? 16 : 0;
      replacement = nir_load_ubo(b, num_components, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_patch_vertices_in:
      if (stage == MESA_SHADER_TESS_CTRL)
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 12, 5);
      else if (stage == MESA_SHADER_TESS_EVAL)
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
      else
         unreachable("no nir_load_patch_vertices_in");
      replacement = nir_iadd_imm(b, replacement, 1);
      break;
   case nir_intrinsic_load_sample_mask_in:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.sample_coverage);
      break;
   case nir_intrinsic_load_lshs_vertex_stride_amd:
      if (stage == MESA_SHADER_VERTEX) {
         replacement = nir_imm_int(b, si_shader_lshs_vertex_stride(shader));
      } else if (stage == MESA_SHADER_TESS_CTRL) {
         if (sel->screen->info.gfx_level >= GFX9 && shader->is_monolithic) {
            replacement = nir_imm_int(b, si_shader_lshs_vertex_stride(shader));
         } else {
            nir_def *num_ls_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 17, 6);
            nir_def *extra_dw = nir_bcsel(b, nir_ieq_imm(b, num_ls_out, 0), nir_imm_int(b, 0), nir_imm_int(b, 4));
            replacement = nir_iadd_nuw(b, nir_ishl_imm(b, num_ls_out, 4), extra_dw);
         }
      } else {
         unreachable("no nir_load_lshs_vertex_stride_amd");
      }
      break;
   case nir_intrinsic_load_esgs_vertex_stride_amd:
      assert(sel->screen->info.gfx_level >= GFX9);
      if (shader->is_monolithic) {
         replacement = nir_imm_int(b, key->ge.part.gs.es->info.esgs_vertex_stride / 4);
      } else {
         nir_def *num_es_outputs = GET_FIELD_NIR(GS_STATE_NUM_ES_OUTPUTS);
         replacement = nir_iadd_imm(b, nir_imul_imm(b, num_es_outputs, 4), 1);
      }
      break;
   case nir_intrinsic_load_tcs_num_patches_amd: {
      nir_def *tmp = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
      replacement = nir_iadd_imm(b, tmp, 1);
      break;
   }
   case nir_intrinsic_load_hs_out_patch_data_offset_amd: {
      nir_def *per_vtx_out_patch_size = NULL;

      if (stage == MESA_SHADER_TESS_CTRL) {
         const unsigned num_hs_out = util_last_bit64(sel->info.outputs_written_before_tes_gs);
         const unsigned out_vtx_size = num_hs_out * 16;
         const unsigned out_vtx_per_patch = sel->info.base.tess.tcs_vertices_out;
         per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
      } else {
         nir_def *num_hs_out = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 23, 6);
         nir_def *out_vtx_size = nir_ishl_imm(b, num_hs_out, 4);
         nir_def *o = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 7, 5);
         nir_def *out_vtx_per_patch = nir_iadd_imm_nuw(b, o, 1);
         per_vtx_out_patch_size = nir_imul(b, out_vtx_per_patch, out_vtx_size);
      }

      nir_def *p = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 0, 7);
      nir_def *num_patches = nir_iadd_imm_nuw(b, p, 1);
      replacement = nir_imul(b, per_vtx_out_patch_size, num_patches);
      break;
   }
   case nir_intrinsic_load_ring_tess_offchip_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.tess_offchip_offset);
      break;
   case nir_intrinsic_load_ring_es2gs_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.es2gs_offset);
      break;
   case nir_intrinsic_load_clip_half_line_width_amd: {
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      replacement = nir_load_smem_amd(b, 2, addr, nir_imm_int(b, 32));
      break;
   }
   case nir_intrinsic_load_viewport_xy_scale_and_offset: {
      bool prim_is_lines = key->ge.opt.ngg_culling & SI_NGG_CULL_LINES;
      nir_def *addr = ac_nir_load_arg(b, &args->ac, args->small_prim_cull_info);
      unsigned offset = prim_is_lines ? 16 : 0;
      replacement = nir_load_smem_amd(b, 4, addr, nir_imm_int(b, offset));
      break;
   }
   case nir_intrinsic_load_num_vertices_per_primitive_amd:
      replacement = get_num_vertices_per_prim(b, s);
      break;
   case nir_intrinsic_load_cull_ccw_amd:
      /* radeonsi embeds the cw/ccw info into the front/back face enable flags */
      replacement = nir_imm_false(b);
      break;
   case nir_intrinsic_load_cull_any_enabled_amd:
      replacement = nir_imm_bool(b, !!key->ge.opt.ngg_culling);
      break;
   case nir_intrinsic_load_cull_back_face_enabled_amd:
      replacement = nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_BACK_FACE);
      break;
   case nir_intrinsic_load_cull_front_face_enabled_amd:
      replacement = nir_imm_bool(b, key->ge.opt.ngg_culling & SI_NGG_CULL_FRONT_FACE);
      break;
   case nir_intrinsic_load_cull_small_prim_precision_amd: {
      nir_def *small_prim_precision =
         key->ge.opt.ngg_culling & SI_NGG_CULL_LINES ?
         GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION_NO_AA) :
         GET_FIELD_NIR(GS_STATE_SMALL_PRIM_PRECISION);

      /* Extract the small prim precision. */
      small_prim_precision = nir_ior_imm(b, small_prim_precision, 0x70);
      replacement = nir_ishl_imm(b, small_prim_precision, 23);
      break;
   }
   case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
      unsigned mask = SI_NGG_CULL_LINES | SI_NGG_CULL_SMALL_LINES_DIAMOND_EXIT;
      replacement = nir_imm_bool(b, (key->ge.opt.ngg_culling & mask) != SI_NGG_CULL_LINES);
      break;
   }
   case nir_intrinsic_load_provoking_vtx_in_prim_amd:
      replacement = nir_bcsel(b, nir_i2b(b, GET_FIELD_NIR(GS_STATE_PROVOKING_VTX_FIRST)),
                              nir_imm_int(b, 0),
                              nir_iadd_imm(b, get_num_vertices_per_prim(b, s), -1));
      break;
   case nir_intrinsic_load_pipeline_stat_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_PIPELINE_STATS_EMU));
      break;
   case nir_intrinsic_load_prim_gen_query_enabled_amd:
   case nir_intrinsic_load_prim_xfb_query_enabled_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(GS_STATE_STREAMOUT_QUERY_ENABLED));
      break;
   case nir_intrinsic_load_clamp_vertex_color_amd:
      replacement = nir_i2b(b, GET_FIELD_NIR(VS_STATE_CLAMP_VERTEX_COLOR));
      break;
   case nir_intrinsic_load_user_clip_plane: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_VS_CONST_CLIP_PLANES, 4);
      unsigned offset = nir_intrinsic_ucp_id(intrin) * 16;
      replacement = nir_load_ubo(b, 4, 32, buf, nir_imm_int(b, offset),
                                 .range = ~0);
      break;
   }
   case nir_intrinsic_load_streamout_buffer_amd: {
      unsigned slot = SI_VS_STREAMOUT_BUF0 + nir_intrinsic_base(intrin);
      replacement = si_nir_load_internal_binding(b, args, slot, 4);
      break;
   }
   case nir_intrinsic_load_xfb_state_address_gfx12_amd: {
      nir_def *address = si_nir_load_internal_binding(b, args, SI_STREAMOUT_STATE_BUF, 1);
      nir_def *address32_hi = nir_imm_int(b, s->shader->selector->screen->info.address32_hi);
      replacement = nir_pack_64_2x32_split(b, address, address32_hi);
      break;
   }
   case nir_intrinsic_atomic_add_gs_emit_prim_count_amd:
   case nir_intrinsic_atomic_add_shader_invocation_count_amd: {
      enum pipe_statistics_query_index index =
         intrin->intrinsic == nir_intrinsic_atomic_add_gs_emit_prim_count_amd ?
         PIPE_STAT_QUERY_GS_PRIMITIVES : PIPE_STAT_QUERY_GS_INVOCATIONS;

      /* GFX11 only needs to emulate PIPE_STAT_QUERY_GS_PRIMITIVES because GS culls,
       * which makes the pipeline statistic incorrect.
       */
      assert(sel->screen->info.gfx_level < GFX11 || index == PIPE_STAT_QUERY_GS_PRIMITIVES);

      nir_def *buf =
         si_nir_load_internal_binding(b, args, SI_GS_QUERY_EMULATED_COUNTERS_BUF, 4);
      unsigned offset = si_query_pipestat_end_dw_offset(sel->screen, index) * 4;

      nir_def *count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_atomic_add_gen_prim_count_amd:
   case nir_intrinsic_atomic_add_xfb_prim_count_amd: {
      nir_def *buf = si_nir_load_internal_binding(b, args, SI_GS_QUERY_BUF, 4);

      unsigned stream = nir_intrinsic_stream_id(intrin);
      unsigned offset = intrin->intrinsic == nir_intrinsic_atomic_add_gen_prim_count_amd ?
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].generated_primitives) :
         offsetof(struct gfx11_sh_query_buffer_mem, stream[stream].emitted_primitives);

      nir_def *prim_count = intrin->src[0].ssa;
      nir_ssbo_atomic(b, 32, buf, nir_imm_int(b, offset), prim_count,
                      .atomic_op = nir_atomic_op_iadd);
      break;
   }
   case nir_intrinsic_load_debug_log_desc_amd:
      replacement = si_nir_load_internal_binding(b, args, SI_RING_SHADER_LOG, 4);
      break;
   case nir_intrinsic_load_ring_attr_amd:
      replacement = build_attr_ring_desc(b, shader, args);
      break;
   case nir_intrinsic_load_ring_attr_offset_amd: {
      nir_def *offset = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_attr_offset, 0, 15);
      replacement = nir_ishl_imm(b, offset, 9);
      break;
   }
   case nir_intrinsic_load_ring_gs2vs_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs2vs_offset);
      break;
   case nir_intrinsic_load_streamout_config_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.streamout_config);
      break;
   case nir_intrinsic_load_streamout_write_index_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.streamout_write_index);
      break;
   case nir_intrinsic_load_streamout_offset_amd:
      replacement =
         ac_nir_load_arg(b, &args->ac, args->ac.streamout_offset[nir_intrinsic_base(intrin)]);
      break;
   case nir_intrinsic_load_force_vrs_rates_amd:
      if (sel->screen->info.gfx_level >= GFX11) {
         /* Bits [2:5] = VRS rate
          *
          * The range is [0, 15].
          *
          * If the hw doesn't support VRS 4x4, it will silently use 2x2 instead.
          */
         replacement = nir_imm_int(b, V_0283D0_VRS_SHADING_RATE_4X4 << 2);
      } else {
         /* Bits [2:3] = VRS rate X
          * Bits [4:5] = VRS rate Y
          *
          * The range is [-2, 1]. Values:
          *   1: 2x coarser shading rate in that direction.
          *   0: normal shading rate
          *  -1: 2x finer shading rate (sample shading, not directional)
          *  -2: 4x finer shading rate (sample shading, not directional)
          *
          * Sample shading can't go above 8 samples, so both numbers can't be -2
          * at the same time.
          */
         replacement = nir_imm_int(b, (1 << 2) | (1 << 4));
      }
      break;
   case nir_intrinsic_load_barycentric_at_sample: {
      unsigned mode = nir_intrinsic_interp_mode(intrin);

      if (key->ps.mono.interpolate_at_sample_force_center) {
         replacement = nir_load_barycentric_pixel(b, 32, .interp_mode = mode);
      } else {
         nir_def *sample_id = intrin->src[0].ssa;
         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
         nir_def *offset = nir_ishl_imm(b, sample_id, 3);

         nir_def *buf = si_nir_load_internal_binding(b, args, SI_PS_CONST_SAMPLE_POSITIONS, 4);
         nir_def *sample_pos = nir_load_ubo(b, 2, 32, buf, offset, .range = ~0);

         sample_pos = nir_fadd_imm(b, sample_pos, -0.5);

         replacement = nir_load_barycentric_at_offset(b, 32, sample_pos, .interp_mode = mode);
      }
      break;
   }
   case nir_intrinsic_load_output: {
      nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);

      /* not fbfetch */
      if (!(stage == MESA_SHADER_FRAGMENT && sem.fb_fetch_output))
         return false;

      /* Ignore src0, because KHR_blend_func_extended disallows multiple render targets. */

      replacement = fetch_framebuffer(b, args, sel, key);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_amd: {
      assert(s->tess_offchip_ring);
      nir_def *addr = nir_channel(b, s->tess_offchip_ring, 0);
      addr = nir_iadd_imm(b, addr, sel->screen->hs.tess_offchip_ring_size);
      replacement = nir_vector_insert_imm(b, s->tess_offchip_ring, addr, 0);
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_offset_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.tcs_factor_offset);
      break;
   case nir_intrinsic_load_alpha_reference_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->alpha_reference);
      break;
   case nir_intrinsic_load_front_face:
      if (!key->ps.opt.force_front_face_input)
         return false;
      replacement = nir_imm_bool(b, key->ps.opt.force_front_face_input == 1);
      break;
   case nir_intrinsic_load_barycentric_optimize_amd: {
      nir_def *prim_mask = ac_nir_load_arg(b, &args->ac, args->ac.prim_mask);
      /* enabled when bit 31 is set */
      replacement = nir_ilt_imm(b, prim_mask, 0);
      break;
   }
   case nir_intrinsic_load_layer_id:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.ancillary,
                                      16, sel->screen->info.gfx_level >= GFX12 ? 14 : 13);
      break;
   case nir_intrinsic_load_color0:
   case nir_intrinsic_load_color1: {
      uint32_t colors_read = sel->info.colors_read;

      int start, offset;
      if (intrin->intrinsic == nir_intrinsic_load_color0) {
         start = 0;
         offset = 0;
      } else {
         start = 4;
         offset = util_bitcount(colors_read & 0xf);
      }

      nir_def *color[4];
      for (int i = 0; i < 4; i++) {
         if (colors_read & BITFIELD_BIT(start + i)) {
            color[i] = ac_nir_load_arg_at_offset(b, &args->ac, args->color_start, offset++);

            nir_intrinsic_set_flags(nir_instr_as_intrinsic(color[i]->parent_instr),
                                    SI_VECTOR_ARG_IS_COLOR |
                                    SI_VECTOR_ARG_COLOR_COMPONENT(start + i));
         } else {
            color[i] = nir_undef(b, 1, 32);
         }
      }

      replacement = nir_vec(b, color, 4);
      break;
   }
   case nir_intrinsic_load_point_coord_maybe_flipped: {
      nir_def *interp_param =
         nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NONE);

      /* Load point coordinates (x, y) which are written by the hw after the interpolated inputs */
      replacement = nir_load_interpolated_input(b, 2, 32, interp_param, nir_imm_int(b, 0),
                                                .base = si_get_ps_num_interp(shader),
                                                .component = 2,
                                                /* This tells si_nir_scan_shader that it's PARAM_GEN */
                                                .io_semantics.no_varying = 1);
      break;
   }
   case nir_intrinsic_load_poly_line_smooth_enabled:
      replacement = nir_imm_bool(b, key->ps.mono.poly_line_smoothing);
      break;
   case nir_intrinsic_load_gs_vertex_offset_amd: {
      unsigned base = nir_intrinsic_base(intrin);
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[base]);
      break;
   }
   case nir_intrinsic_load_merged_wave_info_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.merged_wave_info);
      break;
   case nir_intrinsic_load_workgroup_num_input_vertices_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 12, 9);
      break;
   case nir_intrinsic_load_workgroup_num_input_primitives_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 22, 9);
      break;
   case nir_intrinsic_load_initial_edgeflags_amd:
      if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES ||
          (shader->selector->stage == MESA_SHADER_VERTEX &&
           shader->selector->info.base.vs.blit_sgprs_amd)) {
         /* Line primitives and blits don't need edge flags. */
         replacement = nir_imm_int(b, 0);
      } else if (shader->selector->stage == MESA_SHADER_VERTEX) {
         if (sel->screen->info.gfx_level >= GFX12) {
            replacement = nir_iand_imm(b, ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[0]),
                                       ac_get_all_edge_flag_bits(sel->screen->info.gfx_level));
         } else {
            /* Use the following trick to extract the edge flags:
             *   extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
             *   shifted = v_mul_u32_u24 extracted, 0x80402u   ; shift the bits: 8->9, 9->19, 10->29
             *   result = v_and_b32 shifted, 0x20080200        ; remove garbage
             */
            nir_def *tmp = ac_nir_load_arg(b, &args->ac, args->ac.gs_invocation_id);
            tmp = nir_iand_imm(b, tmp, 0x700);
            tmp = nir_imul_imm(b, tmp, 0x80402);
            replacement = nir_iand_imm(b, tmp, 0x20080200);
         }
      } else {
         /* Edge flags are always enabled when polygon mode is enabled, so we always have to
          * return valid edge flags if the primitive type is not lines and if we are not blitting
          * because the shader doesn't know when polygon mode is enabled.
          */
         replacement = nir_imm_int(b, ac_get_all_edge_flag_bits(sel->screen->info.gfx_level));
      }
      break;
   case nir_intrinsic_load_packed_passthrough_primitive_amd:
      replacement = ac_nir_load_arg(b, &args->ac, args->ac.gs_vtx_offset[0]);
      break;
   case nir_intrinsic_load_ordered_id_amd:
      replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.gs_tg_info, 0, 12);
      break;
   case nir_intrinsic_load_ring_esgs_amd:
      assert(s->esgs_ring);
      replacement = s->esgs_ring;
      break;
   case nir_intrinsic_load_tess_rel_patch_id_amd:
      /* LLVM needs to replace the patch id arg, so this has to be done in the LLVM backend. */
      if (!sel->info.base.use_aco_amd)
         return false;

      if (stage == MESA_SHADER_TESS_CTRL) {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->ac.tcs_rel_ids, 0, 8);
      } else {
         assert(stage == MESA_SHADER_TESS_EVAL);
         replacement = ac_nir_load_arg(b, &args->ac, args->ac.tes_rel_patch_id);
      }
      break;
   case nir_intrinsic_load_ring_tess_offchip_amd:
      assert(s->tess_offchip_ring);
      replacement = s->tess_offchip_ring;
      break;
   case nir_intrinsic_load_tcs_tess_levels_to_tes_amd:
      if (shader->is_monolithic) {
         replacement = nir_imm_bool(b, key->ge.opt.tes_reads_tess_factors);
      } else {
         replacement = nir_ine_imm(b, ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 31, 1), 0);
      }
      break;
   case nir_intrinsic_load_tcs_primitive_mode_amd:
      if (shader->is_monolithic) {
         replacement = nir_imm_int(b, key->ge.opt.tes_prim_mode);
      } else {
         replacement = ac_nir_unpack_arg(b, &args->ac, args->tcs_offchip_layout, 29, 2);
      }
      break;
   case nir_intrinsic_load_ring_gsvs_amd: {
      unsigned stream_id = nir_intrinsic_stream_id(intrin);
      /* Unused nir_load_ring_gsvs_amd may not be eliminated yet. */
      replacement = s->gsvs_ring[stream_id] ?
         s->gsvs_ring[stream_id] : nir_undef(b, 4, 32);
      break;
   }
   case nir_intrinsic_load_user_data_amd: {
      nir_def *low_vec4 = ac_nir_load_arg(b, &args->ac, args->cs_user_data[0]);
      replacement = nir_pad_vector(b, low_vec4, 8);

      if (args->cs_user_data[1].used && intrin->def.num_components > 4) {
         nir_def *high_vec4 = ac_nir_load_arg(b, &args->ac, args->cs_user_data[1]);
         for (unsigned i = 0; i < high_vec4->num_components; i++)
            replacement = nir_vector_insert_imm(b, replacement, nir_channel(b, high_vec4, i), 4 + i);
      }
      break;
   }
   default:
      return false;
   }

   if (replacement)
      nir_def_rewrite_uses(&intrin->def, replacement);

   nir_instr_remove(instr);
   nir_instr_free(instr);

   return true;
}

static bool lower_tex(nir_builder *b, nir_instr *instr, struct lower_abi_state *s)
{
   nir_tex_instr *tex = nir_instr_as_tex(instr);
   const struct si_shader_selector *sel = s->shader->selector;
   enum amd_gfx_level gfx_level = sel->screen->info.gfx_level;

   b->cursor = nir_before_instr(instr);

   /* Section 8.23.1 (Depth Texture Comparison Mode) of the
    * OpenGL 4.5 spec says:
    *
    *    "If the texture’s internal format indicates a fixed-point
    *     depth texture, then D_t and D_ref are clamped to the
    *     range [0, 1]; otherwise no clamping is performed."
    *
    * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
    * so the depth comparison value isn't clamped for Z16 and
    * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
    * an explicitly clamped 32-bit float format.
    */

   /* LLVM keeps non-uniform samplers as indices, so this can't be done in NIR. */
   if (tex->is_shadow && gfx_level >= GFX8 && gfx_level <= GFX9 && sel->info.base.use_aco_amd) {
      int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
      int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
      assert(samp_index >= 0 && comp_index >= 0);

      nir_def *sampler = tex->src[samp_index].src.ssa;
      nir_def *compare = tex->src[comp_index].src.ssa;
      /* Must have been lowered to descriptor. */
      assert(sampler->num_components > 1);

      nir_def *upgraded = nir_channel(b, sampler, 3);
      upgraded = nir_i2b(b, nir_ubfe_imm(b, upgraded, 29, 1));

      nir_def *clamped = nir_fsat(b, compare);
      compare = nir_bcsel(b, upgraded, clamped, compare);

      nir_src_rewrite(&tex->src[comp_index].src, compare);
      return true;
   }

   return false;
}

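/* Entry point: lower ABI intrinsics (and the tex depth-compare clamp
 * workaround) in the shader's entrypoint and report whether anything
 * changed. */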
bool si_nir_lower_abi(nir_shader *nir, struct si_shader *shader, struct si_shader_args *args)
{
   struct lower_abi_state state = {
      .shader = shader,
      .args = args,
   };

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder b = nir_builder_create(impl);

   preload_reusable_variables(&b, &state);

   bool progress = false;
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type == nir_instr_type_intrinsic)
            progress |= lower_intrinsic(&b, instr, &state);
         else if (instr->type == nir_instr_type_tex)
            progress |= lower_tex(&b, instr, &state);
      }
   }

   nir_metadata preserved = progress ?
      nir_metadata_control_flow :
      nir_metadata_all;
   nir_metadata_preserve(impl, preserved);

   return progress;
}