/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "ac_nir.h"
#include "aco_interface.h"

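/* Scalarization filter for nir_lower_alu_to_scalar (presumably installed as
 * nir_shader_compiler_options::lower_to_scalar_filter; si_nir_opts below passes
 * nir->options->lower_to_scalar_filter with use_aco as the callback data):
 * return false for 16-bit 2-component ALU ops so they can stay vectorized as
 * packed 16-bit math; with ACO, only for opcodes ACO supports as packed math.
 */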
bool si_alu_to_scalar_packed_math_filter(const nir_instr *instr, const void *data)
{
   if (instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      bool use_aco = (bool)data;

      if (alu->def.bit_size == 16 && alu->def.num_components == 2 &&
          (!use_aco || aco_nir_op_supports_packed_math_16bit(alu)))
         return false;
   }

   return true;
}

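/* Width callback for nir_opt_vectorize (used below in si_nir_opts as
 * "NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, (void *)use_aco)"):
 * allow vec2 for 16-bit ALU ops that map to packed 16-bit math, otherwise keep
 * the instruction scalar.
 */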
static uint8_t si_vectorize_callback(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (alu->def.bit_size != 16)
      return 1;

   bool use_aco = (bool)data;

   if (use_aco) {
      return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
   } else {
      switch (alu->op) {
      case nir_op_unpack_32_2x16_split_x:
      case nir_op_unpack_32_2x16_split_y:
      case nir_op_extract_i8:
      case nir_op_extract_u8:
      case nir_op_extract_i16:
      case nir_op_extract_u16:
      case nir_op_insert_u8:
      case nir_op_insert_u16:
         return 1;
      default:
         return 2;
      }
   }
}

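/* Callback for nir_lower_bit_size (see the call in si_nir_opts): the return
 * value is the bit size to lower the instruction to, 0 leaves it unchanged.
 * Promote sub-32-bit imul_high/umul_high to 32 bits.
 */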
static unsigned si_lower_bit_size_callback(const nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_imul_high:
   case nir_op_umul_high:
      if (alu->def.bit_size < 32)
         return 32;
      break;
   default:
      break;
   }

   return 0;
}

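/* Main NIR optimization loop: keep running the passes below until none of them
 * makes progress. "first" enables work that only needs to happen on the first
 * invocation for a shader (splitting/shrinking array variables and finding
 * array copies).
 */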
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool use_aco = sscreen->use_aco || nir->info.use_aco_amd;
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar,
               nir->options->lower_to_scalar_filter, (void *)use_aco);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_loop);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      /* nir_opt_if_optimize_phi_true_false is disabled on LLVM14 (#6976) */
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if,
               nir_opt_if_optimize_phi_true_false);
      NIR_PASS(progress, nir, nir_opt_dead_cf);

      if (lower_alu_to_scalar) {
         NIR_PASS_V(nir, nir_lower_alu_to_scalar,
                    nir->options->lower_to_scalar_filter, (void *)use_aco);
      }
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_lower_bit_size, si_lower_bit_size_callback, NULL);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_generate_bfi);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->info.has_packed_math_16bit)
         NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, (void *)use_aco);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

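/* Late optimizations: repeat late algebraic lowering followed by constant
 * folding, copy propagation, DCE and CSE until nir_opt_algebraic_late stops
 * making progress.
 */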
void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);

      /* We should run this after constant folding for stages that support indirect
       * inputs/outputs.
       */
      if (nir->options->support_indirect_inputs & BITFIELD_BIT(nir->info.stage) ||
          nir->options->support_indirect_outputs & BITFIELD_BIT(nir->info.stage))
         NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize types of image_sample sources and destinations.
    *
    * The image_sample sources bit sizes are:
    *   nir_tex_src_coord:       a16 ? 16 : 32
    *   nir_tex_src_comparator:  32
    *   nir_tex_src_offset:      32
    *   nir_tex_src_bias:        a16 ? 16 : 32
    *   nir_tex_src_lod:         a16 ? 16 : 32
    *   nir_tex_src_min_lod:     a16 ? 16 : 32
    *   nir_tex_src_ms_index:    a16 ? 16 : 32
    *   nir_tex_src_ddx:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *   nir_tex_src_ddy:         has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *
    * We only use a16/g16 if all of the affected sources are 16bit.
    */
   bool has_g16 = sscreen->info.gfx_level >= GFX10;
   struct nir_opt_tex_srcs_options opt_srcs_options[] = {
      {
         .sampler_dims =
            ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
                      (1 << nir_tex_src_ms_index) |
                      (has_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
      },
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
   };
   struct nir_opt_16bit_tex_image_options opt_16bit_options = {
      .rounding_mode = nir_rounding_mode_undef,
      .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
      .opt_image_dest_types = nir_type_float | nir_type_int | nir_type_uint,
      .integer_dest_saturates = true,
      .opt_image_store_data = true,
      .opt_image_srcs = true,
      .opt_srcs_options_count = has_g16 ? 2 : 1,
      .opt_srcs_options = opt_srcs_options,
   };
   bool changed = false;
   NIR_PASS(changed, nir, nir_opt_16bit_tex_image, &opt_16bit_options);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

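/* Lower sparse-residency helper intrinsics: is_sparse_texels_resident becomes
 * a compare of the residency code with 0 (0 means the texels are resident),
 * and sparse_residency_code_and becomes a bitwise OR of the two codes.
 */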
static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   case nir_intrinsic_sparse_residency_code_and:
      return nir_ior(b, intrin->src[0].ssa, intrin->src[1].ssa);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

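/* Lower mediump IO: demote shader inputs (except VS inputs, see the TODO
 * below) and all outputs in VARYING_SLOT_PNTC and VAR0..VAR31 to 16 bits.
 */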
void si_lower_mediump_io(nir_shader *nir)
{
   NIR_PASS_V(nir, nir_lower_mediump_io,
              /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
               * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
               */
              (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
              BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
              true);
}

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */
   const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txf_offset = true,
      .lower_txs_cube_array = true,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
      .lower_to_fragment_fetch_amd = sscreen->info.gfx_level < GFX11,
      .lower_1d = sscreen->info.gfx_level == GFX9,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
      .lower_to_fragment_mask_load_amd = sscreen->info.gfx_level < GFX11 &&
                                         !(sscreen->debug_flags & DBG(NO_FMASK)),
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

   NIR_PASS_V(nir, ac_nir_lower_sin_cos);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);

   /* si_nir_kill_outputs and ac_nir_optimize_outputs require outputs to be scalar. */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      unsigned flags = nir_lower_gs_intrinsics_per_stream;
      if (sscreen->use_ngg) {
         flags |= nir_lower_gs_intrinsics_count_primitives |
            nir_lower_gs_intrinsics_count_vertices_per_primitive |
            nir_lower_gs_intrinsics_overwrite_incomplete;
      }

      NIR_PASS_V(nir, nir_lower_gs_intrinsics, flags);
   }

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      nir_lower_compute_system_values_options options = {0};

      /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
       * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
       * incorrect with a non-linear thread order.
       *
       * On Gfx12, we always use a non-linear thread order if the workgroup X and Y size is
       * divisible by 2.
       */
      options.lower_local_invocation_index =
         nir->info.derivative_group == DERIVATIVE_GROUP_QUADS ||
         (sscreen->info.gfx_level >= GFX12 &&
          nir->info.derivative_group == DERIVATIVE_GROUP_NONE &&
          (nir->info.workgroup_size_variable ||
           (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0)));
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);

      /* Gfx12 supports this in hw. */
      if (sscreen->info.gfx_level < GFX12 &&
          nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_opt_cse(nir); /* CSE load_local_invocation_id */
         memset(&options, 0, sizeof(options));
         options.shuffle_local_ids_for_quad_derivatives = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->info.gfx_level >= GFX9)
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);

   NIR_PASS_V(nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
}

static bool si_mark_divergent_texture_non_uniform(struct nir_shader *nir)
{
   assert(nir->info.divergence_analysis_run);

   /* sampler_non_uniform and texture_non_uniform are always false in GLSL,
    * but this can lead to unexpected behavior if the texture/sampler index
    * comes from a vertex attribute.
    *
    * For instance, 2 consecutive draws using 2 different index values could be
    * squashed together by the hw, producing a single draw with a
    * non-dynamically-uniform index.
    *
    * To avoid this, detect divergent indexing and mark it as non-uniform, so
    * that a waterfall loop can be applied to these indices later (either by the
    * LLVM backend or by nir_lower_non_uniform_access).
    *
    * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/2253
    */

   bool divergence_changed = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;

         nir_tex_instr *tex = nir_instr_as_tex(instr);
         for (int i = 0; i < tex->num_srcs; i++) {
            bool divergent = tex->src[i].src.ssa->divergent;

            switch (tex->src[i].src_type) {
            case nir_tex_src_texture_deref:
            case nir_tex_src_texture_handle:
               tex->texture_non_uniform |= divergent;
               break;
            case nir_tex_src_sampler_deref:
            case nir_tex_src_sampler_handle:
               tex->sampler_non_uniform |= divergent;
               break;
            default:
               break;
            }
         }

         /* If dest is already divergent, divergence won't change. */
         divergence_changed |= !tex->def.divergent &&
            (tex->texture_non_uniform || tex->sampler_non_uniform);
      }
   }

   nir_metadata_preserve(impl, nir_metadata_all);
   return divergence_changed;
}

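/* Entry point used as the gallium pipe_screen::finalize_nir hook: lower IO to
 * intrinsics, drop dead variables and already-lowered uniforms, run
 * si_lower_nir plus the optimization loops, and run divergence analysis for
 * the later non-uniform lowering. Returns NULL (no error string).
 */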
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   struct nir_shader *nir = (struct nir_shader *)nirptr;

   nir_lower_io_passes(nir, false);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | nir_var_shader_out, NULL);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS_V(nir, nir_lower_color_inputs);

   NIR_PASS_V(nir, ac_nir_lower_subdword_loads,
              (ac_nir_lower_subdword_options) {
                 .modes_1_comp = nir_var_mem_ubo,
                 .modes_N_comps = nir_var_mem_ubo | nir_var_mem_ssbo
              });
   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_texture_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Update xfb info after mediump IO lowering. */
   if (nir->xfb_info && nir->info.outputs_written_16bit)
      nir_gather_xfb_info_from_intrinsics(nir);

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Run this once before lcssa because the added phis may prevent this
    * pass from operating correctly.
    *
    * nir_opt_large_constants may use op_amul (see nir_build_deref_offset),
    * or may create unneeded code, so run si_nir_opts if needed.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   bool progress = false;
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
   if (progress)
      si_nir_opts(sscreen, nir, false);

   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   /* Must be after divergence analysis. */
   bool divergence_changed = false;
   NIR_PASS(divergence_changed, nir, si_mark_divergent_texture_non_uniform);
   /* Re-analyze the whole shader if texture-instruction divergence changed. */
   if (divergence_changed)
      NIR_PASS_V(nir, nir_divergence_analysis);

   return NULL;
}