/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "nir_builder.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "ac_nir.h"
#include "aco_interface.h"

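/* Filter for nir_lower_alu_to_scalar: return false (i.e. keep the vector) for
 * 2-component 16-bit ALU ops that can execute as packed 16-bit math. With ACO
 * ("data" set), only ops that ACO supports as packed math stay vectorized.
 */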
bool si_alu_to_scalar_packed_math_filter(const nir_instr *instr, const void *data)
{
   if (instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      bool use_aco = (bool)data;

      if (alu->def.bit_size == 16 && alu->def.num_components == 2 &&
          (!use_aco || aco_nir_op_supports_packed_math_16bit(alu)))
         return false;
   }

   return true;
}

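/* Callback for nir_opt_vectorize: return the desired vector width per instruction.
 * 16-bit ALU ops that can map to packed (2x16) instructions get width 2, other
 * ALU ops get 1, and non-ALU instructions are left alone.
 */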
static uint8_t si_vectorize_callback(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);
   if (alu->def.bit_size != 16)
      return 1;

   bool use_aco = (bool)data;

   if (use_aco) {
      return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
   } else {
      switch (alu->op) {
      case nir_op_unpack_32_2x16_split_x:
      case nir_op_unpack_32_2x16_split_y:
      case nir_op_extract_i8:
      case nir_op_extract_u8:
      case nir_op_extract_i16:
      case nir_op_extract_u16:
      case nir_op_insert_u8:
      case nir_op_insert_u16:
         return 1;
      default:
         return 2;
      }
   }
}

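/* Callback for nir_lower_bit_size: widen sub-32-bit imul_high/umul_high to 32 bits. */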
static unsigned si_lower_bit_size_callback(const nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_imul_high:
   case nir_op_umul_high:
      if (alu->def.bit_size < 32)
         return 32;
      break;
   default:
      break;
   }

   return 0;
}

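/* Main NIR optimization loop, repeated until no pass reports progress.
 * When "first" is set, also run the array-variable splitting and copy-finding
 * passes, which only need to run on the initial NIR.
 */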
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool use_aco = sscreen->use_aco || nir->info.use_aco_amd;
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar,
               nir->options->lower_to_scalar_filter, (void *)use_aco);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_loop);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      /* nir_opt_if_optimize_phi_true_false is disabled on LLVM14 (#6976) */
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if,
               nir_opt_if_optimize_phi_true_false);
      NIR_PASS(progress, nir, nir_opt_dead_cf);

      if (lower_alu_to_scalar) {
         NIR_PASS_V(nir, nir_lower_alu_to_scalar,
                    nir->options->lower_to_scalar_filter, (void *)use_aco);
      }
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_lower_bit_size, si_lower_bit_size_callback, NULL);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_generate_bfi);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->info.has_packed_math_16bit)
         NIR_PASS(progress, nir, nir_opt_vectorize, si_vectorize_callback, (void *)use_aco);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

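/* Late optimizations: run nir_opt_algebraic_late plus cleanup passes until it
 * stops making progress.
 */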
void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);

      /* We should run this after constant folding for stages that support indirect
       * inputs/outputs.
       */
      if (nir->options->support_indirect_inputs & BITFIELD_BIT(nir->info.stage) ||
          nir->options->support_indirect_outputs & BITFIELD_BIT(nir->info.stage))
         NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize types of image_sample sources and destinations.
    *
    * The image_sample sources bit sizes are:
    *   nir_tex_src_coord:      a16 ? 16 : 32
    *   nir_tex_src_comparator: 32
    *   nir_tex_src_offset:     32
    *   nir_tex_src_bias:       a16 ? 16 : 32
    *   nir_tex_src_lod:        a16 ? 16 : 32
    *   nir_tex_src_min_lod:    a16 ? 16 : 32
    *   nir_tex_src_ms_index:   a16 ? 16 : 32
    *   nir_tex_src_ddx:        has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *   nir_tex_src_ddy:        has_g16 ? (g16 ? 16 : 32) : (a16 ? 16 : 32)
    *
    * We only use a16/g16 if all of the affected sources are 16bit.
    */
   bool has_g16 = sscreen->info.gfx_level >= GFX10;
   struct nir_opt_tex_srcs_options opt_srcs_options[] = {
      {
         .sampler_dims =
            ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
         .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) |
                      (1 << nir_tex_src_bias) | (1 << nir_tex_src_min_lod) |
                      (1 << nir_tex_src_ms_index) |
                      (has_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
      },
      {
         .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
         .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
      },
   };
   struct nir_opt_16bit_tex_image_options opt_16bit_options = {
      .rounding_mode = nir_rounding_mode_undef,
      .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
      .opt_image_dest_types = nir_type_float | nir_type_int | nir_type_uint,
      .integer_dest_saturates = true,
      .opt_image_store_data = true,
      .opt_image_srcs = true,
      .opt_srcs_options_count = has_g16 ? 2 : 1,
      .opt_srcs_options = opt_srcs_options,
   };
   bool changed = false;
   NIR_PASS(changed, nir, nir_opt_16bit_tex_image, &opt_16bit_options);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

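/* Lower sparse-residency intrinsics to plain ALU ops (a residency code of 0
 * means all texels are resident).
 */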
static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   case nir_intrinsic_sparse_residency_code_and:
      return nir_ior(b, intrin->src[0].ssa, intrin->src[1].ssa);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

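/* Lower mediump inputs/outputs (PNTC and the generic varyings) to 16 bits where
 * possible. VS inputs are excluded; see the TODO below.
 */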
void si_lower_mediump_io(nir_shader *nir)
{
   NIR_PASS_V(nir, nir_lower_mediump_io,
              /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
               * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
               */
              (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
              BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
              true);
}

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */
   const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txf_offset = true,
      .lower_txs_cube_array = true,
      .lower_invalid_implicit_lod = true,
      .lower_tg4_offsets = true,
      .lower_to_fragment_fetch_amd = sscreen->info.gfx_level < GFX11,
      .lower_1d = sscreen->info.gfx_level == GFX9,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
      .lower_to_fragment_mask_load_amd = sscreen->info.gfx_level < GFX11 &&
                                         !(sscreen->debug_flags & DBG(NO_FMASK)),
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

   NIR_PASS_V(nir, ac_nir_lower_sin_cos);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);

   /* si_nir_kill_outputs and ac_nir_optimize_outputs require outputs to be scalar. */
   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       nir->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      unsigned flags = nir_lower_gs_intrinsics_per_stream;
      if (sscreen->use_ngg) {
         flags |= nir_lower_gs_intrinsics_count_primitives |
                  nir_lower_gs_intrinsics_count_vertices_per_primitive |
                  nir_lower_gs_intrinsics_overwrite_incomplete;
      }

      NIR_PASS_V(nir, nir_lower_gs_intrinsics, flags);
   }

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      nir_lower_compute_system_values_options options = {0};

      /* gl_LocalInvocationIndex must be derived from gl_LocalInvocationID.xyz to make it correct
       * with quad derivatives. Using gl_SubgroupID for that (which is what we do by default) is
       * incorrect with a non-linear thread order.
       *
       * On Gfx12, we always use a non-linear thread order if the workgroup X and Y size is
       * divisible by 2.
       */
      options.lower_local_invocation_index =
         nir->info.derivative_group == DERIVATIVE_GROUP_QUADS ||
         (sscreen->info.gfx_level >= GFX12 &&
          nir->info.derivative_group == DERIVATIVE_GROUP_NONE &&
          (nir->info.workgroup_size_variable ||
           (nir->info.workgroup_size[0] % 2 == 0 && nir->info.workgroup_size[1] % 2 == 0)));
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);

      /* Gfx12 supports this in hw. */
      if (sscreen->info.gfx_level < GFX12 &&
          nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
         nir_opt_cse(nir); /* CSE load_local_invocation_id */
         memset(&options, 0, sizeof(options));
         options.shuffle_local_ids_for_quad_derivatives = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->info.gfx_level >= GFX9)
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);

   NIR_PASS_V(nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
}

static bool si_mark_divergent_texture_non_uniform(struct nir_shader *nir)
{
   assert(nir->info.divergence_analysis_run);

   /* sampler_non_uniform and texture_non_uniform are always false in GLSL,
    * but this can lead to unexpected behavior if the texture/sampler index comes from
    * a vertex attribute.
    *
    * For instance, 2 consecutive draws using 2 different index values
    * could be squashed together by the hw, producing a single draw with
    * a non-dynamically-uniform index.
    *
    * To avoid this, detect divergent indexing and mark it as non-uniform,
    * so that a waterfall loop can be applied to these indices later (either by the
    * LLVM backend or by nir_lower_non_uniform_access).
    *
    * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/2253
    */

   bool divergence_changed = false;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   nir_foreach_block_safe(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;

         nir_tex_instr *tex = nir_instr_as_tex(instr);
         for (int i = 0; i < tex->num_srcs; i++) {
            bool divergent = tex->src[i].src.ssa->divergent;

            switch (tex->src[i].src_type) {
            case nir_tex_src_texture_deref:
            case nir_tex_src_texture_handle:
               tex->texture_non_uniform |= divergent;
               break;
            case nir_tex_src_sampler_deref:
            case nir_tex_src_sampler_handle:
               tex->sampler_non_uniform |= divergent;
               break;
            default:
               break;
            }
         }

         /* If the dest is already divergent, divergence won't change. */
         divergence_changed |= !tex->def.divergent &&
                               (tex->texture_non_uniform || tex->sampler_non_uniform);
      }
   }

   nir_metadata_preserve(impl, nir_metadata_all);
   return divergence_changed;
}

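/* Screen finalize_nir hook: run the one-time lowering and analysis passes on
 * the NIR produced by the frontend. Returns NULL (no error string).
 */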
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   struct nir_shader *nir = (struct nir_shader *)nirptr;

   nir_lower_io_passes(nir, false);
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_shader_in | nir_var_shader_out, NULL);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS_V(nir, nir_lower_color_inputs);

   NIR_PASS_V(nir, ac_nir_lower_subdword_loads,
              (ac_nir_lower_subdword_options) {
                 .modes_1_comp = nir_var_mem_ubo,
                 .modes_N_comps = nir_var_mem_ubo | nir_var_mem_ssbo
              });
   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_texture_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   /* Update the xfb info after mediump IO lowering. */
   if (nir->xfb_info && nir->info.outputs_written_16bit)
      nir_gather_xfb_info_from_intrinsics(nir);

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Run this once before lcssa because the added phis may prevent this
    * pass from operating correctly.
    *
    * nir_opt_large_constants may use op_amul (see nir_build_deref_offset),
    * or may create unneeded code, so run si_nir_opts if needed.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   bool progress = false;
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
   if (progress)
      si_nir_opts(sscreen, nir, false);

   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   /* Must be after divergence analysis. */
   bool divergence_changed = false;
   NIR_PASS(divergence_changed, nir, si_mark_divergent_texture_non_uniform);
   /* Re-analyze the whole shader if texture instruction divergence changed. */
   if (divergence_changed)
      NIR_PASS_V(nir, nir_divergence_analysis);

   return NULL;
}