/*
 * Copyright 2023 Pavel Ondračka <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#include "r300_nir.h"

#include "compiler/nir/nir_builder.h"
#include "r300_screen.h"

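/* Check whether the result of this ALU instruction is only ever consumed
 * as a float. Moves, vector constructions and conditional selects just
 * pass the value through and are followed recursively; any use as an int
 * or bool typed source disqualifies the value.
 */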
bool
r300_is_only_used_as_float(const nir_alu_instr *instr)
{
   nir_foreach_use(src, &instr->def) {
      if (nir_src_is_if(src))
         return false;

      nir_instr *user_instr = nir_src_parent_instr(src);
      if (user_instr->type == nir_instr_type_alu) {
         nir_alu_instr *alu = nir_instr_as_alu(user_instr);
         switch (alu->op) {
         case nir_op_mov:
         case nir_op_vec2:
         case nir_op_vec3:
         case nir_op_vec4:
         case nir_op_bcsel:
         case nir_op_b32csel:
            if (!r300_is_only_used_as_float(alu))
               return false;
            break;
         default:
            break;
         }

         const nir_op_info *info = &nir_op_infos[alu->op];
         nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
         int src_idx = alu_src - &alu->src[0];
         if ((info->input_types[src_idx] & nir_type_int) ||
             (info->input_types[src_idx] & nir_type_bool))
            return false;
      }
   }
   return true;
}

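/* Callback for nir_opt_vectorize: return the maximum vector width to build
 * for this instruction (4 for this hardware), or 0 to keep it scalar.
 */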
static unsigned char
r300_should_vectorize_instr(const nir_instr *instr, const void *data)
{
   bool *too_many_ubos = (bool *) data;

   if (instr->type != nir_instr_type_alu)
      return 0;

   /* Vectorization can make the constant layout worse and increase
    * the constant register usage. The worst scenario is vectorization
    * of lowered indirect register access, where we access the i-th element
    * and later access i-1 or i+1 (most notably glamor and gsk shaders).
    * In this case we already added constants 1..n, where n is the array
    * size, and we can reuse them unless the lowered ladder gets
    * vectorized later.
    *
    * Thus prevent vectorization of the specific patterns from lowered
    * indirect access.
    *
    * This is quite a heavy hammer; we could in theory estimate how many
    * slots the current UBOs and constants will need and only disable
    * vectorization when we are close to the limit. However, that would
    * likely need a global shader analysis each time r300_should_vectorize_instr
    * is called, which we want to avoid.
    *
    * So for now just don't vectorize anything that loads constants.
    */
   if (*too_many_ubos) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);
      unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
      for (unsigned i = 0; i < num_srcs; i++) {
         if (nir_src_is_const(alu->src[i].src)) {
            return 0;
         }
      }
   }

   return 4;
}

/* R300 and R400 have just 32 vec4 constant register slots in the fs.
 * Therefore, while it's possible we will be able to compact some of
 * the constants later, we need to be extra careful with adding
 * new constants anyway.
 */
static bool have_too_many_ubos(nir_shader *s, bool is_r500)
{
   if (s->info.stage != MESA_SHADER_FRAGMENT)
      return false;

   if (is_r500)
      return false;

   nir_foreach_variable_with_modes(var, s, nir_var_mem_ubo) {
      int ubo = var->data.driver_location;
      assert(ubo == 0);

      unsigned size = glsl_get_explicit_size(var->interface_type, false);
      if (DIV_ROUND_UP(size, 16) > 32)
         return true;
   }

   return false;
}

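/* Mark UBO loads as speculatable, so that passes which flatten control flow
 * (nir_opt_peephole_select below) are allowed to hoist them out of branches.
 */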
static bool
set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
{
   if (intr->intrinsic == nir_intrinsic_load_ubo_vec4) {
      nir_intrinsic_set_access(intr, nir_intrinsic_access(intr) | ACCESS_CAN_SPECULATE);
      return true;
   }
   return false;
}

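/* The r300 optimization loop: run the common NIR cleanup passes together
 * with the r300-specific lowerings until nothing makes progress anymore.
 */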
static void
r300_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
{
   bool is_r500 = r300_screen(screen)->caps.is_r500;

   bool progress;
   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      if (is_r500) {
         NIR_PASS_V(s, r300_transform_fs_trig_input);
      }
   } else {
      if (r300_screen(screen)->caps.has_tcl) {
         if (r300_screen(screen)->caps.is_r500) {
            /* Only nine should set both the TTN shader name and
             * use_legacy_math_rules, and D3D9 already mandates
             * the proper range for the trigonometric inputs.
             */
            if (!s->info.use_legacy_math_rules || !(s->info.name && !strcmp("TTN", s->info.name))) {
               NIR_PASS_V(s, r300_transform_vs_trig_input);
            }
         } else {
            if (r300_screen(screen)->caps.is_r400) {
               NIR_PASS_V(s, r300_transform_vs_trig_input);
            }
         }
      }
   }

   do {
      progress = false;

      NIR_PASS_V(s, nir_lower_vars_to_ssa);

      NIR_PASS(progress, s, nir_copy_prop);
      NIR_PASS(progress, s, r300_nir_lower_flrp);
      NIR_PASS(progress, s, nir_opt_algebraic);
      if (s->info.stage == MESA_SHADER_VERTEX) {
         if (!is_r500)
            NIR_PASS(progress, s, r300_nir_lower_bool_to_float);
         NIR_PASS(progress, s, r300_nir_fuse_fround_d3d9);
      }
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_remove_phis);
      NIR_PASS(progress, s, nir_opt_conditional_discard);
      NIR_PASS(progress, s, nir_opt_dce);
      NIR_PASS(progress, s, nir_opt_dead_cf);
      NIR_PASS(progress, s, nir_opt_cse);
      NIR_PASS(progress, s, nir_opt_find_array_copies);
      NIR_PASS(progress, s, nir_opt_copy_prop_vars);
      NIR_PASS(progress, s, nir_opt_dead_write_vars);

      NIR_PASS(progress, s, nir_opt_if, nir_opt_if_optimize_phi_true_false);
      if (is_r500)
         nir_shader_intrinsics_pass(s, set_speculate,
                                    nir_metadata_control_flow, NULL);
      NIR_PASS(progress, s, nir_opt_peephole_select, is_r500 ? 8 : ~0, true, true);
      if (s->info.stage == MESA_SHADER_FRAGMENT) {
         NIR_PASS(progress, s, r300_nir_lower_bool_to_float_fs);
      }
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_shrink_stores, true);
      NIR_PASS(progress, s, nir_opt_shrink_vectors, false);
      NIR_PASS(progress, s, nir_opt_loop);

      bool too_many_ubos = have_too_many_ubos(s, is_r500);
      NIR_PASS(progress, s, nir_opt_vectorize, r300_should_vectorize_instr,
               &too_many_ubos);
      NIR_PASS(progress, s, nir_opt_undef);
      if (!progress)
         NIR_PASS(progress, s, nir_lower_undef_to_zero);
      NIR_PASS(progress, s, nir_opt_loop_unroll);

      /* Try to fold addressing math into ubo_vec4's base to avoid load_consts
       * and ALU ops for it.
       */
      nir_opt_offsets_options offset_options = {
         .ubo_vec4_max = 255,

         /* No const offset in TGSI for shared accesses. */
         .shared_max = 0,

         /* unused intrinsics */
         .uniform_max = 0,
         .buffer_max = 0,
      };

      NIR_PASS(progress, s, nir_opt_offsets, &offset_options);
   } while (progress);

   NIR_PASS_V(s, nir_lower_var_copies);
   NIR_PASS(progress, s, nir_remove_dead_variables, nir_var_function_temp,
            NULL);
}

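/* R300 and R400 cannot execute control flow, so after flattening and loop
 * unrolling the entry point must be a single basic block. Returns an error
 * message if some other CF node remains, NULL otherwise.
 */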
static char *r300_check_control_flow(nir_shader *s)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(s);
   nir_block *first = nir_start_block(impl);
   nir_cf_node *next = nir_cf_node_next(&first->cf_node);

   if (next) {
      switch (next->type) {
         case nir_cf_node_if:
            return "If/then statements not supported by R300/R400 shaders, should have been flattened by peephole_select.";
         case nir_cf_node_loop:
            return "Looping not supported by R300/R400 shaders, all loops must be statically unrollable.";
         default:
            return "Unknown control flow type";
      }
   }

   return NULL;
}

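/* Final shader cleanup before handing the NIR to the backend: optimize,
 * drop uniforms that occupy constant storage (samplers and images are kept
 * for YUV variant lowering), and reject shaders with control flow that the
 * hardware cannot execute. Returns an error string on failure, NULL on
 * success.
 */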
char *
r300_finalize_nir(struct pipe_screen *pscreen, void *nir)
{
   nir_shader *s = nir;

   r300_optimize_nir(s, pscreen);

   /* st_program.c's parameter list optimization requires that future nir
    * variants don't reallocate the uniform storage, so we have to remove
    * uniforms that occupy storage.  But we don't want to remove samplers,
    * because they're needed for YUV variant lowering.
    */
   nir_remove_dead_derefs(s);
   nir_foreach_uniform_variable_safe(var, s) {
      if (var->data.mode == nir_var_uniform &&
          (glsl_type_get_image_count(var->type) ||
           glsl_type_get_sampler_count(var->type)))
         continue;

      exec_node_remove(&var->node);
   }
   nir_validate_shader(s, "after uniform var removal");

   nir_sweep(s);

   if (!r300_screen(pscreen)->caps.is_r500 &&
       (r300_screen(pscreen)->caps.has_tcl || s->info.stage == MESA_SHADER_FRAGMENT)) {
      char *msg = r300_check_control_flow(s);
      if (msg)
         return strdup(msg);
   }

   return NULL;
}