xref: /aosp_15_r20/external/mesa3d/src/amd/common/ac_nir_opt_outputs.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2021 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* This helps separate shaders because the next shader doesn't have to be known.
8  *
9  * It optimizes VS and TES outputs before FS as follows:
10  * - Eliminate and merge equal outputs, and treat undef as equal to everything, e.g.
11  *   (x,y,undef,undef) == (undef,y,z,undef) --> (x,y,z,undef) regardless of the interpolation
12  *   qualifier (AMD can map 1 output to multiple PS inputs and interpolate each differently).
13  * - Remove constant outputs that match AMD DEFAULT_VAL options, e.g. (0,0,0,1),
14  *   treat undef as whatever.
15  *
16  * It requires that there is no indirect indexing and all output stores must be scalar.
17  */
18 
19 #include "ac_nir.h"
20 #include "nir_builder.h"
21 
22 struct ac_chan_info {
23    nir_instr *value;
24    nir_intrinsic_instr *store_intr; /* The intrinsic writing the value. */
25 };
26 
27 struct ac_out_info {
28    unsigned base; /* nir_intrinsic_base */
29    nir_alu_type types;
30    bool duplicated;
31    bool constant;
32 
33    /* Channels 0-3 are 32-bit channels or low bits of 16-bit channels.
34     * Channels 4-7 are high bits of 16-bit channels.
35     */
36    struct ac_chan_info chan[8];
37 };
38 
ac_remove_varying(struct ac_out_info * out)39 static void ac_remove_varying(struct ac_out_info *out)
40 {
41    /* Remove the output. (all channels) */
42    for (unsigned i = 0; i < ARRAY_SIZE(out->chan); i++) {
43       if (out->chan[i].store_intr) {
44          nir_remove_varying(out->chan[i].store_intr, MESA_SHADER_FRAGMENT);
45          out->chan[i].store_intr = NULL;
46          out->chan[i].value = NULL;
47       }
48    }
49 }
50 
51 /* Return true if the output matches DEFAULT_VAL and has been eliminated. */
ac_eliminate_const_output(struct ac_out_info * out,gl_varying_slot semantic,uint8_t * param_export_index)52 static bool ac_eliminate_const_output(struct ac_out_info *out,
53                                       gl_varying_slot semantic,
54                                       uint8_t *param_export_index)
55 {
56    if (!(out->types & 32))
57       return false;
58 
59    bool is_zero[4] = {0}, is_one[4] = {0};
60 
61    for (unsigned i = 0; i < 4; i++) {
62       /* NULL means undef. */
63       if (!out->chan[i].value) {
64          is_zero[i] = true;
65          is_one[i] = true;
66       } else if (out->chan[i].value->type == nir_instr_type_load_const) {
67          if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 0)
68             is_zero[i] = true;
69          else if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 1)
70             is_one[i] = true;
71          else
72             return false; /* other constant */
73       } else
74          return false;
75    }
76 
77    /* Only certain combinations of 0 and 1 are supported. */
78    unsigned default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
79 
80    if (is_zero[0] && is_zero[1] && is_zero[2]) {
81       if (is_zero[3])
82          default_val = AC_EXP_PARAM_DEFAULT_VAL_0000;
83       else if (is_one[3])
84          default_val = AC_EXP_PARAM_DEFAULT_VAL_0001;
85       else
86          return false;
87    } else if (is_one[0] && is_one[1] && is_one[2]) {
88       if (is_zero[3])
89          default_val = AC_EXP_PARAM_DEFAULT_VAL_1110;
90       else if (is_one[3])
91          default_val = AC_EXP_PARAM_DEFAULT_VAL_1111;
92       else
93          return false;
94    } else {
95       return false;
96    }
97 
98    /* Change OFFSET to DEFAULT_VAL. */
99    param_export_index[semantic] = default_val;
100    out->constant = true;
101    ac_remove_varying(out);
102    return true;
103 }
104 
/* Try to fold the output in slot "current" into an equal output in a lower slot.
 *
 * Two outputs are considered equal when, for every channel, the current
 * output is undef, the earlier output is undef, both reference the same
 * instruction, or both are load_const with the same 32-bit pattern.
 * Channels that only the current output defines are copied into the earlier
 * output with a new store.
 *
 * \param outputs            gathered info, indexed by varying slot
 * \param outputs_optimized  bitset of the slots present in "outputs"
 * \param current            slot to eliminate
 * \param b                  builder used to emit the copied-back stores
 * \param slot_remap         on success, slot_remap[current] is set to the slot
 *                           that replaces "current"
 *
 * Return true if "current" was marked duplicated and its varying removed.
 */
static bool ac_eliminate_duplicated_output(struct ac_out_info *outputs,
                                           BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS),
                                           gl_varying_slot current, struct nir_builder *b,
                                           int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct ac_out_info *cur = &outputs[current];
   unsigned copy_back_channels = 0;
   unsigned p;

   /* Look for a match among the outputs in slots below "current". */
   BITSET_FOREACH_SET(p, outputs_optimized, current) {
      const struct ac_out_info *candidate = &outputs[p];

      /* Outputs that were already eliminated are not real outputs anymore. */
      if (candidate->constant || candidate->duplicated)
         continue;

      /* The types must match (only 16-bit and 32-bit types are allowed). */
      if ((candidate->types & 16) != (cur->types & 16))
         continue;

      /* Channels where the candidate is undef but the current output isn't. */
      unsigned undef_in_candidate = 0;
      unsigned ch;

      /* Walk all channels: 0-3 and the high 16-bit channels 4-7. */
      for (ch = 0; ch < ARRAY_SIZE(cur->chan); ch++) {
         nir_instr *old_val = candidate->chan[ch].value;
         nir_instr *new_val = cur->chan[ch].value;

         /* Undef in the current output matches anything. */
         if (!new_val)
            continue;

         /* Undef in the candidate can be filled in with the current value. */
         if (!old_val) {
            undef_in_candidate |= 1u << ch;
            continue;
         }

         if (old_val == new_val)
            continue;

         /* Different instructions only match if they load the same constant. */
         const bool same_const =
            old_val->type == nir_instr_type_load_const &&
            new_val->type == nir_instr_type_load_const &&
            nir_instr_as_load_const(old_val)->value[0].u32 ==
            nir_instr_as_load_const(new_val)->value[0].u32;

         if (!same_const)
            break;
      }

      /* The channel loop ran to completion, so nothing was different. */
      if (ch == ARRAY_SIZE(cur->chan)) {
         copy_back_channels = undef_in_candidate;
         break;
      }
   }

   /* The search ran past the last earlier slot without finding a match. */
   if (p == current)
      return false;

   /* An equal output already exists. Make FS use the existing one instead.
    * This effectively disables the current output and the param export shouldn't
    * be generated.
    */
   cur->duplicated = true;

   /* p is gl_varying_slot in addition to being an index into outputs. */
   slot_remap[current] = p;

   /* Where the matching earlier output is undef and the current one has a
    * proper value, move the value to the earlier output.
    */
   struct ac_out_info *prev = &outputs[p];

   while (copy_back_channels) {
      unsigned i = u_bit_scan(&copy_back_channels);
      nir_intrinsic_instr *cur_store = cur->chan[i].store_intr;
      nir_instr *cur_value = cur->chan[i].value;
      struct ac_chan_info *dst = &prev->chan[i];

      /* Emit the new store right after the store it is derived from. */
      b->cursor = nir_after_instr(&cur_store->instr);

      /* The earlier output has no store for this channel, so build one from
       * the properties of the current store.
       */
      nir_alu_type store_type = nir_intrinsic_src_type(cur_store);
      struct nir_io_semantics sem = nir_intrinsic_io_semantics(cur_store);
      struct nir_io_xfb moved_xfb = nir_intrinsic_io_xfb(cur_store);
      struct nir_io_xfb moved_xfb2 = nir_intrinsic_io_xfb2(cur_store);

      /* p is gl_varying_slot in addition to being an index into outputs. */
      sem.location = p;
      assert(sem.high_16bits == i / 4);

      /* For a sysval output (such as CLIPDIST) only the varying portion moves;
       * the system value output stays where it is. The new store is just the
       * varying portion.
       */
      sem.no_sysval_output = 1;

      /* Write just one component. */
      dst->store_intr = nir_store_output(b, nir_instr_def(cur_value),
                                         nir_imm_int(b, 0),
                                         .base = prev->base,
                                         .component = i % 4,
                                         .io_semantics = sem,
                                         .src_type = store_type,
                                         .write_mask = 0x1,
                                         .io_xfb = moved_xfb,
                                         .io_xfb2 = moved_xfb2);

      /* The channel was undef in the earlier output; record its new value. */
      assert(!dst->value);
      dst->value = cur_value;

      /* The transform feedback info moved along with the value, so clear it
       * on the current instruction. That instruction might not be removed if
       * it's a system value output.
       */
      static struct nir_io_xfb no_xfb;
      nir_intrinsic_set_io_xfb(cur_store, no_xfb);
      nir_intrinsic_set_io_xfb2(cur_store, no_xfb);
   }

   ac_remove_varying(cur);
   return true;
}
225 
/* Optimize the param outputs of a vertex or tessellation evaluation shader.
 *
 * All scalar store_output intrinsics are gathered per varying slot, then each
 * slot is either replaced by a DEFAULT_VAL constant or merged into an equal
 * lower slot when possible.
 *
 * \param nir                    shader to optimize; other stages are left untouched
 * \param sprite_tex_disallowed  when false, VARYING_SLOT_TEX0..7 are skipped
 * \param slot_remap             updated for slots merged into another slot
 * \param param_export_index     updated for slots replaced by DEFAULT_VAL
 *
 * Return true if any output was eliminated.
 */
bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS])
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   const bool stage_supported = nir->info.stage == MESA_SHADER_VERTEX ||
                                nir->info.stage == MESA_SHADER_TESS_EVAL;
   if (!stage_supported) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   struct ac_out_info outputs[NUM_TOTAL_VARYING_SLOTS] = { 0 };

   BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS);
   BITSET_ZERO(outputs_optimized);

   /* Pass 1: record every output store per slot and channel. */
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *store = nir_instr_as_intrinsic(instr);
         if (store->intrinsic != nir_intrinsic_store_output)
            continue;

         nir_io_semantics sem = nir_intrinsic_io_semantics(store);

         /* Only process varyings that appear as param exports. */
         if (!nir_slot_is_varying(sem.location) || sem.no_varying)
            continue;

         /* Texture coordinates can't be optimized if sprite_coord_enable can
          * override them.
          */
         const bool is_texcoord = sem.location >= VARYING_SLOT_TEX0 &&
                                  sem.location <= VARYING_SLOT_TEX7;
         if (is_texcoord && !sprite_tex_disallowed)
            continue;

         BITSET_SET(outputs_optimized, sem.location);

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(store);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         /* nir_lower_io_to_scalar is required before this */
         assert(store->src[0].ssa->num_components == 1);
         /* No intrinsic should store undef. */
         assert(store->src[0].ssa->parent_instr->type != nir_instr_type_undef);

         struct ac_out_info *info = &outputs[sem.location];

         /* The first store to a slot defines its base; later ones must agree. */
         if (!info->types)
            info->base = nir_intrinsic_base(store);
         else
            assert(info->base == nir_intrinsic_base(store));

         info->types |= nir_intrinsic_src_type(store);

         /* High 16-bit halves are kept in channels 4-7. */
         const unsigned chan = nir_intrinsic_component(store) + 4 * sem.high_16bits;
         info->chan[chan].value = store->src[0].ssa->parent_instr;
         info->chan[chan].store_intr = store;
      }
   }

   /* Pass 2: try to eliminate each gathered slot, constants first. */
   struct nir_builder b = nir_builder_create(impl);
   bool progress = false;
   unsigned slot;

   BITSET_FOREACH_SET(slot, outputs_optimized, NUM_TOTAL_VARYING_SLOTS) {
      if (ac_eliminate_const_output(&outputs[slot], slot, param_export_index) ||
          ac_eliminate_duplicated_output(outputs, outputs_optimized, slot, &b, slot_remap)) {
         progress = true;
      }
   }

   nir_metadata_preserve(impl, progress ? nir_metadata_control_flow : nir_metadata_all);
   return progress;
}
310