1 /*
2 * Copyright © 2021 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
/* This pass helps separate shaders because the next shader doesn't have to be known.
 *
 * It optimizes VS and TES outputs before FS as follows:
 * - Eliminate and merge equal outputs, and treat undef as equal to everything, e.g.
 *   (x,y,undef,undef) == (undef,y,z,undef) --> (x,y,z,undef) regardless of the interpolation
 *   qualifier (AMD can map 1 output to multiple PS inputs and interpolate each differently).
 * - Remove constant outputs that match AMD DEFAULT_VAL options, e.g. (0,0,0,1),
 *   treating undef as matching any value.
 *
 * It requires that there is no indirect indexing and all output stores must be scalar.
 */
18
19 #include "ac_nir.h"
20 #include "nir_builder.h"
21
22 struct ac_chan_info {
23 nir_instr *value;
24 nir_intrinsic_instr *store_intr; /* The intrinsic writing the value. */
25 };
26
27 struct ac_out_info {
28 unsigned base; /* nir_intrinsic_base */
29 nir_alu_type types;
30 bool duplicated;
31 bool constant;
32
33 /* Channels 0-3 are 32-bit channels or low bits of 16-bit channels.
34 * Channels 4-7 are high bits of 16-bit channels.
35 */
36 struct ac_chan_info chan[8];
37 };
38
ac_remove_varying(struct ac_out_info * out)39 static void ac_remove_varying(struct ac_out_info *out)
40 {
41 /* Remove the output. (all channels) */
42 for (unsigned i = 0; i < ARRAY_SIZE(out->chan); i++) {
43 if (out->chan[i].store_intr) {
44 nir_remove_varying(out->chan[i].store_intr, MESA_SHADER_FRAGMENT);
45 out->chan[i].store_intr = NULL;
46 out->chan[i].value = NULL;
47 }
48 }
49 }
50
51 /* Return true if the output matches DEFAULT_VAL and has been eliminated. */
ac_eliminate_const_output(struct ac_out_info * out,gl_varying_slot semantic,uint8_t * param_export_index)52 static bool ac_eliminate_const_output(struct ac_out_info *out,
53 gl_varying_slot semantic,
54 uint8_t *param_export_index)
55 {
56 if (!(out->types & 32))
57 return false;
58
59 bool is_zero[4] = {0}, is_one[4] = {0};
60
61 for (unsigned i = 0; i < 4; i++) {
62 /* NULL means undef. */
63 if (!out->chan[i].value) {
64 is_zero[i] = true;
65 is_one[i] = true;
66 } else if (out->chan[i].value->type == nir_instr_type_load_const) {
67 if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 0)
68 is_zero[i] = true;
69 else if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 1)
70 is_one[i] = true;
71 else
72 return false; /* other constant */
73 } else
74 return false;
75 }
76
77 /* Only certain combinations of 0 and 1 are supported. */
78 unsigned default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
79
80 if (is_zero[0] && is_zero[1] && is_zero[2]) {
81 if (is_zero[3])
82 default_val = AC_EXP_PARAM_DEFAULT_VAL_0000;
83 else if (is_one[3])
84 default_val = AC_EXP_PARAM_DEFAULT_VAL_0001;
85 else
86 return false;
87 } else if (is_one[0] && is_one[1] && is_one[2]) {
88 if (is_zero[3])
89 default_val = AC_EXP_PARAM_DEFAULT_VAL_1110;
90 else if (is_one[3])
91 default_val = AC_EXP_PARAM_DEFAULT_VAL_1111;
92 else
93 return false;
94 } else {
95 return false;
96 }
97
98 /* Change OFFSET to DEFAULT_VAL. */
99 param_export_index[semantic] = default_val;
100 out->constant = true;
101 ac_remove_varying(out);
102 return true;
103 }
104
/* Try to eliminate the output in slot "current" by merging it into an equal
 * output in an earlier slot.
 *
 * The current output matches an earlier one if both have the same 16-bit-ness
 * and, for each of the 8 channels, one of the following holds:
 * - the current channel is undef,
 * - the earlier channel is undef (the current value is then moved into it),
 * - both channels are produced by the same instruction,
 * - both channels are load_const instructions with equal value[0].u32.
 *
 * outputs           - per-slot output info, indexed by gl_varying_slot
 * outputs_optimized - bitset of the slots that have gathered info
 * current           - the slot to try to eliminate
 * b                 - builder used to emit stores for channels moved to the earlier slot
 * slot_remap        - on success, slot_remap[current] is set to the earlier slot
 *
 * Return true if a matching earlier output was found and the current output
 * has been eliminated.
 */
static bool ac_eliminate_duplicated_output(struct ac_out_info *outputs,
                                           BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS),
                                           gl_varying_slot current, struct nir_builder *b,
                                           int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct ac_out_info *cur = &outputs[current];
   unsigned p, copy_back_channels = 0;

   /* Check all outputs before current. */
   BITSET_FOREACH_SET(p, outputs_optimized, current) {
      struct ac_out_info *prev = &outputs[p];

      /* Only compare with real outputs. */
      if (prev->constant || prev->duplicated)
         continue;

      /* The types must match (only 16-bit and 32-bit types are allowed). */
      if ((prev->types & 16) != (cur->types & 16))
         continue;

      bool different = false;

      /* Iterate over all channels, including the high 16-bit channels (4-7). */
      for (unsigned j = 0; j < 8; j++) {
         nir_instr *prev_chan = prev->chan[j].value;
         nir_instr *cur_chan = cur->chan[j].value;

         /* Treat undef as a match. */
         if (!cur_chan)
            continue;

         /* If prev is undef but cur isn't, we can merge the outputs
          * and consider the output duplicated.
          */
         if (!prev_chan) {
            copy_back_channels |= 1u << j;
            continue;
         }

         /* Test whether the values are different. Two distinct instructions
          * are still equal if both are constants with the same bits.
          */
         if (prev_chan != cur_chan &&
             (prev_chan->type != nir_instr_type_load_const ||
              cur_chan->type != nir_instr_type_load_const ||
              nir_instr_as_load_const(prev_chan)->value[0].u32 !=
              nir_instr_as_load_const(cur_chan)->value[0].u32)) {
            different = true;
            break;
         }
      }
      if (!different)
         break;

      /* This candidate didn't match; discard the channels recorded for it. */
      copy_back_channels = 0;
   }

   /* The code relies on p == current meaning that the scan above was not
    * exited early, i.e. no matching output was found.
    */
   if (p == current)
      return false;

   /* An equal output already exists. Make FS use the existing one instead.
    * This effectively disables the current output and the param export shouldn't
    * be generated.
    */
   cur->duplicated = true;

   /* p is gl_varying_slot in addition to being an index into outputs. */
   slot_remap[current] = p;

   /* If the matching preceding output has undef where the current one has a proper value,
    * move the value to the preceding output.
    */
   struct ac_out_info *prev = &outputs[p];

   while (copy_back_channels) {
      unsigned i = u_bit_scan(&copy_back_channels);
      struct ac_chan_info *prev_chan = &prev->chan[i];
      struct ac_chan_info *cur_chan = &cur->chan[i];

      /* Emit the new store right after the one it replaces. */
      b->cursor = nir_after_instr(&cur_chan->store_intr->instr);

      /* The store intrinsic doesn't exist for this channel in the preceding
       * output. Create a new one from the properties of the current store.
       */
      nir_alu_type src_type = nir_intrinsic_src_type(cur_chan->store_intr);
      struct nir_io_semantics sem = nir_intrinsic_io_semantics(cur_chan->store_intr);
      struct nir_io_xfb xfb = nir_intrinsic_io_xfb(cur_chan->store_intr);
      struct nir_io_xfb xfb2 = nir_intrinsic_io_xfb2(cur_chan->store_intr);

      /* p is gl_varying_slot in addition to being an index into outputs. */
      sem.location = p;
      assert(sem.high_16bits == i / 4);

      /* If it's a sysval output (such as CLIPDIST), we move the varying portion but keep
       * the system value output. This is just the varying portion.
       */
      sem.no_sysval_output = 1;

      /* Write just one component. */
      prev_chan->store_intr = nir_store_output(b, nir_instr_def(cur_chan->value),
                                               nir_imm_int(b, 0),
                                               .base = prev->base,
                                               .component = i % 4,
                                               .io_semantics = sem,
                                               .src_type = src_type,
                                               .write_mask = 0x1,
                                               .io_xfb = xfb,
                                               .io_xfb2 = xfb2);

      /* Update the undef channels in the output info. */
      assert(!prev_chan->value);
      prev_chan->value = cur_chan->value;

      /* Remove transform feedback info from the current instruction because
       * we moved it too. The instruction might not be removed if it's a system
       * value output.
       */
      static struct nir_io_xfb zero_xfb;
      nir_intrinsic_set_io_xfb(cur_chan->store_intr, zero_xfb);
      nir_intrinsic_set_io_xfb2(cur_chan->store_intr, zero_xfb);
   }

   ac_remove_varying(cur);
   return true;
}
225
/* Optimize the param outputs of a VS or TES by removing constant and duplicated outputs.
 *
 * nir                   - the shader; other stages than VS and TES are left untouched
 * sprite_tex_disallowed - if false, VARYING_SLOT_TEX0..TEX7 are skipped because
 *                         sprite_coord_enable can override them
 * slot_remap            - for each output merged into an earlier one, the entry of
 *                         the eliminated slot is set to the slot that replaces it
 * param_export_index    - for each output replaced by a constant, the entry of the
 *                         slot is set to the AC_EXP_PARAM_DEFAULT_VAL_* value
 *
 * Return true if at least one output was eliminated.
 */
bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS])
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   const bool stage_supported = nir->info.stage == MESA_SHADER_VERTEX ||
                                nir->info.stage == MESA_SHADER_TESS_EVAL;
   if (!stage_supported) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   struct ac_out_info outputs[NUM_TOTAL_VARYING_SLOTS] = { 0 };

   BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS);
   BITSET_ZERO(outputs_optimized);

   /* Pass 1: record every scalar output store per slot and channel. */
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *store = nir_instr_as_intrinsic(instr);
         if (store->intrinsic != nir_intrinsic_store_output)
            continue;

         nir_io_semantics sem = nir_intrinsic_io_semantics(store);

         /* Only process varyings that appear as param exports. */
         if (!nir_slot_is_varying(sem.location) || sem.no_varying)
            continue;

         /* We can't optimize texture coordinates if sprite_coord_enable can override them. */
         const bool is_tex_coord = sem.location >= VARYING_SLOT_TEX0 &&
                                   sem.location <= VARYING_SLOT_TEX7;
         if (is_tex_coord && !sprite_tex_disallowed)
            continue;

         BITSET_SET(outputs_optimized, sem.location);

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(store);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         /* nir_lower_io_to_scalar is required before this */
         assert(store->src[0].ssa->num_components == 1);

         /* No intrinsic should store undef. */
         nir_instr *producer = store->src[0].ssa->parent_instr;
         assert(producer->type != nir_instr_type_undef);

         /* The first store to a slot defines its base; later ones must agree. */
         struct ac_out_info *info = &outputs[sem.location];
         const unsigned base = nir_intrinsic_base(store);

         if (!info->types)
            info->base = base;
         else
            assert(info->base == base);

         info->types |= nir_intrinsic_src_type(store);

         /* High 16-bit halves are tracked in channels 4-7. */
         const unsigned chan_idx = sem.high_16bits * 4 + nir_intrinsic_component(store);
         info->chan[chan_idx].store_intr = store;
         info->chan[chan_idx].value = producer;
      }
   }

   /* Pass 2: for each gathered slot, try constant elimination first and
    * duplicate elimination only if that failed.
    */
   struct nir_builder b = nir_builder_create(impl);
   bool progress = false;
   unsigned slot;

   BITSET_FOREACH_SET(slot, outputs_optimized, NUM_TOTAL_VARYING_SLOTS) {
      if (ac_eliminate_const_output(&outputs[slot], slot, param_export_index) ||
          ac_eliminate_duplicated_output(outputs, outputs_optimized, slot, &b, slot_remap))
         progress = true;
   }

   nir_metadata_preserve(impl, progress ? nir_metadata_control_flow : nir_metadata_all);
   return progress;
}
310