xref: /aosp_15_r20/external/mesa3d/src/amd/common/ac_nir_lower_ps.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ac_nir.h"
8 #include "sid.h"
9 #include "nir_builder.h"
10 #include "nir_builtin_builder.h"
11 
/* Per-shader state shared by all helpers of the PS lowering pass. */
typedef struct {
   const ac_nir_lower_ps_options *options;

   /* Replacement variables for barycentric intrinsics; only the ones
    * required by the bc_optimize_* / force_*_interp options are created
    * (see create_interp_param()), the rest stay NULL.
    */
   nir_variable *persp_center;
   nir_variable *persp_centroid;
   nir_variable *persp_sample;
   nir_variable *linear_center;
   nir_variable *linear_centroid;
   nir_variable *linear_sample;
   /* True when any of the variables above exist, i.e. load_barycentric_*
    * intrinsics must be rewritten to loads of those variables.
    */
   bool lower_load_barycentric;

   /* Gathered store_output values, indexed by gl_frag_result slot and
    * component. Add one for dual source blend second output.
    */
   nir_def *outputs[FRAG_RESULT_MAX + 1][4];
   nir_alu_type output_types[FRAG_RESULT_MAX + 1];

   /* Export intrinsics emitted so far, so the last one can get the DONE
    * flag. MAX_DRAW_BUFFERS for MRT export, 1 for MRTZ export.
    */
   nir_intrinsic_instr *exp[MAX_DRAW_BUFFERS + 1];
   unsigned exp_num;

   /* Next MRT hardware target index; only formats != SPI_SHADER_ZERO
    * occupy a slot (see get_ps_color_export_target()).
    */
   unsigned compacted_mrt_index;
} lower_ps_state;

/* The second dual-source-blend output shares FRAG_RESULT_DATA0 in NIR, so
 * it is stashed in this extra slot of the outputs[] array instead.
 */
#define DUAL_SRC_BLEND_SLOT FRAG_RESULT_MAX
35 
36 static void
create_interp_param(nir_builder * b,lower_ps_state * s)37 create_interp_param(nir_builder *b, lower_ps_state *s)
38 {
39    if (s->options->force_persp_sample_interp) {
40       s->persp_center =
41          nir_local_variable_create(b->impl, glsl_vec_type(2), "persp_center");
42    }
43 
44    if (s->options->bc_optimize_for_persp ||
45        s->options->force_persp_sample_interp ||
46        s->options->force_persp_center_interp) {
47       s->persp_centroid =
48          nir_local_variable_create(b->impl, glsl_vec_type(2), "persp_centroid");
49    }
50 
51    if (s->options->force_persp_center_interp) {
52       s->persp_sample =
53          nir_local_variable_create(b->impl, glsl_vec_type(2), "persp_sample");
54    }
55 
56    if (s->options->force_linear_sample_interp) {
57       s->linear_center =
58          nir_local_variable_create(b->impl, glsl_vec_type(2), "linear_center");
59    }
60 
61    if (s->options->bc_optimize_for_linear ||
62        s->options->force_linear_sample_interp ||
63        s->options->force_linear_center_interp) {
64       s->linear_centroid =
65          nir_local_variable_create(b->impl, glsl_vec_type(2), "linear_centroid");
66    }
67 
68    if (s->options->force_linear_center_interp) {
69       s->linear_sample =
70          nir_local_variable_create(b->impl, glsl_vec_type(2), "linear_sample");
71    }
72 
73    s->lower_load_barycentric =
74       s->persp_center || s->persp_centroid || s->persp_sample ||
75       s->linear_center || s->linear_centroid || s->linear_sample;
76 }
77 
/* Emit the stores that give the replacement barycentric variables their
 * values. All stores go to the very top of the entry block so that every
 * rewritten load (anywhere in the shader) observes an initialized value.
 */
static void
init_interp_param(nir_builder *b, lower_ps_state *s)
{
   b->cursor = nir_before_cf_list(&b->impl->body);

   /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
    * The hw doesn't compute CENTROID if the whole wave only
    * contains fully-covered quads.
    */
   if (s->options->bc_optimize_for_persp || s->options->bc_optimize_for_linear) {
      nir_def *bc_optimize = nir_load_barycentric_optimize_amd(b);

      if (s->options->bc_optimize_for_persp) {
         nir_def *center =
            nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
         nir_def *centroid =
            nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_SMOOTH);

         /* Select center when the HW skipped the centroid computation. */
         nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
         nir_store_var(b, s->persp_centroid, value, 0x3);
      }

      if (s->options->bc_optimize_for_linear) {
         nir_def *center =
            nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
         nir_def *centroid =
            nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);

         nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
         nir_store_var(b, s->linear_centroid, value, 0x3);
      }
   }

   /* Forced per-sample shading: both center and centroid barycentrics are
    * replaced with the sample barycentrics.
    */
   if (s->options->force_persp_sample_interp) {
      nir_def *sample =
         nir_load_barycentric_sample(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
      nir_store_var(b, s->persp_center, sample, 0x3);
      nir_store_var(b, s->persp_centroid, sample, 0x3);
   }

   if (s->options->force_linear_sample_interp) {
      nir_def *sample =
         nir_load_barycentric_sample(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
      nir_store_var(b, s->linear_center, sample, 0x3);
      nir_store_var(b, s->linear_centroid, sample, 0x3);
   }

   /* Forced center shading: sample and centroid barycentrics are replaced
    * with the pixel-center barycentrics.
    */
   if (s->options->force_persp_center_interp) {
      nir_def *center =
         nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
      nir_store_var(b, s->persp_sample, center, 0x3);
      nir_store_var(b, s->persp_centroid, center, 0x3);
   }

   if (s->options->force_linear_center_interp) {
      nir_def *center =
         nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
      nir_store_var(b, s->linear_sample, center, 0x3);
      nir_store_var(b, s->linear_centroid, center, 0x3);
   }
}
139 
140 static bool
lower_ps_load_barycentric(nir_builder * b,nir_intrinsic_instr * intrin,lower_ps_state * s)141 lower_ps_load_barycentric(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
142 {
143    enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intrin);
144    nir_variable *var = NULL;
145 
146    switch (mode) {
147    case INTERP_MODE_NONE:
148    case INTERP_MODE_SMOOTH:
149       switch (intrin->intrinsic) {
150       case nir_intrinsic_load_barycentric_pixel:
151          var = s->persp_center;
152          break;
153       case nir_intrinsic_load_barycentric_centroid:
154          var = s->persp_centroid;
155          break;
156       case nir_intrinsic_load_barycentric_sample:
157          var = s->persp_sample;
158          break;
159       default:
160          break;
161       }
162       break;
163 
164    case INTERP_MODE_NOPERSPECTIVE:
165       switch (intrin->intrinsic) {
166       case nir_intrinsic_load_barycentric_pixel:
167          var = s->linear_center;
168          break;
169       case nir_intrinsic_load_barycentric_centroid:
170          var = s->linear_centroid;
171          break;
172       case nir_intrinsic_load_barycentric_sample:
173          var = s->linear_sample;
174          break;
175       default:
176          break;
177       }
178       break;
179 
180    default:
181       break;
182    }
183 
184    if (!var)
185       return false;
186 
187    b->cursor = nir_before_instr(&intrin->instr);
188 
189    nir_def *replacement = nir_load_var(b, var);
190    nir_def_replace(&intrin->def, replacement);
191    return true;
192 }
193 
/* Record the channels written by a store_output intrinsic into
 * s->outputs[]/s->output_types[] so they can later be turned into export
 * instructions, and remove the store when this pass will emit the
 * corresponding export itself.
 */
static bool
gather_ps_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
{
   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned component = nir_intrinsic_component(intrin);
   nir_alu_type type = nir_intrinsic_src_type(intrin);
   nir_def *store_val = intrin->src[0].ssa;

   b->cursor = nir_before_instr(&intrin->instr);

   /* The second dual-source-blend output shares a NIR slot with DATA0;
    * redirect it to the dedicated extra slot.
    */
   unsigned slot = sem.dual_source_blend_index ?
      DUAL_SRC_BLEND_SLOT : sem.location;

   u_foreach_bit (i, write_mask) {
      unsigned comp = component + i;
      s->outputs[slot][comp] = nir_channel(b, store_val, i);
   }

   /* Same slot should have same type for all components. */
   assert(s->output_types[slot] == nir_type_invalid || s->output_types[slot] == type);

   s->output_types[slot] = type;

   /* Keep output instruction if not exported in nir. */
   if (!s->options->no_color_export && !s->options->no_depth_export) {
      /* This pass exports everything: the store is no longer needed. */
      nir_instr_remove(&intrin->instr);
   } else {
      /* Only remove stores whose export this pass emits; depth/color stores
       * left for an epilog stay in place (DUAL_SRC_BLEND_SLOT is covered by
       * the >= FRAG_RESULT_DATA0 check).
       */
      if (slot >= FRAG_RESULT_DATA0 && !s->options->no_color_export) {
         nir_instr_remove(&intrin->instr);
      } else if ((slot == FRAG_RESULT_DEPTH || slot == FRAG_RESULT_STENCIL ||
                  slot == FRAG_RESULT_SAMPLE_MASK) && !s->options->no_depth_export) {
         nir_instr_remove(&intrin->instr);
      }
   }

   return true;
}
232 
233 static bool
lower_ps_load_sample_mask_in(nir_builder * b,nir_intrinsic_instr * intrin,lower_ps_state * s)234 lower_ps_load_sample_mask_in(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
235 {
236    /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
237     * says:
238     *
239     *    "When per-sample shading is active due to the use of a fragment
240     *     input qualified by sample or due to the use of the gl_SampleID
241     *     or gl_SamplePosition variables, only the bit for the current
242     *     sample is set in gl_SampleMaskIn. When state specifies multiple
243     *     fragment shader invocations for a given fragment, the sample
244     *     mask for any single fragment shader invocation may specify a
245     *     subset of the covered samples for the fragment. In this case,
246     *     the bit corresponding to each covered sample will be set in
247     *     exactly one fragment shader invocation."
248     *
249     * The samplemask loaded by hardware is always the coverage of the
250     * entire pixel/fragment, so mask bits out based on the sample ID.
251     */
252 
253    b->cursor = nir_before_instr(&intrin->instr);
254 
255    uint32_t ps_iter_mask = ac_get_ps_iter_mask(s->options->ps_iter_samples);
256    nir_def *sampleid = nir_load_sample_id(b);
257    nir_def *submask = nir_ishl(b, nir_imm_int(b, ps_iter_mask), sampleid);
258 
259    nir_def *sample_mask = nir_load_sample_mask_in(b);
260    nir_def *replacement = nir_iand(b, sample_mask, submask);
261 
262    nir_def_replace(&intrin->def, replacement);
263    return true;
264 }
265 
266 static bool
lower_ps_intrinsic(nir_builder * b,nir_instr * instr,void * state)267 lower_ps_intrinsic(nir_builder *b, nir_instr *instr, void *state)
268 {
269    lower_ps_state *s = (lower_ps_state *)state;
270 
271    if (instr->type != nir_instr_type_intrinsic)
272       return false;
273 
274    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
275 
276    switch (intrin->intrinsic) {
277    case nir_intrinsic_store_output:
278       return gather_ps_store_output(b, intrin, s);
279    case nir_intrinsic_load_barycentric_pixel:
280    case nir_intrinsic_load_barycentric_centroid:
281    case nir_intrinsic_load_barycentric_sample:
282       if (s->lower_load_barycentric)
283          return lower_ps_load_barycentric(b, intrin, s);
284       break;
285    case nir_intrinsic_load_sample_mask_in:
286       if (s->options->ps_iter_samples > 1)
287          return lower_ps_load_sample_mask_in(b, intrin, s);
288       break;
289    default:
290       break;
291    }
292 
293    return false;
294 }
295 
/* Apply optional color clamping, alpha-to-one and the fixed-function alpha
 * test to the gathered color outputs, before they are packed into exports.
 */
static void
emit_ps_color_clamp_and_alpha_test(nir_builder *b, lower_ps_state *s)
{
   uint32_t color_mask =
      BITFIELD_BIT(FRAG_RESULT_COLOR) |
      BITFIELD_RANGE(FRAG_RESULT_DATA0, MAX_DRAW_BUFFERS);
   uint32_t color_outputs =
      (b->shader->info.outputs_written & color_mask) |
      /* both dual source blend outputs use FRAG_RESULT_DATA0 slot in nir,
       * but we use an extra slot number in lower_ps_state for the second
       * output
       */
      BITFIELD_BIT(DUAL_SRC_BLEND_SLOT);

   u_foreach_bit (slot, color_outputs) {
      /* Clamp every written channel to [0, 1] if requested. */
      if (s->options->clamp_color) {
         for (int i = 0; i < 4; i++) {
            if (s->outputs[slot][i])
               s->outputs[slot][i] = nir_fsat(b, s->outputs[slot][i]);
         }
      }

      /* Force alpha to 1.0 (at the output type's bit size) if requested. */
      if (s->options->alpha_to_one) {
         /* any one has written to this slot */
         if (s->output_types[slot] != nir_type_invalid) {
            unsigned bit_size = nir_alu_type_get_type_size(s->output_types[slot]);
            s->outputs[slot][3] = nir_imm_floatN_t(b, 1, bit_size);
         }
      }

      /* Alpha test only applies to the first color output. */
      if (slot == FRAG_RESULT_COLOR || slot == FRAG_RESULT_DATA0) {
         if (s->options->alpha_func == COMPARE_FUNC_ALWAYS) {
            /* always pass, do nothing */
         } else if (s->options->alpha_func == COMPARE_FUNC_NEVER) {
            nir_discard(b);
         } else if (s->outputs[slot][3]) {
            /* Discard the fragment when the alpha comparison fails. */
            nir_def *ref = nir_load_alpha_reference_amd(b);
            nir_def *cond =
               nir_compare_func(b, s->options->alpha_func, s->outputs[slot][3], ref);
            nir_discard_if(b, nir_inot(b, cond));
         }
      }
   }
}
340 
/* Build the MRTZ export carrying depth, stencil, sample mask and (on some
 * parts) the alpha value used for alpha-to-coverage. Skipped entirely when
 * none of those outputs were written.
 */
static void
emit_ps_mrtz_export(nir_builder *b, lower_ps_state *s)
{
   uint64_t outputs_written = b->shader->info.outputs_written;

   nir_def *mrtz_alpha = NULL;
   if (s->options->alpha_to_coverage_via_mrtz) {
      /* Prefer the legacy COLOR slot's alpha, fall back to DATA0's. */
      mrtz_alpha = s->outputs[FRAG_RESULT_COLOR][3] ?
         s->outputs[FRAG_RESULT_COLOR][3] :
         s->outputs[FRAG_RESULT_DATA0][3];
   }

   nir_def *depth = s->outputs[FRAG_RESULT_DEPTH][0];
   nir_def *stencil = s->outputs[FRAG_RESULT_STENCIL][0];
   nir_def *sample_mask = s->outputs[FRAG_RESULT_SAMPLE_MASK][0];

   /* Drop the sample mask output when the caller asked for it. */
   if (s->options->kill_samplemask) {
      sample_mask = NULL;
      outputs_written &= ~BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
   }

   /* skip mrtz export if no one has written to any of them */
   if (!depth && !stencil && !sample_mask && !mrtz_alpha)
      return;

   /* use outputs_written to determine export format as we use it to set
    * R_028710_SPI_SHADER_Z_FORMAT instead of relying on the real store output,
    * because store output may be optimized out.
    */
   unsigned format =
      ac_get_spi_shader_z_format(outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH),
                                 outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL),
                                 outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK),
                                 s->options->alpha_to_coverage_via_mrtz);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      /* 16-bit packed layout: stencil and sample mask only. */
      assert(!depth && !mrtz_alpha);

      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;

      if (stencil) {
         /* Stencil goes in the high 16 bits of the first dword. */
         outputs[0] = nir_ishl_imm(b, stencil, 16);
         /* Compressed exports (pre-GFX11) use two mask bits per dword. */
         write_mask |= s->options->gfx_level >= GFX11 ? 0x1 : 0x3;
      }

      if (sample_mask) {
         outputs[1] = sample_mask;
         write_mask |= s->options->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      /* 32-bit layout: one channel per output. */
      if (depth) {
         outputs[0] = depth;
         write_mask |= 0x1;
      }

      if (stencil) {
         outputs[1] = stencil;
         write_mask |= 0x2;
      }

      if (sample_mask) {
         outputs[2] = sample_mask;
         write_mask |= 0x4;
      }

      if (mrtz_alpha) {
         outputs[3] = mrtz_alpha;
         write_mask |= 0x8;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the
    * X writemask component.
    */
   if (s->options->gfx_level == GFX6 &&
       s->options->family != CHIP_OLAND &&
       s->options->family != CHIP_HAINAN) {
      write_mask |= 0x1;
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = V_008DFC_SQ_EXP_MRTZ,
                                         .write_mask = write_mask,
                                         .flags = flags);
}
432 
433 static unsigned
get_ps_color_export_target(lower_ps_state * s)434 get_ps_color_export_target(lower_ps_state *s)
435 {
436    unsigned target = V_008DFC_SQ_EXP_MRT + s->compacted_mrt_index;
437 
438    if (s->options->dual_src_blend_swizzle && s->compacted_mrt_index < 2)
439       target += 21;
440 
441    s->compacted_mrt_index++;
442 
443    return target;
444 }
445 
/* Build the color export for one color buffer, converting/packing the
 * gathered output channels according to the SPI_SHADER_COL_FORMAT of that
 * buffer. Returns true when an export was actually emitted.
 */
static bool
emit_ps_color_export(nir_builder *b, lower_ps_state *s, gl_frag_result slot, unsigned cbuf)
{
   assert(cbuf < 8);

   /* Four format bits per color buffer. */
   unsigned spi_shader_col_format = (s->options->spi_shader_col_format >> (cbuf * 4)) & 0xf;
   if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
      return false;

   /* get target after checking spi_shader_col_format as we need to increase
    * compacted_mrt_index anyway regardless of whether the export is built
    */
   unsigned target = get_ps_color_export_target(s);

   nir_alu_type type = s->output_types[slot];
   /* no one has written to this slot */
   if (type == nir_type_invalid)
      return false;

   bool is_int8 = s->options->color_is_int8 & BITFIELD_BIT(cbuf);
   bool is_int10 = s->options->color_is_int10 & BITFIELD_BIT(cbuf);
   bool enable_mrt_output_nan_fixup =
      s->options->enable_mrt_output_nan_fixup & BITFIELD_BIT(cbuf);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   nir_alu_type base_type = nir_alu_type_get_base_type(type);
   unsigned type_size = nir_alu_type_get_type_size(type);

   /* Work on a local copy so per-cbuf clamping/fixup doesn't affect other
    * exports of the same slot.
    */
   nir_def *data[4];
   memcpy(data, s->outputs[slot], sizeof(data));

   /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
   if (enable_mrt_output_nan_fixup && type == nir_type_float32) {
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            nir_def *isnan = nir_fisnan(b, data[i]);
            data[i] = nir_bcsel(b, isnan, nir_imm_float(b, 0), data[i]);
         }
      }
   }

   switch (spi_shader_col_format) {
   case V_028714_SPI_SHADER_32_R:
      /* Red channel only. */
      if (!data[0])
         return false;

      outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
      write_mask = 0x1;
      break;

   case V_028714_SPI_SHADER_32_GR:
      /* Red and green channels. */
      if (!data[0] && !data[1])
         return false;

      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[1]) {
         outputs[1] = nir_convert_to_bit_size(b, data[1], base_type, 32);
         write_mask |= 0x2;
      }
      break;

   case V_028714_SPI_SHADER_32_AR:
      /* Red and alpha channels. */
      if (!data[0] && !data[3])
         return false;

      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[3]) {
         /* GFX10+ places alpha in channel 1 for this format. */
         unsigned index = s->options->gfx_level >= GFX10 ? 1 : 3;
         outputs[index] = nir_convert_to_bit_size(b, data[3], base_type, 32);
         write_mask |= BITFIELD_BIT(index);
      }
      break;

   case V_028714_SPI_SHADER_32_ABGR:
      /* Full 32-bit four-channel export, no packing. */
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            outputs[i] = nir_convert_to_bit_size(b, data[i], base_type, 32);
            write_mask |= BITFIELD_BIT(i);
         }
      }
      break;

   default: {
      /* All remaining formats pack pairs of 16-bit values into dwords. */
      nir_op pack_op = nir_op_pack_32_2x16;

      switch (spi_shader_col_format) {
      case V_028714_SPI_SHADER_FP16_ABGR:
         if (type_size == 32)
            pack_op = nir_op_pack_half_2x16_rtz_split;
         break;
      case V_028714_SPI_SHADER_UINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_uint_2x16;
            if (is_int8 || is_int10) {
               /* clamp 32bit output for 8/10 bit color component */
               uint32_t max_rgb = is_int8 ? 255 : 1023;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  /* 10_10_10_2 formats have a 2-bit alpha channel. */
                  uint32_t max_value = i == 3 && is_int10 ? 3 : max_rgb;
                  data[i] = nir_umin(b, data[i], nir_imm_int(b, max_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_SINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_sint_2x16;
            if (is_int8 || is_int10) {
               /* clamp 32bit output for 8/10 bit color component */
               uint32_t max_rgb = is_int8 ? 127 : 511;
               uint32_t min_rgb = is_int8 ? -128 : -512;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  uint32_t max_value = i == 3 && is_int10 ? 1 : max_rgb;
                  uint32_t min_value = i == 3 && is_int10 ? -2u : min_rgb;

                  data[i] = nir_imin(b, data[i], nir_imm_int(b, max_value));
                  data[i] = nir_imax(b, data[i], nir_imm_int(b, min_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_UNORM16_ABGR:
         pack_op = nir_op_pack_unorm_2x16;
         break;
      case V_028714_SPI_SHADER_SNORM16_ABGR:
         pack_op = nir_op_pack_snorm_2x16;
         break;
      default:
         unreachable("unsupported color export format");
         break;
      }

      /* Pack channels (0,1) into dword 0 and (2,3) into dword 1. */
      for (int i = 0; i < 2; i++) {
         nir_def *lo = data[i * 2];
         nir_def *hi = data[i * 2 + 1];
         if (!lo && !hi)
            continue;

         lo = lo ? lo : nir_undef(b, 1, type_size);
         hi = hi ? hi : nir_undef(b, 1, type_size);

         if (nir_op_infos[pack_op].num_inputs == 2) {
            outputs[i] = nir_build_alu2(b, pack_op, lo, hi);
         } else {
            nir_def *vec = nir_vec2(b, lo, hi);
            outputs[i] = nir_build_alu1(b, pack_op, vec);
         }

         /* GFX11 uses one mask bit per packed dword; older chips use
          * two (compressed export).
          */
         if (s->options->gfx_level >= GFX11)
            write_mask |= BITFIELD_BIT(i);
         else
            write_mask |= 0x3 << (i * 2);
      }

      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;
   }
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = target,
                                         .write_mask = write_mask,
                                         .flags = flags);
   return true;
}
630 
/* Rearrange the two dual-source-blend exports so that each lane carries the
 * channel layout the hardware expects (RDNA3 dual-source swizzle). For ACO
 * this is replaced by a single pseudo instruction instead.
 */
static void
emit_ps_dual_src_blend_swizzle(nir_builder *b, lower_ps_state *s, unsigned first_color_export)
{
   assert(s->exp_num > first_color_export + 1);

   nir_intrinsic_instr *mrt0_exp = s->exp[first_color_export];
   nir_intrinsic_instr *mrt1_exp = s->exp[first_color_export + 1];

   /* There are some instructions which operate mrt1_exp's argument
    * between mrt0_exp and mrt1_exp. Move mrt0_exp next to mrt1_exp,
    * so that we can swizzle their arguments.
    */
   unsigned target0 = nir_intrinsic_base(mrt0_exp);
   unsigned target1 = nir_intrinsic_base(mrt1_exp);
   if (target0 > target1) {
      /* mrt0 export is after mrt1 export, this happens when src0 is missing,
       * so we emit mrt1 first then emit an empty mrt0.
       *
       * swap the pointer
       */
      nir_intrinsic_instr *tmp = mrt0_exp;
      mrt0_exp = mrt1_exp;
      mrt1_exp = tmp;

      /* move mrt1_exp down to after mrt0_exp */
      nir_instr_move(nir_after_instr(&mrt0_exp->instr), &mrt1_exp->instr);
   } else {
      /* move mrt0_exp down to before mrt1_exp */
      nir_instr_move(nir_before_instr(&mrt1_exp->instr), &mrt0_exp->instr);
   }

   uint32_t mrt0_write_mask = nir_intrinsic_write_mask(mrt0_exp);
   uint32_t mrt1_write_mask = nir_intrinsic_write_mask(mrt1_exp);
   /* Both exports must end up writing the union of the channels. */
   uint32_t write_mask = mrt0_write_mask | mrt1_write_mask;

   nir_def *mrt0_arg = mrt0_exp->src[0].ssa;
   nir_def *mrt1_arg = mrt1_exp->src[0].ssa;

   /* Swizzle code is right before mrt0_exp. */
   b->cursor = nir_before_instr(&mrt0_exp->instr);

   /* ACO need to emit the swizzle code by a pseudo instruction. */
   if (s->options->use_aco) {
      nir_export_dual_src_blend_amd(b, mrt0_arg, mrt1_arg, .write_mask = write_mask);
      nir_instr_remove(&mrt0_exp->instr);
      nir_instr_remove(&mrt1_exp->instr);
      return;
   }

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *arg0_vec[4] = {undef, undef, undef, undef};
   nir_def *arg1_vec[4] = {undef, undef, undef, undef};

   /* For illustration, originally
    *   lane0 export arg00 and arg01
    *   lane1 export arg10 and arg11.
    *
    * After the following operation
    *   lane0 export arg00 and arg10
    *   lane1 export arg01 and arg11.
    */
   u_foreach_bit (i, write_mask) {
      nir_def *arg0 = nir_channel(b, mrt0_arg, i);
      nir_def *arg1 = nir_channel(b, mrt1_arg, i);

      /* swap odd,even lanes of arg0 */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      /* swap even lanes between arg0 and arg1 */
      nir_def *tid = nir_load_subgroup_invocation(b);
      nir_def *is_even = nir_ieq_imm(b, nir_iand_imm(b, tid, 1), 0);

      nir_def *tmp = arg0;
      arg0 = nir_bcsel(b, is_even, arg1, arg0);
      arg1 = nir_bcsel(b, is_even, tmp, arg1);

      /* swap odd,even lanes again for arg0 */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      arg0_vec[i] = arg0;
      arg1_vec[i] = arg1;
   }

   nir_src_rewrite(&mrt0_exp->src[0], nir_vec(b, arg0_vec, 4));
   nir_src_rewrite(&mrt1_exp->src[0], nir_vec(b, arg1_vec, 4));

   nir_intrinsic_set_write_mask(mrt0_exp, write_mask);
   nir_intrinsic_set_write_mask(mrt1_exp, write_mask);
}
720 
/* Emit the null (or empty MRT0) export required when the shader produced no
 * real exports but the hardware still needs one, e.g. for discard or POPS.
 */
static void
emit_ps_null_export(nir_builder *b, lower_ps_state *s)
{
   const bool pops = b->shader->info.fs.sample_interlock_ordered ||
                     b->shader->info.fs.sample_interlock_unordered ||
                     b->shader->info.fs.pixel_interlock_ordered ||
                     b->shader->info.fs.pixel_interlock_unordered;

   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
    * for discard.
    * In Primitive Ordered Pixel Shading, however, GFX11+ explicitly uses the `done` export to exit
    * the ordered section, and before GFX11, shaders with POPS also need an export.
    */
   if (s->options->gfx_level >= GFX10 && !s->options->uses_discard && !pops)
      return;

   /* The `done` export exits the POPS ordered section on GFX11+, make sure UniformMemory and
    * ImageMemory (in SPIR-V terms) accesses from the ordered section may not be reordered below it.
    */
   if (s->options->gfx_level >= GFX11 && pops)
      nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                nir_var_mem_global);

   /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
   unsigned target = s->options->gfx_level >= GFX11 ?
      V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;

   nir_intrinsic_instr *intrin =
      nir_export_amd(b, nir_undef(b, 4, 32),
                     .base = target,
                     .flags = AC_EXP_FLAG_VALID_MASK | AC_EXP_FLAG_DONE);
   /* The builder defaults the write mask to 0xf; clear it because nothing
    * is actually written by this export.
    */
   nir_intrinsic_set_write_mask(intrin, 0);
}
756 
/* Emit all MRTZ/color exports at the end of the shader from the gathered
 * outputs, handle dual-source blending, flag the final export as DONE, and
 * fall back to a null export when nothing was exported.
 */
static void
export_ps_outputs(nir_builder *b, lower_ps_state *s)
{
   b->cursor = nir_after_impl(b->impl);

   emit_ps_color_clamp_and_alpha_test(b, s);

   if (!s->options->no_depth_export)
      emit_ps_mrtz_export(b, s);

   /* For non-monolithic shaders, RADV exports mrtz in the main part (except
    * on RDNA3 for alpha to coverage) and exports color in the epilog.
    */
   if (s->options->no_color_export)
      return;

   unsigned first_color_export = s->exp_num;

   /* When dual src blend is enabled and we need both src0 and src1
    * export present, try to export both src, and add an empty export
    * for either src missing.
    */
   if (s->output_types[DUAL_SRC_BLEND_SLOT] != nir_type_invalid ||
       s->options->dual_src_blend_swizzle) {
      unsigned slot;
      if (s->output_types[FRAG_RESULT_COLOR] != nir_type_invalid) {
         /* when dual source blending, there must be only one color buffer */
         assert(s->options->broadcast_last_cbuf == 0);
         slot = FRAG_RESULT_COLOR;
      } else {
         slot = FRAG_RESULT_DATA0;
      }

      bool src0_exported = emit_ps_color_export(b, s, slot, 0);
      /* src1 uses cbuf1's format info: when dual src blend is enabled it's
       * the same as cbuf0, but when dual src blend is disabled it's used
       * to disable the src1 export.
       */
      bool src1_exported = emit_ps_color_export(b, s, DUAL_SRC_BLEND_SLOT, 1);

      bool need_empty_export =
         /* miss src1, need to add src1 only when swizzle case */
         (src0_exported && !src1_exported && s->options->dual_src_blend_swizzle) ||
         /* miss src0, always need to add src0 */
         (!src0_exported && src1_exported);

      if (need_empty_export) {
         /* set to expected value */
         s->compacted_mrt_index = src0_exported ? 1 : 0;

         unsigned target = get_ps_color_export_target(s);

         s->exp[s->exp_num++] =
            nir_export_amd(b, nir_undef(b, 4, 32), .base = target);
      }
   } else {
      if (s->output_types[FRAG_RESULT_COLOR] != nir_type_invalid) {
         /* write to all color buffers */
         for (int cbuf = 0; cbuf <= s->options->broadcast_last_cbuf; cbuf++)
            emit_ps_color_export(b, s, FRAG_RESULT_COLOR, cbuf);
      } else {
         /* One export per FRAG_RESULT_DATAn slot. */
         for (int cbuf = 0; cbuf < MAX_DRAW_BUFFERS; cbuf++) {
            unsigned slot = FRAG_RESULT_DATA0 + cbuf;
            emit_ps_color_export(b, s, slot, cbuf);
         }
      }
   }

   if (s->exp_num) {
      if (s->options->dual_src_blend_swizzle) {
         emit_ps_dual_src_blend_swizzle(b, s, first_color_export);
         /* Skip last export flag setting because they have been replaced by
          * a pseudo instruction.
          */
         if (s->options->use_aco)
            return;
      }

      /* Specify that this is the last export */
      nir_intrinsic_instr *final_exp = s->exp[s->exp_num - 1];
      unsigned final_exp_flags = nir_intrinsic_flags(final_exp);
      final_exp_flags |= AC_EXP_FLAG_DONE | AC_EXP_FLAG_VALID_MASK;
      nir_intrinsic_set_flags(final_exp, final_exp_flags);

      /* The `done` export exits the POPS ordered section on GFX11+, make sure UniformMemory and
       * ImageMemory (in SPIR-V terms) accesses from the ordered section may not be reordered below
       * it.
       */
      if (s->options->gfx_level >= GFX11 &&
          (b->shader->info.fs.sample_interlock_ordered ||
           b->shader->info.fs.sample_interlock_unordered ||
           b->shader->info.fs.pixel_interlock_ordered ||
           b->shader->info.fs.pixel_interlock_unordered)) {
         b->cursor = nir_before_instr(&final_exp->instr);
         nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                   nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                   nir_var_mem_global);
      }
   } else {
      emit_ps_null_export(b, s);
   }
}
859 
860 void
ac_nir_lower_ps(nir_shader * nir,const ac_nir_lower_ps_options * options)861 ac_nir_lower_ps(nir_shader *nir, const ac_nir_lower_ps_options *options)
862 {
863    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
864 
865    nir_builder builder = nir_builder_create(impl);
866    nir_builder *b = &builder;
867 
868    lower_ps_state state = {
869       .options = options,
870    };
871 
872    create_interp_param(b, &state);
873 
874    nir_shader_instructions_pass(nir, lower_ps_intrinsic,
875                                 nir_metadata_control_flow,
876                                 &state);
877 
878    /* Must be after lower_ps_intrinsic() to prevent it lower added intrinsic here. */
879    init_interp_param(b, &state);
880 
881    export_ps_outputs(b, &state);
882 
883    /* Cleanup nir variable, as RADV won't do this. */
884    if (state.lower_load_barycentric)
885       nir_lower_vars_to_ssa(nir);
886 }
887