/*
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "gallium/include/pipe/p_defines.h"
#include "agx_linker.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_nir_passes.h"
#include "agx_pack.h"
#include "agx_tilebuffer.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
#include "nir_lower_blend.h"
#include "shader_enums.h"

/*
 * Insert code into a fragment shader to lower polygon stipple. The stipple is
 * passed in a sideband, rather than requiring a texture binding. This is
 * simpler for drivers to integrate and might be more efficient.
 */
static bool
agx_nir_lower_poly_stipple(nir_shader *s)
{
   assert(s->info.stage == MESA_SHADER_FRAGMENT);

   /* Insert at the beginning for performance. */
   nir_builder b_ =
      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
   nir_builder *b = &b_;

   /* The stipple coordinate is defined at the window coordinate mod 32. It's
    * reversed along the X-axis to simplify the driver, hence the NOT.
    */
   nir_def *raw = nir_u2u32(b, nir_load_pixel_coord(b));
   nir_def *coord = nir_umod_imm(
      b,
      nir_vec2(b, nir_inot(b, nir_channel(b, raw, 0)), nir_channel(b, raw, 1)),
      32);

   /* Extract the column from the packed bitfield */
   nir_def *pattern = nir_load_polygon_stipple_agx(b, nir_channel(b, coord, 1));
   nir_def *bit = nir_ubitfield_extract(b, pattern, nir_channel(b, coord, 0),
                                        nir_imm_int(b, 1));

   /* Discard fragments where the pattern is 0 */
   nir_demote_if(b, nir_ieq_imm(b, bit, 0));
   s->info.fs.uses_discard = true;

   nir_metadata_preserve(b->impl, nir_metadata_control_flow);
   return true;
}

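/* Convert the VBO portion of a VS prolog key into the agx_attribute layout
 * expected by the common VBO lowering.
 */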
static bool
lower_vbo(nir_shader *s, const struct agx_velem_key *key,
          const struct agx_robustness rs)
{
   struct agx_attribute out[AGX_MAX_VBUFS];

   for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
      out[i] = (struct agx_attribute){
         .divisor = key[i].divisor,
         .stride = key[i].stride,
         .format = key[i].format,
         .instanced = key[i].instanced,
      };
   }

   return agx_nir_lower_vbo(s, out, rs);
}

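/* Map sideband intrinsics in a VS prolog to fixed uniform slots, forming the
 * uniform ABI between the prolog and the driver. Offsets are in 16-bit
 * uniform words. With n attributes, the layout implied by the offsets below
 * is:
 *
 *    4i       64-bit base address of vertex buffer i
 *    4n + 2i  32-bit robustness clamp for attribute i
 *    6n       32-bit first vertex
 *    6n + 2   32-bit base instance
 *    6n + 8   64-bit input assembly buffer address
 *
 * The driver is expected to upload uniforms matching this layout when VS
 * prologs are used.
 */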
static int
map_vs_part_uniform(nir_intrinsic_instr *intr, unsigned nr_attribs)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_vbo_base_agx:
      return 4 * nir_src_as_uint(intr->src[0]);
   case nir_intrinsic_load_attrib_clamp_agx:
      return (4 * nr_attribs) + (2 * nir_src_as_uint(intr->src[0]));
   case nir_intrinsic_load_first_vertex:
      return (6 * nr_attribs);
   case nir_intrinsic_load_base_instance:
      return (6 * nr_attribs) + 2;
   case nir_intrinsic_load_input_assembly_buffer_agx:
      return (6 * nr_attribs) + 8;
   default:
      return -1;
   }
}

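/* Map sideband intrinsics in an FS shader part to fixed uniform slots. Only
 * the blend constant is needed: four 32-bit floats (two 16-bit words each)
 * starting at uniform word 4.
 */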
static int
map_fs_part_uniform(nir_intrinsic_instr *intr)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_blend_const_color_r_float:
      return 4;
   case nir_intrinsic_load_blend_const_color_g_float:
      return 6;
   case nir_intrinsic_load_blend_const_color_b_float:
      return 8;
   case nir_intrinsic_load_blend_const_color_a_float:
      return 10;
   default:
      return -1;
   }
}

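/* Rewrite sideband loads in a shader part to the fixed uniform slots above.
 * Texture handles are also lowered here: each becomes a (0, byte offset)
 * vec2, with the offset computed as index * AGX_TEXTURE_LENGTH from a zero
 * base.
 */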
static bool
lower_non_monolithic_uniforms(nir_builder *b, nir_intrinsic_instr *intr,
                              void *data)
{
   int unif;
   if (b->shader->info.stage == MESA_SHADER_VERTEX) {
      unsigned *nr_attribs = data;
      unif = map_vs_part_uniform(intr, *nr_attribs);
   } else {
      unif = map_fs_part_uniform(intr);
   }

   if (unif >= 0) {
      b->cursor = nir_instr_remove(&intr->instr);
      nir_def *load = nir_load_preamble(b, 1, intr->def.bit_size, .base = unif);
      nir_def_rewrite_uses(&intr->def, load);
      return true;
   } else if (intr->intrinsic == nir_intrinsic_load_texture_handle_agx) {
      b->cursor = nir_instr_remove(&intr->instr);
      nir_def *offs =
         nir_imul_imm(b, nir_u2u32(b, intr->src[0].ssa), AGX_TEXTURE_LENGTH);
      nir_def_rewrite_uses(&intr->def, nir_vec2(b, nir_imm_int(b, 0), offs));
      return true;
   } else {
      return false;
   }
}

void
agx_nir_vs_prolog(nir_builder *b, const void *key_)
{
   const struct agx_vs_prolog_key *key = key_;
   b->shader->info.stage = MESA_SHADER_VERTEX;
   b->shader->info.name = "VS prolog";

   /* First, construct a passthrough shader reading each attribute and exporting
    * the value. We also need to export vertex/instance ID in their usual regs.
    */
   unsigned i = 0;
   nir_def *vec = NULL;
   unsigned vec_idx = ~0;
   BITSET_FOREACH_SET(i, key->component_mask, AGX_MAX_ATTRIBS * 4) {
      unsigned a = i / 4;
      unsigned c = i % 4;

      if (vec_idx != a) {
         vec = nir_load_input(b, 4, 32, nir_imm_int(b, 0), .base = a);
         vec_idx = a;
      }

      /* ABI: attributes passed starting at r8 */
      nir_export_agx(b, nir_channel(b, vec, c), .base = 2 * (8 + i));
   }

   /* ABI: vertex ID in r5, instance ID in r6 (bases are in 16-bit halves) */
   nir_export_agx(b, nir_load_vertex_id(b), .base = 5 * 2);
   nir_export_agx(b, nir_load_instance_id(b), .base = 6 * 2);

   /* Now lower the resulting program using the key */
   lower_vbo(b->shader, key->attribs, key->robustness);

   if (!key->hw) {
      agx_nir_lower_sw_vs(b->shader, key->sw_index_size_B);
   }

   /* Finally, lower uniforms according to our ABI */
   unsigned nr = DIV_ROUND_UP(BITSET_LAST_BIT(key->component_mask), 4);
   nir_shader_intrinsics_pass(b->shader, lower_non_monolithic_uniforms,
                              nir_metadata_control_flow, &nr);
   b->shader->info.io_lowered = true;
}

static bool
lower_input_to_prolog(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_input)
      return false;

   unsigned idx = nir_src_as_uint(intr->src[0]) + nir_intrinsic_base(intr);
   unsigned comp = nir_intrinsic_component(intr);

   assert(intr->def.bit_size == 32 && "todo: push conversions up?");
   unsigned base = 4 * idx + comp;

   b->cursor = nir_before_instr(&intr->instr);
   nir_def *val = nir_load_exported_agx(
      b, intr->def.num_components, intr->def.bit_size, .base = 16 + 2 * base);

   BITSET_WORD *comps_read = data;
   nir_component_mask_t mask = nir_def_components_read(&intr->def);

   u_foreach_bit(c, mask) {
      BITSET_SET(comps_read, base + c);
   }

   nir_def_replace(&intr->def, val);
   return true;
}

bool
agx_nir_lower_vs_input_to_prolog(nir_shader *s,
                                 BITSET_WORD *attrib_components_read)
{
   return nir_shader_intrinsics_pass(s, lower_input_to_prolog,
                                     nir_metadata_control_flow,
                                     attrib_components_read);
}

static bool
lower_active_samples_to_register(nir_builder *b, nir_intrinsic_instr *intr,
                                 void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_active_samples_agx)
      return false;

   b->cursor = nir_instr_remove(&intr->instr);

   /* ABI: r0h contains the active sample mask */
   nir_def *id = nir_load_exported_agx(b, 1, 16, .base = 1);
   nir_def_rewrite_uses(&intr->def, id);
   return true;
}

static bool
lower_tests_zs_intr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   bool *value = data;
   if (intr->intrinsic != nir_intrinsic_load_shader_part_tests_zs_agx)
      return false;

   b->cursor = nir_instr_remove(&intr->instr);
   nir_def_rewrite_uses(&intr->def, nir_imm_intN_t(b, *value ? 0xFF : 0, 16));
   return true;
}

static bool
lower_tests_zs(nir_shader *s, bool value)
{
   if (!s->info.fs.uses_discard)
      return false;

   return nir_shader_intrinsics_pass(s, lower_tests_zs_intr,
                                     nir_metadata_control_flow, &value);
}

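/* Check whether any enabled blend factor consumes the second (dual-source)
 * colour output for this render target.
 */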
static inline bool
blend_uses_2src(struct agx_blend_rt_key rt)
{
   enum pipe_blendfactor factors[] = {
      rt.rgb_src_factor,
      rt.rgb_dst_factor,
      rt.alpha_src_factor,
      rt.alpha_dst_factor,
   };

   for (unsigned i = 0; i < ARRAY_SIZE(factors); ++i) {
      switch (factors[i]) {
      case PIPE_BLENDFACTOR_SRC1_COLOR:
      case PIPE_BLENDFACTOR_SRC1_ALPHA:
      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
         return true;
      default:
         break;
      }
   }

   return false;
}

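/* Construct the fragment epilog. Per the shader part ABI, colour output i is
 * read from registers starting at r(4 + 4i) (bases below are in 16-bit
 * halves). Blending, tilebuffer stores, and MSAA handling are then lowered
 * into the epilog according to the key.
 */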
void
agx_nir_fs_epilog(nir_builder *b, const void *key_)
{
   const struct agx_fs_epilog_key *key = key_;
   b->shader->info.stage = MESA_SHADER_FRAGMENT;
   b->shader->info.name = "FS epilog";

   /* First, construct a passthrough shader reading each colour and outputting
    * the value.
    */
   u_foreach_bit(rt, key->link.rt_written) {
      bool dual_src = (rt == 1) && blend_uses_2src(key->blend.rt[0]);
      unsigned read_rt = (key->link.broadcast_rt0 && !dual_src) ? 0 : rt;
      unsigned size = (key->link.size_32 & BITFIELD_BIT(read_rt)) ? 32 : 16;

      nir_def *value =
         nir_load_exported_agx(b, 4, size, .base = 2 * (4 + (4 * read_rt)));

      if (key->link.rt0_w_1 && read_rt == 0) {
         value =
            nir_vector_insert_imm(b, value, nir_imm_floatN_t(b, 1.0, size), 3);
      }

      nir_store_output(
         b, value, nir_imm_int(b, 0),
         .io_semantics.location = FRAG_RESULT_DATA0 + (dual_src ? 0 : rt),
         .io_semantics.dual_source_blend_index = dual_src,
         .src_type = nir_type_float | size);
   }

   /* Grab registers early, this has to happen in the first block. */
   nir_def *sample_id = NULL, *write_samples = NULL;
   if (key->link.sample_shading) {
      sample_id = nir_load_exported_agx(b, 1, 16, .base = 1);
   }

   if (key->link.sample_mask_after_force_early) {
      write_samples = nir_load_exported_agx(b, 1, 16, .base = 7);
   }

   /* Now lower the resulting program using the key */
   struct agx_tilebuffer_layout tib = agx_build_tilebuffer_layout(
      key->rt_formats, ARRAY_SIZE(key->rt_formats), key->nr_samples, true);

   if (key->force_small_tile)
      tib.tile_size = (struct agx_tile_size){16, 16};

   bool force_translucent = false;
   nir_lower_blend_options opts = {
      .scalar_blend_const = true,
      .logicop_enable = key->blend.logicop_func != PIPE_LOGICOP_COPY,
      .logicop_func = key->blend.logicop_func,
   };

   static_assert(ARRAY_SIZE(opts.format) == 8, "max RTs out of sync");

   for (unsigned i = 0; i < 8; ++i) {
      opts.format[i] = key->rt_formats[i];
      opts.rt[i] = (nir_lower_blend_rt){
         .rgb.src_factor = key->blend.rt[i].rgb_src_factor,
         .rgb.dst_factor = key->blend.rt[i].rgb_dst_factor,
         .rgb.func = key->blend.rt[i].rgb_func,

         .alpha.src_factor = key->blend.rt[i].alpha_src_factor,
         .alpha.dst_factor = key->blend.rt[i].alpha_dst_factor,
         .alpha.func = key->blend.rt[i].alpha_func,

         .colormask = key->blend.rt[i].colormask,
      };
   }

   /* It's more efficient to use masked stores (with
    * agx_nir_lower_tilebuffer) than to emulate colour masking with
    * nir_lower_blend.
    */
   uint8_t colormasks[8] = {0};

   for (unsigned i = 0; i < 8; ++i) {
      if (key->rt_formats[i] == PIPE_FORMAT_NONE)
         continue;

      /* TODO: Flakes some dEQPs, seems to invoke UB. Revisit later.
       * dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.77
       * dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.98
       */
      if (0 /* agx_tilebuffer_supports_mask(&tib, i) */) {
         colormasks[i] = key->blend.rt[i].colormask;
         opts.rt[i].colormask = (uint8_t)BITFIELD_MASK(4);
      } else {
         colormasks[i] = (uint8_t)BITFIELD_MASK(4);
      }

      /* If not all bound RTs are fully written to, we need to force
       * translucent pass type. agx_nir_lower_tilebuffer will take
       * care of this for its own colormasks input.
       */
      unsigned comps = util_format_get_nr_components(key->rt_formats[i]);
      if ((opts.rt[i].colormask & BITFIELD_MASK(comps)) !=
          BITFIELD_MASK(comps)) {
         force_translucent = true;
      }
   }

   /* Alpha-to-coverage must be lowered before alpha-to-one */
   if (key->blend.alpha_to_coverage)
      NIR_PASS(_, b->shader, agx_nir_lower_alpha_to_coverage, tib.nr_samples);

   /* Depth/stencil writes must be deferred until after all discards,
    * particularly alpha-to-coverage.
    */
   if (key->link.write_z || key->link.write_s) {
      nir_store_zs_agx(
         b, nir_imm_intN_t(b, 0xFF, 16),
         nir_load_exported_agx(b, 1, 32, .base = 4),
         nir_load_exported_agx(b, 1, 16, .base = 6),
         .base = (key->link.write_z ? 1 : 0) | (key->link.write_s ? 2 : 0));

      if (key->link.write_z)
         b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_DEPTH);

      if (key->link.write_s)
         b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_STENCIL);
   }

   /* Alpha-to-one must be lowered before blending */
   if (key->blend.alpha_to_one)
      NIR_PASS(_, b->shader, agx_nir_lower_alpha_to_one);

   NIR_PASS(_, b->shader, nir_lower_blend, &opts);

   unsigned rt_spill = key->link.rt_spill_base;
   NIR_PASS(_, b->shader, agx_nir_lower_tilebuffer, &tib, colormasks, &rt_spill,
            write_samples, &force_translucent);
   NIR_PASS(_, b->shader, agx_nir_lower_texture);
   NIR_PASS(_, b->shader, agx_nir_lower_multisampled_image_store);

   /* If the API shader runs once per sample, then the epilog runs once per
    * sample as well, so we need to lower our code to run for a single sample.
    *
    * If the API shader runs once per pixel, then the epilog runs once per
    * pixel. So we run through the monolithic MSAA lowering, which wraps the
    * epilog in the sample loop if needed. This localizes sample shading
    * to the epilog, when sample shading is not used but blending is.
    */
   if (key->link.sample_shading) {
      NIR_PASS(_, b->shader, agx_nir_lower_to_per_sample);
      NIR_PASS(_, b->shader, agx_nir_lower_fs_active_samples_to_register);

      /* Ensure the sample ID is preserved in register. We do this late since it
       * has to go in the last block, and the above passes might add control
       * flow when lowering.
       */
      b->cursor = nir_after_impl(b->impl);
      nir_export_agx(b, sample_id, .base = 1);
   } else {
      NIR_PASS(_, b->shader, agx_nir_lower_monolithic_msaa, key->nr_samples);
   }

   /* Finally, lower uniforms according to our ABI */
   nir_shader_intrinsics_pass(b->shader, lower_non_monolithic_uniforms,
                              nir_metadata_control_flow, NULL);

   /* There is no shader part after the epilog, so we're always responsible for
    * running our own tests, unless the fragment shader forced early tests.
    */
   NIR_PASS(_, b->shader, lower_tests_zs, !key->link.already_ran_zs);

   b->shader->info.io_lowered = true;
   b->shader->info.fs.uses_fbfetch_output |= force_translucent;
   b->shader->info.fs.uses_sample_shading = key->link.sample_shading;
}

struct lower_epilog_ctx {
   struct agx_fs_epilog_link_info *info;
   nir_variable *masked_samples;
};

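/* Rewrite output writes in the API fragment shader to the epilog register
 * ABI: colour stores become exports to r(4 + 4i), depth/stencil stores become
 * exports to r2/r3l, and discards under forced early tests accumulate a
 * sample mask in a local variable (exported to r3h at the end of the shader).
 * Link info is recorded along the way so the epilog key can reconstruct what
 * the shader wrote.
 */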
static bool
lower_output_to_epilog(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   struct lower_epilog_ctx *ctx = data;
   struct agx_fs_epilog_link_info *info = ctx->info;

   if (intr->intrinsic == nir_intrinsic_store_zs_agx) {
      assert(nir_src_as_uint(intr->src[0]) == 0xff && "msaa not yet lowered");
      b->cursor = nir_instr_remove(&intr->instr);

      unsigned base = nir_intrinsic_base(intr);
      info->write_z = !!(base & 1);
      info->write_s = !!(base & 2);

      /* ABI: r2 contains the written depth */
      if (info->write_z)
         nir_export_agx(b, intr->src[1].ssa, .base = 4);

      /* ABI: r3l contains the written stencil */
      if (info->write_s)
         nir_export_agx(b, intr->src[2].ssa, .base = 6);

      return true;
   }

   if (intr->intrinsic == nir_intrinsic_discard_agx &&
       b->shader->info.fs.early_fragment_tests) {

      if (!ctx->masked_samples) {
         b->cursor = nir_before_impl(nir_shader_get_entrypoint(b->shader));

         ctx->masked_samples =
            nir_local_variable_create(b->impl, glsl_uint16_t_type(), NULL);

         nir_store_var(b, ctx->masked_samples, nir_imm_intN_t(b, 0xFF, 16),
                       nir_component_mask(1));
      }

      b->cursor = nir_before_instr(&intr->instr);

      nir_def *mask = nir_load_var(b, ctx->masked_samples);
      nir_def *mask_2 =
         nir_ixor(b, intr->src[0].ssa, nir_imm_intN_t(b, 0xff, 16));

      mask = nir_iand(b, mask, mask_2);
      nir_store_var(b, ctx->masked_samples, mask, nir_component_mask(1));

      nir_instr_remove(&intr->instr);
      return true;
   }

   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

   /* Fix up gl_FragColor */
   if (sem.location == FRAG_RESULT_COLOR) {
      sem.location = FRAG_RESULT_DATA0;
      info->broadcast_rt0 = true;
      info->rt_written = ~0;
   }

   /* We don't use the epilog for sample mask writes */
   if (sem.location < FRAG_RESULT_DATA0)
      return false;

   /* Determine the render target index. Dual source blending aliases a second
    * render target, so get that out of the way now.
    */
   unsigned rt = sem.location - FRAG_RESULT_DATA0;
   rt += nir_src_as_uint(intr->src[1]);

   if (sem.dual_source_blend_index) {
      assert(rt == 0);
      rt = 1;
   }

   info->rt_written |= BITFIELD_BIT(rt);

   b->cursor = nir_instr_remove(&intr->instr);
   nir_def *vec = intr->src[0].ssa;

   if (vec->bit_size == 32)
      info->size_32 |= BITFIELD_BIT(rt);
   else
      assert(vec->bit_size == 16);

   uint32_t one_f = (vec->bit_size == 32 ? fui(1.0) : _mesa_float_to_half(1.0));
   unsigned comp = nir_intrinsic_component(intr);

   u_foreach_bit(c, nir_intrinsic_write_mask(intr)) {
      nir_scalar s = nir_scalar_resolved(vec, c);
      if (rt == 0 && c == 3 && nir_scalar_is_const(s) &&
          nir_scalar_as_uint(s) == one_f) {

         info->rt0_w_1 = true;
      } else {
         unsigned stride = vec->bit_size / 16;

         nir_export_agx(b, nir_channel(b, vec, c),
                        .base = (2 * (4 + (4 * rt))) + (comp + c) * stride);
      }
   }

   return true;
}

bool
agx_nir_lower_fs_output_to_epilog(nir_shader *s,
                                  struct agx_fs_epilog_link_info *out)
{
   struct lower_epilog_ctx ctx = {.info = out};

   nir_shader_intrinsics_pass(s, lower_output_to_epilog,
                              nir_metadata_control_flow, &ctx);

   if (ctx.masked_samples) {
      nir_builder b =
         nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(s)));

      nir_export_agx(&b, nir_load_var(&b, ctx.masked_samples), .base = 7);
      out->sample_mask_after_force_early = true;

      bool progress;
      do {
         progress = false;
         NIR_PASS(progress, s, nir_lower_vars_to_ssa);
         NIR_PASS(progress, s, nir_opt_dce);
      } while (progress);
   }

   out->sample_shading = s->info.fs.uses_sample_shading;
   return true;
}

bool
agx_nir_lower_fs_active_samples_to_register(nir_shader *s)
{
   return nir_shader_intrinsics_pass(s, lower_active_samples_to_register,
                                     nir_metadata_control_flow, NULL);
}

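/* Bump the pipeline statistics counter for fragment shader invocations. Only
 * non-helper invocations are counted, and each invocation counts once per
 * covered sample, so atomically add the popcount of the input sample mask.
 */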
static bool
agx_nir_lower_stats_fs(nir_shader *s)
{
   assert(s->info.stage == MESA_SHADER_FRAGMENT);
   nir_builder b_ =
      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
   nir_builder *b = &b_;

   nir_push_if(b, nir_inot(b, nir_load_helper_invocation(b, 1)));
   nir_def *samples = nir_bit_count(b, nir_load_sample_mask_in(b));
   unsigned query = PIPE_STAT_QUERY_PS_INVOCATIONS;

   nir_def *addr = nir_load_stat_query_address_agx(b, .base = query);
   nir_global_atomic(b, 32, addr, samples, .atomic_op = nir_atomic_op_iadd);

   nir_pop_if(b, NULL);
   nir_metadata_preserve(b->impl, nir_metadata_control_flow);
   return true;
}

void
agx_nir_fs_prolog(nir_builder *b, const void *key_)
{
   const struct agx_fs_prolog_key *key = key_;
   b->shader->info.stage = MESA_SHADER_FRAGMENT;
   b->shader->info.name = "FS prolog";

   /* First, insert code for any emulated features */
   if (key->api_sample_mask != 0xff) {
      /* Kill samples that are NOT covered by the mask */
      nir_discard_agx(b, nir_imm_intN_t(b, key->api_sample_mask ^ 0xff, 16));
      b->shader->info.fs.uses_discard = true;
   }

   if (key->statistics) {
      NIR_PASS(_, b->shader, agx_nir_lower_stats_fs);
   }

   if (key->cull_distance_size) {
      NIR_PASS(_, b->shader, agx_nir_lower_cull_distance_fs,
               key->cull_distance_size);
   }

   if (key->polygon_stipple) {
      NIR_PASS_V(b->shader, agx_nir_lower_poly_stipple);
   }

   /* Then, lower the prolog */
   NIR_PASS(_, b->shader, agx_nir_lower_discard_zs_emit);
   NIR_PASS(_, b->shader, agx_nir_lower_sample_mask);
   NIR_PASS(_, b->shader, nir_shader_intrinsics_pass,
            lower_non_monolithic_uniforms, nir_metadata_control_flow, NULL);
   NIR_PASS(_, b->shader, lower_tests_zs, key->run_zs_tests);

   b->shader->info.io_lowered = true;
}
656