xref: /aosp_15_r20/external/mesa3d/src/asahi/lib/agx_nir_lower_tilebuffer.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2022 Alyssa Rosenzweig
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include <stdint.h>
7 #include "compiler/glsl_types.h"
8 #include "util/format/u_format.h"
9 #include "util/macros.h"
10 #include "agx_nir_format_helpers.h"
11 #include "agx_pack.h"
12 #include "agx_tilebuffer.h"
13 #include "layout.h"
14 #include "nir.h"
15 #include "nir_builder.h"
16 #include "nir_builder_opcodes.h"
17 
18 #define AGX_NUM_TEXTURE_STATE_REGS 16
19 #define ALL_SAMPLES                0xFF
20 
/* State threaded through the nir_shader_lower_instructions() callback. */
struct ctx {
   /* Tilebuffer layout we are lowering against (formats, offsets, spills) */
   struct agx_tilebuffer_layout *tib;

   /* Optional per-render-target colour write masks, or NULL for no masking */
   uint8_t *colormasks;

   /* Out-parameter: set to true when the lowering requires a translucent
    * pass type (masked/sample-masked stores, spilled render targets).
    */
   bool *translucent;

   /* First texture-state slot for spilled render target descriptors */
   unsigned bindless_base;

   /* Set when any store was lowered to a bindless image (memory) store, so
    * the caller knows to append a PBE-to-texture fence.
    */
   bool any_memory_stores;

   /* Bitmask of render targets statically written by the shader */
   uint8_t outputs_written;

   /* Optional dynamic sample mask applied to stores, or NULL for all samples */
   nir_def *write_samples;
};
30 
31 static bool
tib_filter(const nir_instr * instr,UNUSED const void * _)32 tib_filter(const nir_instr *instr, UNUSED const void *_)
33 {
34    if (instr->type != nir_instr_type_intrinsic)
35       return false;
36 
37    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
38    if (intr->intrinsic != nir_intrinsic_store_output &&
39        intr->intrinsic != nir_intrinsic_load_output)
40       return false;
41 
42    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
43    assert(sem.dual_source_blend_index == 0 && "dual source blending lowered");
44    return (sem.location >= FRAG_RESULT_DATA0);
45 }
46 
/*
 * Lower a render target store to a hardware local-pixel (tilebuffer) store,
 * performing in software the widening and integer clamping that the hardware
 * store cannot do itself.
 */
static void
store_tilebuffer(nir_builder *b, struct agx_tilebuffer_layout *tib,
                 enum pipe_format format, enum pipe_format logical_format,
                 unsigned rt, nir_def *value, nir_def *samples,
                 unsigned write_mask)
{
   /* The hardware cannot extend for a 32-bit format. Extend ourselves. */
   if (format == PIPE_FORMAT_R32_UINT && value->bit_size == 16) {
      if (util_format_is_pure_sint(logical_format))
         value = nir_i2i32(b, value);
      else if (util_format_is_pure_uint(logical_format))
         value = nir_u2u32(b, value);
      else
         value = nir_f2f32(b, value);
   }

   /* Pure integer formats need to be clamped in software, at least in some
    * cases. We do so on store. Piglit gl-3.0-render-integer checks this, as
    * does KHR-GL33.packed_pixels.*.
    */
   const struct util_format_description *desc =
      util_format_description(logical_format);
   unsigned c = util_format_get_first_non_void_channel(logical_format);

   if (desc->channel[c].size <= 16 &&
       util_format_is_pure_integer(logical_format)) {

      unsigned bits[4] = {
         desc->channel[0].size,
         desc->channel[1].size,
         desc->channel[2].size,
         desc->channel[3].size,
      };

      if (util_format_is_pure_sint(logical_format))
         value = nir_format_clamp_sint(b, value, bits);
      else
         value = nir_format_clamp_uint(b, value, bits);

      /* Clamped values fit in 16 bits per channel, narrow for the store */
      value = nir_u2u16(b, value);
   }

   /* NULL sample mask means "write every sample" */
   if (!samples)
      samples = nir_imm_intN_t(b, ALL_SAMPLES, 16);

   uint8_t offset_B = agx_tilebuffer_offset_B(tib, rt);
   nir_store_local_pixel_agx(b, value, samples, nir_undef(b, 2, 16),
                             .base = offset_B, .write_mask = write_mask,
                             .format = format);
}
97 
98 static nir_def *
nir_fsat_signed(nir_builder * b,nir_def * x)99 nir_fsat_signed(nir_builder *b, nir_def *x)
100 {
101    return nir_fclamp(b, x, nir_imm_floatN_t(b, -1.0, x->bit_size),
102                      nir_imm_floatN_t(b, +1.0, x->bit_size));
103 }
104 
105 static nir_def *
nir_fsat_to_format(nir_builder * b,nir_def * x,enum pipe_format format)106 nir_fsat_to_format(nir_builder *b, nir_def *x, enum pipe_format format)
107 {
108    if (util_format_is_unorm(format))
109       return nir_fsat(b, x);
110    else if (util_format_is_snorm(format))
111       return nir_fsat_signed(b, x);
112    else
113       return x;
114 }
115 
/*
 * Lower a render target load to a hardware local-pixel (tilebuffer) load,
 * undoing the packing used on the store side: raw 16-bit loads for F16
 * physical formats, float extension, normalization clamping, and sign
 * extension for sint formats.
 */
static nir_def *
load_tilebuffer(nir_builder *b, struct agx_tilebuffer_layout *tib,
                uint8_t load_comps, uint8_t bit_size, unsigned rt,
                enum pipe_format format, enum pipe_format logical_format)
{
   unsigned comps = util_format_get_nr_components(logical_format);
   bool f16 = (format == PIPE_FORMAT_R16_FLOAT);

   /* Don't load with F16, load the raw 16-bit values instead */
   if (f16)
      format = PIPE_FORMAT_R16_UINT;

   /* Load only the components the logical format has, padded at the end */
   uint8_t offset_B = agx_tilebuffer_offset_B(tib, rt);
   nir_def *res = nir_load_local_pixel_agx(
      b, MIN2(load_comps, comps), f16 ? 16 : bit_size,
      nir_imm_intN_t(b, ALL_SAMPLES, 16), .base = offset_B, .format = format);

   /* Extend floats */
   if (f16 && bit_size != 16) {
      assert(bit_size == 32);
      res = nir_f2f32(b, res);
   }

   /* Some formats like RGB565 are float in the tilebuffer but logically
    * normalized. We need to clamp on load to get proper blending semantics, as
    * the APIs require clamping here and nir_lower_blend (correctly) assumes
    * load_output is clamped. The spilled path is unaffected as the clamping
    * implicitly happens when roundtripping to memory.
    */
   if (f16)
      res = nir_fsat_to_format(b, res, logical_format);

   res = nir_sign_extend_if_sint(b, res, logical_format);
   return nir_pad_vector(b, res, load_comps);
}
151 
152 /*
153  * As a simple implementation, we use image load/store instructions to access
154  * spilled render targets. The driver will supply corresponding texture and PBE
155  * descriptors for each render target, accessed bindlessly
156  *
157  * Note that this lower happens after driver bindings are lowered, so the
158  * bindless handle is in the AGX-specific format.
159  */
160 static nir_def *
handle_for_rt(nir_builder * b,unsigned base,unsigned rt,bool pbe)161 handle_for_rt(nir_builder *b, unsigned base, unsigned rt, bool pbe)
162 {
163    unsigned index = base + (2 * rt) + (pbe ? 1 : 0);
164    return nir_load_texture_handle_agx(b, nir_imm_int(b, index));
165 }
166 
167 static enum glsl_sampler_dim
dim_for_rt(nir_builder * b,unsigned nr_samples,nir_def ** sample)168 dim_for_rt(nir_builder *b, unsigned nr_samples, nir_def **sample)
169 {
170    if (nr_samples == 1) {
171       *sample = nir_imm_intN_t(b, 0, 16);
172       return GLSL_SAMPLER_DIM_2D;
173    } else {
174       *sample = nir_u2u16(b, nir_load_sample_id(b));
175       b->shader->info.fs.uses_sample_shading = true;
176       return GLSL_SAMPLER_DIM_MS;
177    }
178 }
179 
180 static nir_def *
image_coords(nir_builder * b)181 image_coords(nir_builder *b)
182 {
183    nir_def *xy__ = nir_pad_vec4(b, nir_u2u32(b, nir_load_pixel_coord(b)));
184    return nir_vector_insert_imm(b, xy__, nir_load_layer_id(b), 2);
185 }
186 
/*
 * Store to a spilled render target with a bindless image store, predicated
 * on the coordinate being in bounds and (for MSAA) the sample being covered
 * by the rasterizer coverage and the optional sample mask.
 */
static void
store_memory(nir_builder *b, unsigned bindless_base, unsigned nr_samples,
             enum pipe_format format, unsigned rt, nir_def *value,
             nir_def *samples)
{
   /* PBE descriptor for the store, texture descriptor for the size query */
   nir_def *image = handle_for_rt(b, bindless_base, rt, true);
   nir_def *tex_image = handle_for_rt(b, bindless_base, rt, false);
   nir_def *zero = nir_imm_intN_t(b, 0, 16);
   nir_def *lod = zero;

   nir_def *sample;
   enum glsl_sampler_dim dim = dim_for_rt(b, nr_samples, &sample);
   nir_def *coords = image_coords(b);

   nir_def *size =
      nir_bindless_image_size(b, 3, 32, tex_image, nir_imm_int(b, 0),
                              .image_array = true, .image_dim = dim);

   /* Order this store against overlapping fragments below us */
   nir_begin_invocation_interlock(b);

   /* XXX: We should not get out-of-bounds image coords. Yet here we are :-/
    *
    * Fixes faults in:
    *
    * dEQP-VK.pipeline.monolithic.multisample.misc.dynamic_rendering.multi_renderpass.r8g8b8a8_unorm_r16g16b16a16_sfloat_r32g32b32a32_uint_d16_unorm.random_68
    *
    * which hits eMRT with multisampled image stores on an odd framebuffer size,
    * and we get coordinates that go all the way up to align((width,height),
    * (32,32)) despite setting scissor and such.
    *
    * XXX: needs more investigation, macOS seems to not choke on this so what
    * are we doing wrong?
    */
   nir_def *cond = nir_ball(b, nir_ult(b, nir_trim_vector(b, coords, 2),
                                       nir_trim_vector(b, size, 2)));

   if (nr_samples > 1) {
      /* Only store if this sample is covered (and in the sample mask) */
      nir_def *coverage = nir_load_sample_mask(b);

      if (samples != NULL)
         coverage = nir_iand(b, coverage, nir_u2u32(b, samples));

      nir_def *covered = nir_ubitfield_extract(
         b, coverage, nir_u2u32(b, sample), nir_imm_int(b, 1));

      cond = nir_iand(b, cond, nir_ine_imm(b, covered, 0));
   } else if (samples != NULL) {
      /* Single-sampled: the mask reduces to store-or-skip */
      cond = nir_iand(b, cond, nir_ine_imm(b, samples, 0));
   }

   nir_push_if(b, cond);
   {
      nir_bindless_image_store(b, image, coords, sample, value, lod,
                               .image_dim = dim, .image_array = true,
                               .format = format);
   }
   nir_pop_if(b, NULL);
}
245 
246 static nir_def *
load_memory(nir_builder * b,unsigned bindless_base,unsigned nr_samples,uint8_t comps,uint8_t bit_size,unsigned rt,enum pipe_format format)247 load_memory(nir_builder *b, unsigned bindless_base, unsigned nr_samples,
248             uint8_t comps, uint8_t bit_size, unsigned rt,
249             enum pipe_format format)
250 {
251    nir_def *image = handle_for_rt(b, bindless_base, rt, false);
252    nir_def *zero = nir_imm_intN_t(b, 0, 16);
253    nir_def *lod = zero;
254 
255    nir_def *sample;
256    enum glsl_sampler_dim dim = dim_for_rt(b, nr_samples, &sample);
257    nir_def *coords = image_coords(b);
258 
259    /* Ensure pixels below this one have written out their results */
260    nir_begin_invocation_interlock(b);
261 
262    return nir_bindless_image_load(
263       b, comps, bit_size, image, coords, sample, lod, .image_dim = dim,
264       .image_array = true, .format = format, .access = ACCESS_IN_BOUNDS_AGX);
265 }
266 
267 static nir_def *
tib_impl(nir_builder * b,nir_instr * instr,void * data)268 tib_impl(nir_builder *b, nir_instr *instr, void *data)
269 {
270    struct ctx *ctx = data;
271    struct agx_tilebuffer_layout *tib = ctx->tib;
272    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
273 
274    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
275    unsigned rt = sem.location - FRAG_RESULT_DATA0;
276    assert(rt < ARRAY_SIZE(tib->logical_format));
277 
278    enum pipe_format logical_format = tib->logical_format[rt];
279    enum pipe_format format = agx_tilebuffer_physical_format(tib, rt);
280    unsigned comps = util_format_get_nr_components(logical_format);
281 
282    if (intr->intrinsic == nir_intrinsic_store_output) {
283       ctx->outputs_written |= BITFIELD_BIT(rt);
284 
285       /* Only write components that actually exist */
286       uint16_t write_mask = (uint16_t)BITFIELD_MASK(comps);
287 
288       /* Delete stores to nonexistent render targets */
289       if (logical_format == PIPE_FORMAT_NONE)
290          return NIR_LOWER_INSTR_PROGRESS_REPLACE;
291 
292       /* Only write colours masked by the blend state */
293       if (ctx->colormasks)
294          write_mask &= ctx->colormasks[rt];
295 
296       /* Masked stores require a translucent pass type */
297       if (write_mask != BITFIELD_MASK(comps)) {
298          assert(ctx->translucent != NULL &&
299                 "colour masking requires translucency");
300 
301          assert(agx_tilebuffer_supports_mask(tib, rt));
302          *(ctx->translucent) = true;
303       }
304 
305       if (ctx->write_samples) {
306          assert(ctx->translucent != NULL &&
307                 "sample masking requires translucency");
308 
309          *(ctx->translucent) = true;
310       }
311 
312       /* But we ignore the NIR write mask for that, since it's basically an
313        * optimization hint.
314        */
315       if (agx_tilebuffer_supports_mask(tib, rt))
316          write_mask &= nir_intrinsic_write_mask(intr);
317 
318       /* Delete stores that are entirely masked out */
319       if (!write_mask)
320          return NIR_LOWER_INSTR_PROGRESS_REPLACE;
321 
322       nir_def *value = intr->src[0].ssa;
323 
324       /* Trim to format as required by hardware */
325       value = nir_trim_vector(b, intr->src[0].ssa, comps);
326 
327       if (tib->spilled[rt]) {
328          store_memory(b, ctx->bindless_base, tib->nr_samples, logical_format,
329                       rt, value, ctx->write_samples);
330          ctx->any_memory_stores = true;
331       } else {
332          store_tilebuffer(b, tib, format, logical_format, rt, value,
333                           ctx->write_samples, write_mask);
334       }
335 
336       return NIR_LOWER_INSTR_PROGRESS_REPLACE;
337    } else {
338       uint8_t bit_size = intr->def.bit_size;
339 
340       /* Loads from non-existent render targets are undefined in NIR but not
341        * possible to encode in the hardware, delete them.
342        */
343       if (logical_format == PIPE_FORMAT_NONE) {
344          return nir_undef(b, intr->num_components, bit_size);
345       } else if (tib->spilled[rt]) {
346          *(ctx->translucent) = true;
347 
348          return load_memory(b, ctx->bindless_base, tib->nr_samples,
349                             intr->num_components, bit_size, rt, logical_format);
350       } else {
351          return load_tilebuffer(b, tib, intr->num_components, bit_size, rt,
352                                 format, logical_format);
353       }
354    }
355 }
356 
357 bool
agx_nir_lower_tilebuffer(nir_shader * shader,struct agx_tilebuffer_layout * tib,uint8_t * colormasks,unsigned * bindless_base,nir_def * write_samples,bool * translucent)358 agx_nir_lower_tilebuffer(nir_shader *shader, struct agx_tilebuffer_layout *tib,
359                          uint8_t *colormasks, unsigned *bindless_base,
360                          nir_def *write_samples, bool *translucent)
361 {
362    assert(shader->info.stage == MESA_SHADER_FRAGMENT);
363 
364    struct ctx ctx = {
365       .tib = tib,
366       .colormasks = colormasks,
367       .translucent = translucent,
368       .write_samples = write_samples,
369    };
370 
371    /* Allocate 1 texture + 1 PBE descriptor for each spilled descriptor */
372    if (agx_tilebuffer_spills(tib)) {
373       assert(bindless_base != NULL && "must be specified if spilling");
374       ctx.bindless_base = *bindless_base;
375       *bindless_base += (AGX_MAX_RENDER_TARGETS * 2);
376    }
377 
378    bool progress =
379       nir_shader_lower_instructions(shader, tib_filter, tib_impl, &ctx);
380 
381    /* Flush at end */
382    if (ctx.any_memory_stores) {
383       nir_function_impl *impl = nir_shader_get_entrypoint(shader);
384       nir_builder b = nir_builder_at(nir_after_impl(impl));
385       nir_fence_pbe_to_tex_pixel_agx(&b);
386    }
387 
388    /* If there are any render targets bound to the framebuffer that aren't
389     * statically written by the fragment shader, that acts as an implicit mask
390     * and requires translucency.
391     *
392     * XXX: Could be optimized.
393     */
394    for (unsigned i = 0; i < ARRAY_SIZE(tib->logical_format); ++i) {
395       bool exists = tib->logical_format[i] != PIPE_FORMAT_NONE;
396       bool written = ctx.outputs_written & BITFIELD_BIT(i);
397 
398       if (translucent)
399          *translucent |= (exists && !written);
400    }
401 
402    return progress;
403 }
404