1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * Copyright 2020 Collabora Ltd.
4  * Copyright 2016 Broadcom
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "agx_compile.h"
9 #include "asahi/layout/layout.h"
10 #include "compiler/nir/nir_builder.h"
11 #include "util/bitset.h"
12 #include "util/glheader.h"
13 #include "util/list.h"
14 #include "util/macros.h"
15 #include "util/u_debug.h"
16 #include "util/u_dynarray.h"
17 #include "agx_builder.h"
18 #include "agx_compiler.h"
19 #include "agx_debug.h"
20 #include "agx_nir.h"
21 #include "glsl_types.h"
22 #include "nir.h"
23 #include "nir_builtin_builder.h"
24 #include "nir_intrinsics.h"
25 #include "nir_intrinsics_indices.h"
26 #include "shader_enums.h"
27 
28 /* Alignment for shader programs. I'm not sure what the optimal value is. */
29 #define AGX_CODE_ALIGN 0x100
30 
31 /* clang-format off */
32 static const struct debug_named_value agx_debug_options[] = {
33    {"shaders",   AGX_DBG_SHADERS,	"Dump shaders in NIR and AIR"},
34    {"shaderdb",  AGX_DBG_SHADERDB,	"Print statistics"},
35    {"verbose",   AGX_DBG_VERBOSE,	"Disassemble verbosely"},
36    {"internal",  AGX_DBG_INTERNAL,	"Dump even internal shaders"},
37    {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"},
38    {"noopt",     AGX_DBG_NOOPT,     "Disable backend optimizations"},
39    {"wait",      AGX_DBG_WAIT,      "Wait after all async instructions"},
40    {"nopreamble",AGX_DBG_NOPREAMBLE,"Do not use shader preambles"},
41    {"demand",    AGX_DBG_DEMAND,    "Bound tightly to register demand"},
42    {"nosched",   AGX_DBG_NOSCHED,   "Do not schedule the shader"},
43    {"spill",     AGX_DBG_SPILL,     "Spill (almost) everything"},
44    {"nopromote", AGX_DBG_NOPROMOTE, "Do not promote constants to uniforms"},
45    DEBUG_NAMED_VALUE_END
46 };
47 /* clang-format on */
48 
49 DEBUG_GET_ONCE_FLAGS_OPTION(agx_compiler_debug, "AGX_MESA_DEBUG",
50                             agx_debug_options, 0)
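
/* Flags are read from the AGX_MESA_DEBUG environment variable as a
 * comma-separated list, e.g. AGX_MESA_DEBUG=shaders,shaderdb to dump shaders
 * and print statistics.
 */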
51 
52 int agx_compiler_debug = 0;
53 
54 uint64_t
55 agx_get_compiler_debug(void)
56 {
57    return debug_get_option_agx_compiler_debug();
58 }
59 
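/* Hardware-preloaded values (vertex ID in r10, instance ID in r12) are read
 * with a preload pseudo-instruction at the top of the start block. Cache the
 * result so repeated queries reuse a single preload.
 */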
60 static agx_index
61 agx_cached_preload(agx_context *ctx, unsigned base, enum agx_size size)
62 {
63    if (agx_is_null(ctx->preloaded[base])) {
64       agx_block *block = agx_start_block(ctx);
65       agx_builder b = agx_init_builder(ctx, agx_before_block(block));
66       ctx->preloaded[base] = agx_preload(&b, agx_register(base, size));
67    }
68 
69    return ctx->preloaded[base];
70 }
71 
72 static agx_index
73 agx_vertex_id(agx_builder *b)
74 {
75    return agx_cached_preload(b->shader, 10, AGX_SIZE_32);
76 }
77 
78 static agx_index
79 agx_instance_id(agx_builder *b)
80 {
81    return agx_cached_preload(b->shader, 12, AGX_SIZE_32);
82 }
83 
84 #define VARYING_NUM_COMPONENTS (VARYING_SLOT_MAX * 4)
85 
86 struct coefficient_info {
87    BITSET_DECLARE(smooth, VARYING_NUM_COMPONENTS);
88    BITSET_DECLARE(flat, VARYING_NUM_COMPONENTS);
89    BITSET_DECLARE(noperspective, VARYING_NUM_COMPONENTS);
90 };
91 
92 static BITSET_WORD *
93 bitset_for_interp(struct coefficient_info *info, enum glsl_interp_mode mode)
94 {
95    /* clang-format off */
96    switch (mode) {
97    case INTERP_MODE_NONE:
98    case INTERP_MODE_SMOOTH:         return info->smooth;
99    case INTERP_MODE_NOPERSPECTIVE:  return info->noperspective;
100    case INTERP_MODE_FLAT:           return info->flat;
101    default:                         unreachable("invalid interp mode");
102    }
103    /* clang-format on */
104 }
105 
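/* Gather which varying components the fragment shader reads, and with which
 * interpolation mode, by walking the coefficient/interpolated-input/frag-coord
 * intrinsics. The result drives coefficient register assignment below.
 */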
106 static bool
107 gather_cf(nir_builder *b, nir_intrinsic_instr *intr, void *data)
108 {
109    /* First handle frag coord loads */
110    struct coefficient_info *info = data;
111    if (intr->intrinsic == nir_intrinsic_load_frag_coord_zw) {
112       BITSET_SET(info->noperspective,
113                  VARYING_SLOT_POS + nir_intrinsic_component(intr));
114       return false;
115    }
116 
117    /* Look for input loads and grab the instruction with the interp mode */
118    nir_intrinsic_instr *bary;
119    unsigned nr = 1;
120 
121    if (intr->intrinsic == nir_intrinsic_load_coefficients_agx) {
122       bary = intr;
123       /* Always load a scalar */
124    } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
125       bary = nir_src_as_intrinsic(intr->src[0]);
126       nr = intr->num_components;
127 
128       /* Perspective interpolation internally reads W */
129       if (nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE)
130          BITSET_SET(info->noperspective, VARYING_SLOT_POS + 3);
131    } else {
132       return false;
133    }
134 
135    BITSET_WORD *set = bitset_for_interp(data, nir_intrinsic_interp_mode(bary));
136    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
137    nir_src *offset = nir_get_io_offset_src(intr);
138 
139    /* Mark the exact range for direct loads to minimize CF registers, but mark a
140     * conservative bounding range for indirect array access.
141     */
142    if (nir_src_is_const(*offset)) {
143       unsigned location = sem.location + nir_src_as_uint(*offset);
144       unsigned start_comp = (location * 4) + nir_intrinsic_component(intr);
145 
146       BITSET_SET_RANGE(set, start_comp, start_comp + nr - 1);
147    } else {
148       unsigned start_comp = (sem.location * 4) + nir_intrinsic_component(intr);
149       bool compact = sem.location == VARYING_SLOT_CLIP_DIST0 ||
150                      sem.location == VARYING_SLOT_CLIP_DIST1;
151       unsigned stride = compact ? 1 : 4;
152 
153       /* For now we have to assign CF for the whole vec4 to make indirect
154        * indexing work. This could be optimized later.
155        */
156       nr = stride;
157 
158       for (unsigned i = 0; i < sem.num_slots; ++i) {
159          BITSET_SET_RANGE(set, start_comp + (i * stride),
160                           start_comp + (i * stride) + nr - 1);
161       }
162    }
163 
164    return false;
165 }
166 
167 /*
168  * We assign all coefficient registers up front to ensure we have a consistent
169  * layout required for indirects to work.
170  */
171 static void
172 assign_coefficient_regs(nir_shader *nir, struct agx_varyings_fs *var)
173 {
174    struct coefficient_info info = {0};
175    nir_shader_intrinsics_pass(nir, gather_cf, nir_metadata_all, &info);
176 
177    /* W */
178    if (BITSET_TEST(info.noperspective, VARYING_SLOT_POS + 3)) {
179       var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
180          .cf_base = var->nr_cf++,
181          .slot = VARYING_SLOT_POS,
182          .offset = 3,
183          .count = 1,
184          .smooth = true,
185       };
186    }
187 
188    /* Z */
189    if (BITSET_TEST(info.noperspective, VARYING_SLOT_POS + 2)) {
190       var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
191          .cf_base = var->nr_cf++,
192          .slot = VARYING_SLOT_POS,
193          .offset = 2,
194          .count = 1,
195          .smooth = true,
196       };
197 
198       var->reads_z = true;
199    }
200 
201    static_assert(VARYING_SLOT_POS == 0, "special and handled first");
202 
203    for (unsigned i = VARYING_SLOT_POS + 1; i < VARYING_SLOT_MAX; ++i) {
204       bool smooth = BITSET_TEST_RANGE(info.smooth, i * 4, (i * 4) + 3);
205       bool flat = BITSET_TEST_RANGE(info.flat, i * 4, (i * 4) + 3);
206       bool noperspective =
207          BITSET_TEST_RANGE(info.noperspective, i * 4, (i * 4) + 3);
208 
209       if (!(smooth || flat || noperspective))
210          continue;
211 
212       /* From the GLSL 4.60 spec ("Input Layout Qualifiers"):
213        *
214        *    when location aliasing, the aliases sharing the location must have
215        *    the same underlying numerical type and bit width (floating-point or
216        *    integer, 32-bit versus 64-bit, etc.) and the same auxiliary storage
217        *    and interpolation qualification.
218        *
219        * SPIR-V should obey this as well although the spec text is muddier.
220        */
221       assert((smooth + flat + noperspective) == 1 &&
222              "slots must have consistent interpolation");
223 
224       BITSET_WORD *set = smooth ? info.smooth
225                          : flat ? info.flat
226                                 : info.noperspective;
227 
228       /* Find the start offset */
229       unsigned offset = 0;
230       for (offset = 0; offset < 4 && !BITSET_TEST(set, (i * 4) + offset);
231            ++offset)
232          ;
233 
234       /* Find the end offset. TODO: Do we ever need to split into two bindings
235        * to handle e.g. x_zw read masks?
236        */
237       unsigned count = 0;
238       for (unsigned c = offset; c < 4; ++c) {
239          if (BITSET_TEST(set, (i * 4) + c))
240             count = c - offset + 1;
241       }
242       assert(count >= 1 && (count + offset) <= 4);
243 
244       var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
245          .cf_base = var->nr_cf,
246          .slot = i,
247          .offset = offset,
248          .count = count,
249          .smooth = !flat,
250          .perspective = smooth,
251       };
252 
253       var->nr_cf += count;
254    }
255 }
256 
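/* Look up the coefficient register index preassigned by
 * assign_coefficient_regs for a given varying slot and component.
 */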
257 static agx_index
258 agx_get_cf(agx_context *ctx, gl_varying_slot slot, unsigned offset)
259 {
260    struct agx_varyings_fs *varyings = &ctx->out->varyings.fs;
261 
262    /* We already have an appropriate binding, find it */
263    for (unsigned b = 0; b < varyings->nr_bindings; ++b) {
264       if (varyings->bindings[b].slot == slot &&
265           (slot != VARYING_SLOT_POS ||
266            offset == varyings->bindings[b].offset)) {
267 
268          signed cf_offset = offset - varyings->bindings[b].offset;
269          assert(cf_offset >= 0);
270 
271          return agx_immediate(varyings->bindings[b].cf_base + cf_offset);
272       }
273    }
274 
275    unreachable("all coefficient registers preassigned");
276 }
277 
278 /* Builds a 64-bit hash table key for an index */
279 static uint64_t
280 agx_index_to_key(agx_index idx)
281 {
282    STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));
283 
284    uint64_t key = 0;
285    memcpy(&key, &idx, sizeof(idx));
286    return key;
287 }
288 
289 /*
290  * Extract a single channel out of a vector source. We split vectors with
291  * p_split so we can use the split components directly, without emitting a
292  * machine instruction. This has advantages for RA, as the split can usually be
293  * optimized away.
294  */
295 static agx_index
296 agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
297 {
298    agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
299                                                        agx_index_to_key(vec));
300 
301    assert(components != NULL && "missing agx_emit_collect_to");
302 
303    return components[channel];
304 }
305 
306 static agx_index
307 agx_extract_nir_src(agx_builder *b, nir_src src, unsigned channel)
308 {
309    agx_index idx = agx_src_index(&src);
310 
311    /* We only deal with scalars, extract a single scalar if needed */
312    if (nir_src_num_components(src) > 1)
313       return agx_emit_extract(b, idx, channel);
314    else
315       return idx;
316 }
317 
318 static void
319 agx_cache_collect(agx_builder *b, agx_index dst, unsigned nr_srcs,
320                   agx_index *srcs)
321 {
322    /* Lifetime of a hash table entry has to be at least as long as the table */
323    agx_index *channels = ralloc_array(b->shader, agx_index, nr_srcs);
324 
325    for (unsigned i = 0; i < nr_srcs; ++i)
326       channels[i] = srcs[i];
327 
328    _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
329                                channels);
330 }
331 
332 /*
333  * Combine multiple scalars into a vector destination. This corresponds to
334  * collect, lowered to moves (a shuffle in general) after register allocation.
335  *
336  * To optimize vector extractions, we record the individual channels.
337  */
338 static agx_instr *
339 agx_emit_collect_to(agx_builder *b, agx_index dst, unsigned nr_srcs,
340                     agx_index *srcs)
341 {
342    agx_cache_collect(b, dst, nr_srcs, srcs);
343 
344    if (nr_srcs == 1)
345       return agx_mov_to(b, dst, srcs[0]);
346 
347    agx_instr *I = agx_collect_to(b, dst, nr_srcs);
348 
349    agx_foreach_src(I, s)
350       I->src[s] = srcs[s];
351 
352    return I;
353 }
354 
355 static agx_index
356 agx_emit_collect(agx_builder *b, unsigned nr_srcs, agx_index *srcs)
357 {
358    agx_index dst = agx_vec_temp(b->shader, srcs[0].size, nr_srcs);
359    agx_emit_collect_to(b, dst, nr_srcs, srcs);
360    return dst;
361 }
362 
363 static agx_index
364 agx_vec2(agx_builder *b, agx_index s0, agx_index s1)
365 {
366    return agx_emit_collect(b, 2, (agx_index[]){s0, s1});
367 }
368 
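/* Rebuild a NIR vector source as a single AGX vector: extract each scalar
 * channel and collect them back into consecutive components, for instructions
 * that consume a whole vector.
 */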
369 static agx_index
370 agx_recollect_vector(agx_builder *b, nir_src vec)
371 {
372    agx_index comps[4];
373    unsigned nr = nir_src_num_components(vec);
374 
375    for (unsigned i = 0; i < nr; ++i)
376       comps[i] = agx_extract_nir_src(b, vec, i);
377 
378    return agx_emit_collect(b, nr, comps);
379 }
380 
381 /*
382  * Extract the lower or upper N-bits from a (2*N)-bit quantity. We use a split
383  * without null destinations to let us CSE (and coalesce) the splits when both x
384  * and y are split.
385  */
386 static agx_instr *
387 agx_subdivide_to(agx_builder *b, agx_index dst, agx_index s0, unsigned comp)
388 {
389    assert((s0.size == (dst.size + 1)) && "only 2x subdivide handled");
390    assert((comp == 0 || comp == 1) && "too many components");
391 
392    /* Handle immediates specially so we don't have to constant fold splits. */
393    if (s0.type == AGX_INDEX_IMMEDIATE) {
394       unsigned bits = 16 * agx_size_align_16(dst.size);
395       return agx_mov_imm_to(b, dst, (s0.value >> (comp * bits)) & BITFIELD64_MASK(bits));
396    }
397 
398    agx_instr *split = agx_split(b, 2, s0);
399    split->dest[comp] = dst;
400    split->dest[1 - comp] = agx_temp(b->shader, dst.size);
401    return split;
402 }
403 
404 void
405 agx_block_add_successor(agx_block *block, agx_block *successor)
406 {
407    assert(block != NULL && successor != NULL);
408 
409    /* Cull impossible edges */
410    if (block->unconditional_jumps)
411       return;
412 
413    for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
414       if (block->successors[i]) {
415          if (block->successors[i] == successor)
416             return;
417          else
418             continue;
419       }
420 
421       block->successors[i] = successor;
422       util_dynarray_append(&successor->predecessors, agx_block *, block);
423       return;
424    }
425 
426    unreachable("Too many successors");
427 }
428 
429 /*
430  * Splits an n-component vector (vec) into n scalar destinations (dests) using a
431  * split pseudo-instruction.
432  *
433  * Pre-condition: dests is filled with agx_null().
434  */
435 static void
436 agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n)
437 {
438    agx_instr *I = agx_split(b, n, vec);
439 
440    agx_foreach_dest(I, d) {
441       dests[d] = agx_temp(b->shader, vec.size);
442       I->dest[d] = dests[d];
443    }
444 }
445 
446 static void
447 agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
448 {
449    agx_index dests[4] = {agx_null(), agx_null(), agx_null(), agx_null()};
450    agx_emit_split(b, dests, vec, n);
451    agx_cache_collect(b, vec, n, dests);
452 }
453 
454 static void
455 agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
456 {
457    /* Ensure we've been scalarized and bit size lowered */
458    unsigned bit_size = instr->def.bit_size;
459    assert(instr->def.num_components == 1);
460 
461    /* Emit move, later passes can inline/push if useful */
462    agx_mov_imm_to(b, agx_def_index(&instr->def),
463                   nir_const_value_as_uint(instr->value[0], bit_size));
464 }
465 
466 /*
467  * Implement mul_high of 32-bit sources by doing a 32x32->64-bit multiply and
468  * extracting only the high word.
469  */
470 static agx_instr *
471 agx_mul_high_to(agx_builder *b, agx_index dst, agx_index P, agx_index Q,
472                 bool is_signed)
473 {
474    assert(P.size == Q.size && "source sizes must match");
475    assert(P.size == dst.size && "dest size must match");
476    assert(P.size != AGX_SIZE_64 && "64x64 multiply should have been lowered");
477 
478    static_assert(AGX_SIZE_64 == (AGX_SIZE_32 + 1), "enum wrong");
479    static_assert(AGX_SIZE_32 == (AGX_SIZE_16 + 1), "enum wrong");
480 
481    if (!is_signed) {
482       P = agx_abs(P);
483       Q = agx_abs(Q);
484    }
485 
486    agx_index product = agx_temp(b->shader, P.size + 1);
487    agx_imad_to(b, product, P, Q, agx_zero(), 0);
488 
489    return agx_subdivide_to(b, dst, product, 1);
490 }
491 
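/* Load/store intrinsics carry their memory format as a pipe_format holding an
 * AIL_ISA_FORMAT_* value; translate it to the hardware format enum.
 */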
492 static enum agx_format
493 agx_format_for_pipe(enum pipe_format format)
494 {
495 #define CASE(x)                                                                \
496    if (format == (enum pipe_format)AIL_ISA_FORMAT_##x)                         \
497       return AGX_FORMAT_##x;
498 
499    CASE(I8);
500    CASE(I16);
501    CASE(I32);
502    CASE(F16);
503    CASE(U8NORM);
504    CASE(S8NORM);
505    CASE(U16NORM);
506    CASE(S16NORM);
507    CASE(RGB10A2);
508    CASE(SRGBA8);
509    CASE(RG11B10F);
510    CASE(RGB9E5);
511 
512 #undef CASE
513    unreachable("Invalid format");
514 }
515 
516 static agx_index
517 cf_for_intrinsic(agx_builder *b, nir_intrinsic_instr *intr)
518 {
519    /* Determine the base location, taking into account a constant offset */
520    unsigned location = nir_intrinsic_io_semantics(intr).location;
521    bool compact = location == VARYING_SLOT_CLIP_DIST0 ||
522                   location == VARYING_SLOT_CLIP_DIST1;
523 
524    nir_src *offset = nir_get_io_offset_src(intr);
525    if (nir_src_is_const(*offset)) {
526       /* XXX: NIR is broken and uses constant offsets in slots but dynamic
527        * offsets in scalars for compact varyings. This needs to be fixed
528        * upstream.
529        */
530       location += nir_src_as_uint(*offset);
531    }
532 
533    agx_index I = agx_get_cf(b->shader, location, nir_intrinsic_component(intr));
534 
535    /* If we have a non-constant offset, we add it to the CF. Offsets are in
536     * vec4 slots (unless we're compact) but the CF is in components, so we need
537     * to shift the offset by 2 before adding.
538     */
539    if (!nir_src_is_const(*offset)) {
540       I = agx_iadd(b, I, agx_src_index(offset), compact ? 0 : 2);
541    }
542 
543    return I;
544 }
545 
546 static enum agx_interpolation
547 agx_interp_for_bary(nir_intrinsic_instr *bary, agx_index *sample_index)
548 {
549    switch (bary->intrinsic) {
550    case nir_intrinsic_load_barycentric_pixel:
551       return AGX_INTERPOLATION_CENTER;
552 
553    case nir_intrinsic_load_barycentric_centroid:
554       return AGX_INTERPOLATION_CENTROID;
555 
556    case nir_intrinsic_load_barycentric_at_sample:
557       *sample_index = agx_src_index(&bary->src[0]);
558       return AGX_INTERPOLATION_SAMPLE;
559 
560    default:
561       unreachable("should have been lowered");
562    }
563 }
564 
565 static void
566 agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
567 {
568    ASSERTED unsigned components = instr->num_components;
569    nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
570 
571    assert(components >= 1 && components <= 4);
572 
573    agx_index sample_index = agx_zero();
574    enum agx_interpolation interp = agx_interp_for_bary(bary, &sample_index);
575 
576    bool perspective =
577       nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE;
578 
579    agx_index I = cf_for_intrinsic(b, instr);
580 
581    /* For perspective interpolation, we project (multiply by 1/W) */
582    if (perspective) {
583       agx_index J = agx_get_cf(b->shader, VARYING_SLOT_POS, 3);
584       agx_iterproj_to(b, dest, I, J, sample_index, components, interp);
585    } else {
586       agx_iter_to(b, dest, I, sample_index, components, interp);
587    }
588 
589    agx_emit_cached_split(b, dest, components);
590 }
591 
592 static agx_instr *
593 agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
594 {
595    bool explicit = nir_intrinsic_explicit_coord(instr);
596 
597    /* TODO: Reverse-engineer interactions with MRT */
598    if (b->shader->stage == MESA_SHADER_FRAGMENT) {
599       if (b->shader->key->fs.ignore_tib_dependencies) {
600          assert(b->shader->nir->info.internal && "only for clear shaders");
601       } else if (b->shader->did_writeout) {
602          agx_wait_pix(b, 0x0004);
603       } else {
604          agx_wait_pix(b, 0x000C);
605       }
606    }
607 
608    /* Compact the registers according to the mask */
609    agx_index compacted[4] = {agx_null()};
610 
611    unsigned compact_count = 0;
612    u_foreach_bit(i, nir_intrinsic_write_mask(instr)) {
613       compacted[compact_count++] = agx_extract_nir_src(b, instr->src[0], i);
614    }
615 
616    agx_index collected = agx_emit_collect(b, compact_count, compacted);
617    agx_index coords = explicit ? agx_src_index(&instr->src[2]) : agx_null();
618 
619    b->shader->did_writeout = true;
620    b->shader->out->tag_write_disable = false;
621    return agx_st_tile(b, collected, agx_src_index(&instr->src[1]), coords,
622                       agx_format_for_pipe(nir_intrinsic_format(instr)),
623                       nir_intrinsic_write_mask(instr),
624                       nir_intrinsic_base(instr), explicit);
625 }
626 
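/* store_zs_agx packs its writemask into the intrinsic base: bit 0 enables the
 * depth write and bit 1 the stencil write.
 */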
627 static agx_instr *
628 agx_emit_store_zs(agx_builder *b, nir_intrinsic_instr *instr)
629 {
630    unsigned base = nir_intrinsic_base(instr);
631    bool write_z = base & 1;
632    bool write_s = base & 2;
633 
634    /* TODO: Handle better */
635    assert(!b->shader->key->fs.ignore_tib_dependencies && "not used");
636    agx_wait_pix(b, 0x0001);
637 
638    agx_index z = agx_src_index(&instr->src[1]);
639    agx_index s = agx_src_index(&instr->src[2]);
640 
641    assert(!write_z || z.size == AGX_SIZE_32);
642    assert(!write_s || s.size == AGX_SIZE_16);
643 
644    if (write_z && write_s) {
645       agx_index u2u32 = agx_temp(b->shader, AGX_SIZE_32);
646       agx_mov_to(b, u2u32, s);
647       s = u2u32;
648    }
649 
650    agx_index zs = (write_z && write_s) ? agx_vec2(b, z, s) : write_z ? z : s;
651 
652    /* Not necessarily a sample mask but overlapping hw mechanism... Should
653     * maybe rename this flag to something more general.
654     */
655    b->shader->out->writes_sample_mask = true;
656 
657    return agx_zs_emit(b, agx_src_index(&instr->src[0]), zs, base);
658 }
659 
660 static void
661 agx_emit_local_load_pixel(agx_builder *b, agx_index dest,
662                           nir_intrinsic_instr *instr)
663 {
664    /* TODO: Reverse-engineer interactions with MRT */
665    assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage");
666    agx_wait_pix(b, 0x0008);
667    b->shader->did_writeout = true;
668 
669    unsigned nr_comps = instr->def.num_components;
670    agx_ld_tile_to(b, dest, agx_src_index(&instr->src[0]), agx_null(),
671                   agx_format_for_pipe(nir_intrinsic_format(instr)),
672                   BITFIELD_MASK(nr_comps), nir_intrinsic_base(instr), false);
673    agx_emit_cached_split(b, dest, nr_comps);
674 }
675 
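/* Global loads take a 64-bit base address plus an offset that is scaled by
 * the intrinsic base (passed as the shift) and zero- or sign-extended as the
 * intrinsic requests.
 */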
676 static void
677 agx_emit_load(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
678 {
679    agx_index addr = agx_src_index(&instr->src[0]);
680    agx_index offset = agx_src_index(&instr->src[1]);
681    enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
682    unsigned shift = nir_intrinsic_base(instr);
683 
684    /* Zero-extend offset if we're not sign-extending */
685    if (!nir_intrinsic_sign_extend(instr))
686       offset = agx_abs(offset);
687 
688    agx_device_load_to(b, dest, addr, offset, fmt,
689                       BITFIELD_MASK(instr->def.num_components), shift);
690    agx_emit_cached_split(b, dest, instr->def.num_components);
691 }
692 
693 static void
694 agx_emit_store(agx_builder *b, nir_intrinsic_instr *instr)
695 {
696    agx_index addr = agx_src_index(&instr->src[1]);
697    agx_index offset = agx_src_index(&instr->src[2]);
698    enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
699    unsigned shift = nir_intrinsic_base(instr);
700 
701    /* Zero-extend offset if we're not sign-extending */
702    if (!nir_intrinsic_sign_extend(instr))
703       offset = agx_abs(offset);
704 
705    agx_device_store(b, agx_recollect_vector(b, instr->src[0]), addr, offset,
706                     fmt, BITFIELD_MASK(nir_src_num_components(instr->src[0])),
707                     shift);
708 }
709 
710 /* Preambles write directly to uniform registers, so move from uniform to GPR */
711 static agx_instr *
712 agx_emit_load_preamble(agx_builder *b, agx_index dst,
713                        nir_intrinsic_instr *instr)
714 {
715    agx_index srcs[4] = {agx_null()};
716    unsigned dim = instr->def.num_components;
717    assert(dim <= ARRAY_SIZE(srcs) && "shouldn't see larger vectors");
718 
719    unsigned base = nir_intrinsic_base(instr);
720    unsigned stride = agx_size_align_16(dst.size);
721 
722    for (unsigned i = 0; i < dim; ++i)
723       srcs[i] = agx_uniform(base + i * stride, dst.size);
724 
725    return agx_emit_collect_to(b, dst, dim, srcs);
726 }
727 
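/* Uniforms are stored in chunks of at most (4 / stride) components at a time,
 * where stride is the component size in halfwords.
 */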
728 static agx_instr *
729 agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
730 {
731    agx_index vec = agx_src_index(&instr->src[0]);
732    unsigned base = nir_intrinsic_base(instr);
733    unsigned stride = agx_size_align_16(vec.size);
734    unsigned nr = nir_src_num_components(instr->src[0]);
735 
736    for (unsigned i = 0; i < nr; i += (4 / stride)) {
737       agx_index data[4] = {0};
738       unsigned count = MIN2(4 / stride, nr - i);
739 
740       for (unsigned c = 0; c < count; ++c) {
741          data[c] = agx_extract_nir_src(b, instr->src[0], i + c);
742       }
743 
744       agx_uniform_store(b, agx_emit_collect(b, count, data),
745                         agx_immediate(base + i * stride), BITFIELD_MASK(count));
746    }
747 
748    return NULL;
749 }
750 
751 static enum agx_dim
752 agx_tex_dim(enum glsl_sampler_dim dim, bool array)
753 {
754    switch (dim) {
755    case GLSL_SAMPLER_DIM_1D:
756       return array ? AGX_DIM_1D_ARRAY : AGX_DIM_1D;
757 
758    case GLSL_SAMPLER_DIM_2D:
759    case GLSL_SAMPLER_DIM_RECT:
760    case GLSL_SAMPLER_DIM_EXTERNAL:
761       return array ? AGX_DIM_2D_ARRAY : AGX_DIM_2D;
762 
763    case GLSL_SAMPLER_DIM_MS:
764       return array ? AGX_DIM_2D_MS_ARRAY : AGX_DIM_2D_MS;
765 
766    case GLSL_SAMPLER_DIM_3D:
767       assert(!array && "3D arrays unsupported");
768       return AGX_DIM_3D;
769 
770    case GLSL_SAMPLER_DIM_CUBE:
771       return array ? AGX_DIM_CUBE_ARRAY : AGX_DIM_CUBE;
772 
773    case GLSL_SAMPLER_DIM_BUF:
774       unreachable("Buffer textures should have been lowered");
775 
776    default:
777       unreachable("Invalid sampler dim\n");
778    }
779 }
780 
781 /*
782  * In the hardware, bindless texture sources are specified as a 64-bit uniform
783  * base address summed with a 32-bit register index. In NIR, we model this as a
784  * vec2, where the first source is the (constant) uniform register number and
785  * the second source is the (dynamic) byte offset.
786  */
787 static agx_index
788 agx_translate_bindless_handle(agx_builder *b, nir_src *handle, agx_index *base)
789 {
790    nir_scalar base_scalar = nir_scalar_resolved(handle->ssa, 0);
791    assert(nir_scalar_is_const(base_scalar) && "base must be constant");
792 
793    unsigned base_uint = nir_scalar_as_uint(base_scalar);
794    *base = agx_uniform(base_uint, AGX_SIZE_64);
795 
796    return agx_emit_extract(b, agx_src_index(handle), 1);
797 }
798 
799 static agx_instr *
800 agx_emit_block_image_store(agx_builder *b, nir_intrinsic_instr *instr)
801 {
802    agx_index offset = agx_src_index(&instr->src[1]);
803    agx_index coords = agx_src_index(&instr->src[2]);
804    enum agx_format format = agx_format_for_pipe(nir_intrinsic_format(instr));
805 
806    bool ms = nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_MS;
807    bool array = nir_intrinsic_image_array(instr);
808    enum agx_dim dim = agx_tex_dim(nir_intrinsic_image_dim(instr), array);
809    bool explicit = nir_intrinsic_explicit_coord(instr);
810 
811    /* 32-bit source physically, 16-bit in NIR, top half ignored but needed
812     * logically to ensure alignment.
813     */
814    offset = agx_vec2(b, offset, agx_undef(AGX_SIZE_16));
815    offset.channels_m1--;
816    offset.size = AGX_SIZE_32;
817 
818    /* Modified coordinate descriptor */
819    if (!explicit) {
820       if (array) {
821          agx_index layer = coords;
822          coords = agx_temp(b->shader, AGX_SIZE_32);
823          agx_emit_collect_to(b, coords, 2,
824                              (agx_index[]){
825                                 ms ? agx_mov_imm(b, 16, 0) : layer,
826                                 ms ? layer : agx_undef(AGX_SIZE_16),
827                              });
828       } else {
829          coords = agx_null();
830       }
831    }
832 
833    agx_index base, index;
834    if (instr->intrinsic == nir_intrinsic_bindless_image_store_block_agx) {
835       index = agx_translate_bindless_handle(b, &instr->src[0], &base);
836 
837       assert(base.size == AGX_SIZE_64);
838       assert(index.size == AGX_SIZE_32);
839    } else {
840       base = agx_zero();
841       index = agx_src_index(&instr->src[0]);
842 
843       assert(index.size == AGX_SIZE_16);
844    }
845 
846    // XXX: how does this possibly work
847    if (format == AGX_FORMAT_F16)
848       format = AGX_FORMAT_I16;
849 
850    return agx_block_image_store(b, base, index, offset, coords, format, dim,
851                                 explicit);
852 }
853 
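/* Compute IDs and sizes are exposed as up to three consecutive special
 * registers starting at 'base'; collect the requested components into the
 * destination vector.
 */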
854 static agx_instr *
855 agx_load_compute_dimension(agx_builder *b, agx_index dst,
856                            nir_intrinsic_instr *instr, enum agx_sr base)
857 {
858    unsigned dim = instr->def.num_components;
859    unsigned size = instr->def.bit_size;
860    assert(size == 16 || size == 32);
861 
862    agx_index srcs[] = {
863       agx_get_sr(b, size, base + 0),
864       agx_get_sr(b, size, base + 1),
865       agx_get_sr(b, size, base + 2),
866    };
867 
868    return agx_emit_collect_to(b, dst, dim, srcs);
869 }
870 
871 static enum agx_atomic_opc
872 translate_atomic_opcode(nir_atomic_op op)
873 {
874    /* clang-format off */
875    switch (op) {
876    case nir_atomic_op_iadd:    return AGX_ATOMIC_OPC_ADD;
877    case nir_atomic_op_imin:    return AGX_ATOMIC_OPC_IMIN;
878    case nir_atomic_op_umin:    return AGX_ATOMIC_OPC_UMIN;
879    case nir_atomic_op_imax:    return AGX_ATOMIC_OPC_IMAX;
880    case nir_atomic_op_umax:    return AGX_ATOMIC_OPC_UMAX;
881    case nir_atomic_op_iand:    return AGX_ATOMIC_OPC_AND;
882    case nir_atomic_op_ior:     return AGX_ATOMIC_OPC_OR;
883    case nir_atomic_op_ixor:    return AGX_ATOMIC_OPC_XOR;
884    case nir_atomic_op_xchg:    return AGX_ATOMIC_OPC_XCHG;
885    case nir_atomic_op_cmpxchg: return AGX_ATOMIC_OPC_CMPXCHG;
886    default: unreachable("unknown atomic opcode");
887    }
888    /* clang-format on */
889 }
890 
891 /*
892  * The "base" of a local load/store/atomic can be zero but no other immediates.
893  * This would be a little silly to handle when inlining immediates, so we
894  * instead exclude these ops from immediate inlining and just handle 0 specially
895  * when translating.
896  */
897 static agx_index
898 agx_local_base(nir_src src)
899 {
900    if (nir_src_is_const(src) && nir_src_as_uint(src) == 0)
901       return agx_zero();
902    else
903       return agx_src_index(&src);
904 }
905 
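/* Translate global and shared (local) atomics. Local atomics use a 16-bit
 * shared-memory base and no index source; cmpxchg additionally packs its two
 * data operands into consecutive registers.
 */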
906 static void
907 agx_emit_atomic(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr,
908                 bool local)
909 {
910    enum agx_atomic_opc op =
911       translate_atomic_opcode(nir_intrinsic_atomic_op(instr));
912    agx_index base =
913       local ? agx_local_base(instr->src[0]) : agx_src_index(&instr->src[0]);
914    agx_index value = agx_src_index(&instr->src[local ? 1 : 2]);
915    agx_index index = local ? agx_zero() : agx_src_index(&instr->src[1]);
916 
917    /* cmpxchg (only) takes 2 sources, passed in consecutive registers */
918    if (op == AGX_ATOMIC_OPC_CMPXCHG) {
919       agx_index value2 = agx_src_index(&instr->src[local ? 2 : 3]);
920       value = agx_vec2(b, value2, value);
921    }
922 
923    if (local) {
924       assert(base.size == AGX_SIZE_16);
925       agx_local_atomic_to(b, dst, value, base, index, op);
926    } else {
927       assert(base.size == AGX_SIZE_64);
928       agx_atomic_to(b, dst, value, base, index, op);
929    }
930 }
931 
932 static enum agx_format
933 format_for_bitsize(unsigned bitsize)
934 {
935    switch (bitsize) {
936    case 8:
937       return AGX_FORMAT_I8;
938    case 16:
939       return AGX_FORMAT_I16;
940    case 32:
941       return AGX_FORMAT_I32;
942    default:
943       unreachable("should've been lowered");
944    }
945 }
946 
947 static void
948 agx_emit_local_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
949 {
950    agx_index base = agx_local_base(instr->src[0]);
951    agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
952    assert(base.size == AGX_SIZE_16);
953 
954    enum agx_format format = format_for_bitsize(instr->def.bit_size);
955    unsigned nr = instr->def.num_components;
956    unsigned mask = BITFIELD_MASK(nr);
957 
958    agx_local_load_to(b, dst, base, index, format, mask);
959    agx_emit_cached_split(b, dst, nr);
960 }
961 
962 static void
963 agx_emit_local_store(agx_builder *b, nir_intrinsic_instr *instr)
964 {
965    agx_index value = agx_src_index(&instr->src[0]);
966    agx_index base = agx_local_base(instr->src[1]);
967    agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
968    assert(base.size == AGX_SIZE_16);
969 
970    enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
971    unsigned mask = BITFIELD_MASK(
972       nir_src_num_components(instr->src[0])); /* XXX: there's a write mask */
973 
974    agx_local_store(b, value, base, index, format, mask);
975 }
976 
977 static void
978 agx_emit_load_scratch(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
979 {
980    agx_index offset = agx_src_index(&instr->src[0]);
981    enum agx_format format = format_for_bitsize(instr->def.bit_size);
982    unsigned nr = instr->def.num_components;
983    unsigned mask = BITFIELD_MASK(nr);
984 
985    agx_stack_load_to(b, dst, offset, format, mask);
986    agx_emit_cached_split(b, dst, nr);
987    b->shader->any_scratch = true;
988 }
989 
990 static void
991 agx_emit_store_scratch(agx_builder *b, nir_intrinsic_instr *instr)
992 {
993    agx_index value = agx_recollect_vector(b, instr->src[0]);
994    agx_index offset = agx_src_index(&instr->src[1]);
995    enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
996    unsigned mask = BITFIELD_MASK(nir_src_num_components(instr->src[0]));
997 
998    agx_stack_store(b, value, offset, format, mask);
999    b->shader->any_scratch = true;
1000 }
1001 
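/* Texture/image loads write only the components in their mask, packed
 * contiguously. Expand the packed result back to the channel positions NIR
 * expects, filling unread channels with undef, and return the mask used.
 */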
1002 static unsigned
1003 agx_expand_tex_to(agx_builder *b, nir_def *def, agx_index src, bool masked)
1004 {
1005    unsigned nr_channels = def->num_components;
1006    nir_component_mask_t mask = nir_def_components_read(def);
1007 
1008    if (!masked)
1009       mask = (nir_component_mask_t)BITFIELD_MASK(nr_channels);
1010 
1011    agx_index packed_channels[4] = {agx_null()};
1012    agx_index unpacked_channels[4] = {agx_null()};
1013 
1014    /* Hardware writes the masked components contiguously, expand out for NIR */
1015    agx_emit_split(b, packed_channels, src, 4 /* XXX: why not nr_channels */);
1016 
1017    for (unsigned i = 0; i < nr_channels; ++i) {
1018       unpacked_channels[i] =
1019          (mask & BITFIELD_BIT(i))
1020             ? packed_channels[util_bitcount(mask & BITFIELD_MASK(i))]
1021             : agx_undef(src.size);
1022    }
1023 
1024    agx_emit_collect_to(b, agx_def_index(def), nr_channels, unpacked_channels);
1025    return mask;
1026 }
1027 
1028 static agx_instr *
1029 agx_emit_image_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *intr)
1030 {
1031    agx_index ms_index = agx_src_index(&intr->src[2]);
1032    agx_index lod = agx_src_index(&intr->src[3]);
1033    enum agx_lod_mode lod_mode = AGX_LOD_MODE_LOD_MIN;
1034 
1035    agx_index bindless = agx_immediate(0), texture;
1036    if (intr->intrinsic == nir_intrinsic_bindless_image_load)
1037       texture = agx_translate_bindless_handle(b, &intr->src[0], &bindless);
1038    else if (nir_src_is_const(intr->src[0]) &&
1039             nir_src_as_uint(intr->src[0]) < 0x100)
1040       texture = agx_immediate(nir_src_as_uint(intr->src[0]));
1041    else
1042       texture = agx_src_index(&intr->src[0]);
1043 
1044    assert(nir_src_num_components(intr->src[1]) == 4);
1045    agx_index coord[4] = {
1046       agx_extract_nir_src(b, intr->src[1], 0),
1047       agx_extract_nir_src(b, intr->src[1], 1),
1048       agx_extract_nir_src(b, intr->src[1], 2),
1049       agx_extract_nir_src(b, intr->src[1], 3),
1050    };
1051 
1052    /* Get the image dimension. Cubes are lowered to 2D, since they are logically
1053     * equivalent for imageLoad, but out-of-bounds behaviour for cubes on G13
1054     * is wrong according to Piglit's arb_shader_image_load_store-invalid.
1055     *
1056     * This requires a matching transform in the driver.
1057     */
1058    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1059    bool is_array = nir_intrinsic_image_array(intr);
1060 
1061    if (dim == GLSL_SAMPLER_DIM_CUBE) {
1062       dim = GLSL_SAMPLER_DIM_2D;
1063       is_array = true;
1064    }
1065 
1066    bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
1067    unsigned coord_comps = glsl_get_sampler_dim_coordinate_components(dim);
1068    if (is_array && is_ms) {
1069       agx_index layer = agx_temp(b->shader, AGX_SIZE_16);
1070       agx_subdivide_to(b, layer, coord[coord_comps], 0);
1071 
1072       assert(ms_index.size == AGX_SIZE_16);
1073       agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
1074       agx_emit_collect_to(b, tmp, 2, (agx_index[]){ms_index, layer});
1075       coord[coord_comps++] = tmp;
1076    } else if (is_ms) {
1077       agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
1078       agx_mov_to(b, tmp, ms_index);
1079       coord[coord_comps++] = tmp;
1080    } else if (is_array) {
1081       coord_comps++;
1082    }
1083 
1084    /* Multisampled images do not support mipmapping */
1085    if (is_ms) {
1086       lod_mode = AGX_LOD_MODE_AUTO_LOD;
1087       lod = agx_zero();
1088    }
1089 
1090    agx_index coords = agx_emit_collect(b, coord_comps, coord);
1091    agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
1092 
1093    agx_instr *I = agx_image_load_to(
1094       b, tmp, coords, lod, bindless, texture, agx_immediate(0), agx_null(),
1095       agx_tex_dim(dim, is_array), lod_mode, 0, false);
1096    I->mask = agx_expand_tex_to(b, &intr->def, tmp, true);
1097 
1098    b->shader->out->uses_txf = true;
1099    return NULL;
1100 }
1101 
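/* Exports pin each channel of a value to a fixed register (base + channel *
 * stride) at the end of the exit block; agx_load_exported_to reads the values
 * back from the same registers via preloads.
 */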
1102 static agx_instr *
1103 agx_emit_export(agx_builder *b, unsigned base, nir_src src)
1104 {
1105    agx_builder b_ = *b;
1106    agx_cursor after_cursor = agx_after_block(agx_exit_block(b->shader));
1107    b_.cursor = after_cursor;
1108 
1109    for (unsigned c = 0; c < nir_src_num_components(src); ++c) {
1110       agx_index chan = agx_extract_nir_src(b, src, c);
1111       unsigned stride = agx_size_align_16(chan.size);
1112 
1113       agx_export(&b_, chan, base + (c * stride));
1114    }
1115 
1116    if (agx_cursors_equal(b->cursor, after_cursor)) {
1117       b->cursor = agx_after_block_logical(b->cursor.block);
1118    }
1119 
1120    return NULL;
1121 }
1122 
1123 static agx_instr *
1124 agx_load_exported_to(agx_builder *b, agx_index dst, unsigned base, unsigned nr)
1125 {
1126    agx_index chans[4] = {0};
1127    unsigned stride = agx_size_align_16(dst.size);
1128 
1129    for (unsigned c = 0; c < nr; ++c) {
1130       chans[c] = agx_cached_preload(b->shader, base + c * stride, dst.size);
1131    }
1132 
1133    return agx_emit_collect_to(b, dst, nr, chans);
1134 }
1135 
1136 static agx_instr *
1137 agx_emit_image_store(agx_builder *b, nir_intrinsic_instr *instr)
1138 {
1139    /* See remarks in agx_emit_image_load */
1140    enum glsl_sampler_dim glsl_dim = nir_intrinsic_image_dim(instr);
1141    bool is_array = nir_intrinsic_image_array(instr);
1142 
1143    if (glsl_dim == GLSL_SAMPLER_DIM_CUBE) {
1144       glsl_dim = GLSL_SAMPLER_DIM_2D;
1145       is_array = true;
1146    }
1147 
1148    enum agx_dim dim = agx_tex_dim(glsl_dim, is_array);
1149    assert(glsl_dim != GLSL_SAMPLER_DIM_MS && "needs to be lowered");
1150 
1151    agx_index base, index;
1152    if (instr->intrinsic == nir_intrinsic_bindless_image_store) {
1153       index = agx_translate_bindless_handle(b, &instr->src[0], &base);
1154 
1155       assert(base.size == AGX_SIZE_64);
1156       assert(index.size == AGX_SIZE_32);
1157    } else {
1158       base = agx_zero();
1159       index = agx_src_index(&instr->src[0]);
1160 
1161       assert(index.size == AGX_SIZE_16);
1162    }
1163 
1164    agx_index coords4 = agx_src_index(&instr->src[1]);
1165    agx_index lod = agx_src_index(&instr->src[4]);
1166    assert(lod.size == AGX_SIZE_16);
1167 
1168    int coord_components = glsl_get_sampler_dim_coordinate_components(glsl_dim);
1169    if (is_array)
1170       coord_components++;
1171 
1172    agx_index coord_comps[4] = {};
1173    for (unsigned i = 0; i < coord_components; ++i)
1174       coord_comps[i] = agx_emit_extract(b, coords4, i);
1175 
1176    agx_index coords = agx_emit_collect(b, coord_components, coord_comps);
1177    agx_index data = agx_src_index(&instr->src[3]);
1178 
1179    /* If the image format has less than 4 components, nir_opt_shrink_stores can
1180     * shrink the store. But the IR still expects 4 components: pad with undef.
1181     */
1182    if (nir_src_num_components(instr->src[3]) < 4) {
1183       agx_index chan[4] = {agx_null()};
1184 
1185       for (unsigned i = 0; i < 4; ++i) {
1186          if (i < nir_src_num_components(instr->src[3]))
1187             chan[i] = agx_extract_nir_src(b, instr->src[3], i);
1188          else
1189             chan[i] = agx_undef(data.size);
1190       }
1191 
1192       data = agx_emit_collect(b, 4, chan);
1193    }
1194 
1195    /* Image stores act like tilebuffer stores when used for tib spilling */
1196    b->shader->out->tag_write_disable = false;
1197 
1198    return agx_image_write(b, data, coords, lod, base, index, dim);
1199 }
1200 
1201 static enum agx_simd_op
1202 translate_simd_op(nir_op op)
1203 {
1204 #define CASE(agx_, nir_)                                                       \
1205    case nir_op_##nir_:                                                         \
1206       return AGX_SIMD_OP_##agx_;
1207 
1208    switch (op) {
1209       CASE(AND, iand)
1210       CASE(FADD, fadd)
1211       CASE(OR, ior)
1212       CASE(FMUL, fmul)
1213       CASE(XOR, ixor)
1214       CASE(FMIN, fmin)
1215       CASE(FMAX, fmax)
1216       CASE(IADD, iadd)
1217       CASE(SMIN, imin)
1218       CASE(SMAX, imax)
1219       CASE(UMIN, umin)
1220       CASE(UMAX, umax)
1221    default:
1222       unreachable("unknown simd op");
1223    }
1224 #undef CASE
1225 }
1226 
1227 static agx_instr *
1228 agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
1229 {
1230    agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1231                       ? agx_def_index(&instr->def)
1232                       : agx_null();
1233    gl_shader_stage stage = b->shader->stage;
1234 
1235    switch (instr->intrinsic) {
1236    case nir_intrinsic_load_barycentric_pixel:
1237    case nir_intrinsic_load_barycentric_centroid:
1238    case nir_intrinsic_load_barycentric_at_sample:
1239    case nir_intrinsic_load_barycentric_at_offset:
1240       /* handled later via load_vary */
1241       return NULL;
1242    case nir_intrinsic_load_interpolated_input:
1243       assert(stage == MESA_SHADER_FRAGMENT);
1244       agx_emit_load_vary(b, dst, instr);
1245       return NULL;
1246 
1247    case nir_intrinsic_load_coefficients_agx:
1248       assert(stage == MESA_SHADER_FRAGMENT);
1249       agx_ldcf_to(b, dst, cf_for_intrinsic(b, instr), 1);
1250       agx_emit_cached_split(b, dst, 3);
1251       return NULL;
1252 
1253    case nir_intrinsic_load_agx:
1254    case nir_intrinsic_load_constant_agx:
1255       agx_emit_load(b, dst, instr);
1256       return NULL;
1257 
1258    case nir_intrinsic_store_uvs_agx:
1259       assert(stage == MESA_SHADER_VERTEX);
1260       return agx_st_vary(b, agx_src_index(&instr->src[1]),
1261                          agx_src_index(&instr->src[0]));
1262 
1263    case nir_intrinsic_store_agx:
1264       agx_emit_store(b, instr);
1265       return NULL;
1266 
1267    case nir_intrinsic_store_shared:
1268       agx_emit_local_store(b, instr);
1269       return NULL;
1270 
1271    case nir_intrinsic_load_shared:
1272       agx_emit_local_load(b, dst, instr);
1273       return NULL;
1274 
1275    case nir_intrinsic_global_atomic_agx:
1276    case nir_intrinsic_global_atomic_swap_agx:
1277       agx_emit_atomic(b, dst, instr, false);
1278       return NULL;
1279 
1280    case nir_intrinsic_shared_atomic:
1281    case nir_intrinsic_shared_atomic_swap:
1282       agx_emit_atomic(b, dst, instr, true);
1283       return NULL;
1284 
1285    case nir_intrinsic_store_zs_agx:
1286       assert(stage == MESA_SHADER_FRAGMENT);
1287       return agx_emit_store_zs(b, instr);
1288 
1289    case nir_intrinsic_store_local_pixel_agx:
1290       return agx_emit_local_store_pixel(b, instr);
1291 
1292    case nir_intrinsic_load_local_pixel_agx:
1293       assert(stage == MESA_SHADER_FRAGMENT);
1294       agx_emit_local_load_pixel(b, dst, instr);
1295       return NULL;
1296 
1297    case nir_intrinsic_load_pixel_coord:
1298       return agx_emit_collect_to(
1299          b, dst, 2,
1300          (agx_index[2]){
1301             agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_X),
1302             agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_Y),
1303          });
1304 
1305    case nir_intrinsic_load_frag_coord_zw: {
1306       agx_index cf = agx_get_cf(b->shader, VARYING_SLOT_POS,
1307                                 nir_intrinsic_component(instr));
1308 
1309       return agx_iter_to(b, dst, cf, agx_zero(), 1, AGX_INTERPOLATION_CENTER);
1310    }
1311 
1312    case nir_intrinsic_sample_mask_agx: {
1313       assert(stage == MESA_SHADER_FRAGMENT);
1314       b->shader->out->writes_sample_mask = true;
1315 
1316       /* We need to wait_pix before running Z/S tests, but we don't need to
1317        * wait_pix before merely discarding. Omit the wait_pix when the affected
1318        * samples are unconditionally killed.
1319        */
1320       bool no_tests =
1321          nir_src_is_const(instr->src[1]) && nir_src_as_uint(instr->src[1]) == 0;
1322 
1323       if (!no_tests)
1324          agx_wait_pix(b, 0x0001);
1325 
1326       return agx_sample_mask(b, agx_src_index(&instr->src[0]),
1327                              agx_src_index(&instr->src[1]));
1328    }
1329 
1330    case nir_intrinsic_load_back_face_agx:
1331       return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
1332 
1333    case nir_intrinsic_load_samples_log2_agx:
1334       return agx_get_sr_to(b, dst, AGX_SR_SAMPLES_LOG2);
1335 
1336    case nir_intrinsic_load_sample_mask_in:
1337       return agx_get_sr_to(b, dst, AGX_SR_INPUT_SAMPLE_MASK);
1338 
1339    case nir_intrinsic_load_sample_mask:
1340       return agx_get_sr_coverage_to(b, dst, AGX_SR_COVERAGE_MASK);
1341 
1342    case nir_intrinsic_load_helper_invocation:
1343       /* Compare special register to zero. We could lower this in NIR (letting
1344        * us fold in an inot) but meh?
1345        */
1346       return agx_icmp_to(b, dst,
1347                          agx_get_sr_coverage(b, 32, AGX_SR_IS_ACTIVE_THREAD),
1348                          agx_zero(), AGX_ICOND_UEQ, false);
1349 
1350    case nir_intrinsic_load_vertex_id:
1351       /* We don't assert the HW stage since we use this same ABI with SW VS */
1352       return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
1353 
1354    case nir_intrinsic_load_instance_id:
1355       return agx_mov_to(b, dst, agx_abs(agx_instance_id(b)));
1356 
1357    case nir_intrinsic_load_preamble:
1358       return agx_emit_load_preamble(b, dst, instr);
1359 
1360    case nir_intrinsic_store_preamble:
1361       return agx_emit_store_preamble(b, instr);
1362 
1363    case nir_intrinsic_image_load:
1364    case nir_intrinsic_bindless_image_load:
1365       return agx_emit_image_load(b, dst, instr);
1366 
1367    case nir_intrinsic_image_store:
1368    case nir_intrinsic_bindless_image_store:
1369       return agx_emit_image_store(b, instr);
1370 
1371    case nir_intrinsic_image_store_block_agx:
1372    case nir_intrinsic_bindless_image_store_block_agx:
1373       return agx_emit_block_image_store(b, instr);
1374 
1375    case nir_intrinsic_load_workgroup_id:
1376       return agx_load_compute_dimension(b, dst, instr,
1377                                         AGX_SR_THREADGROUP_POSITION_IN_GRID_X);
1378 
1379    case nir_intrinsic_load_workgroup_size:
1380       return agx_load_compute_dimension(b, dst, instr,
1381                                         AGX_SR_THREADS_PER_THREADGROUP_X);
1382 
1383    case nir_intrinsic_load_global_invocation_id:
1384       return agx_load_compute_dimension(b, dst, instr,
1385                                         AGX_SR_THREAD_POSITION_IN_GRID_X);
1386 
1387    case nir_intrinsic_load_local_invocation_id:
1388       return agx_load_compute_dimension(
1389          b, dst, instr, AGX_SR_THREAD_POSITION_IN_THREADGROUP_X);
1390 
1391    case nir_intrinsic_load_local_invocation_index:
1392       return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_THREADGROUP);
1393 
1394    case nir_intrinsic_load_layer_id:
1395       return agx_get_sr_to(b, dst, AGX_SR_THREADGROUP_POSITION_IN_GRID_Z);
1396 
1397    case nir_intrinsic_barrier: {
1398       assert(!b->shader->is_preamble && "invalid");
1399 
1400       bool needs_image_barriers = false;
1401 
1402       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) {
1403          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
1404 
1405          if (modes & (nir_var_mem_global | nir_var_image)) {
1406             agx_memory_barrier(b);
1407 
1408             /* Pull out all the big hammers to make cross-workgroup memory
1409              * barriers work. Found experimentally, seems to work on G13G at
1410              * least.
1411              *
1412              * TODO: check on other models, we may need more barriers for G13D.
1413              */
1414             if (nir_intrinsic_memory_scope(instr) >= SCOPE_QUEUE_FAMILY) {
1415                agx_memory_barrier_2(b);
1416                agx_unknown_barrier_1(b);
1417             }
1418          }
1419 
1420          if (modes & nir_var_image) {
1421             agx_image_barrier_1(b);
1422             agx_image_barrier_2(b);
1423             needs_image_barriers = true;
1424          }
1425       }
1426 
1427       /* Nothing to do for subgroup barriers */
1428       if (nir_intrinsic_execution_scope(instr) >= SCOPE_WORKGROUP) {
1429          assert(gl_shader_stage_is_compute(b->shader->nir->info.stage));
1430 
1431          agx_threadgroup_barrier(b);
1432       }
1433 
1434       if (needs_image_barriers) {
1435          agx_image_barrier_3(b);
1436          agx_image_barrier_4(b);
1437       }
1438 
1439       return NULL;
1440    }
1441 
1442    case nir_intrinsic_fence_pbe_to_tex_agx: {
1443       agx_image_barrier_1(b);
1444       agx_image_barrier_2(b);
1445       agx_image_barrier_3(b);
1446       agx_image_barrier_4(b);
1447       return NULL;
1448    }
1449 
1450    case nir_intrinsic_fence_mem_to_tex_agx: {
1451       /* Flush out the atomic to main memory... Found experimentally... */
1452       agx_memory_barrier(b);
1453       agx_memory_barrier_2(b);
1454 
1455       /* TODO: Which ones do we actually need? */
1456       agx_image_barrier_1(b);
1457       agx_image_barrier_2(b);
1458       agx_image_barrier_3(b);
1459       agx_image_barrier_4(b);
1460 
1461       /* Flush out the texture cache */
1462       agx_flush_memory_to_texture(b);
1463       return NULL;
1464    }
1465 
1466    case nir_intrinsic_fence_pbe_to_tex_pixel_agx: {
1467       agx_image_barrier_1(b);
1468       agx_image_barrier_2(b);
1469       agx_flush_memory_to_texture(b);
1470       agx_image_barrier_3(b);
1471       return NULL;
1472    }
1473 
1474    case nir_intrinsic_fence_helper_exit_agx: {
1475       assert(b->shader->key->is_helper);
1476       agx_memory_barrier(b);
1477       agx_unknown_barrier_1(b);
1478       agx_memory_barrier_2(b);
1479       agx_unknown_barrier_2(b);
1480       agx_memory_barrier_3(b);
1481       return NULL;
1482    }
1483 
1484    case nir_intrinsic_begin_invocation_interlock: {
1485       if (!b->shader->did_writeout &&
1486           !b->shader->key->fs.ignore_tib_dependencies)
1487          agx_wait_pix(b, 0x000C);
1488 
1489       b->shader->did_writeout = true;
1490       return NULL;
1491    }
1492 
1493    case nir_intrinsic_ddx:
1494    case nir_intrinsic_ddx_coarse:
1495    case nir_intrinsic_ddx_fine:
1496       return agx_dfdx_to(b, dst, agx_src_index(&instr->src[0]));
1497 
1498    case nir_intrinsic_ddy:
1499    case nir_intrinsic_ddy_coarse:
1500    case nir_intrinsic_ddy_fine:
1501       return agx_dfdy_to(b, dst, agx_src_index(&instr->src[0]));
1502 
1503    case nir_intrinsic_load_subgroup_invocation:
1504       return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_SUBGROUP);
1505 
1506    case nir_intrinsic_load_subgroup_id:
1507       return agx_get_sr_to(b, dst, AGX_SR_SUBGROUP_INDEX_IN_THREADGROUP);
1508 
1509    case nir_intrinsic_load_active_subgroup_invocation_agx:
1510       return agx_get_sr_coverage_to(b, dst,
1511                                     AGX_SR_ACTIVE_THREAD_INDEX_IN_SUBGROUP);
1512 
1513    case nir_intrinsic_load_active_subgroup_count_agx:
1514       return agx_get_sr_coverage_to(b, dst,
1515                                     AGX_SR_TOTAL_ACTIVE_THREADS_IN_SUBGROUP);
1516 
1517    case nir_intrinsic_reduce: {
1518       assert((instr->def.bit_size == 1 || instr->def.bit_size == 16 ||
1519               instr->def.bit_size == 32) &&
1520              "should've been lowered");
1521 
1522       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
1523       assert(cluster_size == 0 || cluster_size == 4 || cluster_size >= 32);
1524 
1525       enum agx_simd_op op =
1526          translate_simd_op(nir_intrinsic_reduction_op(instr));
1527 
1528       agx_index src0 = agx_src_index(&instr->src[0]);
1529 
1530       if (cluster_size == 4)
1531          return agx_quad_reduce_to(b, dst, src0, op);
1532       else
1533          return agx_simd_reduce_to(b, dst, src0, op);
1534    }
1535 
1536    case nir_intrinsic_exclusive_scan: {
1537       assert((instr->def.bit_size == 1 || instr->def.bit_size == 16 ||
1538               instr->def.bit_size == 32) &&
1539              "should've been lowered");
1540 
1541       return agx_simd_prefix_to(
1542          b, dst, agx_src_index(&instr->src[0]),
1543          translate_simd_op(nir_intrinsic_reduction_op(instr)));
1544    }
1545 
1546    case nir_intrinsic_read_invocation: {
1547       /* TODO: Check if we're actually inside divergent control flow */
1548       b->shader->any_quad_divergent_shuffle |= b->shader->any_cf;
1549 
1550       /* Lane ID guaranteed to be uniform */
1551       return agx_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1552                             agx_src_index(&instr->src[1]));
1553    }
1554 
1555    case nir_intrinsic_quad_broadcast: {
1556       /* TODO: Check if we're actually inside divergent control flow */
1557       b->shader->any_quad_divergent_shuffle |= b->shader->any_cf;
1558 
1559       /* Lane ID guaranteed to be uniform */
1560       return agx_quad_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1561                                  agx_src_index(&instr->src[1]));
1562    }
1563 
1564    case nir_intrinsic_quad_swap_horizontal: {
1565       return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1566                                      agx_immediate(1));
1567    }
1568 
1569    case nir_intrinsic_quad_swap_vertical: {
1570       return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1571                                      agx_immediate(2));
1572    }
1573 
1574    case nir_intrinsic_quad_swap_diagonal: {
1575       return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1576                                      agx_immediate(3));
1577    }
1578 
1579    case nir_intrinsic_ballot: {
1580       return agx_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1581    }
1582 
1583    case nir_intrinsic_quad_ballot_agx: {
1584       return agx_quad_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1585    }
1586 
1587    case nir_intrinsic_doorbell_agx: {
1588       return agx_doorbell(b, nir_src_as_uint(instr->src[0]));
1589    }
1590 
1591    case nir_intrinsic_stack_map_agx: {
1592       return agx_stack_map(b, agx_src_index(&instr->src[1]),
1593                            nir_src_as_uint(instr->src[0]));
1594    }
1595 
1596    case nir_intrinsic_stack_unmap_agx: {
1597       return agx_stack_unmap_to(b, dst, nir_src_as_uint(instr->src[0]));
1598    }
1599 
1600    case nir_intrinsic_load_scratch:
1601       agx_emit_load_scratch(b, dst, instr);
1602       return NULL;
1603 
1604    case nir_intrinsic_store_scratch:
1605       agx_emit_store_scratch(b, instr);
1606       return NULL;
1607 
1608    case nir_intrinsic_load_core_id_agx:
1609       return agx_get_sr_to(b, dst, AGX_SR_CORE_ID);
1610 
1611    case nir_intrinsic_load_helper_op_id_agx:
1612       assert(b->shader->key->is_helper);
1613       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_OP);
1614 
1615    case nir_intrinsic_load_helper_arg_lo_agx:
1616       assert(b->shader->key->is_helper);
1617       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_L);
1618 
1619    case nir_intrinsic_load_helper_arg_hi_agx:
1620       assert(b->shader->key->is_helper);
1621       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_H);
1622 
1623    case nir_intrinsic_load_exported_agx:
1624       return agx_load_exported_to(b, dst, nir_intrinsic_base(instr),
1625                                   instr->def.num_components);
1626 
1627    case nir_intrinsic_export_agx:
1628       return agx_emit_export(b, nir_intrinsic_base(instr), instr->src[0]);
1629 
1630    case nir_intrinsic_load_barycentric_sample:
1631    case nir_intrinsic_load_sample_id:
1632    case nir_intrinsic_load_sample_pos:
1633       unreachable("Sample shading should have been lowered");
1634 
1635    default:
1636       fprintf(stderr, "Unhandled intrinsic %s\n",
1637               nir_intrinsic_infos[instr->intrinsic].name);
1638       unreachable("Unhandled intrinsic");
1639    }
1640 }
1641 
1642 static agx_index
1643 agx_alu_src_index(agx_builder *b, nir_alu_src src)
1644 {
1645    /* Check well-formedness of the input NIR */
1646    ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
1647    unsigned comps = nir_src_num_components(src.src);
1648    unsigned channel = src.swizzle[0];
1649 
1650    assert(bitsize == 1 || bitsize == 8 || bitsize == 16 || bitsize == 32 ||
1651           bitsize == 64);
1652    assert(channel < comps);
1653 
1654    return agx_extract_nir_src(b, src.src, channel);
1655 }
1656 
1657 /*
1658  * Emit an instruction translating (s0 * s1) + (s2 << s3). Assuming s3 is
1659  * constant, this is an imad instruction. If s1 == 1, then this is optimized to
1660  * an iadd instruction, which is faster.
1661  */
1662 static agx_instr *
1663 agx_emit_imadshl_agx(agx_builder *b, nir_alu_instr *alu, agx_index dst,
1664                      agx_index s0, agx_index s1, agx_index s2, agx_index s3)
1665 {
1666    /* If the shift is not constant, use a variable shift. This should never
1667     * happen in practice but we don't want to constrain the NIR.
1668     */
1669    unsigned shift;
1670    if (!nir_src_is_const(alu->src[3].src)) {
1671       s2 = agx_bfi(b, agx_immediate(0), s2, s3, 0);
1672       shift = 0;
1673    } else {
1674       shift = nir_alu_src_as_uint(alu->src[3]);
1675    }
1676 
1677    assert(shift <= 4 && "domain restriction on the input NIR");
1678 
1679    /* Emit iadd if possible, else imad */
1680    if (nir_src_is_const(alu->src[1].src) &&
1681        nir_alu_src_as_uint(alu->src[1]) == 1) {
1682 
1683       return agx_iadd_to(b, dst, s0, s2, shift);
1684    } else {
1685       return agx_imad_to(b, dst, s0, s1, s2, shift);
1686    }
1687 }
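/*
 * Rough illustration of the translation above (hypothetical operands, not
 * taken from the source): for d = imadshl_agx(s0, s1, s2, 3) with a constant
 * shift of 3, the helper emits
 *
 *    agx_imad_to(b, d, s0, s1, s2, 3);   // d = (s0 * s1) + (s2 << 3)
 *
 * and when s1 is the constant 1 it emits the cheaper
 *
 *    agx_iadd_to(b, d, s0, s2, 3);       // d = s0 + (s2 << 3)
 */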
1688 
1689 static bool
1690 is_conversion_to_8bit(nir_op op)
1691 {
1692    switch (op) {
1693    case nir_op_i2i8:
1694    case nir_op_u2u8:
1695    case nir_op_f2i8:
1696    case nir_op_f2u8:
1697    case nir_op_b2i8:
1698       return true;
1699    default:
1700       return false;
1701    }
1702 }
1703 
1704 static agx_instr *
1705 agx_fminmax_to(agx_builder *b, agx_index dst, agx_index s0, agx_index s1,
1706                nir_alu_instr *alu)
1707 {
1708    /* The hardware gtn/ltn modes are unfortunately incorrect for signed zeros */
1709    assert(!nir_alu_instr_is_signed_zero_preserve(alu) &&
1710           "should've been lowered");
1711 
1712    bool fmax = alu->op == nir_op_fmax;
1713    enum agx_fcond fcond = fmax ? AGX_FCOND_GTN : AGX_FCOND_LTN;
1714 
1715    /* Calculate min/max with the appropriate hardware instruction */
1716    agx_index tmp = agx_fcmpsel(b, s0, s1, s0, s1, fcond);
1717 
1718    /* G13 flushes fp32 denorms and preserves fp16 denorms. Since cmpsel
1719     * preserves denorms, we need to canonicalize for fp32. Canonicalizing fp16
1720     * would be harmless but wastes an instruction.
1721     */
1722    if (alu->def.bit_size == 32)
1723       return agx_fadd_to(b, dst, tmp, agx_negzero());
1724    else
1725       return agx_mov_to(b, dst, tmp);
1726 }
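/*
 * Sketch of the lowering above (illustrative): fmax(x, y) becomes
 * fcmpsel(x, y, x, y, gtn), i.e. "x > y ? x : y", and a 32-bit result gets a
 * trailing fadd with -0.0 so that any denorm passed through by cmpsel is
 * flushed to a canonical value.
 */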
1727 
1728 static agx_instr *
1729 agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
1730 {
1731    unsigned srcs = nir_op_infos[instr->op].num_inputs;
1732    unsigned sz = instr->def.bit_size;
1733    unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
1734    ASSERTED unsigned comps = instr->def.num_components;
1735 
1736    assert(comps == 1 || nir_op_is_vec_or_mov(instr->op));
1737    assert(sz == 1 ||
1738           ((nir_op_is_vec_or_mov(instr->op) ||
1739             is_conversion_to_8bit(instr->op) || instr->op == nir_op_bcsel) &&
1740            sz == 8) ||
1741           sz == 16 || sz == 32 || sz == 64);
1742 
1743    agx_index dst = agx_def_index(&instr->def);
1744    agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
1745    agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
1746    agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
1747    agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
1748 
1749    agx_index i0 = agx_immediate(0);
1750    agx_index i1 = agx_immediate(1);
1751 
1752 #define UNOP(nop, aop)                                                         \
1753    case nir_op_##nop:                                                          \
1754       return agx_##aop##_to(b, dst, s0);
1755 #define BINOP(nop, aop)                                                        \
1756    case nir_op_##nop:                                                          \
1757       return agx_##aop##_to(b, dst, s0, s1);
1758 #define TRIOP(nop, aop)                                                        \
1759    case nir_op_##nop:                                                          \
1760       return agx_##aop##_to(b, dst, s0, s1, s2);
1761 
1762    switch (instr->op) {
1763       BINOP(fadd, fadd);
1764       BINOP(fmul, fmul);
1765       TRIOP(ffma, fma);
1766 
1767       UNOP(f2f16, fmov);
1768       UNOP(f2f16_rtne, fmov);
1769       UNOP(f2f32, fmov);
1770       UNOP(fround_even, roundeven);
1771       UNOP(ftrunc, trunc);
1772       UNOP(ffloor, floor);
1773       UNOP(fceil, ceil);
1774       UNOP(frcp, rcp);
1775       UNOP(frsq, rsqrt);
1776       UNOP(flog2, log2);
1777       UNOP(fexp2, exp2);
1778 
1779       UNOP(mov, mov);
1780       UNOP(u2u32, mov);
1781       UNOP(bitfield_reverse, bitrev);
1782       UNOP(bit_count, popcount);
1783       UNOP(ufind_msb, ffs);
1784       BINOP(iand, and);
1785       BINOP(ior, or);
1786       BINOP(ixor, xor);
1787       BINOP(interleave_agx, intl);
1788 
1789    case nir_op_feq:
1790       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, false);
1791    case nir_op_flt:
1792       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_LT, false);
1793    case nir_op_fge:
1794       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_GE, false);
1795    case nir_op_fneu:
1796       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, true);
1797 
1798    case nir_op_ieq:
1799       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, false);
1800    case nir_op_ine:
1801       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, true);
1802    case nir_op_ilt:
1803       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, false);
1804    case nir_op_ige:
1805       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, true);
1806    case nir_op_ult:
1807       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, false);
1808    case nir_op_uge:
1809       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, true);
1810 
1811    case nir_op_inot:
1812       if (sz == 1)
1813          return agx_xor_to(b, dst, s0, i1);
1814       else
1815          return agx_not_to(b, dst, s0);
1816 
1817    case nir_op_b2b1:
1818       return agx_icmp_to(b, dst, s0, i0, AGX_ICOND_UEQ, true);
1819 
1820    case nir_op_fsqrt:
1821       return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
1822    case nir_op_fabs:
1823       return agx_fmov_to(b, dst, agx_abs(s0));
1824    case nir_op_fneg:
1825       return agx_fmov_to(b, dst, agx_neg(s0));
1826 
1827    case nir_op_fmin:
1828    case nir_op_fmax:
1829       return agx_fminmax_to(b, dst, s0, s1, instr);
1830 
1831    case nir_op_imin:
1832       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
1833    case nir_op_imax:
1834       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
1835    case nir_op_umin:
1836       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
1837    case nir_op_umax:
1838       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
1839 
1840    case nir_op_iadd:
1841       return agx_iadd_to(b, dst, s0, s1, 0);
1842    case nir_op_imadshl_agx:
1843       return agx_emit_imadshl_agx(b, instr, dst, s0, s1, s2, s3);
1844    case nir_op_imsubshl_agx:
1845       return agx_emit_imadshl_agx(b, instr, dst, s0, s1, agx_neg(s2), s3);
1846    case nir_op_isub:
1847       return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
1848    case nir_op_ineg:
1849       return agx_iadd_to(b, dst, i0, agx_neg(s0), 0);
1850    case nir_op_imul:
1851       return agx_imad_to(b, dst, s0, s1, i0, 0);
1852    case nir_op_umul_2x32_64:
1853       return agx_imad_to(b, dst, agx_abs(s0), agx_abs(s1), i0, 0);
1854    case nir_op_imul_2x32_64:
1855       return agx_imad_to(b, dst, s0, s1, i0, 0);
1856    case nir_op_umul_high:
1857       return agx_mul_high_to(b, dst, s0, s1, false);
1858    case nir_op_imul_high:
1859       return agx_mul_high_to(b, dst, s0, s1, true);
1860 
1861    case nir_op_ishl:
1862       return agx_bfi_to(b, dst, i0, s0, s1, 0);
1863    case nir_op_ushr:
1864       return agx_ushr_to(b, dst, s0, s1);
1865    case nir_op_ishr:
1866       return agx_asr_to(b, dst, s0, s1);
1867 
1868    case nir_op_extr_agx:
1869       return agx_extr_to(b, dst, s0, s1, s2,
1870                          nir_alu_src_as_uint(instr->src[3]));
1871 
1872    case nir_op_ubitfield_extract: {
1873       unsigned m = nir_alu_src_as_uint(instr->src[2]);
1874       assert(m != 0 && "should've been optimized");
1875 
1876       /* Disable masking if the whole thing is used */
1877       if (m >= 32)
1878          m = 0;
1879 
1880       return agx_bfeil_to(b, dst, i0, s0, s1, m);
1881    }
1882 
1883    case nir_op_bcsel:
1884       return agx_icmpsel_to(b, dst, s0, i0, s2, s1, AGX_ICOND_UEQ);
1885 
1886    case nir_op_i2i32: {
1887       if (src_sz == 8) {
1888          /* Sign extend in software, NIR likes 8-bit conversions */
1889          agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1890          return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1891       } else {
1892          assert(s0.size == AGX_SIZE_16 && "other conversions lowered");
1893          return agx_iadd_to(b, dst, s0, i0, 0);
1894       }
1895    }
1896 
1897    case nir_op_i2i16: {
1898       if (src_sz == 8) {
1899          /* Sign extend in software, NIR likes 8-bit conversions */
1900          agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1901          return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1902       } else {
1903          assert(s0.size == AGX_SIZE_32 && "other conversions lowered");
1904          return agx_subdivide_to(b, dst, s0, 0);
1905       }
1906    }
1907 
1908    case nir_op_u2u16: {
1909       if (s0.size == AGX_SIZE_32)
1910          return agx_subdivide_to(b, dst, s0, 0);
1911       else
1912          return agx_mov_to(b, dst, s0);
1913    }
1914 
1915    /* It will be put into a 16-bit register, but zero out the garbage. We could
1916     * optimize this in the future but it ensures correctness for u2u16(u2u8(x))
1917     * sequences.
1918     */
1919    case nir_op_u2u8:
1920    case nir_op_i2i8:
1921       return agx_and_to(b, dst, s0, agx_immediate(0xFF));
1922 
1923    case nir_op_iadd_sat: {
1924       agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
1925       I->saturate = true;
1926       return I;
1927    }
1928 
1929    case nir_op_isub_sat: {
1930       agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
1931       I->saturate = true;
1932       return I;
1933    }
1934 
1935    case nir_op_uadd_sat: {
1936       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
1937       I->saturate = true;
1938       return I;
1939    }
1940 
1941    case nir_op_usub_sat: {
1942       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
1943       I->saturate = true;
1944       return I;
1945    }
1946 
1947    case nir_op_fsat: {
1948       agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
1949       I->saturate = true;
1950       return I;
1951    }
1952 
1953    case nir_op_fsin_agx: {
1954       agx_index fixup = agx_sin_pt_1(b, s0);
1955       agx_index sinc = agx_sin_pt_2(b, fixup);
1956       return agx_fmul_to(b, dst, sinc, fixup);
1957    }
1958 
1959    case nir_op_f2i16:
1960       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S16), s0,
1961                             AGX_ROUND_RTZ);
1962 
1963    case nir_op_f2i32:
1964       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S32), s0,
1965                             AGX_ROUND_RTZ);
1966 
1967    case nir_op_f2u16:
1968       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U16), s0,
1969                             AGX_ROUND_RTZ);
1970 
1971    case nir_op_f2u32:
1972       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U32), s0,
1973                             AGX_ROUND_RTZ);
1974 
1975    case nir_op_u2f16:
1976    case nir_op_u2f32: {
1977       if (src_sz == 64)
1978          unreachable("64-bit conversions unimplemented");
1979 
1980       enum agx_convert mode = (src_sz == 32)   ? AGX_CONVERT_U32_TO_F
1981                               : (src_sz == 16) ? AGX_CONVERT_U16_TO_F
1982                                                : AGX_CONVERT_U8_TO_F;
1983 
1984       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
1985    }
1986 
1987    case nir_op_i2f16:
1988    case nir_op_i2f32: {
1989       if (src_sz == 64)
1990          unreachable("64-bit conversions unimplemented");
1991 
1992       enum agx_convert mode = (src_sz == 32)   ? AGX_CONVERT_S32_TO_F
1993                               : (src_sz == 16) ? AGX_CONVERT_S16_TO_F
1994                                                : AGX_CONVERT_S8_TO_F;
1995 
1996       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
1997    }
1998 
1999    case nir_op_pack_32_2x16_split:
2000    case nir_op_pack_64_2x32_split: {
2001       agx_index idx[] = {s0, s1};
2002       return agx_emit_collect_to(b, dst, 2, idx);
2003    }
2004 
2005    case nir_op_unpack_64_2x32_split_x:
2006    case nir_op_unpack_32_2x16_split_x:
2007       return agx_subdivide_to(b, dst, s0, 0);
2008 
2009    case nir_op_unpack_64_2x32_split_y:
2010    case nir_op_unpack_32_2x16_split_y:
2011       return agx_subdivide_to(b, dst, s0, 1);
2012 
2013    case nir_op_vec2:
2014    case nir_op_vec3:
2015    case nir_op_vec4: {
2016       agx_index idx[] = {s0, s1, s2, s3};
2017       return agx_emit_collect_to(b, dst, srcs, idx);
2018    }
2019 
2020    case nir_op_vec8:
2021    case nir_op_vec16:
2022       unreachable("should've been lowered");
2023 
2024    default:
2025       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
2026       unreachable("Unhandled ALU instruction");
2027    }
2028 }
2029 
2030 static enum agx_lod_mode
2031 agx_lod_mode_for_nir(nir_texop op, bool biased, bool min_lod, bool lod_is_zero)
2032 {
2033    switch (op) {
2034    case nir_texop_tex:
2035    case nir_texop_tg4:
2036       /* We could support this for tex, but it's never actually seen because tex
2037        * is always turned into txb to implement sampler LOD bias in Vulkan.
2038        */
2039       assert(!min_lod && "unimplemented");
2040 
2041       return AGX_LOD_MODE_AUTO_LOD;
2042    case nir_texop_txb:
2043       return min_lod ? AGX_LOD_MODE_AUTO_LOD_BIAS_MIN
2044                      : AGX_LOD_MODE_AUTO_LOD_BIAS;
2045    case nir_texop_lod:
2046       assert(!min_lod);
2047       return biased ? AGX_LOD_MODE_AUTO_LOD_BIAS : AGX_LOD_MODE_AUTO_LOD;
2048    case nir_texop_txd:
2049       return min_lod ? AGX_LOD_MODE_LOD_GRAD_MIN : AGX_LOD_MODE_LOD_GRAD;
2050    case nir_texop_txl:
2051       assert(!min_lod);
2052       return AGX_LOD_MODE_LOD_MIN;
2053    case nir_texop_txf:
2054       assert(!min_lod);
2055       return lod_is_zero ? AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;
2056    case nir_texop_txf_ms:
2057       assert(!min_lod);
2058       assert(lod_is_zero && "no mipmapping");
2059       return AGX_LOD_MODE_AUTO_LOD;
2060    default:
2061       unreachable("Unhandled texture op");
2062    }
2063 }
2064 
2065 static enum agx_gather
2066 agx_gather_for_nir(nir_tex_instr *tex)
2067 {
2068    if (tex->op == nir_texop_tg4) {
2069       enum agx_gather components[] = {
2070          AGX_GATHER_R,
2071          AGX_GATHER_G,
2072          AGX_GATHER_B,
2073          AGX_GATHER_A,
2074       };
2075 
2076       assert(tex->component < ARRAY_SIZE(components));
2077       return components[tex->component];
2078    } else {
2079       return AGX_GATHER_NONE;
2080    }
2081 }
2082 
2083 static void
2084 agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
2085 {
2086    agx_index coords = agx_null(), bindless = agx_immediate(0),
2087              texture = agx_immediate(instr->texture_index),
2088              sampler = agx_immediate(0), lod = agx_immediate(0),
2089              compare = agx_null(), packed_offset = agx_null(),
2090              min_lod = agx_null();
2091 
2092    bool lod_is_zero = true;
2093 
2094    for (unsigned i = 0; i < instr->num_srcs; ++i) {
2095       agx_index index = agx_src_index(&instr->src[i].src);
2096 
2097       switch (instr->src[i].src_type) {
2098       case nir_tex_src_backend1:
2099          coords = index;
2100          break;
2101 
2102       case nir_tex_src_backend2:
2103          packed_offset = index;
2104          break;
2105 
2106       case nir_tex_src_lod:
2107       case nir_tex_src_bias:
2108          lod = index;
2109          lod_is_zero = nir_src_is_const(instr->src[i].src) &&
2110                        nir_src_as_uint(instr->src[i].src) == 0;
2111          break;
2112 
2113       case nir_tex_src_min_lod:
2114          assert(index.size == AGX_SIZE_16);
2115          min_lod = index;
2116          break;
2117 
2118       case nir_tex_src_comparator:
2119          assert(index.size == AGX_SIZE_32);
2120          compare = index;
2121          break;
2122 
2123       case nir_tex_src_texture_offset:
2124          texture = index;
2125          break;
2126       case nir_tex_src_sampler_handle:
2127          sampler = index;
2128          break;
2129 
2130       case nir_tex_src_texture_handle:
2131          texture =
2132             agx_translate_bindless_handle(b, &instr->src[i].src, &bindless);
2133          break;
2134 
2135       case nir_tex_src_ddx: {
2136          int y_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2137          assert(y_idx >= 0 && "we only handle gradients");
2138 
2139          int min_idx = nir_tex_instr_src_index(instr, nir_tex_src_min_lod);
2140          bool has_min = min_idx >= 0;
2141          agx_index min;
2142 
2143          unsigned n = nir_tex_instr_src_size(instr, y_idx);
2144          assert((n == 2 || n == 3) && "other sizes not supported");
2145 
2146          agx_index index2 = agx_src_index(&instr->src[y_idx].src);
2147 
2148          if (has_min) {
2149             min = agx_src_index(&instr->src[min_idx].src);
2150 
2151             /* Undef extend to 32-bit since our IR is iffy */
2152             min = agx_vec2(b, min, agx_undef(AGX_SIZE_16));
2153             min.channels_m1--;
2154             min.size = AGX_SIZE_32;
2155          }
2156 
2157          /* We explicitly don't care about the split cache for this */
2158          unsigned chans = (2 * n) + (has_min ? 1 : 0);
2159          lod = agx_vec_temp(b->shader, AGX_SIZE_32, chans);
2160          agx_instr *I = agx_collect_to(b, lod, chans);
2161 
2162          for (unsigned i = 0; i < n; ++i) {
2163             I->src[(2 * i) + 0] = agx_emit_extract(b, index, i);
2164             I->src[(2 * i) + 1] = agx_emit_extract(b, index2, i);
2165          }
2166 
2167          if (has_min)
2168             I->src[2 * n] = min;
2169 
2170          break;
2171       }
2172 
2173       case nir_tex_src_ddy:
2174          /* handled above */
2175          break;
2176 
2177       default:
2178          unreachable("Unexpected texture source");
2179       }
2180    }
2181 
2182    enum agx_lod_mode lod_mode = agx_lod_mode_for_nir(
2183       instr->op, nir_tex_instr_src_index(instr, nir_tex_src_bias) >= 0,
2184       nir_tex_instr_src_index(instr, nir_tex_src_min_lod) >= 0, lod_is_zero);
2185 
2186    if (lod_mode == AGX_LOD_MODE_AUTO_LOD) {
2187       /* Ignored logically but asserted 0 */
2188       lod = agx_immediate(0);
2189    } else if (lod_mode == AGX_LOD_MODE_AUTO_LOD_BIAS_MIN) {
2190       /* Combine min with lod */
2191       lod = agx_vec2(b, lod, min_lod);
2192    }
2193 
2194    agx_index dst = agx_def_index(&instr->def);
2195 
2196    /* Pack shadow reference value (compare) and packed offset together */
2197    agx_index compare_offset = agx_null();
2198 
2199    if (!agx_is_null(compare) && !agx_is_null(packed_offset))
2200       compare_offset = agx_vec2(b, compare, packed_offset);
2201    else if (!agx_is_null(packed_offset))
2202       compare_offset = packed_offset;
2203    else if (!agx_is_null(compare))
2204       compare_offset = compare;
2205 
2206    agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
2207    agx_instr *I = agx_texture_sample_to(
2208       b, tmp, coords, lod, bindless, texture, sampler, compare_offset,
2209       agx_tex_dim(instr->sampler_dim, instr->is_array), lod_mode, 0,
2210       !agx_is_null(packed_offset), !agx_is_null(compare),
2211       instr->op == nir_texop_lod, agx_gather_for_nir(instr));
2212 
2213    if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms) {
2214       I->op = AGX_OPCODE_TEXTURE_LOAD;
2215       b->shader->out->uses_txf = true;
2216    }
2217 
2218    /* Destination masking doesn't seem to work properly for gathers (because
2219     * it's mostly pointless), but it does show up in the lowering of
2220     * textureGatherOffsets. Don't try to mask the destination for gathers.
2221     */
2222    bool masked = (instr->op != nir_texop_tg4);
2223    I->mask = agx_expand_tex_to(b, &instr->def, tmp, masked);
2224 }
2225 
2226 /*
2227  * Determine if a NIR loop (CF list) uses a continue jump, including within
2228  * if-else statements but not including nested loops.
2229  */
2230 static bool
2231 cf_list_uses_continue(struct exec_list *list)
2232 {
2233    foreach_list_typed(nir_cf_node, node, node, list) {
2234       if (node->type == nir_cf_node_block) {
2235          nir_block *block = nir_cf_node_as_block(node);
2236 
2237          nir_foreach_instr(instr, block) {
2238             if (instr->type == nir_instr_type_jump &&
2239                 nir_instr_as_jump(instr)->type == nir_jump_continue)
2240                return true;
2241          }
2242       } else if (node->type == nir_cf_node_if) {
2243          nir_if *nif = nir_cf_node_as_if(node);
2244 
2245          if (cf_list_uses_continue(&nif->then_list) ||
2246              cf_list_uses_continue(&nif->else_list))
2247             return true;
2248       } else {
2249          assert(node->type == nir_cf_node_loop && "don't care about nesting");
2250       }
2251    }
2252 
2253    return false;
2254 }
2255 
2256 static bool
2257 loop_uses_continue(nir_loop *loop)
2258 {
2259    return cf_list_uses_continue(&loop->body);
2260 }
2261 
2262 /*
2263  * NIR loops are treated as a pair of AGX loops:
2264  *
2265  *    do {
2266  *       do {
2267  *          ...
2268  *       } while (0);
2269  *    } while (cond);
2270  *
2271  * By manipulating the nesting counter, we may break out of nested loops, so
2272  * under the model, both break and continue may be implemented as breaks, where
2273  * break breaks out of the outer loop (2 layers) and continue breaks out of the
2274  * inner loop (1 layer).
2275  *
2276  * After manipulating the nesting counter directly, pop_exec #0 must be used to
2277  * flush the update to the execution mask.
2278  */
2279 static void
2280 agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
2281 {
2282    agx_context *ctx = b->shader;
2283    assert(instr->type == nir_jump_break || instr->type == nir_jump_continue);
2284 
2285    /* Break out of either one or two loops */
2286    unsigned nestings = b->shader->loop_nesting;
2287 
2288    if (instr->type == nir_jump_continue) {
2289       nestings += 1;
2290       agx_block_add_successor(ctx->current_block, ctx->continue_block);
2291    } else if (instr->type == nir_jump_break) {
2292       nestings += ctx->loop_continues ? 2 : 1;
2293       agx_block_add_successor(ctx->current_block, ctx->break_block);
2294    }
2295 
2296    agx_break(b, nestings, ctx->break_block);
2297    ctx->current_block->unconditional_jumps = true;
2298 }
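/*
 * Worked example for the nesting counts above (illustrative): directly inside
 * the body of a loop that uses continue (loop_nesting == 0,
 * loop_continues == true),
 *
 *    continue -> break #1, resuming at the continue block (inner loop only)
 *    break    -> break #2, resuming at the break block (both nested loops)
 *
 * If the loop has no continue, a break only needs to escape one layer and is
 * emitted as break #1.
 */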
2299 
2300 static void
2301 agx_emit_phi(agx_builder *b, nir_phi_instr *instr)
2302 {
2303    agx_instr *I =
2304       agx_phi_to(b, agx_def_index(&instr->def), exec_list_length(&instr->srcs));
2305 
2306    /* Deferred */
2307    I->phi = instr;
2308 }
2309 
2310 /* Look up the AGX block corresponding to a given NIR block. Used when
2311  * translating phi nodes after emitting all blocks.
2312  */
2313 static agx_block *
2314 agx_from_nir_block(agx_context *ctx, nir_block *block)
2315 {
2316    return ctx->indexed_nir_blocks[block->index];
2317 }
2318 
2319 static void
2320 agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I)
2321 {
2322    nir_phi_instr *phi = I->phi;
2323    I->phi = NULL;
2324 
2325    /* Guaranteed by lower_phis_to_scalar */
2326    assert(phi->def.num_components == 1);
2327 
2328    nir_foreach_phi_src(src, phi) {
2329       agx_block *pred = agx_from_nir_block(ctx, src->pred);
2330       unsigned i = agx_predecessor_index(block, pred);
2331       assert(i < I->nr_srcs);
2332 
2333       I->src[i] = agx_src_index(&src->src);
2334    }
2335 }
2336 
2337 static void
2338 agx_emit_phis_deferred(agx_context *ctx)
2339 {
2340    agx_foreach_block(ctx, block) {
2341       agx_foreach_phi_in_block(block, I)
2342          agx_emit_phi_deferred(ctx, block, I);
2343    }
2344 }
2345 
2346 static void
2347 agx_emit_undef(agx_builder *b, nir_undef_instr *instr)
2348 {
2349    /* For now, just lower undefs to zero. This doesn't matter too much, since
2350     * the lowering happens in NIR and this just allows for late lowering passes
2351     * to result in undefs.
2352     */
2353    if (instr->def.num_components > 1) {
2354       assert(instr->def.num_components <= 4);
2355       agx_index zero = agx_mov_imm(b, instr->def.bit_size, 0);
2356 
2357       agx_emit_collect_to(b, agx_def_index(&instr->def),
2358                           instr->def.num_components,
2359                           (agx_index[4]){zero, zero, zero, zero});
2360    } else {
2361       agx_mov_imm_to(b, agx_def_index(&instr->def), 0);
2362    }
2363 }
2364 
2365 static void
2366 agx_emit_instr(agx_builder *b, struct nir_instr *instr)
2367 {
2368    switch (instr->type) {
2369    case nir_instr_type_load_const:
2370       agx_emit_load_const(b, nir_instr_as_load_const(instr));
2371       break;
2372 
2373    case nir_instr_type_intrinsic:
2374       agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
2375       break;
2376 
2377    case nir_instr_type_alu:
2378       agx_emit_alu(b, nir_instr_as_alu(instr));
2379       break;
2380 
2381    case nir_instr_type_tex:
2382       agx_emit_tex(b, nir_instr_as_tex(instr));
2383       break;
2384 
2385    case nir_instr_type_jump:
2386       agx_emit_jump(b, nir_instr_as_jump(instr));
2387       break;
2388 
2389    case nir_instr_type_phi:
2390       agx_emit_phi(b, nir_instr_as_phi(instr));
2391       break;
2392 
2393    case nir_instr_type_undef:
2394       agx_emit_undef(b, nir_instr_as_undef(instr));
2395       break;
2396 
2397    default:
2398       unreachable("should've been lowered");
2399    }
2400 }
2401 
2402 static agx_block *
2403 agx_create_block(agx_context *ctx)
2404 {
2405    agx_block *blk = rzalloc(ctx, agx_block);
2406 
2407    util_dynarray_init(&blk->predecessors, blk);
2408 
2409    return blk;
2410 }
2411 
2412 static agx_block *
2413 emit_block(agx_context *ctx, nir_block *block)
2414 {
2415    if (ctx->after_block) {
2416       ctx->current_block = ctx->after_block;
2417       ctx->after_block = NULL;
2418    } else {
2419       ctx->current_block = agx_create_block(ctx);
2420    }
2421 
2422    agx_block *blk = ctx->current_block;
2423    list_addtail(&blk->link, &ctx->blocks);
2424    list_inithead(&blk->instructions);
2425 
2426    ctx->indexed_nir_blocks[block->index] = blk;
2427 
2428    agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
2429 
2430    nir_foreach_instr(instr, block) {
2431       agx_emit_instr(&_b, instr);
2432    }
2433 
2434    return blk;
2435 }
2436 
2437 static agx_block *emit_cf_list(agx_context *ctx, struct exec_list *list);
2438 
2439 /* Emit if-else as
2440  *
2441  *    if_icmp cond != 0
2442  *       ...
2443  *    else_icmp cond == 0
2444  *       ...
2445  *    pop_exec
2446  *
2447  * If the else is empty, we can omit the else_icmp. This happens elsewhere, as
2448  * an empty else block can become nonempty after RA due to phi lowering. This is
2449  * not usually optimal, but it's a start.
2450  */
2451 
2452 static void
2453 emit_if(agx_context *ctx, nir_if *nif)
2454 {
2455    agx_block *first_block = ctx->current_block;
2456    agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
2457    agx_index cond = agx_src_index(&nif->condition);
2458 
2459    agx_instr *if_ = agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true,
2460                                 NULL /* filled in later */);
2461    ctx->loop_nesting++;
2462    ctx->total_nesting++;
2463 
2464    /* Emit the two subblocks. */
2465    agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
2466    agx_block *end_then = ctx->current_block;
2467 
2468    _b.cursor = agx_after_block(ctx->current_block);
2469 
2470    agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
2471    agx_block *end_else = ctx->current_block;
2472 
2473    /* If the "if" fails, we fallthrough to the else */
2474    if_->target = else_block;
2475 
2476    /* Insert an else instruction at the beginning of the else block. We use
2477     * "else_fcmp 0.0, 0.0, eq" as unconditional else, matching the blob.
2478     *
2479     * If it fails, we fall through to the logical end of the last else block.
2480     */
2481    _b.cursor = agx_before_block(else_block);
2482    agx_else_fcmp(&_b, agx_zero(), agx_zero(), 1, AGX_FCOND_EQ, false, end_else);
2483 
2484    ctx->after_block = agx_create_block(ctx);
2485 
2486    agx_block_add_successor(first_block, if_block);
2487    agx_block_add_successor(first_block, else_block);
2488    agx_block_add_successor(end_then, ctx->after_block);
2489    agx_block_add_successor(end_else, ctx->after_block);
2490 
2491    _b.cursor = agx_after_block(ctx->current_block);
2492    agx_pop_exec(&_b, 1);
2493    ctx->loop_nesting--;
2494    ctx->total_nesting--;
2495 }
2496 
2497 static void
2498 emit_loop(agx_context *ctx, nir_loop *nloop)
2499 {
2500    assert(!nir_loop_has_continue_construct(nloop));
2501    /* We only track nesting within the innermost loop, so push and reset */
2502    unsigned pushed_nesting = ctx->loop_nesting;
2503    ctx->loop_nesting = 0;
2504    ctx->total_nesting++;
2505 
2506    bool old_continues = ctx->loop_continues;
2507    ctx->loop_continues = loop_uses_continue(nloop);
2508 
2509    agx_block *popped_break = ctx->break_block;
2510    agx_block *popped_continue = ctx->continue_block;
2511 
2512    ctx->break_block = agx_create_block(ctx);
2513    ctx->continue_block = agx_create_block(ctx);
2514 
2515    /* If we are emitting a loop inside other control flow, there might be
2516     * threads masked off (TODO: divergence analysis), so push_exec them so
2517     * we get the lower nesting count values to ourselves.
2518     */
2519    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2520    if (ctx->total_nesting > 1)
2521       agx_push_exec(&_b, ctx->loop_continues ? 2 : 1);
2522 
2523    /* Fallthrough to body */
2524    agx_block_add_successor(ctx->current_block, ctx->continue_block);
2525 
2526    /* Emit the body */
2527    ctx->after_block = ctx->continue_block;
2528    ctx->after_block->loop_header = true;
2529    agx_block *start_block = emit_cf_list(ctx, &nloop->body);
2530 
2531    /* If we used any continue jumps, we need to reactivate the continued
2532     * threads. We do this with an always true while_icmp, which behaves like:
2533     *
2534     *    if (r0l == 1) {
2535     *       r0l = 0;
2536     *    }
2537     *    update_exec
2538     *
2539     * If we did not use continue, this would be a no-op so it is omitted.
2540     */
2541    _b.cursor = agx_after_block(ctx->current_block);
2542 
2543    if (ctx->loop_continues) {
2544       agx_while_icmp(
2545          &_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false,
2546          NULL /* no semantic target, used purely for side effects */);
2547    }
2548 
2549    agx_jmp_exec_any(&_b, start_block);
2550    agx_pop_exec(&_b, ctx->loop_continues ? 2 : 1);
2551    agx_block_add_successor(ctx->current_block, ctx->continue_block);
2552 
2553    /* Pop off */
2554    ctx->after_block = ctx->break_block;
2555    ctx->break_block = popped_break;
2556    ctx->continue_block = popped_continue;
2557 
2558    /* Update shader-db stats */
2559    ++ctx->loop_count;
2560 
2561    /* All nested control flow must have finished */
2562    assert(ctx->loop_nesting == 0);
2563 
2564    /* Restore loop nesting (we might be inside an if inside an outer loop) */
2565    ctx->loop_nesting = pushed_nesting;
2566    ctx->total_nesting--;
2567    ctx->loop_continues = old_continues;
2568 }
2569 
2570 /* Before the first control flow structure, the nesting counter needs to be
2571  * zeroed for correct operation. This only happens at most once, since by
2572  * definition this occurs at the end of the first block, which dominates the
2573  * rest of the program. */
2574 
2575 static void
2576 emit_first_cf(agx_context *ctx)
2577 {
2578    if (ctx->any_cf)
2579       return;
2580 
2581    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2582    agx_begin_cf(&_b);
2583    ctx->any_cf = true;
2584 }
2585 
2586 static agx_block *
2587 emit_cf_list(agx_context *ctx, struct exec_list *list)
2588 {
2589    agx_block *start_block = NULL;
2590 
2591    foreach_list_typed(nir_cf_node, node, node, list) {
2592       switch (node->type) {
2593       case nir_cf_node_block: {
2594          agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2595 
2596          if (!start_block)
2597             start_block = block;
2598 
2599          break;
2600       }
2601 
2602       case nir_cf_node_if:
2603          emit_first_cf(ctx);
2604          emit_if(ctx, nir_cf_node_as_if(node));
2605          break;
2606 
2607       case nir_cf_node_loop:
2608          emit_first_cf(ctx);
2609          emit_loop(ctx, nir_cf_node_as_loop(node));
2610          break;
2611 
2612       default:
2613          unreachable("Unknown control flow");
2614       }
2615    }
2616 
2617    return start_block;
2618 }
2619 
2620 static void
2621 agx_set_st_vary_final(agx_context *ctx)
2622 {
2623    agx_foreach_instr_global_rev(ctx, I) {
2624       if (I->op == AGX_OPCODE_ST_VARY) {
2625          I->last = true;
2626          return;
2627       }
2628    }
2629 
2630    /* If we got here, there was no varying written. We need to mark that. */
2631    agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
2632    agx_builder _b = agx_init_builder(ctx, agx_after_block_logical(last_block));
2633    agx_no_varyings(&_b);
2634 }
2635 
2636 static int
2637 agx_dump_stats(agx_context *ctx, unsigned size, char **out)
2638 {
2639    unsigned nr_ins = 0, spills = 0, fills = 0;
2640 
2641    /* Count instructions */
2642    agx_foreach_instr_global(ctx, I) {
2643       nr_ins++;
2644 
2645       if (I->op == AGX_OPCODE_STACK_STORE)
2646          spills++;
2647       else if (I->op == AGX_OPCODE_STACK_LOAD)
2648          fills++;
2649    }
2650 
2651    struct agx_cycle_estimate cycles = agx_estimate_cycles(ctx);
2652 
2653    unsigned nr_threads =
2654       agx_occupancy_for_register_count(ctx->max_reg).max_threads;
2655 
2656    return asprintf(
2657       out,
2658       "%s shader: %u inst, %u alu, %u fscib, %u ic, %u bytes, %u regs, "
2659       "%u uniforms, %u scratch, %u threads, %u loops, "
2660       "%u:%u spills:fills",
2661       gl_shader_stage_name(ctx->stage), nr_ins, cycles.alu, cycles.f_scib,
2662       cycles.ic, size, ctx->max_reg, ctx->out->push_count, ctx->scratch_size,
2663       nr_threads, ctx->loop_count, spills, fills);
2664 }
2665 
2666 static bool
2667 agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
2668 {
2669    if (instr->type != nir_instr_type_alu)
2670       return false;
2671 
2672    nir_alu_instr *alu = nir_instr_as_alu(instr);
2673    return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
2674 }
2675 
2676 /* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
2677  * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
2678  * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
2679  * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
2680  * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
2681  * need to change units from radians to quadrants modulo turns. Cosine is
2682  * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
2683  */
2684 
2685 static nir_def *
2686 agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
2687 {
2688    nir_alu_instr *alu = nir_instr_as_alu(instr);
2689    nir_def *x = nir_mov_alu(b, alu->src[0], 1);
2690    nir_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
2691 
2692    if (alu->op == nir_op_fcos)
2693       turns = nir_fadd_imm(b, turns, 0.25f);
2694 
2695    nir_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
2696    return nir_fsin_agx(b, quadrants);
2697 }
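/*
 * Worked example of the unit change above (illustrative): lowering fcos(pi)
 * gives turns = pi * (1 / (2 * pi)) + 0.25 = 0.75, so quadrants =
 * fract(0.75) * 4 = 3, and fsin_agx(3) evaluates sin(3 * tau / 4) = -1,
 * matching cos(pi).
 */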
2698 
2699 static bool
2700 agx_lower_sincos(nir_shader *shader)
2701 {
2702    return nir_shader_lower_instructions(shader, agx_lower_sincos_filter,
2703                                         agx_lower_sincos_impl, NULL);
2704 }
2705 
2706 static bool
2707 agx_lower_front_face(struct nir_builder *b, nir_intrinsic_instr *intr,
2708                      UNUSED void *data)
2709 {
2710    if (intr->intrinsic != nir_intrinsic_load_front_face)
2711       return false;
2712 
2713    nir_def *def = &intr->def;
2714    assert(def->bit_size == 1);
2715 
2716    b->cursor = nir_before_instr(&intr->instr);
2717    nir_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
2718    return true;
2719 }
2720 
2721 /*
2722  * Standard NIR optimization loop. This is run in agx_preprocess_nir, then once
2723  * again at shader variant compile time. Unless there was a complex shader key,
2724  * the latter run should be almost a no-op.
2725  */
2726 static void
2727 agx_optimize_loop_nir(nir_shader *nir)
2728 {
2729    bool progress;
2730 
2731    do {
2732       progress = false;
2733 
2734       NIR_PASS(progress, nir, nir_copy_prop);
2735       NIR_PASS(progress, nir, nir_opt_remove_phis);
2736       NIR_PASS(progress, nir, nir_opt_dce);
2737       NIR_PASS(progress, nir, nir_opt_dead_cf);
2738       NIR_PASS(progress, nir, nir_opt_cse);
2739       NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
2740       NIR_PASS(progress, nir, nir_opt_phi_precision);
2741       NIR_PASS(progress, nir, nir_opt_algebraic);
2742       NIR_PASS(progress, nir, nir_opt_constant_folding);
2743       NIR_PASS(progress, nir, nir_opt_undef);
2744       NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
2745       NIR_PASS(progress, nir, nir_opt_loop_unroll);
2746    } while (progress);
2747 }
2748 
2749 static bool
2750 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
2751                  unsigned num_components, nir_intrinsic_instr *low,
2752                  nir_intrinsic_instr *high, void *data)
2753 {
2754    /* Must be aligned to the size of the load */
2755    unsigned align = nir_combined_align(align_mul, align_offset);
2756    if ((bit_size / 8) > align)
2757       return false;
2758 
2759    if (num_components > 4)
2760       return false;
2761 
2762    if (bit_size > 32)
2763       return false;
2764 
2765    return true;
2766 }
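/*
 * Illustrative consequences of the callback above: two adjacent 32-bit scalar
 * loads with 4-byte combined alignment may merge into one vec2 32-bit load
 * (align 4 >= 32/8 bytes, 2 <= 4 components, 32 <= 32 bits), while 64-bit or
 * more-than-vec4 candidates are rejected.
 */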
2767 
2768 static bool
2769 set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
2770 {
2771    if (!nir_intrinsic_has_access(intr))
2772       return false;
2773 
2774    nir_intrinsic_set_access(intr,
2775                             ACCESS_CAN_SPECULATE | nir_intrinsic_access(intr));
2776    return true;
2777 }
2778 
2779 static void
2780 agx_optimize_nir(nir_shader *nir, bool soft_fault, unsigned *preamble_size)
2781 {
2782    /* This runs only once up front since other optimizations don't affect it */
2783    NIR_PASS(_, nir, nir_opt_shrink_stores, true);
2784 
2785    agx_optimize_loop_nir(nir);
2786 
2787    /* If soft fault is enabled, we can freely speculate everything. That lets us
2788     * peephole select and form preambles more aggressively.
2789     */
2790    if (soft_fault) {
2791       NIR_PASS(_, nir, nir_shader_intrinsics_pass, set_speculate,
2792                nir_metadata_control_flow, NULL);
2793    }
2794 
2795    /* Peephole select again after setting the speculate flag but before
2796     * vectorizing. This cleans up short-circuit loads in unrolled loops.
2797     *
2798     * XXX: Set indirect_load_ok once we can investigate CTS flakes.
2799     */
2800    NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
2801 
2802    NIR_PASS(_, nir, nir_opt_load_store_vectorize,
2803             &(const nir_load_store_vectorize_options){
2804                .modes = nir_var_mem_global | nir_var_mem_constant,
2805                .callback = mem_vectorize_cb,
2806             });
2807    NIR_PASS(_, nir, nir_lower_pack);
2808 
2809    nir_convert_to_lcssa(nir, true, true);
2810    NIR_PASS_V(nir, nir_divergence_analysis);
2811    bool progress = false;
2812 
2813    static const nir_lower_subgroups_options subgroups_options = {
2814       .ballot_bit_size = 32,
2815       .ballot_components = 1,
2816       .lower_elect = true,
2817       .lower_subgroup_masks = true,
2818    };
2819 
2820    NIR_PASS(progress, nir, nir_opt_uniform_atomics, true);
2821    NIR_PASS(progress, nir, nir_opt_uniform_subgroup, &subgroups_options);
2822 
2823    /* The passes above create operations that need lowering/optimizing */
2824    if (progress) {
2825       NIR_PASS(_, nir, agx_nir_lower_subgroups);
2826       NIR_PASS(_, nir, nir_opt_algebraic);
2827    }
2828 
2829    progress = false;
2830    NIR_PASS(progress, nir, agx_nir_lower_address);
2831 
2832    /* If address lowering made progress, clean up before forming preambles.
2833     * Otherwise the optimized preambles might just be constants! Do it before
2834     * lowering int64 too, to avoid lowering constant int64 arithmetic.
2835     */
2836    if (progress) {
2837       NIR_PASS(_, nir, nir_opt_constant_folding);
2838       NIR_PASS(_, nir, nir_opt_dce);
2839    }
2840 
2841    /* Only lower int64 after optimizing address arithmetic, so that u2u64/i2i64
2842     * conversions remain.
2843     */
2844    progress = false;
2845    NIR_PASS(progress, nir, nir_lower_int64);
2846 
2847    /* If we lowered actual int64 arithmetic (not folded into the address
2848     * calculations), then clean up after the lowering.
2849     */
2850    if (progress) {
2851       do {
2852          progress = false;
2853 
2854          NIR_PASS(progress, nir, nir_opt_algebraic);
2855          NIR_PASS(progress, nir, nir_opt_constant_folding);
2856          NIR_PASS(progress, nir, nir_opt_dce);
2857       } while (progress);
2858    }
2859 
2860    if (preamble_size && (!(agx_compiler_debug & AGX_DBG_NOPREAMBLE)))
2861       NIR_PASS(_, nir, agx_nir_opt_preamble, preamble_size);
2862 
2863    /* Forming preambles may dramatically reduce the instruction count
2864     * in certain blocks, causing some if-else statements to become
2865     * trivial. We want to peephole select those, given that control flow
2866     * prediction instructions are costly.
2867     *
2868     * We need to lower int64 again to deal with the resulting 64-bit csels.
2869     */
2870    NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
2871    NIR_PASS(_, nir, nir_lower_int64);
2872 
2873    NIR_PASS(_, nir, nir_opt_algebraic_late);
2874 
2875    /* Fuse add/sub/multiplies/shifts after running opt_algebraic_late to fuse
2876     * isub but before shifts are lowered.
2877     */
2878    do {
2879       progress = false;
2880 
2881       NIR_PASS(progress, nir, nir_opt_dce);
2882       NIR_PASS(progress, nir, nir_opt_cse);
2883       NIR_PASS(progress, nir, agx_nir_fuse_algebraic_late);
2884    } while (progress);
2885 
2886    /* Do remaining lowering late, since this inserts &s for shifts so we want to
2887     * do it after fusing constant shifts. Constant folding will clean up.
2888     */
2889    NIR_PASS(_, nir, agx_nir_lower_algebraic_late);
2890    NIR_PASS(_, nir, agx_nir_fuse_selects);
2891    NIR_PASS(_, nir, nir_opt_constant_folding);
2892    NIR_PASS(_, nir, nir_opt_combine_barriers, NULL, NULL);
2893    NIR_PASS(_, nir, nir_copy_prop);
2894    NIR_PASS(_, nir, nir_opt_dce);
2895    NIR_PASS(_, nir, nir_opt_cse);
2896    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
2897    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
2898 
2899    /* Cleanup optimizations */
2900    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
2901                                nir_move_load_input | nir_move_comparisons |
2902                                nir_move_copies | nir_move_load_ssbo |
2903                                nir_move_alu;
2904 
2905    NIR_PASS(_, nir, nir_opt_sink, move_all);
2906    NIR_PASS(_, nir, nir_opt_move, move_all);
2907    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
2908 }
2909 
2910 /*
2911  * Varyings that are used as texture coordinates should be kept at fp32, because
2912  * fp16 does not have enough precision for large textures. It's technically
2913  * conformant not to, but every app gets this wrong.
2914  */
2915 static bool
2916 gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
2917 {
2918    uint64_t *mask = data;
2919 
2920    if (instr->type != nir_instr_type_tex)
2921       return false;
2922 
2923    nir_tex_instr *tex = nir_instr_as_tex(instr);
2924 
2925    int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
2926    if (coord_idx < 0)
2927       return false;
2928 
2929    nir_src src = tex->src[coord_idx].src;
2930    nir_scalar x = nir_scalar_resolved(src.ssa, 0);
2931    nir_scalar y = nir_scalar_resolved(src.ssa, 1);
2932 
2933    if (x.def != y.def)
2934       return false;
2935 
2936    nir_instr *parent = x.def->parent_instr;
2937 
2938    if (parent->type != nir_instr_type_intrinsic)
2939       return false;
2940 
2941    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
2942 
2943    if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
2944       return false;
2945 
2946    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2947    *mask |= BITFIELD64_BIT(sem.location);
2948    return false;
2949 }
2950 
2951 static bool
2952 gather_interp(nir_builder *b, nir_intrinsic_instr *intr, void *data)
2953 {
2954    struct agx_interp_info *masks = data;
2955 
2956    if (intr->intrinsic == nir_intrinsic_load_input) {
2957       nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2958       masks->flat |= BITFIELD64_RANGE(sem.location, sem.num_slots);
2959    } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input &&
2960               nir_intrinsic_interp_mode(nir_src_as_intrinsic(intr->src[0])) ==
2961                  INTERP_MODE_NOPERSPECTIVE) {
2962       nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2963       masks->linear |= BITFIELD64_RANGE(sem.location, sem.num_slots);
2964    }
2965 
2966    return false;
2967 }
2968 
2969 /*
2970  * Build a bit mask of varyings (by location) that are flatshaded and linear
2971  * shaded. This information is needed by the driver.
2972  */
2973 struct agx_interp_info
2974 agx_gather_interp_info(nir_shader *nir)
2975 {
2976    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
2977 
2978    struct agx_interp_info masks = {0};
2979    nir_shader_intrinsics_pass(nir, gather_interp, nir_metadata_all, &masks);
2980    return masks;
2981 }
2982 
2983 /*
2984  * Build a bit mask of varyings (by location) that are used as texture
2985  * coordinates. This information is needed by lower_mediump_io.
2986  */
2987 uint64_t
2988 agx_gather_texcoords(nir_shader *nir)
2989 {
2990    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
2991 
2992    uint64_t mask = 0;
2993    nir_shader_instructions_pass(nir, gather_texcoords, nir_metadata_all, &mask);
2994    return mask;
2995 }
2996 
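/* A worked example of the callback below (illustrative, not exhaustive): a
 * 32-bit load of 12 bytes at 4-byte alignment stays 32-bit and becomes
 * MIN2(12 / 4, 4) = 3 components; a 6-byte load at 2-byte alignment drops to
 * 16-bit with 3 components; any odd byte count or 1-byte alignment forces
 * 8-bit access.
 */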
2997 static nir_mem_access_size_align
mem_access_size_align_cb(nir_intrinsic_op intrin,uint8_t bytes,uint8_t bit_size,uint32_t align,uint32_t align_offset,bool offset_is_const,const void * cb_data)2998 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
2999                          uint8_t bit_size, uint32_t align,
3000                          uint32_t align_offset, bool offset_is_const,
3001                          const void *cb_data)
3002 {
3003    align = nir_combined_align(align, align_offset);
3004 
3005    assert(util_is_power_of_two_nonzero(align));
3006 
3007    if ((bytes & 1) || (align == 1))
3008       bit_size = 8;
3009    else if ((bytes & 2) || (align == 2))
3010       bit_size = 16;
3011    else if (bit_size >= 32)
3012       bit_size = 32;
3013 
3014    return (nir_mem_access_size_align){
3015       .num_components = MIN2(bytes / (bit_size / 8), 4),
3016       .bit_size = bit_size,
3017       .align = bit_size / 8,
3018    };
3019 }
3020 
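/* A sketch of the bit-size policy below: an 8-bit iadd is widened to 16-bit;
 * an i2i32 of an 8-bit source is left alone, since conversions from 8-bit are
 * native; a 1-bit comparison of 8-bit sources becomes a 16-bit comparison;
 * subgroup reductions/scans of 8-bit values (and iand reductions of 1-bit
 * booleans) are performed at 16-bit.
 */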
3021 static unsigned
lower_bit_size_callback(const nir_instr * instr,UNUSED void * _)3022 lower_bit_size_callback(const nir_instr *instr, UNUSED void *_)
3023 {
3024    if (instr->type == nir_instr_type_intrinsic) {
3025       /* Handle small subgroup ops */
3026       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3027 
3028       switch (intr->intrinsic) {
3029       case nir_intrinsic_reduce:
3030       case nir_intrinsic_exclusive_scan:
3031       case nir_intrinsic_inclusive_scan:
3032          /* The identity for iand doesn't work for lowered 1-bit booleans, so
3033           * lower that explicitly.
3034           */
3035          if (nir_intrinsic_reduction_op(intr) == nir_op_iand &&
3036              intr->def.bit_size == 1)
3037             return 16;
3038 
3039          /* In general, we have 16-bit ops instead of 8-bit, so lower those. */
3040          return intr->def.bit_size == 8 ? 16 : 0;
3041       default:
3042          return 0;
3043       }
3044    } else if (instr->type == nir_instr_type_alu) {
3045       /* Lower 8-bit ALU to 16-bit. We check the destination, as we do not want
3046        * to lower conversions from 8-bit to larger types. Those conversions get
3047        * implemented natively.
3048        */
3049       nir_alu_instr *alu = nir_instr_as_alu(instr);
3050       if (alu->def.bit_size == 8 && !is_conversion_to_8bit(alu->op))
3051          return 16;
3052       else if (alu->def.bit_size == 1 && alu->src[0].src.ssa->bit_size == 8)
3053          return 16 /* comparisons */;
3054    }
3055 
3056    return 0;
3057 }
3058 
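/* Schematically (an illustrative sketch, with made-up operand values), the
 * lowering below turns a handle such as
 *
 *    vec2(11, 0x20)
 *
 * into
 *
 *    iadd(load_preamble.u64 at uniform 11, u2u64(0x20))
 *
 * i.e. a 64-bit base address read from the preamble plus a byte offset.
 */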
3059 static bool
lower_load_from_texture_handle(nir_builder * b,nir_intrinsic_instr * intr,void * data)3060 lower_load_from_texture_handle(nir_builder *b, nir_intrinsic_instr *intr,
3061                                void *data)
3062 {
3063    if (intr->intrinsic != nir_intrinsic_load_from_texture_handle_agx)
3064       return false;
3065 
3066    /* Bindless handles are a vec2, where the first component is the (constant)
3067     * uniform register number and the second component is the byte offset.
3068     */
3069    nir_scalar uniform = nir_scalar_resolved(intr->src[0].ssa, 0);
3070    unsigned uniform_idx = nir_scalar_as_uint(uniform);
3071 
3072    b->cursor = nir_instr_remove(&intr->instr);
3073    nir_def *base = nir_load_preamble(b, 1, 64, uniform_idx);
3074    nir_def *offset = nir_u2u64(b, nir_channel(b, intr->src[0].ssa, 1));
3075 
3076    nir_def_rewrite_uses(&intr->def, nir_iadd(b, base, offset));
3077    return true;
3078 }
3079 
3080 static void
agx_remove_unreachable_block(agx_block * block)3081 agx_remove_unreachable_block(agx_block *block)
3082 {
3083    /* Delete the edges */
3084    agx_foreach_successor(block, succ) {
3085       unsigned block_idx = agx_predecessor_index(succ, block);
3086 
3087       /* Remove the corresponding predecessor from the successor */
3088       struct util_dynarray *blocks = &succ->predecessors;
3089       int remaining = agx_num_predecessors(succ) - (block_idx + 1);
3090       assert(remaining >= 0);
3091 
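      /* Shift the tail of the predecessor array down by one. For example, if
       * the successor's predecessors are {A, B, C} and B is this unreachable
       * block (block_idx == 1), the array becomes {A, C}; the matching phi
       * sources are compacted the same way below.
       */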
3092       memmove(util_dynarray_element(blocks, agx_block *, block_idx),
3093               util_dynarray_element(blocks, agx_block *, block_idx + 1),
3094               remaining * sizeof(agx_block *));
3095       blocks->size -= sizeof(agx_block *);
3096 
3097       /* Remove the corresponding source from the phis */
3098       agx_foreach_phi_in_block(succ, phi) {
3099          assert(block_idx + 1 <= phi->nr_srcs);
3100 
3101          memmove(phi->src + block_idx, phi->src + block_idx + 1,
3102                  (phi->nr_srcs - (block_idx + 1)) * sizeof(phi->src[0]));
3103 
3104          phi->nr_srcs--;
3105 
3106          /* This might cause phis to become trivial. Lower 1-source phis to
3107           * moves and let copyprop take it from here.
3108           */
3109          if (phi->nr_srcs == 1) {
3110             phi->op = AGX_OPCODE_MOV;
3111          }
3112       }
3113    }
3114 
3115    /* Remove the successor from the predecessor. */
3116    block->successors[0] = NULL;
3117    block->successors[1] = NULL;
3118 
3119    /* Note: we do not remove the block itself, although it is now fully orphaned
3120     * in the control flow graph. We still need it in source order if it has any
3121     * pop_exec instructions, e.g. for a loop continue block.
3122     *
3123     * TODO: Is there a better way to handle this?
3124     *
3125     * Affects: dEQP-VK.graphicsfuzz.cov-matching-if-always-true-inside-loop
3126     */
3127 }
3128 
3129 /*
3130  * NIR sometimes contains unreachable blocks (e.g. due to infinite loops). These
3131  * blocks have no predecessors, but do have successors and can contribute to
3132  * phis. They are dead and do not need to be here. Further, they violate the IR
3133  * invariant:
3134  *
3135  *    Live-in sources are live-out in all predecessors.
3136  *
3137  * ...which RA depends on when handling live range splits. The simplest solution
3138  * is to simply delete these dead blocks. Fortunately, because they are
3139  * unreachable, this does not have any ill effects. Notably, this cannot
3140  * introduce critical edges.
3141  *
3142  * Deleting a block may cause a successor to become unreachable, so we use a
3143  * fixed-point algorithm to converge.
3144  */
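/* For example, if unreachable block A branches to block B whose only
 * predecessor is A, the first iteration deletes A's edges, leaving B with no
 * predecessors; the next iteration then removes B as well (provided B has
 * successors of its own).
 */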
3145 static void
agx_remove_unreachable_blocks(agx_context * ctx)3146 agx_remove_unreachable_blocks(agx_context *ctx)
3147 {
3148    agx_block *start = agx_start_block(ctx);
3149    bool progress;
3150 
3151    do {
3152       progress = false;
3153 
3154       agx_foreach_block_safe(ctx, pred) {
3155          if (pred != start && agx_num_predecessors(pred) == 0 &&
3156              agx_num_successors(pred) > 0) {
3157 
3158             agx_remove_unreachable_block(pred);
3159             progress = true;
3160          }
3161       }
3162    } while (progress);
3163 }
3164 
3165 static bool
agx_should_dump(nir_shader * nir,unsigned agx_dbg_bit)3166 agx_should_dump(nir_shader *nir, unsigned agx_dbg_bit)
3167 {
3168    return (agx_compiler_debug & agx_dbg_bit) &&
3169           !(nir->info.internal && !(agx_compiler_debug & AGX_DBG_INTERNAL));
3170 }
3171 
3172 static unsigned
agx_compile_function_nir(nir_shader * nir,nir_function_impl * impl,struct agx_shader_key * key,struct util_debug_callback * debug,struct util_dynarray * binary,struct agx_shader_info * out)3173 agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
3174                          struct agx_shader_key *key,
3175                          struct util_debug_callback *debug,
3176                          struct util_dynarray *binary,
3177                          struct agx_shader_info *out)
3178 {
3179    nir_index_blocks(impl);
3180    nir_index_ssa_defs(impl);
3181 
3182    agx_context *ctx = rzalloc(NULL, agx_context);
3183    ctx->nir = nir;
3184    ctx->is_preamble = impl->function->is_preamble;
3185    ctx->out = out;
3186    ctx->key = key;
3187    ctx->stage = nir->info.stage;
3188    ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
3189    ctx->indexed_nir_blocks = rzalloc_array(ctx, agx_block *, impl->num_blocks);
3190    list_inithead(&ctx->blocks);
3191 
3192    if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble) {
3193       ctx->any_cf = key->fs.inside_sample_loop;
3194    }
3195 
3196    ctx->alloc = impl->ssa_alloc;
3197    emit_cf_list(ctx, &impl->body);
3198    agx_emit_phis_deferred(ctx);
3199 
3200    /* Index blocks now that we're done emitting so the order is consistent. Do
3201     * this before agx_remove_unreachable_blocks so we match NIR indexing. This
3202     * makes for easier debugging.
3203     */
3204    agx_foreach_block(ctx, block) {
3205       block->index = ctx->num_blocks++;
3206    }
3207 
3208    agx_remove_unreachable_blocks(ctx);
3209 
3210    /* Only allocate scratch if it's statically used, regardless of what the NIR
3211     * info claims.
3212     */
3213    if (ctx->any_scratch) {
3214       assert(!ctx->is_preamble && "preambles don't use scratch");
3215       ctx->scratch_size = ALIGN(nir->scratch_size, 16);
3216    }
3217 
3218    /* Stop the main shader or preamble shader after the exit block. For real
3219     * functions, we would return here.
3220     */
3221    if (!ctx->key->no_stop || ctx->is_preamble) {
3222       agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
3223       agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
3224       agx_stop(&_b);
3225    }
3226 
3227    agx_validate(ctx, "IR translation");
3228 
3229    if (likely(!(agx_compiler_debug & AGX_DBG_NOOPT))) {
3230       /* Eliminate dead instructions before CSE to avoid silly scheduling */
3231       agx_dce(ctx, false);
3232 
3233       /* CSE before eliminating dead destinations so that subdivision is
3234        * optimized properly.
3235        */
3236       agx_opt_cse(ctx);
3237 
3238       /* After DCE, use counts are right so we can run the optimizer. */
3239       agx_optimizer(ctx);
3240       agx_opt_compact_constants(ctx);
3241 
3242       /* After inlining constants, promote what's left */
3243       if (key->promote_constants && !key->secondary &&
3244           !(agx_compiler_debug & AGX_DBG_NOPROMOTE)) {
3245          agx_opt_promote_constants(ctx);
3246       }
3247    }
3248 
3249    /* Lower uniform sources after copyprop (for correctness, as copyprop
3250     * creates uniform sources). To keep register pressure in check, lower
3251     * after CSE, since moves are cheaper than registers.
3252     */
3253    agx_lower_uniform_sources(ctx);
3254 
3255    /* RA correctness depends on DCE */
3256    agx_dce(ctx, true);
3257    agx_validate(ctx, "Pre-RA passes");
3258 
3259    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3260       agx_print_shader(ctx, stdout);
3261 
3262    if (likely(!(agx_compiler_debug & AGX_DBG_NOSCHED))) {
3263       agx_pressure_schedule(ctx);
3264       agx_validate(ctx, "Pre-RA scheduler");
3265    }
3266 
3267    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3268       agx_print_shader(ctx, stdout);
3269 
3270    agx_ra(ctx);
3271    agx_validate(ctx, "RA");
3272    agx_lower_64bit_postra(ctx);
3273 
3274    if (ctx->scratch_size > 0) {
3275       /* Apple always allocates 40 more bytes in the entrypoint and aligns to 4. */
3276       uint64_t stack_size = ALIGN(DIV_ROUND_UP(ctx->scratch_size, 4) + 10, 4);
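      /* Illustrative: scratch_size = 48 bytes gives DIV_ROUND_UP(48, 4) + 10 =
       * 22, aligned up to a stack_size of 24.
       */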
3277 
3278       assert(stack_size < INT16_MAX);
3279 
3280       agx_block *start_block = agx_start_block(ctx);
3281       agx_builder _b = agx_init_builder(ctx, agx_before_block(start_block));
3282       agx_stack_adjust(&_b, stack_size);
3283 
3284       /* If we're going to execute multiple times, make sure we clean up after
3285        * ourselves, else the hardware faults.
3286        */
3287       if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble &&
3288           ctx->key->fs.inside_sample_loop) {
3289 
3290          _b = agx_init_builder(ctx, agx_after_block(agx_end_block(ctx)));
3291          agx_stack_adjust(&_b, -stack_size);
3292       }
3293 
3294       if (ctx->is_preamble)
3295          out->preamble_scratch_size = stack_size;
3296       else
3297          out->scratch_size = stack_size;
3298    }
3299 
3300    if (ctx->stage == MESA_SHADER_VERTEX && !impl->function->is_preamble &&
3301        !ctx->key->secondary)
3302       agx_set_st_vary_final(ctx);
3303 
3304    agx_insert_waits(ctx);
3305    agx_opt_empty_else(ctx);
3306    agx_opt_break_if(ctx);
3307    agx_opt_jmp_none(ctx);
3308 
3309    if (ctx->any_quad_divergent_shuffle)
3310       agx_lower_divergent_shuffle(ctx);
3311 
3312    agx_lower_pseudo(ctx);
3313 
3314    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3315       agx_print_shader(ctx, stdout);
3316 
3317    /* Pad binary */
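   /* For example (illustrative), a binary of 0x1234 bytes grows by 0xcc zero
    * bytes so this function's code is packed at offset 0x1300, a multiple of
    * AGX_CODE_ALIGN.
    */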
3318    if (binary->size % AGX_CODE_ALIGN) {
3319       unsigned ngrow = AGX_CODE_ALIGN - (binary->size % AGX_CODE_ALIGN);
3320       memset(util_dynarray_grow_bytes(binary, ngrow, 1), 0, ngrow);
3321    }
3322 
3323    unsigned offset = binary->size;
3324    assert((offset % AGX_CODE_ALIGN) == 0);
3325 
3326    agx_pack_binary(ctx, binary);
3327 
3328    unsigned nr_gprs = ctx->max_reg + 1;
3329 
3330    /* If the preamble uses scratch (due to spilling), we need to set maximal
3331     * GPRs. Do it here so the driver doesn't have to worry about it.
3332     */
3333    if (impl->function->is_preamble)
3334       out->nr_preamble_gprs = ctx->scratch_size ? 256 : nr_gprs;
3335    else
3336       out->nr_gprs = nr_gprs;
3337 
3338    /* Don't dump statistics for preambles, since they're not worth optimizing */
3339    if (!impl->function->is_preamble) {
3340       char *stats;
3341       int ret = agx_dump_stats(ctx, binary->size, &stats);
3342 
3343       if (ret >= 0) {
3344          if (agx_should_dump(nir, AGX_DBG_SHADERDB)) {
3345             fprintf(stderr, "SHADER-DB: %s - %s\n", nir->info.label ?: "",
3346                     stats);
3347          }
3348 
3349          if (debug)
3350             util_debug_message(debug, SHADER_INFO, "%s", stats);
3351 
3352          free(stats);
3353       }
3354    }
3355 
3356    ralloc_free(ctx);
3357 
3358    return offset;
3359 }
3360 
3361 void
agx_link_libagx(nir_shader * nir,const nir_shader * libagx)3362 agx_link_libagx(nir_shader *nir, const nir_shader *libagx)
3363 {
3364    nir_link_shader_functions(nir, libagx);
3365    NIR_PASS(_, nir, nir_inline_functions);
3366    nir_remove_non_entrypoints(nir);
3367    NIR_PASS(_, nir, nir_opt_deref);
3368    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3369    NIR_PASS(_, nir, nir_remove_dead_derefs);
3370    NIR_PASS(_, nir, nir_remove_dead_variables,
3371             nir_var_function_temp | nir_var_shader_temp, NULL);
3372    NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
3373             nir_var_shader_temp | nir_var_function_temp,
3374             glsl_get_cl_type_size_align);
3375 }
3376 
3377 /*
3378  * The hardware frcp instruction is sometimes off by 1 ULP. For correctly
3379  * rounded frcp, a refinement step is required. This routine has been
3380  * exhaustively tested with a modified math_bruteforce.
3381  *
3382  * While Khronos APIs allow 2.5 ULP error for divides, nir_lower_idiv relies on
3383  * correctly rounded frcp. This is therefore load bearing for integer division
3384  * on all APIs.
3385  */
3386 static nir_def *
libagx_frcp(nir_builder * b,nir_def * x)3387 libagx_frcp(nir_builder *b, nir_def *x)
3388 {
3389    nir_def *u = nir_frcp(b, x);
3390 
3391    /* Do 1 Newton-Raphson refinement step.
3392     *
3393     * Define f(u) = xu - 1. Then f(u) = 0 iff u = 1/x. Newton's method gives:
3394     *
3395     * u_2 = u - f(u) / f'(u) = u - (xu - 1) / x
3396     *
3397     * Our original guess is close, so we approximate (1 / x) by u:
3398     *
3399     * u_2 = u - u(xu - 1) = u + u(1 - xu)
3400     *     = fma(fma(-x, u, 1), u, u)
3401     */
3402    nir_def *one = nir_imm_float(b, 1.0);
3403    nir_def *u_2 = nir_ffma(b, nir_ffma(b, nir_fneg(b, x), u, one), u, u);
3404 
3405    /* If the original value was infinite, frcp will generate the correct zero.
3406     * However, the Newton-Raphson step would multiply 0 * Inf and get a NaN. So
3407     * skip the refinement step for infinite inputs. We do this backwards,
3408     * checking whether the refined result is NaN, since we can implement this
3409     * check in a single fcmpsel instruction. The other case where the refinement
3410     * is NaN is a NaN input, in which case skipping refinement is acceptable.
3411     */
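   /* E.g. (an illustrative trace) for x = +Inf: u = frcp(+Inf) = +0, and the
    * refinement computes fma(fma(-Inf, 0, 1), 0, 0) = NaN, so the bcsel below
    * selects u = +0, the correctly rounded reciprocal of +Inf.
    */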
3412    return nir_bcsel(b, nir_fisnan(b, u_2), u, u_2);
3413 }
3414 
3415 static bool
agx_nir_lower_fdiv(nir_builder * b,nir_alu_instr * alu,void * _)3416 agx_nir_lower_fdiv(nir_builder *b, nir_alu_instr *alu, void *_)
3417 {
3418    if (alu->op != nir_op_frcp || !alu->exact || alu->def.bit_size != 32)
3419       return false;
3420 
3421    b->cursor = nir_before_instr(&alu->instr);
3422    nir_def_replace(&alu->def, libagx_frcp(b, nir_ssa_for_alu_src(b, alu, 0)));
3423    return true;
3424 }
3425 
3426 /* Preprocess NIR independent of shader state */
3427 void
agx_preprocess_nir(nir_shader * nir,const nir_shader * libagx)3428 agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
3429 {
3430    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3431 
3432    /* Lower large arrays to scratch and small arrays to csel */
3433    NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
3434             glsl_get_natural_size_align_bytes,
3435             glsl_get_natural_size_align_bytes);
3436    NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
3437    NIR_PASS(_, nir, nir_split_var_copies);
3438    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
3439    NIR_PASS(_, nir, nir_lower_var_copies);
3440 
3441    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3442       NIR_PASS(_, nir, agx_nir_lower_frag_sidefx);
3443    }
3444 
3445    /* Clean up deref gunk after lowering I/O */
3446    NIR_PASS(_, nir, nir_opt_dce);
3447 
3448    agx_link_libagx(nir, libagx);
3449 
3450    /* This must run before idiv is lowered away to have any effect, but after
3451     * texture lowering, since cube map array lowering generates division by 6.
3452     */
3453    NIR_PASS(_, nir, nir_opt_idiv_const, 16);
3454 
3455    nir_lower_idiv_options idiv_options = {
3456       .allow_fp16 = true,
3457    };
3458 
3459    NIR_PASS(_, nir, nir_lower_idiv, &idiv_options);
3460    NIR_PASS(_, nir, nir_lower_frexp);
3461    NIR_PASS(_, nir, nir_lower_alu);
3462    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3463    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3464    NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false);
3465    NIR_PASS(_, nir, agx_lower_sincos);
3466    NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_lower_front_face,
3467             nir_metadata_control_flow, NULL);
3468    NIR_PASS(_, nir, agx_nir_lower_subgroups);
3469    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
3470    NIR_PASS(_, nir, nir_shader_alu_pass, agx_nir_lower_fdiv,
3471             nir_metadata_control_flow, NULL);
3472 
3473    /* After lowering, run through the standard suite of NIR optimizations. We
3474     * will run the loop again later, once we have the shader key, but if we
3475     * run now, the later run will ideally be almost a no-op.
3476     */
3477    agx_optimize_loop_nir(nir);
3478 
3479    NIR_PASS(_, nir, nir_opt_deref);
3480    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3481 
3482    /* We've lowered away all variables. Remove them all for smaller shaders. */
3483    NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
3484    nir->info.io_lowered = true;
3485 
3486    /* Move before lowering */
3487    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
3488                                nir_move_load_input | nir_move_comparisons |
3489                                nir_move_copies | nir_move_load_ssbo |
3490                                nir_move_alu;
3491 
3492    NIR_PASS(_, nir, nir_opt_sink, move_all);
3493    NIR_PASS(_, nir, nir_opt_move, move_all);
3494    NIR_PASS(_, nir, agx_nir_lower_shared_bitsize);
3495 }
3496 
3497 void
agx_compile_shader_nir(nir_shader * nir,struct agx_shader_key * key,struct util_debug_callback * debug,struct agx_shader_part * out)3498 agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
3499                        struct util_debug_callback *debug,
3500                        struct agx_shader_part *out)
3501 {
3502    agx_compiler_debug = agx_get_compiler_debug();
3503    struct agx_shader_info *info = &out->info;
3504 
3505    struct util_dynarray binary;
3506    util_dynarray_init(&binary, NULL);
3507 
3508    memset(out, 0, sizeof *out);
3509 
3510    assert(nir->info.io_lowered &&
3511           "agx_preprocess_nir is called first, then the shader is specialized, "
3512           "then the specialized shader is compiled");
3513 
3514    /* If required, tag writes will be enabled by instruction selection */
3515    if (nir->info.stage == MESA_SHADER_FRAGMENT)
3516       info->tag_write_disable = !nir->info.writes_memory;
3517 
3518    bool needs_libagx = true /* TODO: Optimize */;
3519 
3520    NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
3521    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3522 
3523    if (needs_libagx) {
3524       agx_link_libagx(nir, key->libagx);
3525 
3526       NIR_PASS(_, nir, nir_opt_deref);
3527       NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3528       NIR_PASS(_, nir, nir_lower_explicit_io,
3529                nir_var_shader_temp | nir_var_function_temp |
3530                   nir_var_mem_shared | nir_var_mem_global,
3531                nir_address_format_62bit_generic);
3532    }
3533 
3534    /* Late sysval lowering creates large loads. Load lowering creates unpacks */
3535    nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
3536       .modes = nir_var_mem_ssbo | nir_var_mem_constant |
3537                nir_var_mem_task_payload | nir_var_shader_temp |
3538                nir_var_function_temp | nir_var_mem_global | nir_var_mem_shared,
3539       .callback = mem_access_size_align_cb,
3540    };
3541    NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
3542 
3543    /* Cleanup 8-bit math before lowering */
3544    bool progress;
3545    do {
3546       progress = false;
3547 
3548       NIR_PASS(progress, nir, nir_opt_algebraic);
3549       NIR_PASS(progress, nir, nir_opt_constant_folding);
3550       NIR_PASS(progress, nir, nir_opt_dce);
3551    } while (progress);
3552 
3553    NIR_PASS(_, nir, nir_lower_bit_size, lower_bit_size_callback, NULL);
3554 
3555    /* Late blend lowering creates vectors */
3556    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3557    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3558 
3559    /* Late VBO lowering creates constant udiv instructions */
3560    NIR_PASS(_, nir, nir_opt_idiv_const, 16);
3561 
3562    NIR_PASS(_, nir, nir_opt_constant_folding);
3563    NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_from_texture_handle,
3564             nir_metadata_control_flow, NULL);
3565 
3566    info->push_count = key->reserved_preamble;
3567    agx_optimize_nir(nir, key->dev.soft_fault,
3568                     key->secondary ? NULL : &info->push_count);
3569 
3570    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3571       info->varyings.fs.nr_cf = key->fs.cf_base;
3572       assign_coefficient_regs(nir, &info->varyings.fs);
3573    }
3574 
3575    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3576       nir_print_shader(nir, stdout);
3577 
3578    info->local_size = nir->info.shared_size;
3579 
3580    nir_foreach_function_with_impl(func, impl, nir) {
3581       unsigned offset =
3582          agx_compile_function_nir(nir, impl, key, debug, &binary, &out->info);
3583 
3584       if (func->is_preamble) {
3585          info->preamble_offset = offset;
3586          info->has_preamble = true;
3587       } else if (func->is_entrypoint) {
3588          info->main_offset = offset;
3589          info->main_size = binary.size - offset;
3590       } else {
3591          unreachable("General functions not yet supported");
3592       }
3593    }
3594 
3595    info->stage = nir->info.stage;
3596 
3597    /* Check these outside the stage check since nir->info.stage is the hardware
3598     * stage and these are read in the vertex *software* stage.
3599     */
3600    info->uses_draw_id =
3601       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
3602 
3603    info->uses_base_param =
3604       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) ||
3605       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
3606 
3607    if (nir->info.stage == MESA_SHADER_VERTEX) {
3608       info->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT;
3609 
3610       info->writes_layer_viewport =
3611          nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
3612 
3613    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3614       info->disable_tri_merging = nir->info.uses_wide_subgroup_intrinsics ||
3615                                   nir->info.fs.needs_quad_helper_invocations ||
3616                                   nir->info.writes_memory;
3617 
3618       /* Writing the sample mask requires tag writes */
3619       info->tag_write_disable &= !info->writes_sample_mask;
3620 
3621       /* Report a canonical depth layout. This happens at the end because the
3622        * sample mask lowering affects it.
3623        */
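      /* For example, a shader that writes gl_FragDepth without declaring a
       * depth layout reports FRAG_DEPTH_LAYOUT_ANY, while a shader that never
       * writes depth reports FRAG_DEPTH_LAYOUT_UNCHANGED regardless of any
       * declared layout.
       */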
3624       enum gl_frag_depth_layout layout = nir->info.fs.depth_layout;
3625 
3626       if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
3627          info->depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
3628       else if (layout == FRAG_DEPTH_LAYOUT_NONE)
3629          info->depth_layout = FRAG_DEPTH_LAYOUT_ANY;
3630       else
3631          info->depth_layout = layout;
3632 
3633       info->reads_tib = nir->info.fs.uses_fbfetch_output;
3634       info->early_fragment_tests = nir->info.fs.early_fragment_tests;
3635    } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
3636       info->imageblock_stride = nir->info.cs.image_block_size_per_thread_agx;
3637    }
3638 
3639    out->binary = binary.data;
3640    out->binary_size = binary.size;
3641 }
3642