1 /*
2 * Copyright 2021 Alyssa Rosenzweig
3 * Copyright 2020 Collabora Ltd.
4 * Copyright 2016 Broadcom
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include "agx_compile.h"
9 #include "asahi/layout/layout.h"
10 #include "compiler/nir/nir_builder.h"
11 #include "util/bitset.h"
12 #include "util/glheader.h"
13 #include "util/list.h"
14 #include "util/macros.h"
15 #include "util/u_debug.h"
16 #include "util/u_dynarray.h"
17 #include "agx_builder.h"
18 #include "agx_compiler.h"
19 #include "agx_debug.h"
20 #include "agx_nir.h"
21 #include "glsl_types.h"
22 #include "nir.h"
23 #include "nir_builtin_builder.h"
24 #include "nir_intrinsics.h"
25 #include "nir_intrinsics_indices.h"
26 #include "shader_enums.h"
27
28 /* Alignment for shader programs. I'm not sure what the optimal value is. */
29 #define AGX_CODE_ALIGN 0x100
30
31 /* clang-format off */
32 static const struct debug_named_value agx_debug_options[] = {
33 {"shaders", AGX_DBG_SHADERS, "Dump shaders in NIR and AIR"},
34 {"shaderdb", AGX_DBG_SHADERDB, "Print statistics"},
35 {"verbose", AGX_DBG_VERBOSE, "Disassemble verbosely"},
36 {"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"},
37 {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"},
38 {"noopt", AGX_DBG_NOOPT, "Disable backend optimizations"},
39 {"wait", AGX_DBG_WAIT, "Wait after all async instructions"},
40 {"nopreamble",AGX_DBG_NOPREAMBLE,"Do not use shader preambles"},
41 {"demand", AGX_DBG_DEMAND, "Bound tightly to register demand"},
42 {"nosched", AGX_DBG_NOSCHED, "Do not schedule the shader"},
43 {"spill", AGX_DBG_SPILL, "Spill (almost) everything"},
44 {"nopromote", AGX_DBG_NOPROMOTE, "Do not promote constants to uniforms"},
45 DEBUG_NAMED_VALUE_END
46 };
47 /* clang-format on */
48
49 DEBUG_GET_ONCE_FLAGS_OPTION(agx_compiler_debug, "AGX_MESA_DEBUG",
50 agx_debug_options, 0)
51
52 int agx_compiler_debug = 0;
53
54 uint64_t
55 agx_get_compiler_debug(void)
56 {
57 return debug_get_option_agx_compiler_debug();
58 }
59
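/*
 * Preload a hardware register at the top of the start block, caching the
 * result so each register is preloaded at most once per shader. The callers
 * below use this for the registers holding the vertex ID and instance ID.
 */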
60 static agx_index
61 agx_cached_preload(agx_context *ctx, unsigned base, enum agx_size size)
62 {
63 if (agx_is_null(ctx->preloaded[base])) {
64 agx_block *block = agx_start_block(ctx);
65 agx_builder b = agx_init_builder(ctx, agx_before_block(block));
66 ctx->preloaded[base] = agx_preload(&b, agx_register(base, size));
67 }
68
69 return ctx->preloaded[base];
70 }
71
72 static agx_index
73 agx_vertex_id(agx_builder *b)
74 {
75 return agx_cached_preload(b->shader, 10, AGX_SIZE_32);
76 }
77
78 static agx_index
79 agx_instance_id(agx_builder *b)
80 {
81 return agx_cached_preload(b->shader, 12, AGX_SIZE_32);
82 }
83
84 #define VARYING_NUM_COMPONENTS (VARYING_SLOT_MAX * 4)
85
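/*
 * Per-component bitsets recording which varying components are read with each
 * interpolation mode, gathered by walking the shader's intrinsics below.
 */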
86 struct coefficient_info {
87 BITSET_DECLARE(smooth, VARYING_NUM_COMPONENTS);
88 BITSET_DECLARE(flat, VARYING_NUM_COMPONENTS);
89 BITSET_DECLARE(noperspective, VARYING_NUM_COMPONENTS);
90 };
91
92 static BITSET_WORD *
93 bitset_for_interp(struct coefficient_info *info, enum glsl_interp_mode mode)
94 {
95 /* clang-format off */
96 switch (mode) {
97 case INTERP_MODE_NONE:
98 case INTERP_MODE_SMOOTH: return info->smooth;
99 case INTERP_MODE_NOPERSPECTIVE: return info->noperspective;
100 case INTERP_MODE_FLAT: return info->flat;
101 default: unreachable("invalid interp mode");
102 }
103 /* clang-format on */
104 }
105
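/*
 * Intrinsic callback for nir_shader_intrinsics_pass: record which varying
 * components are read and with which interpolation mode, so coefficient
 * registers can be assigned up front in assign_coefficient_regs.
 */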
106 static bool
107 gather_cf(nir_builder *b, nir_intrinsic_instr *intr, void *data)
108 {
109 /* First handle frag coord loads */
110 struct coefficient_info *info = data;
111 if (intr->intrinsic == nir_intrinsic_load_frag_coord_zw) {
112 BITSET_SET(info->noperspective,
113 VARYING_SLOT_POS + nir_intrinsic_component(intr));
114 return false;
115 }
116
117 /* Look for input loads and grab the instruction with the interp mode */
118 nir_intrinsic_instr *bary;
119 unsigned nr = 1;
120
121 if (intr->intrinsic == nir_intrinsic_load_coefficients_agx) {
122 bary = intr;
123 /* Always load a scalar */
124 } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
125 bary = nir_src_as_intrinsic(intr->src[0]);
126 nr = intr->num_components;
127
128 /* Perspective interpolation internally reads W */
129 if (nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE)
130 BITSET_SET(info->noperspective, VARYING_SLOT_POS + 3);
131 } else {
132 return false;
133 }
134
135 BITSET_WORD *set = bitset_for_interp(data, nir_intrinsic_interp_mode(bary));
136 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
137 nir_src *offset = nir_get_io_offset_src(intr);
138
139 /* Mark the exact range for direct loads to minimize CF registers, but mark a
140 * conservative bounding range for indirect array access.
141 */
142 if (nir_src_is_const(*offset)) {
143 unsigned location = sem.location + nir_src_as_uint(*offset);
144 unsigned start_comp = (location * 4) + nir_intrinsic_component(intr);
145
146 BITSET_SET_RANGE(set, start_comp, start_comp + nr - 1);
147 } else {
148 unsigned start_comp = (sem.location * 4) + nir_intrinsic_component(intr);
149 bool compact = sem.location == VARYING_SLOT_CLIP_DIST0 ||
150 sem.location == VARYING_SLOT_CLIP_DIST1;
151 unsigned stride = compact ? 1 : 4;
152
153 /* For now we have to assign CF for the whole vec4 to make indirect
154 * indexing work. This could be optimized later.
155 */
156 nr = stride;
157
158 for (unsigned i = 0; i < sem.num_slots; ++i) {
159 BITSET_SET_RANGE(set, start_comp + (i * stride),
160 start_comp + (i * stride) + nr - 1);
161 }
162 }
163
164 return false;
165 }
166
167 /*
168 * We assign all coefficient registers up front to ensure we have a consistent
169 * layout required for indirects to work.
170 */
171 static void
172 assign_coefficient_regs(nir_shader *nir, struct agx_varyings_fs *var)
173 {
174 struct coefficient_info info = {0};
175 nir_shader_intrinsics_pass(nir, gather_cf, nir_metadata_all, &info);
176
177 /* W */
178 if (BITSET_TEST(info.noperspective, VARYING_SLOT_POS + 3)) {
179 var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
180 .cf_base = var->nr_cf++,
181 .slot = VARYING_SLOT_POS,
182 .offset = 3,
183 .count = 1,
184 .smooth = true,
185 };
186 }
187
188 /* Z */
189 if (BITSET_TEST(info.noperspective, VARYING_SLOT_POS + 2)) {
190 var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
191 .cf_base = var->nr_cf++,
192 .slot = VARYING_SLOT_POS,
193 .offset = 2,
194 .count = 1,
195 .smooth = true,
196 };
197
198 var->reads_z = true;
199 }
200
201 static_assert(VARYING_SLOT_POS == 0, "special and handled first");
202
203 for (unsigned i = VARYING_SLOT_POS + 1; i < VARYING_SLOT_MAX; ++i) {
204 bool smooth = BITSET_TEST_RANGE(info.smooth, i * 4, (i * 4) + 3);
205 bool flat = BITSET_TEST_RANGE(info.flat, i * 4, (i * 4) + 3);
206 bool noperspective =
207 BITSET_TEST_RANGE(info.noperspective, i * 4, (i * 4) + 3);
208
209 if (!(smooth || flat || noperspective))
210 continue;
211
212 /* From the GLSL 4.60 spec ("Input Layout Qualifiers"):
213 *
214 * when location aliasing, the aliases sharing the location must have
215 * the same underlying numerical type and bit width (floating-point or
216 * integer, 32-bit versus 64-bit, etc.) and the same auxiliary storage
217 * and interpolation qualification.
218 *
219 * SPIR-V should obey this as well although the spec text is muddier.
220 */
221 assert((smooth + flat + noperspective) == 1 &&
222 "slots must have consistent interpolation");
223
224 BITSET_WORD *set = smooth ? info.smooth
225 : flat ? info.flat
226 : info.noperspective;
227
228 /* Find the start offset */
229 unsigned offset = 0;
230 for (offset = 0; offset < 4 && !BITSET_TEST(set, (i * 4) + offset);
231 ++offset)
232 ;
233
234 /* Find the end offset. TODO: Do we ever need to split into two bindings
235 * to handle e.g. x_zw read masks?
236 */
237 unsigned count = 0;
238 for (unsigned c = offset; c < 4; ++c) {
239 if (BITSET_TEST(set, (i * 4) + c))
240 count = c - offset + 1;
241 }
242 assert(count >= 1 && (count + offset) <= 4);
243
244 var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
245 .cf_base = var->nr_cf,
246 .slot = i,
247 .offset = offset,
248 .count = count,
249 .smooth = !flat,
250 .perspective = smooth,
251 };
252
253 var->nr_cf += count;
254 }
255 }
256
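/*
 * Look up the coefficient register index for a given varying slot and
 * component offset. All bindings were preassigned in assign_coefficient_regs,
 * so a missing binding is a compiler bug.
 */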
257 static agx_index
258 agx_get_cf(agx_context *ctx, gl_varying_slot slot, unsigned offset)
259 {
260 struct agx_varyings_fs *varyings = &ctx->out->varyings.fs;
261
262 /* We already have an appropriate binding, find it */
263 for (unsigned b = 0; b < varyings->nr_bindings; ++b) {
264 if (varyings->bindings[b].slot == slot &&
265 (slot != VARYING_SLOT_POS ||
266 offset == varyings->bindings[b].offset)) {
267
268 signed cf_offset = offset - varyings->bindings[b].offset;
269 assert(cf_offset >= 0);
270
271 return agx_immediate(varyings->bindings[b].cf_base + cf_offset);
272 }
273 }
274
275 unreachable("all coefficient registers preassigned");
276 }
277
278 /* Builds a 64-bit hash table key for an index */
279 static uint64_t
280 agx_index_to_key(agx_index idx)
281 {
282 STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));
283
284 uint64_t key = 0;
285 memcpy(&key, &idx, sizeof(idx));
286 return key;
287 }
288
289 /*
290 * Extract a single channel out of a vector source. We split vectors with
291 * p_split so we can use the split components directly, without emitting a
292 * machine instruction. This has advantages for RA, as the split can usually be
293 * optimized away.
294 */
295 static agx_index
296 agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
297 {
298 agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
299 agx_index_to_key(vec));
300
301 assert(components != NULL && "missing agx_emit_collect_to");
302
303 return components[channel];
304 }
305
306 static agx_index
307 agx_extract_nir_src(agx_builder *b, nir_src src, unsigned channel)
308 {
309 agx_index idx = agx_src_index(&src);
310
311 /* We only deal with scalars, extract a single scalar if needed */
312 if (nir_src_num_components(src) > 1)
313 return agx_emit_extract(b, idx, channel);
314 else
315 return idx;
316 }
317
318 static void
319 agx_cache_collect(agx_builder *b, agx_index dst, unsigned nr_srcs,
320 agx_index *srcs)
321 {
322 /* Lifetime of a hash table entry has to be at least as long as the table */
323 agx_index *channels = ralloc_array(b->shader, agx_index, nr_srcs);
324
325 for (unsigned i = 0; i < nr_srcs; ++i)
326 channels[i] = srcs[i];
327
328 _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
329 channels);
330 }
331
332 /*
333 * Combine multiple scalars into a vector destination. This corresponds to
334 * collect, lowered to moves (a shuffle in general) after register allocation.
335 *
336 * To optimize vector extractions, we record the individual channels
337 */
338 static agx_instr *
339 agx_emit_collect_to(agx_builder *b, agx_index dst, unsigned nr_srcs,
340 agx_index *srcs)
341 {
342 agx_cache_collect(b, dst, nr_srcs, srcs);
343
344 if (nr_srcs == 1)
345 return agx_mov_to(b, dst, srcs[0]);
346
347 agx_instr *I = agx_collect_to(b, dst, nr_srcs);
348
349 agx_foreach_src(I, s)
350 I->src[s] = srcs[s];
351
352 return I;
353 }
354
355 static agx_index
356 agx_emit_collect(agx_builder *b, unsigned nr_srcs, agx_index *srcs)
357 {
358 agx_index dst = agx_vec_temp(b->shader, srcs[0].size, nr_srcs);
359 agx_emit_collect_to(b, dst, nr_srcs, srcs);
360 return dst;
361 }
362
363 static agx_index
364 agx_vec2(agx_builder *b, agx_index s0, agx_index s1)
365 {
366 return agx_emit_collect(b, 2, (agx_index[]){s0, s1});
367 }
368
369 static agx_index
370 agx_recollect_vector(agx_builder *b, nir_src vec)
371 {
372 agx_index comps[4];
373 unsigned nr = nir_src_num_components(vec);
374
375 for (unsigned i = 0; i < nr; ++i)
376 comps[i] = agx_extract_nir_src(b, vec, i);
377
378 return agx_emit_collect(b, nr, comps);
379 }
380
381 /*
382 * Extract the lower or upper N-bits from a (2*N)-bit quantity. We use a split
383 * without null destinations to let us CSE (and coalesce) the splits when both x
384 * and y are split.
385 */
386 static agx_instr *
387 agx_subdivide_to(agx_builder *b, agx_index dst, agx_index s0, unsigned comp)
388 {
389 assert((s0.size == (dst.size + 1)) && "only 2x subdivide handled");
390 assert((comp == 0 || comp == 1) && "too many components");
391
392 /* Handle immediates specially so we don't have to constant fold splits. */
393 if (s0.type == AGX_INDEX_IMMEDIATE) {
394 unsigned bits = 16 * agx_size_align_16(dst.size);
395 return agx_mov_imm_to(b, dst, (s0.value >> bits) & BITFIELD64_MASK(bits));
396 }
397
398 agx_instr *split = agx_split(b, 2, s0);
399 split->dest[comp] = dst;
400 split->dest[1 - comp] = agx_temp(b->shader, dst.size);
401 return split;
402 }
403
404 void
405 agx_block_add_successor(agx_block *block, agx_block *successor)
406 {
407 assert(block != NULL && successor != NULL);
408
409 /* Cull impossible edges */
410 if (block->unconditional_jumps)
411 return;
412
413 for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
414 if (block->successors[i]) {
415 if (block->successors[i] == successor)
416 return;
417 else
418 continue;
419 }
420
421 block->successors[i] = successor;
422 util_dynarray_append(&successor->predecessors, agx_block *, block);
423 return;
424 }
425
426 unreachable("Too many successors");
427 }
428
429 /*
430 * Splits an n-component vector (vec) into n scalar destinations (dests) using a
431 * split pseudo-instruction.
432 *
433 * Pre-condition: dests is filled with agx_null().
434 */
435 static void
436 agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n)
437 {
438 agx_instr *I = agx_split(b, n, vec);
439
440 agx_foreach_dest(I, d) {
441 dests[d] = agx_temp(b->shader, vec.size);
442 I->dest[d] = dests[d];
443 }
444 }
445
446 static void
447 agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
448 {
449 agx_index dests[4] = {agx_null(), agx_null(), agx_null(), agx_null()};
450 agx_emit_split(b, dests, vec, n);
451 agx_cache_collect(b, vec, n, dests);
452 }
453
454 static void
455 agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
456 {
457 /* Ensure we've been scalarized and bit size lowered */
458 unsigned bit_size = instr->def.bit_size;
459 assert(instr->def.num_components == 1);
460
461 /* Emit move, later passes can inline/push if useful */
462 agx_mov_imm_to(b, agx_def_index(&instr->def),
463 nir_const_value_as_uint(instr->value[0], bit_size));
464 }
465
466 /*
467 * Implement mul_high of 32-bit sources by doing a 32x32->64-bit multiply and
468 * extracting only the high word.
469 */
470 static agx_instr *
471 agx_mul_high_to(agx_builder *b, agx_index dst, agx_index P, agx_index Q,
472 bool is_signed)
473 {
474 assert(P.size == Q.size && "source sizes must match");
475 assert(P.size == dst.size && "dest size must match");
476 assert(P.size != AGX_SIZE_64 && "64x64 multiply should have been lowered");
477
478 static_assert(AGX_SIZE_64 == (AGX_SIZE_32 + 1), "enum wrong");
479 static_assert(AGX_SIZE_32 == (AGX_SIZE_16 + 1), "enum wrong");
480
481 if (!is_signed) {
482 P = agx_abs(P);
483 Q = agx_abs(Q);
484 }
485
486 agx_index product = agx_temp(b->shader, P.size + 1);
487 agx_imad_to(b, product, P, Q, agx_zero(), 0);
488
489 return agx_subdivide_to(b, dst, product, 1);
490 }
491
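/*
 * Translate a memory format to the hardware enum. The NIR intrinsics smuggle
 * AIL_ISA_FORMAT_* values through the pipe_format field, so this is a 1:1
 * mapping onto the corresponding AGX_FORMAT_* enumerants.
 */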
492 static enum agx_format
493 agx_format_for_pipe(enum pipe_format format)
494 {
495 #define CASE(x) \
496 if (format == (enum pipe_format)AIL_ISA_FORMAT_##x) \
497 return AGX_FORMAT_##x;
498
499 CASE(I8);
500 CASE(I16);
501 CASE(I32);
502 CASE(F16);
503 CASE(U8NORM);
504 CASE(S8NORM);
505 CASE(U16NORM);
506 CASE(S16NORM);
507 CASE(RGB10A2);
508 CASE(SRGBA8);
509 CASE(RG11B10F);
510 CASE(RGB9E5);
511
512 #undef CASE
513 unreachable("Invalid format");
514 }
515
516 static agx_index
517 cf_for_intrinsic(agx_builder *b, nir_intrinsic_instr *intr)
518 {
519 /* Determine the base location, taking into account a constant offset */
520 unsigned location = nir_intrinsic_io_semantics(intr).location;
521 bool compact = location == VARYING_SLOT_CLIP_DIST0 ||
522 location == VARYING_SLOT_CLIP_DIST1;
523
524 nir_src *offset = nir_get_io_offset_src(intr);
525 if (nir_src_is_const(*offset)) {
526 /* XXX: NIR is broken and uses constant offsets in slots but dynamic
527 * offsets in scalars for compact varyings. This needs to be fixed
528 * upstream.
529 */
530 location += nir_src_as_uint(*offset);
531 }
532
533 agx_index I = agx_get_cf(b->shader, location, nir_intrinsic_component(intr));
534
535 /* If we have a non-constant offset, we add it to the CF. Offsets are in
536 * vec4 slots (unless we're compact) but the CF is in components, so we need
537 * to shift the offset by 2 before adding.
538 */
539 if (!nir_src_is_const(*offset)) {
540 I = agx_iadd(b, I, agx_src_index(offset), compact ? 0 : 2);
541 }
542
543 return I;
544 }
545
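/* Map a NIR barycentric intrinsic to the hardware interpolation mode. For
 * interpolation at a sample, the sample index source is also returned.
 */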
546 static enum agx_interpolation
547 agx_interp_for_bary(nir_intrinsic_instr *bary, agx_index *sample_index)
548 {
549 switch (bary->intrinsic) {
550 case nir_intrinsic_load_barycentric_pixel:
551 return AGX_INTERPOLATION_CENTER;
552
553 case nir_intrinsic_load_barycentric_centroid:
554 return AGX_INTERPOLATION_CENTROID;
555
556 case nir_intrinsic_load_barycentric_at_sample:
557 *sample_index = agx_src_index(&bary->src[0]);
558 return AGX_INTERPOLATION_SAMPLE;
559
560 default:
561 unreachable("should have been lowered");
562 }
563 }
564
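/*
 * Lower load_interpolated_input. Perspective-correct varyings are interpolated
 * with iterproj, which also multiplies by 1/W using the W coefficient
 * register; noperspective varyings use a plain iter.
 */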
565 static void
566 agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
567 {
568 ASSERTED unsigned components = instr->num_components;
569 nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
570
571 assert(components >= 1 && components <= 4);
572
573 agx_index sample_index = agx_zero();
574 enum agx_interpolation interp = agx_interp_for_bary(bary, &sample_index);
575
576 bool perspective =
577 nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE;
578
579 agx_index I = cf_for_intrinsic(b, instr);
580
581 /* For perspective interpolation, we project (multiply by 1/W) */
582 if (perspective) {
583 agx_index J = agx_get_cf(b->shader, VARYING_SLOT_POS, 3);
584 agx_iterproj_to(b, dest, I, J, sample_index, components, interp);
585 } else {
586 agx_iter_to(b, dest, I, sample_index, components, interp);
587 }
588
589 agx_emit_cached_split(b, dest, components);
590 }
591
592 static agx_instr *
593 agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
594 {
595 bool explicit = nir_intrinsic_explicit_coord(instr);
596
597 /* TODO: Reverse-engineer interactions with MRT */
598 if (b->shader->stage == MESA_SHADER_FRAGMENT) {
599 if (b->shader->key->fs.ignore_tib_dependencies) {
600 assert(b->shader->nir->info.internal && "only for clear shaders");
601 } else if (b->shader->did_writeout) {
602 agx_wait_pix(b, 0x0004);
603 } else {
604 agx_wait_pix(b, 0x000C);
605 }
606 }
607
608 /* Compact the registers according to the mask */
609 agx_index compacted[4] = {agx_null()};
610
611 unsigned compact_count = 0;
612 u_foreach_bit(i, nir_intrinsic_write_mask(instr)) {
613 compacted[compact_count++] = agx_extract_nir_src(b, instr->src[0], i);
614 }
615
616 agx_index collected = agx_emit_collect(b, compact_count, compacted);
617 agx_index coords = explicit ? agx_src_index(&instr->src[2]) : agx_null();
618
619 b->shader->did_writeout = true;
620 b->shader->out->tag_write_disable = false;
621 return agx_st_tile(b, collected, agx_src_index(&instr->src[1]), coords,
622 agx_format_for_pipe(nir_intrinsic_format(instr)),
623 nir_intrinsic_write_mask(instr),
624 nir_intrinsic_base(instr), explicit);
625 }
626
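/*
 * Emit a depth/stencil write. Bit 0 of the intrinsic base selects a depth
 * write (32-bit) and bit 1 a stencil write (16-bit); when both are written,
 * the stencil value is widened to 32 bits and packed alongside the depth.
 */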
627 static agx_instr *
628 agx_emit_store_zs(agx_builder *b, nir_intrinsic_instr *instr)
629 {
630 unsigned base = nir_intrinsic_base(instr);
631 bool write_z = base & 1;
632 bool write_s = base & 2;
633
634 /* TODO: Handle better */
635 assert(!b->shader->key->fs.ignore_tib_dependencies && "not used");
636 agx_wait_pix(b, 0x0001);
637
638 agx_index z = agx_src_index(&instr->src[1]);
639 agx_index s = agx_src_index(&instr->src[2]);
640
641 assert(!write_z || z.size == AGX_SIZE_32);
642 assert(!write_s || s.size == AGX_SIZE_16);
643
644 if (write_z && write_s) {
645 agx_index u2u32 = agx_temp(b->shader, AGX_SIZE_32);
646 agx_mov_to(b, u2u32, s);
647 s = u2u32;
648 }
649
650 agx_index zs = (write_z && write_s) ? agx_vec2(b, z, s) : write_z ? z : s;
651
652 /* Not necessarily a sample mask but overlapping hw mechanism... Should
653 * maybe rename this flag to something more general.
654 */
655 b->shader->out->writes_sample_mask = true;
656
657 return agx_zs_emit(b, agx_src_index(&instr->src[0]), zs, base);
658 }
659
660 static void
661 agx_emit_local_load_pixel(agx_builder *b, agx_index dest,
662 nir_intrinsic_instr *instr)
663 {
664 /* TODO: Reverse-engineer interactions with MRT */
665 assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage");
666 agx_wait_pix(b, 0x0008);
667 b->shader->did_writeout = true;
668
669 unsigned nr_comps = instr->def.num_components;
670 agx_ld_tile_to(b, dest, agx_src_index(&instr->src[0]), agx_null(),
671 agx_format_for_pipe(nir_intrinsic_format(instr)),
672 BITFIELD_MASK(nr_comps), nir_intrinsic_base(instr), false);
673 agx_emit_cached_split(b, dest, nr_comps);
674 }
675
676 static void
677 agx_emit_load(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
678 {
679 agx_index addr = agx_src_index(&instr->src[0]);
680 agx_index offset = agx_src_index(&instr->src[1]);
681 enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
682 unsigned shift = nir_intrinsic_base(instr);
683
684 /* Zero-extend offset if we're not sign-extending */
685 if (!nir_intrinsic_sign_extend(instr))
686 offset = agx_abs(offset);
687
688 agx_device_load_to(b, dest, addr, offset, fmt,
689 BITFIELD_MASK(instr->def.num_components), shift);
690 agx_emit_cached_split(b, dest, instr->def.num_components);
691 }
692
693 static void
694 agx_emit_store(agx_builder *b, nir_intrinsic_instr *instr)
695 {
696 agx_index addr = agx_src_index(&instr->src[1]);
697 agx_index offset = agx_src_index(&instr->src[2]);
698 enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
699 unsigned shift = nir_intrinsic_base(instr);
700
701 /* Zero-extend offset if we're not sign-extending */
702 if (!nir_intrinsic_sign_extend(instr))
703 offset = agx_abs(offset);
704
705 agx_device_store(b, agx_recollect_vector(b, instr->src[0]), addr, offset,
706 fmt, BITFIELD_MASK(nir_src_num_components(instr->src[0])),
707 shift);
708 }
709
710 /* Preambles write directly to uniform registers, so move from uniform to GPR */
711 static agx_instr *
712 agx_emit_load_preamble(agx_builder *b, agx_index dst,
713 nir_intrinsic_instr *instr)
714 {
715 agx_index srcs[4] = {agx_null()};
716 unsigned dim = instr->def.num_components;
717 assert(dim <= ARRAY_SIZE(srcs) && "shouldn't see larger vectors");
718
719 unsigned base = nir_intrinsic_base(instr);
720 unsigned stride = agx_size_align_16(dst.size);
721
722 for (unsigned i = 0; i < dim; ++i)
723 srcs[i] = agx_uniform(base + i * stride, dst.size);
724
725 return agx_emit_collect_to(b, dst, dim, srcs);
726 }
727
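/*
 * Store to uniform registers from a preamble shader. The loop below writes in
 * chunks of up to four 16-bit units (64 bits) per uniform_store, where stride
 * is the component size in 16-bit units.
 */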
728 static agx_instr *
729 agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
730 {
731 agx_index vec = agx_src_index(&instr->src[0]);
732 unsigned base = nir_intrinsic_base(instr);
733 unsigned stride = agx_size_align_16(vec.size);
734 unsigned nr = nir_src_num_components(instr->src[0]);
735
736 for (unsigned i = 0; i < nr; i += (4 / stride)) {
737 agx_index data[4] = {0};
738 unsigned count = MIN2(4 / stride, nr - i);
739
740 for (unsigned c = 0; c < count; ++c) {
741 data[c] = agx_extract_nir_src(b, instr->src[0], i + c);
742 }
743
744 agx_uniform_store(b, agx_emit_collect(b, count, data),
745 agx_immediate(base + i * stride), BITFIELD_MASK(count));
746 }
747
748 return NULL;
749 }
750
751 static enum agx_dim
752 agx_tex_dim(enum glsl_sampler_dim dim, bool array)
753 {
754 switch (dim) {
755 case GLSL_SAMPLER_DIM_1D:
756 return array ? AGX_DIM_1D_ARRAY : AGX_DIM_1D;
757
758 case GLSL_SAMPLER_DIM_2D:
759 case GLSL_SAMPLER_DIM_RECT:
760 case GLSL_SAMPLER_DIM_EXTERNAL:
761 return array ? AGX_DIM_2D_ARRAY : AGX_DIM_2D;
762
763 case GLSL_SAMPLER_DIM_MS:
764 return array ? AGX_DIM_2D_MS_ARRAY : AGX_DIM_2D_MS;
765
766 case GLSL_SAMPLER_DIM_3D:
767 assert(!array && "3D arrays unsupported");
768 return AGX_DIM_3D;
769
770 case GLSL_SAMPLER_DIM_CUBE:
771 return array ? AGX_DIM_CUBE_ARRAY : AGX_DIM_CUBE;
772
773 case GLSL_SAMPLER_DIM_BUF:
774 unreachable("Buffer textures should have been lowered");
775
776 default:
777 unreachable("Invalid sampler dim\n");
778 }
779 }
780
781 /*
782 * In the hardware, bindless texture sources are specified as a 64-bit uniform
783 * base address summed with a 32-bit register index. In NIR, we model this as a
784 * vec2, where the first source is the (constant) uniform register number and
785 * the second source is the (dynamic) byte offset.
786 */
787 static agx_index
788 agx_translate_bindless_handle(agx_builder *b, nir_src *handle, agx_index *base)
789 {
790 nir_scalar base_scalar = nir_scalar_resolved(handle->ssa, 0);
791 assert(nir_scalar_is_const(base_scalar) && "base must be constant");
792
793 unsigned base_uint = nir_scalar_as_uint(base_scalar);
794 *base = agx_uniform(base_uint, AGX_SIZE_64);
795
796 return agx_emit_extract(b, agx_src_index(handle), 1);
797 }
798
799 static agx_instr *
800 agx_emit_block_image_store(agx_builder *b, nir_intrinsic_instr *instr)
801 {
802 agx_index offset = agx_src_index(&instr->src[1]);
803 agx_index coords = agx_src_index(&instr->src[2]);
804 enum agx_format format = agx_format_for_pipe(nir_intrinsic_format(instr));
805
806 bool ms = nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_MS;
807 bool array = nir_intrinsic_image_array(instr);
808 enum agx_dim dim = agx_tex_dim(nir_intrinsic_image_dim(instr), array);
809 bool explicit = nir_intrinsic_explicit_coord(instr);
810
811 /* 32-bit source physically, 16-bit in NIR, top half ignored but needed
812 * logically to ensure alignment.
813 */
814 offset = agx_vec2(b, offset, agx_undef(AGX_SIZE_16));
815 offset.channels_m1--;
816 offset.size = AGX_SIZE_32;
817
818 /* Modified coordinate descriptor */
819 if (!explicit) {
820 if (array) {
821 agx_index layer = coords;
822 coords = agx_temp(b->shader, AGX_SIZE_32);
823 agx_emit_collect_to(b, coords, 2,
824 (agx_index[]){
825 ms ? agx_mov_imm(b, 16, 0) : layer,
826 ms ? layer : agx_undef(AGX_SIZE_16),
827 });
828 } else {
829 coords = agx_null();
830 }
831 }
832
833 agx_index base, index;
834 if (instr->intrinsic == nir_intrinsic_bindless_image_store_block_agx) {
835 index = agx_translate_bindless_handle(b, &instr->src[0], &base);
836
837 assert(base.size == AGX_SIZE_64);
838 assert(index.size == AGX_SIZE_32);
839 } else {
840 base = agx_zero();
841 index = agx_src_index(&instr->src[0]);
842
843 assert(index.size == AGX_SIZE_16);
844 }
845
846 // XXX: how does this possibly work
847 if (format == AGX_FORMAT_F16)
848 format = AGX_FORMAT_I16;
849
850 return agx_block_image_store(b, base, index, offset, coords, format, dim,
851 explicit);
852 }
853
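/*
 * Load a compute dimension (workgroup ID, workgroup size, etc.) by collecting
 * consecutive special registers starting at the given base, one per component
 * of the NIR destination.
 */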
854 static agx_instr *
855 agx_load_compute_dimension(agx_builder *b, agx_index dst,
856 nir_intrinsic_instr *instr, enum agx_sr base)
857 {
858 unsigned dim = instr->def.num_components;
859 unsigned size = instr->def.bit_size;
860 assert(size == 16 || size == 32);
861
862 agx_index srcs[] = {
863 agx_get_sr(b, size, base + 0),
864 agx_get_sr(b, size, base + 1),
865 agx_get_sr(b, size, base + 2),
866 };
867
868 return agx_emit_collect_to(b, dst, dim, srcs);
869 }
870
871 static enum agx_atomic_opc
872 translate_atomic_opcode(nir_atomic_op op)
873 {
874 /* clang-format off */
875 switch (op) {
876 case nir_atomic_op_iadd: return AGX_ATOMIC_OPC_ADD;
877 case nir_atomic_op_imin: return AGX_ATOMIC_OPC_IMIN;
878 case nir_atomic_op_umin: return AGX_ATOMIC_OPC_UMIN;
879 case nir_atomic_op_imax: return AGX_ATOMIC_OPC_IMAX;
880 case nir_atomic_op_umax: return AGX_ATOMIC_OPC_UMAX;
881 case nir_atomic_op_iand: return AGX_ATOMIC_OPC_AND;
882 case nir_atomic_op_ior: return AGX_ATOMIC_OPC_OR;
883 case nir_atomic_op_ixor: return AGX_ATOMIC_OPC_XOR;
884 case nir_atomic_op_xchg: return AGX_ATOMIC_OPC_XCHG;
885 case nir_atomic_op_cmpxchg: return AGX_ATOMIC_OPC_CMPXCHG;
886 default: unreachable("unknown atomic opcode");
887 }
888 /* clang-format on */
889 }
890
891 /*
892 * The "base" of a local load/store/atomic can be zero but no other immediates.
893 * This would be a little silly to handle when inlining immediates, so we
894 * instead exclude these ops from immediate inlining and just handle 0 specially
895 * when translating.
896 */
897 static agx_index
898 agx_local_base(nir_src src)
899 {
900 if (nir_src_is_const(src) && nir_src_as_uint(src) == 0)
901 return agx_zero();
902 else
903 return agx_src_index(&src);
904 }
905
906 static void
907 agx_emit_atomic(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr,
908 bool local)
909 {
910 enum agx_atomic_opc op =
911 translate_atomic_opcode(nir_intrinsic_atomic_op(instr));
912 agx_index base =
913 local ? agx_local_base(instr->src[0]) : agx_src_index(&instr->src[0]);
914 agx_index value = agx_src_index(&instr->src[local ? 1 : 2]);
915 agx_index index = local ? agx_zero() : agx_src_index(&instr->src[1]);
916
917 /* cmpxchg (only) takes 2 sources, passed in consecutive registers */
918 if (op == AGX_ATOMIC_OPC_CMPXCHG) {
919 agx_index value2 = agx_src_index(&instr->src[local ? 2 : 3]);
920 value = agx_vec2(b, value2, value);
921 }
922
923 if (local) {
924 assert(base.size == AGX_SIZE_16);
925 agx_local_atomic_to(b, dst, value, base, index, op);
926 } else {
927 assert(base.size == AGX_SIZE_64);
928 agx_atomic_to(b, dst, value, base, index, op);
929 }
930 }
931
932 static enum agx_format
933 format_for_bitsize(unsigned bitsize)
934 {
935 switch (bitsize) {
936 case 8:
937 return AGX_FORMAT_I8;
938 case 16:
939 return AGX_FORMAT_I16;
940 case 32:
941 return AGX_FORMAT_I32;
942 default:
943 unreachable("should've been lowered");
944 }
945 }
946
947 static void
948 agx_emit_local_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
949 {
950 agx_index base = agx_local_base(instr->src[0]);
951 agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
952 assert(base.size == AGX_SIZE_16);
953
954 enum agx_format format = format_for_bitsize(instr->def.bit_size);
955 unsigned nr = instr->def.num_components;
956 unsigned mask = BITFIELD_MASK(nr);
957
958 agx_local_load_to(b, dst, base, index, format, mask);
959 agx_emit_cached_split(b, dst, nr);
960 }
961
962 static void
963 agx_emit_local_store(agx_builder *b, nir_intrinsic_instr *instr)
964 {
965 agx_index value = agx_src_index(&instr->src[0]);
966 agx_index base = agx_local_base(instr->src[1]);
967 agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
968 assert(base.size == AGX_SIZE_16);
969
970 enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
971 unsigned mask = BITFIELD_MASK(
972 nir_src_num_components(instr->src[0])); /* XXX: there's a write mask */
973
974 agx_local_store(b, value, base, index, format, mask);
975 }
976
977 static void
978 agx_emit_load_scratch(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
979 {
980 agx_index offset = agx_src_index(&instr->src[0]);
981 enum agx_format format = format_for_bitsize(instr->def.bit_size);
982 unsigned nr = instr->def.num_components;
983 unsigned mask = BITFIELD_MASK(nr);
984
985 agx_stack_load_to(b, dst, offset, format, mask);
986 agx_emit_cached_split(b, dst, nr);
987 b->shader->any_scratch = true;
988 }
989
990 static void
991 agx_emit_store_scratch(agx_builder *b, nir_intrinsic_instr *instr)
992 {
993 agx_index value = agx_recollect_vector(b, instr->src[0]);
994 agx_index offset = agx_src_index(&instr->src[1]);
995 enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
996 unsigned mask = BITFIELD_MASK(nir_src_num_components(instr->src[0]));
997
998 agx_stack_store(b, value, offset, format, mask);
999 b->shader->any_scratch = true;
1000 }
1001
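/*
 * Expand a sparse texture/image result to the layout NIR expects. The hardware
 * writes only the enabled components, packed contiguously, so the unread
 * channels are filled with undefs here. Returns the component mask to program
 * into the instruction.
 */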
1002 static unsigned
1003 agx_expand_tex_to(agx_builder *b, nir_def *def, agx_index src, bool masked)
1004 {
1005 unsigned nr_channels = def->num_components;
1006 nir_component_mask_t mask = nir_def_components_read(def);
1007
1008 if (!masked)
1009 mask = (nir_component_mask_t)BITFIELD_MASK(nr_channels);
1010
1011 agx_index packed_channels[4] = {agx_null()};
1012 agx_index unpacked_channels[4] = {agx_null()};
1013
1014 /* Hardware writes the masked components contiguously, expand out for NIR */
1015 agx_emit_split(b, packed_channels, src, 4 /* XXX: why not nr_channels */);
1016
1017 for (unsigned i = 0; i < nr_channels; ++i) {
1018 unpacked_channels[i] =
1019 (mask & BITFIELD_BIT(i))
1020 ? packed_channels[util_bitcount(mask & BITFIELD_MASK(i))]
1021 : agx_undef(src.size);
1022 }
1023
1024 agx_emit_collect_to(b, agx_def_index(def), nr_channels, unpacked_channels);
1025 return mask;
1026 }
1027
1028 static agx_instr *
1029 agx_emit_image_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *intr)
1030 {
1031 agx_index ms_index = agx_src_index(&intr->src[2]);
1032 agx_index lod = agx_src_index(&intr->src[3]);
1033 enum agx_lod_mode lod_mode = AGX_LOD_MODE_LOD_MIN;
1034
1035 agx_index bindless = agx_immediate(0), texture;
1036 if (intr->intrinsic == nir_intrinsic_bindless_image_load)
1037 texture = agx_translate_bindless_handle(b, &intr->src[0], &bindless);
1038 else if (nir_src_is_const(intr->src[0]) &&
1039 nir_src_as_uint(intr->src[0]) < 0x100)
1040 texture = agx_immediate(nir_src_as_uint(intr->src[0]));
1041 else
1042 texture = agx_src_index(&intr->src[0]);
1043
1044 assert(nir_src_num_components(intr->src[1]) == 4);
1045 agx_index coord[4] = {
1046 agx_extract_nir_src(b, intr->src[1], 0),
1047 agx_extract_nir_src(b, intr->src[1], 1),
1048 agx_extract_nir_src(b, intr->src[1], 2),
1049 agx_extract_nir_src(b, intr->src[1], 3),
1050 };
1051
1052 /* Get the image dimension. Cubes are lowered to 2D, since they are logically
1053 * equivalent for imageLoad, but out-of-bounds behaviour for cubes on G13
1054 * is wrong according to Piglit's arb_shader_image_load_store-invalid.
1055 *
1056 * This requires a matching transform in the driver.
1057 */
1058 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1059 bool is_array = nir_intrinsic_image_array(intr);
1060
1061 if (dim == GLSL_SAMPLER_DIM_CUBE) {
1062 dim = GLSL_SAMPLER_DIM_2D;
1063 is_array = true;
1064 }
1065
1066 bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
1067 unsigned coord_comps = glsl_get_sampler_dim_coordinate_components(dim);
1068 if (is_array && is_ms) {
1069 agx_index layer = agx_temp(b->shader, AGX_SIZE_16);
1070 agx_subdivide_to(b, layer, coord[coord_comps], 0);
1071
1072 assert(ms_index.size == AGX_SIZE_16);
1073 agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
1074 agx_emit_collect_to(b, tmp, 2, (agx_index[]){ms_index, layer});
1075 coord[coord_comps++] = tmp;
1076 } else if (is_ms) {
1077 agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
1078 agx_mov_to(b, tmp, ms_index);
1079 coord[coord_comps++] = tmp;
1080 } else if (is_array) {
1081 coord_comps++;
1082 }
1083
1084 /* Multisampled images do not support mipmapping */
1085 if (is_ms) {
1086 lod_mode = AGX_LOD_MODE_AUTO_LOD;
1087 lod = agx_zero();
1088 }
1089
1090 agx_index coords = agx_emit_collect(b, coord_comps, coord);
1091 agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
1092
1093 agx_instr *I = agx_image_load_to(
1094 b, tmp, coords, lod, bindless, texture, agx_immediate(0), agx_null(),
1095 agx_tex_dim(dim, is_array), lod_mode, 0, false);
1096 I->mask = agx_expand_tex_to(b, &intr->def, tmp, true);
1097
1098 b->shader->out->uses_txf = true;
1099 return NULL;
1100 }
1101
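/*
 * Emit exports for a vector source. Exports are placed at the end of the exit
 * block so they execute after all other work; the export register is advanced
 * by the component size in 16-bit units for each channel.
 */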
1102 static agx_instr *
1103 agx_emit_export(agx_builder *b, unsigned base, nir_src src)
1104 {
1105 agx_builder b_ = *b;
1106 agx_cursor after_cursor = agx_after_block(agx_exit_block(b->shader));
1107 b_.cursor = after_cursor;
1108
1109 for (unsigned c = 0; c < nir_src_num_components(src); ++c) {
1110 agx_index chan = agx_extract_nir_src(b, src, c);
1111 unsigned stride = agx_size_align_16(chan.size);
1112
1113 agx_export(&b_, chan, base + (c * stride));
1114 }
1115
1116 if (agx_cursors_equal(b->cursor, after_cursor)) {
1117 b->cursor = agx_after_block_logical(b->cursor.block);
1118 }
1119
1120 return NULL;
1121 }
1122
1123 static agx_instr *
1124 agx_load_exported_to(agx_builder *b, agx_index dst, unsigned base, unsigned nr)
1125 {
1126 agx_index chans[4] = {0};
1127 unsigned stride = agx_size_align_16(dst.size);
1128
1129 for (unsigned c = 0; c < nr; ++c) {
1130 chans[c] = agx_cached_preload(b->shader, base + c * stride, dst.size);
1131 }
1132
1133 return agx_emit_collect_to(b, dst, nr, chans);
1134 }
1135
1136 static agx_instr *
1137 agx_emit_image_store(agx_builder *b, nir_intrinsic_instr *instr)
1138 {
1139 /* See remarks in agx_emit_image_load */
1140 enum glsl_sampler_dim glsl_dim = nir_intrinsic_image_dim(instr);
1141 bool is_array = nir_intrinsic_image_array(instr);
1142
1143 if (glsl_dim == GLSL_SAMPLER_DIM_CUBE) {
1144 glsl_dim = GLSL_SAMPLER_DIM_2D;
1145 is_array = true;
1146 }
1147
1148 enum agx_dim dim = agx_tex_dim(glsl_dim, is_array);
1149 assert(glsl_dim != GLSL_SAMPLER_DIM_MS && "needs to be lowered");
1150
1151 agx_index base, index;
1152 if (instr->intrinsic == nir_intrinsic_bindless_image_store) {
1153 index = agx_translate_bindless_handle(b, &instr->src[0], &base);
1154
1155 assert(base.size == AGX_SIZE_64);
1156 assert(index.size == AGX_SIZE_32);
1157 } else {
1158 base = agx_zero();
1159 index = agx_src_index(&instr->src[0]);
1160
1161 assert(index.size == AGX_SIZE_16);
1162 }
1163
1164 agx_index coords4 = agx_src_index(&instr->src[1]);
1165 agx_index lod = agx_src_index(&instr->src[4]);
1166 assert(lod.size == AGX_SIZE_16);
1167
1168 int coord_components = glsl_get_sampler_dim_coordinate_components(glsl_dim);
1169 if (is_array)
1170 coord_components++;
1171
1172 agx_index coord_comps[4] = {};
1173 for (unsigned i = 0; i < coord_components; ++i)
1174 coord_comps[i] = agx_emit_extract(b, coords4, i);
1175
1176 agx_index coords = agx_emit_collect(b, coord_components, coord_comps);
1177 agx_index data = agx_src_index(&instr->src[3]);
1178
1179 /* If the image format has less than 4 components, nir_opt_shrink_stores can
1180 * shrink the store. But the IR still expects 4 components: pad with undef.
1181 */
1182 if (nir_src_num_components(instr->src[3]) < 4) {
1183 agx_index chan[4] = {agx_null()};
1184
1185 for (unsigned i = 0; i < 4; ++i) {
1186 if (i < nir_src_num_components(instr->src[3]))
1187 chan[i] = agx_extract_nir_src(b, instr->src[3], i);
1188 else
1189 chan[i] = agx_undef(data.size);
1190 }
1191
1192 data = agx_emit_collect(b, 4, chan);
1193 }
1194
1195 /* Image stores act like tilebuffer stores when used for tib spilling */
1196 b->shader->out->tag_write_disable = false;
1197
1198 return agx_image_write(b, data, coords, lod, base, index, dim);
1199 }
1200
1201 static enum agx_simd_op
1202 translate_simd_op(nir_op op)
1203 {
1204 #define CASE(agx_, nir_) \
1205 case nir_op_##nir_: \
1206 return AGX_SIMD_OP_##agx_;
1207
1208 switch (op) {
1209 CASE(AND, iand)
1210 CASE(FADD, fadd)
1211 CASE(OR, ior)
1212 CASE(FMUL, fmul)
1213 CASE(XOR, ixor)
1214 CASE(FMIN, fmin)
1215 CASE(FMAX, fmax)
1216 CASE(IADD, iadd)
1217 CASE(SMIN, imin)
1218 CASE(SMAX, imax)
1219 CASE(UMIN, umin)
1220 CASE(UMAX, umax)
1221 default:
1222 unreachable("unknown simd op");
1223 }
1224 #undef CASE
1225 }
1226
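/*
 * Translate a NIR intrinsic into AGX IR. Returns the emitted instruction when
 * there is a single one, or NULL when the helper has already emitted
 * everything needed (or nothing needs to be emitted).
 */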
1227 static agx_instr *
1228 agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
1229 {
1230 agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1231 ? agx_def_index(&instr->def)
1232 : agx_null();
1233 gl_shader_stage stage = b->shader->stage;
1234
1235 switch (instr->intrinsic) {
1236 case nir_intrinsic_load_barycentric_pixel:
1237 case nir_intrinsic_load_barycentric_centroid:
1238 case nir_intrinsic_load_barycentric_at_sample:
1239 case nir_intrinsic_load_barycentric_at_offset:
1240 /* handled later via load_vary */
1241 return NULL;
1242 case nir_intrinsic_load_interpolated_input:
1243 assert(stage == MESA_SHADER_FRAGMENT);
1244 agx_emit_load_vary(b, dst, instr);
1245 return NULL;
1246
1247 case nir_intrinsic_load_coefficients_agx:
1248 assert(stage == MESA_SHADER_FRAGMENT);
1249 agx_ldcf_to(b, dst, cf_for_intrinsic(b, instr), 1);
1250 agx_emit_cached_split(b, dst, 3);
1251 return NULL;
1252
1253 case nir_intrinsic_load_agx:
1254 case nir_intrinsic_load_constant_agx:
1255 agx_emit_load(b, dst, instr);
1256 return NULL;
1257
1258 case nir_intrinsic_store_uvs_agx:
1259 assert(stage == MESA_SHADER_VERTEX);
1260 return agx_st_vary(b, agx_src_index(&instr->src[1]),
1261 agx_src_index(&instr->src[0]));
1262
1263 case nir_intrinsic_store_agx:
1264 agx_emit_store(b, instr);
1265 return NULL;
1266
1267 case nir_intrinsic_store_shared:
1268 agx_emit_local_store(b, instr);
1269 return NULL;
1270
1271 case nir_intrinsic_load_shared:
1272 agx_emit_local_load(b, dst, instr);
1273 return NULL;
1274
1275 case nir_intrinsic_global_atomic_agx:
1276 case nir_intrinsic_global_atomic_swap_agx:
1277 agx_emit_atomic(b, dst, instr, false);
1278 return NULL;
1279
1280 case nir_intrinsic_shared_atomic:
1281 case nir_intrinsic_shared_atomic_swap:
1282 agx_emit_atomic(b, dst, instr, true);
1283 return NULL;
1284
1285 case nir_intrinsic_store_zs_agx:
1286 assert(stage == MESA_SHADER_FRAGMENT);
1287 return agx_emit_store_zs(b, instr);
1288
1289 case nir_intrinsic_store_local_pixel_agx:
1290 return agx_emit_local_store_pixel(b, instr);
1291
1292 case nir_intrinsic_load_local_pixel_agx:
1293 assert(stage == MESA_SHADER_FRAGMENT);
1294 agx_emit_local_load_pixel(b, dst, instr);
1295 return NULL;
1296
1297 case nir_intrinsic_load_pixel_coord:
1298 return agx_emit_collect_to(
1299 b, dst, 2,
1300 (agx_index[2]){
1301 agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_X),
1302 agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_Y),
1303 });
1304
1305 case nir_intrinsic_load_frag_coord_zw: {
1306 agx_index cf = agx_get_cf(b->shader, VARYING_SLOT_POS,
1307 nir_intrinsic_component(instr));
1308
1309 return agx_iter_to(b, dst, cf, agx_zero(), 1, AGX_INTERPOLATION_CENTER);
1310 }
1311
1312 case nir_intrinsic_sample_mask_agx: {
1313 assert(stage == MESA_SHADER_FRAGMENT);
1314 b->shader->out->writes_sample_mask = true;
1315
1316 /* We need to wait_pix before running Z/S tests, but we don't need to
1317 * wait_pix before merely discarding. Omit the wait_pix when the affected
1318 * samples are unconditionally killed.
1319 */
1320 bool no_tests =
1321 nir_src_is_const(instr->src[1]) && nir_src_as_uint(instr->src[1]) == 0;
1322
1323 if (!no_tests)
1324 agx_wait_pix(b, 0x0001);
1325
1326 return agx_sample_mask(b, agx_src_index(&instr->src[0]),
1327 agx_src_index(&instr->src[1]));
1328 }
1329
1330 case nir_intrinsic_load_back_face_agx:
1331 return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
1332
1333 case nir_intrinsic_load_samples_log2_agx:
1334 return agx_get_sr_to(b, dst, AGX_SR_SAMPLES_LOG2);
1335
1336 case nir_intrinsic_load_sample_mask_in:
1337 return agx_get_sr_to(b, dst, AGX_SR_INPUT_SAMPLE_MASK);
1338
1339 case nir_intrinsic_load_sample_mask:
1340 return agx_get_sr_coverage_to(b, dst, AGX_SR_COVERAGE_MASK);
1341
1342 case nir_intrinsic_load_helper_invocation:
1343 /* Compare special register to zero. We could lower this in NIR (letting
1344 * us fold in an inot) but meh?
1345 */
1346 return agx_icmp_to(b, dst,
1347 agx_get_sr_coverage(b, 32, AGX_SR_IS_ACTIVE_THREAD),
1348 agx_zero(), AGX_ICOND_UEQ, false);
1349
1350 case nir_intrinsic_load_vertex_id:
1351 /* We don't assert the HW stage since we use this same ABI with SW VS */
1352 return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
1353
1354 case nir_intrinsic_load_instance_id:
1355 return agx_mov_to(b, dst, agx_abs(agx_instance_id(b)));
1356
1357 case nir_intrinsic_load_preamble:
1358 return agx_emit_load_preamble(b, dst, instr);
1359
1360 case nir_intrinsic_store_preamble:
1361 return agx_emit_store_preamble(b, instr);
1362
1363 case nir_intrinsic_image_load:
1364 case nir_intrinsic_bindless_image_load:
1365 return agx_emit_image_load(b, dst, instr);
1366
1367 case nir_intrinsic_image_store:
1368 case nir_intrinsic_bindless_image_store:
1369 return agx_emit_image_store(b, instr);
1370
1371 case nir_intrinsic_image_store_block_agx:
1372 case nir_intrinsic_bindless_image_store_block_agx:
1373 return agx_emit_block_image_store(b, instr);
1374
1375 case nir_intrinsic_load_workgroup_id:
1376 return agx_load_compute_dimension(b, dst, instr,
1377 AGX_SR_THREADGROUP_POSITION_IN_GRID_X);
1378
1379 case nir_intrinsic_load_workgroup_size:
1380 return agx_load_compute_dimension(b, dst, instr,
1381 AGX_SR_THREADS_PER_THREADGROUP_X);
1382
1383 case nir_intrinsic_load_global_invocation_id:
1384 return agx_load_compute_dimension(b, dst, instr,
1385 AGX_SR_THREAD_POSITION_IN_GRID_X);
1386
1387 case nir_intrinsic_load_local_invocation_id:
1388 return agx_load_compute_dimension(
1389 b, dst, instr, AGX_SR_THREAD_POSITION_IN_THREADGROUP_X);
1390
1391 case nir_intrinsic_load_local_invocation_index:
1392 return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_THREADGROUP);
1393
1394 case nir_intrinsic_load_layer_id:
1395 return agx_get_sr_to(b, dst, AGX_SR_THREADGROUP_POSITION_IN_GRID_Z);
1396
1397 case nir_intrinsic_barrier: {
1398 assert(!b->shader->is_preamble && "invalid");
1399
1400 bool needs_image_barriers = false;
1401
1402 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) {
1403 nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
1404
1405 if (modes & (nir_var_mem_global | nir_var_image)) {
1406 agx_memory_barrier(b);
1407
1408 /* Pull out all the big hammers to make cross-workgroup memory
1409 * barriers work. Found experimentally, seems to work on G13G at
1410 * least.
1411 *
1412 * TODO: check on other models, we may need more barriers for G13D.
1413 */
1414 if (nir_intrinsic_memory_scope(instr) >= SCOPE_QUEUE_FAMILY) {
1415 agx_memory_barrier_2(b);
1416 agx_unknown_barrier_1(b);
1417 }
1418 }
1419
1420 if (modes & nir_var_image) {
1421 agx_image_barrier_1(b);
1422 agx_image_barrier_2(b);
1423 needs_image_barriers = true;
1424 }
1425 }
1426
1427 /* Nothing to do for subgroup barriers */
1428 if (nir_intrinsic_execution_scope(instr) >= SCOPE_WORKGROUP) {
1429 assert(gl_shader_stage_is_compute(b->shader->nir->info.stage));
1430
1431 agx_threadgroup_barrier(b);
1432 }
1433
1434 if (needs_image_barriers) {
1435 agx_image_barrier_3(b);
1436 agx_image_barrier_4(b);
1437 }
1438
1439 return NULL;
1440 }
1441
1442 case nir_intrinsic_fence_pbe_to_tex_agx: {
1443 agx_image_barrier_1(b);
1444 agx_image_barrier_2(b);
1445 agx_image_barrier_3(b);
1446 agx_image_barrier_4(b);
1447 return NULL;
1448 }
1449
1450 case nir_intrinsic_fence_mem_to_tex_agx: {
1451 /* Flush out the atomic to main memory... Found experimentally... */
1452 agx_memory_barrier(b);
1453 agx_memory_barrier_2(b);
1454
1455 /* TODO: Which ones do we actually need? */
1456 agx_image_barrier_1(b);
1457 agx_image_barrier_2(b);
1458 agx_image_barrier_3(b);
1459 agx_image_barrier_4(b);
1460
1461 /* Flush out the texture cache */
1462 agx_flush_memory_to_texture(b);
1463 return NULL;
1464 }
1465
1466 case nir_intrinsic_fence_pbe_to_tex_pixel_agx: {
1467 agx_image_barrier_1(b);
1468 agx_image_barrier_2(b);
1469 agx_flush_memory_to_texture(b);
1470 agx_image_barrier_3(b);
1471 return NULL;
1472 }
1473
1474 case nir_intrinsic_fence_helper_exit_agx: {
1475 assert(b->shader->key->is_helper);
1476 agx_memory_barrier(b);
1477 agx_unknown_barrier_1(b);
1478 agx_memory_barrier_2(b);
1479 agx_unknown_barrier_2(b);
1480 agx_memory_barrier_3(b);
1481 return NULL;
1482 }
1483
1484 case nir_intrinsic_begin_invocation_interlock: {
1485 if (!b->shader->did_writeout &&
1486 !b->shader->key->fs.ignore_tib_dependencies)
1487 agx_wait_pix(b, 0x000C);
1488
1489 b->shader->did_writeout = true;
1490 return NULL;
1491 }
1492
1493 case nir_intrinsic_ddx:
1494 case nir_intrinsic_ddx_coarse:
1495 case nir_intrinsic_ddx_fine:
1496 return agx_dfdx_to(b, dst, agx_src_index(&instr->src[0]));
1497
1498 case nir_intrinsic_ddy:
1499 case nir_intrinsic_ddy_coarse:
1500 case nir_intrinsic_ddy_fine:
1501 return agx_dfdy_to(b, dst, agx_src_index(&instr->src[0]));
1502
1503 case nir_intrinsic_load_subgroup_invocation:
1504 return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_SUBGROUP);
1505
1506 case nir_intrinsic_load_subgroup_id:
1507 return agx_get_sr_to(b, dst, AGX_SR_SUBGROUP_INDEX_IN_THREADGROUP);
1508
1509 case nir_intrinsic_load_active_subgroup_invocation_agx:
1510 return agx_get_sr_coverage_to(b, dst,
1511 AGX_SR_ACTIVE_THREAD_INDEX_IN_SUBGROUP);
1512
1513 case nir_intrinsic_load_active_subgroup_count_agx:
1514 return agx_get_sr_coverage_to(b, dst,
1515 AGX_SR_TOTAL_ACTIVE_THREADS_IN_SUBGROUP);
1516
1517 case nir_intrinsic_reduce: {
1518 assert((instr->def.bit_size == 1 || instr->def.bit_size == 16 ||
1519 instr->def.bit_size == 32) &&
1520 "should've been lowered");
1521
1522 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
1523 assert(cluster_size == 0 || cluster_size == 4 || cluster_size >= 32);
1524
1525 enum agx_simd_op op =
1526 translate_simd_op(nir_intrinsic_reduction_op(instr));
1527
1528 agx_index src0 = agx_src_index(&instr->src[0]);
1529
1530 if (cluster_size == 4)
1531 return agx_quad_reduce_to(b, dst, src0, op);
1532 else
1533 return agx_simd_reduce_to(b, dst, src0, op);
1534 }
1535
1536 case nir_intrinsic_exclusive_scan: {
1537 assert((instr->def.bit_size == 1 || instr->def.bit_size == 16 ||
1538 instr->def.bit_size == 32) &&
1539 "should've been lowered");
1540
1541 return agx_simd_prefix_to(
1542 b, dst, agx_src_index(&instr->src[0]),
1543 translate_simd_op(nir_intrinsic_reduction_op(instr)));
1544 }
1545
1546 case nir_intrinsic_read_invocation: {
1547 /* TODO: Check if we're actually inside divergent control flow */
1548 b->shader->any_quad_divergent_shuffle |= b->shader->any_cf;
1549
1550 /* Lane ID guaranteed to be uniform */
1551 return agx_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1552 agx_src_index(&instr->src[1]));
1553 }
1554
1555 case nir_intrinsic_quad_broadcast: {
1556 /* TODO: Check if we're actually inside divergent control flow */
1557 b->shader->any_quad_divergent_shuffle |= b->shader->any_cf;
1558
1559 /* Lane ID guaranteed to be uniform */
1560 return agx_quad_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1561 agx_src_index(&instr->src[1]));
1562 }
1563
1564 case nir_intrinsic_quad_swap_horizontal: {
1565 return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1566 agx_immediate(1));
1567 }
1568
1569 case nir_intrinsic_quad_swap_vertical: {
1570 return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1571 agx_immediate(2));
1572 }
1573
1574 case nir_intrinsic_quad_swap_diagonal: {
1575 return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1576 agx_immediate(3));
1577 }
1578
1579 case nir_intrinsic_ballot: {
1580 return agx_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1581 }
1582
1583 case nir_intrinsic_quad_ballot_agx: {
1584 return agx_quad_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1585 }
1586
1587 case nir_intrinsic_doorbell_agx: {
1588 return agx_doorbell(b, nir_src_as_uint(instr->src[0]));
1589 }
1590
1591 case nir_intrinsic_stack_map_agx: {
1592 return agx_stack_map(b, agx_src_index(&instr->src[1]),
1593 nir_src_as_uint(instr->src[0]));
1594 }
1595
1596 case nir_intrinsic_stack_unmap_agx: {
1597 return agx_stack_unmap_to(b, dst, nir_src_as_uint(instr->src[0]));
1598 }
1599
1600 case nir_intrinsic_load_scratch:
1601 agx_emit_load_scratch(b, dst, instr);
1602 return NULL;
1603
1604 case nir_intrinsic_store_scratch:
1605 agx_emit_store_scratch(b, instr);
1606 return NULL;
1607
1608 case nir_intrinsic_load_core_id_agx:
1609 return agx_get_sr_to(b, dst, AGX_SR_CORE_ID);
1610
1611 case nir_intrinsic_load_helper_op_id_agx:
1612 assert(b->shader->key->is_helper);
1613 return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_OP);
1614
1615 case nir_intrinsic_load_helper_arg_lo_agx:
1616 assert(b->shader->key->is_helper);
1617 return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_L);
1618
1619 case nir_intrinsic_load_helper_arg_hi_agx:
1620 assert(b->shader->key->is_helper);
1621 return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_H);
1622
1623 case nir_intrinsic_load_exported_agx:
1624 return agx_load_exported_to(b, dst, nir_intrinsic_base(instr),
1625 instr->def.num_components);
1626
1627 case nir_intrinsic_export_agx:
1628 return agx_emit_export(b, nir_intrinsic_base(instr), instr->src[0]);
1629
1630 case nir_intrinsic_load_barycentric_sample:
1631 case nir_intrinsic_load_sample_id:
1632 case nir_intrinsic_load_sample_pos:
1633 unreachable("Sample shading should have been lowered");
1634
1635 default:
1636 fprintf(stderr, "Unhandled intrinsic %s\n",
1637 nir_intrinsic_infos[instr->intrinsic].name);
1638 unreachable("Unhandled intrinsic");
1639 }
1640 }
1641
1642 static agx_index
1643 agx_alu_src_index(agx_builder *b, nir_alu_src src)
1644 {
1645 /* Check well-formedness of the input NIR */
1646 ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
1647 unsigned comps = nir_src_num_components(src.src);
1648 unsigned channel = src.swizzle[0];
1649
1650 assert(bitsize == 1 || bitsize == 8 || bitsize == 16 || bitsize == 32 ||
1651 bitsize == 64);
1652 assert(channel < comps);
1653
1654 return agx_extract_nir_src(b, src.src, channel);
1655 }
1656
1657 /*
1658 * Emit an instruction translating (s0 * s1) + (s2 << s3). Assuming s3 is
1659 * constant, this is an imad instruction. If s1 == 1, then this is optimized to
1660 * an iadd instruction, which is faster.
1661 */
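/* For example (hypothetical operands): with s1 == 1 and s3 == 2, the
 * expression (s0 * 1) + (s2 << 2) is emitted as a single iadd of s0 and s2
 * with shift 2, instead of an imad.
 */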
1662 static agx_instr *
1663 agx_emit_imadshl_agx(agx_builder *b, nir_alu_instr *alu, agx_index dst,
1664 agx_index s0, agx_index s1, agx_index s2, agx_index s3)
1665 {
1666 /* If the shift is not constant, use a variable shift. This should never
1667 * happen in practice but we don't want to constrain the NIR.
1668 */
1669 unsigned shift;
1670 if (!nir_src_is_const(alu->src[3].src)) {
1671 s2 = agx_bfi(b, agx_immediate(0), s2, s3, 0);
1672 shift = 0;
1673 } else {
1674 shift = nir_alu_src_as_uint(alu->src[3]);
1675 }
1676
1677 assert(shift <= 4 && "domain restriction on the input NIR");
1678
1679 /* Emit iadd if possible, else imad */
1680 if (nir_src_is_const(alu->src[1].src) &&
1681 nir_alu_src_as_uint(alu->src[1]) == 1) {
1682
1683 return agx_iadd_to(b, dst, s0, s2, shift);
1684 } else {
1685 return agx_imad_to(b, dst, s0, s1, s2, shift);
1686 }
1687 }
1688
1689 static bool
1690 is_conversion_to_8bit(nir_op op)
1691 {
1692 switch (op) {
1693 case nir_op_i2i8:
1694 case nir_op_u2u8:
1695 case nir_op_f2i8:
1696 case nir_op_f2u8:
1697 case nir_op_b2i8:
1698 return true;
1699 default:
1700 return false;
1701 }
1702 }
1703
1704 static agx_instr *
1705 agx_fminmax_to(agx_builder *b, agx_index dst, agx_index s0, agx_index s1,
1706 nir_alu_instr *alu)
1707 {
1708 /* The hardware gtn/ltn modes are unfortunately incorrect for signed zeros */
1709 assert(!nir_alu_instr_is_signed_zero_preserve(alu) &&
1710 "should've been lowered");
1711
1712 bool fmax = alu->op == nir_op_fmax;
1713 enum agx_fcond fcond = fmax ? AGX_FCOND_GTN : AGX_FCOND_LTN;
1714
1715 /* Calculate min/max with the appropriate hardware instruction */
1716 agx_index tmp = agx_fcmpsel(b, s0, s1, s0, s1, fcond);
1717
1718 /* G13 flushes fp32 denorms and preserves fp16 denorms. Since cmpsel
1719 * preserves denorms, we need to canonicalize for fp32. Canonicalizing fp16
1720 * would be harmless but wastes an instruction.
1721 */
1722 if (alu->def.bit_size == 32)
1723 return agx_fadd_to(b, dst, tmp, agx_negzero());
1724 else
1725 return agx_mov_to(b, dst, tmp);
1726 }
1727
1728 static agx_instr *
1729 agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
1730 {
1731 unsigned srcs = nir_op_infos[instr->op].num_inputs;
1732 unsigned sz = instr->def.bit_size;
1733 unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
1734 ASSERTED unsigned comps = instr->def.num_components;
1735
1736 assert(comps == 1 || nir_op_is_vec_or_mov(instr->op));
1737 assert(sz == 1 ||
1738 ((nir_op_is_vec_or_mov(instr->op) ||
1739 is_conversion_to_8bit(instr->op) || instr->op == nir_op_bcsel) &&
1740 sz == 8) ||
1741 sz == 16 || sz == 32 || sz == 64);
1742
1743 agx_index dst = agx_def_index(&instr->def);
1744 agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
1745 agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
1746 agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
1747 agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
1748
1749 agx_index i0 = agx_immediate(0);
1750 agx_index i1 = agx_immediate(1);
1751
1752 #define UNOP(nop, aop) \
1753 case nir_op_##nop: \
1754 return agx_##aop##_to(b, dst, s0);
1755 #define BINOP(nop, aop) \
1756 case nir_op_##nop: \
1757 return agx_##aop##_to(b, dst, s0, s1);
1758 #define TRIOP(nop, aop) \
1759 case nir_op_##nop: \
1760 return agx_##aop##_to(b, dst, s0, s1, s2);
1761
1762 switch (instr->op) {
1763 BINOP(fadd, fadd);
1764 BINOP(fmul, fmul);
1765 TRIOP(ffma, fma);
1766
1767 UNOP(f2f16, fmov);
1768 UNOP(f2f16_rtne, fmov);
1769 UNOP(f2f32, fmov);
1770 UNOP(fround_even, roundeven);
1771 UNOP(ftrunc, trunc);
1772 UNOP(ffloor, floor);
1773 UNOP(fceil, ceil);
1774 UNOP(frcp, rcp);
1775 UNOP(frsq, rsqrt);
1776 UNOP(flog2, log2);
1777 UNOP(fexp2, exp2);
1778
1779 UNOP(mov, mov);
1780 UNOP(u2u32, mov);
1781 UNOP(bitfield_reverse, bitrev);
1782 UNOP(bit_count, popcount);
1783 UNOP(ufind_msb, ffs);
1784 BINOP(iand, and);
1785 BINOP(ior, or);
1786 BINOP(ixor, xor);
1787 BINOP(interleave_agx, intl);
1788
1789 case nir_op_feq:
1790 return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, false);
1791 case nir_op_flt:
1792 return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_LT, false);
1793 case nir_op_fge:
1794 return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_GE, false);
1795 case nir_op_fneu:
1796 return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, true);
1797
1798 case nir_op_ieq:
1799 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, false);
1800 case nir_op_ine:
1801 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, true);
1802 case nir_op_ilt:
1803 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, false);
1804 case nir_op_ige:
1805 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, true);
1806 case nir_op_ult:
1807 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, false);
1808 case nir_op_uge:
1809 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, true);
1810
1811 case nir_op_inot:
1812 if (sz == 1)
1813 return agx_xor_to(b, dst, s0, i1);
1814 else
1815 return agx_not_to(b, dst, s0);
1816
1817 case nir_op_b2b1:
1818 return agx_icmp_to(b, dst, s0, i0, AGX_ICOND_UEQ, true);
1819
1820 case nir_op_fsqrt:
1821 return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
1822 case nir_op_fabs:
1823 return agx_fmov_to(b, dst, agx_abs(s0));
1824 case nir_op_fneg:
1825 return agx_fmov_to(b, dst, agx_neg(s0));
1826
1827 case nir_op_fmin:
1828 case nir_op_fmax:
1829 return agx_fminmax_to(b, dst, s0, s1, instr);
1830
1831 case nir_op_imin:
1832 return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
1833 case nir_op_imax:
1834 return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
1835 case nir_op_umin:
1836 return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
1837 case nir_op_umax:
1838 return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
1839
1840 case nir_op_iadd:
1841 return agx_iadd_to(b, dst, s0, s1, 0);
1842 case nir_op_imadshl_agx:
1843 return agx_emit_imadshl_agx(b, instr, dst, s0, s1, s2, s3);
1844 case nir_op_imsubshl_agx:
1845 return agx_emit_imadshl_agx(b, instr, dst, s0, s1, agx_neg(s2), s3);
1846 case nir_op_isub:
1847 return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
1848 case nir_op_ineg:
1849 return agx_iadd_to(b, dst, i0, agx_neg(s0), 0);
1850 case nir_op_imul:
1851 return agx_imad_to(b, dst, s0, s1, i0, 0);
1852 case nir_op_umul_2x32_64:
1853 return agx_imad_to(b, dst, agx_abs(s0), agx_abs(s1), i0, 0);
1854 case nir_op_imul_2x32_64:
1855 return agx_imad_to(b, dst, s0, s1, i0, 0);
1856 case nir_op_umul_high:
1857 return agx_mul_high_to(b, dst, s0, s1, false);
1858 case nir_op_imul_high:
1859 return agx_mul_high_to(b, dst, s0, s1, true);
1860
1861 case nir_op_ishl:
1862 return agx_bfi_to(b, dst, i0, s0, s1, 0);
1863 case nir_op_ushr:
1864 return agx_ushr_to(b, dst, s0, s1);
1865 case nir_op_ishr:
1866 return agx_asr_to(b, dst, s0, s1);
1867
1868 case nir_op_extr_agx:
1869 return agx_extr_to(b, dst, s0, s1, s2,
1870 nir_alu_src_as_uint(instr->src[3]));
1871
1872 case nir_op_ubitfield_extract: {
1873 unsigned m = nir_alu_src_as_uint(instr->src[2]);
1874 assert(m != 0 && "should've been optimized");
1875
1876 /* Disable masking if the whole thing is used */
1877 if (m >= 32)
1878 m = 0;
1879
1880 return agx_bfeil_to(b, dst, i0, s0, s1, m);
1881 }
1882
1883 case nir_op_bcsel:
1884 return agx_icmpsel_to(b, dst, s0, i0, s2, s1, AGX_ICOND_UEQ);
1885
1886 case nir_op_i2i32: {
1887 if (src_sz == 8) {
1888 /* Sign extend in software, NIR likes 8-bit conversions */
1889 agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1890 return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1891 } else {
1892 assert(s0.size == AGX_SIZE_16 && "other conversions lowered");
1893 return agx_iadd_to(b, dst, s0, i0, 0);
1894 }
1895 }
1896
1897 case nir_op_i2i16: {
1898 if (src_sz == 8) {
1899 /* Sign extend in software, NIR likes 8-bit conversions */
1900 agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1901 return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1902 } else {
1903 assert(s0.size == AGX_SIZE_32 && "other conversions lowered");
1904 return agx_subdivide_to(b, dst, s0, 0);
1905 }
1906 }
1907
1908 case nir_op_u2u16: {
1909 if (s0.size == AGX_SIZE_32)
1910 return agx_subdivide_to(b, dst, s0, 0);
1911 else
1912 return agx_mov_to(b, dst, s0);
1913 }
1914
1915 /* The result lands in a 16-bit register, so zero out the garbage in the upper
1916 * byte. We could optimize this in the future but it ensures correctness for
1917 * u2u16(u2u8(x)) sequences.
1918 */
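/* A sketch of why the mask matters: for a hypothetical x = 0x1234, u2u8(x)
 * must yield 0x34; masking with 0xFF keeps the upper byte of the 16-bit
 * register zeroed, so a following u2u16 observes 0x0034 rather than garbage.
 */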
1919 case nir_op_u2u8:
1920 case nir_op_i2i8:
1921 return agx_and_to(b, dst, s0, agx_immediate(0xFF));
1922
1923 case nir_op_iadd_sat: {
1924 agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
1925 I->saturate = true;
1926 return I;
1927 }
1928
1929 case nir_op_isub_sat: {
1930 agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
1931 I->saturate = true;
1932 return I;
1933 }
1934
1935 case nir_op_uadd_sat: {
1936 agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
1937 I->saturate = true;
1938 return I;
1939 }
1940
1941 case nir_op_usub_sat: {
1942 agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
1943 I->saturate = true;
1944 return I;
1945 }
1946
1947 case nir_op_fsat: {
1948 agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
1949 I->saturate = true;
1950 return I;
1951 }
1952
1953 case nir_op_fsin_agx: {
1954 agx_index fixup = agx_sin_pt_1(b, s0);
1955 agx_index sinc = agx_sin_pt_2(b, fixup);
1956 return agx_fmul_to(b, dst, sinc, fixup);
1957 }
1958
1959 case nir_op_f2i16:
1960 return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S16), s0,
1961 AGX_ROUND_RTZ);
1962
1963 case nir_op_f2i32:
1964 return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S32), s0,
1965 AGX_ROUND_RTZ);
1966
1967 case nir_op_f2u16:
1968 return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U16), s0,
1969 AGX_ROUND_RTZ);
1970
1971 case nir_op_f2u32:
1972 return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U32), s0,
1973 AGX_ROUND_RTZ);
1974
1975 case nir_op_u2f16:
1976 case nir_op_u2f32: {
1977 if (src_sz == 64)
1978 unreachable("64-bit conversions unimplemented");
1979
1980 enum agx_convert mode = (src_sz == 32) ? AGX_CONVERT_U32_TO_F
1981 : (src_sz == 16) ? AGX_CONVERT_U16_TO_F
1982 : AGX_CONVERT_U8_TO_F;
1983
1984 return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
1985 }
1986
1987 case nir_op_i2f16:
1988 case nir_op_i2f32: {
1989 if (src_sz == 64)
1990 unreachable("64-bit conversions unimplemented");
1991
1992 enum agx_convert mode = (src_sz == 32) ? AGX_CONVERT_S32_TO_F
1993 : (src_sz == 16) ? AGX_CONVERT_S16_TO_F
1994 : AGX_CONVERT_S8_TO_F;
1995
1996 return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
1997 }
1998
1999 case nir_op_pack_32_2x16_split:
2000 case nir_op_pack_64_2x32_split: {
2001 agx_index idx[] = {s0, s1};
2002 return agx_emit_collect_to(b, dst, 2, idx);
2003 }
2004
2005 case nir_op_unpack_64_2x32_split_x:
2006 case nir_op_unpack_32_2x16_split_x:
2007 return agx_subdivide_to(b, dst, s0, 0);
2008
2009 case nir_op_unpack_64_2x32_split_y:
2010 case nir_op_unpack_32_2x16_split_y:
2011 return agx_subdivide_to(b, dst, s0, 1);
2012
2013 case nir_op_vec2:
2014 case nir_op_vec3:
2015 case nir_op_vec4: {
2016 agx_index idx[] = {s0, s1, s2, s3};
2017 return agx_emit_collect_to(b, dst, srcs, idx);
2018 }
2019
2020 case nir_op_vec8:
2021 case nir_op_vec16:
2022 unreachable("should've been lowered");
2023
2024 default:
2025 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
2026 unreachable("Unhandled ALU instruction");
2027 }
2028 }
2029
2030 static enum agx_lod_mode
2031 agx_lod_mode_for_nir(nir_texop op, bool biased, bool min_lod, bool lod_is_zero)
2032 {
2033 switch (op) {
2034 case nir_texop_tex:
2035 case nir_texop_tg4:
2036 /* We could support this for tex, but it's never actually seen because tex
2037 * is always turned into txb to implement sampler LOD bias in Vulkan.
2038 */
2039 assert(!min_lod && "unimplemented");
2040
2041 return AGX_LOD_MODE_AUTO_LOD;
2042 case nir_texop_txb:
2043 return min_lod ? AGX_LOD_MODE_AUTO_LOD_BIAS_MIN
2044 : AGX_LOD_MODE_AUTO_LOD_BIAS;
2045 case nir_texop_lod:
2046 assert(!min_lod);
2047 return biased ? AGX_LOD_MODE_AUTO_LOD_BIAS : AGX_LOD_MODE_AUTO_LOD;
2048 case nir_texop_txd:
2049 return min_lod ? AGX_LOD_MODE_LOD_GRAD_MIN : AGX_LOD_MODE_LOD_GRAD;
2050 case nir_texop_txl:
2051 assert(!min_lod);
2052 return AGX_LOD_MODE_LOD_MIN;
2053 case nir_texop_txf:
2054 assert(!min_lod);
2055 return lod_is_zero ? AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;
2056 case nir_texop_txf_ms:
2057 assert(!min_lod);
2058 assert(lod_is_zero && "no mipmapping");
2059 return AGX_LOD_MODE_AUTO_LOD;
2060 default:
2061 unreachable("Unhandled texture op");
2062 }
2063 }
2064
2065 static enum agx_gather
2066 agx_gather_for_nir(nir_tex_instr *tex)
2067 {
2068 if (tex->op == nir_texop_tg4) {
2069 enum agx_gather components[] = {
2070 AGX_GATHER_R,
2071 AGX_GATHER_G,
2072 AGX_GATHER_B,
2073 AGX_GATHER_A,
2074 };
2075
2076 assert(tex->component < ARRAY_SIZE(components));
2077 return components[tex->component];
2078 } else {
2079 return AGX_GATHER_NONE;
2080 }
2081 }
2082
2083 static void
2084 agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
2085 {
2086 agx_index coords = agx_null(), bindless = agx_immediate(0),
2087 texture = agx_immediate(instr->texture_index),
2088 sampler = agx_immediate(0), lod = agx_immediate(0),
2089 compare = agx_null(), packed_offset = agx_null(),
2090 min_lod = agx_null();
2091
2092 bool lod_is_zero = true;
2093
2094 for (unsigned i = 0; i < instr->num_srcs; ++i) {
2095 agx_index index = agx_src_index(&instr->src[i].src);
2096
2097 switch (instr->src[i].src_type) {
2098 case nir_tex_src_backend1:
2099 coords = index;
2100 break;
2101
2102 case nir_tex_src_backend2:
2103 packed_offset = index;
2104 break;
2105
2106 case nir_tex_src_lod:
2107 case nir_tex_src_bias:
2108 lod = index;
2109 lod_is_zero = nir_src_is_const(instr->src[i].src) &&
2110 nir_src_as_uint(instr->src[i].src) == 0;
2111 break;
2112
2113 case nir_tex_src_min_lod:
2114 assert(index.size == AGX_SIZE_16);
2115 min_lod = index;
2116 break;
2117
2118 case nir_tex_src_comparator:
2119 assert(index.size == AGX_SIZE_32);
2120 compare = index;
2121 break;
2122
2123 case nir_tex_src_texture_offset:
2124 texture = index;
2125 break;
2126 case nir_tex_src_sampler_handle:
2127 sampler = index;
2128 break;
2129
2130 case nir_tex_src_texture_handle:
2131 texture =
2132 agx_translate_bindless_handle(b, &instr->src[i].src, &bindless);
2133 break;
2134
2135 case nir_tex_src_ddx: {
2136 int y_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2137 assert(y_idx >= 0 && "we only handle gradients");
2138
2139 int min_idx = nir_tex_instr_src_index(instr, nir_tex_src_min_lod);
2140 bool has_min = min_idx >= 0;
2141 agx_index min;
2142
2143 unsigned n = nir_tex_instr_src_size(instr, y_idx);
2144 assert((n == 2 || n == 3) && "other sizes not supported");
2145
2146 agx_index index2 = agx_src_index(&instr->src[y_idx].src);
2147
2148 if (has_min) {
2149 min = agx_src_index(&instr->src[min_idx].src);
2150
2151 /* Undef extend to 32-bit since our IR is iffy */
2152 min = agx_vec2(b, min, agx_undef(AGX_SIZE_16));
2153 min.channels_m1--;
2154 min.size = AGX_SIZE_32;
2155 }
2156
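/* The collect below packs the gradients component-interleaved, i.e.
 * (ddx.x, ddy.x, ddx.y, ddy.y, ...), with the optional min LOD appended as
 * the final 32-bit source.
 */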
2157 /* We explicitly don't care about the split cache for this */
2158 unsigned chans = (2 * n) + (has_min ? 1 : 0);
2159 lod = agx_vec_temp(b->shader, AGX_SIZE_32, chans);
2160 agx_instr *I = agx_collect_to(b, lod, chans);
2161
2162 for (unsigned i = 0; i < n; ++i) {
2163 I->src[(2 * i) + 0] = agx_emit_extract(b, index, i);
2164 I->src[(2 * i) + 1] = agx_emit_extract(b, index2, i);
2165 }
2166
2167 if (has_min)
2168 I->src[2 * n] = min;
2169
2170 break;
2171 }
2172
2173 case nir_tex_src_ddy:
2174 /* handled above */
2175 break;
2176
2177 default:
2178 unreachable("Unexpected texture source");
2179 }
2180 }
2181
2182 enum agx_lod_mode lod_mode = agx_lod_mode_for_nir(
2183 instr->op, nir_tex_instr_src_index(instr, nir_tex_src_bias) >= 0,
2184 nir_tex_instr_src_index(instr, nir_tex_src_min_lod) >= 0, lod_is_zero);
2185
2186 if (lod_mode == AGX_LOD_MODE_AUTO_LOD) {
2187 /* Ignored logically but asserted 0 */
2188 lod = agx_immediate(0);
2189 } else if (lod_mode == AGX_LOD_MODE_AUTO_LOD_BIAS_MIN) {
2190 /* Combine min with lod */
2191 lod = agx_vec2(b, lod, min_lod);
2192 }
2193
2194 agx_index dst = agx_def_index(&instr->def);
2195
2196 /* Pack shadow reference value (compare) and packed offset together */
2197 agx_index compare_offset = agx_null();
2198
2199 if (!agx_is_null(compare) && !agx_is_null(packed_offset))
2200 compare_offset = agx_vec2(b, compare, packed_offset);
2201 else if (!agx_is_null(packed_offset))
2202 compare_offset = packed_offset;
2203 else if (!agx_is_null(compare))
2204 compare_offset = compare;
2205
2206 agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
2207 agx_instr *I = agx_texture_sample_to(
2208 b, tmp, coords, lod, bindless, texture, sampler, compare_offset,
2209 agx_tex_dim(instr->sampler_dim, instr->is_array), lod_mode, 0,
2210 !agx_is_null(packed_offset), !agx_is_null(compare),
2211 instr->op == nir_texop_lod, agx_gather_for_nir(instr));
2212
2213 if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms) {
2214 I->op = AGX_OPCODE_TEXTURE_LOAD;
2215 b->shader->out->uses_txf = true;
2216 }
2217
2218 /* Destination masking doesn't seem to work properly for gathers (because
2219 * it's mostly pointless), but it does show up in the lowering of
2220 * textureGatherOffsets. Don't try to mask the destination for gathers.
2221 */
2222 bool masked = (instr->op != nir_texop_tg4);
2223 I->mask = agx_expand_tex_to(b, &instr->def, tmp, masked);
2224 }
2225
2226 /*
2227 * Determine if a NIR loop (CF list) uses a continue jump, including within
2228 * if-else statements but not including nested loops.
2229 */
2230 static bool
2231 cf_list_uses_continue(struct exec_list *list)
2232 {
2233 foreach_list_typed(nir_cf_node, node, node, list) {
2234 if (node->type == nir_cf_node_block) {
2235 nir_block *block = nir_cf_node_as_block(node);
2236
2237 nir_foreach_instr(instr, block) {
2238 if (instr->type == nir_instr_type_jump &&
2239 nir_instr_as_jump(instr)->type == nir_jump_continue)
2240 return true;
2241 }
2242 } else if (node->type == nir_cf_node_if) {
2243 nir_if *nif = nir_cf_node_as_if(node);
2244
2245 if (cf_list_uses_continue(&nif->then_list) ||
2246 cf_list_uses_continue(&nif->else_list))
2247 return true;
2248 } else {
2249 assert(node->type == nir_cf_node_loop && "don't care about nesting");
2250 }
2251 }
2252
2253 return false;
2254 }
2255
2256 static bool
2257 loop_uses_continue(nir_loop *loop)
2258 {
2259 return cf_list_uses_continue(&loop->body);
2260 }
2261
2262 /*
2263 * NIR loops are treated as a pair of AGX loops:
2264 *
2265 * do {
2266 * do {
2267 * ...
2268 * } while (0);
2269 * } while (cond);
2270 *
2271 * By manipulating the nesting counter, we may break out of nested loops, so
2272 * under the model, both break and continue may be implemented as breaks, where
2273 * break breaks out of the outer loop (2 layers) and continue breaks out of the
2274 * inner loop (1 layer).
2275 *
2276 * After manipulating the nesting counter directly, pop_exec #0 must be used to
2277 * flush the update to the execution mask.
2278 */
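/* Concretely: when the loop uses continue, a NIR break adds 2 to the nesting
 * count (escaping both loops of the pair) while a NIR continue adds 1
 * (escaping only the inner loop and landing at the continue block). Without
 * continue, a single layer suffices and break adds just 1.
 */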
2279 static void
2280 agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
2281 {
2282 agx_context *ctx = b->shader;
2283 assert(instr->type == nir_jump_break || instr->type == nir_jump_continue);
2284
2285 /* Break out of either one or two loops */
2286 unsigned nestings = b->shader->loop_nesting;
2287
2288 if (instr->type == nir_jump_continue) {
2289 nestings += 1;
2290 agx_block_add_successor(ctx->current_block, ctx->continue_block);
2291 } else if (instr->type == nir_jump_break) {
2292 nestings += ctx->loop_continues ? 2 : 1;
2293 agx_block_add_successor(ctx->current_block, ctx->break_block);
2294 }
2295
2296 agx_break(b, nestings, ctx->break_block);
2297 ctx->current_block->unconditional_jumps = true;
2298 }
2299
2300 static void
2301 agx_emit_phi(agx_builder *b, nir_phi_instr *instr)
2302 {
2303 agx_instr *I =
2304 agx_phi_to(b, agx_def_index(&instr->def), exec_list_length(&instr->srcs));
2305
2306 /* Deferred */
2307 I->phi = instr;
2308 }
2309
2310 /* Look up the AGX block corresponding to a given NIR block. Used when
2311 * translating phi nodes after emitting all blocks.
2312 */
2313 static agx_block *
2314 agx_from_nir_block(agx_context *ctx, nir_block *block)
2315 {
2316 return ctx->indexed_nir_blocks[block->index];
2317 }
2318
2319 static void
2320 agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I)
2321 {
2322 nir_phi_instr *phi = I->phi;
2323 I->phi = NULL;
2324
2325 /* Guaranteed by lower_phis_to_scalar */
2326 assert(phi->def.num_components == 1);
2327
2328 nir_foreach_phi_src(src, phi) {
2329 agx_block *pred = agx_from_nir_block(ctx, src->pred);
2330 unsigned i = agx_predecessor_index(block, pred);
2331 assert(i < I->nr_srcs);
2332
2333 I->src[i] = agx_src_index(&src->src);
2334 }
2335 }
2336
2337 static void
2338 agx_emit_phis_deferred(agx_context *ctx)
2339 {
2340 agx_foreach_block(ctx, block) {
2341 agx_foreach_phi_in_block(block, I)
2342 agx_emit_phi_deferred(ctx, block, I);
2343 }
2344 }
2345
2346 static void
2347 agx_emit_undef(agx_builder *b, nir_undef_instr *instr)
2348 {
2349 /* For now, just lower undefs to zero. This doesn't matter too much, since
2350 * the lowering happens in NIR and this just allows for late lowering passes
2351 * to result in undefs.
2352 */
2353 if (instr->def.num_components > 1) {
2354 assert(instr->def.num_components <= 4);
2355 agx_index zero = agx_mov_imm(b, instr->def.bit_size, 0);
2356
2357 agx_emit_collect_to(b, agx_def_index(&instr->def),
2358 instr->def.num_components,
2359 (agx_index[4]){zero, zero, zero, zero});
2360 } else {
2361 agx_mov_imm_to(b, agx_def_index(&instr->def), 0);
2362 }
2363 }
2364
2365 static void
2366 agx_emit_instr(agx_builder *b, struct nir_instr *instr)
2367 {
2368 switch (instr->type) {
2369 case nir_instr_type_load_const:
2370 agx_emit_load_const(b, nir_instr_as_load_const(instr));
2371 break;
2372
2373 case nir_instr_type_intrinsic:
2374 agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
2375 break;
2376
2377 case nir_instr_type_alu:
2378 agx_emit_alu(b, nir_instr_as_alu(instr));
2379 break;
2380
2381 case nir_instr_type_tex:
2382 agx_emit_tex(b, nir_instr_as_tex(instr));
2383 break;
2384
2385 case nir_instr_type_jump:
2386 agx_emit_jump(b, nir_instr_as_jump(instr));
2387 break;
2388
2389 case nir_instr_type_phi:
2390 agx_emit_phi(b, nir_instr_as_phi(instr));
2391 break;
2392
2393 case nir_instr_type_undef:
2394 agx_emit_undef(b, nir_instr_as_undef(instr));
2395 break;
2396
2397 default:
2398 unreachable("should've been lowered");
2399 }
2400 }
2401
2402 static agx_block *
2403 agx_create_block(agx_context *ctx)
2404 {
2405 agx_block *blk = rzalloc(ctx, agx_block);
2406
2407 util_dynarray_init(&blk->predecessors, blk);
2408
2409 return blk;
2410 }
2411
2412 static agx_block *
2413 emit_block(agx_context *ctx, nir_block *block)
2414 {
2415 if (ctx->after_block) {
2416 ctx->current_block = ctx->after_block;
2417 ctx->after_block = NULL;
2418 } else {
2419 ctx->current_block = agx_create_block(ctx);
2420 }
2421
2422 agx_block *blk = ctx->current_block;
2423 list_addtail(&blk->link, &ctx->blocks);
2424 list_inithead(&blk->instructions);
2425
2426 ctx->indexed_nir_blocks[block->index] = blk;
2427
2428 agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
2429
2430 nir_foreach_instr(instr, block) {
2431 agx_emit_instr(&_b, instr);
2432 }
2433
2434 return blk;
2435 }
2436
2437 static agx_block *emit_cf_list(agx_context *ctx, struct exec_list *list);
2438
2439 /* Emit if-else as
2440 *
2441 * if_icmp cond != 0
2442 * ...
2443 * else_icmp cond == 0
2444 * ...
2445 * pop_exec
2446 *
2447 * If the else is empty, we can omit the else_icmp. That omission is handled
2448 * elsewhere, since an empty else block can become nonempty after RA due to phi
2449 * lowering. This is not usually optimal, but it's a start.
2450 */
2451
2452 static void
2453 emit_if(agx_context *ctx, nir_if *nif)
2454 {
2455 agx_block *first_block = ctx->current_block;
2456 agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
2457 agx_index cond = agx_src_index(&nif->condition);
2458
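/* Emit "if cond != 0": the comparison is cond == 0 (UEQ against zero) with
 * what appears to be an invert flag set, giving the structure sketched in the
 * comment above.
 */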
2459 agx_instr *if_ = agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true,
2460 NULL /* filled in later */);
2461 ctx->loop_nesting++;
2462 ctx->total_nesting++;
2463
2464 /* Emit the two subblocks. */
2465 agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
2466 agx_block *end_then = ctx->current_block;
2467
2468 _b.cursor = agx_after_block(ctx->current_block);
2469
2470 agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
2471 agx_block *end_else = ctx->current_block;
2472
2473 /* If the "if" fails, we fallthrough to the else */
2474 if_->target = else_block;
2475
2476 /* Insert an else instruction at the beginning of the else block. We use
2477 * "else_fcmp 0.0, 0.0, eq" as unconditional else, matching the blob.
2478 *
2479 * If it fails, we fall through to the logical end of the last else block.
2480 */
2481 _b.cursor = agx_before_block(else_block);
2482 agx_else_fcmp(&_b, agx_zero(), agx_zero(), 1, AGX_FCOND_EQ, false, end_else);
2483
2484 ctx->after_block = agx_create_block(ctx);
2485
2486 agx_block_add_successor(first_block, if_block);
2487 agx_block_add_successor(first_block, else_block);
2488 agx_block_add_successor(end_then, ctx->after_block);
2489 agx_block_add_successor(end_else, ctx->after_block);
2490
2491 _b.cursor = agx_after_block(ctx->current_block);
2492 agx_pop_exec(&_b, 1);
2493 ctx->loop_nesting--;
2494 ctx->total_nesting--;
2495 }
2496
2497 static void
2498 emit_loop(agx_context *ctx, nir_loop *nloop)
2499 {
2500 assert(!nir_loop_has_continue_construct(nloop));
2501 /* We only track nesting within the innermost loop, so push and reset */
2502 unsigned pushed_nesting = ctx->loop_nesting;
2503 ctx->loop_nesting = 0;
2504 ctx->total_nesting++;
2505
2506 bool old_continues = ctx->loop_continues;
2507 ctx->loop_continues = loop_uses_continue(nloop);
2508
2509 agx_block *popped_break = ctx->break_block;
2510 agx_block *popped_continue = ctx->continue_block;
2511
2512 ctx->break_block = agx_create_block(ctx);
2513 ctx->continue_block = agx_create_block(ctx);
2514
2515 /* If we are emitting a loop inside other control flow, there might be
2516 * threads masked off (TODO: divergence analysis), so push_exec so that
2517 * this loop gets the lower nesting count values to itself.
2518 */
2519 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2520 if (ctx->total_nesting > 1)
2521 agx_push_exec(&_b, ctx->loop_continues ? 2 : 1);
2522
2523 /* Fallthrough to body */
2524 agx_block_add_successor(ctx->current_block, ctx->continue_block);
2525
2526 /* Emit the body */
2527 ctx->after_block = ctx->continue_block;
2528 ctx->after_block->loop_header = true;
2529 agx_block *start_block = emit_cf_list(ctx, &nloop->body);
2530
2531 /* If we used any continue jumps, we need to reactivate the continued
2532 * threads. We do this with an always true while_icmp, which behaves like:
2533 *
2534 * if (r0l == 1) {
2535 * r0l = 0;
2536 * }
2537 * update_exec
2538 *
2539 * If we did not use continue, this would be a no-op so it is omitted.
2540 */
2541 _b.cursor = agx_after_block(ctx->current_block);
2542
2543 if (ctx->loop_continues) {
2544 agx_while_icmp(
2545 &_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false,
2546 NULL /* no semantic target, used purely for side effects */);
2547 }
2548
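/* Back edge: jump to the loop header while any lane is still executing, then
 * pop the exec levels used by this loop (2 when continue was used, else 1).
 */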
2549 agx_jmp_exec_any(&_b, start_block);
2550 agx_pop_exec(&_b, ctx->loop_continues ? 2 : 1);
2551 agx_block_add_successor(ctx->current_block, ctx->continue_block);
2552
2553 /* Pop off */
2554 ctx->after_block = ctx->break_block;
2555 ctx->break_block = popped_break;
2556 ctx->continue_block = popped_continue;
2557
2558 /* Update shader-db stats */
2559 ++ctx->loop_count;
2560
2561 /* All nested control flow must have finished */
2562 assert(ctx->loop_nesting == 0);
2563
2564 /* Restore loop nesting (we might be inside an if inside an outer loop) */
2565 ctx->loop_nesting = pushed_nesting;
2566 ctx->total_nesting--;
2567 ctx->loop_continues = old_continues;
2568 }
2569
2570 /* Before the first control flow structure, the nesting counter needs to be
2571 * zeroed for correct operation. This only happens at most once, since by
2572 * definition this occurs at the end of the first block, which dominates the
2573 * rest of the program. */
2574
2575 static void
2576 emit_first_cf(agx_context *ctx)
2577 {
2578 if (ctx->any_cf)
2579 return;
2580
2581 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2582 agx_begin_cf(&_b);
2583 ctx->any_cf = true;
2584 }
2585
2586 static agx_block *
2587 emit_cf_list(agx_context *ctx, struct exec_list *list)
2588 {
2589 agx_block *start_block = NULL;
2590
2591 foreach_list_typed(nir_cf_node, node, node, list) {
2592 switch (node->type) {
2593 case nir_cf_node_block: {
2594 agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2595
2596 if (!start_block)
2597 start_block = block;
2598
2599 break;
2600 }
2601
2602 case nir_cf_node_if:
2603 emit_first_cf(ctx);
2604 emit_if(ctx, nir_cf_node_as_if(node));
2605 break;
2606
2607 case nir_cf_node_loop:
2608 emit_first_cf(ctx);
2609 emit_loop(ctx, nir_cf_node_as_loop(node));
2610 break;
2611
2612 default:
2613 unreachable("Unknown control flow");
2614 }
2615 }
2616
2617 return start_block;
2618 }
2619
2620 static void
2621 agx_set_st_vary_final(agx_context *ctx)
2622 {
2623 agx_foreach_instr_global_rev(ctx, I) {
2624 if (I->op == AGX_OPCODE_ST_VARY) {
2625 I->last = true;
2626 return;
2627 }
2628 }
2629
2630 /* If we got here, there was no varying written. We need to mark that. */
2631 agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
2632 agx_builder _b = agx_init_builder(ctx, agx_after_block_logical(last_block));
2633 agx_no_varyings(&_b);
2634 }
2635
2636 static int
2637 agx_dump_stats(agx_context *ctx, unsigned size, char **out)
2638 {
2639 unsigned nr_ins = 0, spills = 0, fills = 0;
2640
2641 /* Count instructions */
2642 agx_foreach_instr_global(ctx, I) {
2643 nr_ins++;
2644
2645 if (I->op == AGX_OPCODE_STACK_STORE)
2646 spills++;
2647 else if (I->op == AGX_OPCODE_STACK_LOAD)
2648 fills++;
2649 }
2650
2651 struct agx_cycle_estimate cycles = agx_estimate_cycles(ctx);
2652
2653 unsigned nr_threads =
2654 agx_occupancy_for_register_count(ctx->max_reg).max_threads;
2655
2656 return asprintf(
2657 out,
2658 "%s shader: %u inst, %u alu, %u fscib, %u ic, %u bytes, %u regs, "
2659 "%u uniforms, %u scratch, %u threads, %u loops, "
2660 "%u:%u spills:fills",
2661 gl_shader_stage_name(ctx->stage), nr_ins, cycles.alu, cycles.f_scib,
2662 cycles.ic, size, ctx->max_reg, ctx->out->push_count, ctx->scratch_size,
2663 nr_threads, ctx->loop_count, spills, fills);
2664 }
2665
2666 static bool
2667 agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
2668 {
2669 if (instr->type != nir_instr_type_alu)
2670 return false;
2671
2672 nir_alu_instr *alu = nir_instr_as_alu(instr);
2673 return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
2674 }
2675
2676 /* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
2677 * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
2678 * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
2679 * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
2680 * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
2681 * need to change units from radians to quadrants modulo turns. Cosine is
2682 * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
2683 */
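/* A worked example (hypothetical input): for x = pi, turns = pi * 1/(2*pi)
 * = 0.5; for fcos an extra 0.25 is added first. Then quadrants =
 * fract(turns) * 4 = 2.0, which fsin_agx maps back to sin(pi) = 0.
 */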
2684
2685 static nir_def *
2686 agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
2687 {
2688 nir_alu_instr *alu = nir_instr_as_alu(instr);
2689 nir_def *x = nir_mov_alu(b, alu->src[0], 1);
2690 nir_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
2691
2692 if (alu->op == nir_op_fcos)
2693 turns = nir_fadd_imm(b, turns, 0.25f);
2694
2695 nir_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
2696 return nir_fsin_agx(b, quadrants);
2697 }
2698
2699 static bool
2700 agx_lower_sincos(nir_shader *shader)
2701 {
2702 return nir_shader_lower_instructions(shader, agx_lower_sincos_filter,
2703 agx_lower_sincos_impl, NULL);
2704 }
2705
2706 static bool
2707 agx_lower_front_face(struct nir_builder *b, nir_intrinsic_instr *intr,
2708 UNUSED void *data)
2709 {
2710 if (intr->intrinsic != nir_intrinsic_load_front_face)
2711 return false;
2712
2713 nir_def *def = &intr->def;
2714 assert(def->bit_size == 1);
2715
2716 b->cursor = nir_before_instr(&intr->instr);
2717 nir_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
2718 return true;
2719 }
2720
2721 /*
2722 * Standard NIR optimization loop. This is run in agx_preprocess_nir, then once
2723 * again at shader variant compile time. Unless there was a complex shader key,
2724 * the latter run should be almost a no-op.
2725 */
2726 static void
2727 agx_optimize_loop_nir(nir_shader *nir)
2728 {
2729 bool progress;
2730
2731 do {
2732 progress = false;
2733
2734 NIR_PASS(progress, nir, nir_copy_prop);
2735 NIR_PASS(progress, nir, nir_opt_remove_phis);
2736 NIR_PASS(progress, nir, nir_opt_dce);
2737 NIR_PASS(progress, nir, nir_opt_dead_cf);
2738 NIR_PASS(progress, nir, nir_opt_cse);
2739 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
2740 NIR_PASS(progress, nir, nir_opt_phi_precision);
2741 NIR_PASS(progress, nir, nir_opt_algebraic);
2742 NIR_PASS(progress, nir, nir_opt_constant_folding);
2743 NIR_PASS(progress, nir, nir_opt_undef);
2744 NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
2745 NIR_PASS(progress, nir, nir_opt_loop_unroll);
2746 } while (progress);
2747 }
2748
2749 static bool
2750 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
2751 unsigned num_components, nir_intrinsic_instr *low,
2752 nir_intrinsic_instr *high, void *data)
2753 {
2754 /* Must be aligned to the size of the load */
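/* (e.g. a 32-bit access needs at least 4-byte alignment to be vectorized) */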
2755 unsigned align = nir_combined_align(align_mul, align_offset);
2756 if ((bit_size / 8) > align)
2757 return false;
2758
2759 if (num_components > 4)
2760 return false;
2761
2762 if (bit_size > 32)
2763 return false;
2764
2765 return true;
2766 }
2767
2768 static bool
2769 set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
2770 {
2771 if (!nir_intrinsic_has_access(intr))
2772 return false;
2773
2774 nir_intrinsic_set_access(intr,
2775 ACCESS_CAN_SPECULATE | nir_intrinsic_access(intr));
2776 return true;
2777 }
2778
2779 static void
2780 agx_optimize_nir(nir_shader *nir, bool soft_fault, unsigned *preamble_size)
2781 {
2782 /* This runs only once up front since other optimizations don't affect it */
2783 NIR_PASS(_, nir, nir_opt_shrink_stores, true);
2784
2785 agx_optimize_loop_nir(nir);
2786
2787 /* If soft fault is enabled, we can freely speculate everything. That lets us
2788 * peephole select and form preambles more aggressively.
2789 */
2790 if (soft_fault) {
2791 NIR_PASS(_, nir, nir_shader_intrinsics_pass, set_speculate,
2792 nir_metadata_control_flow, NULL);
2793 }
2794
2795 /* Peephole select again after setting the speculate flag but before
2796 * vectorizing. This cleans up short-circuit loads in unrolled loops.
2797 *
2798 * XXX: Set indirect_load_ok once we can investigate CTS flakes.
2799 */
2800 NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
2801
2802 NIR_PASS(_, nir, nir_opt_load_store_vectorize,
2803 &(const nir_load_store_vectorize_options){
2804 .modes = nir_var_mem_global | nir_var_mem_constant,
2805 .callback = mem_vectorize_cb,
2806 });
2807 NIR_PASS(_, nir, nir_lower_pack);
2808
2809 nir_convert_to_lcssa(nir, true, true);
2810 NIR_PASS_V(nir, nir_divergence_analysis);
2811 bool progress = false;
2812
2813 static const nir_lower_subgroups_options subgroups_options = {
2814 .ballot_bit_size = 32,
2815 .ballot_components = 1,
2816 .lower_elect = true,
2817 .lower_subgroup_masks = true,
2818 };
2819
2820 NIR_PASS(progress, nir, nir_opt_uniform_atomics, true);
2821 NIR_PASS(progress, nir, nir_opt_uniform_subgroup, &subgroups_options);
2822
2823 /* The above create operations that need lowering/optimizing */
2824 if (progress) {
2825 NIR_PASS(_, nir, agx_nir_lower_subgroups);
2826 NIR_PASS(_, nir, nir_opt_algebraic);
2827 }
2828
2829 progress = false;
2830 NIR_PASS(progress, nir, agx_nir_lower_address);
2831
2832 /* If address lowering made progress, clean up before forming preambles.
2833 * Otherwise the optimized preambles might just be constants! Do it before
2834 * lowering int64 too, to avoid lowering constant int64 arithmetic.
2835 */
2836 if (progress) {
2837 NIR_PASS(_, nir, nir_opt_constant_folding);
2838 NIR_PASS(_, nir, nir_opt_dce);
2839 }
2840
2841 /* Only lower int64 after optimizing address arithmetic, so that u2u64/i2i64
2842 * conversions remain.
2843 */
2844 progress = false;
2845 NIR_PASS(progress, nir, nir_lower_int64);
2846
2847 /* If we lowered actual int64 arithmetic (not folded into the address
2848 * calculations), then clean up after the lowering.
2849 */
2850 if (progress) {
2851 do {
2852 progress = false;
2853
2854 NIR_PASS(progress, nir, nir_opt_algebraic);
2855 NIR_PASS(progress, nir, nir_opt_constant_folding);
2856 NIR_PASS(progress, nir, nir_opt_dce);
2857 } while (progress);
2858 }
2859
2860 if (preamble_size && (!(agx_compiler_debug & AGX_DBG_NOPREAMBLE)))
2861 NIR_PASS(_, nir, agx_nir_opt_preamble, preamble_size);
2862
2863 /* Forming preambles may dramatically reduce the instruction count
2864 * in certain blocks, causing some if-else statements to become
2865 * trivial. We want to peephole select those, given that control flow
2866 * prediction instructions are costly.
2867 *
2868 * We need to lower int64 again to deal with the resulting 64-bit csels.
2869 */
2870 NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
2871 NIR_PASS(_, nir, nir_lower_int64);
2872
2873 NIR_PASS(_, nir, nir_opt_algebraic_late);
2874
2875 /* Fuse add/sub/multiplies/shifts after running opt_algebraic_late to fuse
2876 * isub but before shifts are lowered.
2877 */
2878 do {
2879 progress = false;
2880
2881 NIR_PASS(progress, nir, nir_opt_dce);
2882 NIR_PASS(progress, nir, nir_opt_cse);
2883 NIR_PASS(progress, nir, agx_nir_fuse_algebraic_late);
2884 } while (progress);
2885
2886 /* Do remaining lowering late, since this inserts &s for shifts so we want to
2887 * do it after fusing constant shifts. Constant folding will clean up.
2888 */
2889 NIR_PASS(_, nir, agx_nir_lower_algebraic_late);
2890 NIR_PASS(_, nir, agx_nir_fuse_selects);
2891 NIR_PASS(_, nir, nir_opt_constant_folding);
2892 NIR_PASS(_, nir, nir_opt_combine_barriers, NULL, NULL);
2893 NIR_PASS(_, nir, nir_copy_prop);
2894 NIR_PASS(_, nir, nir_opt_dce);
2895 NIR_PASS(_, nir, nir_opt_cse);
2896 NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
2897 NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
2898
2899 /* Cleanup optimizations */
2900 nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
2901 nir_move_load_input | nir_move_comparisons |
2902 nir_move_copies | nir_move_load_ssbo |
2903 nir_move_alu;
2904
2905 NIR_PASS(_, nir, nir_opt_sink, move_all);
2906 NIR_PASS(_, nir, nir_opt_move, move_all);
2907 NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
2908 }
2909
2910 /*
2911 * Varyings that are used as texture coordinates should be kept at fp32, because
2912 * fp16 does not have enough precision for large textures. It's technically
2913 * conformant not to, but every app gets this wrong.
2914 */
2915 static bool
2916 gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
2917 {
2918 uint64_t *mask = data;
2919
2920 if (instr->type != nir_instr_type_tex)
2921 return false;
2922
2923 nir_tex_instr *tex = nir_instr_as_tex(instr);
2924
2925 int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
2926 if (coord_idx < 0)
2927 return false;
2928
2929 nir_src src = tex->src[coord_idx].src;
2930 nir_scalar x = nir_scalar_resolved(src.ssa, 0);
2931 nir_scalar y = nir_scalar_resolved(src.ssa, 1);
2932
2933 if (x.def != y.def)
2934 return false;
2935
2936 nir_instr *parent = x.def->parent_instr;
2937
2938 if (parent->type != nir_instr_type_intrinsic)
2939 return false;
2940
2941 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
2942
2943 if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
2944 return false;
2945
2946 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2947 *mask |= BITFIELD64_BIT(sem.location);
2948 return false;
2949 }
2950
2951 static bool
2952 gather_interp(nir_builder *b, nir_intrinsic_instr *intr, void *data)
2953 {
2954 struct agx_interp_info *masks = data;
2955
2956 if (intr->intrinsic == nir_intrinsic_load_input) {
2957 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2958 masks->flat |= BITFIELD64_RANGE(sem.location, sem.num_slots);
2959 } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input &&
2960 nir_intrinsic_interp_mode(nir_src_as_intrinsic(intr->src[0])) ==
2961 INTERP_MODE_NOPERSPECTIVE) {
2962 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2963 masks->linear |= BITFIELD64_RANGE(sem.location, sem.num_slots);
2964 }
2965
2966 return false;
2967 }
2968
2969 /*
2970 * Build a bit mask of varyings (by location) that are flatshaded and linear
2971 * shaded. This information is needed by the driver.
2972 */
2973 struct agx_interp_info
2974 agx_gather_interp_info(nir_shader *nir)
2975 {
2976 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
2977
2978 struct agx_interp_info masks = {0};
2979 nir_shader_intrinsics_pass(nir, gather_interp, nir_metadata_all, &masks);
2980 return masks;
2981 }
2982
2983 /*
2984 * Build a bit mask of varyings (by location) that are used as texture
2985 * coordinates. This information is needed by lower_mediump_io.
2986 */
2987 uint64_t
2988 agx_gather_texcoords(nir_shader *nir)
2989 {
2990 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
2991
2992 uint64_t mask = 0;
2993 nir_shader_instructions_pass(nir, gather_texcoords, nir_metadata_all, &mask);
2994 return mask;
2995 }
2996
2997 static nir_mem_access_size_align
2998 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
2999 uint8_t bit_size, uint32_t align,
3000 uint32_t align_offset, bool offset_is_const,
3001 const void *cb_data)
3002 {
3003 align = nir_combined_align(align, align_offset);
3004
3005 assert(util_is_power_of_two_nonzero(align));
3006
3007 if ((bytes & 1) || (align == 1))
3008 bit_size = 8;
3009 else if ((bytes & 2) || (align == 2))
3010 bit_size = 16;
3011 else if (bit_size >= 32)
3012 bit_size = 32;
3013
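/* For example (hypothetical query): bytes = 12 with 4-byte alignment keeps
 * bit_size at 32 and returns a 3-component, 4-byte-aligned access.
 */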
3014 return (nir_mem_access_size_align){
3015 .num_components = MIN2(bytes / (bit_size / 8), 4),
3016 .bit_size = bit_size,
3017 .align = bit_size / 8,
3018 };
3019 }
3020
3021 static unsigned
3022 lower_bit_size_callback(const nir_instr *instr, UNUSED void *_)
3023 {
3024 if (instr->type == nir_instr_type_intrinsic) {
3025 /* Handle small subgroup ops */
3026 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3027
3028 switch (intr->intrinsic) {
3029 case nir_intrinsic_reduce:
3030 case nir_intrinsic_exclusive_scan:
3031 case nir_intrinsic_inclusive_scan:
3032 /* The identity for iand doesn't work for lowered 1-bit booleans, so
3033 * lower that explicitly.
3034 */
3035 if (nir_intrinsic_reduction_op(intr) == nir_op_iand &&
3036 intr->def.bit_size == 1)
3037 return 16;
3038
3039 /* In general, we have 16-bit ops instead of 8-bit, so lower those. */
3040 return intr->def.bit_size == 8 ? 16 : 0;
3041 default:
3042 return 0;
3043 }
3044 } else if (instr->type == nir_instr_type_alu) {
3045 /* Lower 8-bit ALU to 16-bit. We check the destination, as we do not want
3046 * to lower conversions from 8-bit to larger types. Those conversions get
3047 * implemented natively.
3048 */
3049 nir_alu_instr *alu = nir_instr_as_alu(instr);
3050 if (alu->def.bit_size == 8 && !is_conversion_to_8bit(alu->op))
3051 return 16;
3052 else if (alu->def.bit_size == 1 && alu->src[0].src.ssa->bit_size == 8)
3053 return 16 /* comparisons */;
3054 }
3055
3056 return 0;
3057 }
3058
3059 static bool
3060 lower_load_from_texture_handle(nir_builder *b, nir_intrinsic_instr *intr,
3061 void *data)
3062 {
3063 if (intr->intrinsic != nir_intrinsic_load_from_texture_handle_agx)
3064 return false;
3065
3066 /* Bindless handles are a vec2, where the first component is the (constant)
3067 * uniform register number and the second component is the byte offset.
3068 */
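/* E.g. a hypothetical handle of (5, 24) loads a 64-bit base from preamble
 * slot 5 and adds a 24-byte offset to form the final address.
 */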
3069 nir_scalar uniform = nir_scalar_resolved(intr->src[0].ssa, 0);
3070 unsigned uniform_idx = nir_scalar_as_uint(uniform);
3071
3072 b->cursor = nir_instr_remove(&intr->instr);
3073 nir_def *base = nir_load_preamble(b, 1, 64, uniform_idx);
3074 nir_def *offset = nir_u2u64(b, nir_channel(b, intr->src[0].ssa, 1));
3075
3076 nir_def_rewrite_uses(&intr->def, nir_iadd(b, base, offset));
3077 return true;
3078 }
3079
3080 static void
3081 agx_remove_unreachable_block(agx_block *block)
3082 {
3083 /* Delete the edges */
3084 agx_foreach_successor(block, succ) {
3085 unsigned block_idx = agx_predecessor_index(succ, block);
3086
3087 /* Remove the corresponding predecessor from the successor */
3088 struct util_dynarray *blocks = &succ->predecessors;
3089 int remaining = agx_num_predecessors(succ) - (block_idx + 1);
3090 assert(remaining >= 0);
3091
3092 memcpy(util_dynarray_element(blocks, agx_block *, block_idx),
3093 util_dynarray_element(blocks, agx_block *, block_idx + 1),
3094 remaining * sizeof(agx_block *));
3095 blocks->size -= sizeof(agx_block *);
3096
3097 /* Remove the corresponding source from the phis */
3098 agx_foreach_phi_in_block(succ, phi) {
3099 assert(block_idx + 1 <= phi->nr_srcs);
3100
3101 memcpy(phi->src + block_idx, phi->src + block_idx + 1,
3102 (phi->nr_srcs - (block_idx + 1)) * sizeof(phi->src[0]));
3103
3104 phi->nr_srcs--;
3105
3106 /* This might cause phis to become trivial. Lower 1-source phis to
3107 * moves and let copyprop take it from here.
3108 */
3109 if (phi->nr_srcs == 1) {
3110 phi->op = AGX_OPCODE_MOV;
3111 }
3112 }
3113 }
3114
3115 /* Remove the successor from the predecessor. */
3116 block->successors[0] = NULL;
3117 block->successors[1] = NULL;
3118
3119 /* Note: we do not remove the block itself, although it is now fully orphaned
3120 * in the control flow graph. We still need it in source order if it has any
3121 * pop_exec instructions, for a loop continue block.
3122 *
3123 * TODO: Is there a better way to handle this?
3124 *
3125 * Affects: dEQP-VK.graphicsfuzz.cov-matching-if-always-true-inside-loop
3126 */
3127 }
3128
3129 /*
3130 * NIR sometimes contains unreachable blocks (e.g. due to infinite loops). These
3131 * blocks have no predecessors, but do have successors and can contribute to
3132 * phis. They are dead and do not need to be here. Further, they violate the IR
3133 * invariant:
3134 *
3135 * Live-in sources are live-out in all predecessors.
3136 *
3137 * ...which RA depends on when handling live range splits. The simplest solution
3138 * is to simply delete these dead blocks. Fortunately, because they are
3139 * unreachable, this does not have any ill effects. Notably, this cannot
3140 * introduce critical edges.
3141 *
3142 * Deleting a block may cause a successor to become unreachable, so we use a
3143 * fixed-point algorithm to converge.
3144 */
3145 static void
3146 agx_remove_unreachable_blocks(agx_context *ctx)
3147 {
3148 agx_block *start = agx_start_block(ctx);
3149 bool progress;
3150
3151 do {
3152 progress = false;
3153
3154 agx_foreach_block_safe(ctx, pred) {
3155 if (pred != start && agx_num_predecessors(pred) == 0 &&
3156 agx_num_successors(pred) > 0) {
3157
3158 agx_remove_unreachable_block(pred);
3159 progress = true;
3160 }
3161 }
3162 } while (progress);
3163 }
3164
3165 static bool
3166 agx_should_dump(nir_shader *nir, unsigned agx_dbg_bit)
3167 {
3168 return (agx_compiler_debug & agx_dbg_bit) &&
3169 !(nir->info.internal && !(agx_compiler_debug & AGX_DBG_INTERNAL));
3170 }
3171
3172 static unsigned
3173 agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
3174 struct agx_shader_key *key,
3175 struct util_debug_callback *debug,
3176 struct util_dynarray *binary,
3177 struct agx_shader_info *out)
3178 {
3179 nir_index_blocks(impl);
3180 nir_index_ssa_defs(impl);
3181
3182 agx_context *ctx = rzalloc(NULL, agx_context);
3183 ctx->nir = nir;
3184 ctx->is_preamble = impl->function->is_preamble;
3185 ctx->out = out;
3186 ctx->key = key;
3187 ctx->stage = nir->info.stage;
3188 ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
3189 ctx->indexed_nir_blocks = rzalloc_array(ctx, agx_block *, impl->num_blocks);
3190 list_inithead(&ctx->blocks);
3191
3192 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble) {
3193 ctx->any_cf = key->fs.inside_sample_loop;
3194 }
3195
3196 ctx->alloc = impl->ssa_alloc;
3197 emit_cf_list(ctx, &impl->body);
3198 agx_emit_phis_deferred(ctx);
3199
3200 /* Index blocks now that we're done emitting so the order is consistent. Do
3201 * this before agx_remove_unreachable_blocks so we match NIR indexing. This
3202 * makes for easier debugging.
3203 */
3204 agx_foreach_block(ctx, block) {
3205 block->index = ctx->num_blocks++;
3206 }
3207
3208 agx_remove_unreachable_blocks(ctx);
3209
3210 /* Only allocate scratch if it's statically used, regardless of whether the NIR
3211 * info claims otherwise.
3212 */
3213 if (ctx->any_scratch) {
3214 assert(!ctx->is_preamble && "preambles don't use scratch");
3215 ctx->scratch_size = ALIGN(nir->scratch_size, 16);
3216 }
3217
3218 /* Stop the main shader or preamble shader after the exit block. For real
3219 * functions, we would return here.
3220 */
3221 if (!ctx->key->no_stop || ctx->is_preamble) {
3222 agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
3223 agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
3224 agx_stop(&_b);
3225 }
3226
3227 agx_validate(ctx, "IR translation");
3228
3229 if (likely(!(agx_compiler_debug & AGX_DBG_NOOPT))) {
3230 /* Eliminate dead instructions before CSE to avoid silly scheduling */
3231 agx_dce(ctx, false);
3232
3233 /* CSE before eliminating dead destinations so that subdivision is
3234 * optimized properly.
3235 */
3236 agx_opt_cse(ctx);
3237
3238 /* After DCE, use counts are right so we can run the optimizer. */
3239 agx_optimizer(ctx);
3240 agx_opt_compact_constants(ctx);
3241
3242 /* After inlining constants, promote what's left */
3243 if (key->promote_constants && !key->secondary &&
3244 !(agx_compiler_debug & AGX_DBG_NOPROMOTE)) {
3245 agx_opt_promote_constants(ctx);
3246 }
3247 }
3248
3249 /* Lower uniform sources after copyprop, which is required for correctness
3250 * (copyprop creates uniform sources). To keep register pressure in
3251 * check, lower after CSE, since moves are cheaper than registers.
3252 */
3253 agx_lower_uniform_sources(ctx);
3254
3255 /* RA correctness depends on DCE */
3256 agx_dce(ctx, true);
3257 agx_validate(ctx, "Pre-RA passes");
3258
3259 if (agx_should_dump(nir, AGX_DBG_SHADERS))
3260 agx_print_shader(ctx, stdout);
3261
3262 if (likely(!(agx_compiler_debug & AGX_DBG_NOSCHED))) {
3263 agx_pressure_schedule(ctx);
3264 agx_validate(ctx, "Pre-RA scheduler");
3265 }
3266
3267 if (agx_should_dump(nir, AGX_DBG_SHADERS))
3268 agx_print_shader(ctx, stdout);
3269
3270 agx_ra(ctx);
3271 agx_validate(ctx, "RA");
3272 agx_lower_64bit_postra(ctx);
3273
3274 if (ctx->scratch_size > 0) {
3275 /* Apple always allocates 40 more bytes in the entrypoint and aligns to 4. */
3276 uint64_t stack_size = ALIGN(DIV_ROUND_UP(ctx->scratch_size, 4) + 10, 4);
3277
3278 assert(stack_size < INT16_MAX);
3279
3280 agx_block *start_block = agx_start_block(ctx);
3281 agx_builder _b = agx_init_builder(ctx, agx_before_block(start_block));
3282 agx_stack_adjust(&_b, stack_size);
3283
3284 /* If we're going to execute multiple times, make sure we clean up after
3285 * ourselves, else the hardware faults.
3286 */
3287 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble &&
3288 ctx->key->fs.inside_sample_loop) {
3289
3290 _b = agx_init_builder(ctx, agx_after_block(agx_end_block(ctx)));
3291 agx_stack_adjust(&_b, -stack_size);
3292 }
3293
3294 if (ctx->is_preamble)
3295 out->preamble_scratch_size = stack_size;
3296 else
3297 out->scratch_size = stack_size;
3298 }
3299
3300 if (ctx->stage == MESA_SHADER_VERTEX && !impl->function->is_preamble &&
3301 !ctx->key->secondary)
3302 agx_set_st_vary_final(ctx);
3303
3304 agx_insert_waits(ctx);
3305 agx_opt_empty_else(ctx);
3306 agx_opt_break_if(ctx);
3307 agx_opt_jmp_none(ctx);
3308
3309 if (ctx->any_quad_divergent_shuffle)
3310 agx_lower_divergent_shuffle(ctx);
3311
3312 agx_lower_pseudo(ctx);
3313
3314 if (agx_should_dump(nir, AGX_DBG_SHADERS))
3315 agx_print_shader(ctx, stdout);
3316
3317 /* Pad binary */
3318 if (binary->size % AGX_CODE_ALIGN) {
3319 unsigned ngrow = AGX_CODE_ALIGN - (binary->size % AGX_CODE_ALIGN);
3320 memset(util_dynarray_grow_bytes(binary, ngrow, 1), 0, ngrow);
3321 }
3322
3323 unsigned offset = binary->size;
3324 assert((offset % AGX_CODE_ALIGN) == 0);
3325
3326 agx_pack_binary(ctx, binary);
3327
3328 unsigned nr_gprs = ctx->max_reg + 1;
3329
3330 /* If the preamble uses scratch (due to spilling), we need to set maximal
3331 * GPRs. Do it here so the driver doesn't have to worry about it.
3332 */
3333 if (impl->function->is_preamble)
3334 out->nr_preamble_gprs = ctx->scratch_size ? 256 : nr_gprs;
3335 else
3336 out->nr_gprs = nr_gprs;
3337
3338 /* Don't dump statistics for preambles, since they're not worth optimizing */
3339 if (!impl->function->is_preamble) {
3340 char *stats;
3341 int ret = agx_dump_stats(ctx, binary->size, &stats);
3342
3343 if (ret >= 0) {
3344 if (agx_should_dump(nir, AGX_DBG_SHADERDB)) {
3345 fprintf(stderr, "SHADER-DB: %s - %s\n", nir->info.label ?: "",
3346 stats);
3347 }
3348
3349 if (debug)
3350 util_debug_message(debug, SHADER_INFO, "%s", stats);
3351
3352 free(stats);
3353 }
3354 }
3355
3356 ralloc_free(ctx);
3357
3358 return offset;
3359 }
3360
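/*
 * Link the libagx helper library into a shader: resolve calls into libagx,
 * inline everything into the entrypoint, then clean up leftover functions,
 * derefs, and temporaries, giving the remaining temporaries explicit
 * (OpenCL-style) layouts.
 */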
3361 void
3362 agx_link_libagx(nir_shader *nir, const nir_shader *libagx)
3363 {
3364 nir_link_shader_functions(nir, libagx);
3365 NIR_PASS(_, nir, nir_inline_functions);
3366 nir_remove_non_entrypoints(nir);
3367 NIR_PASS(_, nir, nir_opt_deref);
3368 NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3369 NIR_PASS(_, nir, nir_remove_dead_derefs);
3370 NIR_PASS(_, nir, nir_remove_dead_variables,
3371 nir_var_function_temp | nir_var_shader_temp, NULL);
3372 NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
3373 nir_var_shader_temp | nir_var_function_temp,
3374 glsl_get_cl_type_size_align);
3375 }
3376
3377 /*
3378 * The hardware frcp instruction is sometimes off by 1 ULP. For correctly
3379 * rounded frcp, a refinement step is required. This routine has been
3380 * exhaustively tested with a modified math_bruteforce.
3381 *
3382 * While Khronos APIs allow 2.5 ULP error for divides, nir_lower_idiv relies on
3383  * correctly rounded frcp. This is therefore load-bearing for integer division
3384 * on all APIs.
3385 */
3386 static nir_def *
3387 libagx_frcp(nir_builder *b, nir_def *x)
3388 {
3389 nir_def *u = nir_frcp(b, x);
3390
3391 /* Do 1 Newton-Raphson refinement step.
3392 *
3393 * Define f(u) = xu - 1. Then f(u) = 0 iff u = 1/x. Newton's method gives:
3394 *
3395 * u_2 = u - f(u) / f'(u) = u - (xu - 1) / x
3396 *
3397 * Our original guess is close, so we approximate (1 / x) by u:
3398 *
3399 * u_2 = u - u(xu - 1) = u + u(1 - xu)
3400 * = fma(fma(-x, u, 1), u, u)
3401 */
3402 nir_def *one = nir_imm_float(b, 1.0);
3403 nir_def *u_2 = nir_ffma(b, nir_ffma(b, nir_fneg(b, x), u, one), u, u);
3404
3405 /* If the original value was infinite, frcp will generate the correct zero.
3406 * However, the Newton-Raphson step would multiply 0 * Inf and get a NaN. So
3407 * skip the refinement step for infinite inputs. We do this backwards,
3408 * checking whether the refined result is NaN, since we can implement this
3409 * check in a single fcmpsel instruction. The other case where the refinement
3410     * is NaN is a NaN input, in which case skipping refinement is acceptable.
3411 */
3412 return nir_bcsel(b, nir_fisnan(b, u_2), u, u_2);
3413 }
3414
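/* Replace exact 32-bit frcp instructions with the refined sequence above;
 * run via nir_shader_alu_pass from agx_preprocess_nir.
 */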
3415 static bool
3416 agx_nir_lower_fdiv(nir_builder *b, nir_alu_instr *alu, void *_)
3417 {
3418 if (alu->op != nir_op_frcp || !alu->exact || alu->def.bit_size != 32)
3419 return false;
3420
3421 b->cursor = nir_before_instr(&alu->instr);
3422 nir_def_replace(&alu->def, libagx_frcp(b, nir_ssa_for_alu_src(b, alu, 0)));
3423 return true;
3424 }
3425
3426 /* Preprocess NIR independent of shader state */
3427 void
3428 agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
3429 {
3430 NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3431
3432 /* Lower large arrays to scratch and small arrays to csel */
3433 NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
3434 glsl_get_natural_size_align_bytes,
3435 glsl_get_natural_size_align_bytes);
3436 NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
3437 NIR_PASS(_, nir, nir_split_var_copies);
3438 NIR_PASS(_, nir, nir_lower_global_vars_to_local);
3439 NIR_PASS(_, nir, nir_lower_var_copies);
3440
3441 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3442 NIR_PASS(_, nir, agx_nir_lower_frag_sidefx);
3443 }
3444
3445 /* Clean up deref gunk after lowering I/O */
3446 NIR_PASS(_, nir, nir_opt_dce);
3447
3448 agx_link_libagx(nir, libagx);
3449
3450    /* This must run before we lower away idiv in order to do anything, but
3451     * after lowering textures, since the cube map array lowering generates
3452     * division by 6. */
3453 NIR_PASS(_, nir, nir_opt_idiv_const, 16);
3454
3455 nir_lower_idiv_options idiv_options = {
3456 .allow_fp16 = true,
3457 };
3458
3459 NIR_PASS(_, nir, nir_lower_idiv, &idiv_options);
3460 NIR_PASS(_, nir, nir_lower_frexp);
3461 NIR_PASS(_, nir, nir_lower_alu);
3462 NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3463 NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3464 NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false);
3465 NIR_PASS(_, nir, agx_lower_sincos);
3466 NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_lower_front_face,
3467 nir_metadata_control_flow, NULL);
3468 NIR_PASS(_, nir, agx_nir_lower_subgroups);
3469 NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
3470 NIR_PASS(_, nir, nir_shader_alu_pass, agx_nir_lower_fdiv,
3471 nir_metadata_control_flow, NULL);
3472
3473 /* After lowering, run through the standard suite of NIR optimizations. We
3474 * will run through the loop later, once we have the shader key, but if we
3475 * run now, that run will ideally be almost a no-op.
3476 */
3477 agx_optimize_loop_nir(nir);
3478
3479 NIR_PASS(_, nir, nir_opt_deref);
3480 NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3481
3482    /* We've lowered away all variables. Remove them all for smaller shaders. */
3483 NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
3484 nir->info.io_lowered = true;
3485
3486    /* Sink and move instructions now, before the later backend lowering runs */
3487 nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
3488 nir_move_load_input | nir_move_comparisons |
3489 nir_move_copies | nir_move_load_ssbo |
3490 nir_move_alu;
3491
3492 NIR_PASS(_, nir, nir_opt_sink, move_all);
3493 NIR_PASS(_, nir, nir_opt_move, move_all);
3494 NIR_PASS(_, nir, agx_nir_lower_shared_bitsize);
3495 }
3496
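/*
 * Compile a preprocessed, specialized NIR shader into an AGX binary. Each NIR
 * function (preamble and entrypoint) is compiled and packed separately, and
 * shader-wide metadata is written to out->info.
 */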
3497 void
3498 agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
3499 struct util_debug_callback *debug,
3500 struct agx_shader_part *out)
3501 {
3502 agx_compiler_debug = agx_get_compiler_debug();
3503 struct agx_shader_info *info = &out->info;
3504
3505 struct util_dynarray binary;
3506 util_dynarray_init(&binary, NULL);
3507
3508 memset(out, 0, sizeof *out);
3509
3510 assert(nir->info.io_lowered &&
3511 "agx_preprocess_nir is called first, then the shader is specalized,"
3512 "then the specialized shader is compiled");
3513
3514 /* If required, tag writes will be enabled by instruction selection */
3515 if (nir->info.stage == MESA_SHADER_FRAGMENT)
3516 info->tag_write_disable = !nir->info.writes_memory;
3517
3518 bool needs_libagx = true /* TODO: Optimize */;
3519
3520 NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
3521 NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3522
3523 if (needs_libagx) {
3524 agx_link_libagx(nir, key->libagx);
3525
3526 NIR_PASS(_, nir, nir_opt_deref);
3527 NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3528 NIR_PASS(_, nir, nir_lower_explicit_io,
3529 nir_var_shader_temp | nir_var_function_temp |
3530 nir_var_mem_shared | nir_var_mem_global,
3531 nir_address_format_62bit_generic);
3532 }
3533
3534 /* Late sysval lowering creates large loads. Load lowering creates unpacks */
3535 nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
3536 .modes = nir_var_mem_ssbo | nir_var_mem_constant |
3537 nir_var_mem_task_payload | nir_var_shader_temp |
3538 nir_var_function_temp | nir_var_mem_global | nir_var_mem_shared,
3539 .callback = mem_access_size_align_cb,
3540 };
3541 NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
3542
3543    /* Clean up 8-bit math before lowering bit sizes */
3544 bool progress;
3545 do {
3546 progress = false;
3547
3548 NIR_PASS(progress, nir, nir_opt_algebraic);
3549 NIR_PASS(progress, nir, nir_opt_constant_folding);
3550 NIR_PASS(progress, nir, nir_opt_dce);
3551 } while (progress);
3552
3553 NIR_PASS(_, nir, nir_lower_bit_size, lower_bit_size_callback, NULL);
3554
3555 /* Late blend lowering creates vectors */
3556 NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3557 NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3558
3559 /* Late VBO lowering creates constant udiv instructions */
3560 NIR_PASS(_, nir, nir_opt_idiv_const, 16);
3561
3562 NIR_PASS(_, nir, nir_opt_constant_folding);
3563 NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_from_texture_handle,
3564 nir_metadata_control_flow, NULL);
3565
3566 info->push_count = key->reserved_preamble;
3567 agx_optimize_nir(nir, key->dev.soft_fault,
3568 key->secondary ? NULL : &info->push_count);
3569
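   /* Fragment varyings are interpolated through coefficient registers;
    * allocation starts at the base index supplied by the shader key.
    */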
3570 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3571 info->varyings.fs.nr_cf = key->fs.cf_base;
3572 assign_coefficient_regs(nir, &info->varyings.fs);
3573 }
3574
3575 if (agx_should_dump(nir, AGX_DBG_SHADERS))
3576 nir_print_shader(nir, stdout);
3577
3578 info->local_size = nir->info.shared_size;
3579
3580 nir_foreach_function_with_impl(func, impl, nir) {
3581 unsigned offset =
3582 agx_compile_function_nir(nir, impl, key, debug, &binary, &out->info);
3583
3584 if (func->is_preamble) {
3585 info->preamble_offset = offset;
3586 info->has_preamble = true;
3587 } else if (func->is_entrypoint) {
3588 info->main_offset = offset;
3589 info->main_size = binary.size - offset;
3590 } else {
3591 unreachable("General functions not yet supported");
3592 }
3593 }
3594
3595 info->stage = nir->info.stage;
3596
3597 /* Check these outside the stage check since nir->info.stage is the hardware
3598 * stage and these are read in the vertex *software* stage.
3599 */
3600 info->uses_draw_id =
3601 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
3602
3603 info->uses_base_param =
3604 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) ||
3605 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
3606
3607 if (nir->info.stage == MESA_SHADER_VERTEX) {
3608 info->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT;
3609
3610 info->writes_layer_viewport =
3611 nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
3612
3613 } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3614 info->disable_tri_merging = nir->info.uses_wide_subgroup_intrinsics ||
3615 nir->info.fs.needs_quad_helper_invocations ||
3616 nir->info.writes_memory;
3617
3618 /* Writing the sample mask requires tag writes */
3619 info->tag_write_disable &= !info->writes_sample_mask;
3620
3621 /* Report a canonical depth layout. This happens at the end because the
3622 * sample mask lowering affects it.
3623 */
3624 enum gl_frag_depth_layout layout = nir->info.fs.depth_layout;
3625
3626 if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
3627 info->depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
3628 else if (layout == FRAG_DEPTH_LAYOUT_NONE)
3629 info->depth_layout = FRAG_DEPTH_LAYOUT_ANY;
3630 else
3631 info->depth_layout = layout;
3632
3633 info->reads_tib = nir->info.fs.uses_fbfetch_output;
3634 info->early_fragment_tests = nir->info.fs.early_fragment_tests;
3635 } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
3636 info->imageblock_stride = nir->info.cs.image_block_size_per_thread_agx;
3637 }
3638
3639 out->binary = binary.data;
3640 out->binary_size = binary.size;
3641 }
3642