1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 * Copyright (C) 2022 Alyssa Rosenzweig <[email protected]>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 * Authors (Collabora):
25 * Alyssa Rosenzweig <[email protected]>
26 */
27
28 #include "compiler/glsl/glsl_to_nir.h"
29 #include "compiler/glsl_types.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "util/u_debug.h"
32
33 #include "bifrost/disassemble.h"
34 #include "panfrost/lib/pan_props.h"
35 #include "valhall/disassemble.h"
36 #include "valhall/va_compiler.h"
37 #include "bi_builder.h"
38 #include "bi_quirks.h"
39 #include "bifrost_compile.h"
40 #include "bifrost_nir.h"
41 #include "compiler.h"
42
43 /* clang-format off */
44 static const struct debug_named_value bifrost_debug_options[] = {
45 {"msgs", BIFROST_DBG_MSGS, "Print debug messages"},
46 {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"},
47 {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"},
48 {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"},
49 {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"},
50 {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"},
51 {"nopsched", BIFROST_DBG_NOPSCHED, "Disable scheduling for pressure"},
52 {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"},
53 {"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
54 {"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"},
55 {"noidvs", BIFROST_DBG_NOIDVS, "Disable IDVS"},
56 {"nosb", BIFROST_DBG_NOSB, "Disable scoreboarding"},
57 {"nopreload", BIFROST_DBG_NOPRELOAD, "Disable message preloading"},
58 {"spill", BIFROST_DBG_SPILL, "Test register spilling"},
59 DEBUG_NAMED_VALUE_END
60 };
61 /* clang-format on */
62
63 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG",
64 bifrost_debug_options, 0)
65
/* How many bytes are prefetched by the Bifrost shader core. Starting from the
 * final clause of the shader, this range must contain valid instructions or
 * zeros. */
68 #define BIFROST_SHADER_PREFETCH 128
69
70 int bifrost_debug = 0;
71
72 #define DBG(fmt, ...) \
73 do { \
74 if (bifrost_debug & BIFROST_DBG_MSGS) \
75 fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__); \
76 } while (0)
77
78 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
79
80 static bi_index
bi_preload(bi_builder *b, unsigned reg)
82 {
83 if (bi_is_null(b->shader->preloaded[reg])) {
84 /* Insert at the beginning of the shader */
85 bi_builder b_ = *b;
86 b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
87
88 /* Cache the result */
89 b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
90 }
91
92 return b->shader->preloaded[reg];
93 }
94
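/* The coverage mask starts out preloaded in r60 by ISA convention. ATEST and
 * ZS_EMIT later overwrite b->shader->coverage, so this helper always returns
 * the current mask. */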
95 static bi_index
bi_coverage(bi_builder *b)
97 {
98 if (bi_is_null(b->shader->coverage))
99 b->shader->coverage = bi_preload(b, 60);
100
101 return b->shader->coverage;
102 }
103
104 /*
 * Vertex ID and Instance ID are preloaded registers. The registers they are
 * preloaded into changed from Bifrost to Valhall. Provide helpers that smooth
 * over the architectural difference.
108 */
109 static inline bi_index
bi_vertex_id(bi_builder *b)
111 {
112 return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
113 }
114
115 static inline bi_index
bi_instance_id(bi_builder *b)
117 {
118 return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
119 }
120
121 static inline bi_index
bi_draw_id(bi_builder *b)
123 {
124 assert(b->shader->arch >= 9);
125 return bi_preload(b, 62);
126 }
127
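/* NIR break/continue both lower to an unconditional JUMP; record the branch
 * target and the CFG successor so the rest of the backend sees the edge. */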
128 static void
bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
130 {
131 bi_instr *branch = bi_jump(b, bi_zero());
132
133 switch (instr->type) {
134 case nir_jump_break:
135 branch->branch_target = b->shader->break_block;
136 break;
137 case nir_jump_continue:
138 branch->branch_target = b->shader->continue_block;
139 break;
140 default:
141 unreachable("Unhandled jump type");
142 }
143
144 bi_block_add_successor(b->shader->current_block, branch->branch_target);
145 b->shader->current_block->unconditional_jumps = true;
146 }
147
148 /* Builds a 64-bit hash table key for an index */
149 static uint64_t
bi_index_to_key(bi_index idx)
151 {
152 static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
153
154 uint64_t key = 0;
155 memcpy(&key, &idx, sizeof(idx));
156 return key;
157 }
158
159 /*
160 * Extract a single channel out of a vector source. We split vectors with SPLIT
161 * so we can use the split components directly, without emitting an extract.
 * This has advantages for RA, as the split can usually be optimized away.
163 */
164 static bi_index
bi_extract(bi_builder *b, bi_index vec, unsigned channel)
166 {
167 bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
168 bi_index_to_key(vec));
169
170 /* No extract needed for scalars.
171 *
172 * This is a bit imprecise, but actual bugs (missing splits for vectors)
173 * should be caught by the following assertion. It is too difficult to
174 * ensure bi_extract is only called for real vectors.
175 */
176 if (components == NULL && channel == 0)
177 return vec;
178
179 assert(components != NULL && "missing bi_cache_collect()");
180 return components[channel];
181 }
182
183 static void
bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
185 {
186 /* Lifetime of a hash table entry has to be at least as long as the table */
187 bi_index *channels = ralloc_array(b->shader, bi_index, n);
188 memcpy(channels, s, sizeof(bi_index) * n);
189
190 _mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst),
191 channels);
192 }
193
194 /*
195 * Splits an n-component vector (vec) into n scalar destinations (dests) using a
196 * split pseudo-instruction.
197 *
198 * Pre-condition: dests is filled with bi_null().
199 */
200 static void
bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
202 {
203 /* Setup the destinations */
204 for (unsigned i = 0; i < n; ++i) {
205 dests[i] = bi_temp(b->shader);
206 }
207
208 /* Emit the split */
209 if (n == 1) {
210 bi_mov_i32_to(b, dests[0], vec);
211 } else {
212 bi_instr *I = bi_split_i32_to(b, n, vec);
213
214 bi_foreach_dest(I, j)
215 I->dest[j] = dests[j];
216 }
217 }
218
219 static void
bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
221 {
222 bi_index dests[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
223 bi_emit_split_i32(b, dests, vec, n);
224 bi_cache_collect(b, vec, dests, n);
225 }
226
227 /*
228 * Emit and cache a split for a vector of a given bitsize. The vector may not be
229 * composed of 32-bit words, but it will be split at 32-bit word boundaries.
230 */
231 static void
bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
233 {
234 bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
235 }
236
237 static void
bi_split_def(bi_builder *b, nir_def *def)
239 {
240 bi_emit_cached_split(b, bi_def_index(def),
241 def->bit_size * def->num_components);
242 }
243
244 static bi_instr *
bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
246 {
247 /* Special case: COLLECT of a single value is a scalar move */
248 if (n == 1)
249 return bi_mov_i32_to(b, dst, chan[0]);
250
251 bi_instr *I = bi_collect_i32_to(b, dst, n);
252
253 bi_foreach_src(I, i)
254 I->src[i] = chan[i];
255
256 bi_cache_collect(b, dst, chan, n);
257 return I;
258 }
259
260 static bi_instr *
bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
262 {
263 return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1}, 2);
264 }
265
266 static bi_instr *
bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1,
268 bi_index s2)
269 {
270 return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1, s2}, 3);
271 }
272
273 static bi_index
bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
275 {
276 bi_index dst = bi_temp(b->shader);
277 bi_collect_v2i32_to(b, dst, s0, s1);
278 return dst;
279 }
280
281 static bi_index
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
283 {
284 switch (intr->intrinsic) {
285 case nir_intrinsic_load_barycentric_centroid:
286 case nir_intrinsic_load_barycentric_sample:
287 return bi_preload(b, 61);
288
289 /* Need to put the sample ID in the top 16-bits */
290 case nir_intrinsic_load_barycentric_at_sample:
291 return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
292 bi_half(bi_src_index(&intr->src[0]), false));
293
294 /* Interpret as 8:8 signed fixed point positions in pixels along X and
295 * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
296 * is the center of the pixel so we first fixup and then convert. For
297 * fp16 input:
298 *
299 * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
300 * f2i16((256 * (x, y)) + (128, 128)) =
301 * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
302 *
303 * For fp32 input, that lacks enough precision for MSAA 16x, but the
304 * idea is the same. FIXME: still doesn't pass
305 */
306 case nir_intrinsic_load_barycentric_at_offset: {
307 bi_index offset = bi_src_index(&intr->src[0]);
308 bi_index f16 = bi_null();
309 unsigned sz = nir_src_bit_size(intr->src[0]);
310
311 if (sz == 16) {
312 f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0));
313 } else {
314 assert(sz == 32);
315 bi_index f[2];
316 for (unsigned i = 0; i < 2; ++i) {
317 f[i] =
318 bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5),
319 bi_imm_u32(8), BI_SPECIAL_NONE);
320 }
321
322 f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
323 }
324
325 return bi_v2f16_to_v2s16(b, f16);
326 }
327
328 case nir_intrinsic_load_barycentric_pixel:
329 default:
330 return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
331 }
332 }
333
334 static enum bi_sample
bi_interp_for_intrinsic(nir_intrinsic_op op)
336 {
337 switch (op) {
338 case nir_intrinsic_load_barycentric_centroid:
339 return BI_SAMPLE_CENTROID;
340 case nir_intrinsic_load_barycentric_sample:
341 case nir_intrinsic_load_barycentric_at_sample:
342 return BI_SAMPLE_SAMPLE;
343 case nir_intrinsic_load_barycentric_at_offset:
344 return BI_SAMPLE_EXPLICIT;
345 case nir_intrinsic_load_barycentric_pixel:
346 default:
347 return BI_SAMPLE_CENTER;
348 }
349 }
350
351 /* auto, 64-bit omitted */
352 static enum bi_register_format
bi_reg_fmt_for_nir(nir_alu_type T)
354 {
355 switch (T) {
356 case nir_type_float16:
357 return BI_REGISTER_FORMAT_F16;
358 case nir_type_float32:
359 return BI_REGISTER_FORMAT_F32;
360 case nir_type_int16:
361 return BI_REGISTER_FORMAT_S16;
362 case nir_type_uint16:
363 return BI_REGISTER_FORMAT_U16;
364 case nir_type_int32:
365 return BI_REGISTER_FORMAT_S32;
366 case nir_type_uint32:
367 return BI_REGISTER_FORMAT_U32;
368 default:
369 unreachable("Invalid type for register format");
370 }
371 }
372
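/* Valhall resource handles pack a descriptor table and an index. The narrow
 * constant encoding only covers indices below 1024 within one of the constant
 * resource tables, so both must be checked before it is used. */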
373 static bool
va_is_valid_const_narrow_index(bi_index idx)
375 {
376 if (idx.type != BI_INDEX_CONSTANT)
377 return false;
378
379 unsigned index = pan_res_handle_get_index(idx.value);
380 unsigned table_index = pan_res_handle_get_table(idx.value);
381
382 return index < 1024 && va_is_valid_const_table(table_index);
383 }
384
385 /* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
386 * immediate to be used (which applies even if _IMM can't be used) */
387
388 static bool
bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate,
390 unsigned max)
391 {
392 nir_src *offset = nir_get_io_offset_src(instr);
393
394 if (!nir_src_is_const(*offset))
395 return false;
396
397 *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
398 return (*immediate) < max;
399 }
400
401 static bool
bi_is_imm_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
403 uint32_t *immediate, unsigned max)
404 {
405 nir_src *offset = nir_get_io_offset_src(instr);
406
407 if (!nir_src_is_const(*offset))
408 return false;
409
410 if (b->shader->arch >= 9) {
411 uint32_t res_handle =
412 nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
413 uint32_t table_index = pan_res_handle_get_table(res_handle);
414 uint32_t res_index = pan_res_handle_get_index(res_handle);
415
416 if (!va_is_valid_const_table(table_index) || res_index >= max)
417 return false;
418
419 *immediate = res_handle;
420 return true;
421 }
422
423 return bi_is_intr_immediate(instr, immediate, max);
424 }
425
426 static bool
bi_is_imm_var_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
428 uint32_t *immediate)
429 {
430 unsigned max = b->shader->arch >= 9 ? 256 : 20;
431
432 return bi_is_imm_desc_handle(b, instr, immediate, max);
433 }
434
435 static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
436 unsigned *channel, unsigned count, unsigned bitsize);
437
438 /* Bifrost's load instructions lack a component offset despite operating in
439 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
440 * but they may be unavoidable with separate shaders in use. To solve this, we
441 * lower to a larger load and an explicit copy of the desired components. */
442
443 static void
bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
445 {
446 unsigned component = nir_intrinsic_component(instr);
447 unsigned nr = instr->num_components;
448 unsigned total = nr + component;
449 unsigned bitsize = instr->def.bit_size;
450
451 assert(total <= 4 && "should be vec4");
452 bi_emit_cached_split(b, tmp, total * bitsize);
453
454 if (component == 0)
455 return;
456
457 bi_index srcs[] = {tmp, tmp, tmp};
458 unsigned channels[] = {component, component + 1, component + 2};
459
460 bi_make_vec_to(b, bi_def_index(&instr->def), srcs, channels, nr,
461 instr->def.bit_size);
462 }
463
464 static void
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
466 {
467 /* Disregard the signedness of an integer, since loading 32-bits into a
468 * 32-bit register should be bit exact so should not incur any clamping.
469 *
470 * If we are reading as a u32, then it must be paired with an integer (u32 or
471 * s32) source, so use .auto32 to disregard.
472 */
473 nir_alu_type T = nir_intrinsic_dest_type(instr);
474 assert(T == nir_type_uint32 || T == nir_type_int32 || T == nir_type_float32);
475 enum bi_register_format regfmt =
476 T == nir_type_float32 ? BI_REGISTER_FORMAT_F32 : BI_REGISTER_FORMAT_AUTO;
477
478 nir_src *offset = nir_get_io_offset_src(instr);
479 unsigned component = nir_intrinsic_component(instr);
480 enum bi_vecsize vecsize = (instr->num_components + component - 1);
481 unsigned imm_index = 0;
482 unsigned base = nir_intrinsic_base(instr);
483 bool constant = nir_src_is_const(*offset);
484 bool immediate = bi_is_imm_desc_handle(b, instr, &imm_index, 16);
485 bi_index dest =
486 (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
487 bi_instr *I;
488
489 if (immediate) {
490 I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), bi_instance_id(b), regfmt,
491 vecsize, pan_res_handle_get_index(imm_index));
492
493 if (b->shader->arch >= 9)
494 I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
495 } else {
496 bi_index idx = bi_src_index(&instr->src[0]);
497
498 if (constant)
499 idx = bi_imm_u32(imm_index);
500 else if (base != 0)
501 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
502
503 I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), idx,
504 regfmt, vecsize);
505 }
506
507 bi_copy_component(b, instr, dest);
508 }
509
510 /*
511 * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
512 * come later, sparsely packed. This handles both linked and separable shaders
513 * with a common code path, with minimal keying only for desktop GL. Each slot
514 * consumes 16 bytes (TODO: fp16, partial vectors).
515 */
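/* For example, with two special slots set in fixed_varying_mask, general
 * varying VAR2 starts at byte 16 * (2 + 2) = 64, while the second special slot
 * itself starts at byte 16. */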
516 static unsigned
bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
518 {
519 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
520 uint32_t mask = ctx->inputs->fixed_varying_mask;
521
522 if (sem.location >= VARYING_SLOT_VAR0) {
523 unsigned nr_special = util_bitcount(mask);
524 unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
525
526 return 16 * (nr_special + general_index);
527 } else {
528 return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
529 }
530 }
531
532 /*
533 * Compute the offset in bytes of a varying with an immediate offset, adding the
534 * offset to the base computed above. Convenience method.
535 */
536 static unsigned
bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
538 {
539 nir_src *src = nir_get_io_offset_src(intr);
540 assert(nir_src_is_const(*src) && "assumes immediate offset");
541
542 return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
543 }
544
545 static void
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
547 {
548 enum bi_sample sample = BI_SAMPLE_CENTER;
549 enum bi_update update = BI_UPDATE_STORE;
550 enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
551 bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
552 bi_index src0 = bi_null();
553
554 unsigned component = nir_intrinsic_component(instr);
555 enum bi_vecsize vecsize = (instr->num_components + component - 1);
556 bi_index dest =
557 (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
558
559 unsigned sz = instr->def.bit_size;
560
561 if (smooth) {
562 nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
563 assert(parent);
564
565 sample = bi_interp_for_intrinsic(parent->intrinsic);
566 src0 = bi_varying_src0_for_barycentric(b, parent);
567
568 assert(sz == 16 || sz == 32);
569 regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32;
570 } else {
571 assert(sz == 32);
572 regfmt = BI_REGISTER_FORMAT_U32;
573
574 /* Valhall can't have bi_null() here, although the source is
575 * logically unused for flat varyings
576 */
577 if (b->shader->arch >= 9)
578 src0 = bi_preload(b, 61);
579
580 /* Gather info as we go */
581 b->shader->info.bifrost->uses_flat_shading = true;
582 }
583
584 enum bi_source_format source_format =
585 smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
586
587 nir_src *offset = nir_get_io_offset_src(instr);
588 unsigned imm_index = 0;
589 bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
590 unsigned base = nir_intrinsic_base(instr);
591
592 /* On Valhall, ensure the table and index are valid for usage with immediate
593 * form when IDVS isn't used */
594 if (b->shader->arch >= 9 && !b->shader->malloc_idvs)
595 immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
596 pan_res_handle_get_index(base) < 256;
597
598 if (b->shader->malloc_idvs && immediate) {
599 /* Immediate index given in bytes. */
600 bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
601 update, vecsize,
602 bi_varying_offset(b->shader, instr));
603 } else if (immediate) {
604 bi_instr *I;
605
606 if (smooth) {
607 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
608 pan_res_handle_get_index(imm_index));
609 } else {
610 I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
611 pan_res_handle_get_index(imm_index));
612 }
613
614 /* Valhall usually uses machine-allocated IDVS. If this is disabled,
615 * use a simple Midgard-style ABI.
616 */
617 if (b->shader->arch >= 9)
618 I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
619 } else {
620 bi_index idx = bi_src_index(offset);
621
622 if (b->shader->malloc_idvs) {
623 /* Index needs to be in bytes, but NIR gives the index
624 * in slots. For now assume 16 bytes per element.
625 */
626 bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
627 unsigned vbase = bi_varying_base_bytes(b->shader, instr);
628
629 if (vbase != 0)
            idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
631
632 bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
633 source_format, update, vecsize);
634 } else {
635 if (base != 0)
636 idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
637
638 if (smooth)
639 bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize);
640 else
641 bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize);
642 }
643 }
644
645 bi_copy_component(b, instr, dest);
646 }
647
648 static bi_index
bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel,
650 unsigned count)
651 {
652 assert(1 <= count && count <= 4);
653
654 bi_index bytes[4] = {bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0)};
655
656 for (unsigned i = 0; i < count; ++i) {
657 unsigned chan = channel ? channel[i] : 0;
658 unsigned lane = chan & 3;
659 bi_index raw_data = bi_extract(b, src[i], chan >> 2);
660
661 /* On Bifrost, MKVEC.v4i8 cannot select b1 or b3 */
662 if (b->shader->arch < 9 && lane != 0 && lane != 2) {
663 bytes[i] = bi_byte(bi_rshift_or(b, 32, raw_data, bi_zero(),
664 bi_imm_u8(lane * 8), false),
665 0);
666 } else {
667 bytes[i] = bi_byte(raw_data, lane);
668 }
669
670 assert(b->shader->arch >= 9 || bytes[i].swizzle == BI_SWIZZLE_B0000 ||
671 bytes[i].swizzle == BI_SWIZZLE_B2222);
672 }
673
674 if (b->shader->arch >= 9) {
675 bi_index vec = bi_zero();
676
677 if (count >= 3)
678 vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
679
680 return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
681 } else {
682 return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
683 }
684 }
685
686 static bi_index
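/* Pack up to two 16-bit channels into one 32-bit word. If both halves come
 * from the same source word, a plain move or a single swizzle suffices instead
 * of an MKVEC. */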
bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel,
688 unsigned count)
689 {
690 unsigned chan0 = channel ? channel[0] : 0;
691 bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
692 bi_index h0 = bi_half(w0, chan0 & 1);
693
694 /* Zero extend */
695 if (count == 1)
696 return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
697
698 /* Else, create a vector */
699 assert(count == 2);
700
701 unsigned chan1 = channel ? channel[1] : 0;
702 bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
703 bi_index h1 = bi_half(w1, chan1 & 1);
704
705 if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
706 return bi_mov_i32(b, w0);
707 else if (bi_is_word_equiv(w0, w1))
708 return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
709 else
710 return bi_mkvec_v2i16(b, h0, h1);
711 }
712
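/* Assemble an arbitrary vector from (source, channel) pairs by packing 8-bit
 * and 16-bit channels into 32-bit words and collecting the resulting words. */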
713 static void
bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
715 unsigned count, unsigned bitsize)
716 {
717 assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
718 unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
719 unsigned chan_per_word = 1 << shift;
720
721 assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
722 "unnecessarily large vector should have been lowered");
723
724 bi_index srcs[BI_MAX_VEC];
725
726 for (unsigned i = 0; i < count; i += chan_per_word) {
727 unsigned rem = MIN2(count - i, chan_per_word);
728 unsigned *channel_offset = channel ? (channel + i) : NULL;
729
730 if (bitsize == 32)
731 srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
732 else if (bitsize == 16)
733 srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
734 else
735 srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
736 }
737
738 bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
739 }
740
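/* UBO loads use LD_BUFFER on Valhall but a segmented LOAD on Bifrost. Either
 * way, split the destination so later extracts of individual words are cheap. */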
741 static inline bi_instr *
bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
743 bi_index src1)
744 {
745 bi_instr *I;
746
747 if (b->shader->arch >= 9) {
748 I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
749 I->seg = BI_SEG_UBO;
750 } else {
751 I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
752 }
753
754 bi_emit_cached_split(b, dest0, bitsize);
755 return I;
756 }
757
758 static void
bi_load_sample_id_to(bi_builder *b, bi_index dst)
760 {
   /* r61[16:23] contains the sample ID; mask it out. The upper bits seem to
    * read garbage (despite being architecturally defined as zero), so use a
    * 5-bit mask instead of 8 bits */
764
765 bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
766 bi_imm_u8(16), false);
767 }
768
769 static bi_index
bi_load_sample_id(bi_builder *b)
771 {
772 bi_index sample_id = bi_temp(b->shader);
773 bi_load_sample_id_to(b, sample_id);
774 return sample_id;
775 }
776
777 static bi_index
bi_pixel_indices(bi_builder *b, unsigned rt)
779 {
780 /* We want to load the current pixel. */
781 struct bifrost_pixel_indices pix = {.y = BIFROST_CURRENT_PIXEL, .rt = rt};
782
783 uint32_t indices_u32 = 0;
784 memcpy(&indices_u32, &pix, sizeof(indices_u32));
785 bi_index indices = bi_imm_u32(indices_u32);
786
787 /* Sample index above is left as zero. For multisampling, we need to
788 * fill in the actual sample ID in the lower byte */
789
790 if (b->shader->inputs->blend.nr_samples > 1)
791 indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
792
793 return indices;
794 }
795
796 /* Source color is passed through r0-r3, or r4-r7 for the second source when
797 * dual-source blending. Preload the corresponding vector.
798 */
799 static void
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
801 {
802 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
803 unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
804 unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
805 assert(size == 16 || size == 32);
806
807 bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
808 bi_preload(b, base + 2), bi_preload(b, base + 3)};
809
810 bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
811 }
812
813 static void
bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2,
815 nir_alu_type T2, unsigned rt)
816 {
817 /* Reads 2 or 4 staging registers to cover the input */
818 unsigned size = nir_alu_type_get_type_size(T);
819 unsigned size_2 = nir_alu_type_get_type_size(T2);
820 unsigned sr_count = (size <= 16) ? 2 : 4;
821 unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
822 const struct panfrost_compile_inputs *inputs = b->shader->inputs;
823 uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
824 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
825
826 /* Workaround for NIR-to-TGSI */
827 if (b->shader->nir->info.fs.untyped_color_outputs)
828 regfmt = BI_REGISTER_FORMAT_AUTO;
829
830 if (inputs->is_blend && inputs->blend.nr_samples > 1) {
831 /* Conversion descriptor comes from the compile inputs, pixel
832 * indices derived at run time based on sample ID */
833 bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
834 bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4);
835 } else if (b->shader->inputs->is_blend) {
836 uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
837
838 /* Blend descriptor comes from the compile inputs */
839 /* Put the result in r0 */
840
841 bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
842 bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32),
843 bi_null(), regfmt, sr_count, 0);
844 } else {
845 /* Blend descriptor comes from the FAU RAM. By convention, the
846 * return address on Bifrost is stored in r48 and will be used
847 * by the blend shader to jump back to the fragment shader */
848
849 bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
850 bi_fau(BIR_FAU_BLEND_0 + rt, false),
851 bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count,
852 sr_count_2);
853 }
854
855 assert(rt < 8);
856 b->shader->info.bifrost->blend[rt].type = T;
857
858 if (T2)
859 b->shader->info.bifrost->blend_src1_type = T2;
860 }
861
862 /* Blend shaders do not need to run ATEST since they are dependent on a
863 * fragment shader that runs it. Blit shaders may not need to run ATEST, since
864 * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
865 * there are no writes to the coverage mask. The latter two are satisfied for
866 * all blit shaders, so we just care about early-z, which blit shaders force
867 * iff they do not write depth or stencil */
868
869 static bool
bi_skip_atest(bi_context *ctx, bool emit_zs)
871 {
872 return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
873 }
874
875 static void
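/* ATEST combines the alpha value with the current coverage mask; the updated
 * mask is consumed by the subsequent ZS_EMIT/BLEND instructions. */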
bi_emit_atest(bi_builder *b, bi_index alpha)
877 {
878 b->shader->coverage =
879 bi_atest(b, bi_coverage(b), alpha, bi_fau(BIR_FAU_ATEST_PARAM, false));
880 b->shader->emitted_atest = true;
881 }
882
883 static bi_index
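/* Pad a color source out to a full vec4, filling missing components with 0.0
 * and alpha with 1.0, so the blend path always sees four channels. */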
bi_src_color_vec4(bi_builder *b, nir_src *src, nir_alu_type T)
885 {
886 unsigned num_components = nir_src_num_components(*src);
887 bi_index base = bi_src_index(src);
888
889 /* short-circuit the common case */
890 if (num_components == 4)
891 return base;
892
893 unsigned size = nir_alu_type_get_type_size(T);
894 assert(size == 16 || size == 32);
895
896 bi_index src_vals[4];
897
898 unsigned i;
899 for (i = 0; i < num_components; i++)
900 src_vals[i] = bi_extract(b, base, i);
901
902 for (; i < 3; i++)
903 src_vals[i] = (size == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0);
904 src_vals[3] = (size == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0);
905 bi_index temp = bi_temp(b->shader);
906 bi_make_vec_to(b, temp, src_vals, NULL, 4, size);
907 return temp;
908 }
909
910 static void
bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
912 {
913 bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan;
914
915 unsigned writeout =
916 combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C;
917
918 bool emit_blend = writeout & (PAN_WRITEOUT_C);
919 bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
920
921 unsigned loc = nir_intrinsic_io_semantics(instr).location;
922 bi_index src0 = bi_src_index(&instr->src[0]);
923
924 /* By ISA convention, the coverage mask is stored in R60. The store
925 * itself will be handled by a subsequent ATEST instruction */
926 if (loc == FRAG_RESULT_SAMPLE_MASK) {
927 b->shader->coverage = bi_extract(b, src0, 0);
928 return;
929 }
930
931 /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
932 * value, but render target #0 might not be floating point. However the
933 * alpha value is only used for alpha-to-coverage, a stage which is
934 * skipped for pure integer framebuffers, so the issue is moot. */
935
936 if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
937 nir_alu_type T = nir_intrinsic_src_type(instr);
938
939 bi_index rgba = bi_src_index(&instr->src[0]);
940 bi_index alpha;
941
942 if (nir_src_num_components(instr->src[0]) < 4) {
943 /* Don't read out-of-bounds */
944 alpha = bi_imm_f32(1.0);
945 } else if (T == nir_type_float16) {
946 alpha = bi_half(bi_extract(b, rgba, 1), true);
947 } else if (T == nir_type_float32) {
948 alpha = bi_extract(b, rgba, 3);
949 } else {
950 alpha = bi_dontcare(b);
951 }
952 bi_emit_atest(b, alpha);
953 }
954
955 if (emit_zs) {
956 bi_index z = bi_dontcare(b), s = bi_dontcare(b);
957
958 if (writeout & PAN_WRITEOUT_Z)
959 z = bi_src_index(&instr->src[2]);
960
961 if (writeout & PAN_WRITEOUT_S)
962 s = bi_src_index(&instr->src[3]);
963
964 b->shader->coverage =
965 bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S,
966 writeout & PAN_WRITEOUT_Z);
967 }
968
969 if (emit_blend) {
970 unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
971 bool dual = (writeout & PAN_WRITEOUT_2);
972 nir_alu_type T = nir_intrinsic_src_type(instr);
973 nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
974 bi_index color = bi_src_color_vec4(b, &instr->src[0], T);
975 bi_index color2 =
976 dual ? bi_src_color_vec4(b, &instr->src[4], T2) : bi_null();
977
978 if (instr->intrinsic == nir_intrinsic_store_output &&
979 loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
980 assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
981
982 unsigned rt_offs = nir_src_as_uint(instr->src[1]);
983
984 assert(rt + rt_offs < 8 && "RT not in the [0-7] range");
985 rt += rt_offs;
986 }
987
988 /* Explicit copy since BLEND inputs are precoloured to R0-R3,
989 * TODO: maybe schedule around this or implement in RA as a
990 * spill */
991 bool has_mrt =
992 (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
993
994 if (has_mrt) {
995 bi_index srcs[4] = {color, color, color, color};
996 unsigned channels[4] = {0, 1, 2, 3};
997 color = bi_temp(b->shader);
998 bi_make_vec_to(
999 b, color, srcs, channels, nir_src_num_components(instr->src[0]),
1000 nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
1001 }
1002
1003 bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt);
1004 }
1005
1006 if (b->shader->inputs->is_blend) {
1007 /* Jump back to the fragment shader, return address is stored
1008 * in r48 (see above). On Valhall, only jump if the address is
1009 * nonzero. The check is free there and it implements the "jump
1010 * to 0 terminates the blend shader" that's automatic on
1011 * Bifrost.
1012 */
1013 if (b->shader->arch >= 8)
1014 bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
1015 else
1016 bi_jump(b, bi_preload(b, 48));
1017 }
1018 }
1019
1020 /**
1021 * In a vertex shader, is the specified variable a position output? These kinds
1022 * of outputs are written from position shaders when IDVS is enabled. All other
1023 * outputs are written from the varying shader.
1024 */
1025 static bool
bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
1027 {
1028 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1029
1030 switch (sem.location) {
1031 case VARYING_SLOT_POS:
1032 case VARYING_SLOT_PSIZ:
1033 case VARYING_SLOT_LAYER:
1034 return idvs == BI_IDVS_VARYING;
1035 default:
1036 return idvs == BI_IDVS_POSITION;
1037 }
1038 }
1039
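/* Callback used when specializing a shader into IDVS position/varying
 * variants: remove the output stores that belong to the other variant. */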
1040 static bool
bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
1042 {
1043 enum bi_idvs_mode *idvs = data;
1044
1045 if (instr->type != nir_instr_type_intrinsic)
1046 return false;
1047
1048 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1049
1050 if (intr->intrinsic != nir_intrinsic_store_output)
1051 return false;
1052
1053 if (bi_should_remove_store(intr, *idvs)) {
1054 nir_instr_remove(instr);
1055 return true;
1056 }
1057
1058 return false;
1059 }
1060
1061 static void
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
1063 {
1064 /* In principle we can do better for 16-bit. At the moment we require
1065 * 32-bit to permit the use of .auto, in order to force .u32 for flat
1066 * varyings, to handle internal TGSI shaders that set flat in the VS
1067 * but smooth in the FS */
1068
1069 ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
1070 ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
1071 assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
1072 enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
1073
1074 unsigned imm_index = 0;
1075 bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
1076
1077 /* Only look at the total components needed. In effect, we fill in all
1078 * the intermediate "holes" in the write mask, since we can't mask off
1079 * stores. Since nir_lower_io_to_temporaries ensures each varying is
1080 * written at most once, anything that's masked out is undefined, so it
1081 * doesn't matter what we write there. So we may as well do the
1082 * simplest thing possible. */
1083 unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
1084 assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
1085
1086 bi_index data = bi_src_index(&instr->src[0]);
1087
1088 /* To keep the vector dimensions consistent, we need to drop some
1089 * components. This should be coalesced.
1090 *
1091 * TODO: This is ugly and maybe inefficient. Would we rather
1092 * introduce a TRIM.i32 pseudoinstruction?
1093 */
1094 if (nr < nir_intrinsic_src_components(instr, 0)) {
1095 assert(T_size == 32 && "todo: 16-bit trim");
1096
1097 bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
1098 unsigned src_comps = nir_intrinsic_src_components(instr, 0);
1099
1100 bi_emit_split_i32(b, chans, data, src_comps);
1101
1102 bi_index tmp = bi_temp(b->shader);
1103 bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
1104
1105 bi_foreach_src(collect, w)
1106 collect->src[w] = chans[w];
1107
1108 data = tmp;
1109 }
1110
1111 bool psiz =
1112 (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
1113 bool layer =
1114 (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_LAYER);
1115
1116 bi_index a[4] = {bi_null()};
1117
1118 if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
1119 /* Bifrost position shaders have a fast path */
1120 assert(T == nir_type_float16 || T == nir_type_float32);
1121 unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
1122 unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
1123 unsigned snap4 = 0x5E;
1124 uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
1125
1126 bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
1127 bi_imm_u32(format), regfmt, nr - 1);
1128 } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
1129 bi_index index = bi_preload(b, 59);
1130 unsigned pos_attr_offset = 0;
1131 unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
1132
1133 if (psiz || layer)
1134 index = bi_iadd_imm_i32(b, index, 4);
1135
1136 if (layer) {
1137 assert(nr == 1 && src_bit_sz == 32);
1138 src_bit_sz = 8;
1139 pos_attr_offset = 2;
1140 data = bi_byte(data, 0);
1141 }
1142
1143 if (psiz)
1144 assert(T_size == 16 && "should've been lowered");
1145
1146 bi_index address = bi_lea_buf_imm(b, index);
1147 bi_emit_split_i32(b, a, address, 2);
1148
1149 bool varying = (b->shader->idvs == BI_IDVS_VARYING);
1150
1151 bi_store(b, nr * src_bit_sz, data, a[0], a[1],
1152 varying ? BI_SEG_VARY : BI_SEG_POS,
1153 varying ? bi_varying_offset(b->shader, instr) : pos_attr_offset);
1154 } else if (immediate) {
1155 bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b),
1156 regfmt, imm_index);
1157 bi_emit_split_i32(b, a, address, 3);
1158
1159 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1160 } else {
1161 bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)),
1162 bi_imm_u32(nir_intrinsic_base(instr)), false);
1163 bi_index address =
1164 bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
1165 bi_emit_split_i32(b, a, address, 3);
1166
1167 bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1168 }
1169 }
1170
1171 static void
bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
1173 {
1174 nir_src *offset = nir_get_io_offset_src(instr);
1175
1176 bool offset_is_const = nir_src_is_const(*offset);
1177 bi_index dyn_offset = bi_src_index(offset);
1178 uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
1179
1180 bi_load_ubo_to(b, instr->num_components * instr->def.bit_size,
1181 bi_def_index(&instr->def),
1182 offset_is_const ? bi_imm_u32(const_offset) : dyn_offset,
1183 bi_src_index(&instr->src[0]));
1184 }
1185
1186 static void
bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
1188 {
1189 assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
1190
1191 nir_src *offset = &instr->src[0];
1192 assert(nir_src_is_const(*offset) && "no indirect push constants");
1193 uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
1194 assert((base & 3) == 0 && "unaligned push constants");
1195
1196 unsigned bits = instr->def.bit_size * instr->def.num_components;
1197
1198 unsigned n = DIV_ROUND_UP(bits, 32);
1199 assert(n <= 4);
1200 bi_index channels[4] = {bi_null()};
1201
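   /* Each 64-bit FAU slot holds two consecutive 32-bit push words: word w maps
    * to slot w >> 1, half w & 1. For example, base = 8 with n = 2 reads words
    * 2 and 3, i.e. both halves of uniform FAU slot 1. */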
1202 for (unsigned i = 0; i < n; ++i) {
1203 unsigned word = (base >> 2) + i;
1204
1205 channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
1206 }
1207
1208 bi_emit_collect_to(b, bi_def_index(&instr->def), channels, n);
1209
1210 /* Update push->count to report the highest push constant word being accessed
1211 * by this shader.
1212 */
1213 b->shader->info.push->count =
1214 MAX2((base / 4) + n, b->shader->info.push->count);
1215 }
1216
1217 static bi_index
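/* Returns the high word of a 64-bit address source, or zero when the source is
 * only 32 bits wide. */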
bi_addr_high(bi_builder *b, nir_src *src)
1219 {
1220 return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1)
1221 : bi_zero();
1222 }
1223
1224 static void
bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi,
1226 enum bi_seg seg, int16_t *offset)
1227 {
1228 /* Not needed on Bifrost or for global accesses */
1229 if (b->shader->arch < 9 || seg == BI_SEG_NONE)
1230 return;
1231
1232 /* There is no segment modifier on Valhall. Instead, we need to
1233 * emit the arithmetic ourselves. We do have an offset
1234 * available, which saves an instruction for constant offsets.
1235 */
1236 bool wls = (seg == BI_SEG_WLS);
1237 assert(wls || (seg == BI_SEG_TL));
1238
1239 enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
1240
1241 bi_index base_lo = bi_fau(fau, false);
1242
1243 if (offset && addr_lo->type == BI_INDEX_CONSTANT &&
1244 addr_lo->value == (int16_t)addr_lo->value) {
1245 *offset = addr_lo->value;
1246 *addr_lo = base_lo;
1247 } else {
1248 *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
1249 }
1250
1251 /* Do not allow overflow for WLS or TLS */
1252 *addr_hi = bi_fau(fau, true);
1253 }
1254
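/* Generic memory load: split the address into low and high words, let
 * bi_handle_segment() fold in the WLS/TLS base on Valhall, and split the
 * destination so individual components can be extracted later. */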
1255 static void
bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1257 {
1258 int16_t offset = 0;
1259 unsigned bits = instr->num_components * instr->def.bit_size;
1260 bi_index dest = bi_def_index(&instr->def);
1261 bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
1262 bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
1263
1264 bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1265
1266 bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
1267 bi_emit_cached_split(b, dest, bits);
1268 }
1269
1270 static void
bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1272 {
   /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
1274 assert(nir_intrinsic_write_mask(instr) ==
1275 BITFIELD_MASK(instr->num_components));
1276
1277 int16_t offset = 0;
1278 bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
1279 bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
1280
1281 bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1282
1283 bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
1284 bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset);
1285 }
1286
1287 /* Exchanges the staging register with memory */
1288
1289 static void
bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg,
1291 enum bi_seg seg)
1292 {
1293 assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1294
1295 unsigned sz = nir_src_bit_size(*arg);
1296 assert(sz == 32 || sz == 64);
1297
1298 bi_index data = bi_src_index(arg);
1299
1300 bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1301
1302 if (b->shader->arch >= 9)
1303 bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1304 else if (seg == BI_SEG_WLS)
1305 addr_hi = bi_zero();
1306
1307 bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
1308 }
1309
1310 /* Exchanges the second staging register with memory if comparison with first
1311 * staging register passes */
1312
1313 static void
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1,
1315 nir_src *arg_2, enum bi_seg seg)
1316 {
1317 assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1318
   /* The hardware operand order is swapped relative to NIR */
1320 bi_index src0 = bi_src_index(arg_2);
1321 bi_index src1 = bi_src_index(arg_1);
1322
1323 unsigned sz = nir_src_bit_size(*arg_1);
1324 assert(sz == 32 || sz == 64);
1325
1326 bi_index data_words[] = {
1327 bi_extract(b, src0, 0),
1328 sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
1329
1330 /* 64-bit */
1331 bi_extract(b, src1, 0),
1332 sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
1333 };
1334
1335 bi_index in = bi_temp(b->shader);
1336 bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
1337 bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1338
1339 if (b->shader->arch >= 9)
1340 bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1341 else if (seg == BI_SEG_WLS)
1342 addr_hi = bi_zero();
1343
1344 bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
1345 bi_emit_cached_split(b, out, sz);
1346
1347 bi_index inout_words[] = {bi_extract(b, out, 0),
1348 sz == 64 ? bi_extract(b, out, 1) : bi_null()};
1349
1350 bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
1351 }
1352
1353 static enum bi_atom_opc
bi_atom_opc_for_nir(nir_atomic_op op)
1355 {
1356 /* clang-format off */
1357 switch (op) {
1358 case nir_atomic_op_iadd: return BI_ATOM_OPC_AADD;
1359 case nir_atomic_op_imin: return BI_ATOM_OPC_ASMIN;
1360 case nir_atomic_op_umin: return BI_ATOM_OPC_AUMIN;
1361 case nir_atomic_op_imax: return BI_ATOM_OPC_ASMAX;
1362 case nir_atomic_op_umax: return BI_ATOM_OPC_AUMAX;
1363 case nir_atomic_op_iand: return BI_ATOM_OPC_AAND;
1364 case nir_atomic_op_ior: return BI_ATOM_OPC_AOR;
1365 case nir_atomic_op_ixor: return BI_ATOM_OPC_AXOR;
1366 default: unreachable("Unexpected computational atomic");
1367 }
1368 /* clang-format on */
1369 }
1370
1371 /* Optimized unary atomics are available with an implied #1 argument */
1372
1373 static bool
bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
1375 {
1376 /* Check we have a compatible constant */
1377 if (arg.type != BI_INDEX_CONSTANT)
1378 return false;
1379
1380 if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
1381 return false;
1382
1383 /* Check for a compatible operation */
1384 switch (op) {
1385 case BI_ATOM_OPC_AADD:
1386 *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
1387 return true;
1388 case BI_ATOM_OPC_ASMAX:
1389 *out = BI_ATOM_OPC_ASMAX1;
1390 return true;
1391 case BI_ATOM_OPC_AUMAX:
1392 *out = BI_ATOM_OPC_AUMAX1;
1393 return true;
1394 case BI_ATOM_OPC_AOR:
1395 *out = BI_ATOM_OPC_AOR1;
1396 return true;
1397 default:
1398 return false;
1399 }
1400 }
1401
1402 /*
1403 * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
1404 * translate between these forms (with MKVEC.v2i16).
1405 *
 * Additionally, on Valhall, cube maps in the attribute pipe are treated as 2D
1407 * arrays. For uniform handling, we also treat 3D textures like 2D arrays.
1408 *
 * Our indexing needs to reflect this. Since Valhall and Bifrost are quite
1410 * different, we provide separate functions for these.
1411 */
1412 static bi_index
bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
1414 unsigned coord_comps, bool is_array, bool is_msaa)
1415 {
1416 assert(coord_comps > 0 && coord_comps <= 3);
1417
1418 /* MSAA load store should have been lowered */
1419 assert(!is_msaa);
1420 if (src_idx == 0) {
1421 if (coord_comps == 1 || (coord_comps == 2 && is_array))
1422 return bi_extract(b, coord, 0);
1423 else
1424 return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1425 bi_half(bi_extract(b, coord, 1), false));
1426 } else {
1427 if (coord_comps == 3)
1428 return bi_extract(b, coord, 2);
1429 else if (coord_comps == 2 && is_array)
1430 return bi_extract(b, coord, 1);
1431 else
1432 return bi_zero();
1433 }
1434 }
1435
1436 static bi_index
va_emit_image_coord(bi_builder *b, bi_index coord, bi_index sample_index,
1438 unsigned src_idx, unsigned coord_comps, bool is_array,
1439 bool is_msaa)
1440 {
1441 assert(coord_comps > 0 && coord_comps <= 3);
1442 if (src_idx == 0) {
1443 if (coord_comps == 1 || (coord_comps == 2 && is_array))
1444 return bi_extract(b, coord, 0);
1445 else
1446 return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1447 bi_half(bi_extract(b, coord, 1), false));
1448 } else if (is_msaa) {
1449 bi_index array_idx = bi_extract(b, sample_index, 0);
1450 if (coord_comps == 3)
1451 return bi_mkvec_v2i16(b, bi_half(array_idx, false),
1452 bi_half(bi_extract(b, coord, 2), false));
1453 else if (coord_comps == 2)
1454 return array_idx;
1455 } else if (coord_comps == 3)
1456 return bi_mkvec_v2i16(b, bi_imm_u16(0),
1457 bi_half(bi_extract(b, coord, 2), false));
1458 else if (coord_comps == 2 && is_array)
1459 return bi_mkvec_v2i16(b, bi_imm_u16(0),
1460 bi_half(bi_extract(b, coord, 1), false));
1461 return bi_zero();
1462 }
1463
1464 static void
bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
1466 {
1467 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1468 unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1469 bool array = nir_intrinsic_image_array(instr);
1470
1471 bi_index coords = bi_src_index(&instr->src[1]);
1472 bi_index indexvar = bi_src_index(&instr->src[2]);
1473 bi_index xy, zw;
1474 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS);
1475 if (b->shader->arch < 9) {
1476 xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
1477 zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
1478 } else {
1479 xy =
1480 va_emit_image_coord(b, coords, indexvar, 0, coord_comps, array, is_ms);
1481 zw =
1482 va_emit_image_coord(b, coords, indexvar, 1, coord_comps, array, is_ms);
1483 }
1484 bi_index dest = bi_def_index(&instr->def);
1485 enum bi_register_format regfmt =
1486 bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
1487 enum bi_vecsize vecsize = instr->num_components - 1;
1488
1489 if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1490 const unsigned raw_value = nir_src_as_uint(instr->src[0]);
1491 const unsigned table_index = pan_res_handle_get_table(raw_value);
1492 const unsigned texture_index = pan_res_handle_get_index(raw_value);
1493
1494 if (texture_index < 16 && va_is_valid_const_table(table_index)) {
1495 bi_instr *I =
1496 bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, texture_index);
1497 I->table = va_res_fold_table_idx(table_index);
1498 } else {
1499 bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1500 vecsize);
1501 }
1502 } else if (b->shader->arch >= 9) {
1503 bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1504 vecsize);
1505 } else {
1506 bi_ld_attr_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1507 vecsize);
1508 }
1509
1510 bi_split_def(b, &instr->def);
1511 }
1512
1513 static void
bi_emit_lea_image_to(bi_builder *b, bi_index dest, nir_intrinsic_instr *instr)
1515 {
1516 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1517 bool array = nir_intrinsic_image_array(instr);
1518 unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1519
1520 enum bi_register_format type =
1521 (instr->intrinsic == nir_intrinsic_image_store)
1522 ? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr))
1523 : BI_REGISTER_FORMAT_AUTO;
1524
1525 bi_index coords = bi_src_index(&instr->src[1]);
1526 bi_index indices = bi_src_index(&instr->src[2]);
1527 bi_index xy, zw;
1528 bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
1529 if (b->shader->arch < 9) {
1530 xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
1531 zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
1532 } else {
1533 xy =
1534 va_emit_image_coord(b, coords, indices, 0, coord_comps, array, is_ms);
1535 zw =
1536 va_emit_image_coord(b, coords, indices, 1, coord_comps, array, is_ms);
1537 }
1538
1539 if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1540 const unsigned raw_value = nir_src_as_uint(instr->src[0]);
1541 unsigned table_index = pan_res_handle_get_table(raw_value);
1542 unsigned texture_index = pan_res_handle_get_index(raw_value);
1543
1544 if (texture_index < 16 && va_is_valid_const_table(table_index)) {
1545 bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, texture_index);
1546 I->table = va_res_fold_table_idx(table_index);
1547 } else {
1548 bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
1549 }
1550 } else if (b->shader->arch >= 9) {
1551 bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
1552 } else {
1553 bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
1554 bi_src_index(&instr->src[0]), type);
1555
1556 /* LEA_ATTR_TEX defaults to the secondary attribute table, but
1557 * our ABI has all images in the primary attribute table
1558 */
1559 I->table = BI_TABLE_ATTRIBUTE_1;
1560 }
1561
1562 bi_emit_cached_split(b, dest, 3 * 32);
1563 }
1564
1565 static bi_index
1566 bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
1567 {
1568 bi_index dest = bi_temp(b->shader);
1569 bi_emit_lea_image_to(b, dest, instr);
1570 return dest;
1571 }
1572
1573 static void
1574 bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1575 {
1576 bi_index a[4] = {bi_null()};
1577 bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
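   /* The texel address computed by LEA arrives as three 32-bit words, which
    * the ST_CVT below consumes directly as its address operands.
    */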
1578
1579 /* Due to SPIR-V limitations, the source type is not fully reliable: it
1580 * reports uint32 even for write_imagei. This causes an incorrect
1581 * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
1582 * instead, which will match per the OpenCL spec. Of course this does
1583 * not work for 16-bit stores, but those are not available in OpenCL.
1584 */
1585 nir_alu_type T = nir_intrinsic_src_type(instr);
1586 assert(nir_alu_type_get_type_size(T) == 32);
1587
1588 bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
1589 BI_REGISTER_FORMAT_AUTO, instr->num_components - 1);
1590 }
1591
1592 static void
1593 bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg,
1594 nir_atomic_op op)
1595 {
1596 enum bi_atom_opc opc = bi_atom_opc_for_nir(op);
1597 enum bi_atom_opc post_opc = opc;
1598 bool bifrost = b->shader->arch <= 8;
1599
1600 /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1601 * take any vector but can still output in RETURN mode */
1602 bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
1603 unsigned sr_count = bifrost ? 2 : 1;
1604
1605 /* Generate either ATOM or ATOM1 as required */
1606 if (bi_promote_atom_c1(opc, arg, &opc)) {
1607 bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
1608 bi_extract(b, addr, 1), opc, sr_count);
1609 } else {
1610 bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
1611 bi_extract(b, addr, 1), opc, sr_count);
1612 }
1613
1614 if (bifrost) {
1615 /* Post-process it */
1616 bi_emit_cached_split_i32(b, tmp_dest, 2);
1617 bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0),
1618 bi_extract(b, tmp_dest, 1), post_opc);
1619 }
1620 }
1621
1622 static void
1623 bi_emit_load_frag_coord_zw(bi_builder *b, bi_index dst, unsigned channel)
1624 {
1625 bi_ld_var_special_to(
1626 b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
1627 BI_UPDATE_CLOBBER,
1628 (channel == 2) ? BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W,
1629 BI_VECSIZE_NONE);
1630 }
1631
1632 static void
1633 bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1634 {
1635 bi_index dest = bi_def_index(&instr->def);
1636 nir_alu_type T = nir_intrinsic_dest_type(instr);
1637 enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
1638 unsigned size = instr->def.bit_size;
1639 unsigned nr = instr->num_components;
1640
1641 /* Get the render target */
1642 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
1643 unsigned loc = sem.location;
1644 assert(loc >= FRAG_RESULT_DATA0);
1645 unsigned rt = (loc - FRAG_RESULT_DATA0);
1646
1647 bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b),
1648 bi_src_index(&instr->src[0]), regfmt, nr - 1);
1649 bi_emit_cached_split(b, dest, size * nr);
1650 }
1651
1652 /*
1653 * Older Bifrost hardware has a limited CLPER instruction. Add a safe helper
1654 * that uses the hardware functionality if available and lowers otherwise.
1655 */
1656 static bi_index
1657 bi_clper(bi_builder *b, bi_index s0, bi_index s1, enum bi_lane_op lop)
1658 {
1659 if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
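      /* The old CLPER variant has no lane_op modifier, so emulate an XOR lane
       * op by folding our own lane ID into the requested lane up front.
       */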
1660 if (lop == BI_LANE_OP_XOR) {
1661 bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
1662 s1 = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
1663 } else {
1664 assert(lop == BI_LANE_OP_NONE);
1665 }
1666
1667 return bi_clper_old_i32(b, s0, s1);
1668 } else {
1669 return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, lop,
1670 BI_SUBGROUP_SUBGROUP4);
1671 }
1672 }
1673
1674 static bool
1675 bi_nir_all_uses_fabs(nir_def *def)
1676 {
1677 nir_foreach_use(use, def) {
1678 nir_instr *instr = nir_src_parent_instr(use);
1679
1680 if (instr->type != nir_instr_type_alu ||
1681 nir_instr_as_alu(instr)->op != nir_op_fabs)
1682 return false;
1683 }
1684
1685 return true;
1686 }
1687
1688 static void
1689 bi_emit_derivative(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr,
1690 unsigned axis, bool coarse)
1691 {
1692 bi_index left, right;
1693 bi_index s0 = bi_src_index(&instr->src[0]);
1694 unsigned sz = instr->def.bit_size;
1695
1696 /* If all uses are fabs, the sign of the derivative doesn't matter. This is
1697 * inherently based on fine derivatives so we can't do it for coarse.
1698 */
1699 if (bi_nir_all_uses_fabs(&instr->def) && !coarse) {
1700 left = s0;
1701 right = bi_clper(b, s0, bi_imm_u32(axis), BI_LANE_OP_XOR);
1702 } else {
1703 bi_index lane1, lane2;
1704 if (coarse) {
1705 lane1 = bi_imm_u32(0);
1706 lane2 = bi_imm_u32(axis);
1707 } else {
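         /* Fine derivatives are taken between adjacent pixels of the 2x2
          * quad: e.g. for axis == 1 (ddx), lane1 = lane_id & ~1 selects the
          * left pixel of the pair and lane2 = lane1 + 1 its right neighbour.
          */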
1708 lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false),
1709 bi_imm_u32(0x3 & ~axis), bi_imm_u8(0));
1710
1711 lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false);
1712 }
1713
1714 left = bi_clper(b, s0, lane1, BI_LANE_OP_NONE);
1715 right = bi_clper(b, s0, lane2, BI_LANE_OP_NONE);
1716 }
1717
1718 bi_fadd_to(b, sz, dst, right, bi_neg(left));
1719 }
1720
1721 static void
1722 bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1723 {
1724 bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1725 ? bi_def_index(&instr->def)
1726 : bi_null();
1727 gl_shader_stage stage = b->shader->stage;
1728
1729 switch (instr->intrinsic) {
1730 case nir_intrinsic_load_barycentric_pixel:
1731 case nir_intrinsic_load_barycentric_centroid:
1732 case nir_intrinsic_load_barycentric_sample:
1733 case nir_intrinsic_load_barycentric_at_sample:
1734 case nir_intrinsic_load_barycentric_at_offset:
1735 /* handled later via load_vary */
1736 break;
1737 case nir_intrinsic_load_interpolated_input:
1738 case nir_intrinsic_load_input:
1739 if (b->shader->inputs->is_blend)
1740 bi_emit_load_blend_input(b, instr);
1741 else if (stage == MESA_SHADER_FRAGMENT)
1742 bi_emit_load_vary(b, instr);
1743 else if (stage == MESA_SHADER_VERTEX)
1744 bi_emit_load_attr(b, instr);
1745 else
1746 unreachable("Unsupported shader stage");
1747 break;
1748
1749 case nir_intrinsic_store_output:
1750 if (stage == MESA_SHADER_FRAGMENT)
1751 bi_emit_fragment_out(b, instr);
1752 else if (stage == MESA_SHADER_VERTEX)
1753 bi_emit_store_vary(b, instr);
1754 else
1755 unreachable("Unsupported shader stage");
1756 break;
1757
1758 case nir_intrinsic_store_combined_output_pan:
1759 assert(stage == MESA_SHADER_FRAGMENT);
1760 bi_emit_fragment_out(b, instr);
1761 break;
1762
1763 case nir_intrinsic_load_ubo:
1764 bi_emit_load_ubo(b, instr);
1765 break;
1766
1767 case nir_intrinsic_load_push_constant:
1768 bi_emit_load_push_constant(b, instr);
1769 break;
1770
1771 case nir_intrinsic_load_global:
1772 case nir_intrinsic_load_global_constant:
1773 bi_emit_load(b, instr, BI_SEG_NONE);
1774 break;
1775
1776 case nir_intrinsic_store_global:
1777 bi_emit_store(b, instr, BI_SEG_NONE);
1778 break;
1779
1780 case nir_intrinsic_load_scratch:
1781 bi_emit_load(b, instr, BI_SEG_TL);
1782 break;
1783
1784 case nir_intrinsic_store_scratch:
1785 bi_emit_store(b, instr, BI_SEG_TL);
1786 break;
1787
1788 case nir_intrinsic_load_shared:
1789 bi_emit_load(b, instr, BI_SEG_WLS);
1790 break;
1791
1792 case nir_intrinsic_store_shared:
1793 bi_emit_store(b, instr, BI_SEG_WLS);
1794 break;
1795
1796 case nir_intrinsic_barrier:
1797 if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
1798 assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1799 assert(nir_intrinsic_execution_scope(instr) > SCOPE_SUBGROUP &&
1800 "todo: subgroup barriers (different divergence rules)");
1801 bi_barrier(b);
1802 }
1803 /* Blob doesn't seem to do anything for memory barriers, so no need to
1804 * check nir_intrinsic_memory_scope().
1805 */
1806 break;
1807
1808 case nir_intrinsic_shared_atomic: {
1809 nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1810
1811 if (op == nir_atomic_op_xchg) {
1812 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1813 BI_SEG_WLS);
1814 } else {
1815 assert(nir_src_bit_size(instr->src[1]) == 32);
1816
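         /* NIR shared (WLS) addresses are 32-bit offsets; widen them to a
          * full 64-bit address, either by collecting the segment base
          * (Valhall) or via SEG_ADD.i64 (Bifrost) below.
          */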
1817 bi_index addr = bi_src_index(&instr->src[0]);
1818 bi_index addr_hi;
1819
1820 if (b->shader->arch >= 9) {
1821 bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
1822 addr = bi_collect_v2i32(b, addr, addr_hi);
1823 } else {
1824 addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
1825 bi_emit_cached_split(b, addr, 64);
1826 }
1827
1828 bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), op);
1829 }
1830
1831 bi_split_def(b, &instr->def);
1832 break;
1833 }
1834
1835 case nir_intrinsic_global_atomic: {
1836 nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1837
1838 if (op == nir_atomic_op_xchg) {
1839 bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1840 BI_SEG_NONE);
1841 } else {
1842 assert(nir_src_bit_size(instr->src[1]) == 32);
1843
1844 bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]),
1845 bi_src_index(&instr->src[1]), op);
1846 }
1847
1848 bi_split_def(b, &instr->def);
1849 break;
1850 }
1851
1852 case nir_intrinsic_image_texel_address:
1853 bi_emit_lea_image_to(b, dst, instr);
1854 break;
1855
1856 case nir_intrinsic_image_load:
1857 bi_emit_image_load(b, instr);
1858 break;
1859
1860 case nir_intrinsic_image_store:
1861 bi_emit_image_store(b, instr);
1862 break;
1863
1864 case nir_intrinsic_global_atomic_swap:
1865 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1866 &instr->src[2], BI_SEG_NONE);
1867 bi_split_def(b, &instr->def);
1868 break;
1869
1870 case nir_intrinsic_shared_atomic_swap:
1871 bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1872 &instr->src[2], BI_SEG_WLS);
1873 bi_split_def(b, &instr->def);
1874 break;
1875
1876 case nir_intrinsic_load_pixel_coord:
1877 /* Vectorized load of the preloaded i16vec2 */
1878 bi_mov_i32_to(b, dst, bi_preload(b, 59));
1879 break;
1880
1881 case nir_intrinsic_load_frag_coord_zw:
1882 bi_emit_load_frag_coord_zw(b, dst, nir_intrinsic_component(instr));
1883 break;
1884
1885 case nir_intrinsic_load_converted_output_pan:
1886 bi_emit_ld_tile(b, instr);
1887 break;
1888
1889 case nir_intrinsic_terminate_if:
1890 bi_discard_b32(b, bi_src_index(&instr->src[0]));
1891 break;
1892
1893 case nir_intrinsic_terminate:
1894 bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1895 break;
1896
1897 case nir_intrinsic_load_sample_positions_pan:
1898 bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
1899 bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1900 break;
1901
1902 case nir_intrinsic_load_sample_mask_in:
1903 /* r61[0:15] contains the coverage bitmap */
1904 bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
1905 break;
1906
1907 case nir_intrinsic_load_sample_mask:
1908 bi_mov_i32_to(b, dst, bi_coverage(b));
1909 break;
1910
1911 case nir_intrinsic_load_sample_id:
1912 bi_load_sample_id_to(b, dst);
1913 break;
1914
1915 case nir_intrinsic_load_front_face:
1916 /* r58 == 0 means primitive is front facing */
1917 bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
1918 BI_RESULT_TYPE_M1);
1919 break;
1920
1921 case nir_intrinsic_load_point_coord:
1922 bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
1923 BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
1924 BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
1925 bi_emit_cached_split_i32(b, dst, 2);
1926 break;
1927
1928 /* It appears vertex_id is zero-based with Bifrost geometry flows, but
1929 * not with Valhall's memory-allocation IDVS geometry flow. We only support
1930 * the new flow on Valhall so this is lowered in NIR.
1931 */
1932 case nir_intrinsic_load_vertex_id:
1933 case nir_intrinsic_load_vertex_id_zero_base:
1934 assert(b->shader->malloc_idvs ==
1935 (instr->intrinsic == nir_intrinsic_load_vertex_id));
1936
1937 bi_mov_i32_to(b, dst, bi_vertex_id(b));
1938 break;
1939
1940 case nir_intrinsic_load_instance_id:
1941 bi_mov_i32_to(b, dst, bi_instance_id(b));
1942 break;
1943
1944 case nir_intrinsic_load_draw_id:
1945 bi_mov_i32_to(b, dst, bi_draw_id(b));
1946 break;
1947
1948 case nir_intrinsic_load_subgroup_invocation:
1949 bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
1950 break;
1951
1952 case nir_intrinsic_load_local_invocation_id:
1953 bi_collect_v3i32_to(b, dst,
1954 bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
1955 bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
1956 bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
1957 break;
1958
1959 case nir_intrinsic_load_workgroup_id:
1960 bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
1961 bi_preload(b, 59));
1962 break;
1963
1964 case nir_intrinsic_load_global_invocation_id:
1965 bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
1966 bi_preload(b, 62));
1967 break;
1968
1969 case nir_intrinsic_shader_clock:
1970 bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
1971 bi_split_def(b, &instr->def);
1972 break;
1973
1974 case nir_intrinsic_ddx:
1975 case nir_intrinsic_ddx_fine:
1976 bi_emit_derivative(b, dst, instr, 1, false);
1977 break;
1978 case nir_intrinsic_ddx_coarse:
1979 bi_emit_derivative(b, dst, instr, 1, true);
1980 break;
1981 case nir_intrinsic_ddy:
1982 case nir_intrinsic_ddy_fine:
1983 bi_emit_derivative(b, dst, instr, 2, false);
1984 break;
1985 case nir_intrinsic_ddy_coarse:
1986 bi_emit_derivative(b, dst, instr, 2, true);
1987 break;
1988
1989 case nir_intrinsic_load_layer_id:
1990 assert(b->shader->arch >= 9);
1991 bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
1992 break;
1993
1994 case nir_intrinsic_load_ssbo_address:
1995 assert(b->shader->arch >= 9);
1996 bi_lea_buffer_to(b, dst, bi_src_index(&instr->src[1]),
1997 bi_src_index(&instr->src[0]));
1998 bi_emit_cached_split(b, dst, 64);
1999 break;
2000
2001 case nir_intrinsic_load_ssbo: {
2002 assert(b->shader->arch >= 9);
2003 unsigned dst_bits = instr->num_components * instr->def.bit_size;
2004 bi_ld_buffer_to(b, dst_bits, dst, bi_src_index(&instr->src[1]),
2005 bi_src_index(&instr->src[0]));
2006 bi_emit_cached_split(b, dst, dst_bits);
2007 break;
2008 }
2009
2010 default:
2011 fprintf(stderr, "Unhandled intrinsic %s\n",
2012 nir_intrinsic_infos[instr->intrinsic].name);
2013 assert(0);
2014 }
2015 }
2016
2017 static void
2018 bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
2019 {
2020 /* Make sure we've been lowered */
2021 assert(instr->def.num_components <= (32 / instr->def.bit_size));
2022
2023 /* Accumulate all the channels of the constant, as if we did an
2024 * implicit SEL over them */
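   /* e.g. a 16-bit vec2 of (0x1234, 0xABCD) accumulates to 0xABCD1234 */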
2025 uint32_t acc = 0;
2026
2027 for (unsigned i = 0; i < instr->def.num_components; ++i) {
2028 unsigned v =
2029 nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
2030 acc |= (v << (i * instr->def.bit_size));
2031 }
2032
2033 bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
2034 }
2035
2036 static bi_index
2037 bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
2038 {
2039 unsigned bitsize = nir_src_bit_size(src.src);
2040
2041 /* the bi_index carries the 32-bit (word) offset separately from the
2042 * subword swizzle, so handle the offset first */
2043
2044 unsigned offset = 0;
2045
2046 assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
2047 unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
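   /* e.g. for a 16-bit source, swizzle component 3 lives in 32-bit word
    * 3 >> 1 = 1, and the remaining low bit selects the high half of that word
    */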
2048
2049 for (unsigned i = 0; i < comps; ++i) {
2050 unsigned new_offset = (src.swizzle[i] >> subword_shift);
2051
2052 if (i > 0)
2053 assert(offset == new_offset && "wrong vectorization");
2054
2055 offset = new_offset;
2056 }
2057
2058 bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
2059
2060 /* Compose the subword swizzle with existing (identity) swizzle */
2061 assert(idx.swizzle == BI_SWIZZLE_H01);
2062
2063 /* Bigger vectors should have been lowered */
2064 assert(comps <= (1 << subword_shift));
2065
2066 if (bitsize == 16) {
2067 unsigned c0 = src.swizzle[0] & 1;
2068 unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
2069 idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
2070 } else if (bitsize == 8 && comps == 1) {
2071 idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
2072 } else if (bitsize == 8) {
2073 /* XXX: Use optimized swizzle when possible */
2074 bi_index unoffset_srcs[NIR_MAX_VEC_COMPONENTS] = {bi_null()};
2075 unsigned channels[NIR_MAX_VEC_COMPONENTS] = {0};
2076
2077 for (unsigned i = 0; i < comps; ++i) {
2078 unoffset_srcs[i] = bi_src_index(&src.src);
2079 channels[i] = src.swizzle[i];
2080 }
2081
2082 bi_index temp = bi_temp(b->shader);
2083 bi_make_vec_to(b, temp, unoffset_srcs, channels, comps, bitsize);
2084
2085 static const enum bi_swizzle swizzle_lut[] = {
2086 BI_SWIZZLE_B0000, BI_SWIZZLE_B0011, BI_SWIZZLE_H01, BI_SWIZZLE_H01};
2087 assert(comps - 1 < ARRAY_SIZE(swizzle_lut));
2088
2089 /* Assign a coherent swizzle for the vector */
2090 temp.swizzle = swizzle_lut[comps - 1];
2091
2092 return temp;
2093 }
2094
2095 return idx;
2096 }
2097
2098 static enum bi_round
2099 bi_nir_round(nir_op op)
2100 {
2101 switch (op) {
2102 case nir_op_fround_even:
2103 return BI_ROUND_NONE;
2104 case nir_op_ftrunc:
2105 return BI_ROUND_RTZ;
2106 case nir_op_fceil:
2107 return BI_ROUND_RTP;
2108 case nir_op_ffloor:
2109 return BI_ROUND_RTN;
2110 default:
2111 unreachable("invalid nir round op");
2112 }
2113 }
2114
2115 /* Convenience for lowered transcendentals */
2116
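/* The multiply is written as FMA with a -0.0 addend: x + (-0.0) == x for
 * every x, including -0.0, so the sign of zero results is preserved (a +0.0
 * addend would flip -0.0 to +0.0).
 */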
2117 static bi_index
2118 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
2119 {
2120 return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
2121 }
2122
2123 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
2124 * Newton-Raphson to improve precision */
2125
2126 static void
2127 bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
2128 {
2129 bi_index x1 = bi_frcp_approx_f32(b, s0);
2130 bi_index m = bi_frexpm_f32(b, s0, false, false);
2131 bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
2132 bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(),
2133 BI_SPECIAL_N);
2134 bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
2135 }
2136
2137 static void
2138 bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
2139 {
2140 bi_index x1 = bi_frsq_approx_f32(b, s0);
2141 bi_index m = bi_frexpm_f32(b, s0, false, true);
2142 bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
2143 bi_index t1 = bi_fmul_f32(b, x1, x1);
2144 bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
2145 bi_imm_u32(-1), BI_SPECIAL_N);
2146 bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
2147 }
2148
2149 /* More complex transcendentals, see
2150 * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
2151 * for documentation */
2152
2153 static void
2154 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
2155 {
2156 bi_index t1 = bi_temp(b->shader);
2157 bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
2158 t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
2159
2160 bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
2161
2162 bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
2163 a2->clamp = BI_CLAMP_CLAMP_M1_1;
2164
2165 bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
2166 bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
2167 bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
2168 bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
2169 bi_imm_u32(0x3e75fffa));
2170 bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
2171 bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
2172 bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i,
2173 BI_SPECIAL_NONE);
2174 x->clamp = BI_CLAMP_CLAMP_0_INF;
2175
2176 bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
2177 max->sem = BI_SEM_NAN_PROPAGATE;
2178 }
2179
2180 static void
2181 bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
2182 {
2183 /* Scale by the base, multiply by 2^24 and convert to integer to get an
2184 * 8:24 fixed-point input */
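   /* e.g. exp2(2.0) with log2_base == 1.0: scale = 2.0 * 2^24, so fixed_pt
    * holds 0x02000000, i.e. 2.0 in 8:24 fixed point */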
2185 bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
2186 bi_imm_u32(24), BI_SPECIAL_NONE);
2187 bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
2188 fixed_pt->round = BI_ROUND_NONE; // XXX
2189
2190 /* Compute the result for the fixed-point input, but pass along
2191 * the floating-point scale for correct NaN propagation */
2192 bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
2193 }
2194
2195 static void
2196 bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2197 {
2198 /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
2199 bi_index a1 = bi_frexpm_f32(b, s0, true, false);
2200 bi_index ei = bi_frexpe_f32(b, s0, true, false);
2201 bi_index ef = bi_s32_to_f32(b, ei);
2202
2203 /* xt estimates -log(r1), a coarse approximation of log(a1) */
2204 bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
2205 bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
2206
2207 /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
2208 * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
2209 * and then log(s0) = x1 + x2 */
2210 bi_index x1 = bi_fadd_f32(b, ef, xt);
2211
2212 /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
2213 * polynomial approximation around 1. The series is expressed around
2214 * 1, so set y = (a1 * r1) - 1.0 */
2215 bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
2216
2217 /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
2218 * log_e(1 + y) by the Taylor series (lower precision than the blob):
2219 * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
2220 bi_index loge =
2221 bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
2222
2223 bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
2224
2225 /* log(s0) = x1 + x2 */
2226 bi_fadd_f32_to(b, dst, x1, x2);
2227 }
2228
2229 static void
2230 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2231 {
2232 bi_index frexp = bi_frexpe_f32(b, s0, true, false);
2233 bi_index frexpi = bi_s32_to_f32(b, frexp);
2234 bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
2235 bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
2236 }
2237
2238 static void
2239 bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2240 {
2241 bi_index log2_base = bi_null();
2242
2243 if (base.type == BI_INDEX_CONSTANT) {
2244 log2_base = bi_imm_f32(log2f(uif(base.value)));
2245 } else {
2246 log2_base = bi_temp(b->shader);
2247 bi_lower_flog2_32(b, log2_base, base);
2248 }
2249
2250 return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
2251 }
2252
2253 static void
2254 bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2255 {
2256 bi_index log2_base = bi_null();
2257
2258 if (base.type == BI_INDEX_CONSTANT) {
2259 log2_base = bi_imm_f32(log2f(uif(base.value)));
2260 } else {
2261 log2_base = bi_temp(b->shader);
2262 bi_flog2_32(b, log2_base, base);
2263 }
2264
2265 return bi_fexp_32(b, dst, exp, log2_base);
2266 }
2267
2268 /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
2269 * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
2270 * calculates the results. We use them to calculate sin/cos via a Taylor
2271 * approximation:
2272 *
2273 * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
2274 * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
2275 * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
2276 */
2277
2278 #define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f)
2279 #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
2280 #define SINCOS_BIAS bi_imm_u32(0x49400000)
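/* SINCOS_BIAS is 1.5 * 2^19 as a float; after the biased FMA below, each
 * mantissa LSB of the sum is worth pi/32 of the angle, matching the u6 table */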
2281
2282 static void
2283 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
2284 {
2285 /* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
2286 bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
2287
2288 /* Approximate domain error (small) */
2289 bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
2290 MPI_OVER_TWO, s0);
2291
2292 /* Lookup sin(x), cos(x) */
2293 bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
2294 bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
2295
2296 /* e^2 / 2 */
2297 bi_index e2_over_2 =
2298 bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE);
2299
2300 /* (-e^2)/2 f''(x) */
2301 bi_index quadratic =
2302 bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero());
2303
2304 /* e f'(x) - (e^2/2) f''(x) */
2305 bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
2306 cos ? bi_neg(sinx) : cosx, quadratic);
2307 I->clamp = BI_CLAMP_CLAMP_M1_1;
2308
2309 /* f(x) + e f'(x) - (e^2/2) f''(x) */
2310 bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
2311 }
2312
2313 static enum bi_cmpf
2314 bi_translate_cmpf(nir_op op)
2315 {
2316 switch (op) {
2317 case nir_op_ieq8:
2318 case nir_op_ieq16:
2319 case nir_op_ieq32:
2320 case nir_op_feq16:
2321 case nir_op_feq32:
2322 return BI_CMPF_EQ;
2323
2324 case nir_op_ine8:
2325 case nir_op_ine16:
2326 case nir_op_ine32:
2327 case nir_op_fneu16:
2328 case nir_op_fneu32:
2329 return BI_CMPF_NE;
2330
2331 case nir_op_ilt8:
2332 case nir_op_ilt16:
2333 case nir_op_ilt32:
2334 case nir_op_flt16:
2335 case nir_op_flt32:
2336 case nir_op_ult8:
2337 case nir_op_ult16:
2338 case nir_op_ult32:
2339 return BI_CMPF_LT;
2340
2341 case nir_op_ige8:
2342 case nir_op_ige16:
2343 case nir_op_ige32:
2344 case nir_op_fge16:
2345 case nir_op_fge32:
2346 case nir_op_uge8:
2347 case nir_op_uge16:
2348 case nir_op_uge32:
2349 return BI_CMPF_GE;
2350
2351 default:
2352 unreachable("invalid comparison");
2353 }
2354 }
2355
2356 static bool
2357 bi_nir_is_replicated(nir_alu_src *src)
2358 {
2359 for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
2360 if (src->swizzle[0] != src->swizzle[i])
2361 return false;
2362 }
2363
2364 return true;
2365 }
2366
2367 static void
2368 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
2369 {
2370 bi_index dst = bi_def_index(&instr->def);
2371 unsigned srcs = nir_op_infos[instr->op].num_inputs;
2372 unsigned sz = instr->def.bit_size;
2373 unsigned comps = instr->def.num_components;
2374 unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
2375
2376 /* Indicate scalarness */
2377 if (sz == 16 && comps == 1)
2378 dst.swizzle = BI_SWIZZLE_H00;
2379
2380 /* First, match against the various moves in NIR. These are
2381 * special-cased because they can operate on vectors even after
2382 * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
2383 * instruction is no "bigger" than SIMD-within-a-register. These moves
2384 * are the exceptions that need to handle swizzles specially. */
2385
2386 switch (instr->op) {
2387 case nir_op_vec2:
2388 case nir_op_vec3:
2389 case nir_op_vec4:
2390 case nir_op_vec8:
2391 case nir_op_vec16: {
2392 bi_index unoffset_srcs[16] = {bi_null()};
2393 unsigned channels[16] = {0};
2394
2395 for (unsigned i = 0; i < srcs; ++i) {
2396 unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
2397 channels[i] = instr->src[i].swizzle[0];
2398 }
2399
2400 bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
2401 return;
2402 }
2403
2404 case nir_op_unpack_32_2x16: {
2405 /* Should have been scalarized */
2406 assert(comps == 2 && sz == 16);
2407
2408 bi_index vec = bi_src_index(&instr->src[0].src);
2409 unsigned chan = instr->src[0].swizzle[0];
2410
2411 bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
2412 return;
2413 }
2414
2415 case nir_op_unpack_64_2x32_split_x: {
2416 unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
2417 bi_mov_i32_to(b, dst,
2418 bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2419 return;
2420 }
2421
2422 case nir_op_unpack_64_2x32_split_y: {
2423 unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
2424 bi_mov_i32_to(b, dst,
2425 bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2426 return;
2427 }
2428
2429 case nir_op_pack_64_2x32_split:
2430 bi_collect_v2i32_to(b, dst,
2431 bi_extract(b, bi_src_index(&instr->src[0].src),
2432 instr->src[0].swizzle[0]),
2433 bi_extract(b, bi_src_index(&instr->src[1].src),
2434 instr->src[1].swizzle[0]));
2435 return;
2436
2437 case nir_op_pack_64_2x32:
2438 bi_collect_v2i32_to(b, dst,
2439 bi_extract(b, bi_src_index(&instr->src[0].src),
2440 instr->src[0].swizzle[0]),
2441 bi_extract(b, bi_src_index(&instr->src[0].src),
2442 instr->src[0].swizzle[1]));
2443 return;
2444
2445 case nir_op_pack_uvec2_to_uint: {
2446 bi_index src = bi_src_index(&instr->src[0].src);
2447
2448 assert(sz == 32 && src_sz == 32);
2449 bi_mkvec_v2i16_to(
2450 b, dst, bi_half(bi_extract(b, src, instr->src[0].swizzle[0]), false),
2451 bi_half(bi_extract(b, src, instr->src[0].swizzle[1]), false));
2452 return;
2453 }
2454
2455 case nir_op_pack_uvec4_to_uint: {
2456 bi_index src = bi_src_index(&instr->src[0].src);
2457
2458 assert(sz == 32 && src_sz == 32);
2459 bi_mkvec_v4i8_to(
2460 b, dst, bi_byte(bi_extract(b, src, instr->src[0].swizzle[0]), 0),
2461 bi_byte(bi_extract(b, src, instr->src[0].swizzle[1]), 0),
2462 bi_byte(bi_extract(b, src, instr->src[0].swizzle[2]), 0),
2463 bi_byte(bi_extract(b, src, instr->src[0].swizzle[3]), 0));
2464 return;
2465 }
2466
2467 case nir_op_mov: {
2468 bi_index idx = bi_src_index(&instr->src[0].src);
2469 bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2470
2471 unsigned channels[4] = {
2472 comps > 0 ? instr->src[0].swizzle[0] : 0,
2473 comps > 1 ? instr->src[0].swizzle[1] : 0,
2474 comps > 2 ? instr->src[0].swizzle[2] : 0,
2475 comps > 3 ? instr->src[0].swizzle[3] : 0,
2476 };
2477
2478 bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
2479 return;
2480 }
2481
2482 case nir_op_pack_32_2x16: {
2483 assert(comps == 1);
2484
2485 bi_index idx = bi_src_index(&instr->src[0].src);
2486 bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2487
2488 unsigned channels[2] = {instr->src[0].swizzle[0],
2489 instr->src[0].swizzle[1]};
2490
2491 bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
2492 return;
2493 }
2494
2495 case nir_op_f2f16:
2496 case nir_op_f2f16_rtz:
2497 case nir_op_f2f16_rtne: {
2498 assert(src_sz == 32);
2499 bi_index idx = bi_src_index(&instr->src[0].src);
2500 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2501 bi_index s1 =
2502 comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
2503
2504 bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
2505
2506 /* Override rounding if explicitly requested. Otherwise, the
2507 * default rounding mode is selected by the builder. Depending
2508 * on the float controls required by the shader, the default
2509 * mode may not be nearest-even.
2510 */
2511 if (instr->op == nir_op_f2f16_rtz)
2512 I->round = BI_ROUND_RTZ;
2513 else if (instr->op == nir_op_f2f16_rtne)
2514 I->round = BI_ROUND_NONE; /* Nearest even */
2515
2516 return;
2517 }
2518
2519 /* Vectorized downcasts */
2520 case nir_op_u2u16:
2521 case nir_op_i2i16: {
2522 if (!(src_sz == 32 && comps == 2))
2523 break;
2524
2525 bi_index idx = bi_src_index(&instr->src[0].src);
2526 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2527 bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
2528
2529 bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false));
2530 return;
2531 }
2532
2533 /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
2534 * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
2535 * scalarizing due to scheduling (equal cost on Valhall). Additionally
2536 * if the source is replicated the MKVEC.v2i16 can be optimized out.
2537 */
2538 case nir_op_u2f16:
2539 case nir_op_i2f16: {
2540 if (!(src_sz == 32 && comps == 2))
2541 break;
2542
2543 nir_alu_src *src = &instr->src[0];
2544 bi_index idx = bi_src_index(&src->src);
2545 bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
2546 bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
2547
2548 bi_index t =
2549 (src->swizzle[0] == src->swizzle[1])
2550 ? bi_half(s0, false)
2551 : bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false));
2552
2553 if (instr->op == nir_op_u2f16)
2554 bi_v2u16_to_v2f16_to(b, dst, t);
2555 else
2556 bi_v2s16_to_v2f16_to(b, dst, t);
2557
2558 return;
2559 }
2560
2561 case nir_op_i2i8:
2562 case nir_op_u2u8: {
2563 /* Acts like an 8-bit swizzle */
2564 bi_index idx = bi_src_index(&instr->src[0].src);
2565 unsigned factor = src_sz / 8;
2566 unsigned chan[4] = {0};
2567
2568 for (unsigned i = 0; i < comps; ++i)
2569 chan[i] = instr->src[0].swizzle[i] * factor;
2570
2571 bi_make_vec_to(b, dst, &idx, chan, comps, 8);
2572 return;
2573 }
2574
2575 case nir_op_b32csel: {
2576 if (sz != 16)
2577 break;
2578
2579 /* We allow vectorizing b32csel(cond, A, B) which can be
2580 * translated as MUX.v2i16, even though cond is a 32-bit vector.
2581 *
2582 * If the source condition vector is replicated, we can use
2583 * MUX.v2i16 directly, letting each component use the
2584 * corresponding half of the 32-bit source. NIR uses 0/~0
2585 * booleans so that's guaranteed to work (that is, 32-bit NIR
2586 * booleans are 16-bit replicated).
2587 *
2588 * If we're not replicated, we use the same trick but must
2589 * insert a MKVEC.v2i16 first to convert down to 16-bit.
2590 */
2591 bi_index idx = bi_src_index(&instr->src[0].src);
2592 bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2593 bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
2594 bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
2595
2596 if (!bi_nir_is_replicated(&instr->src[0])) {
2597 s0 = bi_mkvec_v2i16(
2598 b, bi_half(s0, false),
2599 bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
2600 }
2601
2602 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2603 return;
2604 }
2605
2606 default:
2607 break;
2608 }
2609
2610 bi_index s0 =
2611 srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
2612 bi_index s1 =
2613 srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
2614 bi_index s2 =
2615 srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
2616
2617 switch (instr->op) {
2618 case nir_op_ffma:
2619 bi_fma_to(b, sz, dst, s0, s1, s2);
2620 break;
2621
2622 case nir_op_fmul:
2623 bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
2624 break;
2625
2626 case nir_op_fadd:
2627 bi_fadd_to(b, sz, dst, s0, s1);
2628 break;
2629
2630 case nir_op_fsat: {
2631 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2632 I->clamp = BI_CLAMP_CLAMP_0_1;
2633 break;
2634 }
2635
2636 case nir_op_fsat_signed_mali: {
2637 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2638 I->clamp = BI_CLAMP_CLAMP_M1_1;
2639 break;
2640 }
2641
2642 case nir_op_fclamp_pos_mali: {
2643 bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2644 I->clamp = BI_CLAMP_CLAMP_0_INF;
2645 break;
2646 }
2647
2648 case nir_op_fneg:
2649 bi_fabsneg_to(b, sz, dst, bi_neg(s0));
2650 break;
2651
2652 case nir_op_fabs:
2653 bi_fabsneg_to(b, sz, dst, bi_abs(s0));
2654 break;
2655
2656 case nir_op_fsin:
2657 bi_lower_fsincos_32(b, dst, s0, false);
2658 break;
2659
2660 case nir_op_fcos:
2661 bi_lower_fsincos_32(b, dst, s0, true);
2662 break;
2663
2664 case nir_op_fexp2:
2665 assert(sz == 32); /* should've been lowered */
2666
2667 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2668 bi_lower_fexp2_32(b, dst, s0);
2669 else
2670 bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
2671
2672 break;
2673
2674 case nir_op_flog2:
2675 assert(sz == 32); /* should've been lowered */
2676
2677 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2678 bi_lower_flog2_32(b, dst, s0);
2679 else
2680 bi_flog2_32(b, dst, s0);
2681
2682 break;
2683
2684 case nir_op_fpow:
2685 assert(sz == 32); /* should've been lowered */
2686
2687 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2688 bi_lower_fpow_32(b, dst, s0, s1);
2689 else
2690 bi_fpow_32(b, dst, s0, s1);
2691
2692 break;
2693
2694 case nir_op_frexp_exp:
2695 bi_frexpe_to(b, sz, dst, s0, false, false);
2696 break;
2697
2698 case nir_op_frexp_sig:
2699 bi_frexpm_to(b, sz, dst, s0, false, false);
2700 break;
2701
2702 case nir_op_ldexp:
2703 bi_ldexp_to(b, sz, dst, s0, s1);
2704 break;
2705
2706 case nir_op_b8csel:
2707 bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2708 break;
2709
2710 case nir_op_b16csel:
2711 bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2712 break;
2713
2714 case nir_op_b32csel:
2715 bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2716 break;
2717
2718 case nir_op_extract_u8:
2719 case nir_op_extract_i8: {
2720 assert(comps == 1 && "should be scalarized");
2721 assert((src_sz == 16 || src_sz == 32) && "should be lowered");
2722 unsigned byte = nir_alu_src_as_uint(instr->src[1]);
2723
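      /* If the source swizzle already selects the high half, the requested
       * byte lives two bytes further into the full word, so fold that into
       * the byte index and reset to the identity swizzle below.
       */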
2724 if (s0.swizzle == BI_SWIZZLE_H11) {
2725 assert(byte < 2);
2726 byte += 2;
2727 } else if (s0.swizzle != BI_SWIZZLE_H01) {
2728 assert(s0.swizzle == BI_SWIZZLE_H00);
2729 }
2730
2731 assert(byte < 4);
2732
2733 s0.swizzle = BI_SWIZZLE_H01;
2734
2735 if (instr->op == nir_op_extract_i8)
2736 bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
2737 else
2738 bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
2739 break;
2740 }
2741
2742 case nir_op_extract_u16:
2743 case nir_op_extract_i16: {
2744 assert(comps == 1 && "should be scalarized");
2745 assert(src_sz == 32 && "should be lowered");
2746 unsigned half = nir_alu_src_as_uint(instr->src[1]);
2747 assert(half == 0 || half == 1);
2748
2749 if (instr->op == nir_op_extract_i16)
2750 bi_s16_to_s32_to(b, dst, bi_half(s0, half));
2751 else
2752 bi_u16_to_u32_to(b, dst, bi_half(s0, half));
2753 break;
2754 }
2755
2756 case nir_op_insert_u16: {
2757 assert(comps == 1 && "should be scalarized");
2758 unsigned half = nir_alu_src_as_uint(instr->src[1]);
2759 assert(half == 0 || half == 1);
2760
2761 if (half == 0)
2762 bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
2763 else
2764 bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
2765 break;
2766 }
2767
2768 case nir_op_ishl:
2769 bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2770 break;
2771 case nir_op_ushr:
2772 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
2773 break;
2774
2775 case nir_op_ishr:
2776 if (b->shader->arch >= 9)
2777 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
2778 else
2779 bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2780 break;
2781
2782 case nir_op_imin:
2783 case nir_op_umin:
2784 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2785 s1, BI_CMPF_LT);
2786 break;
2787
2788 case nir_op_imax:
2789 case nir_op_umax:
2790 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2791 s1, BI_CMPF_GT);
2792 break;
2793
2794 case nir_op_f2f32:
2795 bi_f16_to_f32_to(b, dst, s0);
2796 break;
2797
2798 case nir_op_fquantize2f16: {
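      /* Quantize by a round trip through fp16, flushing subnormals on both
       * conversions so values too small for fp16 become zero as required.
       */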
2799 bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
2800 bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
2801
2802 f16->ftz = f32->ftz = true;
2803 break;
2804 }
2805
2806 case nir_op_f2i32:
2807 if (src_sz == 32)
2808 bi_f32_to_s32_to(b, dst, s0);
2809 else
2810 bi_f16_to_s32_to(b, dst, s0);
2811 break;
2812
2813 /* Note 32-bit sources => no vectorization, so 32-bit works */
2814 case nir_op_f2u16:
2815 if (src_sz == 32)
2816 bi_f32_to_u32_to(b, dst, s0);
2817 else
2818 bi_v2f16_to_v2u16_to(b, dst, s0);
2819 break;
2820
2821 case nir_op_f2i16:
2822 if (src_sz == 32)
2823 bi_f32_to_s32_to(b, dst, s0);
2824 else
2825 bi_v2f16_to_v2s16_to(b, dst, s0);
2826 break;
2827
2828 case nir_op_f2u32:
2829 if (src_sz == 32)
2830 bi_f32_to_u32_to(b, dst, s0);
2831 else
2832 bi_f16_to_u32_to(b, dst, s0);
2833 break;
2834
2835 case nir_op_u2f16:
2836 if (src_sz == 32)
2837 bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
2838 else if (src_sz == 16)
2839 bi_v2u16_to_v2f16_to(b, dst, s0);
2840 else if (src_sz == 8)
2841 bi_v2u8_to_v2f16_to(b, dst, s0);
2842 break;
2843
2844 case nir_op_u2f32:
2845 if (src_sz == 32)
2846 bi_u32_to_f32_to(b, dst, s0);
2847 else if (src_sz == 16)
2848 bi_u16_to_f32_to(b, dst, s0);
2849 else
2850 bi_u8_to_f32_to(b, dst, s0);
2851 break;
2852
2853 case nir_op_i2f16:
2854 if (src_sz == 32)
2855 bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
2856 else if (src_sz == 16)
2857 bi_v2s16_to_v2f16_to(b, dst, s0);
2858 else if (src_sz == 8)
2859 bi_v2s8_to_v2f16_to(b, dst, s0);
2860 break;
2861
2862 case nir_op_i2f32:
2863 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2864
2865 if (src_sz == 32)
2866 bi_s32_to_f32_to(b, dst, s0);
2867 else if (src_sz == 16)
2868 bi_s16_to_f32_to(b, dst, s0);
2869 else if (src_sz == 8)
2870 bi_s8_to_f32_to(b, dst, s0);
2871 break;
2872
2873 case nir_op_i2i32:
2874 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2875
2876 if (src_sz == 32)
2877 bi_mov_i32_to(b, dst, s0);
2878 else if (src_sz == 16)
2879 bi_s16_to_s32_to(b, dst, s0);
2880 else if (src_sz == 8)
2881 bi_s8_to_s32_to(b, dst, s0);
2882 break;
2883
2884 case nir_op_u2u32:
2885 assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2886
2887 if (src_sz == 32)
2888 bi_mov_i32_to(b, dst, s0);
2889 else if (src_sz == 16)
2890 bi_u16_to_u32_to(b, dst, s0);
2891 else if (src_sz == 8)
2892 bi_u8_to_u32_to(b, dst, s0);
2893
2894 break;
2895
2896 case nir_op_i2i16:
2897 assert(src_sz == 8 || src_sz == 32);
2898
2899 if (src_sz == 8)
2900 bi_v2s8_to_v2s16_to(b, dst, s0);
2901 else
2902 bi_mov_i32_to(b, dst, s0);
2903 break;
2904
2905 case nir_op_u2u16:
2906 assert(src_sz == 8 || src_sz == 32);
2907
2908 if (src_sz == 8)
2909 bi_v2u8_to_v2u16_to(b, dst, s0);
2910 else
2911 bi_mov_i32_to(b, dst, s0);
2912 break;
2913
2914 case nir_op_b2i8:
2915 case nir_op_b2i16:
2916 case nir_op_b2i32:
2917 bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0,
2918 BI_MUX_INT_ZERO);
2919 break;
2920
2921 case nir_op_ieq8:
2922 case nir_op_ine8:
2923 case nir_op_ilt8:
2924 case nir_op_ige8:
2925 case nir_op_ieq16:
2926 case nir_op_ine16:
2927 case nir_op_ilt16:
2928 case nir_op_ige16:
2929 case nir_op_ieq32:
2930 case nir_op_ine32:
2931 case nir_op_ilt32:
2932 case nir_op_ige32:
2933 bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
2934 BI_RESULT_TYPE_M1);
2935 break;
2936
2937 case nir_op_ult8:
2938 case nir_op_uge8:
2939 case nir_op_ult16:
2940 case nir_op_uge16:
2941 case nir_op_ult32:
2942 case nir_op_uge32:
2943 bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1,
2944 bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2945 break;
2946
2947 case nir_op_feq32:
2948 case nir_op_feq16:
2949 case nir_op_flt32:
2950 case nir_op_flt16:
2951 case nir_op_fge32:
2952 case nir_op_fge16:
2953 case nir_op_fneu32:
2954 case nir_op_fneu16:
2955 bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
2956 BI_RESULT_TYPE_M1);
2957 break;
2958
2959 case nir_op_fround_even:
2960 case nir_op_fceil:
2961 case nir_op_ffloor:
2962 case nir_op_ftrunc:
2963 bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
2964 break;
2965
2966 case nir_op_fmin:
2967 bi_fmin_to(b, sz, dst, s0, s1);
2968 break;
2969
2970 case nir_op_fmax:
2971 bi_fmax_to(b, sz, dst, s0, s1);
2972 break;
2973
2974 case nir_op_iadd:
2975 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
2976 break;
2977
2978 case nir_op_iadd_sat:
2979 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
2980 break;
2981
2982 case nir_op_uadd_sat:
2983 bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
2984 break;
2985
2986 case nir_op_ihadd:
2987 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
2988 break;
2989
2990 case nir_op_irhadd:
2991 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
2992 break;
2993
2994 case nir_op_uhadd:
2995 bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN);
2996 break;
2997
2998 case nir_op_urhadd:
2999 bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP);
3000 break;
3001
3002 case nir_op_ineg:
3003 bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
3004 break;
3005
3006 case nir_op_isub:
3007 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
3008 break;
3009
3010 case nir_op_isub_sat:
3011 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
3012 break;
3013
3014 case nir_op_usub_sat:
3015 bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
3016 break;
3017
3018 case nir_op_imul:
3019 bi_imul_to(b, sz, dst, s0, s1);
3020 break;
3021
3022 case nir_op_iabs:
3023 bi_iabs_to(b, sz, dst, s0);
3024 break;
3025
3026 case nir_op_iand:
3027 bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3028 break;
3029
3030 case nir_op_ior:
3031 bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3032 break;
3033
3034 case nir_op_ixor:
3035 bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3036 break;
3037
3038 case nir_op_inot:
3039 bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
3040 break;
3041
3042 case nir_op_frsq:
3043 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
3044 bi_lower_frsq_32(b, dst, s0);
3045 else
3046 bi_frsq_to(b, sz, dst, s0);
3047 break;
3048
3049 case nir_op_frcp:
3050 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
3051 bi_lower_frcp_32(b, dst, s0);
3052 else
3053 bi_frcp_to(b, sz, dst, s0);
3054 break;
3055
3056 case nir_op_uclz:
3057 bi_clz_to(b, sz, dst, s0, false);
3058 break;
3059
3060 case nir_op_bit_count:
3061 assert(sz == 32 && src_sz == 32 && "should've been lowered");
3062 bi_popcount_i32_to(b, dst, s0);
3063 break;
3064
3065 case nir_op_bitfield_reverse:
3066 assert(sz == 32 && src_sz == 32 && "should've been lowered");
3067 bi_bitrev_i32_to(b, dst, s0);
3068 break;
3069
3070 case nir_op_ufind_msb: {
3071 bi_index clz = bi_clz(b, src_sz, s0, false);
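      /* find_msb == (bit size - 1) - clz, e.g. a 32-bit input with its MSB at
       * bit 27 has clz == 4, giving 31 - 4 = 27 */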
3072
3073 if (sz == 8)
3074 clz = bi_byte(clz, 0);
3075 else if (sz == 16)
3076 clz = bi_half(clz, false);
3077
3078 bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
3079 break;
3080 }
3081
3082 default:
3083 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
3084 unreachable("Unknown ALU op");
3085 }
3086 }
3087
3088 /* Returns the texture dimension, with 0 special-casing cubemaps. Shamelessly
3089 * copied from Midgard */
3090 static unsigned
3091 bifrost_tex_format(enum glsl_sampler_dim dim)
3092 {
3093 switch (dim) {
3094 case GLSL_SAMPLER_DIM_1D:
3095 case GLSL_SAMPLER_DIM_BUF:
3096 return 1;
3097
3098 case GLSL_SAMPLER_DIM_2D:
3099 case GLSL_SAMPLER_DIM_MS:
3100 case GLSL_SAMPLER_DIM_EXTERNAL:
3101 case GLSL_SAMPLER_DIM_RECT:
3102 case GLSL_SAMPLER_DIM_SUBPASS:
3103 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3104 return 2;
3105
3106 case GLSL_SAMPLER_DIM_3D:
3107 return 3;
3108
3109 case GLSL_SAMPLER_DIM_CUBE:
3110 return 0;
3111
3112 default:
3113 DBG("Unknown sampler dim type\n");
3114 assert(0);
3115 return 0;
3116 }
3117 }
3118
3119 static enum bi_dimension
3120 valhall_tex_dimension(enum glsl_sampler_dim dim)
3121 {
3122 switch (dim) {
3123 case GLSL_SAMPLER_DIM_1D:
3124 case GLSL_SAMPLER_DIM_BUF:
3125 return BI_DIMENSION_1D;
3126
3127 case GLSL_SAMPLER_DIM_2D:
3128 case GLSL_SAMPLER_DIM_MS:
3129 case GLSL_SAMPLER_DIM_EXTERNAL:
3130 case GLSL_SAMPLER_DIM_RECT:
3131 case GLSL_SAMPLER_DIM_SUBPASS:
3132 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3133 return BI_DIMENSION_2D;
3134
3135 case GLSL_SAMPLER_DIM_3D:
3136 return BI_DIMENSION_3D;
3137
3138 case GLSL_SAMPLER_DIM_CUBE:
3139 return BI_DIMENSION_CUBE;
3140
3141 default:
3142 unreachable("Unknown sampler dim type");
3143 }
3144 }
3145
3146 static enum bifrost_texture_format_full
3147 bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
3148 {
3149 switch (T) {
3150 case nir_type_float16:
3151 return BIFROST_TEXTURE_FORMAT_F16 + clamp;
3152 case nir_type_float32:
3153 return BIFROST_TEXTURE_FORMAT_F32 + clamp;
3154 case nir_type_uint16:
3155 return BIFROST_TEXTURE_FORMAT_U16;
3156 case nir_type_int16:
3157 return BIFROST_TEXTURE_FORMAT_S16;
3158 case nir_type_uint32:
3159 return BIFROST_TEXTURE_FORMAT_U32;
3160 case nir_type_int32:
3161 return BIFROST_TEXTURE_FORMAT_S32;
3162 default:
3163 unreachable("Invalid type for texturing");
3164 }
3165 }
3166
3167 /* Array indices are specified as 32-bit uints and need to be converted; they
3168 * arrive in the .z component from NIR */
3169 static bi_index
3170 bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
3171 {
3172 /* For (u)int we can just passthrough */
3173 nir_alu_type base = nir_alu_type_get_base_type(T);
3174 if (base == nir_type_int || base == nir_type_uint)
3175 return idx;
3176
3177 /* Otherwise we convert */
3178 assert(T == nir_type_float32);
3179
3180 /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
3181 * Texel Selection") defines the layer to be taken from clamp(RNE(r),
3182 * 0, dt - 1). So we use round RTE, clamping is handled at the data
3183 * structure level */
3184
3185 bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
3186 I->round = BI_ROUND_NONE;
3187 return I->dest[0];
3188 }
3189
3190 /* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
3191 * 16-bit 8:8 fixed-point format. We lower as:
3192 *
3193 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
3194 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
3195 */
3196
3197 static bi_index
3198 bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
3199 {
3200 /* Precompute for constant LODs to avoid general constant folding */
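   /* e.g. a constant LOD of 2.5 becomes 2.5 * 256 = 640 = 0x0280 in 8:8
    * fixed point */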
3201 if (lod.type == BI_INDEX_CONSTANT) {
3202 uint32_t raw = lod.value;
3203 float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
3204 int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
3205 return bi_imm_u32(s32 & 0xFFFF);
3206 }
3207
3208 /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
3209 * the max LOD (16 since we cap at 2^16 texture dimensions), and
3210 * preferably small to minimize precision loss */
3211 const float max_lod = 16.0;
3212
3213 bi_instr *fsat =
3214 bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? bi_half(lod, false) : lod,
3215 bi_imm_f32(1.0f / max_lod), bi_negzero());
3216
3217 fsat->clamp = BI_CLAMP_CLAMP_M1_1;
3218
3219 bi_index fmul =
3220 bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero());
3221
3222 return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false),
3223 bi_imm_u16(0));
3224 }
3225
3226 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
3227 * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
3228 * TODO: Cube face.
3229 */
3230
3231 static bi_index
3232 bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
3233 {
3234 return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
3235 }
3236
3237 /* The hardware specifies texel offsets and multisample indices together as a
3238 * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
3239 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
3240 * the bits we need and return that to be passed as a staging register. Else we
3241 * return 0 to avoid allocating a data register when everything is zero. */
3242
3243 static bi_index
3244 bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
3245 {
3246 bi_index dest = bi_zero();
3247
3248 int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3249 if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3250 nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3251 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3252 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3253 dest = bi_mkvec_v4i8(
3254 b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3255 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3256 (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
3257 bi_imm_u8(0));
3258 }
3259
3260 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3261 if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3262 nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3263 dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest,
3264 bi_imm_u8(24));
3265 }
3266
3267 return dest;
3268 }
3269
3270 /*
3271 * Valhall specifies texel offsets, multisample indices, and (for
3272 * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
3273 * component is either offset.z or multisample index depending on context. Build
3274 * this register.
3275 */
3276 static bi_index
3277 bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
3278 {
3279 bi_index dest = bi_zero();
3280
3281 int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3282 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3283 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3284
3285 /* Components 0-2: offsets */
3286 if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3287 nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3288 unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3289 bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3290
3291 /* No multisample index with 3D */
3292 assert((nr <= 2) || (ms_idx < 0));
3293
3294 /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
3295 bi_index z = (nr > 2)
3296 ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
3297 bi_imm_u8(0), bi_zero())
3298 : bi_zero();
3299
3300 dest = bi_mkvec_v2i8(
3301 b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3302 (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z);
3303 }
3304
3305 /* Component 2: multisample index */
3306 if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3307 nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3308 dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src));
3309 }
3310
3311 /* Component 3: 8-bit LOD */
3312 if (lod_idx >= 0 &&
3313 (!nir_src_is_const(instr->src[lod_idx].src) ||
3314 nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
3315 nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
3316 dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest,
3317 bi_imm_u8(24));
3318 }
3319
3320 return dest;
3321 }
3322
3323 static void
3324 bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s,
3325 bi_index *t)
3326 {
3327 /* Compute max { |x|, |y|, |z| } */
3328 bi_index maxxyz = bi_temp(b->shader);
3329 *face = bi_temp(b->shader);
3330
3331 bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1),
3332 cz = bi_extract(b, coord, 2);
3333
3334 /* Use a pseudo op on Bifrost due to tuple restrictions */
3335 if (b->shader->arch <= 8) {
3336 bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
3337 } else {
3338 bi_cubeface1_to(b, maxxyz, cx, cy, cz);
3339 bi_cubeface2_v9_to(b, *face, cx, cy, cz);
3340 }
3341
3342 /* Select coordinates */
3343 bi_index ssel =
3344 bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
3345 bi_index tsel =
3346 bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face);
3347
3348 /* The OpenGL ES specification requires us to transform an input vector
3349 * (x, y, z) to the coordinate, given the selected S/T:
3350 *
3351 * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
3352 *
3353 * We implement (s shown, t similar) in a form friendlier to FMA
3354 * instructions, and clamp coordinates at the end for correct
3355 * NaN/infinity handling:
3356 *
3357 * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
3358 *
3359 * Take the reciprocal of max{x, y, z}
3360 */
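/* As a quick sanity check on the FMA form: for s = ±max{x, y, z} the
 * expression evaluates to fsat(0.5 ± 0.5), i.e. 0 or 1, so the transformed
 * coordinates land in [0, 1] at the face edges as required.
 */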
3361 bi_index rcp = bi_frcp_f32(b, maxxyz);
3362
3363 /* Calculate 0.5 * (1.0 / max{x, y, z}) */
3364 bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
3365
3366 /* Transform the coordinates */
3367 *s = bi_temp(b->shader);
3368 *t = bi_temp(b->shader);
3369
3370 bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
3371 bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
3372
3373 S->clamp = BI_CLAMP_CLAMP_0_1;
3374 T->clamp = BI_CLAMP_CLAMP_0_1;
3375 }
3376
3377 /* Emits a cube map descriptor, returning the lower 32 bits and putting the
3378 * upper 32 bits in the passed pointer t. The packing of the face with the S
3379 * coordinate exploits the redundancy of the floating-point encoding together
3380 * with the range restriction of the CUBEFACE output.
3381 *
3382 * struct cube_map_descriptor {
3383 * float s : 29;
3384 * unsigned face : 3;
3385 * float t : 32;
3386 * }
3387 *
3388 * Since the cube face index is preshifted, this is easy to pack with a bitwise
3389 * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
3390 * bits from face.
3391 */
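/* Rough sketch of the packing below, assuming BI_MUX_BIT selects s where the
 * mask bit is set and face elsewhere:
 *
 *    lo32 = (s & BITFIELD_MASK(29)) | (face & ~BITFIELD_MASK(29))
 *
 * with the face index already shifted into bits [31:29] by CUBEFACE.
 */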
3392
3393 static bi_index
3394 bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
3395 {
3396 bi_index face, s;
3397 bi_emit_cube_coord(b, coord, &face, &s, t);
3398 bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
3399 return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
3400 }
3401
3402 /* Map to the main texture op used. Some of these (txd in particular) will
3403 * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
3404 * sequence). We assume that lowering is handled elsewhere.
3405 */
3406
3407 static enum bifrost_tex_op
3408 bi_tex_op(nir_texop op)
3409 {
3410 switch (op) {
3411 case nir_texop_tex:
3412 case nir_texop_txb:
3413 case nir_texop_txl:
3414 case nir_texop_txd:
3415 return BIFROST_TEX_OP_TEX;
3416 case nir_texop_txf:
3417 case nir_texop_txf_ms:
3418 case nir_texop_tg4:
3419 return BIFROST_TEX_OP_FETCH;
3420 case nir_texop_txs:
3421 case nir_texop_lod:
3422 case nir_texop_query_levels:
3423 case nir_texop_texture_samples:
3424 case nir_texop_samples_identical:
3425 unreachable("should've been lowered");
3426 default:
3427 unreachable("unsupported tex op");
3428 }
3429 }
3430
3431 /* Data registers required by texturing in the order they appear. All are
3432 * optional, the texture operation descriptor determines which are present.
3433 * Note that since 3D arrays are not permitted at an API level, Z_COORD and
3434 * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
3435
3436 enum bifrost_tex_dreg {
3437 BIFROST_TEX_DREG_Z_COORD = 0,
3438 BIFROST_TEX_DREG_Y_DELTAS = 1,
3439 BIFROST_TEX_DREG_LOD = 2,
3440 BIFROST_TEX_DREG_GRDESC_HI = 3,
3441 BIFROST_TEX_DREG_SHADOW = 4,
3442 BIFROST_TEX_DREG_ARRAY = 5,
3443 BIFROST_TEX_DREG_OFFSETMS = 6,
3444 BIFROST_TEX_DREG_SAMPLER = 7,
3445 BIFROST_TEX_DREG_TEXTURE = 8,
3446 BIFROST_TEX_DREG_COUNT,
3447 };
3448
3449 static void
3450 bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
3451 {
3452 struct bifrost_texture_operation desc = {
3453 .op = bi_tex_op(instr->op),
3454 .offset_or_bias_disable = false, /* TODO */
3455 .shadow_or_clamp_disable = instr->is_shadow,
3456 .array = instr->is_array,
3457 .dimension = bifrost_tex_format(instr->sampler_dim),
3458 .format = bi_texture_format(instr->dest_type | instr->def.bit_size,
3459 BI_CLAMP_NONE), /* TODO */
3460 .mask = 0xF,
3461 };
3462
3463 switch (desc.op) {
3464 case BIFROST_TEX_OP_TEX:
3465 desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
3466 break;
3467 case BIFROST_TEX_OP_FETCH:
3468 desc.lod_or_fetch = (enum bifrost_lod_mode)(
3469 instr->op == nir_texop_tg4
3470 ? BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component
3471 : BIFROST_TEXTURE_FETCH_TEXEL);
3472 break;
3473 default:
3474 unreachable("texture op unsupported");
3475 }
3476
3477 /* 32-bit indices to be allocated as consecutive staging registers */
3478 bi_index dregs[BIFROST_TEX_DREG_COUNT] = {};
3479 bi_index cx = bi_null(), cy = bi_null();
3480
3481 for (unsigned i = 0; i < instr->num_srcs; ++i) {
3482 bi_index index = bi_src_index(&instr->src[i].src);
3483 unsigned sz = nir_src_bit_size(instr->src[i].src);
3484 unsigned components = nir_src_num_components(instr->src[i].src);
3485 ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
3486 nir_alu_type T = base | sz;
3487
3488 switch (instr->src[i].src_type) {
3489 case nir_tex_src_coord:
3490 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3491 cx = bi_emit_texc_cube_coord(b, index, &cy);
3492 } else {
3493 /* Copy XY (for 2D+) or XX (for 1D) */
3494 cx = bi_extract(b, index, 0);
3495 cy = bi_extract(b, index, MIN2(1, components - 1));
3496
3497 assert(components >= 1 && components <= 3);
3498
3499 if (components == 3 && !desc.array) {
3500 /* 3D */
3501 dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2);
3502 }
3503 }
3504
3505 if (desc.array) {
3506 dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index(
3507 b, bi_extract(b, index, components - 1), T);
3508 }
3509
3510 break;
3511
3512 case nir_tex_src_lod:
3513 if (desc.op == BIFROST_TEX_OP_TEX &&
3514 nir_src_is_const(instr->src[i].src) &&
3515 nir_src_as_uint(instr->src[i].src) == 0) {
3516 desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
3517 } else if (desc.op == BIFROST_TEX_OP_TEX) {
3518 assert(base == nir_type_float);
3519
3520 assert(sz == 16 || sz == 32);
3521 dregs[BIFROST_TEX_DREG_LOD] =
3522 bi_emit_texc_lod_88(b, index, sz == 16);
3523 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3524 } else {
3525 assert(desc.op == BIFROST_TEX_OP_FETCH);
3526 assert(base == nir_type_uint || base == nir_type_int);
3527 assert(sz == 16 || sz == 32);
3528
3529 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index);
3530 }
3531
3532 break;
3533
3534 case nir_tex_src_bias:
3535 /* Upper 16-bits interpreted as a clamp, leave zero */
3536 assert(desc.op == BIFROST_TEX_OP_TEX);
3537 assert(base == nir_type_float);
3538 assert(sz == 16 || sz == 32);
3539 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3540 desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
3541 break;
3542
3543 case nir_tex_src_ms_index:
3544 case nir_tex_src_offset:
3545 if (desc.offset_or_bias_disable)
3546 break;
3547
3548 dregs[BIFROST_TEX_DREG_OFFSETMS] =
3549 bi_emit_texc_offset_ms_index(b, instr);
3550 if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
3551 desc.offset_or_bias_disable = true;
3552 break;
3553
3554 case nir_tex_src_comparator:
3555 dregs[BIFROST_TEX_DREG_SHADOW] = index;
3556 break;
3557
3558 case nir_tex_src_texture_offset:
3559 dregs[BIFROST_TEX_DREG_TEXTURE] = index;
3560 break;
3561
3562 case nir_tex_src_sampler_offset:
3563 dregs[BIFROST_TEX_DREG_SAMPLER] = index;
3564 break;
3565
3566 default:
3567 unreachable("Unhandled src type in texc emit");
3568 }
3569 }
3570
3571 if (desc.op == BIFROST_TEX_OP_FETCH &&
3572 bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
3573 dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero());
3574 }
3575
3576 /* Choose an index mode */
3577
3578 bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
3579 bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
3580 bool direct = direct_tex && direct_samp;
3581
3582 desc.immediate_indices =
3583 direct && (instr->sampler_index < 16 && instr->texture_index < 128);
3584
3585 if (desc.immediate_indices) {
3586 desc.sampler_index_or_mode = instr->sampler_index;
3587 desc.index = instr->texture_index;
3588 } else {
3589 unsigned mode = 0;
3590
3591 if (direct && instr->sampler_index == instr->texture_index &&
3592 instr->sampler_index < 128) {
3593 mode = BIFROST_INDEX_IMMEDIATE_SHARED;
3594 desc.index = instr->texture_index;
3595 } else if (direct && instr->sampler_index < 128) {
3596 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3597 desc.index = instr->sampler_index;
3598 dregs[BIFROST_TEX_DREG_TEXTURE] =
3599 bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3600 } else if (direct_tex && instr->texture_index < 128) {
3601 mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
3602 desc.index = instr->texture_index;
3603
3604 if (direct_samp) {
3605 dregs[BIFROST_TEX_DREG_SAMPLER] =
3606 bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
3607 }
3608 } else if (direct_samp && instr->sampler_index < 128) {
3609 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3610 desc.index = instr->sampler_index;
3611
3612 if (direct_tex) {
3613 dregs[BIFROST_TEX_DREG_TEXTURE] =
3614 bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3615 }
3616 } else {
3617 mode = BIFROST_INDEX_REGISTER;
3618
3619 if (direct_tex) {
3620 dregs[BIFROST_TEX_DREG_TEXTURE] =
3621 bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3622 }
3623
3624 if (direct_samp) {
3625 dregs[BIFROST_TEX_DREG_SAMPLER] =
3626 bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
3627 }
3628 }
3629
3630 mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
3631 desc.sampler_index_or_mode = mode;
3632 }
3633
3634 /* Allocate staging registers contiguously by compacting the array. */
3635 unsigned sr_count = 0;
3636
3637 for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
3638 if (!bi_is_null(dregs[i]))
3639 dregs[sr_count++] = dregs[i];
3640 }
3641
3642 unsigned res_size = instr->def.bit_size == 16 ? 2 : 4;
3643
3644 bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
3645 bi_index dst = bi_temp(b->shader);
3646
3647 if (sr_count)
3648 bi_emit_collect_to(b, sr, dregs, sr_count);
3649
3650 uint32_t desc_u = 0;
3651 memcpy(&desc_u, &desc, sizeof(desc_u));
3652 bi_instr *I =
3653 bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc_u),
3654 !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
3655 I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
3656
3657 bi_index w[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3658 bi_emit_split_i32(b, w, dst, res_size);
3659 bi_emit_collect_to(b, bi_def_index(&instr->def), w,
3660 DIV_ROUND_UP(instr->def.num_components * res_size, 4));
3661 }
3662
3663 /* Staging registers required by texturing in the order they appear (Valhall) */
3664
3665 enum valhall_tex_sreg {
3666 VALHALL_TEX_SREG_X_COORD = 0,
3667 VALHALL_TEX_SREG_Y_COORD = 1,
3668 VALHALL_TEX_SREG_Z_COORD = 2,
3669 VALHALL_TEX_SREG_Y_DELTAS = 3,
3670 VALHALL_TEX_SREG_ARRAY = 4,
3671 VALHALL_TEX_SREG_SHADOW = 5,
3672 VALHALL_TEX_SREG_OFFSETMS = 6,
3673 VALHALL_TEX_SREG_LOD = 7,
3674 VALHALL_TEX_SREG_GRDESC = 8,
3675 VALHALL_TEX_SREG_COUNT,
3676 };
3677
3678 static void
3679 bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
3680 {
3681 bool explicit_offset = false;
3682 enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
3683
3684 bool has_lod_mode = (instr->op == nir_texop_tex) ||
3685 (instr->op == nir_texop_txl) ||
3686 (instr->op == nir_texop_txb);
3687
3688 /* 32-bit indices to be allocated as consecutive staging registers */
3689 bi_index sregs[VALHALL_TEX_SREG_COUNT] = {};
3690 bi_index sampler = bi_imm_u32(instr->sampler_index);
3691 bi_index texture = bi_imm_u32(instr->texture_index);
3692
3693 for (unsigned i = 0; i < instr->num_srcs; ++i) {
3694 bi_index index = bi_src_index(&instr->src[i].src);
3695 unsigned sz = nir_src_bit_size(instr->src[i].src);
3696
3697 switch (instr->src[i].src_type) {
3698 case nir_tex_src_coord: {
3699 unsigned components =
3700 nir_src_num_components(instr->src[i].src) - instr->is_array;
3701
3702 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3703 sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord(
3704 b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]);
3705 } else {
3706 assert(components >= 1 && components <= 3);
3707
3708 /* Copy XY (for 2D+) or XX (for 1D) */
3709 sregs[VALHALL_TEX_SREG_X_COORD] = index;
3710
3711 if (components >= 2)
3712 sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
3713
3714 if (components == 3)
3715 sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2);
3716 }
3717
3718 if (instr->is_array) {
3719 sregs[VALHALL_TEX_SREG_ARRAY] = bi_extract(b, index, components);
3720 }
3721
3722 break;
3723 }
3724
3725 case nir_tex_src_lod:
3726 if (nir_src_is_const(instr->src[i].src) &&
3727 nir_src_as_uint(instr->src[i].src) == 0) {
3728 lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
3729 } else if (has_lod_mode) {
3730 lod_mode = BI_VA_LOD_MODE_EXPLICIT;
3731
3732 assert(sz == 16 || sz == 32);
3733 sregs[VALHALL_TEX_SREG_LOD] =
3734 bi_emit_texc_lod_88(b, index, sz == 16);
3735 }
3736 break;
3737
3738 case nir_tex_src_bias:
3739 /* Upper 16-bits interpreted as a clamp, leave zero */
3740 assert(sz == 16 || sz == 32);
3741 sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3742
3743 lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
3744 break;
3745 case nir_tex_src_ms_index:
3746 case nir_tex_src_offset:
3747 /* Handled below */
3748 break;
3749
3750 case nir_tex_src_comparator:
3751 sregs[VALHALL_TEX_SREG_SHADOW] = index;
3752 break;
3753
3754 case nir_tex_src_texture_offset:
3755 /* This should always be 0 as lower_index_to_offset is expected to be
3756 * set */
3757 assert(instr->texture_index == 0);
3758 texture = index;
3759 break;
3760
3761 case nir_tex_src_sampler_offset:
3762 /* This should always be 0 as lower_index_to_offset is expected to be
3763 * set */
3764 assert(instr->sampler_index == 0);
3765 sampler = index;
3766 break;
3767
3768 default:
3769 unreachable("Unhandled src type in tex emit");
3770 }
3771 }
3772
3773 /* Generate packed offset + ms index + LOD register. These default to
3774 * zero so we only need to encode if these features are actually in use.
3775 */
3776 bi_index offsets = bi_emit_valhall_offsets(b, instr);
3777
3778 if (!bi_is_equiv(offsets, bi_zero())) {
3779 sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
3780 explicit_offset = true;
3781 }
3782
3783 /* Allocate staging registers contiguously by compacting the array. */
3784 unsigned sr_count = 0;
3785
3786 for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
3787 if (!bi_is_null(sregs[i]))
3788 sregs[sr_count++] = sregs[i];
3789 }
3790
3791 bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
3792
3793 if (sr_count)
3794 bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
3795
3796 bool narrow_indices = va_is_valid_const_narrow_index(texture) &&
3797 va_is_valid_const_narrow_index(sampler);
3798
3799 bi_index src0;
3800 bi_index src1;
3801
3802 if (narrow_indices) {
3803 unsigned tex_set =
3804 va_res_fold_table_idx(pan_res_handle_get_table(texture.value));
3805 unsigned sampler_set =
3806 va_res_fold_table_idx(pan_res_handle_get_table(sampler.value));
3807 unsigned texture_index = pan_res_handle_get_index(texture.value);
3808 unsigned sampler_index = pan_res_handle_get_index(sampler.value);
3809
3810 unsigned packed_handle = (tex_set << 27) | (texture_index << 16) |
3811 (sampler_set << 11) | sampler_index;
3812
3813 src0 = bi_imm_u32(packed_handle);
3814
3815 /* TODO: narrow offsetms */
3816 src1 = bi_zero();
3817 } else {
3818 src0 = sampler;
3819 src1 = texture;
3820 }
3821
3822 /* Only write the components that we actually read */
3823 unsigned mask = nir_def_components_read(&instr->def);
3824 unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
3825 unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
3826
3827 enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
3828 enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
3829 bi_index dest = bi_temp(b->shader);
3830
3831 switch (instr->op) {
3832 case nir_texop_tex:
3833 case nir_texop_txl:
3834 case nir_texop_txb:
3835 bi_tex_single_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
3836 instr->is_shadow, explicit_offset, lod_mode,
3837 !narrow_indices, mask, sr_count);
3838 break;
3839 case nir_texop_txf:
3840 case nir_texop_txf_ms:
3841 bi_tex_fetch_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
3842 explicit_offset, !narrow_indices, mask, sr_count);
3843 break;
3844 case nir_texop_tg4:
3845 bi_tex_gather_to(b, dest, idx, src0, src1, instr->is_array, dim,
3846 instr->component, false, regfmt, instr->is_shadow,
3847 explicit_offset, !narrow_indices, mask, sr_count);
3848 break;
3849 default:
3850 unreachable("Unhandled Valhall texture op");
3851 }
3852
3853 /* The hardware will write only what we read, and it will write into
3854 * contiguous registers without gaps (different from Bifrost). NIR
3855 * expects the gaps, so fill in the holes (they'll be copypropped and
3856 * DCE'd away later).
3857 */
3858 bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3859
3860 bi_emit_cached_split_i32(b, dest, res_size);
3861
3862 /* Index into the packed component array */
3863 unsigned j = 0;
3864 unsigned comps[4] = {0};
3865 unsigned nr_components = instr->def.num_components;
3866
3867 for (unsigned i = 0; i < nr_components; ++i) {
3868 if (mask & BITFIELD_BIT(i)) {
3869 unpacked[i] = dest;
3870 comps[i] = j++;
3871 } else {
3872 unpacked[i] = bi_zero();
3873 }
3874 }
3875
3876 bi_make_vec_to(b, bi_def_index(&instr->def), unpacked, comps,
3877 instr->def.num_components, instr->def.bit_size);
3878 }
3879
3880 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
3881 * textures with sufficiently small immediate indices. Anything else
3882 * needs a complete texture op. */
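/* For example, a plain texture2D()-style sample with texture/sampler indices
 * below 8 (below 4 for cubes, given the narrower field) qualifies for
 * TEXS_2D/TEXS_CUBE, while shadow comparison, arrays, offsets, or a nonzero
 * LOD force the full TEXC path.
 */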
3883
3884 static void
3885 bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
3886 {
3887 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
3888 assert(coord_idx >= 0);
3889 bi_index coords = bi_src_index(&instr->src[coord_idx].src);
3890
3891 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3892 bi_index face, s, t;
3893 bi_emit_cube_coord(b, coords, &face, &s, &t);
3894
3895 bi_texs_cube_to(b, instr->def.bit_size, bi_def_index(&instr->def), s, t,
3896 face, instr->sampler_index, instr->texture_index);
3897 } else {
3898 bi_texs_2d_to(b, instr->def.bit_size, bi_def_index(&instr->def),
3899 bi_extract(b, coords, 0), bi_extract(b, coords, 1),
3900 instr->op != nir_texop_tex, /* zero LOD */
3901 instr->sampler_index, instr->texture_index);
3902 }
3903
3904 bi_split_def(b, &instr->def);
3905 }
3906
3907 static bool
3908 bi_is_simple_tex(nir_tex_instr *instr)
3909 {
3910 if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
3911 return false;
3912
3913 if (instr->dest_type != nir_type_float32 &&
3914 instr->dest_type != nir_type_float16)
3915 return false;
3916
3917 if (instr->is_shadow || instr->is_array)
3918 return false;
3919
3920 switch (instr->sampler_dim) {
3921 case GLSL_SAMPLER_DIM_2D:
3922 case GLSL_SAMPLER_DIM_EXTERNAL:
3923 case GLSL_SAMPLER_DIM_RECT:
3924 break;
3925
3926 case GLSL_SAMPLER_DIM_CUBE:
3927 /* LOD can't be specified with TEXS_CUBE */
3928 if (instr->op == nir_texop_txl)
3929 return false;
3930 break;
3931
3932 default:
3933 return false;
3934 }
3935
3936 for (unsigned i = 0; i < instr->num_srcs; ++i) {
3937 if (instr->src[i].src_type != nir_tex_src_lod &&
3938 instr->src[i].src_type != nir_tex_src_coord)
3939 return false;
3940 }
3941
3942 /* Indices need to fit in provided bits */
3943 unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
3944 if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
3945 return false;
3946
3947 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3948 if (lod_idx < 0)
3949 return true;
3950
3951 nir_src lod = instr->src[lod_idx].src;
3952 return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
3953 }
3954
3955 static void
3956 bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
3957 {
3958 /* If txf is used, we assume there is a valid sampler bound at index 0. Use
3959 * it for txf operations, since there may be no other valid samplers. This is
3960 * a workaround: txf does not require a sampler in NIR (so sampler_index is
3961 * undefined) but we need one in the hardware. This is ABI with the driver.
3962 *
3963 * On Valhall, as the descriptor table is encoded in the index, this should
3964 * be handled by the driver.
3965 */
3966 if (!nir_tex_instr_need_sampler(instr) && b->shader->arch < 9)
3967 instr->sampler_index = 0;
3968
3969 if (b->shader->arch >= 9)
3970 bi_emit_tex_valhall(b, instr);
3971 else if (bi_is_simple_tex(instr))
3972 bi_emit_texs(b, instr);
3973 else
3974 bi_emit_texc(b, instr);
3975 }
3976
3977 static void
3978 bi_emit_phi(bi_builder *b, nir_phi_instr *instr)
3979 {
3980 unsigned nr_srcs = exec_list_length(&instr->srcs);
3981 bi_instr *I = bi_phi_to(b, bi_def_index(&instr->def), nr_srcs);
3982
3983 /* Deferred */
3984 I->phi = instr;
3985 }
3986
3987 /* Look up the bi_block corresponding to a given NIR block. Used when
3988 * translating phi nodes after emitting all blocks.
3989 */
3990 static bi_block *
3991 bi_from_nir_block(bi_context *ctx, nir_block *block)
3992 {
3993 return ctx->indexed_nir_blocks[block->index];
3994 }
3995
3996 static void
3997 bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I)
3998 {
3999 nir_phi_instr *phi = I->phi;
4000
4001 /* Guaranteed by lower_phis_to_scalar */
4002 assert(phi->def.num_components == 1);
4003
4004 nir_foreach_phi_src(src, phi) {
4005 bi_block *pred = bi_from_nir_block(ctx, src->pred);
4006 unsigned i = bi_predecessor_index(block, pred);
4007 assert(i < I->nr_srcs);
4008
4009 I->src[i] = bi_src_index(&src->src);
4010 }
4011
4012 I->phi = NULL;
4013 }
4014
4015 static void
4016 bi_emit_phis_deferred(bi_context *ctx)
4017 {
4018 bi_foreach_block(ctx, block) {
4019 bi_foreach_instr_in_block(block, I) {
4020 if (I->op == BI_OPCODE_PHI)
4021 bi_emit_phi_deferred(ctx, block, I);
4022 }
4023 }
4024 }
4025
4026 static void
4027 bi_emit_instr(bi_builder *b, struct nir_instr *instr)
4028 {
4029 switch (instr->type) {
4030 case nir_instr_type_load_const:
4031 bi_emit_load_const(b, nir_instr_as_load_const(instr));
4032 break;
4033
4034 case nir_instr_type_intrinsic:
4035 bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
4036 break;
4037
4038 case nir_instr_type_alu:
4039 bi_emit_alu(b, nir_instr_as_alu(instr));
4040 break;
4041
4042 case nir_instr_type_tex:
4043 bi_emit_tex(b, nir_instr_as_tex(instr));
4044 break;
4045
4046 case nir_instr_type_jump:
4047 bi_emit_jump(b, nir_instr_as_jump(instr));
4048 break;
4049
4050 case nir_instr_type_phi:
4051 bi_emit_phi(b, nir_instr_as_phi(instr));
4052 break;
4053
4054 default:
4055 unreachable("should've been lowered");
4056 }
4057 }
4058
4059 static bi_block *
4060 create_empty_block(bi_context *ctx)
4061 {
4062 bi_block *blk = rzalloc(ctx, bi_block);
4063
4064 util_dynarray_init(&blk->predecessors, blk);
4065
4066 return blk;
4067 }
4068
4069 static bi_block *
4070 emit_block(bi_context *ctx, nir_block *block)
4071 {
4072 if (ctx->after_block) {
4073 ctx->current_block = ctx->after_block;
4074 ctx->after_block = NULL;
4075 } else {
4076 ctx->current_block = create_empty_block(ctx);
4077 }
4078
4079 list_addtail(&ctx->current_block->link, &ctx->blocks);
4080 list_inithead(&ctx->current_block->instructions);
4081
4082 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4083
4084 ctx->indexed_nir_blocks[block->index] = ctx->current_block;
4085
4086 nir_foreach_instr(instr, block) {
4087 bi_emit_instr(&_b, instr);
4088 }
4089
4090 return ctx->current_block;
4091 }
4092
4093 static void
4094 emit_if(bi_context *ctx, nir_if *nif)
4095 {
4096 bi_block *before_block = ctx->current_block;
4097
4098 /* Speculatively emit the branch, but we can't fill it in until later */
4099 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4100 bi_instr *then_branch =
4101 bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false),
4102 bi_zero(), BI_CMPF_EQ);
4103
4104 /* Emit the two subblocks. */
4105 bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
4106 bi_block *end_then_block = ctx->current_block;
4107
4108 /* Emit second block */
4109
4110 bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
4111 bi_block *end_else_block = ctx->current_block;
4112 ctx->after_block = create_empty_block(ctx);
4113
4114 /* Now that we have the subblocks emitted, fix up the branches */
4115
4116 assert(then_block);
4117 assert(else_block);
4118
4119 then_branch->branch_target = else_block;
4120
4121 /* Emit a jump from the end of the then block to the end of the else */
4122 _b.cursor = bi_after_block(end_then_block);
4123 bi_instr *then_exit = bi_jump(&_b, bi_zero());
4124 then_exit->branch_target = ctx->after_block;
4125
4126 bi_block_add_successor(end_then_block, then_exit->branch_target);
4127 bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
4128
4129 bi_block_add_successor(before_block,
4130 then_branch->branch_target); /* then_branch */
4131 bi_block_add_successor(before_block, then_block); /* fallthrough */
4132 }
4133
4134 static void
4135 emit_loop(bi_context *ctx, nir_loop *nloop)
4136 {
4137 assert(!nir_loop_has_continue_construct(nloop));
4138
4139 /* Remember where we are */
4140 bi_block *start_block = ctx->current_block;
4141
4142 bi_block *saved_break = ctx->break_block;
4143 bi_block *saved_continue = ctx->continue_block;
4144
4145 ctx->continue_block = create_empty_block(ctx);
4146 ctx->break_block = create_empty_block(ctx);
4147 ctx->after_block = ctx->continue_block;
4148 ctx->after_block->loop_header = true;
4149
4150 /* Emit the body itself */
4151 emit_cf_list(ctx, &nloop->body);
4152
4153 /* Branch back to the loop header */
4154 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4155 bi_instr *I = bi_jump(&_b, bi_zero());
4156 I->branch_target = ctx->continue_block;
4157 bi_block_add_successor(start_block, ctx->continue_block);
4158 bi_block_add_successor(ctx->current_block, ctx->continue_block);
4159
4160 ctx->after_block = ctx->break_block;
4161
4162 /* Pop off */
4163 ctx->break_block = saved_break;
4164 ctx->continue_block = saved_continue;
4165 ++ctx->loop_count;
4166 }
4167
4168 static bi_block *
4169 emit_cf_list(bi_context *ctx, struct exec_list *list)
4170 {
4171 bi_block *start_block = NULL;
4172
4173 foreach_list_typed(nir_cf_node, node, node, list) {
4174 switch (node->type) {
4175 case nir_cf_node_block: {
4176 bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
4177
4178 if (!start_block)
4179 start_block = block;
4180
4181 break;
4182 }
4183
4184 case nir_cf_node_if:
4185 emit_if(ctx, nir_cf_node_as_if(node));
4186 break;
4187
4188 case nir_cf_node_loop:
4189 emit_loop(ctx, nir_cf_node_as_loop(node));
4190 break;
4191
4192 default:
4193 unreachable("Unknown control flow");
4194 }
4195 }
4196
4197 return start_block;
4198 }
4199
4200 /* shader-db stuff */
4201
4202 struct bi_stats {
4203 unsigned nr_clauses, nr_tuples, nr_ins;
4204 unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
4205 };
4206
4207 static void
4208 bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
4209 {
4210 /* Count instructions */
4211 stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
4212
4213 /* Non-message passing tuples are always arithmetic */
4214 if (tuple->add != clause->message) {
4215 stats->nr_arith++;
4216 return;
4217 }
4218
4219 /* Message + FMA we'll count as arithmetic _and_ message */
4220 if (tuple->fma)
4221 stats->nr_arith++;
4222
4223 switch (clause->message_type) {
4224 case BIFROST_MESSAGE_VARYING:
4225 /* Count components interpolated */
4226 stats->nr_varying +=
4227 (clause->message->vecsize + 1) *
4228 (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
4229 break;
4230
4231 case BIFROST_MESSAGE_VARTEX:
4232 /* 2 coordinates, fp32 each */
4233 stats->nr_varying += (2 * 2);
4234 FALLTHROUGH;
4235 case BIFROST_MESSAGE_TEX:
4236 stats->nr_texture++;
4237 break;
4238
4239 case BIFROST_MESSAGE_ATTRIBUTE:
4240 case BIFROST_MESSAGE_LOAD:
4241 case BIFROST_MESSAGE_STORE:
4242 case BIFROST_MESSAGE_ATOMIC:
4243 stats->nr_ldst++;
4244 break;
4245
4246 case BIFROST_MESSAGE_NONE:
4247 case BIFROST_MESSAGE_BARRIER:
4248 case BIFROST_MESSAGE_BLEND:
4249 case BIFROST_MESSAGE_TILE:
4250 case BIFROST_MESSAGE_Z_STENCIL:
4251 case BIFROST_MESSAGE_ATEST:
4252 case BIFROST_MESSAGE_JOB:
4253 case BIFROST_MESSAGE_64BIT:
4254 /* Nothing to do */
4255 break;
4256 };
4257 }
4258
4259 /*
4260 * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
4261 * shader completes. These costs are not accounted for in the general cycle
4262 * counts, so this function calculates the effective cost of these messages, as
4263 * if they were executed by shader code.
4264 */
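/* For example, an enabled VAR_TEX preload costs 12 units (0.75 cycles at 16
 * units per cycle), while preloading a vec4 fp32 varying costs 4 * 2 = 8
 * units (0.5 cycles).
 */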
4265 static unsigned
4266 bi_count_preload_cost(bi_context *ctx)
4267 {
4268 /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
4269 * 16 fp16 varying components per cycle or fetch two texels per cycle.
4270 */
4271 unsigned cost = 0;
4272
4273 for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
4274 struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
4275
4276 if (msg.enabled && msg.texture) {
4277 /* 2 coordinates, 2 half-words each, plus texture */
4278 cost += 12;
4279 } else if (msg.enabled) {
4280 cost += (msg.num_components * (msg.fp16 ? 1 : 2));
4281 }
4282 }
4283
4284 return cost;
4285 }
4286
4287 static const char *
4288 bi_shader_stage_name(bi_context *ctx)
4289 {
4290 if (ctx->idvs == BI_IDVS_VARYING)
4291 return "MESA_SHADER_VARYING";
4292 else if (ctx->idvs == BI_IDVS_POSITION)
4293 return "MESA_SHADER_POSITION";
4294 else if (ctx->inputs->is_blend)
4295 return "MESA_SHADER_BLEND";
4296 else
4297 return gl_shader_stage_name(ctx->stage);
4298 }
4299
4300 static char *
4301 bi_print_stats(bi_context *ctx, unsigned size)
4302 {
4303 struct bi_stats stats = {0};
4304
4305 /* Count instructions, clauses, and tuples. Also attempt to construct
4306 * normalized execution engine cycle counts, using the following ratio:
4307 *
4308 * 24 arith tuples/cycle
4309 * 2 texture messages/cycle
4310 * 16 x 16-bit varying channels interpolated/cycle
4311 * 1 load store message/cycle
4312 *
4313 * These numbers seem to match Arm Mobile Studio's heuristic. The real
4314 * cycle counts are surely more complicated.
4315 */
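/* Worked example with these ratios: 48 arithmetic tuples, 3 texture messages
 * and 8 load/store messages give 48/24 = 2.0 arith cycles, 3/2 = 1.5 texture
 * cycles and 8/1 = 8.0 load/store cycles, so the shader would be reported as
 * load/store bound at 8.0 cycles.
 */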
4316
4317 bi_foreach_block(ctx, block) {
4318 bi_foreach_clause_in_block(block, clause) {
4319 stats.nr_clauses++;
4320 stats.nr_tuples += clause->tuple_count;
4321
4322 for (unsigned i = 0; i < clause->tuple_count; ++i)
4323 bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
4324 }
4325 }
4326
4327 float cycles_arith = ((float)stats.nr_arith) / 24.0;
4328 float cycles_texture = ((float)stats.nr_texture) / 2.0;
4329 float cycles_varying = ((float)stats.nr_varying) / 16.0;
4330 float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
4331
4332 float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
4333 float cycles_bound = MAX2(cycles_arith, cycles_message);
4334
4335 /* Thread count and register pressure are traded off only on v7 */
4336 bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
4337 unsigned nr_threads = full_threads ? 2 : 1;
4338
4339 /* Dump stats */
4340 char *str = ralloc_asprintf(
4341 NULL,
4342 "%s shader: "
4343 "%u inst, %u tuples, %u clauses, "
4344 "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
4345 "%u quadwords, %u threads",
4346 bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
4347 stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
4348 cycles_varying, cycles_ldst, size / 16, nr_threads);
4349
4350 if (ctx->arch == 7) {
4351 ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
4352 }
4353
4354 ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
4355 ctx->loop_count, ctx->spills, ctx->fills);
4356
4357 return str;
4358 }
4359
4360 static char *
4361 va_print_stats(bi_context *ctx, unsigned size)
4362 {
4363 unsigned nr_ins = 0;
4364 struct va_stats stats = {0};
4365
4366 /* Count instructions */
4367 bi_foreach_instr_global(ctx, I) {
4368 nr_ins++;
4369 va_count_instr_stats(I, &stats);
4370 }
4371
4372 /* Mali G78 peak performance:
4373 *
4374 * 64 FMA instructions per cycle
4375 * 64 CVT instructions per cycle
4376 * 16 SFU instructions per cycle
4377 * 8 x 32-bit varying channels interpolated per cycle
4378 * 4 texture instructions per cycle
4379 * 1 load/store operation per cycle
4380 */
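/* Worked example with these ratios: 128 FMA, 32 CVT, 8 SFU, 4 texture and 6
 * load/store instructions give 2.0, 0.5, 0.5, 1.0 and 6.0 cycles
 * respectively, so the bound is 6.0 cycles (load/store limited).
 */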
4381
4382 float cycles_fma = ((float)stats.fma) / 64.0;
4383 float cycles_cvt = ((float)stats.cvt) / 64.0;
4384 float cycles_sfu = ((float)stats.sfu) / 16.0;
4385 float cycles_v = ((float)stats.v) / 16.0;
4386 float cycles_t = ((float)stats.t) / 4.0;
4387 float cycles_ls = ((float)stats.ls) / 1.0;
4388
4389 /* Calculate the bound */
4390 float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
4391 MAX3(cycles_v, cycles_t, cycles_ls));
4392
4393 /* Thread count and register pressure are traded off */
4394 unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
4395
4396 /* Dump stats */
4397 return ralloc_asprintf(NULL,
4398 "%s shader: "
4399 "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
4400 "%f t, %f ls, %u quadwords, %u threads, %u loops, "
4401 "%u:%u spills:fills",
4402 bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
4403 cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
4404 size / 16, nr_threads, ctx->loop_count, ctx->spills,
4405 ctx->fills);
4406 }
4407
4408 static int
4409 glsl_type_size(const struct glsl_type *type, bool bindless)
4410 {
4411 return glsl_count_attribute_slots(type, false);
4412 }
4413
4414 /* Split stores to memory. We don't split stores to vertex outputs, since
4415 * nir_lower_io_to_temporaries will ensure there's only a single write.
4416 */
4417
4418 static bool
4419 should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
4420 {
4421 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4422
4423 switch (intr->intrinsic) {
4424 case nir_intrinsic_store_ssbo:
4425 case nir_intrinsic_store_shared:
4426 case nir_intrinsic_store_global:
4427 case nir_intrinsic_store_scratch:
4428 return true;
4429 default:
4430 return false;
4431 }
4432 }
4433
4434 /*
4435 * Some operations are only available as 32-bit instructions. 64-bit floats are
4436 * unsupported and ints are lowered with nir_lower_int64. Certain 8-bit and
4437 * 16-bit instructions, however, are lowered here.
4438 */
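/* For example, a 16-bit fsin is handled by converting the source to 32-bit,
 * performing the operation at 32-bit, and converting the result back, which
 * is what nir_lower_bit_size does with the returned bit size of 32.
 */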
4439 static unsigned
4440 bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
4441 {
4442 if (instr->type != nir_instr_type_alu)
4443 return 0;
4444
4445 nir_alu_instr *alu = nir_instr_as_alu(instr);
4446
4447 switch (alu->op) {
4448 case nir_op_fexp2:
4449 case nir_op_flog2:
4450 case nir_op_fpow:
4451 case nir_op_fsin:
4452 case nir_op_fcos:
4453 case nir_op_bit_count:
4454 case nir_op_bitfield_reverse:
4455 return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32;
4456 default:
4457 return 0;
4458 }
4459 }
4460
4461 /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
4462 * transcendentals are an exception. Shifts are also excluded because of a
4463 * lane size mismatch (8-bit in Bifrost, 32-bit in NIR; TODO - workaround!).
4464 * Some conversions need to be scalarized due to type size. */
4465
4466 static uint8_t
4467 bi_vectorize_filter(const nir_instr *instr, const void *data)
4468 {
4469 /* Defaults work for everything else */
4470 if (instr->type != nir_instr_type_alu)
4471 return 0;
4472
4473 const nir_alu_instr *alu = nir_instr_as_alu(instr);
4474
4475 switch (alu->op) {
4476 case nir_op_frcp:
4477 case nir_op_frsq:
4478 case nir_op_ishl:
4479 case nir_op_ishr:
4480 case nir_op_ushr:
4481 case nir_op_f2i16:
4482 case nir_op_f2u16:
4483 case nir_op_extract_u8:
4484 case nir_op_extract_i8:
4485 case nir_op_extract_u16:
4486 case nir_op_extract_i16:
4487 case nir_op_insert_u16:
4488 return 1;
4489 default:
4490 break;
4491 }
4492
4493 /* Vectorized instructions cannot write more than 32 bits */
4494 int dst_bit_size = alu->def.bit_size;
4495 if (dst_bit_size == 16)
4496 return 2;
4497 else
4498 return 1;
4499 }
4500
4501 static bool
4502 bi_scalarize_filter(const nir_instr *instr, const void *data)
4503 {
4504 if (instr->type != nir_instr_type_alu)
4505 return false;
4506
4507 const nir_alu_instr *alu = nir_instr_as_alu(instr);
4508
4509 switch (alu->op) {
4510 case nir_op_pack_uvec2_to_uint:
4511 case nir_op_pack_uvec4_to_uint:
4512 return false;
4513 default:
4514 return true;
4515 }
4516 }
4517
4518 /* Ensure we write exactly 4 components */
4519 static nir_def *
4520 bifrost_nir_valid_channel(nir_builder *b, nir_def *in, unsigned channel,
4521 unsigned first, unsigned mask)
4522 {
4523 if (!(mask & BITFIELD_BIT(channel)))
4524 channel = first;
4525
4526 return nir_channel(b, in, channel);
4527 }
4528
4529 /* Lower fragment store_output instructions to always write 4 components,
4530 * matching the hardware semantic. This may require additional moves. Skipping
4531 * these moves is possible in theory, but invokes undefined behaviour in the
4532 * compiler. The DDK inserts these moves, so we will as well. */
4533
4534 static bool
4535 bifrost_nir_lower_blend_components(struct nir_builder *b,
4536 nir_intrinsic_instr *intr, void *data)
4537 {
4538 if (intr->intrinsic != nir_intrinsic_store_output)
4539 return false;
4540
4541 nir_def *in = intr->src[0].ssa;
4542 unsigned first = nir_intrinsic_component(intr);
4543 unsigned mask = nir_intrinsic_write_mask(intr);
4544
4545 assert(first == 0 && "shouldn't get nonzero components");
4546
4547 /* Nothing to do */
4548 if (mask == BITFIELD_MASK(4))
4549 return false;
4550
4551 b->cursor = nir_before_instr(&intr->instr);
4552
4553 /* Replicate the first valid component instead */
4554 nir_def *replicated =
4555 nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
4556 bifrost_nir_valid_channel(b, in, 1, first, mask),
4557 bifrost_nir_valid_channel(b, in, 2, first, mask),
4558 bifrost_nir_valid_channel(b, in, 3, first, mask));
4559
4560 /* Rewrite to use our replicated version */
4561 nir_src_rewrite(&intr->src[0], replicated);
4562 nir_intrinsic_set_component(intr, 0);
4563 nir_intrinsic_set_write_mask(intr, 0xF);
4564 intr->num_components = 4;
4565
4566 return true;
4567 }
4568
4569 static nir_mem_access_size_align
4570 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
4571 uint8_t bit_size, uint32_t align_mul,
4572 uint32_t align_offset, bool offset_is_const,
4573 const void *cb_data)
4574 {
4575 uint32_t align = nir_combined_align(align_mul, align_offset);
4576 assert(util_is_power_of_two_nonzero(align));
4577
4578 /* No more than 16 bytes at a time. */
4579 bytes = MIN2(bytes, 16);
4580
4581 /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's
4582 * a multiple of 2, use 16-bit loads. Else use 8-bit loads.
4583 *
4584 * But if we're only aligned to 1 byte, use 8-bit loads. If we're only
4585 * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to
4586 * the size.
4587 */
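/* A few examples of the rules above: 12 bytes at 4-byte alignment become a
 * 3 x 32-bit access, 6 bytes at 2-byte alignment become 3 x 16-bit, and 5
 * bytes fall back to 4 x 8-bit, with the remaining byte left for a follow-up
 * access generated by the lowering pass.
 */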
4588 if ((bytes & 1) || (align == 1))
4589 bit_size = 8;
4590 else if ((bytes & 2) || (align == 2))
4591 bit_size = 16;
4592 else if (bit_size >= 32)
4593 bit_size = 32;
4594
4595 unsigned num_comps = MIN2(bytes / (bit_size / 8), 4);
4596
4597 /* Push constants require 32-bit loads. */
4598 if (intrin == nir_intrinsic_load_push_constant) {
4599 if (align_mul >= 4) {
4600 /* If align_mul is at least 4, we can use align_offset to find
4601 * the exact number of words we need to read.
4602 */
4603 num_comps = DIV_ROUND_UP((align_offset % 4) + bytes, 4);
4604 } else {
4605 /* Even if the size is a multiple of 4 bytes, the unknown offset
4606 * alignment means the access might straddle one extra word at the
4607 * beginning and one at the end. If the size is not 32-bit aligned,
4608 * the two extra words also cover the size misalignment.
4609 */
4610 num_comps = (bytes / 4) + 2;
4611 }
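/* For instance, an 8-byte load with align_mul = 16 and align_offset = 6
 * needs DIV_ROUND_UP((6 % 4) + 8, 4) = 3 words, while the same load with
 * align_mul < 4 conservatively reads 8 / 4 + 2 = 4 words.
 */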
4612
4613 bit_size = MIN2(bit_size, 32);
4614 }
4615
4616 return (nir_mem_access_size_align){
4617 .num_components = num_comps,
4618 .bit_size = bit_size,
4619 .align = bit_size / 8,
4620 };
4621 }
4622
4623 static bool
4624 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
4625 unsigned num_components, nir_intrinsic_instr *low,
4626 nir_intrinsic_instr *high, void *data)
4627 {
4628 /* Must be aligned to the size of the load */
4629 unsigned align = nir_combined_align(align_mul, align_offset);
4630 if ((bit_size / 8) > align)
4631 return false;
4632
4633 if (num_components > 4)
4634 return false;
4635
4636 if (bit_size > 32)
4637 return false;
4638
4639 return true;
4640 }
4641
4642 static void
4643 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4644 {
4645 NIR_PASS_V(nir, nir_opt_shrink_stores, true);
4646
4647 bool progress;
4648
4649 do {
4650 progress = false;
4651
4652 NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
4653 NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
4654
4655 NIR_PASS(progress, nir, nir_copy_prop);
4656 NIR_PASS(progress, nir, nir_opt_remove_phis);
4657 NIR_PASS(progress, nir, nir_opt_dce);
4658 NIR_PASS(progress, nir, nir_opt_dead_cf);
4659 NIR_PASS(progress, nir, nir_opt_cse);
4660 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
4661 NIR_PASS(progress, nir, nir_opt_algebraic);
4662 NIR_PASS(progress, nir, nir_opt_constant_folding);
4663
4664 NIR_PASS(progress, nir, nir_opt_undef);
4665 NIR_PASS(progress, nir, nir_lower_undef_to_zero);
4666
4667 NIR_PASS(progress, nir, nir_opt_shrink_vectors, false);
4668 NIR_PASS(progress, nir, nir_opt_loop_unroll);
4669 } while (progress);
4670
4671 NIR_PASS(
4672 progress, nir, nir_opt_load_store_vectorize,
4673 &(const nir_load_store_vectorize_options){
4674 .modes = nir_var_mem_global | nir_var_mem_shared | nir_var_shader_temp,
4675 .callback = mem_vectorize_cb,
4676 });
4677 NIR_PASS(progress, nir, nir_lower_pack);
4678
4679 /* TODO: Why is 64-bit getting rematerialized?
4680 * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
4681 NIR_PASS(progress, nir, nir_lower_int64);
4682
4683 /* We need to clean up after each iteration of late algebraic
4684 * optimizations, since otherwise NIR can produce weird edge cases
4685 * (like fneg of a constant) which we don't handle */
4686 bool late_algebraic = true;
4687 while (late_algebraic) {
4688 late_algebraic = false;
4689 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4690 NIR_PASS(progress, nir, nir_opt_constant_folding);
4691 NIR_PASS(progress, nir, nir_copy_prop);
4692 NIR_PASS(progress, nir, nir_opt_dce);
4693 NIR_PASS(progress, nir, nir_opt_cse);
4694 }
4695
4696 /* This opt currently helps on Bifrost but not Valhall */
4697 if (gpu_id < 0x9000)
4698 NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
4699
4700 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4701 NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
4702 NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
4703
4704 /* Prepass to simplify instruction selection */
4705 late_algebraic = false;
4706 NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
4707
4708 while (late_algebraic) {
4709 late_algebraic = false;
4710 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4711 NIR_PASS(progress, nir, nir_opt_constant_folding);
4712 NIR_PASS(progress, nir, nir_copy_prop);
4713 NIR_PASS(progress, nir, nir_opt_dce);
4714 NIR_PASS(progress, nir, nir_opt_cse);
4715 }
4716
4717 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
4718 NIR_PASS(progress, nir, nir_opt_dce);
4719
4720 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4721 NIR_PASS_V(nir, nir_shader_intrinsics_pass,
4722 bifrost_nir_lower_blend_components,
4723 nir_metadata_control_flow, NULL);
4724 }
4725
4726 /* Backend scheduler is purely local, so do some global optimizations
4727 * to reduce register pressure. */
4728 nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
4729 nir_move_load_input | nir_move_comparisons |
4730 nir_move_copies | nir_move_load_ssbo;
4731
4732 NIR_PASS_V(nir, nir_opt_sink, move_all);
4733 NIR_PASS_V(nir, nir_opt_move, move_all);
4734
4735 /* We might lower attribute, varying, and image indirects. Use the
4736 * gathered info to skip the extra analysis in the happy path. */
4737 bool any_indirects = nir->info.inputs_read_indirectly ||
4738 nir->info.outputs_accessed_indirectly ||
4739 nir->info.patch_inputs_read_indirectly ||
4740 nir->info.patch_outputs_accessed_indirectly ||
4741 nir->info.images_used[0];
4742
4743 if (any_indirects) {
4744 nir_convert_to_lcssa(nir, true, true);
4745 NIR_PASS_V(nir, nir_divergence_analysis);
4746 NIR_PASS_V(nir, bi_lower_divergent_indirects,
4747 pan_subgroup_size(pan_arch(gpu_id)));
4748 }
4749 }
4750
4751 static void
4752 bi_opt_post_ra(bi_context *ctx)
4753 {
4754 bi_foreach_instr_global_safe(ctx, ins) {
4755 if (ins->op == BI_OPCODE_MOV_I32 &&
4756 bi_is_equiv(ins->dest[0], ins->src[0]))
4757 bi_remove_instruction(ins);
4758 }
4759 }
4760
4761 /* Dead code elimination for branches at the end of a block - only one branch
4762 * per block is legal semantically, but unreachable jumps can be generated.
4763 * Likewise on Bifrost we can generate jumps to the terminal block which need
4764 * to be lowered away to a jump to #0x0, which induces successful termination.
4765 * That trick doesn't work on Valhall, which needs a NOP inserted in the
4766 * terminal block instead.
4767 */
4768 static void
4769 bi_lower_branch(bi_context *ctx, bi_block *block)
4770 {
4771 bool cull_terminal = (ctx->arch <= 8);
4772 bool branched = false;
4773
4774 bi_foreach_instr_in_block_safe(block, ins) {
4775 if (!ins->branch_target)
4776 continue;
4777
4778 if (branched) {
4779 bi_remove_instruction(ins);
4780 continue;
4781 }
4782
4783 branched = true;
4784
4785 if (!bi_is_terminal_block(ins->branch_target))
4786 continue;
4787
4788 if (cull_terminal)
4789 ins->branch_target = NULL;
4790 else if (ins->branch_target)
4791 ins->branch_target->needs_nop = true;
4792 }
4793 }
4794
4795 static void
4796 bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
4797 {
4798 unsigned final_clause = bi_pack(ctx, binary);
4799
4800 /* If we need to wait for ATEST or BLEND in the first clause, pass the
4801 * corresponding bits through to the renderer state descriptor */
4802 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
4803 bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
4804
4805 unsigned first_deps = first_clause ? first_clause->dependencies : 0;
4806 ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
4807 ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
4808
4809 /* Pad the shader with enough zero bytes to trick the prefetcher,
4810 * unless we're compiling an empty shader (in which case we don't pad
4811 * so the size remains 0) */
4812 unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
4813
4814 if (binary->size - offset) {
4815 memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0,
4816 prefetch_size);
4817 }
4818 }
4819
4820 /*
4821 * Build a bit mask of varyings (by location) that are flatshaded. This
4822 * information is needed by lower_mediump_io, as we don't yet support 16-bit
4823 * flat varyings.
4824 *
4825 * Also varyings that are used as texture coordinates should be kept at fp32 so
4826 * the texture instruction may be promoted to VAR_TEX. In general this is a good
4827 * idea, as fp16 texture coordinates are not supported by the hardware and are
4828 * usually inappropriate. (There are both relevant CTS bugs here, even.)
4829 *
4830 * TODO: If we compacted the varyings with some fixup code in the vertex shader,
4831 * we could implement 16-bit flat varyings. Consider if this case matters.
4832 *
4833 * TODO: The texture coordinate handling could be less heavy-handed.
4834 */
4835 static bool
4836 bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
4837 {
4838 uint64_t *mask = data;
4839
4840 if (instr->type != nir_instr_type_tex)
4841 return false;
4842
4843 nir_tex_instr *tex = nir_instr_as_tex(instr);
4844
4845 int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
4846 if (coord_idx < 0)
4847 return false;
4848
4849 nir_src src = tex->src[coord_idx].src;
4850 nir_scalar x = nir_scalar_resolved(src.ssa, 0);
4851 nir_scalar y = nir_scalar_resolved(src.ssa, 1);
4852
4853 if (x.def != y.def)
4854 return false;
4855
4856 nir_instr *parent = x.def->parent_instr;
4857
4858 if (parent->type != nir_instr_type_intrinsic)
4859 return false;
4860
4861 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
4862
4863 if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
4864 return false;
4865
4866 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
4867 *mask |= BITFIELD64_BIT(sem.location);
4868 return false;
4869 }
4870
4871 static uint64_t
4872 bi_fp32_varying_mask(nir_shader *nir)
4873 {
4874 uint64_t mask = 0;
4875
4876 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4877
4878 nir_foreach_shader_in_variable(var, nir) {
4879 if (var->data.interpolation == INTERP_MODE_FLAT)
4880 mask |= BITFIELD64_BIT(var->data.location);
4881 }
4882
4883 nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all,
4884 &mask);
4885
4886 return mask;
4887 }
4888
4889 static bool
4890 bi_lower_sample_mask_writes(nir_builder *b, nir_intrinsic_instr *intr,
4891 void *data)
4892 {
4893 if (intr->intrinsic != nir_intrinsic_store_output)
4894 return false;
4895
4896 assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
4897 if (nir_intrinsic_io_semantics(intr).location != FRAG_RESULT_SAMPLE_MASK)
4898 return false;
4899
4900 b->cursor = nir_before_instr(&intr->instr);
4901
4902 nir_def *orig = nir_load_sample_mask(b);
4903
4904 nir_src_rewrite(&intr->src[0],
4905 nir_b32csel(b, nir_load_multisampled_pan(b),
4906 nir_iand(b, orig, intr->src[0].ssa), orig));
4907 return true;
4908 }
4909
4910 static bool
4911 bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr,
4912 UNUSED void *data)
4913 {
4914 if (intr->intrinsic != nir_intrinsic_load_output)
4915 return false;
4916
4917 unsigned loc = nir_intrinsic_io_semantics(intr).location;
4918 assert(loc >= FRAG_RESULT_DATA0);
4919 unsigned rt = loc - FRAG_RESULT_DATA0;
4920
4921 b->cursor = nir_before_instr(&intr->instr);
4922
4923 nir_def *conversion = nir_load_rt_conversion_pan(
4924 b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
4925
4926 nir_def *lowered = nir_load_converted_output_pan(
4927 b, intr->def.num_components, intr->def.bit_size, conversion,
4928 .dest_type = nir_intrinsic_dest_type(intr),
4929 .io_semantics = nir_intrinsic_io_semantics(intr));
4930
4931 nir_def_rewrite_uses(&intr->def, lowered);
4932 return true;
4933 }
4934
4935 bool
4936 bifrost_nir_lower_load_output(nir_shader *nir)
4937 {
4938 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4939
4940 return nir_shader_intrinsics_pass(
4941 nir, bi_lower_load_output,
4942 nir_metadata_control_flow, NULL);
4943 }
4944
4945 static bool
4946 bi_lower_load_push_const_with_dyn_offset(nir_builder *b,
4947 nir_intrinsic_instr *intr,
4948 UNUSED void *data)
4949 {
4950 if (intr->intrinsic != nir_intrinsic_load_push_constant)
4951 return false;
4952
4953 /* Offset is constant, nothing to do. */
4954 if (nir_src_is_const(intr->src[0]))
4955 return false;
4956
4957 /* nir_lower_mem_access_bit_sizes() should have lowered load_push_constant
4958 * to 32-bit and a maximum of 4 components.
4959 */
4960 assert(intr->def.num_components <= 4);
4961 assert(intr->def.bit_size == 32);
4962
4963 uint32_t base = nir_intrinsic_base(intr);
4964 uint32_t range = nir_intrinsic_range(intr);
4965 uint32_t nwords = intr->def.num_components;
4966
4967 b->cursor = nir_before_instr(&intr->instr);
4968
4969 /* Dynamic indexing is only allowed for Vulkan push constants, which are
4970 * currently limited to 256 bytes. That gives us a maximum of 64 32-bit
4971 * words to read from.
4972 */
4973 nir_def *lut[64] = {0};
4974
4975 assert(range / 4 <= ARRAY_SIZE(lut));
4976
4977 /* Load all words in the range. */
4978 for (uint32_t w = 0; w < range / 4; w++) {
4979 lut[w] = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0),
4980 .base = base + (w * 4), .range = 4);
4981 }
4982
4983 nir_def *index = intr->src[0].ssa;
4984
4985 /* Index is dynamic, so we need to iteratively CSEL the values based on
4986 * the index. We start with the highest bit in the index, and for each
4987 * iteration we divide the scope by two.
4988 */
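/* For a full 256-byte range and nwords = 1, this builds a selection tree over
 * 64 words: the first level tests (index & 128), the next (index & 64), and
 * so on down to (index & 4), i.e. log2(64) = 6 levels of CSELs before lut[0]
 * holds the selected word.
 */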
4989 for (uint32_t lut_sz = ARRAY_SIZE(lut); lut_sz > 0; lut_sz /= 2) {
4990 uint32_t stride = lut_sz / 2;
4991 nir_def *bit_test = NULL;
4992
4993 /* Stop once the LUT is no bigger than the number of words we're trying
4994 * to extract.
4995 */
4996 if (lut_sz <= nwords)
4997 break;
4998
4999 for (uint32_t i = 0; i < stride; i++) {
5000 /* We only need a CSEL if we have two values, otherwise we pick the
5001 * non-NULL value.
5002 */
5003 if (lut[i] && lut[i + stride]) {
5004 /* Create the test src on-demand. The stride is in 32-bit words,
5005 * multiply by four to convert it into a byte stride we can use
5006 * to test if the corresponding bit is set in the index src.
5007 */
5008 if (!bit_test)
5009 bit_test = nir_i2b(b, nir_iand_imm(b, index, stride * 4));
5010
5011 lut[i] = nir_bcsel(b, bit_test, lut[i + stride], lut[i]);
5012 } else if (lut[i + stride]) {
5013 lut[i] = lut[i + stride];
5014 }
5015 }
5016 }
5017
5018 nir_def *res = nir_vec(b, &lut[0], nwords);
5019
5020 nir_def_rewrite_uses(&intr->def, res);
5021 nir_instr_remove(&intr->instr);
5022 return true;
5023 }
5024
5025 void
5026 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
5027 {
5028 /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
5029 * (so we don't accidentally duplicate the epilogue since mesa/st has
5030 * messed with our I/O quite a bit already) */
5031
5032 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
5033
5034 if (nir->info.stage == MESA_SHADER_VERTEX) {
5035 NIR_PASS_V(nir, nir_lower_viewport_transform);
5036 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
5037
5038 nir_variable *psiz = nir_find_variable_with_location(
5039 nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
5040 if (psiz != NULL)
5041 psiz->data.precision = GLSL_PRECISION_MEDIUM;
5042 }
5043
5044 /* Get rid of any global vars before we lower to scratch. */
5045 NIR_PASS_V(nir, nir_lower_global_vars_to_local);
5046
5047 /* Valhall introduces packed thread local storage, which improves cache
5048 * locality of TLS access. However, access to packed TLS cannot
5049 * straddle 16-byte boundaries. As such, when packed TLS is in use
5050 * (currently unconditional for Valhall), we force vec4 alignment for
5051 * scratch access.
5052 */
5053 glsl_type_size_align_func vars_to_scratch_size_align_func =
5054 (gpu_id >= 0x9000) ? glsl_get_vec4_size_align_bytes
5055 : glsl_get_natural_size_align_bytes;
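   /* For intuition (illustrative, not a normative layout description): with
    * 16-byte alignment, a 12-byte vec3 temporary can never straddle a 16-byte
    * line, whereas the natural 4-byte alignment would allow it to.
    */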
5056 /* Lower large arrays to scratch and small arrays to bcsel */
5057 NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
5058 vars_to_scratch_size_align_func, vars_to_scratch_size_align_func);
5059 NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
5060
5061 NIR_PASS_V(nir, nir_split_var_copies);
5062 NIR_PASS_V(nir, nir_lower_var_copies);
5063 NIR_PASS_V(nir, nir_lower_vars_to_ssa);
5064 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
5065 glsl_type_size, 0);
5066
5067    /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
5068     * offsets it could figure out are constant. Do some constant folding
5069     * before pan_nir_lower_store_component below.
5070 */
5071 NIR_PASS_V(nir, nir_opt_constant_folding);
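   /* Illustration (made-up values): an offset left behind as
    * iadd(imul(2, 16), 0) folds down to the immediate 32, so the later
    * lowering sees a constant offset rather than an ALU chain.
    */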
5072
5073 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
5074 NIR_PASS_V(nir, nir_lower_mediump_io,
5075 nir_var_shader_in | nir_var_shader_out,
5076 ~bi_fp32_varying_mask(nir), false);
5077
5078 NIR_PASS_V(nir, nir_shader_intrinsics_pass, bi_lower_sample_mask_writes,
5079 nir_metadata_control_flow, NULL);
5080
5081 NIR_PASS_V(nir, bifrost_nir_lower_load_output);
5082 } else if (nir->info.stage == MESA_SHADER_VERTEX) {
5083 if (gpu_id >= 0x9000) {
5084 NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
5085 BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
5086 }
5087
5088 NIR_PASS_V(nir, pan_nir_lower_store_component);
5089 }
5090
5091 nir_lower_mem_access_bit_sizes_options mem_size_options = {
5092 .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
5093 nir_var_mem_constant | nir_var_mem_task_payload |
5094 nir_var_shader_temp | nir_var_function_temp |
5095 nir_var_mem_global | nir_var_mem_shared,
5096 .callback = mem_access_size_align_cb,
5097 };
5098 NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
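   /* Roughly what this buys us (the exact legal sizes and alignments come
    * from mem_access_size_align_cb): sub-32-bit loads are rewritten as wider
    * aligned loads plus extracts, and overly wide vector accesses are split,
    * so the backend only sees access sizes it can encode.
    */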
5099
5100 NIR_PASS_V(nir, nir_shader_intrinsics_pass,
5101 bi_lower_load_push_const_with_dyn_offset,
5102 nir_metadata_control_flow, NULL);
5103
5104 nir_lower_ssbo_options ssbo_opts = {
5105 .native_loads = pan_arch(gpu_id) >= 9,
5106 .native_offset = pan_arch(gpu_id) >= 9,
5107 };
5108 NIR_PASS_V(nir, nir_lower_ssbo, &ssbo_opts);
5109
5110 NIR_PASS_V(nir, pan_lower_sample_pos);
5111 NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
5112 NIR_PASS_V(nir, nir_lower_64bit_phis);
5113 NIR_PASS_V(nir, pan_lower_helper_invocation);
5114 NIR_PASS_V(nir, nir_lower_int64);
5115
5116 NIR_PASS_V(nir, nir_opt_idiv_const, 8);
5117 NIR_PASS_V(nir, nir_lower_idiv,
5118 &(nir_lower_idiv_options){.allow_fp16 = true});
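   /* Sketch of the intent (not the exact expansion): nir_opt_idiv_const turns
    * division by small constants, e.g. x / 7, into a multiply-by-reciprocal
    * plus shift sequence, and nir_lower_idiv expands the remaining integer
    * divisions into ALU operations the hardware actually has.
    */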
5119
5120 NIR_PASS_V(nir, nir_lower_tex,
5121 &(nir_lower_tex_options){
5122 .lower_txs_lod = true,
5123 .lower_txp = ~0,
5124 .lower_tg4_broadcom_swizzle = true,
5125 .lower_txd = true,
5126 .lower_invalid_implicit_lod = true,
5127 .lower_index_to_offset = true,
5128 });
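   /* A couple of the options above, informally: .lower_txp divides the
    * coordinates by the projector and emits a regular tex, while
    * .lower_txs_lod implements textureSize() at a nonzero LOD by querying
    * level 0 and shifting the result down.
    */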
5129
5130 NIR_PASS_V(nir, nir_lower_image_atomics_to_global);
5131
5132    /* On Bifrost, lower MSAA loads/stores to 3D loads/stores. */
5133 if (pan_arch(gpu_id) < 9)
5134 NIR_PASS_V(nir, pan_nir_lower_image_ms);
5135
5136 NIR_PASS_V(nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
5137 NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
5138 NIR_PASS_V(nir, nir_lower_phis_to_scalar, true);
5139 NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
5140 NIR_PASS_V(nir, nir_lower_var_copies);
5141 NIR_PASS_V(nir, nir_lower_alu);
5142 NIR_PASS_V(nir, nir_lower_frag_coord_to_pixel_coord);
5143 }
5144
5145 static bi_context *
5146 bi_compile_variant_nir(nir_shader *nir,
5147 const struct panfrost_compile_inputs *inputs,
5148 struct util_dynarray *binary, struct bi_shader_info info,
5149 enum bi_idvs_mode idvs)
5150 {
5151 bi_context *ctx = rzalloc(NULL, bi_context);
5152
5153    /* There may be another program in the dynarray; start at the end */
5154 unsigned offset = binary->size;
5155
5156 ctx->inputs = inputs;
5157 ctx->nir = nir;
5158 ctx->stage = nir->info.stage;
5159 ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
5160 ctx->arch = pan_arch(inputs->gpu_id);
5161 ctx->info = info;
5162 ctx->idvs = idvs;
5163 ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
5164
5165 if (idvs != BI_IDVS_NONE) {
5166       /* Specializing shaders for IDVS is destructive, so we need to
5167        * clone. However, the original NIR does not need to be preserved
5168        * when compiling the last (second) IDVS variant, so we can skip
5169        * cloning that one. */
5170 if (offset == 0)
5171 ctx->nir = nir = nir_shader_clone(ctx, nir);
5172
5173 NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_specialize_idvs,
5174 nir_metadata_control_flow, &idvs);
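      /* Conceptually (the exact mechanics live in bifrost_nir_specialize_idvs):
       * the position variant keeps only the stores feeding position and point
       * size, the varying variant keeps the remaining varying stores, and the
       * cleanup loop below removes whatever computation became dead in each
       * copy.
       */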
5175
5176 /* After specializing, clean up the mess */
5177 bool progress = true;
5178
5179 while (progress) {
5180 progress = false;
5181
5182 NIR_PASS(progress, nir, nir_opt_dce);
5183 NIR_PASS(progress, nir, nir_opt_dead_cf);
5184 }
5185 }
5186
5187 /* If nothing is pushed, all UBOs need to be uploaded */
5188 ctx->ubo_mask = ~0;
5189
5190 list_inithead(&ctx->blocks);
5191
5192 bool skip_internal = nir->info.internal;
5193 skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
5194
5195 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5196 nir_print_shader(nir, stdout);
5197 }
5198
5199 ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
5200
5201 nir_foreach_function_impl(impl, nir) {
5202 nir_index_blocks(impl);
5203
5204 ctx->indexed_nir_blocks =
5205 rzalloc_array(ctx, bi_block *, impl->num_blocks);
5206
5207 ctx->ssa_alloc += impl->ssa_alloc;
5208
5209 emit_cf_list(ctx, &impl->body);
5210 bi_emit_phis_deferred(ctx);
5211 break; /* TODO: Multi-function shaders */
5212 }
5213
5214 /* Index blocks now that we're done emitting */
5215 bi_foreach_block(ctx, block) {
5216 block->index = ctx->num_blocks++;
5217 }
5218
5219 bi_validate(ctx, "NIR -> BIR");
5220
5221 /* If the shader doesn't write any colour or depth outputs, it may
5222 * still need an ATEST at the very end! */
5223 bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) &&
5224 !ctx->emitted_atest && !bi_skip_atest(ctx, false);
5225
5226 if (need_dummy_atest) {
5227 bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
5228 bi_builder b = bi_init_builder(ctx, bi_after_block(end));
5229 bi_emit_atest(&b, bi_zero());
5230 }
5231
5232 bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
5233
5234 /* Runs before constant folding */
5235 bi_lower_swizzle(ctx);
5236 bi_validate(ctx, "Early lowering");
5237
5238 /* Runs before copy prop */
5239 if (optimize && !ctx->inputs->no_ubo_to_push) {
5240 bi_opt_push_ubo(ctx);
5241 }
5242
5243 if (likely(optimize)) {
5244 bi_opt_copy_prop(ctx);
5245
5246 while (bi_opt_constant_fold(ctx))
5247 bi_opt_copy_prop(ctx);
5248
5249 bi_opt_mod_prop_forward(ctx);
5250 bi_opt_mod_prop_backward(ctx);
5251
5252 /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
5253 * mod_prop_backward to fuse VAR_TEX */
5254 if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
5255 !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
5256 bi_opt_dce(ctx, false);
5257 bi_opt_message_preload(ctx);
5258 bi_opt_copy_prop(ctx);
5259 }
5260
5261 bi_opt_dce(ctx, false);
5262 bi_opt_cse(ctx);
5263 bi_opt_dce(ctx, false);
5264 if (!ctx->inputs->no_ubo_to_push)
5265 bi_opt_reorder_push(ctx);
5266 bi_validate(ctx, "Optimization passes");
5267 }
5268
5269 bi_lower_opt_instructions(ctx);
5270
5271 if (ctx->arch >= 9) {
5272 va_optimize(ctx);
5273 va_lower_isel(ctx);
5274
5275 bi_foreach_instr_global_safe(ctx, I) {
5276 /* Phis become single moves so shouldn't be affected */
5277 if (I->op == BI_OPCODE_PHI)
5278 continue;
5279
5280 va_lower_constants(ctx, I);
5281
5282 bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
5283 va_repair_fau(&b, I);
5284 }
5285
5286 /* We need to clean up after constant lowering */
5287 if (likely(optimize)) {
5288 bi_opt_cse(ctx);
5289 bi_opt_dce(ctx, false);
5290 }
5291
5292 bi_validate(ctx, "Valhall passes");
5293 }
5294
5295 bi_foreach_block(ctx, block) {
5296 bi_lower_branch(ctx, block);
5297 }
5298
5299 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5300 bi_print_shader(ctx, stdout);
5301
5302 /* Analyze before register allocation to avoid false dependencies. The
5303 * skip bit is a function of only the data flow graph and is invariant
5304 * under valid scheduling. Helpers are only defined for fragment
5305 * shaders, so this analysis is only required in fragment shaders.
5306 */
5307 if (ctx->stage == MESA_SHADER_FRAGMENT) {
5308 bi_opt_dce(ctx, false);
5309 bi_analyze_helper_requirements(ctx);
5310 }
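   /* In rough terms (the precise rules are in bi_analyze_helper_requirements):
    * an instruction must execute in helper invocations only if a
    * derivative-style operation can observe its result; everything else can
    * be marked so that helper invocations skip it.
    */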
5311
5312 /* Fuse TEXC after analyzing helper requirements so the analysis
5313 * doesn't have to know about dual textures */
5314 if (likely(optimize)) {
5315 bi_opt_fuse_dual_texture(ctx);
5316 }
5317
5318 /* Lower FAU after fusing dual texture, because fusing dual texture
5319 * creates new immediates that themselves may need lowering.
5320 */
5321 if (ctx->arch <= 8) {
5322 bi_lower_fau(ctx);
5323 }
5324
5325 /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
5326 if (likely(optimize)) {
5327 bi_opt_cse(ctx);
5328 bi_opt_dce(ctx, false);
5329 }
5330
5331 bi_validate(ctx, "Late lowering");
5332
5333 if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) {
5334 bi_pressure_schedule(ctx);
5335 bi_validate(ctx, "Pre-RA scheduling");
5336 }
5337
5338 bi_register_allocate(ctx);
5339
5340 if (likely(optimize))
5341 bi_opt_post_ra(ctx);
5342
5343 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5344 bi_print_shader(ctx, stdout);
5345
5346 if (ctx->arch >= 9) {
5347 va_assign_slots(ctx);
5348 va_insert_flow_control_nops(ctx);
5349 va_merge_flow(ctx);
5350 va_mark_last(ctx);
5351 } else {
5352 bi_schedule(ctx);
5353 bi_assign_scoreboard(ctx);
5354
5355       /* Analyze after scheduling since we depend on instruction
5356        * order. Valhall does this as part of va_insert_flow_control_nops,
5357        * as the handling of clauses differs from that of instructions.
5358 */
5359 bi_analyze_helper_terminate(ctx);
5360 bi_mark_clauses_td(ctx);
5361 }
5362
5363 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5364 bi_print_shader(ctx, stdout);
5365
5366 if (ctx->arch <= 8) {
5367 bi_pack_clauses(ctx, binary, offset);
5368 } else {
5369 bi_pack_valhall(ctx, binary);
5370 }
5371
5372 if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5373 if (ctx->arch <= 8) {
5374 disassemble_bifrost(stdout, binary->data + offset,
5375 binary->size - offset,
5376 bifrost_debug & BIFROST_DBG_VERBOSE);
5377 } else {
5378 disassemble_valhall(stdout, binary->data + offset,
5379 binary->size - offset,
5380 bifrost_debug & BIFROST_DBG_VERBOSE);
5381 }
5382
5383 fflush(stdout);
5384 }
5385
5386 if (!skip_internal &&
5387 ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
5388 char *shaderdb;
5389
5390 if (ctx->arch >= 9) {
5391 shaderdb = va_print_stats(ctx, binary->size - offset);
5392 } else {
5393 shaderdb = bi_print_stats(ctx, binary->size - offset);
5394 }
5395
5396 if (bifrost_debug & BIFROST_DBG_SHADERDB)
5397 fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
5398
5399 if (inputs->debug)
5400 util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
5401
5402 ralloc_free(shaderdb);
5403 }
5404
5405 return ctx;
5406 }
5407
5408 static void
5409 bi_compile_variant(nir_shader *nir,
5410 const struct panfrost_compile_inputs *inputs,
5411 struct util_dynarray *binary, struct pan_shader_info *info,
5412 enum bi_idvs_mode idvs)
5413 {
5414 struct bi_shader_info local_info = {
5415 .push = &info->push,
5416 .bifrost = &info->bifrost,
5417 .tls_size = info->tls_size,
5418 .push_offset = info->push.count,
5419 };
5420
5421 unsigned offset = binary->size;
5422
5423 /* If there is no position shader (gl_Position is not written), then
5424 * there is no need to build a varying shader either. This case is hit
5425     * for transform-feedback-only vertex shaders, which only make sense
5426     * with rasterizer discard.
5427 */
5428 if ((offset == 0) && (idvs == BI_IDVS_VARYING))
5429 return;
5430
5431 /* Software invariant: Only a secondary shader can appear at a nonzero
5432 * offset, to keep the ABI simple. */
5433 assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
5434
5435 bi_context *ctx =
5436 bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
5437
5438 /* A register is preloaded <==> it is live before the first block */
5439 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5440 uint64_t preload = first_block->reg_live_in;
5441
5442 /* If multisampling is used with a blend shader, the blend shader needs
5443 * to access the sample coverage mask in r60 and the sample ID in r61.
5444 * Blend shaders run in the same context as fragment shaders, so if a
5445 * blend shader could run, we need to preload these registers
5446     * conservatively. The cost of doing so is believed to be small, so we
5447     * always do it, avoiding variants of the preload descriptor.
5448 *
5449 * We only do this on Valhall, as Bifrost has to update the RSD for
5450 * multisampling w/ blend shader anyway, so this is handled in the
5451 * driver. We could unify the paths if the cost is acceptable.
5452 */
5453 if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
5454 preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
5455
5456 info->ubo_mask |= ctx->ubo_mask;
5457 info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
5458
5459 if (idvs == BI_IDVS_VARYING) {
5460 info->vs.secondary_enable = (binary->size > offset);
5461 info->vs.secondary_offset = offset;
5462 info->vs.secondary_preload = preload;
5463 info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
5464 } else {
5465 info->preload = preload;
5466 info->work_reg_count = ctx->info.work_reg_count;
5467 }
5468
5469 if (idvs == BI_IDVS_POSITION && !nir->info.internal &&
5470 nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
5471 /* Find the psiz write */
5472 bi_instr *write = NULL;
5473
5474 bi_foreach_instr_global(ctx, I) {
5475 if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
5476 write = I;
5477 break;
5478 }
5479 }
5480
5481 assert(write != NULL);
5482
5483 /* NOP it out, preserving its flow control. TODO: maybe DCE */
5484 if (write->flow) {
5485 bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
5486 bi_instr *nop = bi_nop(&b);
5487 nop->flow = write->flow;
5488 }
5489
5490 bi_remove_instruction(write);
5491
5492 info->vs.no_psiz_offset = binary->size;
5493 bi_pack_valhall(ctx, binary);
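      /* The binary now holds a second packed copy of the position shader that
       * is identical except for the removed point-size store; the offset
       * recorded above presumably lets the driver bind whichever copy matches
       * whether point size is actually needed for the draw.
       */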
5494 }
5495
5496 ralloc_free(ctx);
5497 }
5498
5499 /* Decide if Index-Driven Vertex Shading should be used for a given shader */
5500 static bool
5501 bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
5502 {
5503 /* Opt-out */
5504 if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
5505 return false;
5506
5507    /* IDVS splits up vertex shaders; it is not defined for other shader stages */
5508 if (nir->info.stage != MESA_SHADER_VERTEX)
5509 return false;
5510
5511 /* Bifrost cannot write gl_PointSize during IDVS */
5512 if ((inputs->gpu_id < 0x9000) &&
5513 nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
5514 return false;
5515
5516 /* Otherwise, IDVS is usually better */
5517 return true;
5518 }
5519
5520 void
5521 bifrost_compile_shader_nir(nir_shader *nir,
5522 const struct panfrost_compile_inputs *inputs,
5523 struct util_dynarray *binary,
5524 struct pan_shader_info *info)
5525 {
5526 bifrost_debug = debug_get_option_bifrost_debug();
5527
5528 /* Combine stores late, to give the driver a chance to lower dual-source
5529 * blending as regular store_output intrinsics.
5530 */
5531 NIR_PASS_V(nir, pan_nir_lower_zs_store);
5532
5533 bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
5534
5535 info->tls_size = nir->scratch_size;
5536 info->vs.idvs = bi_should_idvs(nir, inputs);
5537
5538 pan_nir_collect_varyings(nir, info);
5539
5540 if (info->vs.idvs) {
5541 bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION);
5542 bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING);
5543 } else {
5544 bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
5545 }
5546
5547 if (gl_shader_stage_is_compute(nir->info.stage)) {
5548 /* Workgroups may be merged if the structure of the workgroup is
5549 * not software visible. This is true if neither shared memory
5550 * nor barriers are used. The hardware may be able to optimize
5551 * compute shaders that set this flag.
5552 */
5553 info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) &&
5554 !nir->info.uses_control_barrier &&
5555 !nir->info.uses_memory_barrier;
5556 }
5557
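   /* e.g. with nir->info.num_ubos == 3, (1 << 3) - 1 == 0b111 keeps only the
    * bits corresponding to UBOs that actually exist. */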
5558 info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
5559 }
5560