xref: /aosp_15_r20/external/mesa3d/src/panfrost/compiler/bifrost_compile.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  * Copyright (C) 2022 Alyssa Rosenzweig <[email protected]>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  * Authors (Collabora):
25  *      Alyssa Rosenzweig <[email protected]>
26  */
27 
28 #include "compiler/glsl/glsl_to_nir.h"
29 #include "compiler/glsl_types.h"
30 #include "compiler/nir/nir_builder.h"
31 #include "util/u_debug.h"
32 
33 #include "bifrost/disassemble.h"
34 #include "panfrost/lib/pan_props.h"
35 #include "valhall/disassemble.h"
36 #include "valhall/va_compiler.h"
37 #include "bi_builder.h"
38 #include "bi_quirks.h"
39 #include "bifrost_compile.h"
40 #include "bifrost_nir.h"
41 #include "compiler.h"
42 
43 /* clang-format off */
44 static const struct debug_named_value bifrost_debug_options[] = {
45    {"msgs",       BIFROST_DBG_MSGS,		   "Print debug messages"},
46    {"shaders",    BIFROST_DBG_SHADERS,	   "Dump shaders in NIR and MIR"},
47    {"shaderdb",   BIFROST_DBG_SHADERDB,	"Print statistics"},
48    {"verbose",    BIFROST_DBG_VERBOSE,	   "Disassemble verbosely"},
49    {"internal",   BIFROST_DBG_INTERNAL,	"Dump even internal shaders"},
50    {"nosched",    BIFROST_DBG_NOSCHED, 	"Force trivial bundling"},
51    {"nopsched",   BIFROST_DBG_NOPSCHED,   "Disable scheduling for pressure"},
52    {"inorder",    BIFROST_DBG_INORDER, 	"Force in-order bundling"},
53    {"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
54    {"noopt",      BIFROST_DBG_NOOPT,      "Skip optimization passes"},
55    {"noidvs",     BIFROST_DBG_NOIDVS,     "Disable IDVS"},
56    {"nosb",       BIFROST_DBG_NOSB,       "Disable scoreboarding"},
57    {"nopreload",  BIFROST_DBG_NOPRELOAD,  "Disable message preloading"},
58    {"spill",      BIFROST_DBG_SPILL,      "Test register spilling"},
59    DEBUG_NAMED_VALUE_END
60 };
61 /* clang-format on */
62 
63 DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG",
64                             bifrost_debug_options, 0)
65 
66 /* How many bytes are prefetched by the Bifrost shader core. From the final
67  * clause onwards, this range must contain valid instructions or zeroes. */
68 #define BIFROST_SHADER_PREFETCH 128
69 
70 int bifrost_debug = 0;
71 
72 #define DBG(fmt, ...)                                                          \
73    do {                                                                        \
74       if (bifrost_debug & BIFROST_DBG_MSGS)                                    \
75          fprintf(stderr, "%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__);    \
76    } while (0)
77 
78 static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);
79 
80 static bi_index
81 bi_preload(bi_builder *b, unsigned reg)
82 {
83    if (bi_is_null(b->shader->preloaded[reg])) {
84       /* Insert at the beginning of the shader */
85       bi_builder b_ = *b;
86       b_.cursor = bi_before_block(bi_start_block(&b->shader->blocks));
87 
88       /* Cache the result */
89       b->shader->preloaded[reg] = bi_mov_i32(&b_, bi_register(reg));
90    }
91 
92    return b->shader->preloaded[reg];
93 }
94 
95 static bi_index
96 bi_coverage(bi_builder *b)
97 {
98    if (bi_is_null(b->shader->coverage))
99       b->shader->coverage = bi_preload(b, 60);
100 
101    return b->shader->coverage;
102 }
103 
104 /*
105  * Vertex ID and Instance ID are preloaded registers. The registers they are
106  * preloaded into changed from Bifrost to Valhall. Provide helpers that smooth
107  * over the architectural difference.
108  */
109 static inline bi_index
110 bi_vertex_id(bi_builder *b)
111 {
112    return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61);
113 }
114 
115 static inline bi_index
116 bi_instance_id(bi_builder *b)
117 {
118    return bi_preload(b, (b->shader->arch >= 9) ? 61 : 62);
119 }
120 
121 static inline bi_index
122 bi_draw_id(bi_builder *b)
123 {
124    assert(b->shader->arch >= 9);
125    return bi_preload(b, 62);
126 }
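/* Summarizing the helpers above (derived from this code, not from separate
 * documentation): Bifrost preloads the vertex ID in r61 and the instance ID in
 * r62, while Valhall (arch >= 9) preloads vertex ID, instance ID and draw ID
 * in r60, r61 and r62 respectively. */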
127 
128 static void
129 bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
130 {
131    bi_instr *branch = bi_jump(b, bi_zero());
132 
133    switch (instr->type) {
134    case nir_jump_break:
135       branch->branch_target = b->shader->break_block;
136       break;
137    case nir_jump_continue:
138       branch->branch_target = b->shader->continue_block;
139       break;
140    default:
141       unreachable("Unhandled jump type");
142    }
143 
144    bi_block_add_successor(b->shader->current_block, branch->branch_target);
145    b->shader->current_block->unconditional_jumps = true;
146 }
147 
148 /* Builds a 64-bit hash table key for an index */
149 static uint64_t
150 bi_index_to_key(bi_index idx)
151 {
152    static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
153 
154    uint64_t key = 0;
155    memcpy(&key, &idx, sizeof(idx));
156    return key;
157 }
158 
159 /*
160  * Extract a single channel out of a vector source. We split vectors with SPLIT
161  * so we can use the split components directly, without emitting an extract.
162  * This has advantages of RA, as the split can usually be optimized away.
163  */
164 static bi_index
165 bi_extract(bi_builder *b, bi_index vec, unsigned channel)
166 {
167    bi_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
168                                                       bi_index_to_key(vec));
169 
170    /* No extract needed for scalars.
171     *
172     * This is a bit imprecise, but actual bugs (missing splits for vectors)
173     * should be caught by the following assertion. It is too difficult to
174     * ensure bi_extract is only called for real vectors.
175     */
176    if (components == NULL && channel == 0)
177       return vec;
178 
179    assert(components != NULL && "missing bi_cache_collect()");
180    return components[channel];
181 }
182 
183 static void
184 bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
185 {
186    /* Lifetime of a hash table entry has to be at least as long as the table */
187    bi_index *channels = ralloc_array(b->shader, bi_index, n);
188    memcpy(channels, s, sizeof(bi_index) * n);
189 
190    _mesa_hash_table_u64_insert(b->shader->allocated_vec, bi_index_to_key(dst),
191                                channels);
192 }
193 
194 /*
195  * Splits an n-component vector (vec) into n scalar destinations (dests) using a
196  * split pseudo-instruction.
197  *
198  * Pre-condition: dests is filled with bi_null().
199  */
200 static void
201 bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
202 {
203    /* Setup the destinations */
204    for (unsigned i = 0; i < n; ++i) {
205       dests[i] = bi_temp(b->shader);
206    }
207 
208    /* Emit the split */
209    if (n == 1) {
210       bi_mov_i32_to(b, dests[0], vec);
211    } else {
212       bi_instr *I = bi_split_i32_to(b, n, vec);
213 
214       bi_foreach_dest(I, j)
215          I->dest[j] = dests[j];
216    }
217 }
218 
219 static void
220 bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
221 {
222    bi_index dests[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
223    bi_emit_split_i32(b, dests, vec, n);
224    bi_cache_collect(b, vec, dests, n);
225 }
226 
227 /*
228  * Emit and cache a split for a vector of a given bitsize. The vector may not be
229  * composed of 32-bit words, but it will be split at 32-bit word boundaries.
230  */
231 static void
232 bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
233 {
234    bi_emit_cached_split_i32(b, vec, DIV_ROUND_UP(bits, 32));
235 }
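/* Usage sketch for the split/extract caching above (illustrative): after
 * bi_emit_cached_split(b, vec, 96) splits a 96-bit value into three cached
 * temporaries, a later bi_extract(b, vec, 1) simply returns the cached
 * temporary for the second word instead of emitting another instruction. */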
236 
237 static void
238 bi_split_def(bi_builder *b, nir_def *def)
239 {
240    bi_emit_cached_split(b, bi_def_index(def),
241                         def->bit_size * def->num_components);
242 }
243 
244 static bi_instr *
245 bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
246 {
247    /* Special case: COLLECT of a single value is a scalar move */
248    if (n == 1)
249       return bi_mov_i32_to(b, dst, chan[0]);
250 
251    bi_instr *I = bi_collect_i32_to(b, dst, n);
252 
253    bi_foreach_src(I, i)
254       I->src[i] = chan[i];
255 
256    bi_cache_collect(b, dst, chan, n);
257    return I;
258 }
259 
260 static bi_instr *
261 bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
262 {
263    return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1}, 2);
264 }
265 
266 static bi_instr *
267 bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1,
268                     bi_index s2)
269 {
270    return bi_emit_collect_to(b, dst, (bi_index[]){s0, s1, s2}, 3);
271 }
272 
273 static bi_index
274 bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
275 {
276    bi_index dst = bi_temp(b->shader);
277    bi_collect_v2i32_to(b, dst, s0, s1);
278    return dst;
279 }
280 
281 static bi_index
282 bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
283 {
284    switch (intr->intrinsic) {
285    case nir_intrinsic_load_barycentric_centroid:
286    case nir_intrinsic_load_barycentric_sample:
287       return bi_preload(b, 61);
288 
289    /* Need to put the sample ID in the top 16-bits */
290    case nir_intrinsic_load_barycentric_at_sample:
291       return bi_mkvec_v2i16(b, bi_half(bi_dontcare(b), false),
292                             bi_half(bi_src_index(&intr->src[0]), false));
293 
294    /* Interpret as 8:8 signed fixed point positions in pixels along X and
295     * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
296     * is the center of the pixel so we first fixup and then convert. For
297     * fp16 input:
298     *
299     * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
300     * f2i16((256 * (x, y)) + (128, 128)) =
301     * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
302     *
303     * For fp32 input, that lacks enough precision for MSAA 16x, but the
304     * idea is the same. FIXME: still doesn't pass
305     */
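   /* Worked example (illustrative): a 16-bit offset x = 0.25 becomes
    * 0.25 * 256 + 128 = 192, i.e. 0.75 in 8:8 fixed point, which is 0.25 to
    * the right of the pixel centre at 0.5. */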
306    case nir_intrinsic_load_barycentric_at_offset: {
307       bi_index offset = bi_src_index(&intr->src[0]);
308       bi_index f16 = bi_null();
309       unsigned sz = nir_src_bit_size(intr->src[0]);
310 
311       if (sz == 16) {
312          f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), bi_imm_f16(128.0));
313       } else {
314          assert(sz == 32);
315          bi_index f[2];
316          for (unsigned i = 0; i < 2; ++i) {
317             f[i] =
318                bi_fadd_rscale_f32(b, bi_extract(b, offset, i), bi_imm_f32(0.5),
319                                   bi_imm_u32(8), BI_SPECIAL_NONE);
320          }
321 
322          f16 = bi_v2f32_to_v2f16(b, f[0], f[1]);
323       }
324 
325       return bi_v2f16_to_v2s16(b, f16);
326    }
327 
328    case nir_intrinsic_load_barycentric_pixel:
329    default:
330       return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b);
331    }
332 }
333 
334 static enum bi_sample
335 bi_interp_for_intrinsic(nir_intrinsic_op op)
336 {
337    switch (op) {
338    case nir_intrinsic_load_barycentric_centroid:
339       return BI_SAMPLE_CENTROID;
340    case nir_intrinsic_load_barycentric_sample:
341    case nir_intrinsic_load_barycentric_at_sample:
342       return BI_SAMPLE_SAMPLE;
343    case nir_intrinsic_load_barycentric_at_offset:
344       return BI_SAMPLE_EXPLICIT;
345    case nir_intrinsic_load_barycentric_pixel:
346    default:
347       return BI_SAMPLE_CENTER;
348    }
349 }
350 
351 /* auto, 64-bit omitted */
352 static enum bi_register_format
353 bi_reg_fmt_for_nir(nir_alu_type T)
354 {
355    switch (T) {
356    case nir_type_float16:
357       return BI_REGISTER_FORMAT_F16;
358    case nir_type_float32:
359       return BI_REGISTER_FORMAT_F32;
360    case nir_type_int16:
361       return BI_REGISTER_FORMAT_S16;
362    case nir_type_uint16:
363       return BI_REGISTER_FORMAT_U16;
364    case nir_type_int32:
365       return BI_REGISTER_FORMAT_S32;
366    case nir_type_uint32:
367       return BI_REGISTER_FORMAT_U32;
368    default:
369       unreachable("Invalid type for register format");
370    }
371 }
372 
373 static bool
374 va_is_valid_const_narrow_index(bi_index idx)
375 {
376    if (idx.type != BI_INDEX_CONSTANT)
377       return false;
378 
379    unsigned index = pan_res_handle_get_index(idx.value);
380    unsigned table_index = pan_res_handle_get_table(idx.value);
381 
382    return index < 1024 && va_is_valid_const_table(table_index);
383 }
384 
385 /* Checks if the _IMM variant of an intrinsic can be used, returning the
386  * immediate to use in *immediate (which applies even if _IMM can't be used) */
387 
388 static bool
389 bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate,
390                      unsigned max)
391 {
392    nir_src *offset = nir_get_io_offset_src(instr);
393 
394    if (!nir_src_is_const(*offset))
395       return false;
396 
397    *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
398    return (*immediate) < max;
399 }
400 
401 static bool
402 bi_is_imm_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
403                       uint32_t *immediate, unsigned max)
404 {
405    nir_src *offset = nir_get_io_offset_src(instr);
406 
407    if (!nir_src_is_const(*offset))
408       return false;
409 
410    if (b->shader->arch >= 9) {
411       uint32_t res_handle =
412          nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
413       uint32_t table_index = pan_res_handle_get_table(res_handle);
414       uint32_t res_index = pan_res_handle_get_index(res_handle);
415 
416       if (!va_is_valid_const_table(table_index) || res_index >= max)
417          return false;
418 
419       *immediate = res_handle;
420       return true;
421    }
422 
423    return bi_is_intr_immediate(instr, immediate, max);
424 }
425 
426 static bool
427 bi_is_imm_var_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
428                           uint32_t *immediate)
429 {
430    unsigned max = b->shader->arch >= 9 ? 256 : 20;
431 
432    return bi_is_imm_desc_handle(b, instr, immediate, max);
433 }
434 
435 static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
436                            unsigned *channel, unsigned count, unsigned bitsize);
437 
438 /* Bifrost's load instructions lack a component offset despite operating in
439  * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
440  * but they may be unavoidable with separate shaders in use. To solve this, we
441  * lower to a larger load and an explicit copy of the desired components. */
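/* For example (illustrative): a 2-component load starting at component 2 of a
 * vec4 slot is emitted as a 4-component load into a temporary, and
 * bi_copy_component() then collects channels 2 and 3 into the NIR
 * destination. */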
442 
443 static void
444 bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
445 {
446    unsigned component = nir_intrinsic_component(instr);
447    unsigned nr = instr->num_components;
448    unsigned total = nr + component;
449    unsigned bitsize = instr->def.bit_size;
450 
451    assert(total <= 4 && "should be vec4");
452    bi_emit_cached_split(b, tmp, total * bitsize);
453 
454    if (component == 0)
455       return;
456 
457    bi_index srcs[] = {tmp, tmp, tmp};
458    unsigned channels[] = {component, component + 1, component + 2};
459 
460    bi_make_vec_to(b, bi_def_index(&instr->def), srcs, channels, nr,
461                   instr->def.bit_size);
462 }
463 
464 static void
465 bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
466 {
467    /* Disregard the signedness of an integer, since loading 32 bits into a
468     * 32-bit register is bit-exact and should not incur any clamping.
469     *
470     * If we are reading as a u32, then it must be paired with an integer (u32 or
471     * s32) source, so use .auto32 to disregard.
472     */
473    nir_alu_type T = nir_intrinsic_dest_type(instr);
474    assert(T == nir_type_uint32 || T == nir_type_int32 || T == nir_type_float32);
475    enum bi_register_format regfmt =
476       T == nir_type_float32 ? BI_REGISTER_FORMAT_F32 : BI_REGISTER_FORMAT_AUTO;
477 
478    nir_src *offset = nir_get_io_offset_src(instr);
479    unsigned component = nir_intrinsic_component(instr);
480    enum bi_vecsize vecsize = (instr->num_components + component - 1);
481    unsigned imm_index = 0;
482    unsigned base = nir_intrinsic_base(instr);
483    bool constant = nir_src_is_const(*offset);
484    bool immediate = bi_is_imm_desc_handle(b, instr, &imm_index, 16);
485    bi_index dest =
486       (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
487    bi_instr *I;
488 
489    if (immediate) {
490       I = bi_ld_attr_imm_to(b, dest, bi_vertex_id(b), bi_instance_id(b), regfmt,
491                             vecsize, pan_res_handle_get_index(imm_index));
492 
493       if (b->shader->arch >= 9)
494          I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
495    } else {
496       bi_index idx = bi_src_index(&instr->src[0]);
497 
498       if (constant)
499          idx = bi_imm_u32(imm_index);
500       else if (base != 0)
501          idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
502 
503       I = bi_ld_attr_to(b, dest, bi_vertex_id(b), bi_instance_id(b), idx,
504                         regfmt, vecsize);
505    }
506 
507    bi_copy_component(b, instr, dest);
508 }
509 
510 /*
511  * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
512  * come later, sparsely packed. This handles both linked and separable shaders
513  * with a common code path, with minimal keying only for desktop GL. Each slot
514  * consumes 16 bytes (TODO: fp16, partial vectors).
515  */
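/* Illustrative example of the layout above: with two bits set in
 * fixed_varying_mask, a load of VARYING_SLOT_VAR3 starts at
 * 16 * (2 + 3) = 80 bytes, while the special slot selected by the second mask
 * bit starts at 16 * 1 = 16 bytes. */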
516 static unsigned
517 bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
518 {
519    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
520    uint32_t mask = ctx->inputs->fixed_varying_mask;
521 
522    if (sem.location >= VARYING_SLOT_VAR0) {
523       unsigned nr_special = util_bitcount(mask);
524       unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
525 
526       return 16 * (nr_special + general_index);
527    } else {
528       return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
529    }
530 }
531 
532 /*
533  * Compute the offset in bytes of a varying with an immediate offset, adding the
534  * offset to the base computed above. Convenience method.
535  */
536 static unsigned
537 bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
538 {
539    nir_src *src = nir_get_io_offset_src(intr);
540    assert(nir_src_is_const(*src) && "assumes immediate offset");
541 
542    return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
543 }
544 
545 static void
546 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
547 {
548    enum bi_sample sample = BI_SAMPLE_CENTER;
549    enum bi_update update = BI_UPDATE_STORE;
550    enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
551    bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
552    bi_index src0 = bi_null();
553 
554    unsigned component = nir_intrinsic_component(instr);
555    enum bi_vecsize vecsize = (instr->num_components + component - 1);
556    bi_index dest =
557       (component == 0) ? bi_def_index(&instr->def) : bi_temp(b->shader);
558 
559    unsigned sz = instr->def.bit_size;
560 
561    if (smooth) {
562       nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
563       assert(parent);
564 
565       sample = bi_interp_for_intrinsic(parent->intrinsic);
566       src0 = bi_varying_src0_for_barycentric(b, parent);
567 
568       assert(sz == 16 || sz == 32);
569       regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32;
570    } else {
571       assert(sz == 32);
572       regfmt = BI_REGISTER_FORMAT_U32;
573 
574       /* Valhall can't have bi_null() here, although the source is
575        * logically unused for flat varyings
576        */
577       if (b->shader->arch >= 9)
578          src0 = bi_preload(b, 61);
579 
580       /* Gather info as we go */
581       b->shader->info.bifrost->uses_flat_shading = true;
582    }
583 
584    enum bi_source_format source_format =
585       smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
586 
587    nir_src *offset = nir_get_io_offset_src(instr);
588    unsigned imm_index = 0;
589    bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
590    unsigned base = nir_intrinsic_base(instr);
591 
592    /* On Valhall, ensure the table and index are valid for usage with immediate
593     * form when IDVS isn't used */
594    if (b->shader->arch >= 9 && !b->shader->malloc_idvs)
595       immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
596                    pan_res_handle_get_index(base) < 256;
597 
598    if (b->shader->malloc_idvs && immediate) {
599       /* Immediate index given in bytes. */
600       bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
601                            update, vecsize,
602                            bi_varying_offset(b->shader, instr));
603    } else if (immediate) {
604       bi_instr *I;
605 
606       if (smooth) {
607          I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
608                               pan_res_handle_get_index(imm_index));
609       } else {
610          I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
611                                    pan_res_handle_get_index(imm_index));
612       }
613 
614       /* Valhall usually uses machine-allocated IDVS. If this is disabled,
615        * use a simple Midgard-style ABI.
616        */
617       if (b->shader->arch >= 9)
618          I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
619    } else {
620       bi_index idx = bi_src_index(offset);
621 
622       if (b->shader->malloc_idvs) {
623          /* Index needs to be in bytes, but NIR gives the index
624           * in slots. For now assume 16 bytes per element.
625           */
626          bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
627          unsigned vbase = bi_varying_base_bytes(b->shader, instr);
628 
629          if (vbase != 0)
630             idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
631 
632          bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
633                           source_format, update, vecsize);
634       } else {
635          if (base != 0)
636             idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
637 
638          if (smooth)
639             bi_ld_var_to(b, dest, src0, idx, regfmt, sample, update, vecsize);
640          else
641             bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, regfmt, vecsize);
642       }
643    }
644 
645    bi_copy_component(b, instr, dest);
646 }
647 
648 static bi_index
649 bi_make_vec8_helper(bi_builder *b, bi_index *src, unsigned *channel,
650                     unsigned count)
651 {
652    assert(1 <= count && count <= 4);
653 
654    bi_index bytes[4] = {bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0), bi_imm_u8(0)};
655 
656    for (unsigned i = 0; i < count; ++i) {
657       unsigned chan = channel ? channel[i] : 0;
658       unsigned lane = chan & 3;
659       bi_index raw_data = bi_extract(b, src[i], chan >> 2);
660 
661       /* On Bifrost, MKVEC.v4i8 cannot select b1 or b3 */
662       if (b->shader->arch < 9 && lane != 0 && lane != 2) {
663          bytes[i] = bi_byte(bi_rshift_or(b, 32, raw_data, bi_zero(),
664                                          bi_imm_u8(lane * 8), false),
665                             0);
666       } else {
667          bytes[i] = bi_byte(raw_data, lane);
668       }
669 
670       assert(b->shader->arch >= 9 || bytes[i].swizzle == BI_SWIZZLE_B0000 ||
671              bytes[i].swizzle == BI_SWIZZLE_B2222);
672    }
673 
674    if (b->shader->arch >= 9) {
675       bi_index vec = bi_zero();
676 
677       if (count >= 3)
678          vec = bi_mkvec_v2i8(b, bytes[2], bytes[3], vec);
679 
680       return bi_mkvec_v2i8(b, bytes[0], bytes[1], vec);
681    } else {
682       return bi_mkvec_v4i8(b, bytes[0], bytes[1], bytes[2], bytes[3]);
683    }
684 }
685 
686 static bi_index
687 bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel,
688                      unsigned count)
689 {
690    unsigned chan0 = channel ? channel[0] : 0;
691    bi_index w0 = bi_extract(b, src[0], chan0 >> 1);
692    bi_index h0 = bi_half(w0, chan0 & 1);
693 
694    /* Zero extend */
695    if (count == 1)
696       return bi_mkvec_v2i16(b, h0, bi_imm_u16(0));
697 
698    /* Else, create a vector */
699    assert(count == 2);
700 
701    unsigned chan1 = channel ? channel[1] : 0;
702    bi_index w1 = bi_extract(b, src[1], chan1 >> 1);
703    bi_index h1 = bi_half(w1, chan1 & 1);
704 
705    if (bi_is_word_equiv(w0, w1) && (chan0 & 1) == 0 && ((chan1 & 1) == 1))
706       return bi_mov_i32(b, w0);
707    else if (bi_is_word_equiv(w0, w1))
708       return bi_swz_v2i16(b, bi_swz_16(w0, chan0 & 1, chan1 & 1));
709    else
710       return bi_mkvec_v2i16(b, h0, h1);
711 }
712 
713 static void
714 bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
715                unsigned count, unsigned bitsize)
716 {
717    assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
718    unsigned shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
719    unsigned chan_per_word = 1 << shift;
720 
721    assert(DIV_ROUND_UP(count * bitsize, 32) <= BI_MAX_SRCS &&
722           "unnecessarily large vector should have been lowered");
723 
724    bi_index srcs[BI_MAX_VEC];
725 
726    for (unsigned i = 0; i < count; i += chan_per_word) {
727       unsigned rem = MIN2(count - i, chan_per_word);
728       unsigned *channel_offset = channel ? (channel + i) : NULL;
729 
730       if (bitsize == 32)
731          srcs[i] = bi_extract(b, src[i], channel_offset ? *channel_offset : 0);
732       else if (bitsize == 16)
733          srcs[i >> 1] = bi_make_vec16_helper(b, src + i, channel_offset, rem);
734       else
735          srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
736    }
737 
738    bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
739 }
740 
741 static inline bi_instr *
742 bi_load_ubo_to(bi_builder *b, unsigned bitsize, bi_index dest0, bi_index src0,
743                bi_index src1)
744 {
745    bi_instr *I;
746 
747    if (b->shader->arch >= 9) {
748       I = bi_ld_buffer_to(b, bitsize, dest0, src0, src1);
749       I->seg = BI_SEG_UBO;
750    } else {
751       I = bi_load_to(b, bitsize, dest0, src0, src1, BI_SEG_UBO, 0);
752    }
753 
754    bi_emit_cached_split(b, dest0, bitsize);
755    return I;
756 }
757 
758 static void
759 bi_load_sample_id_to(bi_builder *b, bi_index dst)
760 {
761    /* r61[16:23] contains the sampleID, mask it out. Upper bits
762     * seem to read garbage (despite being architecturally defined
763     * as zero), so use a 5-bit mask instead of 8-bits */
764 
765    bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f),
766                         bi_imm_u8(16), false);
767 }
768 
769 static bi_index
770 bi_load_sample_id(bi_builder *b)
771 {
772    bi_index sample_id = bi_temp(b->shader);
773    bi_load_sample_id_to(b, sample_id);
774    return sample_id;
775 }
776 
777 static bi_index
778 bi_pixel_indices(bi_builder *b, unsigned rt)
779 {
780    /* We want to load the current pixel. */
781    struct bifrost_pixel_indices pix = {.y = BIFROST_CURRENT_PIXEL, .rt = rt};
782 
783    uint32_t indices_u32 = 0;
784    memcpy(&indices_u32, &pix, sizeof(indices_u32));
785    bi_index indices = bi_imm_u32(indices_u32);
786 
787    /* Sample index above is left as zero. For multisampling, we need to
788     * fill in the actual sample ID in the lower byte */
789 
790    if (b->shader->inputs->blend.nr_samples > 1)
791       indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);
792 
793    return indices;
794 }
795 
796 /* Source color is passed through r0-r3, or r4-r7 for the second source when
797  * dual-source blending. Preload the corresponding vector.
798  */
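/* For instance (illustrative): the second blend source is identified by
 * VARYING_SLOT_VAR0 below, so an f32 dual-source input collects r4-r7 while
 * the primary colour collects r0-r3. */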
799 static void
800 bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
801 {
802    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
803    unsigned base = (sem.location == VARYING_SLOT_VAR0) ? 4 : 0;
804    unsigned size = nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr));
805    assert(size == 16 || size == 32);
806 
807    bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1),
808                       bi_preload(b, base + 2), bi_preload(b, base + 3)};
809 
810    bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2);
811 }
812 
813 static void
814 bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, bi_index rgba2,
815                  nir_alu_type T2, unsigned rt)
816 {
817    /* Reads 2 or 4 staging registers to cover the input */
818    unsigned size = nir_alu_type_get_type_size(T);
819    unsigned size_2 = nir_alu_type_get_type_size(T2);
820    unsigned sr_count = (size <= 16) ? 2 : 4;
821    unsigned sr_count_2 = (size_2 <= 16) ? 2 : 4;
822    const struct panfrost_compile_inputs *inputs = b->shader->inputs;
823    uint64_t blend_desc = inputs->blend.bifrost_blend_desc;
824    enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
825 
826    /* Workaround for NIR-to-TGSI */
827    if (b->shader->nir->info.fs.untyped_color_outputs)
828       regfmt = BI_REGISTER_FORMAT_AUTO;
829 
830    if (inputs->is_blend && inputs->blend.nr_samples > 1) {
831       /* Conversion descriptor comes from the compile inputs, pixel
832        * indices derived at run time based on sample ID */
833       bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_coverage(b),
834                  bi_imm_u32(blend_desc >> 32), regfmt, BI_VECSIZE_V4);
835    } else if (b->shader->inputs->is_blend) {
836       uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc;
837 
838       /* Blend descriptor comes from the compile inputs */
839       /* Put the result in r0 */
840 
841       bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
842                   bi_imm_u32(blend_desc), bi_imm_u32(blend_desc >> 32),
843                   bi_null(), regfmt, sr_count, 0);
844    } else {
845       /* Blend descriptor comes from the FAU RAM. By convention, the
846        * return address on Bifrost is stored in r48 and will be used
847        * by the blend shader to jump back to the fragment shader */
848 
849       bi_blend_to(b, bi_temp(b->shader), rgba, bi_coverage(b),
850                   bi_fau(BIR_FAU_BLEND_0 + rt, false),
851                   bi_fau(BIR_FAU_BLEND_0 + rt, true), rgba2, regfmt, sr_count,
852                   sr_count_2);
853    }
854 
855    assert(rt < 8);
856    b->shader->info.bifrost->blend[rt].type = T;
857 
858    if (T2)
859       b->shader->info.bifrost->blend_src1_type = T2;
860 }
861 
862 /* Blend shaders do not need to run ATEST since they are dependent on a
863  * fragment shader that runs it. Blit shaders may not need to run ATEST, since
864  * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
865  * there are no writes to the coverage mask. The latter two are satisfied for
866  * all blit shaders, so we just care about early-z, which blit shaders force
867  * iff they do not write depth or stencil */
868 
869 static bool
870 bi_skip_atest(bi_context *ctx, bool emit_zs)
871 {
872    return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
873 }
874 
875 static void
876 bi_emit_atest(bi_builder *b, bi_index alpha)
877 {
878    b->shader->coverage =
879       bi_atest(b, bi_coverage(b), alpha, bi_fau(BIR_FAU_ATEST_PARAM, false));
880    b->shader->emitted_atest = true;
881 }
882 
883 static bi_index
884 bi_src_color_vec4(bi_builder *b, nir_src *src, nir_alu_type T)
885 {
886    unsigned num_components = nir_src_num_components(*src);
887    bi_index base = bi_src_index(src);
888 
889    /* short-circuit the common case */
890    if (num_components == 4)
891       return base;
892 
893    unsigned size = nir_alu_type_get_type_size(T);
894    assert(size == 16 || size == 32);
895 
896    bi_index src_vals[4];
897 
898    unsigned i;
899    for (i = 0; i < num_components; i++)
900       src_vals[i] = bi_extract(b, base, i);
901 
902    for (; i < 3; i++)
903       src_vals[i] = (size == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0);
904    src_vals[3] = (size == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0);
905    bi_index temp = bi_temp(b->shader);
906    bi_make_vec_to(b, temp, src_vals, NULL, 4, size);
907    return temp;
908 }
909 
910 static void
911 bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
912 {
913    bool combined = instr->intrinsic == nir_intrinsic_store_combined_output_pan;
914 
915    unsigned writeout =
916       combined ? nir_intrinsic_component(instr) : PAN_WRITEOUT_C;
917 
918    bool emit_blend = writeout & (PAN_WRITEOUT_C);
919    bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);
920 
921    unsigned loc = nir_intrinsic_io_semantics(instr).location;
922    bi_index src0 = bi_src_index(&instr->src[0]);
923 
924    /* By ISA convention, the coverage mask is stored in R60. The store
925     * itself will be handled by a subsequent ATEST instruction */
926    if (loc == FRAG_RESULT_SAMPLE_MASK) {
927       b->shader->coverage = bi_extract(b, src0, 0);
928       return;
929    }
930 
931    /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
932     * value, but render target #0 might not be floating point. However the
933     * alpha value is only used for alpha-to-coverage, a stage which is
934     * skipped for pure integer framebuffers, so the issue is moot. */
935 
936    if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
937       nir_alu_type T = nir_intrinsic_src_type(instr);
938 
939       bi_index rgba = bi_src_index(&instr->src[0]);
940       bi_index alpha;
941 
942       if (nir_src_num_components(instr->src[0]) < 4) {
943          /* Don't read out-of-bounds */
944          alpha = bi_imm_f32(1.0);
945       } else if (T == nir_type_float16) {
946          alpha = bi_half(bi_extract(b, rgba, 1), true);
947       } else if (T == nir_type_float32) {
948          alpha = bi_extract(b, rgba, 3);
949       } else {
950          alpha = bi_dontcare(b);
951       }
952       bi_emit_atest(b, alpha);
953    }
954 
955    if (emit_zs) {
956       bi_index z = bi_dontcare(b), s = bi_dontcare(b);
957 
958       if (writeout & PAN_WRITEOUT_Z)
959          z = bi_src_index(&instr->src[2]);
960 
961       if (writeout & PAN_WRITEOUT_S)
962          s = bi_src_index(&instr->src[3]);
963 
964       b->shader->coverage =
965          bi_zs_emit(b, z, s, bi_coverage(b), writeout & PAN_WRITEOUT_S,
966                     writeout & PAN_WRITEOUT_Z);
967    }
968 
969    if (emit_blend) {
970       unsigned rt = loc ? (loc - FRAG_RESULT_DATA0) : 0;
971       bool dual = (writeout & PAN_WRITEOUT_2);
972       nir_alu_type T = nir_intrinsic_src_type(instr);
973       nir_alu_type T2 = dual ? nir_intrinsic_dest_type(instr) : 0;
974       bi_index color = bi_src_color_vec4(b, &instr->src[0], T);
975       bi_index color2 =
976          dual ? bi_src_color_vec4(b, &instr->src[4], T2) : bi_null();
977 
978       if (instr->intrinsic == nir_intrinsic_store_output &&
979           loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
980          assert(nir_src_is_const(instr->src[1]) && "no indirect outputs");
981 
982          unsigned rt_offs = nir_src_as_uint(instr->src[1]);
983 
984          assert(rt + rt_offs < 8 && "RT not in the [0-7] range");
985          rt += rt_offs;
986       }
987 
988       /* Explicit copy since BLEND inputs are precoloured to R0-R3,
989        * TODO: maybe schedule around this or implement in RA as a
990        * spill */
991       bool has_mrt =
992          (b->shader->nir->info.outputs_written >> FRAG_RESULT_DATA1);
993 
994       if (has_mrt) {
995          bi_index srcs[4] = {color, color, color, color};
996          unsigned channels[4] = {0, 1, 2, 3};
997          color = bi_temp(b->shader);
998          bi_make_vec_to(
999             b, color, srcs, channels, nir_src_num_components(instr->src[0]),
1000             nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
1001       }
1002 
1003       bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), color2, T2, rt);
1004    }
1005 
1006    if (b->shader->inputs->is_blend) {
1007       /* Jump back to the fragment shader, return address is stored
1008        * in r48 (see above). On Valhall, only jump if the address is
1009        * nonzero. The check is free there and it implements the "jump
1010        * to 0 terminates the blend shader" that's automatic on
1011        * Bifrost.
1012        */
1013       if (b->shader->arch >= 8)
1014          bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE);
1015       else
1016          bi_jump(b, bi_preload(b, 48));
1017    }
1018 }
1019 
1020 /**
1021  * In a vertex shader, is the specified variable a position output? These kinds
1022  * of outputs are written from position shaders when IDVS is enabled. All other
1023  * outputs are written from the varying shader.
1024  */
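/* For example (illustrative): when compiling the BI_IDVS_POSITION variant,
 * stores to general varyings such as VARYING_SLOT_VAR0 are removed and only
 * the POS/PSIZ/LAYER stores are kept; the BI_IDVS_VARYING variant keeps the
 * opposite set. */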
1025 static bool
1026 bi_should_remove_store(nir_intrinsic_instr *intr, enum bi_idvs_mode idvs)
1027 {
1028    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1029 
1030    switch (sem.location) {
1031    case VARYING_SLOT_POS:
1032    case VARYING_SLOT_PSIZ:
1033    case VARYING_SLOT_LAYER:
1034       return idvs == BI_IDVS_VARYING;
1035    default:
1036       return idvs == BI_IDVS_POSITION;
1037    }
1038 }
1039 
1040 static bool
1041 bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
1042 {
1043    enum bi_idvs_mode *idvs = data;
1044 
1045    if (instr->type != nir_instr_type_intrinsic)
1046       return false;
1047 
1048    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1049 
1050    if (intr->intrinsic != nir_intrinsic_store_output)
1051       return false;
1052 
1053    if (bi_should_remove_store(intr, *idvs)) {
1054       nir_instr_remove(instr);
1055       return true;
1056    }
1057 
1058    return false;
1059 }
1060 
1061 static void
1062 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
1063 {
1064    /* In principle we can do better for 16-bit. At the moment we require
1065     * 32-bit to permit the use of .auto, in order to force .u32 for flat
1066     * varyings, to handle internal TGSI shaders that set flat in the VS
1067     * but smooth in the FS */
1068 
1069    ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
1070    ASSERTED unsigned T_size = nir_alu_type_get_type_size(T);
1071    assert(T_size == 32 || (b->shader->arch >= 9 && T_size == 16));
1072    enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
1073 
1074    unsigned imm_index = 0;
1075    bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
1076 
1077    /* Only look at the total components needed. In effect, we fill in all
1078     * the intermediate "holes" in the write mask, since we can't mask off
1079     * stores. Since nir_lower_io_to_temporaries ensures each varying is
1080     * written at most once, anything that's masked out is undefined, so it
1081     * doesn't matter what we write there. So we may as well do the
1082     * simplest thing possible. */
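   /* e.g. (illustrative) a write mask of 0b101 is treated as a 3-component
    * store; whatever the source holds in component 1 gets written, which is
    * harmless because that varying component is undefined anyway. */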
1083    unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
1084    assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));
1085 
1086    bi_index data = bi_src_index(&instr->src[0]);
1087 
1088    /* To keep the vector dimensions consistent, we need to drop some
1089     * components. This should be coalesced.
1090     *
1091     * TODO: This is ugly and maybe inefficient. Would we rather
1092     * introduce a TRIM.i32 pseudoinstruction?
1093     */
1094    if (nr < nir_intrinsic_src_components(instr, 0)) {
1095       assert(T_size == 32 && "todo: 16-bit trim");
1096 
1097       bi_index chans[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
1098       unsigned src_comps = nir_intrinsic_src_components(instr, 0);
1099 
1100       bi_emit_split_i32(b, chans, data, src_comps);
1101 
1102       bi_index tmp = bi_temp(b->shader);
1103       bi_instr *collect = bi_collect_i32_to(b, tmp, nr);
1104 
1105       bi_foreach_src(collect, w)
1106          collect->src[w] = chans[w];
1107 
1108       data = tmp;
1109    }
1110 
1111    bool psiz =
1112       (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PSIZ);
1113    bool layer =
1114       (nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_LAYER);
1115 
1116    bi_index a[4] = {bi_null()};
1117 
1118    if (b->shader->arch <= 8 && b->shader->idvs == BI_IDVS_POSITION) {
1119       /* Bifrost position shaders have a fast path */
1120       assert(T == nir_type_float16 || T == nir_type_float32);
1121       unsigned regfmt = (T == nir_type_float16) ? 0 : 1;
1122       unsigned identity = (b->shader->arch == 6) ? 0x688 : 0;
1123       unsigned snap4 = 0x5E;
1124       uint32_t format = identity | (snap4 << 12) | (regfmt << 24);
1125 
1126       bi_st_cvt(b, data, bi_preload(b, 58), bi_preload(b, 59),
1127                 bi_imm_u32(format), regfmt, nr - 1);
1128    } else if (b->shader->arch >= 9 && b->shader->idvs != BI_IDVS_NONE) {
1129       bi_index index = bi_preload(b, 59);
1130       unsigned pos_attr_offset = 0;
1131       unsigned src_bit_sz = nir_src_bit_size(instr->src[0]);
1132 
1133       if (psiz || layer)
1134          index = bi_iadd_imm_i32(b, index, 4);
1135 
1136       if (layer) {
1137          assert(nr == 1 && src_bit_sz == 32);
1138          src_bit_sz = 8;
1139          pos_attr_offset = 2;
1140          data = bi_byte(data, 0);
1141       }
1142 
1143       if (psiz)
1144          assert(T_size == 16 && "should've been lowered");
1145 
1146       bi_index address = bi_lea_buf_imm(b, index);
1147       bi_emit_split_i32(b, a, address, 2);
1148 
1149       bool varying = (b->shader->idvs == BI_IDVS_VARYING);
1150 
1151       bi_store(b, nr * src_bit_sz, data, a[0], a[1],
1152                varying ? BI_SEG_VARY : BI_SEG_POS,
1153                varying ? bi_varying_offset(b->shader, instr) : pos_attr_offset);
1154    } else if (immediate) {
1155       bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b), bi_instance_id(b),
1156                                          regfmt, imm_index);
1157       bi_emit_split_i32(b, a, address, 3);
1158 
1159       bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1160    } else {
1161       bi_index idx = bi_iadd_u32(b, bi_src_index(nir_get_io_offset_src(instr)),
1162                                  bi_imm_u32(nir_intrinsic_base(instr)), false);
1163       bi_index address =
1164          bi_lea_attr(b, bi_vertex_id(b), bi_instance_id(b), idx, regfmt);
1165       bi_emit_split_i32(b, a, address, 3);
1166 
1167       bi_st_cvt(b, data, a[0], a[1], a[2], regfmt, nr - 1);
1168    }
1169 }
1170 
1171 static void
1172 bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
1173 {
1174    nir_src *offset = nir_get_io_offset_src(instr);
1175 
1176    bool offset_is_const = nir_src_is_const(*offset);
1177    bi_index dyn_offset = bi_src_index(offset);
1178    uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
1179 
1180    bi_load_ubo_to(b, instr->num_components * instr->def.bit_size,
1181                   bi_def_index(&instr->def),
1182                   offset_is_const ? bi_imm_u32(const_offset) : dyn_offset,
1183                   bi_src_index(&instr->src[0]));
1184 }
1185 
1186 static void
1187 bi_emit_load_push_constant(bi_builder *b, nir_intrinsic_instr *instr)
1188 {
1189    assert(b->shader->inputs->no_ubo_to_push && "can't mix push constant forms");
1190 
1191    nir_src *offset = &instr->src[0];
1192    assert(nir_src_is_const(*offset) && "no indirect push constants");
1193    uint32_t base = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
1194    assert((base & 3) == 0 && "unaligned push constants");
1195 
1196    unsigned bits = instr->def.bit_size * instr->def.num_components;
1197 
1198    unsigned n = DIV_ROUND_UP(bits, 32);
1199    assert(n <= 4);
1200    bi_index channels[4] = {bi_null()};
1201 
1202    for (unsigned i = 0; i < n; ++i) {
1203       unsigned word = (base >> 2) + i;
1204 
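      /* The FAU indexing below treats uniforms as 64-bit slots holding two
       * consecutive 32-bit push words: word >> 1 selects the slot and
       * word & 1 selects the half. */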
1205       channels[i] = bi_fau(BIR_FAU_UNIFORM | (word >> 1), word & 1);
1206    }
1207 
1208    bi_emit_collect_to(b, bi_def_index(&instr->def), channels, n);
1209 
1210    /* Update push->count to report the highest push constant word being accessed
1211     * by this shader.
1212     */
1213    b->shader->info.push->count =
1214       MAX2((base / 4) + n, b->shader->info.push->count);
1215 }
1216 
1217 static bi_index
1218 bi_addr_high(bi_builder *b, nir_src *src)
1219 {
1220    return (nir_src_bit_size(*src) == 64) ? bi_extract(b, bi_src_index(src), 1)
1221                                          : bi_zero();
1222 }
1223 
1224 static void
1225 bi_handle_segment(bi_builder *b, bi_index *addr_lo, bi_index *addr_hi,
1226                   enum bi_seg seg, int16_t *offset)
1227 {
1228    /* Not needed on Bifrost or for global accesses */
1229    if (b->shader->arch < 9 || seg == BI_SEG_NONE)
1230       return;
1231 
1232    /* There is no segment modifier on Valhall. Instead, we need to
1233     * emit the arithmetic ourselves. We do have an offset
1234     * available, which saves an instruction for constant offsets.
1235     */
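   /* Illustrative example: a WLS access at constant address 12 keeps
    * BIR_FAU_WLS_PTR as the low address and passes 12 through the
    * instruction's offset field, while a non-constant low address is added to
    * the pointer with an explicit IADD.u32. */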
1236    bool wls = (seg == BI_SEG_WLS);
1237    assert(wls || (seg == BI_SEG_TL));
1238 
1239    enum bir_fau fau = wls ? BIR_FAU_WLS_PTR : BIR_FAU_TLS_PTR;
1240 
1241    bi_index base_lo = bi_fau(fau, false);
1242 
1243    if (offset && addr_lo->type == BI_INDEX_CONSTANT &&
1244        addr_lo->value == (int16_t)addr_lo->value) {
1245       *offset = addr_lo->value;
1246       *addr_lo = base_lo;
1247    } else {
1248       *addr_lo = bi_iadd_u32(b, base_lo, *addr_lo, false);
1249    }
1250 
1251    /* Do not allow overflow for WLS or TLS */
1252    *addr_hi = bi_fau(fau, true);
1253 }
1254 
1255 static void
1256 bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1257 {
1258    int16_t offset = 0;
1259    unsigned bits = instr->num_components * instr->def.bit_size;
1260    bi_index dest = bi_def_index(&instr->def);
1261    bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[0]), 0);
1262    bi_index addr_hi = bi_addr_high(b, &instr->src[0]);
1263 
1264    bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1265 
1266    bi_load_to(b, bits, dest, addr_lo, addr_hi, seg, offset);
1267    bi_emit_cached_split(b, dest, bits);
1268 }
1269 
1270 static void
1271 bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
1272 {
1273    /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
1274    assert(nir_intrinsic_write_mask(instr) ==
1275           BITFIELD_MASK(instr->num_components));
1276 
1277    int16_t offset = 0;
1278    bi_index addr_lo = bi_extract(b, bi_src_index(&instr->src[1]), 0);
1279    bi_index addr_hi = bi_addr_high(b, &instr->src[1]);
1280 
1281    bi_handle_segment(b, &addr_lo, &addr_hi, seg, &offset);
1282 
1283    bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
1284             bi_src_index(&instr->src[0]), addr_lo, addr_hi, seg, offset);
1285 }
1286 
1287 /* Exchanges the staging register with memory */
1288 
1289 static void
1290 bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg,
1291                  enum bi_seg seg)
1292 {
1293    assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1294 
1295    unsigned sz = nir_src_bit_size(*arg);
1296    assert(sz == 32 || sz == 64);
1297 
1298    bi_index data = bi_src_index(arg);
1299 
1300    bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1301 
1302    if (b->shader->arch >= 9)
1303       bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1304    else if (seg == BI_SEG_WLS)
1305       addr_hi = bi_zero();
1306 
1307    bi_axchg_to(b, sz, dst, data, bi_extract(b, addr, 0), addr_hi, seg);
1308 }
1309 
1310 /* Exchanges the second staging register with memory if comparison with first
1311  * staging register passes */
1312 
1313 static void
1314 bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1,
1315                     nir_src *arg_2, enum bi_seg seg)
1316 {
1317    assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);
1318 
1319    /* hardware is swapped from NIR */
1320    bi_index src0 = bi_src_index(arg_2);
1321    bi_index src1 = bi_src_index(arg_1);
1322 
1323    unsigned sz = nir_src_bit_size(*arg_1);
1324    assert(sz == 32 || sz == 64);
1325 
1326    bi_index data_words[] = {
1327       bi_extract(b, src0, 0),
1328       sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src0, 1),
1329 
1330       /* 64-bit */
1331       bi_extract(b, src1, 0),
1332       sz == 32 ? bi_extract(b, src1, 0) : bi_extract(b, src1, 1),
1333    };
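   /* The staging vector above packs the value from arg_2 first and the value
    * from arg_1 second (as lo/hi pairs for 64-bit), matching the swapped
    * hardware order noted above. */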
1334 
1335    bi_index in = bi_temp(b->shader);
1336    bi_emit_collect_to(b, in, data_words, 2 * (sz / 32));
1337    bi_index addr_hi = (seg == BI_SEG_WLS) ? bi_zero() : bi_extract(b, addr, 1);
1338 
1339    if (b->shader->arch >= 9)
1340       bi_handle_segment(b, &addr, &addr_hi, seg, NULL);
1341    else if (seg == BI_SEG_WLS)
1342       addr_hi = bi_zero();
1343 
1344    bi_index out = bi_acmpxchg(b, sz, in, bi_extract(b, addr, 0), addr_hi, seg);
1345    bi_emit_cached_split(b, out, sz);
1346 
1347    bi_index inout_words[] = {bi_extract(b, out, 0),
1348                              sz == 64 ? bi_extract(b, out, 1) : bi_null()};
1349 
1350    bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
1351 }
1352 
1353 static enum bi_atom_opc
1354 bi_atom_opc_for_nir(nir_atomic_op op)
1355 {
1356    /* clang-format off */
1357    switch (op) {
1358    case nir_atomic_op_iadd: return BI_ATOM_OPC_AADD;
1359    case nir_atomic_op_imin: return BI_ATOM_OPC_ASMIN;
1360    case nir_atomic_op_umin: return BI_ATOM_OPC_AUMIN;
1361    case nir_atomic_op_imax: return BI_ATOM_OPC_ASMAX;
1362    case nir_atomic_op_umax: return BI_ATOM_OPC_AUMAX;
1363    case nir_atomic_op_iand: return BI_ATOM_OPC_AAND;
1364    case nir_atomic_op_ior:  return BI_ATOM_OPC_AOR;
1365    case nir_atomic_op_ixor: return BI_ATOM_OPC_AXOR;
1366    default: unreachable("Unexpected computational atomic");
1367    }
1368    /* clang-format on */
1369 }
1370 
1371 /* Optimized unary atomics are available with an implied #1 argument */
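/* For example (from the mapping below): an atomic add of +1 is promoted to
 * AINC, an add of -1 to ADEC, and an OR with 1 to AOR1. */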
1372 
1373 static bool
1374 bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
1375 {
1376    /* Check we have a compatible constant */
1377    if (arg.type != BI_INDEX_CONSTANT)
1378       return false;
1379 
1380    if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
1381       return false;
1382 
1383    /* Check for a compatible operation */
1384    switch (op) {
1385    case BI_ATOM_OPC_AADD:
1386       *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
1387       return true;
1388    case BI_ATOM_OPC_ASMAX:
1389       *out = BI_ATOM_OPC_ASMAX1;
1390       return true;
1391    case BI_ATOM_OPC_AUMAX:
1392       *out = BI_ATOM_OPC_AUMAX1;
1393       return true;
1394    case BI_ATOM_OPC_AOR:
1395       *out = BI_ATOM_OPC_AOR1;
1396       return true;
1397    default:
1398       return false;
1399    }
1400 }
1401 
1402 /*
1403  * Coordinates are 16-bit integers in Bifrost but 32-bit in NIR. We need to
1404  * translate between these forms (with MKVEC.v2i16).
1405  *
1406  * Additionally, on Valhall, cube maps in the attribute pipe are treated as 2D
1407  * arrays.  For uniform handling, we also treat 3D textures like 2D arrays.
1408  *
1409  * Our indexing needs to reflect this. Since Valhall and Bifrost are quite
1410  * different, we provide separate functions for these.
1411  */
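/* Illustrative example for the Bifrost path below: a 2D array load with
 * coordinates (x, y, layer) packs x and y as 16-bit halves into the first
 * source with MKVEC.v2i16 and returns the layer index as the second source. */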
1412 static bi_index
1413 bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
1414                     unsigned coord_comps, bool is_array, bool is_msaa)
1415 {
1416    assert(coord_comps > 0 && coord_comps <= 3);
1417 
1418    /* MSAA load store should have been lowered */
1419    assert(!is_msaa);
1420    if (src_idx == 0) {
1421       if (coord_comps == 1 || (coord_comps == 2 && is_array))
1422          return bi_extract(b, coord, 0);
1423       else
1424          return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1425                                bi_half(bi_extract(b, coord, 1), false));
1426    } else {
1427       if (coord_comps == 3)
1428          return bi_extract(b, coord, 2);
1429       else if (coord_comps == 2 && is_array)
1430          return bi_extract(b, coord, 1);
1431       else
1432          return bi_zero();
1433    }
1434 }
1435 
1436 static bi_index
1437 va_emit_image_coord(bi_builder *b, bi_index coord, bi_index sample_index,
1438                     unsigned src_idx, unsigned coord_comps, bool is_array,
1439                     bool is_msaa)
1440 {
1441    assert(coord_comps > 0 && coord_comps <= 3);
1442    if (src_idx == 0) {
1443       if (coord_comps == 1 || (coord_comps == 2 && is_array))
1444          return bi_extract(b, coord, 0);
1445       else
1446          return bi_mkvec_v2i16(b, bi_half(bi_extract(b, coord, 0), false),
1447                                bi_half(bi_extract(b, coord, 1), false));
1448    } else if (is_msaa) {
1449       bi_index array_idx = bi_extract(b, sample_index, 0);
1450       if (coord_comps == 3)
1451          return bi_mkvec_v2i16(b, bi_half(array_idx, false),
1452                                bi_half(bi_extract(b, coord, 2), false));
1453       else if (coord_comps == 2)
1454          return array_idx;
1455    } else if (coord_comps == 3)
1456       return bi_mkvec_v2i16(b, bi_imm_u16(0),
1457                             bi_half(bi_extract(b, coord, 2), false));
1458    else if (coord_comps == 2 && is_array)
1459       return bi_mkvec_v2i16(b, bi_imm_u16(0),
1460                             bi_half(bi_extract(b, coord, 1), false));
1461    return bi_zero();
1462 }
1463 
1464 static void
1465 bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
1466 {
1467    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1468    unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1469    bool array = nir_intrinsic_image_array(instr);
1470 
1471    bi_index coords = bi_src_index(&instr->src[1]);
1472    bi_index indexvar = bi_src_index(&instr->src[2]);
1473    bi_index xy, zw;
1474    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS);
1475    if (b->shader->arch < 9) {
1476       xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
1477       zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
1478    } else {
1479       xy =
1480          va_emit_image_coord(b, coords, indexvar, 0, coord_comps, array, is_ms);
1481       zw =
1482          va_emit_image_coord(b, coords, indexvar, 1, coord_comps, array, is_ms);
1483    }
1484    bi_index dest = bi_def_index(&instr->def);
1485    enum bi_register_format regfmt =
1486       bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr));
1487    enum bi_vecsize vecsize = instr->num_components - 1;
1488 
1489    if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1490       const unsigned raw_value = nir_src_as_uint(instr->src[0]);
1491       const unsigned table_index = pan_res_handle_get_table(raw_value);
1492       const unsigned texture_index = pan_res_handle_get_index(raw_value);
1493 
1494       if (texture_index < 16 && va_is_valid_const_table(table_index)) {
1495          bi_instr *I =
1496             bi_ld_tex_imm_to(b, dest, xy, zw, regfmt, vecsize, texture_index);
1497          I->table = va_res_fold_table_idx(table_index);
1498       } else {
1499          bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1500                       vecsize);
1501       }
1502    } else if (b->shader->arch >= 9) {
1503       bi_ld_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1504                    vecsize);
1505    } else {
1506       bi_ld_attr_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), regfmt,
1507                         vecsize);
1508    }
1509 
1510    bi_split_def(b, &instr->def);
1511 }
1512 
1513 static void
1514 bi_emit_lea_image_to(bi_builder *b, bi_index dest, nir_intrinsic_instr *instr)
1515 {
1516    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1517    bool array = nir_intrinsic_image_array(instr);
1518    unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
1519 
1520    enum bi_register_format type =
1521       (instr->intrinsic == nir_intrinsic_image_store)
1522          ? bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr))
1523          : BI_REGISTER_FORMAT_AUTO;
1524 
1525    bi_index coords = bi_src_index(&instr->src[1]);
1526    bi_index indices = bi_src_index(&instr->src[2]);
1527    bi_index xy, zw;
1528    bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
1529    if (b->shader->arch < 9) {
1530       xy = bi_emit_image_coord(b, coords, 0, coord_comps, array, is_ms);
1531       zw = bi_emit_image_coord(b, coords, 1, coord_comps, array, is_ms);
1532    } else {
1533       xy =
1534          va_emit_image_coord(b, coords, indices, 0, coord_comps, array, is_ms);
1535       zw =
1536          va_emit_image_coord(b, coords, indices, 1, coord_comps, array, is_ms);
1537    }
1538 
1539    if (b->shader->arch >= 9 && nir_src_is_const(instr->src[0])) {
1540       const unsigned raw_value = nir_src_as_uint(instr->src[0]);
1541       unsigned table_index = pan_res_handle_get_table(raw_value);
1542       unsigned texture_index = pan_res_handle_get_index(raw_value);
1543 
1544       if (texture_index < 16 && va_is_valid_const_table(table_index)) {
1545          bi_instr *I = bi_lea_tex_imm_to(b, dest, xy, zw, false, texture_index);
1546          I->table = va_res_fold_table_idx(table_index);
1547       } else {
1548          bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
1549       }
1550    } else if (b->shader->arch >= 9) {
1551       bi_lea_tex_to(b, dest, xy, zw, bi_src_index(&instr->src[0]), false);
1552    } else {
1553       bi_instr *I = bi_lea_attr_tex_to(b, dest, xy, zw,
1554                                        bi_src_index(&instr->src[0]), type);
1555 
1556       /* LEA_ATTR_TEX defaults to the secondary attribute table, but
1557        * our ABI has all images in the primary attribute table
1558        */
1559       I->table = BI_TABLE_ATTRIBUTE_1;
1560    }
1561 
1562    bi_emit_cached_split(b, dest, 3 * 32);
1563 }
1564 
1565 static bi_index
1566 bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
1567 {
1568    bi_index dest = bi_temp(b->shader);
1569    bi_emit_lea_image_to(b, dest, instr);
1570    return dest;
1571 }
1572 
1573 static void
1574 bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
1575 {
1576    bi_index a[4] = {bi_null()};
1577    bi_emit_split_i32(b, a, bi_emit_lea_image(b, instr), 3);
1578 
1579    /* Due to SPIR-V limitations, the source type is not fully reliable: it
1580     * reports uint32 even for write_imagei. This causes an incorrect
1581     * u32->s32->u32 roundtrip which incurs an unwanted clamping. Use auto32
1582     * instead, which will match per the OpenCL spec. Of course this does
1583     * not work for 16-bit stores, but those are not available in OpenCL.
1584     */
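   /* Presumably, e.g., storing -1 through a write_imagei view would otherwise
    * be treated as the unsigned value 0xFFFFFFFF and clamped during the
    * format conversion, whereas auto32 keeps the bit pattern unchanged. */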
1585    nir_alu_type T = nir_intrinsic_src_type(instr);
1586    assert(nir_alu_type_get_type_size(T) == 32);
1587 
1588    bi_st_cvt(b, bi_src_index(&instr->src[3]), a[0], a[1], a[2],
1589              BI_REGISTER_FORMAT_AUTO, instr->num_components - 1);
1590 }
1591 
1592 static void
1593 bi_emit_atomic_i32_to(bi_builder *b, bi_index dst, bi_index addr, bi_index arg,
1594                       nir_atomic_op op)
1595 {
1596    enum bi_atom_opc opc = bi_atom_opc_for_nir(op);
1597    enum bi_atom_opc post_opc = opc;
1598    bool bifrost = b->shader->arch <= 8;
1599 
1600    /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
1601     * take any vector but can still output in RETURN mode */
1602    bi_index tmp_dest = bifrost ? bi_temp(b->shader) : dst;
1603    unsigned sr_count = bifrost ? 2 : 1;
1604 
1605    /* Generate either ATOM or ATOM1 as required */
1606    if (bi_promote_atom_c1(opc, arg, &opc)) {
1607       bi_atom1_return_i32_to(b, tmp_dest, bi_extract(b, addr, 0),
1608                              bi_extract(b, addr, 1), opc, sr_count);
1609    } else {
1610       bi_atom_return_i32_to(b, tmp_dest, arg, bi_extract(b, addr, 0),
1611                             bi_extract(b, addr, 1), opc, sr_count);
1612    }
1613 
1614    if (bifrost) {
1615       /* Post-process it */
1616       bi_emit_cached_split_i32(b, tmp_dest, 2);
1617       bi_atom_post_i32_to(b, dst, bi_extract(b, tmp_dest, 0),
1618                           bi_extract(b, tmp_dest, 1), post_opc);
1619    }
1620 }
1621 
1622 static void
1623 bi_emit_load_frag_coord_zw(bi_builder *b, bi_index dst, unsigned channel)
1624 {
1625    bi_ld_var_special_to(
1626       b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
1627       BI_UPDATE_CLOBBER,
1628       (channel == 2) ? BI_VARYING_NAME_FRAG_Z : BI_VARYING_NAME_FRAG_W,
1629       BI_VECSIZE_NONE);
1630 }
1631 
1632 static void
1633 bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
1634 {
1635    bi_index dest = bi_def_index(&instr->def);
1636    nir_alu_type T = nir_intrinsic_dest_type(instr);
1637    enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
1638    unsigned size = instr->def.bit_size;
1639    unsigned nr = instr->num_components;
1640 
1641    /* Get the render target */
1642    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
1643    unsigned loc = sem.location;
1644    assert(loc >= FRAG_RESULT_DATA0);
1645    unsigned rt = (loc - FRAG_RESULT_DATA0);
1646 
1647    bi_ld_tile_to(b, dest, bi_pixel_indices(b, rt), bi_coverage(b),
1648                  bi_src_index(&instr->src[0]), regfmt, nr - 1);
1649    bi_emit_cached_split(b, dest, size * nr);
1650 }
1651 
1652 /*
1653  * Older Bifrost hardware has a limited CLPER instruction. Add a safe helper
1654  * that uses the hardware functionality if available and lowers otherwise.
1655  */
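/* Concretely: the older CLPER variant only accepts an absolute lane index,
 * so when an XOR lane op is requested we fold the XOR with the current lane
 * id into s1 up front and then issue CLPER_OLD; newer cores take the lane op
 * directly. */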
1656 static bi_index
1657 bi_clper(bi_builder *b, bi_index s0, bi_index s1, enum bi_lane_op lop)
1658 {
1659    if (b->shader->quirks & BIFROST_LIMITED_CLPER) {
1660       if (lop == BI_LANE_OP_XOR) {
1661          bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
1662          s1 = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
1663       } else {
1664          assert(lop == BI_LANE_OP_NONE);
1665       }
1666 
1667       return bi_clper_old_i32(b, s0, s1);
1668    } else {
1669       return bi_clper_i32(b, s0, s1, BI_INACTIVE_RESULT_ZERO, lop,
1670                           BI_SUBGROUP_SUBGROUP4);
1671    }
1672 }
1673 
1674 static bool
1675 bi_nir_all_uses_fabs(nir_def *def)
1676 {
1677    nir_foreach_use(use, def) {
1678       nir_instr *instr = nir_src_parent_instr(use);
1679 
1680       if (instr->type != nir_instr_type_alu ||
1681           nir_instr_as_alu(instr)->op != nir_op_fabs)
1682          return false;
1683    }
1684 
1685    return true;
1686 }
1687 
1688 static void
1689 bi_emit_derivative(bi_builder *b, bi_index dst, nir_intrinsic_instr *instr,
1690                    unsigned axis, bool coarse)
1691 {
1692    bi_index left, right;
1693    bi_index s0 = bi_src_index(&instr->src[0]);
1694    unsigned sz = instr->def.bit_size;
1695 
1696    /* If all uses are fabs, the sign of the derivative doesn't matter. This is
1697     * inherently based on fine derivatives so we can't do it for coarse.
1698     */
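   /* In that case a single CLPER.XOR fetches the neighbouring lane along the
    * requested axis of the 2x2 quad (axis 1 for X, 2 for Y), and the
    * possibly-flipped sign of the difference is harmless since only |d| is
    * observed. */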
1699    if (bi_nir_all_uses_fabs(&instr->def) && !coarse) {
1700       left = s0;
1701       right = bi_clper(b, s0, bi_imm_u32(axis), BI_LANE_OP_XOR);
1702    } else {
1703       bi_index lane1, lane2;
1704       if (coarse) {
1705          lane1 = bi_imm_u32(0);
1706          lane2 = bi_imm_u32(axis);
1707       } else {
1708          lane1 = bi_lshift_and_i32(b, bi_fau(BIR_FAU_LANE_ID, false),
1709                                    bi_imm_u32(0x3 & ~axis), bi_imm_u8(0));
1710 
1711          lane2 = bi_iadd_u32(b, lane1, bi_imm_u32(axis), false);
1712       }
1713 
1714       left = bi_clper(b, s0, lane1, BI_LANE_OP_NONE);
1715       right = bi_clper(b, s0, lane2, BI_LANE_OP_NONE);
1716    }
1717 
1718    bi_fadd_to(b, sz, dst, right, bi_neg(left));
1719 }
1720 
1721 static void
1722 bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
1723 {
1724    bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1725                      ? bi_def_index(&instr->def)
1726                      : bi_null();
1727    gl_shader_stage stage = b->shader->stage;
1728 
1729    switch (instr->intrinsic) {
1730    case nir_intrinsic_load_barycentric_pixel:
1731    case nir_intrinsic_load_barycentric_centroid:
1732    case nir_intrinsic_load_barycentric_sample:
1733    case nir_intrinsic_load_barycentric_at_sample:
1734    case nir_intrinsic_load_barycentric_at_offset:
1735       /* handled later via load_vary */
1736       break;
1737    case nir_intrinsic_load_interpolated_input:
1738    case nir_intrinsic_load_input:
1739       if (b->shader->inputs->is_blend)
1740          bi_emit_load_blend_input(b, instr);
1741       else if (stage == MESA_SHADER_FRAGMENT)
1742          bi_emit_load_vary(b, instr);
1743       else if (stage == MESA_SHADER_VERTEX)
1744          bi_emit_load_attr(b, instr);
1745       else
1746          unreachable("Unsupported shader stage");
1747       break;
1748 
1749    case nir_intrinsic_store_output:
1750       if (stage == MESA_SHADER_FRAGMENT)
1751          bi_emit_fragment_out(b, instr);
1752       else if (stage == MESA_SHADER_VERTEX)
1753          bi_emit_store_vary(b, instr);
1754       else
1755          unreachable("Unsupported shader stage");
1756       break;
1757 
1758    case nir_intrinsic_store_combined_output_pan:
1759       assert(stage == MESA_SHADER_FRAGMENT);
1760       bi_emit_fragment_out(b, instr);
1761       break;
1762 
1763    case nir_intrinsic_load_ubo:
1764       bi_emit_load_ubo(b, instr);
1765       break;
1766 
1767    case nir_intrinsic_load_push_constant:
1768       bi_emit_load_push_constant(b, instr);
1769       break;
1770 
1771    case nir_intrinsic_load_global:
1772    case nir_intrinsic_load_global_constant:
1773       bi_emit_load(b, instr, BI_SEG_NONE);
1774       break;
1775 
1776    case nir_intrinsic_store_global:
1777       bi_emit_store(b, instr, BI_SEG_NONE);
1778       break;
1779 
1780    case nir_intrinsic_load_scratch:
1781       bi_emit_load(b, instr, BI_SEG_TL);
1782       break;
1783 
1784    case nir_intrinsic_store_scratch:
1785       bi_emit_store(b, instr, BI_SEG_TL);
1786       break;
1787 
1788    case nir_intrinsic_load_shared:
1789       bi_emit_load(b, instr, BI_SEG_WLS);
1790       break;
1791 
1792    case nir_intrinsic_store_shared:
1793       bi_emit_store(b, instr, BI_SEG_WLS);
1794       break;
1795 
1796    case nir_intrinsic_barrier:
1797       if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
1798          assert(b->shader->stage != MESA_SHADER_FRAGMENT);
1799          assert(nir_intrinsic_execution_scope(instr) > SCOPE_SUBGROUP &&
1800                 "todo: subgroup barriers (different divergence rules)");
1801          bi_barrier(b);
1802       }
1803       /* Blob doesn't seem to do anything for memory barriers, so no need to
1804        * check nir_intrinsic_memory_scope().
1805        */
1806       break;
1807 
1808    case nir_intrinsic_shared_atomic: {
1809       nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1810 
1811       if (op == nir_atomic_op_xchg) {
1812          bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1813                           BI_SEG_WLS);
1814       } else {
1815          assert(nir_src_bit_size(instr->src[1]) == 32);
1816 
1817          bi_index addr = bi_src_index(&instr->src[0]);
1818          bi_index addr_hi;
1819 
1820          if (b->shader->arch >= 9) {
1821             bi_handle_segment(b, &addr, &addr_hi, BI_SEG_WLS, NULL);
1822             addr = bi_collect_v2i32(b, addr, addr_hi);
1823          } else {
1824             addr = bi_seg_add_i64(b, addr, bi_zero(), false, BI_SEG_WLS);
1825             bi_emit_cached_split(b, addr, 64);
1826          }
1827 
1828          bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), op);
1829       }
1830 
1831       bi_split_def(b, &instr->def);
1832       break;
1833    }
1834 
1835    case nir_intrinsic_global_atomic: {
1836       nir_atomic_op op = nir_intrinsic_atomic_op(instr);
1837 
1838       if (op == nir_atomic_op_xchg) {
1839          bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1840                           BI_SEG_NONE);
1841       } else {
1842          assert(nir_src_bit_size(instr->src[1]) == 32);
1843 
1844          bi_emit_atomic_i32_to(b, dst, bi_src_index(&instr->src[0]),
1845                                bi_src_index(&instr->src[1]), op);
1846       }
1847 
1848       bi_split_def(b, &instr->def);
1849       break;
1850    }
1851 
1852    case nir_intrinsic_image_texel_address:
1853       bi_emit_lea_image_to(b, dst, instr);
1854       break;
1855 
1856    case nir_intrinsic_image_load:
1857       bi_emit_image_load(b, instr);
1858       break;
1859 
1860    case nir_intrinsic_image_store:
1861       bi_emit_image_store(b, instr);
1862       break;
1863 
1864    case nir_intrinsic_global_atomic_swap:
1865       bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1866                           &instr->src[2], BI_SEG_NONE);
1867       bi_split_def(b, &instr->def);
1868       break;
1869 
1870    case nir_intrinsic_shared_atomic_swap:
1871       bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), &instr->src[1],
1872                           &instr->src[2], BI_SEG_WLS);
1873       bi_split_def(b, &instr->def);
1874       break;
1875 
1876    case nir_intrinsic_load_pixel_coord:
1877       /* Vectorized load of the preloaded i16vec2 */
1878       bi_mov_i32_to(b, dst, bi_preload(b, 59));
1879       break;
1880 
1881    case nir_intrinsic_load_frag_coord_zw:
1882       bi_emit_load_frag_coord_zw(b, dst, nir_intrinsic_component(instr));
1883       break;
1884 
1885    case nir_intrinsic_load_converted_output_pan:
1886       bi_emit_ld_tile(b, instr);
1887       break;
1888 
1889    case nir_intrinsic_terminate_if:
1890       bi_discard_b32(b, bi_src_index(&instr->src[0]));
1891       break;
1892 
1893    case nir_intrinsic_terminate:
1894       bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
1895       break;
1896 
1897    case nir_intrinsic_load_sample_positions_pan:
1898       bi_collect_v2i32_to(b, dst, bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false),
1899                           bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
1900       break;
1901 
1902    case nir_intrinsic_load_sample_mask_in:
1903       /* r61[0:15] contains the coverage bitmap */
1904       bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false));
1905       break;
1906 
1907    case nir_intrinsic_load_sample_mask:
1908       bi_mov_i32_to(b, dst, bi_coverage(b));
1909       break;
1910 
1911    case nir_intrinsic_load_sample_id:
1912       bi_load_sample_id_to(b, dst);
1913       break;
1914 
1915    case nir_intrinsic_load_front_face:
1916       /* r58 == 0 means primitive is front facing */
1917       bi_icmp_i32_to(b, dst, bi_preload(b, 58), bi_zero(), BI_CMPF_EQ,
1918                      BI_RESULT_TYPE_M1);
1919       break;
1920 
1921    case nir_intrinsic_load_point_coord:
1922       bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
1923                            BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
1924                            BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
1925       bi_emit_cached_split_i32(b, dst, 2);
1926       break;
1927 
1928    /* It appears vertex_id is zero-based with Bifrost geometry flows, but
1929     * not with Valhall's memory-allocation IDVS geometry flow. We only support
1930     * the new flow on Valhall so this is lowered in NIR.
1931     */
1932    case nir_intrinsic_load_vertex_id:
1933    case nir_intrinsic_load_vertex_id_zero_base:
1934       assert(b->shader->malloc_idvs ==
1935              (instr->intrinsic == nir_intrinsic_load_vertex_id));
1936 
1937       bi_mov_i32_to(b, dst, bi_vertex_id(b));
1938       break;
1939 
1940    case nir_intrinsic_load_instance_id:
1941       bi_mov_i32_to(b, dst, bi_instance_id(b));
1942       break;
1943 
1944    case nir_intrinsic_load_draw_id:
1945       bi_mov_i32_to(b, dst, bi_draw_id(b));
1946       break;
1947 
1948    case nir_intrinsic_load_subgroup_invocation:
1949       bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
1950       break;
1951 
1952    case nir_intrinsic_load_local_invocation_id:
1953       bi_collect_v3i32_to(b, dst,
1954                           bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 0)),
1955                           bi_u16_to_u32(b, bi_half(bi_preload(b, 55), 1)),
1956                           bi_u16_to_u32(b, bi_half(bi_preload(b, 56), 0)));
1957       break;
1958 
1959    case nir_intrinsic_load_workgroup_id:
1960       bi_collect_v3i32_to(b, dst, bi_preload(b, 57), bi_preload(b, 58),
1961                           bi_preload(b, 59));
1962       break;
1963 
1964    case nir_intrinsic_load_global_invocation_id:
1965       bi_collect_v3i32_to(b, dst, bi_preload(b, 60), bi_preload(b, 61),
1966                           bi_preload(b, 62));
1967       break;
1968 
1969    case nir_intrinsic_shader_clock:
1970       bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
1971       bi_split_def(b, &instr->def);
1972       break;
1973 
1974    case nir_intrinsic_ddx:
1975    case nir_intrinsic_ddx_fine:
1976       bi_emit_derivative(b, dst, instr, 1, false);
1977       break;
1978    case nir_intrinsic_ddx_coarse:
1979       bi_emit_derivative(b, dst, instr, 1, true);
1980       break;
1981    case nir_intrinsic_ddy:
1982    case nir_intrinsic_ddy_fine:
1983       bi_emit_derivative(b, dst, instr, 2, false);
1984       break;
1985    case nir_intrinsic_ddy_coarse:
1986       bi_emit_derivative(b, dst, instr, 2, true);
1987       break;
1988 
1989    case nir_intrinsic_load_layer_id:
1990       assert(b->shader->arch >= 9);
1991       bi_mov_i32_to(b, dst, bi_u8_to_u32(b, bi_byte(bi_preload(b, 62), 0)));
1992       break;
1993 
1994    case nir_intrinsic_load_ssbo_address:
1995       assert(b->shader->arch >= 9);
1996       bi_lea_buffer_to(b, dst, bi_src_index(&instr->src[1]),
1997                        bi_src_index(&instr->src[0]));
1998       bi_emit_cached_split(b, dst, 64);
1999       break;
2000 
2001    case nir_intrinsic_load_ssbo: {
2002       assert(b->shader->arch >= 9);
2003       unsigned dst_bits = instr->num_components * instr->def.bit_size;
2004       bi_ld_buffer_to(b, dst_bits, dst, bi_src_index(&instr->src[1]),
2005                       bi_src_index(&instr->src[0]));
2006       bi_emit_cached_split(b, dst, dst_bits);
2007       break;
2008    }
2009 
2010    default:
2011       fprintf(stderr, "Unhandled intrinsic %s\n",
2012               nir_intrinsic_infos[instr->intrinsic].name);
2013       assert(0);
2014    }
2015 }
2016 
2017 static void
2018 bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
2019 {
2020    /* Make sure we've been lowered */
2021    assert(instr->def.num_components <= (32 / instr->def.bit_size));
2022 
2023    /* Accumulate all the channels of the constant, as if we did an
2024     * implicit SEL over them */
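   /* For instance, a 16-bit vec2 constant (1.0, 2.0) packs to
    * 0x3C00 | (0x4000 << 16) = 0x40003C00 and is moved as a single word. */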
2025    uint32_t acc = 0;
2026 
2027    for (unsigned i = 0; i < instr->def.num_components; ++i) {
2028       unsigned v =
2029          nir_const_value_as_uint(instr->value[i], instr->def.bit_size);
2030       acc |= (v << (i * instr->def.bit_size));
2031    }
2032 
2033    bi_mov_i32_to(b, bi_get_index(instr->def.index), bi_imm_u32(acc));
2034 }
2035 
2036 static bi_index
2037 bi_alu_src_index(bi_builder *b, nir_alu_src src, unsigned comps)
2038 {
2039    unsigned bitsize = nir_src_bit_size(src.src);
2040 
2041    /* The bi_index carries the 32-bit (word) offset separately from the
2042     * subword swizzle; handle the offset first. */
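   /* For example (hypothetical source): a 16-bit operand swizzled .zw lives
    * entirely in word 1, so the offset is 1 and the remaining sub-word
    * selection becomes an H01-style swizzle on that word. */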
2043 
2044    unsigned offset = 0;
2045 
2046    assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
2047    unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;
2048 
2049    for (unsigned i = 0; i < comps; ++i) {
2050       unsigned new_offset = (src.swizzle[i] >> subword_shift);
2051 
2052       if (i > 0)
2053          assert(offset == new_offset && "wrong vectorization");
2054 
2055       offset = new_offset;
2056    }
2057 
2058    bi_index idx = bi_extract(b, bi_src_index(&src.src), offset);
2059 
2060    /* Compose the subword swizzle with existing (identity) swizzle */
2061    assert(idx.swizzle == BI_SWIZZLE_H01);
2062 
2063    /* Bigger vectors should have been lowered */
2064    assert(comps <= (1 << subword_shift));
2065 
2066    if (bitsize == 16) {
2067       unsigned c0 = src.swizzle[0] & 1;
2068       unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
2069       idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
2070    } else if (bitsize == 8 && comps == 1) {
2071       idx.swizzle = BI_SWIZZLE_B0000 + (src.swizzle[0] & 3);
2072    } else if (bitsize == 8) {
2073       /* XXX: Use optimized swizzle when possible */
2074       bi_index unoffset_srcs[NIR_MAX_VEC_COMPONENTS] = {bi_null()};
2075       unsigned channels[NIR_MAX_VEC_COMPONENTS] = {0};
2076 
2077       for (unsigned i = 0; i < comps; ++i) {
2078          unoffset_srcs[i] = bi_src_index(&src.src);
2079          channels[i] = src.swizzle[i];
2080       }
2081 
2082       bi_index temp = bi_temp(b->shader);
2083       bi_make_vec_to(b, temp, unoffset_srcs, channels, comps, bitsize);
2084 
2085       static const enum bi_swizzle swizzle_lut[] = {
2086          BI_SWIZZLE_B0000, BI_SWIZZLE_B0011, BI_SWIZZLE_H01, BI_SWIZZLE_H01};
2087       assert(comps - 1 < ARRAY_SIZE(swizzle_lut));
2088 
2089       /* Assign a coherent swizzle for the vector */
2090       temp.swizzle = swizzle_lut[comps - 1];
2091 
2092       return temp;
2093    }
2094 
2095    return idx;
2096 }
2097 
2098 static enum bi_round
2099 bi_nir_round(nir_op op)
2100 {
2101    switch (op) {
2102    case nir_op_fround_even:
2103       return BI_ROUND_NONE;
2104    case nir_op_ftrunc:
2105       return BI_ROUND_RTZ;
2106    case nir_op_fceil:
2107       return BI_ROUND_RTP;
2108    case nir_op_ffloor:
2109       return BI_ROUND_RTN;
2110    default:
2111       unreachable("invalid nir round op");
2112    }
2113 }
2114 
2115 /* Convenience for lowered transcendentals */
2116 
2117 static bi_index
2118 bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
2119 {
2120    return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f));
2121 }
2122 
2123 /* Approximate with FRCP_APPROX.f32 and apply a single iteration of
2124  * Newton-Raphson to improve precision */
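/* Roughly: with x1 ~= 1/s0, one Newton-Raphson step has the form
 * x2 = x1 + x1 * (1 - s0 * x1), doubling the number of accurate bits. The
 * FREXP/FMA_RSCALE sequence below evaluates the correction term against the
 * mantissa of s0 and folds the exponent back in with the final rescale so
 * the intermediate stays in range. */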
2125 
2126 static void
2127 bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
2128 {
2129    bi_index x1 = bi_frcp_approx_f32(b, s0);
2130    bi_index m = bi_frexpm_f32(b, s0, false, false);
2131    bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
2132    bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), bi_zero(),
2133                                    BI_SPECIAL_N);
2134    bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, BI_SPECIAL_NONE);
2135 }
2136 
2137 static void
2138 bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
2139 {
2140    bi_index x1 = bi_frsq_approx_f32(b, s0);
2141    bi_index m = bi_frexpm_f32(b, s0, false, true);
2142    bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
2143    bi_index t1 = bi_fmul_f32(b, x1, x1);
2144    bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
2145                                    bi_imm_u32(-1), BI_SPECIAL_N);
2146    bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, BI_SPECIAL_N);
2147 }
2148 
2149 /* More complex transcendentals, see
2150  * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
2151  * for documentation */
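/* A sketch of the fexp2 lowering below: adding the bias 0x49400000
 * (786432.0f, whose ULP is 1/16) rounds s0 to a 4-bit fraction held in the
 * low mantissa bits. t1 then indexes FEXP_TABLE for the coarse 2^frac
 * factor, a1i recovers the integer part as a scale, and the polynomial in
 * the residual a2 (coefficients ~ln(2), ln^2(2)/2, ln^3(2)/6) supplies
 * 2^a2 - 1, which the final FMA_RSCALE folds back in. */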
2152 
2153 static void
2154 bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
2155 {
2156    bi_index t1 = bi_temp(b->shader);
2157    bi_instr *t1_instr = bi_fadd_f32_to(b, t1, s0, bi_imm_u32(0x49400000));
2158    t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;
2159 
2160    bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000));
2161 
2162    bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), s0, bi_neg(t2));
2163    a2->clamp = BI_CLAMP_CLAMP_M1_1;
2164 
2165    bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
2166    bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
2167    bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
2168    bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
2169                             bi_imm_u32(0x3e75fffa));
2170    bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], bi_imm_u32(0x3f317218));
2171    bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
2172    bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), p3, a1t, a1t, a1i,
2173                                       BI_SPECIAL_NONE);
2174    x->clamp = BI_CLAMP_CLAMP_0_INF;
2175 
2176    bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
2177    max->sem = BI_SEM_NAN_PROPAGATE;
2178 }
2179 
2180 static void
2181 bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
2182 {
2183    /* Scale by the base, multiply by 2^24 and convert to integer to get an
2184     * 8:24 fixed-point input */
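   /* For instance (illustrative numbers): s0 = 2.5 with log2_base = 1.0
    * becomes round(2.5 * 2^24) = 41943040, i.e. integer part 2 in the top
    * 8 bits and fraction 0.5 in the low 24 bits. */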
2185    bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
2186                                       bi_imm_u32(24), BI_SPECIAL_NONE);
2187    bi_instr *fixed_pt = bi_f32_to_s32_to(b, bi_temp(b->shader), scale);
2188    fixed_pt->round = BI_ROUND_NONE; // XXX
2189 
2190    /* Compute the result for the fixed-point input, but pass along
2191     * the floating-point scale for correct NaN propagation */
2192    bi_fexp_f32_to(b, dst, fixed_pt->dest[0], scale);
2193 }
2194 
2195 static void
2196 bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2197 {
2198    /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
2199    bi_index a1 = bi_frexpm_f32(b, s0, true, false);
2200    bi_index ei = bi_frexpe_f32(b, s0, true, false);
2201    bi_index ef = bi_s32_to_f32(b, ei);
2202 
2203    /* xt estimates -log(r1), a coarse approximation of log(a1) */
2204    bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
2205    bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);
2206 
2207    /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
2208     * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
2209     * and then log(s0) = x1 + x2 */
2210    bi_index x1 = bi_fadd_f32(b, ef, xt);
2211 
2212    /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
2213     * polynomial approximation around 1. The series is expressed around
2214     * 1, so set y = (a1 * r1) - 1.0 */
2215    bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0));
2216 
2217    /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
2218     * log_e(1 + y) by the Taylor series (lower precision than the blob):
2219     * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
2220    bi_index loge =
2221       bi_fmul_f32(b, y, bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0)));
2222 
2223    bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));
2224 
2225    /* log(s0) = x1 + x2 */
2226    bi_fadd_f32_to(b, dst, x1, x2);
2227 }
2228 
2229 static void
2230 bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
2231 {
2232    bi_index frexp = bi_frexpe_f32(b, s0, true, false);
2233    bi_index frexpi = bi_s32_to_f32(b, frexp);
2234    bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
2235    bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi);
2236 }
2237 
2238 static void
2239 bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2240 {
2241    bi_index log2_base = bi_null();
2242 
2243    if (base.type == BI_INDEX_CONSTANT) {
2244       log2_base = bi_imm_f32(log2f(uif(base.value)));
2245    } else {
2246       log2_base = bi_temp(b->shader);
2247       bi_lower_flog2_32(b, log2_base, base);
2248    }
2249 
2250    return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
2251 }
2252 
2253 static void
2254 bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
2255 {
2256    bi_index log2_base = bi_null();
2257 
2258    if (base.type == BI_INDEX_CONSTANT) {
2259       log2_base = bi_imm_f32(log2f(uif(base.value)));
2260    } else {
2261       log2_base = bi_temp(b->shader);
2262       bi_flog2_32(b, log2_base, base);
2263    }
2264 
2265    return bi_fexp_32(b, dst, exp, log2_base);
2266 }
2267 
2268 /* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
2269  * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
2270  * calculates the results. We use them to calculate sin/cos via a Taylor
2271  * approximation:
2272  *
2273  * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
2274  * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
2275  * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
2276  */
2277 
2278 #define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
2279 #define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
2280 #define SINCOS_BIAS  bi_imm_u32(0x49400000)
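/* With this bias (786432.0f, ULP = 1/16), x_u6 = s0 * (2/pi) + SINCOS_BIAS
 * leaves round(s0 * 32/pi) in the low mantissa bits, so the bottom 6 bits
 * times pi/32 give roughly s0 mod 2*pi; e below is the small residual angle
 * fed into the Taylor correction. */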
2281 
2282 static void
2283 bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
2284 {
2285    /* The bottom 6 bits of the result, times pi/32, approximate s0 mod 2pi */
2286    bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS);
2287 
2288    /* Approximate domain error (small) */
2289    bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS)),
2290                            MPI_OVER_TWO, s0);
2291 
2292    /* Lookup sin(x), cos(x) */
2293    bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
2294    bi_index cosx = bi_fcos_table_u6(b, x_u6, false);
2295 
2296    /* e^2 / 2 */
2297    bi_index e2_over_2 =
2298       bi_fma_rscale_f32(b, e, e, bi_negzero(), bi_imm_u32(-1), BI_SPECIAL_NONE);
2299 
2300    /* (-e^2)/2 f''(x) */
2301    bi_index quadratic =
2302       bi_fma_f32(b, bi_neg(e2_over_2), cos ? cosx : sinx, bi_negzero());
2303 
2304    /* e f'(x) - (e^2/2) f''(x) */
2305    bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
2306                                cos ? bi_neg(sinx) : cosx, quadratic);
2307    I->clamp = BI_CLAMP_CLAMP_M1_1;
2308 
2309    /* f(x) + e f'(x) - (e^2/2) f''(x) */
2310    bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx);
2311 }
2312 
2313 static enum bi_cmpf
2314 bi_translate_cmpf(nir_op op)
2315 {
2316    switch (op) {
2317    case nir_op_ieq8:
2318    case nir_op_ieq16:
2319    case nir_op_ieq32:
2320    case nir_op_feq16:
2321    case nir_op_feq32:
2322       return BI_CMPF_EQ;
2323 
2324    case nir_op_ine8:
2325    case nir_op_ine16:
2326    case nir_op_ine32:
2327    case nir_op_fneu16:
2328    case nir_op_fneu32:
2329       return BI_CMPF_NE;
2330 
2331    case nir_op_ilt8:
2332    case nir_op_ilt16:
2333    case nir_op_ilt32:
2334    case nir_op_flt16:
2335    case nir_op_flt32:
2336    case nir_op_ult8:
2337    case nir_op_ult16:
2338    case nir_op_ult32:
2339       return BI_CMPF_LT;
2340 
2341    case nir_op_ige8:
2342    case nir_op_ige16:
2343    case nir_op_ige32:
2344    case nir_op_fge16:
2345    case nir_op_fge32:
2346    case nir_op_uge8:
2347    case nir_op_uge16:
2348    case nir_op_uge32:
2349       return BI_CMPF_GE;
2350 
2351    default:
2352       unreachable("invalid comparison");
2353    }
2354 }
2355 
2356 static bool
2357 bi_nir_is_replicated(nir_alu_src *src)
2358 {
2359    for (unsigned i = 1; i < nir_src_num_components(src->src); ++i) {
2360       if (src->swizzle[0] != src->swizzle[i])
2361          return false;
2362    }
2363 
2364    return true;
2365 }
2366 
2367 static void
2368 bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
2369 {
2370    bi_index dst = bi_def_index(&instr->def);
2371    unsigned srcs = nir_op_infos[instr->op].num_inputs;
2372    unsigned sz = instr->def.bit_size;
2373    unsigned comps = instr->def.num_components;
2374    unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0;
2375 
2376    /* Indicate scalarness */
2377    if (sz == 16 && comps == 1)
2378       dst.swizzle = BI_SWIZZLE_H00;
2379 
2380    /* First, match against the various moves in NIR. These are
2381     * special-cased because they can operate on vectors even after
2382     * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the
2383     * instruction is no "bigger" than SIMD-within-a-register. These moves
2384     * are the exceptions that need to handle swizzles specially. */
2385 
2386    switch (instr->op) {
2387    case nir_op_vec2:
2388    case nir_op_vec3:
2389    case nir_op_vec4:
2390    case nir_op_vec8:
2391    case nir_op_vec16: {
2392       bi_index unoffset_srcs[16] = {bi_null()};
2393       unsigned channels[16] = {0};
2394 
2395       for (unsigned i = 0; i < srcs; ++i) {
2396          unoffset_srcs[i] = bi_src_index(&instr->src[i].src);
2397          channels[i] = instr->src[i].swizzle[0];
2398       }
2399 
2400       bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz);
2401       return;
2402    }
2403 
2404    case nir_op_unpack_32_2x16: {
2405       /* Should have been scalarized */
2406       assert(comps == 2 && sz == 16);
2407 
2408       bi_index vec = bi_src_index(&instr->src[0].src);
2409       unsigned chan = instr->src[0].swizzle[0];
2410 
2411       bi_mov_i32_to(b, dst, bi_extract(b, vec, chan));
2412       return;
2413    }
2414 
2415    case nir_op_unpack_64_2x32_split_x: {
2416       unsigned chan = (instr->src[0].swizzle[0] * 2) + 0;
2417       bi_mov_i32_to(b, dst,
2418                     bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2419       return;
2420    }
2421 
2422    case nir_op_unpack_64_2x32_split_y: {
2423       unsigned chan = (instr->src[0].swizzle[0] * 2) + 1;
2424       bi_mov_i32_to(b, dst,
2425                     bi_extract(b, bi_src_index(&instr->src[0].src), chan));
2426       return;
2427    }
2428 
2429    case nir_op_pack_64_2x32_split:
2430       bi_collect_v2i32_to(b, dst,
2431                           bi_extract(b, bi_src_index(&instr->src[0].src),
2432                                      instr->src[0].swizzle[0]),
2433                           bi_extract(b, bi_src_index(&instr->src[1].src),
2434                                      instr->src[1].swizzle[0]));
2435       return;
2436 
2437    case nir_op_pack_64_2x32:
2438       bi_collect_v2i32_to(b, dst,
2439                           bi_extract(b, bi_src_index(&instr->src[0].src),
2440                                      instr->src[0].swizzle[0]),
2441                           bi_extract(b, bi_src_index(&instr->src[0].src),
2442                                      instr->src[0].swizzle[1]));
2443       return;
2444 
2445    case nir_op_pack_uvec2_to_uint: {
2446       bi_index src = bi_src_index(&instr->src[0].src);
2447 
2448       assert(sz == 32 && src_sz == 32);
2449       bi_mkvec_v2i16_to(
2450          b, dst, bi_half(bi_extract(b, src, instr->src[0].swizzle[0]), false),
2451          bi_half(bi_extract(b, src, instr->src[0].swizzle[1]), false));
2452       return;
2453    }
2454 
2455    case nir_op_pack_uvec4_to_uint: {
2456       bi_index src = bi_src_index(&instr->src[0].src);
2457 
2458       assert(sz == 32 && src_sz == 32);
2459       bi_mkvec_v4i8_to(
2460          b, dst, bi_byte(bi_extract(b, src, instr->src[0].swizzle[0]), 0),
2461          bi_byte(bi_extract(b, src, instr->src[0].swizzle[1]), 0),
2462          bi_byte(bi_extract(b, src, instr->src[0].swizzle[2]), 0),
2463          bi_byte(bi_extract(b, src, instr->src[0].swizzle[3]), 0));
2464       return;
2465    }
2466 
2467    case nir_op_mov: {
2468       bi_index idx = bi_src_index(&instr->src[0].src);
2469       bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2470 
2471       unsigned channels[4] = {
2472          comps > 0 ? instr->src[0].swizzle[0] : 0,
2473          comps > 1 ? instr->src[0].swizzle[1] : 0,
2474          comps > 2 ? instr->src[0].swizzle[2] : 0,
2475          comps > 3 ? instr->src[0].swizzle[3] : 0,
2476       };
2477 
2478       bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, src_sz);
2479       return;
2480    }
2481 
2482    case nir_op_pack_32_2x16: {
2483       assert(comps == 1);
2484 
2485       bi_index idx = bi_src_index(&instr->src[0].src);
2486       bi_index unoffset_srcs[4] = {idx, idx, idx, idx};
2487 
2488       unsigned channels[2] = {instr->src[0].swizzle[0],
2489                               instr->src[0].swizzle[1]};
2490 
2491       bi_make_vec_to(b, dst, unoffset_srcs, channels, 2, 16);
2492       return;
2493    }
2494 
2495    case nir_op_f2f16:
2496    case nir_op_f2f16_rtz:
2497    case nir_op_f2f16_rtne: {
2498       assert(src_sz == 32);
2499       bi_index idx = bi_src_index(&instr->src[0].src);
2500       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2501       bi_index s1 =
2502          comps > 1 ? bi_extract(b, idx, instr->src[0].swizzle[1]) : s0;
2503 
2504       bi_instr *I = bi_v2f32_to_v2f16_to(b, dst, s0, s1);
2505 
2506       /* Override rounding if explicitly requested. Otherwise, the
2507        * default rounding mode is selected by the builder. Depending
2508        * on the float controls required by the shader, the default
2509        * mode may not be nearest-even.
2510        */
2511       if (instr->op == nir_op_f2f16_rtz)
2512          I->round = BI_ROUND_RTZ;
2513       else if (instr->op == nir_op_f2f16_rtne)
2514          I->round = BI_ROUND_NONE; /* Nearest even */
2515 
2516       return;
2517    }
2518 
2519    /* Vectorized downcasts */
2520    case nir_op_u2u16:
2521    case nir_op_i2i16: {
2522       if (!(src_sz == 32 && comps == 2))
2523          break;
2524 
2525       bi_index idx = bi_src_index(&instr->src[0].src);
2526       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2527       bi_index s1 = bi_extract(b, idx, instr->src[0].swizzle[1]);
2528 
2529       bi_mkvec_v2i16_to(b, dst, bi_half(s0, false), bi_half(s1, false));
2530       return;
2531    }
2532 
2533    /* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
2534     * MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
2535     * scalarizing due to scheduling (equal cost on Valhall). Additionally
2536     * if the source is replicated the MKVEC.v2i16 can be optimized out.
2537     */
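   /* E.g. a u2f16 of a 32-bit vec2 swizzled .xx only needs V2U16_TO_V2F16 on
    * the low half of that word, while distinct components are first packed
    * with MKVEC.v2i16. */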
2538    case nir_op_u2f16:
2539    case nir_op_i2f16: {
2540       if (!(src_sz == 32 && comps == 2))
2541          break;
2542 
2543       nir_alu_src *src = &instr->src[0];
2544       bi_index idx = bi_src_index(&src->src);
2545       bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
2546       bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
2547 
2548       bi_index t =
2549          (src->swizzle[0] == src->swizzle[1])
2550             ? bi_half(s0, false)
2551             : bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false));
2552 
2553       if (instr->op == nir_op_u2f16)
2554          bi_v2u16_to_v2f16_to(b, dst, t);
2555       else
2556          bi_v2s16_to_v2f16_to(b, dst, t);
2557 
2558       return;
2559    }
2560 
2561    case nir_op_i2i8:
2562    case nir_op_u2u8: {
2563       /* Acts like an 8-bit swizzle */
2564       bi_index idx = bi_src_index(&instr->src[0].src);
2565       unsigned factor = src_sz / 8;
2566       unsigned chan[4] = {0};
2567 
2568       for (unsigned i = 0; i < comps; ++i)
2569          chan[i] = instr->src[0].swizzle[i] * factor;
2570 
2571       bi_make_vec_to(b, dst, &idx, chan, comps, 8);
2572       return;
2573    }
2574 
2575    case nir_op_b32csel: {
2576       if (sz != 16)
2577          break;
2578 
2579       /* We allow vectorizing b32csel(cond, A, B) which can be
2580        * translated as MUX.v2i16, even though cond is a 32-bit vector.
2581        *
2582        * If the source condition vector is replicated, we can use
2583        * MUX.v2i16 directly, letting each component use the
2584        * corresponding half of the 32-bit source. NIR uses 0/~0
2585        * booleans so that's guaranteed to work (that is, 32-bit NIR
2586        * booleans are 16-bit replicated).
2587        *
2588        * If we're not replicated, we use the same trick but must
2589        * insert a MKVEC.v2i16 first to convert down to 16-bit.
2590        */
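      /* Sketch: when both condition components read the same 32-bit boolean
       * word, its two 16-bit halves are already identical 0/~0 masks and MUX
       * can consume the word directly; otherwise a v2i16 mask is built from
       * the low halves of the two condition words. */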
2591       bi_index idx = bi_src_index(&instr->src[0].src);
2592       bi_index s0 = bi_extract(b, idx, instr->src[0].swizzle[0]);
2593       bi_index s1 = bi_alu_src_index(b, instr->src[1], comps);
2594       bi_index s2 = bi_alu_src_index(b, instr->src[2], comps);
2595 
2596       if (!bi_nir_is_replicated(&instr->src[0])) {
2597          s0 = bi_mkvec_v2i16(
2598             b, bi_half(s0, false),
2599             bi_half(bi_extract(b, idx, instr->src[0].swizzle[1]), false));
2600       }
2601 
2602       bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2603       return;
2604    }
2605 
2606    default:
2607       break;
2608    }
2609 
2610    bi_index s0 =
2611       srcs > 0 ? bi_alu_src_index(b, instr->src[0], comps) : bi_null();
2612    bi_index s1 =
2613       srcs > 1 ? bi_alu_src_index(b, instr->src[1], comps) : bi_null();
2614    bi_index s2 =
2615       srcs > 2 ? bi_alu_src_index(b, instr->src[2], comps) : bi_null();
2616 
2617    switch (instr->op) {
2618    case nir_op_ffma:
2619       bi_fma_to(b, sz, dst, s0, s1, s2);
2620       break;
2621 
2622    case nir_op_fmul:
2623       bi_fma_to(b, sz, dst, s0, s1, bi_negzero());
2624       break;
2625 
2626    case nir_op_fadd:
2627       bi_fadd_to(b, sz, dst, s0, s1);
2628       break;
2629 
2630    case nir_op_fsat: {
2631       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2632       I->clamp = BI_CLAMP_CLAMP_0_1;
2633       break;
2634    }
2635 
2636    case nir_op_fsat_signed_mali: {
2637       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2638       I->clamp = BI_CLAMP_CLAMP_M1_1;
2639       break;
2640    }
2641 
2642    case nir_op_fclamp_pos_mali: {
2643       bi_instr *I = bi_fclamp_to(b, sz, dst, s0);
2644       I->clamp = BI_CLAMP_CLAMP_0_INF;
2645       break;
2646    }
2647 
2648    case nir_op_fneg:
2649       bi_fabsneg_to(b, sz, dst, bi_neg(s0));
2650       break;
2651 
2652    case nir_op_fabs:
2653       bi_fabsneg_to(b, sz, dst, bi_abs(s0));
2654       break;
2655 
2656    case nir_op_fsin:
2657       bi_lower_fsincos_32(b, dst, s0, false);
2658       break;
2659 
2660    case nir_op_fcos:
2661       bi_lower_fsincos_32(b, dst, s0, true);
2662       break;
2663 
2664    case nir_op_fexp2:
2665       assert(sz == 32); /* should've been lowered */
2666 
2667       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2668          bi_lower_fexp2_32(b, dst, s0);
2669       else
2670          bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f));
2671 
2672       break;
2673 
2674    case nir_op_flog2:
2675       assert(sz == 32); /* should've been lowered */
2676 
2677       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2678          bi_lower_flog2_32(b, dst, s0);
2679       else
2680          bi_flog2_32(b, dst, s0);
2681 
2682       break;
2683 
2684    case nir_op_fpow:
2685       assert(sz == 32); /* should've been lowered */
2686 
2687       if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
2688          bi_lower_fpow_32(b, dst, s0, s1);
2689       else
2690          bi_fpow_32(b, dst, s0, s1);
2691 
2692       break;
2693 
2694    case nir_op_frexp_exp:
2695       bi_frexpe_to(b, sz, dst, s0, false, false);
2696       break;
2697 
2698    case nir_op_frexp_sig:
2699       bi_frexpm_to(b, sz, dst, s0, false, false);
2700       break;
2701 
2702    case nir_op_ldexp:
2703       bi_ldexp_to(b, sz, dst, s0, s1);
2704       break;
2705 
2706    case nir_op_b8csel:
2707       bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2708       break;
2709 
2710    case nir_op_b16csel:
2711       bi_mux_v2i16_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2712       break;
2713 
2714    case nir_op_b32csel:
2715       bi_mux_i32_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO);
2716       break;
2717 
2718    case nir_op_extract_u8:
2719    case nir_op_extract_i8: {
2720       assert(comps == 1 && "should be scalarized");
2721       assert((src_sz == 16 || src_sz == 32) && "should be lowered");
2722       unsigned byte = nir_alu_src_as_uint(instr->src[1]);
2723 
2724       if (s0.swizzle == BI_SWIZZLE_H11) {
2725          assert(byte < 2);
2726          byte += 2;
2727       } else if (s0.swizzle != BI_SWIZZLE_H01) {
2728          assert(s0.swizzle == BI_SWIZZLE_H00);
2729       }
2730 
2731       assert(byte < 4);
2732 
2733       s0.swizzle = BI_SWIZZLE_H01;
2734 
2735       if (instr->op == nir_op_extract_i8)
2736          bi_s8_to_s32_to(b, dst, bi_byte(s0, byte));
2737       else
2738          bi_u8_to_u32_to(b, dst, bi_byte(s0, byte));
2739       break;
2740    }
2741 
2742    case nir_op_extract_u16:
2743    case nir_op_extract_i16: {
2744       assert(comps == 1 && "should be scalarized");
2745       assert(src_sz == 32 && "should be lowered");
2746       unsigned half = nir_alu_src_as_uint(instr->src[1]);
2747       assert(half == 0 || half == 1);
2748 
2749       if (instr->op == nir_op_extract_i16)
2750          bi_s16_to_s32_to(b, dst, bi_half(s0, half));
2751       else
2752          bi_u16_to_u32_to(b, dst, bi_half(s0, half));
2753       break;
2754    }
2755 
2756    case nir_op_insert_u16: {
2757       assert(comps == 1 && "should be scalarized");
2758       unsigned half = nir_alu_src_as_uint(instr->src[1]);
2759       assert(half == 0 || half == 1);
2760 
2761       if (half == 0)
2762          bi_u16_to_u32_to(b, dst, bi_half(s0, 0));
2763       else
2764          bi_mkvec_v2i16_to(b, dst, bi_imm_u16(0), bi_half(s0, 0));
2765       break;
2766    }
2767 
2768    case nir_op_ishl:
2769       bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0));
2770       break;
2771    case nir_op_ushr:
2772       bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), false);
2773       break;
2774 
2775    case nir_op_ishr:
2776       if (b->shader->arch >= 9)
2777          bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0), true);
2778       else
2779          bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0));
2780       break;
2781 
2782    case nir_op_imin:
2783    case nir_op_umin:
2784       bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2785                  s1, BI_CMPF_LT);
2786       break;
2787 
2788    case nir_op_imax:
2789    case nir_op_umax:
2790       bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, s0, s1, s0,
2791                  s1, BI_CMPF_GT);
2792       break;
2793 
2794    case nir_op_f2f32:
2795       bi_f16_to_f32_to(b, dst, s0);
2796       break;
2797 
2798    case nir_op_fquantize2f16: {
2799       bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
2800       bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
2801 
2802       f16->ftz = f32->ftz = true;
2803       break;
2804    }
2805 
2806    case nir_op_f2i32:
2807       if (src_sz == 32)
2808          bi_f32_to_s32_to(b, dst, s0);
2809       else
2810          bi_f16_to_s32_to(b, dst, s0);
2811       break;
2812 
2813    /* Note 32-bit sources => no vectorization, so 32-bit works */
2814    case nir_op_f2u16:
2815       if (src_sz == 32)
2816          bi_f32_to_u32_to(b, dst, s0);
2817       else
2818          bi_v2f16_to_v2u16_to(b, dst, s0);
2819       break;
2820 
2821    case nir_op_f2i16:
2822       if (src_sz == 32)
2823          bi_f32_to_s32_to(b, dst, s0);
2824       else
2825          bi_v2f16_to_v2s16_to(b, dst, s0);
2826       break;
2827 
2828    case nir_op_f2u32:
2829       if (src_sz == 32)
2830          bi_f32_to_u32_to(b, dst, s0);
2831       else
2832          bi_f16_to_u32_to(b, dst, s0);
2833       break;
2834 
2835    case nir_op_u2f16:
2836       if (src_sz == 32)
2837          bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
2838       else if (src_sz == 16)
2839          bi_v2u16_to_v2f16_to(b, dst, s0);
2840       else if (src_sz == 8)
2841          bi_v2u8_to_v2f16_to(b, dst, s0);
2842       break;
2843 
2844    case nir_op_u2f32:
2845       if (src_sz == 32)
2846          bi_u32_to_f32_to(b, dst, s0);
2847       else if (src_sz == 16)
2848          bi_u16_to_f32_to(b, dst, s0);
2849       else
2850          bi_u8_to_f32_to(b, dst, s0);
2851       break;
2852 
2853    case nir_op_i2f16:
2854       if (src_sz == 32)
2855          bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
2856       else if (src_sz == 16)
2857          bi_v2s16_to_v2f16_to(b, dst, s0);
2858       else if (src_sz == 8)
2859          bi_v2s8_to_v2f16_to(b, dst, s0);
2860       break;
2861 
2862    case nir_op_i2f32:
2863       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2864 
2865       if (src_sz == 32)
2866          bi_s32_to_f32_to(b, dst, s0);
2867       else if (src_sz == 16)
2868          bi_s16_to_f32_to(b, dst, s0);
2869       else if (src_sz == 8)
2870          bi_s8_to_f32_to(b, dst, s0);
2871       break;
2872 
2873    case nir_op_i2i32:
2874       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2875 
2876       if (src_sz == 32)
2877          bi_mov_i32_to(b, dst, s0);
2878       else if (src_sz == 16)
2879          bi_s16_to_s32_to(b, dst, s0);
2880       else if (src_sz == 8)
2881          bi_s8_to_s32_to(b, dst, s0);
2882       break;
2883 
2884    case nir_op_u2u32:
2885       assert(src_sz == 32 || src_sz == 16 || src_sz == 8);
2886 
2887       if (src_sz == 32)
2888          bi_mov_i32_to(b, dst, s0);
2889       else if (src_sz == 16)
2890          bi_u16_to_u32_to(b, dst, s0);
2891       else if (src_sz == 8)
2892          bi_u8_to_u32_to(b, dst, s0);
2893 
2894       break;
2895 
2896    case nir_op_i2i16:
2897       assert(src_sz == 8 || src_sz == 32);
2898 
2899       if (src_sz == 8)
2900          bi_v2s8_to_v2s16_to(b, dst, s0);
2901       else
2902          bi_mov_i32_to(b, dst, s0);
2903       break;
2904 
2905    case nir_op_u2u16:
2906       assert(src_sz == 8 || src_sz == 32);
2907 
2908       if (src_sz == 8)
2909          bi_v2u8_to_v2u16_to(b, dst, s0);
2910       else
2911          bi_mov_i32_to(b, dst, s0);
2912       break;
2913 
2914    case nir_op_b2i8:
2915    case nir_op_b2i16:
2916    case nir_op_b2i32:
2917       bi_mux_to(b, sz, dst, bi_imm_u8(0), bi_imm_uintN(1, sz), s0,
2918                 BI_MUX_INT_ZERO);
2919       break;
2920 
2921    case nir_op_ieq8:
2922    case nir_op_ine8:
2923    case nir_op_ilt8:
2924    case nir_op_ige8:
2925    case nir_op_ieq16:
2926    case nir_op_ine16:
2927    case nir_op_ilt16:
2928    case nir_op_ige16:
2929    case nir_op_ieq32:
2930    case nir_op_ine32:
2931    case nir_op_ilt32:
2932    case nir_op_ige32:
2933       bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
2934                  BI_RESULT_TYPE_M1);
2935       break;
2936 
2937    case nir_op_ult8:
2938    case nir_op_uge8:
2939    case nir_op_ult16:
2940    case nir_op_uge16:
2941    case nir_op_ult32:
2942    case nir_op_uge32:
2943       bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1,
2944                  bi_translate_cmpf(instr->op), BI_RESULT_TYPE_M1);
2945       break;
2946 
2947    case nir_op_feq32:
2948    case nir_op_feq16:
2949    case nir_op_flt32:
2950    case nir_op_flt16:
2951    case nir_op_fge32:
2952    case nir_op_fge16:
2953    case nir_op_fneu32:
2954    case nir_op_fneu16:
2955       bi_fcmp_to(b, sz, dst, s0, s1, bi_translate_cmpf(instr->op),
2956                  BI_RESULT_TYPE_M1);
2957       break;
2958 
2959    case nir_op_fround_even:
2960    case nir_op_fceil:
2961    case nir_op_ffloor:
2962    case nir_op_ftrunc:
2963       bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op));
2964       break;
2965 
2966    case nir_op_fmin:
2967       bi_fmin_to(b, sz, dst, s0, s1);
2968       break;
2969 
2970    case nir_op_fmax:
2971       bi_fmax_to(b, sz, dst, s0, s1);
2972       break;
2973 
2974    case nir_op_iadd:
2975       bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false);
2976       break;
2977 
2978    case nir_op_iadd_sat:
2979       bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true);
2980       break;
2981 
2982    case nir_op_uadd_sat:
2983       bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true);
2984       break;
2985 
2986    case nir_op_ihadd:
2987       bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN);
2988       break;
2989 
2990    case nir_op_irhadd:
2991       bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP);
2992       break;
2993 
2994    case nir_op_uhadd:
2995       bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTN);
2996       break;
2997 
2998    case nir_op_urhadd:
2999       bi_hadd_to(b, nir_type_uint, sz, dst, s0, s1, BI_ROUND_RTP);
3000       break;
3001 
3002    case nir_op_ineg:
3003       bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false);
3004       break;
3005 
3006    case nir_op_isub:
3007       bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false);
3008       break;
3009 
3010    case nir_op_isub_sat:
3011       bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true);
3012       break;
3013 
3014    case nir_op_usub_sat:
3015       bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true);
3016       break;
3017 
3018    case nir_op_imul:
3019       bi_imul_to(b, sz, dst, s0, s1);
3020       break;
3021 
3022    case nir_op_iabs:
3023       bi_iabs_to(b, sz, dst, s0);
3024       break;
3025 
3026    case nir_op_iand:
3027       bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3028       break;
3029 
3030    case nir_op_ior:
3031       bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3032       break;
3033 
3034    case nir_op_ixor:
3035       bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
3036       break;
3037 
3038    case nir_op_inot:
3039       bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));
3040       break;
3041 
3042    case nir_op_frsq:
3043       if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
3044          bi_lower_frsq_32(b, dst, s0);
3045       else
3046          bi_frsq_to(b, sz, dst, s0);
3047       break;
3048 
3049    case nir_op_frcp:
3050       if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS)
3051          bi_lower_frcp_32(b, dst, s0);
3052       else
3053          bi_frcp_to(b, sz, dst, s0);
3054       break;
3055 
3056    case nir_op_uclz:
3057       bi_clz_to(b, sz, dst, s0, false);
3058       break;
3059 
3060    case nir_op_bit_count:
3061       assert(sz == 32 && src_sz == 32 && "should've been lowered");
3062       bi_popcount_i32_to(b, dst, s0);
3063       break;
3064 
3065    case nir_op_bitfield_reverse:
3066       assert(sz == 32 && src_sz == 32 && "should've been lowered");
3067       bi_bitrev_i32_to(b, dst, s0);
3068       break;
3069 
3070    case nir_op_ufind_msb: {
3071       bi_index clz = bi_clz(b, src_sz, s0, false);
3072 
3073       if (sz == 8)
3074          clz = bi_byte(clz, 0);
3075       else if (sz == 16)
3076          clz = bi_half(clz, false);
3077 
3078       bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false);
3079       break;
3080    }
3081 
3082    default:
3083       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
3084       unreachable("Unknown ALU op");
3085    }
3086 }
3087 
3088 /* Returns the dimension, with 0 special-casing cubemaps. Shamelessly copied
3089  * from Midgard */
3090 static unsigned
3091 bifrost_tex_format(enum glsl_sampler_dim dim)
3092 {
3093    switch (dim) {
3094    case GLSL_SAMPLER_DIM_1D:
3095    case GLSL_SAMPLER_DIM_BUF:
3096       return 1;
3097 
3098    case GLSL_SAMPLER_DIM_2D:
3099    case GLSL_SAMPLER_DIM_MS:
3100    case GLSL_SAMPLER_DIM_EXTERNAL:
3101    case GLSL_SAMPLER_DIM_RECT:
3102    case GLSL_SAMPLER_DIM_SUBPASS:
3103    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3104       return 2;
3105 
3106    case GLSL_SAMPLER_DIM_3D:
3107       return 3;
3108 
3109    case GLSL_SAMPLER_DIM_CUBE:
3110       return 0;
3111 
3112    default:
3113       DBG("Unknown sampler dim type\n");
3114       assert(0);
3115       return 0;
3116    }
3117 }
3118 
3119 static enum bi_dimension
3120 valhall_tex_dimension(enum glsl_sampler_dim dim)
3121 {
3122    switch (dim) {
3123    case GLSL_SAMPLER_DIM_1D:
3124    case GLSL_SAMPLER_DIM_BUF:
3125       return BI_DIMENSION_1D;
3126 
3127    case GLSL_SAMPLER_DIM_2D:
3128    case GLSL_SAMPLER_DIM_MS:
3129    case GLSL_SAMPLER_DIM_EXTERNAL:
3130    case GLSL_SAMPLER_DIM_RECT:
3131    case GLSL_SAMPLER_DIM_SUBPASS:
3132    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3133       return BI_DIMENSION_2D;
3134 
3135    case GLSL_SAMPLER_DIM_3D:
3136       return BI_DIMENSION_3D;
3137 
3138    case GLSL_SAMPLER_DIM_CUBE:
3139       return BI_DIMENSION_CUBE;
3140 
3141    default:
3142       unreachable("Unknown sampler dim type");
3143    }
3144 }
3145 
3146 static enum bifrost_texture_format_full
3147 bi_texture_format(nir_alu_type T, enum bi_clamp clamp)
3148 {
3149    switch (T) {
3150    case nir_type_float16:
3151       return BIFROST_TEXTURE_FORMAT_F16 + clamp;
3152    case nir_type_float32:
3153       return BIFROST_TEXTURE_FORMAT_F32 + clamp;
3154    case nir_type_uint16:
3155       return BIFROST_TEXTURE_FORMAT_U16;
3156    case nir_type_int16:
3157       return BIFROST_TEXTURE_FORMAT_S16;
3158    case nir_type_uint32:
3159       return BIFROST_TEXTURE_FORMAT_U32;
3160    case nir_type_int32:
3161       return BIFROST_TEXTURE_FORMAT_S32;
3162    default:
3163       unreachable("Invalid type for texturing");
3164    }
3165 }
3166 
3167 /* Array indices are specified as 32-bit uints and need converting. They arrive
3168  * in the .z component from NIR */
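/* As a rough illustration of the round-to-nearest-even behaviour (numbers here
 * are only examples): a layer coordinate of 2.5 rounds to 2 while 3.5 rounds
 * to 4, and 2.2 / 2.8 round to 2 and 3 as usual. Clamping against the actual
 * layer count is left to the descriptor, as noted below. */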
3169 static bi_index
3170 bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T)
3171 {
3172    /* For (u)int we can just passthrough */
3173    nir_alu_type base = nir_alu_type_get_base_type(T);
3174    if (base == nir_type_int || base == nir_type_uint)
3175       return idx;
3176 
3177    /* Otherwise we convert */
3178    assert(T == nir_type_float32);
3179 
3180    /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and
3181     * Texel Selection") defines the layer to be taken from clamp(RNE(r),
3182     * 0, dt - 1). So we use round RTE, clamping is handled at the data
3183     * structure level */
3184 
3185    bi_instr *I = bi_f32_to_u32_to(b, bi_temp(b->shader), idx);
3186    I->round = BI_ROUND_NONE;
3187    return I->dest[0];
3188 }
3189 
3190 /* TEXC's explicit and bias LOD modes require the LOD to be transformed to a
3191  * 16-bit 8:8 fixed-point format. We lower as:
3192  *
3193  * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF =
3194  * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0)
3195  */
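/* As a rough worked example (illustrative values only): an explicit LOD of
 * 2.5 scales to 2.5 * 256 = 640 = 0x0280 in 8:8 fixed point, matching the
 * constant-folded path below, while an out-of-range LOD of 40.0 clamps to
 * 16.0 first and packs as 16.0 * 256 = 4096 = 0x1000. */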
3196 
3197 static bi_index
3198 bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16)
3199 {
3200    /* Precompute for constant LODs to avoid general constant folding */
3201    if (lod.type == BI_INDEX_CONSTANT) {
3202       uint32_t raw = lod.value;
3203       float x = fp16 ? _mesa_half_to_float(raw) : uif(raw);
3204       int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f;
3205       return bi_imm_u32(s32 & 0xFFFF);
3206    }
3207 
3208    /* Sort of arbitrary. Must be less than 128.0, greater than or equal to
3209     * the max LOD (16 since we cap at 2^16 texture dimensions), and
3210     * preferably small to minimize precision loss */
3211    const float max_lod = 16.0;
3212 
3213    bi_instr *fsat =
3214       bi_fma_f32_to(b, bi_temp(b->shader), fp16 ? bi_half(lod, false) : lod,
3215                     bi_imm_f32(1.0f / max_lod), bi_negzero());
3216 
3217    fsat->clamp = BI_CLAMP_CLAMP_M1_1;
3218 
3219    bi_index fmul =
3220       bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), bi_negzero());
3221 
3222    return bi_mkvec_v2i16(b, bi_half(bi_f32_to_s32(b, fmul), false),
3223                          bi_imm_u16(0));
3224 }
3225 
3226 /* FETCH takes a 32-bit staging register containing the LOD as an integer in
3227  * the bottom 16-bits and (if present) the cube face index in the top 16-bits.
3228  * TODO: Cube face.
3229  */
3230 
3231 static bi_index
3232 bi_emit_texc_lod_cube(bi_builder *b, bi_index lod)
3233 {
3234    return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8));
3235 }
3236 
3237 /* The hardware specifies texel offsets and multisample indices together as a
3238  * u8vec4 <offset, ms index>. By default all are zero, so if we have either a
3239  * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with
3240  * the bits we need and return that to be passed as a staging register. Else we
3241  * return 0 to avoid allocating a data register when everything is zero. */
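/* For instance (illustrative values): a texel offset of (1, -2) with
 * multisample index 3 packs as bytes <0x01, 0xFE, 0x00, 0x00> from MKVEC.v4i8
 * and then ORs in 3 << 24, giving a staging value of 0x0300FE01. */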
3242 
3243 static bi_index
3244 bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr)
3245 {
3246    bi_index dest = bi_zero();
3247 
3248    int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3249    if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3250                          nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3251       unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3252       bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3253       dest = bi_mkvec_v4i8(
3254          b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3255          (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0),
3256          (nr > 2) ? bi_byte(bi_extract(b, idx, 2), 0) : bi_imm_u8(0),
3257          bi_imm_u8(0));
3258    }
3259 
3260    int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3261    if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3262                        nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3263       dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[ms_idx].src), dest,
3264                               bi_imm_u8(24));
3265    }
3266 
3267    return dest;
3268 }
3269 
3270 /*
3271  * Valhall specifies texel offsets, multisample indices, and (for
3272  * fetches) LOD together as a u8vec4 <offset.xyz, LOD>, where the third
3273  * component is either offset.z or multisample index depending on context. Build
3274  * this register.
3275  */
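/* For instance (illustrative values): a texel offset of (2, -1) with an
 * integer LOD of 3 and no multisampling ends up as the byte vector
 * <0x02, 0xFF, 0x00, 0x03>, i.e. 0x0300FF02 as a 32-bit staging value. */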
3276 static bi_index
3277 bi_emit_valhall_offsets(bi_builder *b, nir_tex_instr *instr)
3278 {
3279    bi_index dest = bi_zero();
3280 
3281    int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset);
3282    int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index);
3283    int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3284 
3285    /* Components 0-2: offsets */
3286    if (offs_idx >= 0 && (!nir_src_is_const(instr->src[offs_idx].src) ||
3287                          nir_src_as_uint(instr->src[offs_idx].src) != 0)) {
3288       unsigned nr = nir_src_num_components(instr->src[offs_idx].src);
3289       bi_index idx = bi_src_index(&instr->src[offs_idx].src);
3290 
3291       /* No multisample index with 3D */
3292       assert((nr <= 2) || (ms_idx < 0));
3293 
3294       /* Zero extend the Z byte so we can use it with MKVEC.v2i8 */
3295       bi_index z = (nr > 2)
3296                       ? bi_mkvec_v2i8(b, bi_byte(bi_extract(b, idx, 2), 0),
3297                                       bi_imm_u8(0), bi_zero())
3298                       : bi_zero();
3299 
3300       dest = bi_mkvec_v2i8(
3301          b, (nr > 0) ? bi_byte(bi_extract(b, idx, 0), 0) : bi_imm_u8(0),
3302          (nr > 1) ? bi_byte(bi_extract(b, idx, 1), 0) : bi_imm_u8(0), z);
3303    }
3304 
3305    /* Component 2: multisample index */
3306    if (ms_idx >= 0 && (!nir_src_is_const(instr->src[ms_idx].src) ||
3307                        nir_src_as_uint(instr->src[ms_idx].src) != 0)) {
3308       dest = bi_mkvec_v2i16(b, dest, bi_src_index(&instr->src[ms_idx].src));
3309    }
3310 
3311    /* Component 3: 8-bit LOD */
3312    if (lod_idx >= 0 &&
3313        (!nir_src_is_const(instr->src[lod_idx].src) ||
3314         nir_src_as_uint(instr->src[lod_idx].src) != 0) &&
3315        nir_tex_instr_src_type(instr, lod_idx) != nir_type_float) {
3316       dest = bi_lshift_or_i32(b, bi_src_index(&instr->src[lod_idx].src), dest,
3317                               bi_imm_u8(24));
3318    }
3319 
3320    return dest;
3321 }
3322 
3323 static void
3324 bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s,
3325                    bi_index *t)
3326 {
3327    /* Compute max { |x|, |y|, |z| } */
3328    bi_index maxxyz = bi_temp(b->shader);
3329    *face = bi_temp(b->shader);
3330 
3331    bi_index cx = bi_extract(b, coord, 0), cy = bi_extract(b, coord, 1),
3332             cz = bi_extract(b, coord, 2);
3333 
3334    /* Use a pseudo op on Bifrost due to tuple restrictions */
3335    if (b->shader->arch <= 8) {
3336       bi_cubeface_to(b, maxxyz, *face, cx, cy, cz);
3337    } else {
3338       bi_cubeface1_to(b, maxxyz, cx, cy, cz);
3339       bi_cubeface2_v9_to(b, *face, cx, cy, cz);
3340    }
3341 
3342    /* Select coordinates */
3343    bi_index ssel =
3344       bi_cube_ssel(b, bi_extract(b, coord, 2), bi_extract(b, coord, 0), *face);
3345    bi_index tsel =
3346       bi_cube_tsel(b, bi_extract(b, coord, 1), bi_extract(b, coord, 2), *face);
3347 
3348    /* The OpenGL ES specification requires us to transform an input vector
3349     * (x, y, z) to the coordinate, given the selected S/T:
3350     *
3351     * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1))
3352     *
3353     * We implement (s shown, t similar) in a form friendlier to FMA
3354     * instructions, and clamp coordinates at the end for correct
3355     * NaN/infinity handling:
3356     *
3357     * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5)
3358     *
3359     * Take the reciprocal of max{x, y, z}
3360     */
3361    bi_index rcp = bi_frcp_f32(b, maxxyz);
3362 
3363    /* Calculate 0.5 * (1.0 / max{x, y, z}) */
3364    bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero());
3365 
3366    /* Transform the coordinates */
3367    *s = bi_temp(b->shader);
3368    *t = bi_temp(b->shader);
3369 
3370    bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f));
3371    bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f));
3372 
3373    S->clamp = BI_CLAMP_CLAMP_0_1;
3374    T->clamp = BI_CLAMP_CLAMP_0_1;
3375 }
3376 
3377 /* Emits a cube map descriptor, returning lower 32-bits and putting upper
3378  * 32-bits in passed pointer t. The packing of the face with the S coordinate
3379  * exploits the redundancy of floating points with the range restriction of
3380  * CUBEFACE output.
3381  *
3382  *     struct cube_map_descriptor {
3383  *         float s : 29;
3384  *         unsigned face : 3;
3385  *         float t : 32;
3386  *     }
3387  *
3388  * Since the cube face index is preshifted, this is easy to pack with a bitwise
3389  * MUX.i32 and a fixed mask, selecting the lower 29 bits from s and the upper 3
3390  * bits from face.
3391  */
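/* Sketched out (assuming the face index is already preshifted into bits 31:29
 * by CUBEFACE, as described above):
 *
 *     lower_32 = (fui(s) & 0x1FFFFFFF) | (face & 0xE0000000)
 *     upper_32 = fui(t)
 *
 * which is what the MUX.i32 with a BITFIELD_MASK(29) mask below is meant to
 * compute. */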
3392 
3393 static bi_index
3394 bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t)
3395 {
3396    bi_index face, s;
3397    bi_emit_cube_coord(b, coord, &face, &s, t);
3398    bi_index mask = bi_imm_u32(BITFIELD_MASK(29));
3399    return bi_mux_i32(b, s, face, mask, BI_MUX_BIT);
3400 }
3401 
3402 /* Map to the main texture op used. Some of these (txd in particular) will
3403  * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in
3404  * sequence). We assume that lowering is handled elsewhere.
3405  */
3406 
3407 static enum bifrost_tex_op
3408 bi_tex_op(nir_texop op)
3409 {
3410    switch (op) {
3411    case nir_texop_tex:
3412    case nir_texop_txb:
3413    case nir_texop_txl:
3414    case nir_texop_txd:
3415       return BIFROST_TEX_OP_TEX;
3416    case nir_texop_txf:
3417    case nir_texop_txf_ms:
3418    case nir_texop_tg4:
3419       return BIFROST_TEX_OP_FETCH;
3420    case nir_texop_txs:
3421    case nir_texop_lod:
3422    case nir_texop_query_levels:
3423    case nir_texop_texture_samples:
3424    case nir_texop_samples_identical:
3425       unreachable("should've been lowered");
3426    default:
3427       unreachable("unsupported tex op");
3428    }
3429 }
3430 
3431 /* Data registers required by texturing in the order they appear. All are
3432  * optional, the texture operation descriptor determines which are present.
3433  * Note that since 3D arrays are not permitted at the API level, Z_COORD and
3434  * ARRAY/SHADOW are exclusive, so TEXC in practice reads at most 8 registers */
3435 
3436 enum bifrost_tex_dreg {
3437    BIFROST_TEX_DREG_Z_COORD = 0,
3438    BIFROST_TEX_DREG_Y_DELTAS = 1,
3439    BIFROST_TEX_DREG_LOD = 2,
3440    BIFROST_TEX_DREG_GRDESC_HI = 3,
3441    BIFROST_TEX_DREG_SHADOW = 4,
3442    BIFROST_TEX_DREG_ARRAY = 5,
3443    BIFROST_TEX_DREG_OFFSETMS = 6,
3444    BIFROST_TEX_DREG_SAMPLER = 7,
3445    BIFROST_TEX_DREG_TEXTURE = 8,
3446    BIFROST_TEX_DREG_COUNT,
3447 };
3448 
3449 static void
3450 bi_emit_texc(bi_builder *b, nir_tex_instr *instr)
3451 {
3452    struct bifrost_texture_operation desc = {
3453       .op = bi_tex_op(instr->op),
3454       .offset_or_bias_disable = false, /* TODO */
3455       .shadow_or_clamp_disable = instr->is_shadow,
3456       .array = instr->is_array,
3457       .dimension = bifrost_tex_format(instr->sampler_dim),
3458       .format = bi_texture_format(instr->dest_type | instr->def.bit_size,
3459                                   BI_CLAMP_NONE), /* TODO */
3460       .mask = 0xF,
3461    };
3462 
3463    switch (desc.op) {
3464    case BIFROST_TEX_OP_TEX:
3465       desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE;
3466       break;
3467    case BIFROST_TEX_OP_FETCH:
3468       desc.lod_or_fetch = (enum bifrost_lod_mode)(
3469          instr->op == nir_texop_tg4
3470             ? BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component
3471             : BIFROST_TEXTURE_FETCH_TEXEL);
3472       break;
3473    default:
3474       unreachable("texture op unsupported");
3475    }
3476 
3477    /* 32-bit indices to be allocated as consecutive staging registers */
3478    bi_index dregs[BIFROST_TEX_DREG_COUNT] = {};
3479    bi_index cx = bi_null(), cy = bi_null();
3480 
3481    for (unsigned i = 0; i < instr->num_srcs; ++i) {
3482       bi_index index = bi_src_index(&instr->src[i].src);
3483       unsigned sz = nir_src_bit_size(instr->src[i].src);
3484       unsigned components = nir_src_num_components(instr->src[i].src);
3485       ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i);
3486       nir_alu_type T = base | sz;
3487 
3488       switch (instr->src[i].src_type) {
3489       case nir_tex_src_coord:
3490          if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3491             cx = bi_emit_texc_cube_coord(b, index, &cy);
3492          } else {
3493             /* Copy XY (for 2D+) or XX (for 1D) */
3494             cx = bi_extract(b, index, 0);
3495             cy = bi_extract(b, index, MIN2(1, components - 1));
3496 
3497             assert(components >= 1 && components <= 3);
3498 
3499             if (components == 3 && !desc.array) {
3500                /* 3D */
3501                dregs[BIFROST_TEX_DREG_Z_COORD] = bi_extract(b, index, 2);
3502             }
3503          }
3504 
3505          if (desc.array) {
3506             dregs[BIFROST_TEX_DREG_ARRAY] = bi_emit_texc_array_index(
3507                b, bi_extract(b, index, components - 1), T);
3508          }
3509 
3510          break;
3511 
3512       case nir_tex_src_lod:
3513          if (desc.op == BIFROST_TEX_OP_TEX &&
3514              nir_src_is_const(instr->src[i].src) &&
3515              nir_src_as_uint(instr->src[i].src) == 0) {
3516             desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO;
3517          } else if (desc.op == BIFROST_TEX_OP_TEX) {
3518             assert(base == nir_type_float);
3519 
3520             assert(sz == 16 || sz == 32);
3521             dregs[BIFROST_TEX_DREG_LOD] =
3522                bi_emit_texc_lod_88(b, index, sz == 16);
3523             desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT;
3524          } else {
3525             assert(desc.op == BIFROST_TEX_OP_FETCH);
3526             assert(base == nir_type_uint || base == nir_type_int);
3527             assert(sz == 16 || sz == 32);
3528 
3529             dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, index);
3530          }
3531 
3532          break;
3533 
3534       case nir_tex_src_bias:
3535          /* Upper 16-bits interpreted as a clamp, leave zero */
3536          assert(desc.op == BIFROST_TEX_OP_TEX);
3537          assert(base == nir_type_float);
3538          assert(sz == 16 || sz == 32);
3539          dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3540          desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS;
3541          break;
3542 
3543       case nir_tex_src_ms_index:
3544       case nir_tex_src_offset:
3545          if (desc.offset_or_bias_disable)
3546             break;
3547 
3548          dregs[BIFROST_TEX_DREG_OFFSETMS] =
3549             bi_emit_texc_offset_ms_index(b, instr);
3550          if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero()))
3551             desc.offset_or_bias_disable = true;
3552          break;
3553 
3554       case nir_tex_src_comparator:
3555          dregs[BIFROST_TEX_DREG_SHADOW] = index;
3556          break;
3557 
3558       case nir_tex_src_texture_offset:
3559          dregs[BIFROST_TEX_DREG_TEXTURE] = index;
3560          break;
3561 
3562       case nir_tex_src_sampler_offset:
3563          dregs[BIFROST_TEX_DREG_SAMPLER] = index;
3564          break;
3565 
3566       default:
3567          unreachable("Unhandled src type in texc emit");
3568       }
3569    }
3570 
3571    if (desc.op == BIFROST_TEX_OP_FETCH &&
3572        bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) {
3573       dregs[BIFROST_TEX_DREG_LOD] = bi_emit_texc_lod_cube(b, bi_zero());
3574    }
3575 
3576    /* Choose an index mode */
3577 
3578    bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]);
3579    bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]);
3580    bool direct = direct_tex && direct_samp;
3581 
3582    desc.immediate_indices =
3583       direct && (instr->sampler_index < 16 && instr->texture_index < 128);
3584 
3585    if (desc.immediate_indices) {
3586       desc.sampler_index_or_mode = instr->sampler_index;
3587       desc.index = instr->texture_index;
3588    } else {
3589       unsigned mode = 0;
3590 
3591       if (direct && instr->sampler_index == instr->texture_index &&
3592           instr->sampler_index < 128) {
3593          mode = BIFROST_INDEX_IMMEDIATE_SHARED;
3594          desc.index = instr->texture_index;
3595       } else if (direct && instr->sampler_index < 128) {
3596          mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3597          desc.index = instr->sampler_index;
3598          dregs[BIFROST_TEX_DREG_TEXTURE] =
3599             bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3600       } else if (direct_tex && instr->texture_index < 128) {
3601          mode = BIFROST_INDEX_IMMEDIATE_TEXTURE;
3602          desc.index = instr->texture_index;
3603 
3604          if (direct_samp) {
3605             dregs[BIFROST_TEX_DREG_SAMPLER] =
3606                bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
3607          }
3608       } else if (direct_samp && instr->sampler_index < 128) {
3609          mode = BIFROST_INDEX_IMMEDIATE_SAMPLER;
3610          desc.index = instr->sampler_index;
3611 
3612          if (direct_tex) {
3613             dregs[BIFROST_TEX_DREG_TEXTURE] =
3614                bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3615          }
3616       } else {
3617          mode = BIFROST_INDEX_REGISTER;
3618 
3619          if (direct_tex) {
3620             dregs[BIFROST_TEX_DREG_TEXTURE] =
3621                bi_mov_i32(b, bi_imm_u32(instr->texture_index));
3622          }
3623 
3624          if (direct_samp) {
3625             dregs[BIFROST_TEX_DREG_SAMPLER] =
3626                bi_mov_i32(b, bi_imm_u32(instr->sampler_index));
3627          }
3628       }
3629 
3630       mode |= (BIFROST_TEXTURE_OPERATION_SINGLE << 2);
3631       desc.sampler_index_or_mode = mode;
3632    }
3633 
3634    /* Allocate staging registers contiguously by compacting the array. */
3635    unsigned sr_count = 0;
3636 
3637    for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) {
3638       if (!bi_is_null(dregs[i]))
3639          dregs[sr_count++] = dregs[i];
3640    }
3641 
3642    unsigned res_size = instr->def.bit_size == 16 ? 2 : 4;
3643 
3644    bi_index sr = sr_count ? bi_temp(b->shader) : bi_null();
3645    bi_index dst = bi_temp(b->shader);
3646 
3647    if (sr_count)
3648       bi_emit_collect_to(b, sr, dregs, sr_count);
3649 
3650    uint32_t desc_u = 0;
3651    memcpy(&desc_u, &desc, sizeof(desc_u));
3652    bi_instr *I =
3653       bi_texc_to(b, dst, sr, cx, cy, bi_imm_u32(desc_u),
3654                  !nir_tex_instr_has_implicit_derivative(instr), sr_count, 0);
3655    I->register_format = bi_reg_fmt_for_nir(instr->dest_type);
3656 
3657    bi_index w[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3658    bi_emit_split_i32(b, w, dst, res_size);
3659    bi_emit_collect_to(b, bi_def_index(&instr->def), w,
3660                       DIV_ROUND_UP(instr->def.num_components * res_size, 4));
3661 }
3662 
3663 /* Staging registers required by texturing in the order they appear (Valhall) */
3664 
3665 enum valhall_tex_sreg {
3666    VALHALL_TEX_SREG_X_COORD = 0,
3667    VALHALL_TEX_SREG_Y_COORD = 1,
3668    VALHALL_TEX_SREG_Z_COORD = 2,
3669    VALHALL_TEX_SREG_Y_DELTAS = 3,
3670    VALHALL_TEX_SREG_ARRAY = 4,
3671    VALHALL_TEX_SREG_SHADOW = 5,
3672    VALHALL_TEX_SREG_OFFSETMS = 6,
3673    VALHALL_TEX_SREG_LOD = 7,
3674    VALHALL_TEX_SREG_GRDESC = 8,
3675    VALHALL_TEX_SREG_COUNT,
3676 };
3677 
3678 static void
3679 bi_emit_tex_valhall(bi_builder *b, nir_tex_instr *instr)
3680 {
3681    bool explicit_offset = false;
3682    enum bi_va_lod_mode lod_mode = BI_VA_LOD_MODE_COMPUTED_LOD;
3683 
3684    bool has_lod_mode = (instr->op == nir_texop_tex) ||
3685                        (instr->op == nir_texop_txl) ||
3686                        (instr->op == nir_texop_txb);
3687 
3688    /* 32-bit indices to be allocated as consecutive staging registers */
3689    bi_index sregs[VALHALL_TEX_SREG_COUNT] = {};
3690    bi_index sampler = bi_imm_u32(instr->sampler_index);
3691    bi_index texture = bi_imm_u32(instr->texture_index);
3692 
3693    for (unsigned i = 0; i < instr->num_srcs; ++i) {
3694       bi_index index = bi_src_index(&instr->src[i].src);
3695       unsigned sz = nir_src_bit_size(instr->src[i].src);
3696 
3697       switch (instr->src[i].src_type) {
3698       case nir_tex_src_coord: {
3699          unsigned components =
3700             nir_src_num_components(instr->src[i].src) - instr->is_array;
3701 
3702          if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3703             sregs[VALHALL_TEX_SREG_X_COORD] = bi_emit_texc_cube_coord(
3704                b, index, &sregs[VALHALL_TEX_SREG_Y_COORD]);
3705          } else {
3706             assert(components >= 1 && components <= 3);
3707 
3708             /* Copy XY (for 2D+) or XX (for 1D) */
3709             sregs[VALHALL_TEX_SREG_X_COORD] = index;
3710 
3711             if (components >= 2)
3712                sregs[VALHALL_TEX_SREG_Y_COORD] = bi_extract(b, index, 1);
3713 
3714             if (components == 3)
3715                sregs[VALHALL_TEX_SREG_Z_COORD] = bi_extract(b, index, 2);
3716          }
3717 
3718          if (instr->is_array) {
3719             sregs[VALHALL_TEX_SREG_ARRAY] = bi_extract(b, index, components);
3720          }
3721 
3722          break;
3723       }
3724 
3725       case nir_tex_src_lod:
3726          if (nir_src_is_const(instr->src[i].src) &&
3727              nir_src_as_uint(instr->src[i].src) == 0) {
3728             lod_mode = BI_VA_LOD_MODE_ZERO_LOD;
3729          } else if (has_lod_mode) {
3730             lod_mode = BI_VA_LOD_MODE_EXPLICIT;
3731 
3732             assert(sz == 16 || sz == 32);
3733             sregs[VALHALL_TEX_SREG_LOD] =
3734                bi_emit_texc_lod_88(b, index, sz == 16);
3735          }
3736          break;
3737 
3738       case nir_tex_src_bias:
3739          /* Upper 16-bits interpreted as a clamp, leave zero */
3740          assert(sz == 16 || sz == 32);
3741          sregs[VALHALL_TEX_SREG_LOD] = bi_emit_texc_lod_88(b, index, sz == 16);
3742 
3743          lod_mode = BI_VA_LOD_MODE_COMPUTED_BIAS;
3744          break;
3745       case nir_tex_src_ms_index:
3746       case nir_tex_src_offset:
3747          /* Handled below */
3748          break;
3749 
3750       case nir_tex_src_comparator:
3751          sregs[VALHALL_TEX_SREG_SHADOW] = index;
3752          break;
3753 
3754       case nir_tex_src_texture_offset:
3755          /* This should always be 0 as lower_index_to_offset is expected to be
3756           * set */
3757          assert(instr->texture_index == 0);
3758          texture = index;
3759          break;
3760 
3761       case nir_tex_src_sampler_offset:
3762          /* This should always be 0 as lower_index_to_offset is expected to be
3763           * set */
3764          assert(instr->sampler_index == 0);
3765          sampler = index;
3766          break;
3767 
3768       default:
3769          unreachable("Unhandled src type in tex emit");
3770       }
3771    }
3772 
3773    /* Generate packed offset + ms index + LOD register. These default to
3774     * zero so we only need to encode if these features are actually in use.
3775     */
3776    bi_index offsets = bi_emit_valhall_offsets(b, instr);
3777 
3778    if (!bi_is_equiv(offsets, bi_zero())) {
3779       sregs[VALHALL_TEX_SREG_OFFSETMS] = offsets;
3780       explicit_offset = true;
3781    }
3782 
3783    /* Allocate staging registers contiguously by compacting the array. */
3784    unsigned sr_count = 0;
3785 
3786    for (unsigned i = 0; i < ARRAY_SIZE(sregs); ++i) {
3787       if (!bi_is_null(sregs[i]))
3788          sregs[sr_count++] = sregs[i];
3789    }
3790 
3791    bi_index idx = sr_count ? bi_temp(b->shader) : bi_null();
3792 
3793    if (sr_count)
3794       bi_make_vec_to(b, idx, sregs, NULL, sr_count, 32);
3795 
3796    bool narrow_indices = va_is_valid_const_narrow_index(texture) &&
3797                          va_is_valid_const_narrow_index(sampler);
3798 
3799    bi_index src0;
3800    bi_index src1;
3801 
3802    if (narrow_indices) {
3803       unsigned tex_set =
3804          va_res_fold_table_idx(pan_res_handle_get_table(texture.value));
3805       unsigned sampler_set =
3806          va_res_fold_table_idx(pan_res_handle_get_table(sampler.value));
3807       unsigned texture_index = pan_res_handle_get_index(texture.value);
3808       unsigned sampler_index = pan_res_handle_get_index(sampler.value);
3809 
3810       unsigned packed_handle = (tex_set << 27) | (texture_index << 16) |
3811                                (sampler_set << 11) | sampler_index;
3812 
3813       src0 = bi_imm_u32(packed_handle);
3814 
3815       /* TODO: narrow offsetms */
3816       src1 = bi_zero();
3817    } else {
3818       src0 = sampler;
3819       src1 = texture;
3820    }
3821 
3822    /* Only write the components that we actually read */
3823    unsigned mask = nir_def_components_read(&instr->def);
3824    unsigned comps_per_reg = instr->def.bit_size == 16 ? 2 : 1;
3825    unsigned res_size = DIV_ROUND_UP(util_bitcount(mask), comps_per_reg);
3826 
3827    enum bi_register_format regfmt = bi_reg_fmt_for_nir(instr->dest_type);
3828    enum bi_dimension dim = valhall_tex_dimension(instr->sampler_dim);
3829    bi_index dest = bi_temp(b->shader);
3830 
3831    switch (instr->op) {
3832    case nir_texop_tex:
3833    case nir_texop_txl:
3834    case nir_texop_txb:
3835       bi_tex_single_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
3836                        instr->is_shadow, explicit_offset, lod_mode,
3837                        !narrow_indices, mask, sr_count);
3838       break;
3839    case nir_texop_txf:
3840    case nir_texop_txf_ms:
3841       bi_tex_fetch_to(b, dest, idx, src0, src1, instr->is_array, dim, regfmt,
3842                       explicit_offset, !narrow_indices, mask, sr_count);
3843       break;
3844    case nir_texop_tg4:
3845       bi_tex_gather_to(b, dest, idx, src0, src1, instr->is_array, dim,
3846                        instr->component, false, regfmt, instr->is_shadow,
3847                        explicit_offset, !narrow_indices, mask, sr_count);
3848       break;
3849    default:
3850       unreachable("Unhandled Valhall texture op");
3851    }
3852 
3853    /* The hardware will write only what we read, and it will write into
3854     * contiguous registers without gaps (different from Bifrost). NIR
3855     * expects the gaps, so fill in the holes (they'll be copypropped and
3856     * DCE'd away later).
3857     */
3858    bi_index unpacked[4] = {bi_null(), bi_null(), bi_null(), bi_null()};
3859 
3860    bi_emit_cached_split_i32(b, dest, res_size);
3861 
3862    /* Index into the packed component array */
3863    unsigned j = 0;
3864    unsigned comps[4] = {0};
3865    unsigned nr_components = instr->def.num_components;
3866 
3867    for (unsigned i = 0; i < nr_components; ++i) {
3868       if (mask & BITFIELD_BIT(i)) {
3869          unpacked[i] = dest;
3870          comps[i] = j++;
3871       } else {
3872          unpacked[i] = bi_zero();
3873       }
3874    }
3875 
3876    bi_make_vec_to(b, bi_def_index(&instr->def), unpacked, comps,
3877                   instr->def.num_components, instr->def.bit_size);
3878 }
3879 
3880 /* Simple texture ops correspond to NIR tex or txl with LOD = 0 on 2D/cube
3881  * textures with sufficiently small immediate indices. Anything else
3882  * needs a complete texture op. */
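/* For example, a plain texture(sampler2D, uv) lookup with texture/sampler
 * index 0 can use TEXS, whereas shadow, array or integer-result sampling, or a
 * nonzero explicit LOD, falls back to the full TEXC path (see
 * bi_is_simple_tex below). */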
3883 
3884 static void
3885 bi_emit_texs(bi_builder *b, nir_tex_instr *instr)
3886 {
3887    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
3888    assert(coord_idx >= 0);
3889    bi_index coords = bi_src_index(&instr->src[coord_idx].src);
3890 
3891    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
3892       bi_index face, s, t;
3893       bi_emit_cube_coord(b, coords, &face, &s, &t);
3894 
3895       bi_texs_cube_to(b, instr->def.bit_size, bi_def_index(&instr->def), s, t,
3896                       face, instr->sampler_index, instr->texture_index);
3897    } else {
3898       bi_texs_2d_to(b, instr->def.bit_size, bi_def_index(&instr->def),
3899                     bi_extract(b, coords, 0), bi_extract(b, coords, 1),
3900                     instr->op != nir_texop_tex, /* zero LOD */
3901                     instr->sampler_index, instr->texture_index);
3902    }
3903 
3904    bi_split_def(b, &instr->def);
3905 }
3906 
3907 static bool
3908 bi_is_simple_tex(nir_tex_instr *instr)
3909 {
3910    if (instr->op != nir_texop_tex && instr->op != nir_texop_txl)
3911       return false;
3912 
3913    if (instr->dest_type != nir_type_float32 &&
3914        instr->dest_type != nir_type_float16)
3915       return false;
3916 
3917    if (instr->is_shadow || instr->is_array)
3918       return false;
3919 
3920    switch (instr->sampler_dim) {
3921    case GLSL_SAMPLER_DIM_2D:
3922    case GLSL_SAMPLER_DIM_EXTERNAL:
3923    case GLSL_SAMPLER_DIM_RECT:
3924       break;
3925 
3926    case GLSL_SAMPLER_DIM_CUBE:
3927       /* LOD can't be specified with TEXS_CUBE */
3928       if (instr->op == nir_texop_txl)
3929          return false;
3930       break;
3931 
3932    default:
3933       return false;
3934    }
3935 
3936    for (unsigned i = 0; i < instr->num_srcs; ++i) {
3937       if (instr->src[i].src_type != nir_tex_src_lod &&
3938           instr->src[i].src_type != nir_tex_src_coord)
3939          return false;
3940    }
3941 
3942    /* Indices need to fit in provided bits */
3943    unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3;
3944    if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits))
3945       return false;
3946 
3947    int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod);
3948    if (lod_idx < 0)
3949       return true;
3950 
3951    nir_src lod = instr->src[lod_idx].src;
3952    return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0;
3953 }
3954 
3955 static void
3956 bi_emit_tex(bi_builder *b, nir_tex_instr *instr)
3957 {
3958    /* If txf is used, we assume there is a valid sampler bound at index 0. Use
3959     * it for txf operations, since there may be no other valid samplers. This is
3960     * a workaround: txf does not require a sampler in NIR (so sampler_index is
3961     * undefined) but we need one in the hardware. This is ABI with the driver.
3962     *
3963     * On Valhall, as the descriptor table is encoded in the index, this should
3964     * be handled by the driver.
3965     */
3966    if (!nir_tex_instr_need_sampler(instr) && b->shader->arch < 9)
3967       instr->sampler_index = 0;
3968 
3969    if (b->shader->arch >= 9)
3970       bi_emit_tex_valhall(b, instr);
3971    else if (bi_is_simple_tex(instr))
3972       bi_emit_texs(b, instr);
3973    else
3974       bi_emit_texc(b, instr);
3975 }
3976 
3977 static void
3978 bi_emit_phi(bi_builder *b, nir_phi_instr *instr)
3979 {
3980    unsigned nr_srcs = exec_list_length(&instr->srcs);
3981    bi_instr *I = bi_phi_to(b, bi_def_index(&instr->def), nr_srcs);
3982 
3983    /* Deferred */
3984    I->phi = instr;
3985 }
3986 
3987 /* Look up the bi_block corresponding to a given NIR block. Used when
3988  * translating phi nodes after emitting all blocks.
3989  */
3990 static bi_block *
3991 bi_from_nir_block(bi_context *ctx, nir_block *block)
3992 {
3993    return ctx->indexed_nir_blocks[block->index];
3994 }
3995 
3996 static void
3997 bi_emit_phi_deferred(bi_context *ctx, bi_block *block, bi_instr *I)
3998 {
3999    nir_phi_instr *phi = I->phi;
4000 
4001    /* Guaranteed by lower_phis_to_scalar */
4002    assert(phi->def.num_components == 1);
4003 
4004    nir_foreach_phi_src(src, phi) {
4005       bi_block *pred = bi_from_nir_block(ctx, src->pred);
4006       unsigned i = bi_predecessor_index(block, pred);
4007       assert(i < I->nr_srcs);
4008 
4009       I->src[i] = bi_src_index(&src->src);
4010    }
4011 
4012    I->phi = NULL;
4013 }
4014 
4015 static void
4016 bi_emit_phis_deferred(bi_context *ctx)
4017 {
4018    bi_foreach_block(ctx, block) {
4019       bi_foreach_instr_in_block(block, I) {
4020          if (I->op == BI_OPCODE_PHI)
4021             bi_emit_phi_deferred(ctx, block, I);
4022       }
4023    }
4024 }
4025 
4026 static void
4027 bi_emit_instr(bi_builder *b, struct nir_instr *instr)
4028 {
4029    switch (instr->type) {
4030    case nir_instr_type_load_const:
4031       bi_emit_load_const(b, nir_instr_as_load_const(instr));
4032       break;
4033 
4034    case nir_instr_type_intrinsic:
4035       bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
4036       break;
4037 
4038    case nir_instr_type_alu:
4039       bi_emit_alu(b, nir_instr_as_alu(instr));
4040       break;
4041 
4042    case nir_instr_type_tex:
4043       bi_emit_tex(b, nir_instr_as_tex(instr));
4044       break;
4045 
4046    case nir_instr_type_jump:
4047       bi_emit_jump(b, nir_instr_as_jump(instr));
4048       break;
4049 
4050    case nir_instr_type_phi:
4051       bi_emit_phi(b, nir_instr_as_phi(instr));
4052       break;
4053 
4054    default:
4055       unreachable("should've been lowered");
4056    }
4057 }
4058 
4059 static bi_block *
4060 create_empty_block(bi_context *ctx)
4061 {
4062    bi_block *blk = rzalloc(ctx, bi_block);
4063 
4064    util_dynarray_init(&blk->predecessors, blk);
4065 
4066    return blk;
4067 }
4068 
4069 static bi_block *
4070 emit_block(bi_context *ctx, nir_block *block)
4071 {
4072    if (ctx->after_block) {
4073       ctx->current_block = ctx->after_block;
4074       ctx->after_block = NULL;
4075    } else {
4076       ctx->current_block = create_empty_block(ctx);
4077    }
4078 
4079    list_addtail(&ctx->current_block->link, &ctx->blocks);
4080    list_inithead(&ctx->current_block->instructions);
4081 
4082    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4083 
4084    ctx->indexed_nir_blocks[block->index] = ctx->current_block;
4085 
4086    nir_foreach_instr(instr, block) {
4087       bi_emit_instr(&_b, instr);
4088    }
4089 
4090    return ctx->current_block;
4091 }
4092 
4093 static void
4094 emit_if(bi_context *ctx, nir_if *nif)
4095 {
4096    bi_block *before_block = ctx->current_block;
4097 
4098    /* Speculatively emit the branch, but we can't fill it in until later */
4099    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4100    bi_instr *then_branch =
4101       bi_branchz_i16(&_b, bi_half(bi_src_index(&nif->condition), false),
4102                      bi_zero(), BI_CMPF_EQ);
4103 
4104    /* Emit the two subblocks. */
4105    bi_block *then_block = emit_cf_list(ctx, &nif->then_list);
4106    bi_block *end_then_block = ctx->current_block;
4107 
4108    /* Emit second block */
4109 
4110    bi_block *else_block = emit_cf_list(ctx, &nif->else_list);
4111    bi_block *end_else_block = ctx->current_block;
4112    ctx->after_block = create_empty_block(ctx);
4113 
4114    /* Now that we have the subblocks emitted, fix up the branches */
4115 
4116    assert(then_block);
4117    assert(else_block);
4118 
4119    then_branch->branch_target = else_block;
4120 
4121    /* Emit a jump from the end of the then block to the end of the else */
4122    _b.cursor = bi_after_block(end_then_block);
4123    bi_instr *then_exit = bi_jump(&_b, bi_zero());
4124    then_exit->branch_target = ctx->after_block;
4125 
4126    bi_block_add_successor(end_then_block, then_exit->branch_target);
4127    bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */
4128 
4129    bi_block_add_successor(before_block,
4130                           then_branch->branch_target); /* then_branch */
4131    bi_block_add_successor(before_block, then_block);   /* fallthrough */
4132 }
4133 
4134 static void
4135 emit_loop(bi_context *ctx, nir_loop *nloop)
4136 {
4137    assert(!nir_loop_has_continue_construct(nloop));
4138 
4139    /* Remember where we are */
4140    bi_block *start_block = ctx->current_block;
4141 
4142    bi_block *saved_break = ctx->break_block;
4143    bi_block *saved_continue = ctx->continue_block;
4144 
4145    ctx->continue_block = create_empty_block(ctx);
4146    ctx->break_block = create_empty_block(ctx);
4147    ctx->after_block = ctx->continue_block;
4148    ctx->after_block->loop_header = true;
4149 
4150    /* Emit the body itself */
4151    emit_cf_list(ctx, &nloop->body);
4152 
4153    /* Branch back to loop back */
4154    bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block));
4155    bi_instr *I = bi_jump(&_b, bi_zero());
4156    I->branch_target = ctx->continue_block;
4157    bi_block_add_successor(start_block, ctx->continue_block);
4158    bi_block_add_successor(ctx->current_block, ctx->continue_block);
4159 
4160    ctx->after_block = ctx->break_block;
4161 
4162    /* Pop off */
4163    ctx->break_block = saved_break;
4164    ctx->continue_block = saved_continue;
4165    ++ctx->loop_count;
4166 }
4167 
4168 static bi_block *
4169 emit_cf_list(bi_context *ctx, struct exec_list *list)
4170 {
4171    bi_block *start_block = NULL;
4172 
4173    foreach_list_typed(nir_cf_node, node, node, list) {
4174       switch (node->type) {
4175       case nir_cf_node_block: {
4176          bi_block *block = emit_block(ctx, nir_cf_node_as_block(node));
4177 
4178          if (!start_block)
4179             start_block = block;
4180 
4181          break;
4182       }
4183 
4184       case nir_cf_node_if:
4185          emit_if(ctx, nir_cf_node_as_if(node));
4186          break;
4187 
4188       case nir_cf_node_loop:
4189          emit_loop(ctx, nir_cf_node_as_loop(node));
4190          break;
4191 
4192       default:
4193          unreachable("Unknown control flow");
4194       }
4195    }
4196 
4197    return start_block;
4198 }
4199 
4200 /* shader-db stuff */
4201 
4202 struct bi_stats {
4203    unsigned nr_clauses, nr_tuples, nr_ins;
4204    unsigned nr_arith, nr_texture, nr_varying, nr_ldst;
4205 };
4206 
4207 static void
4208 bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats)
4209 {
4210    /* Count instructions */
4211    stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0);
4212 
4213    /* Non-message passing tuples are always arithmetic */
4214    if (tuple->add != clause->message) {
4215       stats->nr_arith++;
4216       return;
4217    }
4218 
4219    /* Message + FMA we'll count as arithmetic _and_ message */
4220    if (tuple->fma)
4221       stats->nr_arith++;
4222 
4223    switch (clause->message_type) {
4224    case BIFROST_MESSAGE_VARYING:
4225       /* Check components interpolated */
4226       stats->nr_varying +=
4227          (clause->message->vecsize + 1) *
4228          (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2);
4229       break;
4230 
4231    case BIFROST_MESSAGE_VARTEX:
4232       /* 2 coordinates, fp32 each */
4233       stats->nr_varying += (2 * 2);
4234       FALLTHROUGH;
4235    case BIFROST_MESSAGE_TEX:
4236       stats->nr_texture++;
4237       break;
4238 
4239    case BIFROST_MESSAGE_ATTRIBUTE:
4240    case BIFROST_MESSAGE_LOAD:
4241    case BIFROST_MESSAGE_STORE:
4242    case BIFROST_MESSAGE_ATOMIC:
4243       stats->nr_ldst++;
4244       break;
4245 
4246    case BIFROST_MESSAGE_NONE:
4247    case BIFROST_MESSAGE_BARRIER:
4248    case BIFROST_MESSAGE_BLEND:
4249    case BIFROST_MESSAGE_TILE:
4250    case BIFROST_MESSAGE_Z_STENCIL:
4251    case BIFROST_MESSAGE_ATEST:
4252    case BIFROST_MESSAGE_JOB:
4253    case BIFROST_MESSAGE_64BIT:
4254       /* Nothing to do */
4255       break;
4256    };
4257 }
4258 
4259 /*
4260  * v7 allows preloading LD_VAR or VAR_TEX messages that must complete before the
4261  * shader completes. These costs are not accounted for in the general cycle
4262  * counts, so this function calculates the effective cost of these messages, as
4263  * if they were executed by shader code.
4264  */
4265 static unsigned
4266 bi_count_preload_cost(bi_context *ctx)
4267 {
4268    /* Units: 1/16 of a normalized cycle, assuming that we may interpolate
4269     * 16 fp16 varying components per cycle or fetch two texels per cycle.
4270     */
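   /* For instance (illustrative): a preloaded VAR_TEX message is charged
    * 12/16 of a cycle below, while preloading a 4-component fp32 varying is
    * charged 4 * 2 = 8/16 of a cycle. */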
4271    unsigned cost = 0;
4272 
4273    for (unsigned i = 0; i < ARRAY_SIZE(ctx->info.bifrost->messages); ++i) {
4274       struct bifrost_message_preload msg = ctx->info.bifrost->messages[i];
4275 
4276       if (msg.enabled && msg.texture) {
4277          /* 2 coordinate, 2 half-words each, plus texture */
4278          cost += 12;
4279       } else if (msg.enabled) {
4280          cost += (msg.num_components * (msg.fp16 ? 1 : 2));
4281       }
4282    }
4283 
4284    return cost;
4285 }
4286 
4287 static const char *
4288 bi_shader_stage_name(bi_context *ctx)
4289 {
4290    if (ctx->idvs == BI_IDVS_VARYING)
4291       return "MESA_SHADER_VARYING";
4292    else if (ctx->idvs == BI_IDVS_POSITION)
4293       return "MESA_SHADER_POSITION";
4294    else if (ctx->inputs->is_blend)
4295       return "MESA_SHADER_BLEND";
4296    else
4297       return gl_shader_stage_name(ctx->stage);
4298 }
4299 
4300 static char *
4301 bi_print_stats(bi_context *ctx, unsigned size)
4302 {
4303    struct bi_stats stats = {0};
4304 
4305    /* Count instructions, clauses, and tuples. Also attempt to construct
4306     * normalized execution engine cycle counts, using the following ratio:
4307     *
4308     * 24 arith tuples/cycle
4309     * 2 texture messages/cycle
4310     * 16 x 16-bit varying channels interpolated/cycle
4311     * 1 load store message/cycle
4312     *
4313     * These numbers seem to match Arm Mobile Studio's heuristic. The real
4314     * cycle counts are surely more complicated.
4315     */
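   /* As a sanity check on the ratios (illustrative numbers): 48 arith tuples,
    * 2 texture messages, 32 16-bit varying channels and 1 load/store message
    * would score 2.0, 1.0, 2.0 and 1.0 cycles respectively, so such a shader
    * would be bound at 2.0 cycles. */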
4316 
4317    bi_foreach_block(ctx, block) {
4318       bi_foreach_clause_in_block(block, clause) {
4319          stats.nr_clauses++;
4320          stats.nr_tuples += clause->tuple_count;
4321 
4322          for (unsigned i = 0; i < clause->tuple_count; ++i)
4323             bi_count_tuple_stats(clause, &clause->tuples[i], &stats);
4324       }
4325    }
4326 
4327    float cycles_arith = ((float)stats.nr_arith) / 24.0;
4328    float cycles_texture = ((float)stats.nr_texture) / 2.0;
4329    float cycles_varying = ((float)stats.nr_varying) / 16.0;
4330    float cycles_ldst = ((float)stats.nr_ldst) / 1.0;
4331 
4332    float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst);
4333    float cycles_bound = MAX2(cycles_arith, cycles_message);
4334 
4335    /* Thread count and register pressure are traded off only on v7 */
4336    bool full_threads = (ctx->arch == 7 && ctx->info.work_reg_count <= 32);
4337    unsigned nr_threads = full_threads ? 2 : 1;
4338 
4339    /* Dump stats */
4340    char *str = ralloc_asprintf(
4341       NULL,
4342       "%s shader: "
4343       "%u inst, %u tuples, %u clauses, "
4344       "%f cycles, %f arith, %f texture, %f vary, %f ldst, "
4345       "%u quadwords, %u threads",
4346       bi_shader_stage_name(ctx), stats.nr_ins, stats.nr_tuples,
4347       stats.nr_clauses, cycles_bound, cycles_arith, cycles_texture,
4348       cycles_varying, cycles_ldst, size / 16, nr_threads);
4349 
4350    if (ctx->arch == 7) {
4351       ralloc_asprintf_append(&str, ", %u preloads", bi_count_preload_cost(ctx));
4352    }
4353 
4354    ralloc_asprintf_append(&str, ", %u loops, %u:%u spills:fills",
4355                           ctx->loop_count, ctx->spills, ctx->fills);
4356 
4357    return str;
4358 }
4359 
4360 static char *
4361 va_print_stats(bi_context *ctx, unsigned size)
4362 {
4363    unsigned nr_ins = 0;
4364    struct va_stats stats = {0};
4365 
4366    /* Count instructions */
4367    bi_foreach_instr_global(ctx, I) {
4368       nr_ins++;
4369       va_count_instr_stats(I, &stats);
4370    }
4371 
4372    /* Mali G78 peak performance:
4373     *
4374     * 64 FMA instructions per cycle
4375     * 64 CVT instructions per cycle
4376     * 16 SFU instructions per cycle
4377     * 8 x 32-bit varying channels interpolated per cycle
4378     * 4 texture instructions per cycle
4379     * 1 load/store operation per cycle
4380     */
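   /* E.g. (illustrative numbers): 128 FMA, 32 CVT, 8 SFU, 16 varying, 2
    * texture and 3 load/store instructions give 2.0, 0.5, 0.5, 1.0, 0.5 and
    * 3.0 cycles with the divisors below, so such a shader would be load/store
    * bound at 3.0 cycles. */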
4381 
4382    float cycles_fma = ((float)stats.fma) / 64.0;
4383    float cycles_cvt = ((float)stats.cvt) / 64.0;
4384    float cycles_sfu = ((float)stats.sfu) / 16.0;
4385    float cycles_v = ((float)stats.v) / 16.0;
4386    float cycles_t = ((float)stats.t) / 4.0;
4387    float cycles_ls = ((float)stats.ls) / 1.0;
4388 
4389    /* Calculate the bound */
4390    float cycles = MAX2(MAX3(cycles_fma, cycles_cvt, cycles_sfu),
4391                        MAX3(cycles_v, cycles_t, cycles_ls));
4392 
4393    /* Thread count and register pressure are traded off */
4394    unsigned nr_threads = (ctx->info.work_reg_count <= 32) ? 2 : 1;
4395 
4396    /* Dump stats */
4397    return ralloc_asprintf(NULL,
4398                           "%s shader: "
4399                           "%u inst, %f cycles, %f fma, %f cvt, %f sfu, %f v, "
4400                           "%f t, %f ls, %u quadwords, %u threads, %u loops, "
4401                           "%u:%u spills:fills",
4402                           bi_shader_stage_name(ctx), nr_ins, cycles, cycles_fma,
4403                           cycles_cvt, cycles_sfu, cycles_v, cycles_t, cycles_ls,
4404                           size / 16, nr_threads, ctx->loop_count, ctx->spills,
4405                           ctx->fills);
4406 }
4407 
4408 static int
4409 glsl_type_size(const struct glsl_type *type, bool bindless)
4410 {
4411    return glsl_count_attribute_slots(type, false);
4412 }
4413 
4414 /* Split stores to memory. We don't split stores to vertex outputs, since
4415  * nir_lower_io_to_temporaries will ensure there's only a single write.
4416  */
4417 
4418 static bool
4419 should_split_wrmask(const nir_instr *instr, UNUSED const void *data)
4420 {
4421    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4422 
4423    switch (intr->intrinsic) {
4424    case nir_intrinsic_store_ssbo:
4425    case nir_intrinsic_store_shared:
4426    case nir_intrinsic_store_global:
4427    case nir_intrinsic_store_scratch:
4428       return true;
4429    default:
4430       return false;
4431    }
4432 }
4433 
4434 /*
4435  * Some operations are only available as 32-bit instructions. 64-bit floats are
4436  * unsupported and ints are lowered with nir_lower_int64.  Certain 8-bit and
4437  * 16-bit instructions, however, are lowered here.
4438  */
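/* For example, a 16-bit fsin returns 32 here, so nir_lower_bit_size widens
 * its source to 32 bits, performs the fsin at 32 bits and narrows the result
 * back to 16 bits (roughly f2f32 / fsin / f2f16). */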
4439 static unsigned
4440 bi_lower_bit_size(const nir_instr *instr, UNUSED void *data)
4441 {
4442    if (instr->type != nir_instr_type_alu)
4443       return 0;
4444 
4445    nir_alu_instr *alu = nir_instr_as_alu(instr);
4446 
4447    switch (alu->op) {
4448    case nir_op_fexp2:
4449    case nir_op_flog2:
4450    case nir_op_fpow:
4451    case nir_op_fsin:
4452    case nir_op_fcos:
4453    case nir_op_bit_count:
4454    case nir_op_bitfield_reverse:
4455       return (nir_src_bit_size(alu->src[0].src) == 32) ? 0 : 32;
4456    default:
4457       return 0;
4458    }
4459 }
4460 
4461 /* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4,
4462  * transcendentals are an exception. So are shifts, due to a lane size mismatch
4463  * (8-bit in Bifrost, 32-bit in NIR; TODO: work around this). Some conversions
4464  * need to be scalarized due to type size. */
4465 
4466 static uint8_t
4467 bi_vectorize_filter(const nir_instr *instr, const void *data)
4468 {
4469    /* Defaults work for everything else */
4470    if (instr->type != nir_instr_type_alu)
4471       return 0;
4472 
4473    const nir_alu_instr *alu = nir_instr_as_alu(instr);
4474 
4475    switch (alu->op) {
4476    case nir_op_frcp:
4477    case nir_op_frsq:
4478    case nir_op_ishl:
4479    case nir_op_ishr:
4480    case nir_op_ushr:
4481    case nir_op_f2i16:
4482    case nir_op_f2u16:
4483    case nir_op_extract_u8:
4484    case nir_op_extract_i8:
4485    case nir_op_extract_u16:
4486    case nir_op_extract_i16:
4487    case nir_op_insert_u16:
4488       return 1;
4489    default:
4490       break;
4491    }
4492 
4493    /* Vectorized instructions cannot write more than 32 bits */
4494    int dst_bit_size = alu->def.bit_size;
4495    if (dst_bit_size == 16)
4496       return 2;
4497    else
4498       return 1;
4499 }
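
/* Illustrative note, not from the original source: the return value above is
 * the maximum vectorization width handed to nir_opt_vectorize. For example,
 * two independent 16-bit fadds may be fused into a single vec2 fadd (width 2,
 * 32 bits written), while 32-bit ALU results and the listed exceptions
 * (frcp, shifts, f2i16, extracts, ...) stay scalar (width 1). */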
4500 
4501 static bool
4502 bi_scalarize_filter(const nir_instr *instr, const void *data)
4503 {
4504    if (instr->type != nir_instr_type_alu)
4505       return false;
4506 
4507    const nir_alu_instr *alu = nir_instr_as_alu(instr);
4508 
4509    switch (alu->op) {
4510    case nir_op_pack_uvec2_to_uint:
4511    case nir_op_pack_uvec4_to_uint:
4512       return false;
4513    default:
4514       return true;
4515    }
4516 }
4517 
4518 /* Ensure we write exactly 4 components */
4519 static nir_def *
4520 bifrost_nir_valid_channel(nir_builder *b, nir_def *in, unsigned channel,
4521                           unsigned first, unsigned mask)
4522 {
4523    if (!(mask & BITFIELD_BIT(channel)))
4524       channel = first;
4525 
4526    return nir_channel(b, in, channel);
4527 }
4528 
4529 /* Lower fragment store_output instructions to always write 4 components,
4530  * matching the hardware semantic. This may require additional moves. Skipping
4531  * these moves is possible in theory, but invokes undefined behaviour in the
4532  * compiler. The DDK inserts these moves, so we will as well. */
4533 
4534 static bool
4535 bifrost_nir_lower_blend_components(struct nir_builder *b,
4536                                    nir_intrinsic_instr *intr, void *data)
4537 {
4538    if (intr->intrinsic != nir_intrinsic_store_output)
4539       return false;
4540 
4541    nir_def *in = intr->src[0].ssa;
4542    unsigned first = nir_intrinsic_component(intr);
4543    unsigned mask = nir_intrinsic_write_mask(intr);
4544 
4545    assert(first == 0 && "shouldn't get nonzero components");
4546 
4547    /* Nothing to do */
4548    if (mask == BITFIELD_MASK(4))
4549       return false;
4550 
4551    b->cursor = nir_before_instr(&intr->instr);
4552 
4553    /* Replicate the first valid component instead */
4554    nir_def *replicated =
4555       nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask),
4556                bifrost_nir_valid_channel(b, in, 1, first, mask),
4557                bifrost_nir_valid_channel(b, in, 2, first, mask),
4558                bifrost_nir_valid_channel(b, in, 3, first, mask));
4559 
4560    /* Rewrite to use our replicated version */
4561    nir_src_rewrite(&intr->src[0], replicated);
4562    nir_intrinsic_set_component(intr, 0);
4563    nir_intrinsic_set_write_mask(intr, 0xF);
4564    intr->num_components = 4;
4565 
4566    return true;
4567 }
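
/* Illustrative sketch, not part of the compiler: for a fragment store_output
 * of vec4(a, b, c, d) with wrmask = 0x5 (.x and .z), the pass above rewrites
 * the source to vec4(a, a, c, a), replicating the first written component
 * into the unwritten lanes, and widens the write mask to 0xF with
 * num_components = 4. */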
4568 
4569 static nir_mem_access_size_align
4570 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
4571                          uint8_t bit_size, uint32_t align_mul,
4572                          uint32_t align_offset, bool offset_is_const,
4573                          const void *cb_data)
4574 {
4575    uint32_t align = nir_combined_align(align_mul, align_offset);
4576    assert(util_is_power_of_two_nonzero(align));
4577 
4578    /* No more than 16 bytes at a time. */
4579    bytes = MIN2(bytes, 16);
4580 
4581    /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's
4582     * a multiple of 2, use 16-bit loads. Else use 8-bit loads.
4583     *
4584     * But if we're only aligned to 1 byte, use 8-bit loads. If we're only
4585     * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to
4586     * the size.
4587     */
4588    if ((bytes & 1) || (align == 1))
4589       bit_size = 8;
4590    else if ((bytes & 2) || (align == 2))
4591       bit_size = 16;
4592    else if (bit_size >= 32)
4593       bit_size = 32;
4594 
4595    unsigned num_comps = MIN2(bytes / (bit_size / 8), 4);
4596 
4597    /* Push constants require 32-bit loads. */
4598    if (intrin == nir_intrinsic_load_push_constant) {
4599       if (align_mul >= 4) {
4600          /* If align_mul is at least 4, we can use align_offset to find
4601           * the exact number of words we need to read.
4602           */
4603          num_comps = DIV_ROUND_UP((align_offset % 4) + bytes, 4);
4604       } else {
4605          /* If bytes is aligned on 32-bit, the access might still cross one
4606           * word at the beginning, and one word at the end. If bytes is not
4607           * aligned on 32-bit, the extra two words should cover for both the
4608           * size and offset mis-alignment.
4609           */
4610          num_comps = (bytes / 4) + 2;
4611       }
4612 
4613       bit_size = MIN2(bit_size, 32);
4614    }
4615 
4616    return (nir_mem_access_size_align){
4617       .num_components = num_comps,
4618       .bit_size = bit_size,
4619       .align = bit_size / 8,
4620    };
4621 }
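
/* Worked examples for the callback above, illustrative only:
 *
 *  - 12 bytes, align 4, regular load: 32-bit access, MIN2(12 / 4, 4) = 3
 *    components.
 *  - 6 bytes, align 2: 16-bit access, MIN2(6 / 2, 4) = 3 components.
 *  - load_push_constant, align_mul = 4, align_offset = 0, 8 bytes:
 *    DIV_ROUND_UP(0 + 8, 4) = 2 components of 32 bits.
 *  - load_push_constant, align_mul < 4, 8 bytes: (8 / 4) + 2 = 4 components,
 *    covering a possible word straddle at either end. */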
4622 
4623 static bool
4624 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
4625                  unsigned num_components, nir_intrinsic_instr *low,
4626                  nir_intrinsic_instr *high, void *data)
4627 {
4628    /* Must be aligned to the size of the load */
4629    unsigned align = nir_combined_align(align_mul, align_offset);
4630    if ((bit_size / 8) > align)
4631       return false;
4632 
4633    if (num_components > 4)
4634       return false;
4635 
4636    if (bit_size > 32)
4637       return false;
4638 
4639    return true;
4640 }
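
/* Illustrative note, not from the original source: with this callback,
 * nir_opt_load_store_vectorize may merge two adjacent 32-bit loads at byte
 * offsets 0 and 4 with 4-byte alignment into one vec2 32-bit load (alignment
 * 4 >= 4 bytes per element, 2 <= 4 components, 32 <= 32 bits), while merges
 * that would need an element size above 32 bits or more than four components
 * are rejected. */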
4641 
4642 static void
4643 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
4644 {
4645    NIR_PASS_V(nir, nir_opt_shrink_stores, true);
4646 
4647    bool progress;
4648 
4649    do {
4650       progress = false;
4651 
4652       NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
4653       NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL);
4654 
4655       NIR_PASS(progress, nir, nir_copy_prop);
4656       NIR_PASS(progress, nir, nir_opt_remove_phis);
4657       NIR_PASS(progress, nir, nir_opt_dce);
4658       NIR_PASS(progress, nir, nir_opt_dead_cf);
4659       NIR_PASS(progress, nir, nir_opt_cse);
4660       NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
4661       NIR_PASS(progress, nir, nir_opt_algebraic);
4662       NIR_PASS(progress, nir, nir_opt_constant_folding);
4663 
4664       NIR_PASS(progress, nir, nir_opt_undef);
4665       NIR_PASS(progress, nir, nir_lower_undef_to_zero);
4666 
4667       NIR_PASS(progress, nir, nir_opt_shrink_vectors, false);
4668       NIR_PASS(progress, nir, nir_opt_loop_unroll);
4669    } while (progress);
4670 
4671    NIR_PASS(
4672       progress, nir, nir_opt_load_store_vectorize,
4673       &(const nir_load_store_vectorize_options){
4674          .modes = nir_var_mem_global | nir_var_mem_shared | nir_var_shader_temp,
4675          .callback = mem_vectorize_cb,
4676       });
4677    NIR_PASS(progress, nir, nir_lower_pack);
4678 
4679    /* TODO: Why is 64-bit getting rematerialized?
4680     * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */
4681    NIR_PASS(progress, nir, nir_lower_int64);
4682 
4683    /* We need to cleanup after each iteration of late algebraic
4684     * optimizations, since otherwise NIR can produce weird edge cases
4685     * (like fneg of a constant) which we don't handle */
4686    bool late_algebraic = true;
4687    while (late_algebraic) {
4688       late_algebraic = false;
4689       NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4690       NIR_PASS(progress, nir, nir_opt_constant_folding);
4691       NIR_PASS(progress, nir, nir_copy_prop);
4692       NIR_PASS(progress, nir, nir_opt_dce);
4693       NIR_PASS(progress, nir, nir_opt_cse);
4694    }
4695 
4696    /* This opt currently helps on Bifrost but not Valhall */
4697    if (gpu_id < 0x9000)
4698       NIR_PASS(progress, nir, bifrost_nir_opt_boolean_bitwise);
4699 
4700    NIR_PASS(progress, nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
4701    NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL);
4702    NIR_PASS(progress, nir, nir_lower_bool_to_bitsize);
4703 
4704    /* Prepass to simplify instruction selection */
4705    late_algebraic = false;
4706    NIR_PASS(late_algebraic, nir, bifrost_nir_lower_algebraic_late);
4707 
4708    while (late_algebraic) {
4709       late_algebraic = false;
4710       NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
4711       NIR_PASS(progress, nir, nir_opt_constant_folding);
4712       NIR_PASS(progress, nir, nir_copy_prop);
4713       NIR_PASS(progress, nir, nir_opt_dce);
4714       NIR_PASS(progress, nir, nir_opt_cse);
4715    }
4716 
4717    NIR_PASS(progress, nir, nir_lower_load_const_to_scalar);
4718    NIR_PASS(progress, nir, nir_opt_dce);
4719 
4720    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
4721       NIR_PASS_V(nir, nir_shader_intrinsics_pass,
4722                  bifrost_nir_lower_blend_components,
4723                  nir_metadata_control_flow, NULL);
4724    }
4725 
4726    /* Backend scheduler is purely local, so do some global optimizations
4727     * to reduce register pressure. */
4728    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
4729                                nir_move_load_input | nir_move_comparisons |
4730                                nir_move_copies | nir_move_load_ssbo;
4731 
4732    NIR_PASS_V(nir, nir_opt_sink, move_all);
4733    NIR_PASS_V(nir, nir_opt_move, move_all);
4734 
4735    /* We might lower attribute, varying, and image indirects. Use the
4736     * gathered info to skip the extra analysis in the happy path. */
4737    bool any_indirects = nir->info.inputs_read_indirectly ||
4738                         nir->info.outputs_accessed_indirectly ||
4739                         nir->info.patch_inputs_read_indirectly ||
4740                         nir->info.patch_outputs_accessed_indirectly ||
4741                         nir->info.images_used[0];
4742 
4743    if (any_indirects) {
4744       nir_convert_to_lcssa(nir, true, true);
4745       NIR_PASS_V(nir, nir_divergence_analysis);
4746       NIR_PASS_V(nir, bi_lower_divergent_indirects,
4747                  pan_subgroup_size(pan_arch(gpu_id)));
4748    }
4749 }
4750 
4751 static void
4752 bi_opt_post_ra(bi_context *ctx)
4753 {
4754    bi_foreach_instr_global_safe(ctx, ins) {
4755       if (ins->op == BI_OPCODE_MOV_I32 &&
4756           bi_is_equiv(ins->dest[0], ins->src[0]))
4757          bi_remove_instruction(ins);
4758    }
4759 }
4760 
4761 /* Dead code elimination for branches at the end of a block - only one branch
4762  * per block is legal semantically, but unreachable jumps can be generated.
4763  * Likewise on Bifrost we can generate jumps to the terminal block which need
4764  * to be lowered away to a jump to #0x0, which induces successful termination.
4765  * That trick doesn't work on Valhall, which needs a NOP inserted in the
4766  * terminal block instead.
4767  */
4768 static void
4769 bi_lower_branch(bi_context *ctx, bi_block *block)
4770 {
4771    bool cull_terminal = (ctx->arch <= 8);
4772    bool branched = false;
4773 
4774    bi_foreach_instr_in_block_safe(block, ins) {
4775       if (!ins->branch_target)
4776          continue;
4777 
4778       if (branched) {
4779          bi_remove_instruction(ins);
4780          continue;
4781       }
4782 
4783       branched = true;
4784 
4785       if (!bi_is_terminal_block(ins->branch_target))
4786          continue;
4787 
4788       if (cull_terminal)
4789          ins->branch_target = NULL;
4790       else if (ins->branch_target)
4791          ins->branch_target->needs_nop = true;
4792    }
4793 }
4794 
4795 static void
4796 bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary, unsigned offset)
4797 {
4798    unsigned final_clause = bi_pack(ctx, binary);
4799 
4800    /* If we need to wait for ATEST or BLEND in the first clause, pass the
4801     * corresponding bits through to the renderer state descriptor */
4802    bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
4803    bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL);
4804 
4805    unsigned first_deps = first_clause ? first_clause->dependencies : 0;
4806    ctx->info.bifrost->wait_6 = (first_deps & (1 << 6));
4807    ctx->info.bifrost->wait_7 = (first_deps & (1 << 7));
4808 
4809    /* Pad the shader with enough zero bytes to trick the prefetcher,
4810     * unless we're compiling an empty shader (in which case we don't pad
4811     * so the size remains 0) */
4812    unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;
4813 
4814    if (binary->size - offset) {
4815       memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 0,
4816              prefetch_size);
4817    }
4818 }
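
/* Illustrative note, assuming final_clause is the byte size of the last
 * packed clause: if that clause packs to 48 bytes, prefetch_size is
 * 128 - 48 = 80, so 80 zero bytes are appended. The final clause plus the
 * padding then covers the full BIFROST_SHADER_PREFETCH window, keeping the
 * prefetcher inside valid memory. */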
4819 
4820 /*
4821  * Build a bit mask of varyings (by location) that are flatshaded. This
4822  * information is needed by lower_mediump_io, as we don't yet support 16-bit
4823  * flat varyings.
4824  *
4825  * Also varyings that are used as texture coordinates should be kept at fp32 so
4826  * the texture instruction may be promoted to VAR_TEX. In general this is a good
4827  * idea, as fp16 texture coordinates are not supported by the hardware and are
4828  * usually inappropriate. (There are even relevant CTS bugs here.)
4829  *
4830  * TODO: If we compacted the varyings with some fixup code in the vertex shader,
4831  * we could implement 16-bit flat varyings. Consider if this case matters.
4832  *
4833  * TODO: The texture coordinate handling could be less heavyhanded.
4834  */
4835 static bool
4836 bi_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
4837 {
4838    uint64_t *mask = data;
4839 
4840    if (instr->type != nir_instr_type_tex)
4841       return false;
4842 
4843    nir_tex_instr *tex = nir_instr_as_tex(instr);
4844 
4845    int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
4846    if (coord_idx < 0)
4847       return false;
4848 
4849    nir_src src = tex->src[coord_idx].src;
4850    nir_scalar x = nir_scalar_resolved(src.ssa, 0);
4851    nir_scalar y = nir_scalar_resolved(src.ssa, 1);
4852 
4853    if (x.def != y.def)
4854       return false;
4855 
4856    nir_instr *parent = x.def->parent_instr;
4857 
4858    if (parent->type != nir_instr_type_intrinsic)
4859       return false;
4860 
4861    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
4862 
4863    if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
4864       return false;
4865 
4866    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
4867    *mask |= BITFIELD64_BIT(sem.location);
4868    return false;
4869 }
4870 
4871 static uint64_t
4872 bi_fp32_varying_mask(nir_shader *nir)
4873 {
4874    uint64_t mask = 0;
4875 
4876    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4877 
4878    nir_foreach_shader_in_variable(var, nir) {
4879       if (var->data.interpolation == INTERP_MODE_FLAT)
4880          mask |= BITFIELD64_BIT(var->data.location);
4881    }
4882 
4883    nir_shader_instructions_pass(nir, bi_gather_texcoords, nir_metadata_all,
4884                                 &mask);
4885 
4886    return mask;
4887 }
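
/* Illustrative sketch, not part of the compiler: for a fragment shader with a
 * flat varying at VARYING_SLOT_VAR0 and a smooth varying at VARYING_SLOT_VAR1
 * used only as a texture coordinate, both location bits land in the mask, so
 * the ~mask passed to nir_lower_mediump_io in bifrost_preprocess_nir keeps
 * both inputs at fp32: the flat one because 16-bit flat varyings are
 * unsupported, the texcoord so the texture op can still become VAR_TEX. */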
4888 
4889 static bool
4890 bi_lower_sample_mask_writes(nir_builder *b, nir_intrinsic_instr *intr,
4891                             void *data)
4892 {
4893    if (intr->intrinsic != nir_intrinsic_store_output)
4894       return false;
4895 
4896    assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
4897    if (nir_intrinsic_io_semantics(intr).location != FRAG_RESULT_SAMPLE_MASK)
4898       return false;
4899 
4900    b->cursor = nir_before_instr(&intr->instr);
4901 
4902    nir_def *orig = nir_load_sample_mask(b);
4903 
4904    nir_src_rewrite(&intr->src[0],
4905                    nir_b32csel(b, nir_load_multisampled_pan(b),
4906                                nir_iand(b, orig, intr->src[0].ssa), orig));
4907    return true;
4908 }
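
/* Illustrative sketch, not part of the compiler: the pass above turns
 *
 *    store_output %mask   (FRAG_RESULT_SAMPLE_MASK)
 * into
 *    %orig = load_sample_mask
 *    %new  = b32csel(load_multisampled_pan, iand(%orig, %mask), %orig)
 *    store_output %new
 *
 * so the shader-written mask only takes effect when multisampling is on. */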
4909 
4910 static bool
4911 bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr,
4912                      UNUSED void *data)
4913 {
4914    if (intr->intrinsic != nir_intrinsic_load_output)
4915       return false;
4916 
4917    unsigned loc = nir_intrinsic_io_semantics(intr).location;
4918    assert(loc >= FRAG_RESULT_DATA0);
4919    unsigned rt = loc - FRAG_RESULT_DATA0;
4920 
4921    b->cursor = nir_before_instr(&intr->instr);
4922 
4923    nir_def *conversion = nir_load_rt_conversion_pan(
4924       b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
4925 
4926    nir_def *lowered = nir_load_converted_output_pan(
4927       b, intr->def.num_components, intr->def.bit_size, conversion,
4928       .dest_type = nir_intrinsic_dest_type(intr),
4929       .io_semantics = nir_intrinsic_io_semantics(intr));
4930 
4931    nir_def_rewrite_uses(&intr->def, lowered);
4932    return true;
4933 }
4934 
4935 bool
4936 bifrost_nir_lower_load_output(nir_shader *nir)
4937 {
4938    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
4939 
4940    return nir_shader_intrinsics_pass(
4941       nir, bi_lower_load_output,
4942       nir_metadata_control_flow, NULL);
4943 }
4944 
4945 static bool
4946 bi_lower_load_push_const_with_dyn_offset(nir_builder *b,
4947                                          nir_intrinsic_instr *intr,
4948                                          UNUSED void *data)
4949 {
4950    if (intr->intrinsic != nir_intrinsic_load_push_constant)
4951       return false;
4952 
4953    /* Offset is constant, nothing to do. */
4954    if (nir_src_is_const(intr->src[0]))
4955       return false;
4956 
4957    /* nir_lower_mem_access_bit_sizes() should have lowered load_push_constant
4958     * to 32-bit and a maximum of 4 components.
4959     */
4960    assert(intr->def.num_components <= 4);
4961    assert(intr->def.bit_size == 32);
4962 
4963    uint32_t base = nir_intrinsic_base(intr);
4964    uint32_t range = nir_intrinsic_range(intr);
4965    uint32_t nwords = intr->def.num_components;
4966 
4967    b->cursor = nir_before_instr(&intr->instr);
4968 
4969    /* Dynamic indexing is only allowed for vulkan push constants, which is
4970     * currently limited to 256 bytes. That gives us a maximum of 64 32-bit
4971     * words to read from.
4972     */
4973    nir_def *lut[64] = {0};
4974 
4975    assert(range / 4 <= ARRAY_SIZE(lut));
4976 
4977    /* Load all words in the range. */
4978    for (uint32_t w = 0; w < range / 4; w++) {
4979       lut[w] = nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0),
4980                                       .base = base + (w * 4), .range = 4);
4981    }
4982 
4983    nir_def *index = intr->src[0].ssa;
4984 
4985    /* The index is dynamic, so we iteratively CSEL the values based on the
4986     * index. We start with the highest bit in the index, and on each
4987     * iteration we halve the scope.
4988     */
4989    for (uint32_t lut_sz = ARRAY_SIZE(lut); lut_sz > 0; lut_sz /= 2) {
4990       uint32_t stride = lut_sz / 2;
4991       nir_def *bit_test = NULL;
4992 
4993       /* Stop once the LUT is no larger than the number of words we're trying to
4994        * extract.
4995        */
4996       if (lut_sz <= nwords)
4997          break;
4998 
4999       for (uint32_t i = 0; i < stride; i++) {
5000          /* We only need a CSEL if we have two values, otherwise we pick the
5001           * non-NULL value.
5002           */
5003          if (lut[i] && lut[i + stride]) {
5004             /* Create the test src on-demand. The stride is in 32-bit words,
5005              * multiply by four to convert it into a byte stride we can use
5006              * to test if the corresponding bit is set in the index src.
5007              */
5008             if (!bit_test)
5009                bit_test = nir_i2b(b, nir_iand_imm(b, index, stride * 4));
5010 
5011             lut[i] = nir_bcsel(b, bit_test, lut[i + stride], lut[i]);
5012          } else if (lut[i + stride]) {
5013             lut[i] = lut[i + stride];
5014          }
5015       }
5016    }
5017 
5018    nir_def *res = nir_vec(b, &lut[0], nwords);
5019 
5020    nir_def_rewrite_uses(&intr->def, res);
5021    nir_instr_remove(&intr->instr);
5022    return true;
5023 }
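
/* Worked example for the lowering above, illustrative only: with base = 0,
 * range = 32 (8 words) and a single-component 32-bit load at dynamic byte
 * offset %idx, the 8 words are loaded up front and then narrowed in three
 * rounds of selects:
 *
 *    lut_sz = 8, stride = 4:  lut[i] = bcsel(%idx & 16, lut[i + 4], lut[i])
 *    lut_sz = 4, stride = 2:  lut[i] = bcsel(%idx &  8, lut[i + 2], lut[i])
 *    lut_sz = 2, stride = 1:  lut[i] = bcsel(%idx &  4, lut[i + 1], lut[i])
 *
 * The earlier rounds (lut_sz = 64 down to 16) are no-ops because lut[8..63]
 * stay NULL, and the loop stops once lut_sz == nwords == 1, leaving the
 * selected word in lut[0]. */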
5024 
5025 void
5026 bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
5027 {
5028    /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
5029     * (so we don't accidentally duplicate the epilogue since mesa/st has
5030     * messed with our I/O quite a bit already) */
5031 
5032    NIR_PASS_V(nir, nir_lower_vars_to_ssa);
5033 
5034    if (nir->info.stage == MESA_SHADER_VERTEX) {
5035       NIR_PASS_V(nir, nir_lower_viewport_transform);
5036       NIR_PASS_V(nir, nir_lower_point_size, 1.0, 0.0);
5037 
5038       nir_variable *psiz = nir_find_variable_with_location(
5039          nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
5040       if (psiz != NULL)
5041          psiz->data.precision = GLSL_PRECISION_MEDIUM;
5042    }
5043 
5044    /* Get rid of any global vars before we lower to scratch. */
5045    NIR_PASS_V(nir, nir_lower_global_vars_to_local);
5046 
5047    /* Valhall introduces packed thread local storage, which improves cache
5048     * locality of TLS access. However, access to packed TLS cannot
5049     * straddle 16-byte boundaries. As such, when packed TLS is in use
5050     * (currently unconditional for Valhall), we force vec4 alignment for
5051     * scratch access.
5052     */
5053    glsl_type_size_align_func vars_to_scratch_size_align_func =
5054       (gpu_id >= 0x9000) ? glsl_get_vec4_size_align_bytes
5055                          : glsl_get_natural_size_align_bytes;
5056    /* Lower large arrays to scratch and small arrays to bcsel */
5057    NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
5058               vars_to_scratch_size_align_func, vars_to_scratch_size_align_func);
5059    NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
5060 
5061    NIR_PASS_V(nir, nir_split_var_copies);
5062    NIR_PASS_V(nir, nir_lower_var_copies);
5063    NIR_PASS_V(nir, nir_lower_vars_to_ssa);
5064    NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
5065               glsl_type_size, 0);
5066 
5067    /* nir_lower[_explicit]_io is lazy and emits mul+add chains even for
5068     * offsets it could figure out are constant.  Do some constant folding
5069     * before bifrost_nir_lower_store_component below.
5070     */
5071    NIR_PASS_V(nir, nir_opt_constant_folding);
5072 
5073    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
5074       NIR_PASS_V(nir, nir_lower_mediump_io,
5075                  nir_var_shader_in | nir_var_shader_out,
5076                  ~bi_fp32_varying_mask(nir), false);
5077 
5078       NIR_PASS_V(nir, nir_shader_intrinsics_pass, bi_lower_sample_mask_writes,
5079                  nir_metadata_control_flow, NULL);
5080 
5081       NIR_PASS_V(nir, bifrost_nir_lower_load_output);
5082    } else if (nir->info.stage == MESA_SHADER_VERTEX) {
5083       if (gpu_id >= 0x9000) {
5084          NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
5085                     BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
5086       }
5087 
5088       NIR_PASS_V(nir, pan_nir_lower_store_component);
5089    }
5090 
5091    nir_lower_mem_access_bit_sizes_options mem_size_options = {
5092       .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
5093                nir_var_mem_constant | nir_var_mem_task_payload |
5094                nir_var_shader_temp | nir_var_function_temp |
5095                nir_var_mem_global | nir_var_mem_shared,
5096       .callback = mem_access_size_align_cb,
5097    };
5098    NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
5099 
5100    NIR_PASS_V(nir, nir_shader_intrinsics_pass,
5101               bi_lower_load_push_const_with_dyn_offset,
5102               nir_metadata_control_flow, NULL);
5103 
5104    nir_lower_ssbo_options ssbo_opts = {
5105       .native_loads = pan_arch(gpu_id) >= 9,
5106       .native_offset = pan_arch(gpu_id) >= 9,
5107    };
5108    NIR_PASS_V(nir, nir_lower_ssbo, &ssbo_opts);
5109 
5110    NIR_PASS_V(nir, pan_lower_sample_pos);
5111    NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);
5112    NIR_PASS_V(nir, nir_lower_64bit_phis);
5113    NIR_PASS_V(nir, pan_lower_helper_invocation);
5114    NIR_PASS_V(nir, nir_lower_int64);
5115 
5116    NIR_PASS_V(nir, nir_opt_idiv_const, 8);
5117    NIR_PASS_V(nir, nir_lower_idiv,
5118               &(nir_lower_idiv_options){.allow_fp16 = true});
5119 
5120    NIR_PASS_V(nir, nir_lower_tex,
5121               &(nir_lower_tex_options){
5122                  .lower_txs_lod = true,
5123                  .lower_txp = ~0,
5124                  .lower_tg4_broadcom_swizzle = true,
5125                  .lower_txd = true,
5126                  .lower_invalid_implicit_lod = true,
5127                  .lower_index_to_offset = true,
5128               });
5129 
5130    NIR_PASS_V(nir, nir_lower_image_atomics_to_global);
5131 
5132    /* on bifrost, lower MSAA load/stores to 3D load/stores */
5133    if (pan_arch(gpu_id) < 9)
5134       NIR_PASS_V(nir, pan_nir_lower_image_ms);
5135 
5136    NIR_PASS_V(nir, nir_lower_alu_to_scalar, bi_scalarize_filter, NULL);
5137    NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
5138    NIR_PASS_V(nir, nir_lower_phis_to_scalar, true);
5139    NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
5140    NIR_PASS_V(nir, nir_lower_var_copies);
5141    NIR_PASS_V(nir, nir_lower_alu);
5142    NIR_PASS_V(nir, nir_lower_frag_coord_to_pixel_coord);
5143 }
5144 
5145 static bi_context *
5146 bi_compile_variant_nir(nir_shader *nir,
5147                        const struct panfrost_compile_inputs *inputs,
5148                        struct util_dynarray *binary, struct bi_shader_info info,
5149                        enum bi_idvs_mode idvs)
5150 {
5151    bi_context *ctx = rzalloc(NULL, bi_context);
5152 
5153    /* There may be another program in the dynarray, start at the end */
5154    unsigned offset = binary->size;
5155 
5156    ctx->inputs = inputs;
5157    ctx->nir = nir;
5158    ctx->stage = nir->info.stage;
5159    ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
5160    ctx->arch = pan_arch(inputs->gpu_id);
5161    ctx->info = info;
5162    ctx->idvs = idvs;
5163    ctx->malloc_idvs = (ctx->arch >= 9) && !inputs->no_idvs;
5164 
5165    if (idvs != BI_IDVS_NONE) {
5166       /* Specializing shaders for IDVS is destructive, so we need to
5167        * clone. However, the last (second) IDVS shader does not need
5168        * to be preserved so we can skip cloning that one.
5169        */
5170       if (offset == 0)
5171          ctx->nir = nir = nir_shader_clone(ctx, nir);
5172 
5173       NIR_PASS_V(nir, nir_shader_instructions_pass, bifrost_nir_specialize_idvs,
5174                  nir_metadata_control_flow, &idvs);
5175 
5176       /* After specializing, clean up the mess */
5177       bool progress = true;
5178 
5179       while (progress) {
5180          progress = false;
5181 
5182          NIR_PASS(progress, nir, nir_opt_dce);
5183          NIR_PASS(progress, nir, nir_opt_dead_cf);
5184       }
5185    }
5186 
5187    /* If nothing is pushed, all UBOs need to be uploaded */
5188    ctx->ubo_mask = ~0;
5189 
5190    list_inithead(&ctx->blocks);
5191 
5192    bool skip_internal = nir->info.internal;
5193    skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);
5194 
5195    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5196       nir_print_shader(nir, stdout);
5197    }
5198 
5199    ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
5200 
5201    nir_foreach_function_impl(impl, nir) {
5202       nir_index_blocks(impl);
5203 
5204       ctx->indexed_nir_blocks =
5205          rzalloc_array(ctx, bi_block *, impl->num_blocks);
5206 
5207       ctx->ssa_alloc += impl->ssa_alloc;
5208 
5209       emit_cf_list(ctx, &impl->body);
5210       bi_emit_phis_deferred(ctx);
5211       break; /* TODO: Multi-function shaders */
5212    }
5213 
5214    /* Index blocks now that we're done emitting */
5215    bi_foreach_block(ctx, block) {
5216       block->index = ctx->num_blocks++;
5217    }
5218 
5219    bi_validate(ctx, "NIR -> BIR");
5220 
5221    /* If the shader doesn't write any colour or depth outputs, it may
5222     * still need an ATEST at the very end! */
5223    bool need_dummy_atest = (ctx->stage == MESA_SHADER_FRAGMENT) &&
5224                            !ctx->emitted_atest && !bi_skip_atest(ctx, false);
5225 
5226    if (need_dummy_atest) {
5227       bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
5228       bi_builder b = bi_init_builder(ctx, bi_after_block(end));
5229       bi_emit_atest(&b, bi_zero());
5230    }
5231 
5232    bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);
5233 
5234    /* Runs before constant folding */
5235    bi_lower_swizzle(ctx);
5236    bi_validate(ctx, "Early lowering");
5237 
5238    /* Runs before copy prop */
5239    if (optimize && !ctx->inputs->no_ubo_to_push) {
5240       bi_opt_push_ubo(ctx);
5241    }
5242 
5243    if (likely(optimize)) {
5244       bi_opt_copy_prop(ctx);
5245 
5246       while (bi_opt_constant_fold(ctx))
5247          bi_opt_copy_prop(ctx);
5248 
5249       bi_opt_mod_prop_forward(ctx);
5250       bi_opt_mod_prop_backward(ctx);
5251 
5252       /* Push LD_VAR_IMM/VAR_TEX instructions. Must run after
5253        * mod_prop_backward to fuse VAR_TEX */
5254       if (ctx->arch == 7 && ctx->stage == MESA_SHADER_FRAGMENT &&
5255           !(bifrost_debug & BIFROST_DBG_NOPRELOAD)) {
5256          bi_opt_dce(ctx, false);
5257          bi_opt_message_preload(ctx);
5258          bi_opt_copy_prop(ctx);
5259       }
5260 
5261       bi_opt_dce(ctx, false);
5262       bi_opt_cse(ctx);
5263       bi_opt_dce(ctx, false);
5264       if (!ctx->inputs->no_ubo_to_push)
5265          bi_opt_reorder_push(ctx);
5266       bi_validate(ctx, "Optimization passes");
5267    }
5268 
5269    bi_lower_opt_instructions(ctx);
5270 
5271    if (ctx->arch >= 9) {
5272       va_optimize(ctx);
5273       va_lower_isel(ctx);
5274 
5275       bi_foreach_instr_global_safe(ctx, I) {
5276          /* Phis become single moves so shouldn't be affected */
5277          if (I->op == BI_OPCODE_PHI)
5278             continue;
5279 
5280          va_lower_constants(ctx, I);
5281 
5282          bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
5283          va_repair_fau(&b, I);
5284       }
5285 
5286       /* We need to clean up after constant lowering */
5287       if (likely(optimize)) {
5288          bi_opt_cse(ctx);
5289          bi_opt_dce(ctx, false);
5290       }
5291 
5292       bi_validate(ctx, "Valhall passes");
5293    }
5294 
5295    bi_foreach_block(ctx, block) {
5296       bi_lower_branch(ctx, block);
5297    }
5298 
5299    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5300       bi_print_shader(ctx, stdout);
5301 
5302    /* Analyze before register allocation to avoid false dependencies. The
5303     * skip bit is a function of only the data flow graph and is invariant
5304     * under valid scheduling. Helpers are only defined for fragment
5305     * shaders, so this analysis is only required in fragment shaders.
5306     */
5307    if (ctx->stage == MESA_SHADER_FRAGMENT) {
5308       bi_opt_dce(ctx, false);
5309       bi_analyze_helper_requirements(ctx);
5310    }
5311 
5312    /* Fuse TEXC after analyzing helper requirements so the analysis
5313     * doesn't have to know about dual textures */
5314    if (likely(optimize)) {
5315       bi_opt_fuse_dual_texture(ctx);
5316    }
5317 
5318    /* Lower FAU after fusing dual texture, because fusing dual texture
5319     * creates new immediates that themselves may need lowering.
5320     */
5321    if (ctx->arch <= 8) {
5322       bi_lower_fau(ctx);
5323    }
5324 
5325    /* Lowering FAU can create redundant moves. Run CSE+DCE to clean up. */
5326    if (likely(optimize)) {
5327       bi_opt_cse(ctx);
5328       bi_opt_dce(ctx, false);
5329    }
5330 
5331    bi_validate(ctx, "Late lowering");
5332 
5333    if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED))) {
5334       bi_pressure_schedule(ctx);
5335       bi_validate(ctx, "Pre-RA scheduling");
5336    }
5337 
5338    bi_register_allocate(ctx);
5339 
5340    if (likely(optimize))
5341       bi_opt_post_ra(ctx);
5342 
5343    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5344       bi_print_shader(ctx, stdout);
5345 
5346    if (ctx->arch >= 9) {
5347       va_assign_slots(ctx);
5348       va_insert_flow_control_nops(ctx);
5349       va_merge_flow(ctx);
5350       va_mark_last(ctx);
5351    } else {
5352       bi_schedule(ctx);
5353       bi_assign_scoreboard(ctx);
5354 
5355       /* Analyze after scheduling since we depend on instruction
5356        * order. Valhall calls as part of va_insert_flow_control_nops,
5357        * as the handling for clauses differs from instructions.
5358        */
5359       bi_analyze_helper_terminate(ctx);
5360       bi_mark_clauses_td(ctx);
5361    }
5362 
5363    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
5364       bi_print_shader(ctx, stdout);
5365 
5366    if (ctx->arch <= 8) {
5367       bi_pack_clauses(ctx, binary, offset);
5368    } else {
5369       bi_pack_valhall(ctx, binary);
5370    }
5371 
5372    if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
5373       if (ctx->arch <= 8) {
5374          disassemble_bifrost(stdout, binary->data + offset,
5375                              binary->size - offset,
5376                              bifrost_debug & BIFROST_DBG_VERBOSE);
5377       } else {
5378          disassemble_valhall(stdout, binary->data + offset,
5379                              binary->size - offset,
5380                              bifrost_debug & BIFROST_DBG_VERBOSE);
5381       }
5382 
5383       fflush(stdout);
5384    }
5385 
5386    if (!skip_internal &&
5387        ((bifrost_debug & BIFROST_DBG_SHADERDB) || inputs->debug)) {
5388       char *shaderdb;
5389 
5390       if (ctx->arch >= 9) {
5391          shaderdb = va_print_stats(ctx, binary->size - offset);
5392       } else {
5393          shaderdb = bi_print_stats(ctx, binary->size - offset);
5394       }
5395 
5396       if (bifrost_debug & BIFROST_DBG_SHADERDB)
5397          fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
5398 
5399       if (inputs->debug)
5400          util_debug_message(inputs->debug, SHADER_INFO, "%s", shaderdb);
5401 
5402       ralloc_free(shaderdb);
5403    }
5404 
5405    return ctx;
5406 }
5407 
5408 static void
5409 bi_compile_variant(nir_shader *nir,
5410                    const struct panfrost_compile_inputs *inputs,
5411                    struct util_dynarray *binary, struct pan_shader_info *info,
5412                    enum bi_idvs_mode idvs)
5413 {
5414    struct bi_shader_info local_info = {
5415       .push = &info->push,
5416       .bifrost = &info->bifrost,
5417       .tls_size = info->tls_size,
5418       .push_offset = info->push.count,
5419    };
5420 
5421    unsigned offset = binary->size;
5422 
5423    /* If there is no position shader (gl_Position is not written), then
5424     * there is no need to build a varying shader either. This case is hit
5425     * for transform feedback only vertex shaders which only make sense with
5426     * rasterizer discard.
5427     */
5428    if ((offset == 0) && (idvs == BI_IDVS_VARYING))
5429       return;
5430 
5431    /* Software invariant: Only a secondary shader can appear at a nonzero
5432     * offset, to keep the ABI simple. */
5433    assert((offset == 0) ^ (idvs == BI_IDVS_VARYING));
5434 
5435    bi_context *ctx =
5436       bi_compile_variant_nir(nir, inputs, binary, local_info, idvs);
5437 
5438    /* A register is preloaded <==> it is live before the first block */
5439    bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link);
5440    uint64_t preload = first_block->reg_live_in;
5441 
5442    /* If multisampling is used with a blend shader, the blend shader needs
5443     * to access the sample coverage mask in r60 and the sample ID in r61.
5444     * Blend shaders run in the same context as fragment shaders, so if a
5445     * blend shader could run, we need to preload these registers
5446     * conservatively. There is believed to be little cost to doing so, so
5447     * do so always to avoid variants of the preload descriptor.
5448     *
5449     * We only do this on Valhall, as Bifrost has to update the RSD for
5450     * multisampling w/ blend shader anyway, so this is handled in the
5451     * driver. We could unify the paths if the cost is acceptable.
5452     */
5453    if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
5454       preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
5455 
5456    info->ubo_mask |= ctx->ubo_mask;
5457    info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
5458 
5459    if (idvs == BI_IDVS_VARYING) {
5460       info->vs.secondary_enable = (binary->size > offset);
5461       info->vs.secondary_offset = offset;
5462       info->vs.secondary_preload = preload;
5463       info->vs.secondary_work_reg_count = ctx->info.work_reg_count;
5464    } else {
5465       info->preload = preload;
5466       info->work_reg_count = ctx->info.work_reg_count;
5467    }
5468 
5469    if (idvs == BI_IDVS_POSITION && !nir->info.internal &&
5470        nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ)) {
5471       /* Find the psiz write */
5472       bi_instr *write = NULL;
5473 
5474       bi_foreach_instr_global(ctx, I) {
5475          if (I->op == BI_OPCODE_STORE_I16 && I->seg == BI_SEG_POS) {
5476             write = I;
5477             break;
5478          }
5479       }
5480 
5481       assert(write != NULL);
5482 
5483       /* NOP it out, preserving its flow control. TODO: maybe DCE */
5484       if (write->flow) {
5485          bi_builder b = bi_init_builder(ctx, bi_before_instr(write));
5486          bi_instr *nop = bi_nop(&b);
5487          nop->flow = write->flow;
5488       }
5489 
5490       bi_remove_instruction(write);
5491 
5492       info->vs.no_psiz_offset = binary->size;
5493       bi_pack_valhall(ctx, binary);
5494    }
5495 
5496    ralloc_free(ctx);
5497 }
5498 
5499 /* Decide if Index-Driven Vertex Shading should be used for a given shader */
5500 static bool
5501 bi_should_idvs(nir_shader *nir, const struct panfrost_compile_inputs *inputs)
5502 {
5503    /* Opt-out */
5504    if (inputs->no_idvs || bifrost_debug & BIFROST_DBG_NOIDVS)
5505       return false;
5506 
5507    /* IDVS splits up vertex shaders, not defined on other shader stages */
5508    if (nir->info.stage != MESA_SHADER_VERTEX)
5509       return false;
5510 
5511    /* Bifrost cannot write gl_PointSize during IDVS */
5512    if ((inputs->gpu_id < 0x9000) &&
5513        nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ))
5514       return false;
5515 
5516    /* Otherwise, IDVS is usually better */
5517    return true;
5518 }
5519 
5520 void
5521 bifrost_compile_shader_nir(nir_shader *nir,
5522                            const struct panfrost_compile_inputs *inputs,
5523                            struct util_dynarray *binary,
5524                            struct pan_shader_info *info)
5525 {
5526    bifrost_debug = debug_get_option_bifrost_debug();
5527 
5528    /* Combine stores late, to give the driver a chance to lower dual-source
5529     * blending as regular store_output intrinsics.
5530     */
5531    NIR_PASS_V(nir, pan_nir_lower_zs_store);
5532 
5533    bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
5534 
5535    info->tls_size = nir->scratch_size;
5536    info->vs.idvs = bi_should_idvs(nir, inputs);
5537 
5538    pan_nir_collect_varyings(nir, info);
5539 
5540    if (info->vs.idvs) {
5541       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_POSITION);
5542       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_VARYING);
5543    } else {
5544       bi_compile_variant(nir, inputs, binary, info, BI_IDVS_NONE);
5545    }
5546 
5547    if (gl_shader_stage_is_compute(nir->info.stage)) {
5548       /* Workgroups may be merged if the structure of the workgroup is
5549        * not software visible. This is true if neither shared memory
5550        * nor barriers are used. The hardware may be able to optimize
5551        * compute shaders that set this flag.
5552        */
5553       info->cs.allow_merging_workgroups = (nir->info.shared_size == 0) &&
5554                                           !nir->info.uses_control_barrier &&
5555                                           !nir->info.uses_memory_barrier;
5556    }
5557 
5558    info->ubo_mask &= (1 << nir->info.num_ubos) - 1;
5559 }
5560