/*
 * Copyright © 2015 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include <stdarg.h>

#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "ir3_compiler.h"
#include "ir3_image.h"
#include "ir3_nir.h"
#include "ir3_shader.h"

#include "instr-a3xx.h"
#include "ir3.h"
#include "ir3_context.h"

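/* Replicate a single instruction across all nrpt slots of an
 * ir3_instruction_rpt group.
 */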
static struct ir3_instruction_rpt
rpt_instr(struct ir3_instruction *instr, unsigned nrpt)
{
   struct ir3_instruction_rpt dst = {{0}};

   for (unsigned i = 0; i < nrpt; ++i)
      dst.rpts[i] = instr;

   return dst;
}

static void
cp_instrs(struct ir3_instruction *dst[], struct ir3_instruction *instrs[],
          unsigned n)
{
   for (unsigned i = 0; i < n; ++i)
      dst[i] = instrs[i];
}

static struct ir3_instruction_rpt
create_immed_rpt(struct ir3_block *block, unsigned nrpt, unsigned val)
{
   return rpt_instr(create_immed(block, val), nrpt);
}

static struct ir3_instruction_rpt
create_immed_shared_rpt(struct ir3_block *block, unsigned nrpt, uint32_t val,
                        bool shared)
{
   return rpt_instr(create_immed_shared(block, val, shared), nrpt);
}

static struct ir3_instruction_rpt
create_immed_typed_rpt(struct ir3_block *block, unsigned nrpt, unsigned val,
                       type_t type)
{
   return rpt_instr(create_immed_typed(block, val, type), nrpt);
}

static inline struct ir3_instruction_rpt
create_immed_typed_shared_rpt(struct ir3_block *block, unsigned nrpt,
                              uint32_t val, type_t type, bool shared)
{
   return rpt_instr(create_immed_typed_shared(block, val, type, shared), nrpt);
}

static void
set_instr_flags(struct ir3_instruction *instrs[], unsigned n,
                ir3_instruction_flags flags)
{
   for (unsigned i = 0; i < n; ++i)
      instrs[i]->flags |= flags;
}

static void
set_cat1_round(struct ir3_instruction *instrs[], unsigned n, round_t round)
{
   for (unsigned i = 0; i < n; ++i)
      instrs[i]->cat1.round = round;
}

static void
set_cat2_condition(struct ir3_instruction *instrs[], unsigned n,
                   unsigned condition)
{
   for (unsigned i = 0; i < n; ++i)
      instrs[i]->cat2.condition = condition;
}

static void
set_dst_flags(struct ir3_instruction *instrs[], unsigned n,
              ir3_register_flags flags)
{
   for (unsigned i = 0; i < n; ++i)
      instrs[i]->dsts[0]->flags |= flags;
}

void
ir3_handle_nonuniform(struct ir3_instruction *instr,
                      nir_intrinsic_instr *intrin)
{
   if (nir_intrinsic_has_access(intrin) &&
       (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)) {
      instr->flags |= IR3_INSTR_NONUNIF;
   }
}

void
ir3_handle_bindless_cat6(struct ir3_instruction *instr, nir_src rsrc)
{
   nir_intrinsic_instr *intrin = ir3_bindless_resource(rsrc);
   if (!intrin)
      return;

   instr->flags |= IR3_INSTR_B;
   instr->cat6.base = nir_intrinsic_desc_set(intrin);
}

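/* Create an input meta instruction covering the components in compmask and
 * record it in the shader's input list.
 */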
static struct ir3_instruction *
create_input(struct ir3_context *ctx, unsigned compmask)
{
   struct ir3_instruction *in;

   in = ir3_instr_create(ctx->in_block, OPC_META_INPUT, 1, 0);
   in->input.sysval = ~0;
   __ssa_dst(in)->wrmask = compmask;

   array_insert(ctx->ir, ctx->ir->inputs, in);

   return in;
}

static struct ir3_instruction_rpt
create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
                  unsigned n, unsigned ncomp)
{
   struct ir3_block *block = ctx->block;
   struct ir3_instruction_rpt instr;
   /* packed inloc is fixed up later: */
   struct ir3_instruction_rpt inloc;

   for (unsigned i = 0; i < ncomp; i++)
      inloc.rpts[i] = create_immed(block, n + i);

   if (coord) {
      instr =
         ir3_BARY_F_rpt(block, ncomp, inloc, 0, rpt_instr(coord, ncomp), 0);
   } else if (ctx->compiler->flat_bypass) {
      if (ctx->compiler->gen >= 6) {
         instr = ir3_FLAT_B_rpt(block, ncomp, inloc, 0, inloc, 0);
      } else {
         for (unsigned i = 0; i < ncomp; i++) {
            instr.rpts[i] =
               ir3_LDLV(block, inloc.rpts[i], 0, create_immed(block, 1), 0);
            instr.rpts[i]->cat6.type = TYPE_U32;
            instr.rpts[i]->cat6.iim_val = 1;
         }
      }
   } else {
      instr = ir3_BARY_F_rpt(block, ncomp, inloc, 0,
                             rpt_instr(ctx->ij[IJ_PERSP_PIXEL], ncomp), 0);

      for (unsigned i = 0; i < ncomp; i++)
         instr.rpts[i]->srcs[1]->wrmask = 0x3;
   }

   return instr;
}

static struct ir3_instruction *
create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
{
   /* first four vec4 sysvals reserved for UBOs: */
   /* NOTE: dp is in scalar, but there can be >4 dp components: */
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned n = const_state->offsets.driver_param;
   unsigned r = regid(n + dp / 4, dp % 4);
   return create_uniform(ctx->block, r);
}

static struct ir3_instruction *
create_driver_param_indirect(struct ir3_context *ctx, enum ir3_driver_param dp,
                             struct ir3_instruction *address)
{
   /* first four vec4 sysvals reserved for UBOs: */
   /* NOTE: dp is in scalar, but there can be >4 dp components: */
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned n = const_state->offsets.driver_param;
   return create_uniform_indirect(ctx->block, n * 4 + dp, TYPE_U32, address);
}

/*
 * Adreno's comparisons produce a 1 for true and 0 for false, in either 16 or
 * 32-bit registers.  We use NIR's 1-bit integers to represent bools, and
 * trust that we will only see and/or/xor on those 1-bit values, so we can
 * safely store NIR i1s in a 32-bit reg while always containing either a 1 or
 * 0.
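 *
 * (E.g. and.b/or.b/xor.b of two such 0/1 values yields another 0/1 value, so
 * no re-canonicalization is needed.)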
 */

/*
 * alu/sfu instructions:
 */

static struct ir3_instruction_rpt
create_cov(struct ir3_context *ctx, unsigned nrpt,
           struct ir3_instruction_rpt src, unsigned src_bitsize, nir_op op)
{
   type_t src_type, dst_type;

   switch (op) {
   case nir_op_f2f32:
   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_f2i32:
   case nir_op_f2i16:
   case nir_op_f2i8:
   case nir_op_f2u32:
   case nir_op_f2u16:
   case nir_op_f2u8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_F32;
         break;
      case 16:
         src_type = TYPE_F16;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_i2f32:
   case nir_op_i2f16:
   case nir_op_i2i32:
   case nir_op_i2i16:
   case nir_op_i2i8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_S32;
         break;
      case 16:
         src_type = TYPE_S16;
         break;
      case 8:
         src_type = TYPE_U8;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_u2f32:
   case nir_op_u2f16:
   case nir_op_u2u32:
   case nir_op_u2u16:
   case nir_op_u2u8:
      switch (src_bitsize) {
      case 32:
         src_type = TYPE_U32;
         break;
      case 16:
         src_type = TYPE_U16;
         break;
      case 8:
         src_type = TYPE_U8;
         break;
      default:
         ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
      }
      break;

   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
      src_type = ctx->compiler->bool_type;
      break;

   default:
      ir3_context_error(ctx, "invalid conversion op: %u", op);
   }

   switch (op) {
   case nir_op_f2f32:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_b2f32:
      dst_type = TYPE_F32;
      break;

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_b2f16:
      dst_type = TYPE_F16;
      break;

   case nir_op_f2i32:
   case nir_op_i2i32:
   case nir_op_b2i32:
      dst_type = TYPE_S32;
      break;

   case nir_op_f2i16:
   case nir_op_i2i16:
   case nir_op_b2i16:
      dst_type = TYPE_S16;
      break;

   case nir_op_f2i8:
   case nir_op_i2i8:
   case nir_op_b2i8:
      dst_type = TYPE_U8;
      break;

   case nir_op_f2u32:
   case nir_op_u2u32:
      dst_type = TYPE_U32;
      break;

   case nir_op_f2u16:
   case nir_op_u2u16:
      dst_type = TYPE_U16;
      break;

   case nir_op_f2u8:
   case nir_op_u2u8:
      dst_type = TYPE_U8;
      break;

   default:
      ir3_context_error(ctx, "invalid conversion op: %u", op);
   }

   if (src_type == dst_type)
      return src;

   /* Zero-extension of 8-bit values doesn't work with `cov`, so simple masking
    * is used to achieve the result.
    */
   if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_U32) {
      struct ir3_instruction_rpt mask =
         create_immed_typed_rpt(ctx->block, nrpt, 0xff, TYPE_U8);
      struct ir3_instruction_rpt cov =
         ir3_AND_B_rpt(ctx->block, nrpt, src, 0, mask, 0);
      set_dst_flags(cov.rpts, nrpt, type_flags(dst_type));
      return cov;
   }

   /* Conversion of 8-bit values into floating-point values doesn't work with
    * a simple `cov`; instead, the 8-bit values first have to be converted into
    * corresponding 16-bit values and converted from there.
    */
   if (src_type == TYPE_U8 && full_type(dst_type) == TYPE_F32) {
      assert(op == nir_op_u2f16 || op == nir_op_i2f16 ||
             op == nir_op_u2f32 || op == nir_op_i2f32);

      struct ir3_instruction_rpt cov;
      if (op == nir_op_u2f16 || op == nir_op_u2f32) {
         struct ir3_instruction_rpt mask =
            create_immed_typed_rpt(ctx->block, nrpt, 0xff, TYPE_U8);
         cov = ir3_AND_B_rpt(ctx->block, nrpt, src, 0, mask, 0);
         set_dst_flags(cov.rpts, nrpt, IR3_REG_HALF);
         cov = ir3_COV_rpt(ctx->block, nrpt, cov, TYPE_U16, dst_type);
      } else {
         cov = ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U8, TYPE_S16);
         cov = ir3_COV_rpt(ctx->block, nrpt, cov, TYPE_S16, dst_type);
      }
      return cov;
   }

   /* Conversion of floating-point values to 8-bit values also doesn't work
    * through a single `cov`; instead, the conversion has to go through the
    * corresponding 16-bit type that's then truncated.
    */
   if (full_type(src_type) == TYPE_F32 && dst_type == TYPE_U8) {
      assert(op == nir_op_f2u8 || op == nir_op_f2i8);

      type_t intermediate_type = op == nir_op_f2u8 ? TYPE_U16 : TYPE_S16;
      struct ir3_instruction_rpt cov =
         ir3_COV_rpt(ctx->block, nrpt, src, src_type, intermediate_type);
      cov = ir3_COV_rpt(ctx->block, nrpt, cov, intermediate_type, TYPE_U8);
      return cov;
   }

   struct ir3_instruction_rpt cov =
      ir3_COV_rpt(ctx->block, nrpt, src, src_type, dst_type);

   if (op == nir_op_f2f16_rtne) {
      set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
   } else if (op == nir_op_f2f16_rtz) {
      set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
   } else if (dst_type == TYPE_F16 || dst_type == TYPE_F32) {
      unsigned execution_mode = ctx->s->info.float_controls_execution_mode;
      nir_alu_type type =
         dst_type == TYPE_F16 ? nir_type_float16 : nir_type_float32;
      nir_rounding_mode rounding_mode =
         nir_get_rounding_mode_from_float_controls(execution_mode, type);
      if (rounding_mode == nir_rounding_mode_rtne)
         set_cat1_round(cov.rpts, nrpt, ROUND_EVEN);
      else if (rounding_mode == nir_rounding_mode_rtz)
         set_cat1_round(cov.rpts, nrpt, ROUND_ZERO);
   }

   return cov;
}

/* For shift instructions NIR always has the shift amount as a 32-bit integer */
static struct ir3_instruction_rpt
resize_shift_amount(struct ir3_context *ctx, unsigned nrpt,
                    struct ir3_instruction_rpt src, unsigned bs)
{
   if (bs == 16)
      return ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U32, TYPE_U16);
   else if (bs == 8)
      return ir3_COV_rpt(ctx->block, nrpt, src, TYPE_U32, TYPE_U8);
   else
      return src;
}

static void
emit_alu_dot_4x8_as_dp4acc(struct ir3_context *ctx, nir_alu_instr *alu,
                           struct ir3_instruction **dst,
                           struct ir3_instruction **src)
{
   if (ctx->compiler->has_compliant_dp4acc) {
      dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, src[2], 0);

      /* This is actually the LHS signedness attribute.
       * IR3_SRC_UNSIGNED ~ unsigned LHS (i.e. OpUDot and OpUDotAccSat).
       */
      if (alu->op == nir_op_udot_4x8_uadd ||
          alu->op == nir_op_udot_4x8_uadd_sat) {
         dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
      } else {
         dst[0]->cat3.signedness = IR3_SRC_MIXED;
      }

      /* This is actually the RHS signedness attribute.
       * IR3_SRC_PACKED_HIGH ~ signed RHS (i.e. OpSDot and OpSDotAccSat).
       */
      if (alu->op == nir_op_sdot_4x8_iadd ||
          alu->op == nir_op_sdot_4x8_iadd_sat) {
         dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
      } else {
         dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
      }

      if (alu->op == nir_op_udot_4x8_uadd_sat ||
          alu->op == nir_op_sdot_4x8_iadd_sat ||
          alu->op == nir_op_sudot_4x8_iadd_sat) {
         dst[0]->flags |= IR3_INSTR_SAT;
      }
      return;
   }

   struct ir3_instruction *accumulator = NULL;
   if (alu->op == nir_op_udot_4x8_uadd_sat) {
      accumulator = create_immed(ctx->block, 0);
   } else {
      accumulator = src[2];
   }

   dst[0] = ir3_DP4ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);

   if (alu->op == nir_op_udot_4x8_uadd ||
       alu->op == nir_op_udot_4x8_uadd_sat) {
      dst[0]->cat3.signedness = IR3_SRC_UNSIGNED;
   } else {
      dst[0]->cat3.signedness = IR3_SRC_MIXED;
   }

   /* For some reason (sat) doesn't work in the unsigned case, so
    * we have to emulate it.
    */
   if (alu->op == nir_op_udot_4x8_uadd_sat) {
      dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
   } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
      dst[0]->flags |= IR3_INSTR_SAT;
   }
}

static void
emit_alu_dot_4x8_as_dp2acc(struct ir3_context *ctx, nir_alu_instr *alu,
                           struct ir3_instruction **dst,
                           struct ir3_instruction **src)
{
   int signedness;
   if (alu->op == nir_op_udot_4x8_uadd ||
       alu->op == nir_op_udot_4x8_uadd_sat) {
      signedness = IR3_SRC_UNSIGNED;
   } else {
      signedness = IR3_SRC_MIXED;
   }

   struct ir3_instruction *accumulator = NULL;
   if (alu->op == nir_op_udot_4x8_uadd_sat ||
       alu->op == nir_op_sudot_4x8_iadd_sat) {
      accumulator = create_immed(ctx->block, 0);
   } else {
      accumulator = src[2];
   }

   dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, accumulator, 0);
   dst[0]->cat3.packed = IR3_SRC_PACKED_LOW;
   dst[0]->cat3.signedness = signedness;

   dst[0] = ir3_DP2ACC(ctx->block, src[0], 0, src[1], 0, dst[0], 0);
   dst[0]->cat3.packed = IR3_SRC_PACKED_HIGH;
   dst[0]->cat3.signedness = signedness;

   if (alu->op == nir_op_udot_4x8_uadd_sat) {
      dst[0] = ir3_ADD_U(ctx->block, dst[0], 0, src[2], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
   } else if (alu->op == nir_op_sudot_4x8_iadd_sat) {
      dst[0] = ir3_ADD_S(ctx->block, dst[0], 0, src[2], 0);
      dst[0]->flags |= IR3_INSTR_SAT;
   }
}

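/* Check whether every instruction in instrs[] supports the (sat) destination
 * modifier, so an fsat can be folded into them.
 */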
static bool
all_sat_compatible(struct ir3_instruction *instrs[], unsigned n)
{
   for (unsigned i = 0; i < n; i++) {
      if (!is_sat_compatible(instrs[i]->opc))
         return false;
   }

   return true;
}

/* Is src the only use of its def, taking components into account. */
static bool
is_unique_use(nir_src *src)
{
   nir_def *def = src->ssa;

   if (list_is_singular(&def->uses))
      return true;

   nir_component_mask_t src_read_mask = nir_src_components_read(src);

   nir_foreach_use (use, def) {
      if (use == src)
         continue;

      if (nir_src_components_read(use) & src_read_mask)
         return false;
   }

   return true;
}

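/* Emit a NIR ALU instruction. Vectorized NIR ops are emitted as ir3 repeat
 * (rpt) groups, with each source gathered per-component via its swizzle.
 */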
static void
emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   struct ir3_instruction_rpt dst, src[info->num_inputs];
   unsigned bs[info->num_inputs]; /* bit size */
   struct ir3_block *b = ctx->block;
   unsigned dst_sz;
   unsigned dst_bitsize = ir3_bitsize(ctx, alu->def.bit_size);
   type_t dst_type = type_uint_size(dst_bitsize);

   dst_sz = alu->def.num_components;
   assert(dst_sz == 1 || ir3_supports_vectorized_nir_op(alu->op));

   bool use_shared = !alu->def.divergent &&
      ctx->compiler->has_scalar_alu &&
      /* it probably isn't worth emulating these with scalar-only ops */
      alu->op != nir_op_udot_4x8_uadd &&
      alu->op != nir_op_udot_4x8_uadd_sat &&
      alu->op != nir_op_sdot_4x8_iadd &&
      alu->op != nir_op_sdot_4x8_iadd_sat &&
      alu->op != nir_op_sudot_4x8_iadd &&
      alu->op != nir_op_sudot_4x8_iadd_sat &&
      /* not supported in HW, we have to fall back to normal registers */
      alu->op != nir_op_ffma;

   struct ir3_instruction **def = ir3_get_def(ctx, &alu->def, dst_sz);

   /* Vectors are special in that they have non-scalarized writemasks,
    * and just take the first swizzle channel for each argument in
    * order into each writemask channel.
    */
   if ((alu->op == nir_op_vec2) || (alu->op == nir_op_vec3) ||
       (alu->op == nir_op_vec4) || (alu->op == nir_op_vec8) ||
       (alu->op == nir_op_vec16)) {
      for (int i = 0; i < info->num_inputs; i++) {
         nir_alu_src *asrc = &alu->src[i];
         struct ir3_instruction *src =
            ir3_get_src_shared(ctx, &asrc->src, use_shared)[asrc->swizzle[0]];
         compile_assert(ctx, src);
         def[i] = ir3_MOV(b, src, dst_type);
      }

      ir3_instr_create_rpt(def, info->num_inputs);
      ir3_put_def(ctx, &alu->def);
      return;
   }

   assert(dst_sz <= ARRAY_SIZE(src[0].rpts));

   for (int i = 0; i < info->num_inputs; i++) {
      nir_alu_src *asrc = &alu->src[i];
      struct ir3_instruction *const *input_src =
         ir3_get_src_shared(ctx, &asrc->src, use_shared);
      bs[i] = nir_src_bit_size(asrc->src);

      for (unsigned rpt = 0; rpt < dst_sz; rpt++) {
         src[i].rpts[rpt] = input_src[asrc->swizzle[rpt]];
         compile_assert(ctx, src[i].rpts[rpt]);
      }
   }

   switch (alu->op) {
   case nir_op_mov:
      dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
      break;

   case nir_op_f2f32:
   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16:
   case nir_op_f2i32:
   case nir_op_f2i16:
   case nir_op_f2i8:
   case nir_op_f2u32:
   case nir_op_f2u16:
   case nir_op_f2u8:
   case nir_op_i2f32:
   case nir_op_i2f16:
   case nir_op_i2i32:
   case nir_op_i2i16:
   case nir_op_i2i8:
   case nir_op_u2f32:
   case nir_op_u2f16:
   case nir_op_u2u32:
   case nir_op_u2u16:
   case nir_op_u2u8:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
      dst = create_cov(ctx, dst_sz, src[0], bs[0], alu->op);
      break;

   case nir_op_fquantize2f16:
      dst = create_cov(ctx, dst_sz,
                       create_cov(ctx, dst_sz, src[0], 32, nir_op_f2f16_rtne),
                       16, nir_op_f2f32);
      break;

   case nir_op_b2b1:
      /* b2b1 will appear when translating from
       *
       * - nir_intrinsic_load_shared of a 32-bit 0/~0 value.
       * - nir_intrinsic_load_constant of a 32-bit 0/~0 value
       *
       * A negate can turn those into a 1 or 0 for us.
       */
      dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
      break;

   case nir_op_b2b32:
      /* b2b32 will appear when converting our 1-bit bools to a store_shared
       * argument.
       *
       * A negate can turn those into a ~0 for us.
       */
      dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
      break;

   case nir_op_fneg:
      dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FNEG);
      break;
   case nir_op_fabs:
      dst = ir3_ABSNEG_F_rpt(b, dst_sz, src[0], IR3_REG_FABS);
      break;
   case nir_op_fmax:
      dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_fmin:
      dst = ir3_MIN_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_fsat:
      /* if there is just a single use of the src, and it supports
       * (sat) bit, we can just fold the (sat) flag back to the
       * src instruction and create a mov.  This is easier for cp
       * to eliminate.
       */
      if (all_sat_compatible(src[0].rpts, dst_sz) &&
          is_unique_use(&alu->src[0].src)) {
         set_instr_flags(src[0].rpts, dst_sz, IR3_INSTR_SAT);
         dst = ir3_MOV_rpt(b, dst_sz, src[0], dst_type);
      } else {
         /* otherwise generate a max.f that saturates.. blob does
          * similar (generating a cat2 mov using max.f)
          */
         dst = ir3_MAX_F_rpt(b, dst_sz, src[0], 0, src[0], 0);
         set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      }
      break;
   case nir_op_fmul:
      dst = ir3_MUL_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_fadd:
      dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_fsub:
      dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], IR3_REG_FNEG);
      break;
   case nir_op_ffma:
      dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
      break;
   case nir_op_flt:
      dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
      break;
   case nir_op_fge:
      dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
      break;
   case nir_op_feq:
      dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
      break;
   case nir_op_fneu:
      dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
      break;
   case nir_op_fceil:
      dst = ir3_CEIL_F_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_ffloor:
      dst = ir3_FLOOR_F_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_ftrunc:
      dst = ir3_TRUNC_F_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_fround_even:
      dst = ir3_RNDNE_F_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_fsign:
      dst = ir3_SIGN_F_rpt(b, dst_sz, src[0], 0);
      break;

   case nir_op_fsin:
      dst = ir3_SIN_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_fcos:
      dst = ir3_COS_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_frsq:
      dst = ir3_RSQ_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_frcp:
      assert(dst_sz == 1);
      dst.rpts[0] = ir3_RCP(b, src[0].rpts[0], 0);
      break;
   case nir_op_flog2:
      dst = ir3_LOG2_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_fexp2:
      dst = ir3_EXP2_rpt(b, dst_sz, src[0], 0);
      break;
   case nir_op_fsqrt:
      dst = ir3_SQRT_rpt(b, dst_sz, src[0], 0);
      break;

   case nir_op_iabs:
      dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SABS);
      break;
   case nir_op_iadd:
      dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_ihadd:
      dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
      break;
   case nir_op_uhadd:
      dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_dst_flags(dst.rpts, dst_sz, IR3_REG_EI);
      break;
   case nir_op_iand:
      dst = ir3_AND_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_imax:
      dst = ir3_MAX_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_umax:
      dst = ir3_MAX_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_imin:
      dst = ir3_MIN_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_umin:
      dst = ir3_MIN_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_umul_low:
      dst = ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_imadsh_mix16:
      if (use_shared) {
         struct ir3_instruction_rpt sixteen =
            create_immed_shared_rpt(b, dst_sz, 16, true);
         struct ir3_instruction_rpt src1 =
            ir3_SHR_B_rpt(b, dst_sz, src[1], 0, sixteen, 0);
         struct ir3_instruction_rpt mul =
            ir3_MULL_U_rpt(b, dst_sz, src[0], 0, src1, 0);
         dst = ir3_ADD_U_rpt(b, dst_sz,
                             ir3_SHL_B_rpt(b, dst_sz, mul, 0, sixteen, 0), 0,
                             src[2], 0);
      } else {
         dst = ir3_MADSH_M16_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
      }
      break;
   case nir_op_imad24_ir3:
      if (use_shared) {
         dst = ir3_ADD_U_rpt(b, dst_sz,
                             ir3_MUL_U24_rpt(b, dst_sz, src[0], 0, src[1], 0),
                             0, src[2], 0);
      } else {
         dst = ir3_MAD_S24_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
      }
      break;
   case nir_op_imul:
      compile_assert(ctx, alu->def.bit_size == 8 || alu->def.bit_size == 16);
      dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_imul24:
      dst = ir3_MUL_S24_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_ineg:
      dst = ir3_ABSNEG_S_rpt(b, dst_sz, src[0], IR3_REG_SNEG);
      break;
   case nir_op_inot:
      if (bs[0] == 1) {
         struct ir3_instruction_rpt one = create_immed_typed_shared_rpt(
            ctx->block, dst_sz, 1, ctx->compiler->bool_type, use_shared);
         dst = ir3_SUB_U_rpt(b, dst_sz, one, 0, src[0], 0);
      } else {
         dst = ir3_NOT_B_rpt(ctx->block, dst_sz, src[0], 0);
      }
      break;
   case nir_op_ior:
      dst = ir3_OR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_ishl:
      dst = ir3_SHL_B_rpt(ctx->block, dst_sz, src[0], 0,
                          resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
      break;
   case nir_op_ishr:
      dst = ir3_ASHR_B_rpt(ctx->block, dst_sz, src[0], 0,
                           resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
      break;
   case nir_op_isub:
      dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_ixor:
      dst = ir3_XOR_B_rpt(b, dst_sz, src[0], 0, src[1], 0);
      break;
   case nir_op_ushr:
      dst = ir3_SHR_B_rpt(ctx->block, dst_sz, src[0], 0,
                          resize_shift_amount(ctx, dst_sz, src[1], bs[0]), 0);
      break;
   case nir_op_ilt:
      dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
      break;
   case nir_op_ige:
      dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
      break;
   case nir_op_ieq:
      dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_EQ);
      break;
   case nir_op_ine:
      dst = ir3_CMPS_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_NE);
      break;
   case nir_op_ult:
      dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_LT);
      break;
   case nir_op_uge:
      dst = ir3_CMPS_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_cat2_condition(dst.rpts, dst_sz, IR3_COND_GE);
      break;

   case nir_op_bcsel: {
      struct ir3_instruction_rpt conds;

      compile_assert(ctx, bs[1] == bs[2]);

      /* TODO: repeat the covs when possible. */
      for (unsigned rpt = 0; rpt < dst_sz; ++rpt) {
         struct ir3_instruction *cond =
            ir3_get_cond_for_nonzero_compare(src[0].rpts[rpt]);

         /* The condition's size has to match the other two arguments' size, so
          * convert down if necessary.
          *
          * Single hashtable is fine, because the conversion will either be
          * 16->32 or 32->16, but never both
          */
         if (is_half(src[1].rpts[rpt]) != is_half(cond)) {
            struct hash_entry *prev_entry = _mesa_hash_table_search(
               ctx->sel_cond_conversions, src[0].rpts[rpt]);
            if (prev_entry) {
               cond = prev_entry->data;
            } else {
               if (is_half(cond)) {
                  if (bs[0] == 8) {
                     /* Zero-extension of an 8-bit value has to be done through
                      * masking, as in create_cov.
                      */
                     struct ir3_instruction *mask =
                        create_immed_typed(b, 0xff, TYPE_U8);
                     cond = ir3_AND_B(b, cond, 0, mask, 0);
                  } else {
                     cond = ir3_COV(b, cond, TYPE_U16, TYPE_U32);
                  }
               } else {
                  cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
               }
               _mesa_hash_table_insert(ctx->sel_cond_conversions,
                                       src[0].rpts[rpt], cond);
            }
         }
         conds.rpts[rpt] = cond;
      }

      if (is_half(src[1].rpts[0]))
         dst = ir3_SEL_B16_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
      else
         dst = ir3_SEL_B32_rpt(b, dst_sz, src[1], 0, conds, 0, src[2], 0);
      break;
   }
   case nir_op_bit_count: {
      if (ctx->compiler->gen < 5 ||
          (src[0].rpts[0]->dsts[0]->flags & IR3_REG_HALF)) {
         dst = ir3_CBITS_B_rpt(b, dst_sz, src[0], 0);
         break;
      }

      // We need to do this 16b at a time on a5xx+a6xx.  Once half-precision
      // support is in place, this should probably move to a NIR lowering pass:
      struct ir3_instruction_rpt hi, lo;

      hi = ir3_COV_rpt(
         b, dst_sz,
         ir3_SHR_B_rpt(b, dst_sz, src[0], 0,
                       create_immed_shared_rpt(b, dst_sz, 16, use_shared), 0),
         TYPE_U32, TYPE_U16);
      lo = ir3_COV_rpt(b, dst_sz, src[0], TYPE_U32, TYPE_U16);

      hi = ir3_CBITS_B_rpt(b, dst_sz, hi, 0);
      lo = ir3_CBITS_B_rpt(b, dst_sz, lo, 0);

      // TODO maybe the builders should default to making dst half-precision
      // if the srcs were half precision, to make this less awkward.. otoh
      // we should probably just do this lowering in NIR.
      set_dst_flags(hi.rpts, dst_sz, IR3_REG_HALF);
      set_dst_flags(lo.rpts, dst_sz, IR3_REG_HALF);

      dst = ir3_ADD_S_rpt(b, dst_sz, hi, 0, lo, 0);
      set_dst_flags(dst.rpts, dst_sz, IR3_REG_HALF);
      dst = ir3_COV_rpt(b, dst_sz, dst, TYPE_U16, TYPE_U32);
      break;
   }
   case nir_op_ifind_msb: {
      struct ir3_instruction_rpt cmp;
      dst = ir3_CLZ_S_rpt(b, dst_sz, src[0], 0);
      cmp =
         ir3_CMPS_S_rpt(b, dst_sz, dst, 0,
                        create_immed_shared_rpt(b, dst_sz, 0, use_shared), 0);
      set_cat2_condition(cmp.rpts, dst_sz, IR3_COND_GE);
      dst = ir3_SEL_B32_rpt(
         b, dst_sz,
         ir3_SUB_U_rpt(b, dst_sz,
                       create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
                       dst, 0),
         0, cmp, 0, dst, 0);
      break;
   }
   case nir_op_ufind_msb:
      dst = ir3_CLZ_B_rpt(b, dst_sz, src[0], 0);
      dst = ir3_SEL_B32_rpt(
         b, dst_sz,
         ir3_SUB_U_rpt(b, dst_sz,
                       create_immed_shared_rpt(b, dst_sz, 31, use_shared), 0,
                       dst, 0),
         0, src[0], 0, dst, 0);
      break;
   case nir_op_find_lsb:
      dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
      dst = ir3_CLZ_B_rpt(b, dst_sz, dst, 0);
      break;
   case nir_op_bitfield_reverse:
      dst = ir3_BFREV_B_rpt(b, dst_sz, src[0], 0);
      break;

   case nir_op_uadd_sat:
      dst = ir3_ADD_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      break;
   case nir_op_iadd_sat:
      dst = ir3_ADD_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      break;
   case nir_op_usub_sat:
      dst = ir3_SUB_U_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      break;
   case nir_op_isub_sat:
      dst = ir3_SUB_S_rpt(b, dst_sz, src[0], 0, src[1], 0);
      set_instr_flags(dst.rpts, dst_sz, IR3_INSTR_SAT);
      break;

   case nir_op_udot_4x8_uadd:
   case nir_op_udot_4x8_uadd_sat:
   case nir_op_sdot_4x8_iadd:
   case nir_op_sdot_4x8_iadd_sat:
   case nir_op_sudot_4x8_iadd:
   case nir_op_sudot_4x8_iadd_sat: {
      assert(dst_sz == 1);

      struct ir3_instruction *src_rpt0[] = {src[0].rpts[0], src[1].rpts[0],
                                            src[2].rpts[0]};

      if (ctx->compiler->has_dp4acc) {
         emit_alu_dot_4x8_as_dp4acc(ctx, alu, dst.rpts, src_rpt0);
      } else if (ctx->compiler->has_dp2acc) {
         emit_alu_dot_4x8_as_dp2acc(ctx, alu, dst.rpts, src_rpt0);
      } else {
         ir3_context_error(ctx, "ALU op should have been lowered: %s\n",
                           nir_op_infos[alu->op].name);
      }

      break;
   }

   default:
      ir3_context_error(ctx, "Unhandled ALU op: %s\n",
                        nir_op_infos[alu->op].name);
      break;
   }

   if (nir_alu_type_get_base_type(info->output_type) == nir_type_bool) {
      assert(alu->def.bit_size == 1 || alu->op == nir_op_b2b32);
   } else {
      /* 1-bit values stored in 32-bit registers are only valid for certain
       * ALU ops.
       */
      switch (alu->op) {
      case nir_op_mov:
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_ixor:
      case nir_op_inot:
      case nir_op_bcsel:
         break;
      default:
         compile_assert(ctx, alu->def.bit_size != 1);
      }
   }

   cp_instrs(def, dst.rpts, dst_sz);
   ir3_put_def(ctx, &alu->def);
}

static void
emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                            struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;

   /* This is only generated for us by nir_lower_ubo_vec4, which leaves base =
    * 0.
    */
   assert(nir_intrinsic_base(intr) == 0);

   unsigned ncomp = intr->num_components;
   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
   struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
   ldc->dsts[0]->wrmask = MASK(ncomp);
   ldc->cat6.iim_val = ncomp;
   ldc->cat6.d = nir_intrinsic_component(intr);
   ldc->cat6.type = utype_def(&intr->def);

   ir3_handle_bindless_cat6(ldc, intr->src[0]);
   if (ldc->flags & IR3_INSTR_B)
      ctx->so->bindless_ubo = true;
   ir3_handle_nonuniform(ldc, intr);

   if (!intr->def.divergent &&
       ctx->compiler->has_scalar_alu) {
      ldc->dsts[0]->flags |= IR3_REG_SHARED;
      ldc->flags |= IR3_INSTR_U;
   }

   ir3_split_dest(b, dst, ldc, 0, ncomp);
}

static void
emit_intrinsic_copy_ubo_to_uniform(struct ir3_context *ctx,
                                   nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;

   unsigned base = nir_intrinsic_base(intr);
   unsigned size = nir_intrinsic_range(intr);

   struct ir3_instruction *addr1 = ir3_get_addr1(ctx, base);

   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
   struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
   struct ir3_instruction *ldc = ir3_LDC_K(b, idx, 0, offset, 0);
   ldc->cat6.iim_val = size;
   ldc->barrier_class = ldc->barrier_conflict = IR3_BARRIER_CONST_W;

   ir3_handle_bindless_cat6(ldc, intr->src[0]);
   if (ldc->flags & IR3_INSTR_B)
      ctx->so->bindless_ubo = true;

   ir3_instr_set_address(ldc, addr1);

   /* The assembler isn't aware of what value a1.x has, so make sure that
    * constlen includes the ldc.k here.
    */
   ctx->so->constlen =
      MAX2(ctx->so->constlen, DIV_ROUND_UP(base + size * 4, 4));

   array_insert(b, b->keeps, ldc);
}

static void
emit_intrinsic_copy_global_to_uniform(struct ir3_context *ctx,
                                      nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;

   unsigned size = nir_intrinsic_range(intr);
   unsigned dst = nir_intrinsic_range_base(intr);
   unsigned addr_offset = nir_intrinsic_base(intr);
   unsigned dst_lo = dst & 0xff;
   unsigned dst_hi = dst >> 8;

   struct ir3_instruction *a1 = NULL;
   if (dst_hi)
      a1 = ir3_get_addr1(ctx, dst_hi << 8);

   struct ir3_instruction *addr_lo = ir3_get_src(ctx, &intr->src[0])[0];
   struct ir3_instruction *addr_hi = ir3_get_src(ctx, &intr->src[0])[1];
   struct ir3_instruction *addr = ir3_collect(b, addr_lo, addr_hi);
   struct ir3_instruction *ldg = ir3_LDG_K(b, create_immed(b, dst_lo), 0, addr, 0,
                                           create_immed(b, addr_offset), 0,
                                           create_immed(b, size), 0);
   ldg->barrier_class = ldg->barrier_conflict = IR3_BARRIER_CONST_W;
   ldg->cat6.type = TYPE_U32;

   if (a1) {
      ir3_instr_set_address(ldg, a1);
      ldg->flags |= IR3_INSTR_A1EN;
   }

   /* The assembler isn't aware of what value a1.x has, so make sure that
    * constlen includes the ldg.k here.
    */
   ctx->so->constlen =
      MAX2(ctx->so->constlen, DIV_ROUND_UP(dst + size * 4, 4));

   array_insert(b, b->keeps, ldg);
}


/* handles direct/indirect UBO reads: */
static void
emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                        struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   unsigned ubo = regid(const_state->offsets.ubo, 0);
   const unsigned ptrsz = ir3_pointer_size(ctx->compiler);

   int off = 0;

   /* First src is ubo index, which could either be an immed or not: */
   src0 = ir3_get_src(ctx, &intr->src[0])[0];
   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
      base_lo = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz));
      base_hi = create_uniform(b, ubo + (src0->srcs[0]->iim_val * ptrsz) + 1);
   } else {
      base_lo = create_uniform_indirect(b, ubo, TYPE_U32,
                                        ir3_get_addr0(ctx, src0, ptrsz));
      base_hi = create_uniform_indirect(b, ubo + 1, TYPE_U32,
                                        ir3_get_addr0(ctx, src0, ptrsz));

      /* NOTE: since relative addressing is used, make sure constlen is
       * at least big enough to cover all the UBO addresses, since the
       * assembler won't know what the max address reg is.
       */
      ctx->so->constlen =
         MAX2(ctx->so->constlen,
              const_state->offsets.ubo + (ctx->s->info.num_ubos * ptrsz));
   }

   /* note: on 32bit gpu's base_hi is ignored and DCE'd */
   addr = base_lo;

   if (nir_src_is_const(intr->src[1])) {
      off += nir_src_as_uint(intr->src[1]);
   } else {
      /* For load_ubo_indirect, second src is indirect offset: */
      src1 = ir3_get_src(ctx, &intr->src[1])[0];

      /* and add offset to addr: */
      addr = ir3_ADD_S(b, addr, 0, src1, 0);
   }

   /* if offset is too large to encode in the ldg, split it out: */
   if ((off + (intr->num_components * 4)) > 1024) {
      /* split out the minimal amount to improve the odds that
       * cp can fit the immediate in the add.s instruction:
       */
      unsigned off2 = off + (intr->num_components * 4) - 1024;
      addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
      off -= off2;
   }

   if (ptrsz == 2) {
      struct ir3_instruction *carry;

      /* handle 32b rollover, ie:
       *   if (addr < base_lo)
       *      base_hi++
       */
      carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
      carry->cat2.condition = IR3_COND_LT;
      base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);

      addr = ir3_collect(b, addr, base_hi);
   }

   for (int i = 0; i < intr->num_components; i++) {
      struct ir3_instruction *load =
         ir3_LDG(b, addr, 0, create_immed(b, off + i * 4), 0,
                 create_immed(b, 1), 0); /* num components */
      load->cat6.type = TYPE_U32;
      dst[i] = load;
   }
}

/* Load a kernel param: src[] = { address }. */
static void
emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
                                 nir_intrinsic_instr *intr,
                                 struct ir3_instruction **dst)
{
   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
   struct ir3_block *b = ctx->block;
   unsigned offset = nir_intrinsic_base(intr);
   unsigned p = regid(const_state->offsets.kernel_params, 0);

   struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];

   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
      offset += src0->srcs[0]->iim_val;

      /* kernel param position is in bytes, but constant space is 32b registers: */
      compile_assert(ctx, !(offset & 0x3));

      dst[0] = create_uniform(b, p + (offset / 4));
   } else {
      /* kernel param position is in bytes, but constant space is 32b registers: */
      compile_assert(ctx, !(offset & 0x3));

      /* TODO we should probably be lowering this in nir, and also handling
       * non-32b inputs.. Also we probably don't want to be using
       * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders..
       */
      src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0);

      dst[0] = create_uniform_indirect(b, offset / 4, TYPE_U32,
                                       ir3_get_addr0(ctx, src0, 1));
   }
}

/* src[] = { block_index } */
static void
emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                         struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ibo = ir3_ssbo_to_ibo(ctx, intr->src[0]);
   struct ir3_instruction *resinfo = ir3_RESINFO(b, ibo, 0);
   resinfo->cat6.iim_val = 1;
   resinfo->cat6.d = ctx->compiler->gen >= 6 ? 1 : 2;
   resinfo->cat6.type = TYPE_U32;
   resinfo->cat6.typed = false;
   /* resinfo has no writemask and always writes out 3 components */
   resinfo->dsts[0]->wrmask = MASK(3);
   ir3_handle_bindless_cat6(resinfo, intr->src[0]);
   ir3_handle_nonuniform(resinfo, intr);

   if (ctx->compiler->gen >= 6) {
      ir3_split_dest(b, dst, resinfo, 0, 1);
   } else {
      /* On a5xx, resinfo returns the low 16 bits of ssbo size in .x and the high 16 bits in .y */
      struct ir3_instruction *resinfo_dst[2];
      ir3_split_dest(b, resinfo_dst, resinfo, 0, 2);
      *dst = ir3_ADD_U(b, ir3_SHL_B(b, resinfo_dst[1], 0, create_immed(b, 16), 0), 0, resinfo_dst[0], 0);
   }
}

/* src[] = { offset }. const_index[] = { base } */
static void
emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                           struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *ldl, *offset;
   unsigned base;

   offset = ir3_get_src(ctx, &intr->src[0])[0];
   base = nir_intrinsic_base(intr);

   ldl = ir3_LDL(b, offset, 0, create_immed(b, base), 0,
                 create_immed(b, intr->num_components), 0);

   ldl->cat6.type = utype_def(&intr->def);
   ldl->dsts[0]->wrmask = MASK(intr->num_components);

   ldl->barrier_class = IR3_BARRIER_SHARED_R;
   ldl->barrier_conflict = IR3_BARRIER_SHARED_W;

   ir3_split_dest(b, dst, ldl, 0, intr->num_components);
}

/* src[] = { value, offset }. const_index[] = { base, write_mask } */
static void
emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *stl, *offset;
   struct ir3_instruction *const *value;
   unsigned base, wrmask, ncomp;

   value = ir3_get_src(ctx, &intr->src[0]);
   offset = ir3_get_src(ctx, &intr->src[1])[0];

   base = nir_intrinsic_base(intr);
   wrmask = nir_intrinsic_write_mask(intr);
   ncomp = ffs(~wrmask) - 1;

   assert(wrmask == BITFIELD_MASK(intr->num_components));

   stl = ir3_STL(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
                 create_immed(b, ncomp), 0);
   stl->cat6.dst_offset = base;
   stl->cat6.type = utype_src(intr->src[0]);
   stl->barrier_class = IR3_BARRIER_SHARED_W;
   stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

   array_insert(b, b->keeps, stl);
}

/* src[] = { offset }. const_index[] = { base } */
static void
emit_intrinsic_load_shared_ir3(struct ir3_context *ctx,
                               nir_intrinsic_instr *intr,
                               struct ir3_instruction **dst)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *load, *offset;
   unsigned base;

   offset = ir3_get_src(ctx, &intr->src[0])[0];
   base = nir_intrinsic_base(intr);

   load = ir3_LDLW(b, offset, 0, create_immed(b, base), 0,
                   create_immed(b, intr->num_components), 0);

   /* for a650, use LDL for tess ctrl inputs: */
   if (ctx->so->type == MESA_SHADER_TESS_CTRL && ctx->compiler->tess_use_shared)
      load->opc = OPC_LDL;

   load->cat6.type = utype_def(&intr->def);
   load->dsts[0]->wrmask = MASK(intr->num_components);

   load->barrier_class = IR3_BARRIER_SHARED_R;
   load->barrier_conflict = IR3_BARRIER_SHARED_W;

   ir3_split_dest(b, dst, load, 0, intr->num_components);
}

/* src[] = { value, offset }. const_index[] = { base } */
static void
emit_intrinsic_store_shared_ir3(struct ir3_context *ctx,
                                nir_intrinsic_instr *intr)
{
   struct ir3_block *b = ctx->block;
   struct ir3_instruction *store, *offset;
   struct ir3_instruction *const *value;

   value = ir3_get_src(ctx, &intr->src[0]);
   offset = ir3_get_src(ctx, &intr->src[1])[0];

   store = ir3_STLW(b, offset, 0,
                    ir3_create_collect(b, value, intr->num_components), 0,
                    create_immed(b, intr->num_components), 0);

   /* for a650, use STL for vertex outputs used by tess ctrl shader: */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->so->key.tessellation &&
       ctx->compiler->tess_use_shared)
      store->opc = OPC_STL;

   store->cat6.dst_offset = nir_intrinsic_base(intr);
   store->cat6.type = utype_src(intr->src[0]);
   store->barrier_class = IR3_BARRIER_SHARED_W;
   store->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

   array_insert(b, b->keeps, store);
}

/*
 * CS shared variable atomic intrinsics
 *
 * All of the shared variable atomic memory operations read a value from
 * memory, compute a new value using one of the operations below, write the
 * new value to memory, and return the original value read.
 *
 * All operations take 2 sources except CompSwap that takes 3. These
 * sources represent:
 *
 * 0: The offset into the shared variable storage region that the atomic
 *    operation will operate on.
 * 1: The data parameter to the atomic function (i.e. the value to add
 *    in, etc).
 * 2: For CompSwap only: the second data parameter.
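 *
 * E.g. an atomicAdd() on a shared variable reaches us as a shared atomic
 * intrinsic with src[0] = the offset of the variable and src[1] = the value
 * to add.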
1455  */
1456 static struct ir3_instruction *
emit_intrinsic_atomic_shared(struct ir3_context * ctx,nir_intrinsic_instr * intr)1457 emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1458 {
1459    struct ir3_block *b = ctx->block;
1460    struct ir3_instruction *atomic, *src0, *src1;
1461    type_t type = TYPE_U32;
1462 
1463    src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
1464    src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */
1465 
1466    switch (nir_intrinsic_atomic_op(intr)) {
1467    case nir_atomic_op_iadd:
1468       atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
1469       break;
1470    case nir_atomic_op_imin:
1471       atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1472       type = TYPE_S32;
1473       break;
1474    case nir_atomic_op_umin:
1475       atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
1476       break;
1477    case nir_atomic_op_imax:
1478       atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1479       type = TYPE_S32;
1480       break;
1481    case nir_atomic_op_umax:
1482       atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
1483       break;
1484    case nir_atomic_op_iand:
1485       atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
1486       break;
1487    case nir_atomic_op_ior:
1488       atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
1489       break;
1490    case nir_atomic_op_ixor:
1491       atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
1492       break;
1493    case nir_atomic_op_xchg:
1494       atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
1495       break;
1496    case nir_atomic_op_cmpxchg:
1497       /* for cmpxchg, src1 is [ui]vec2(data, compare): */
1498       src1 = ir3_collect(b, ir3_get_src(ctx, &intr->src[2])[0], src1);
1499       atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
1500       break;
1501    default:
1502       unreachable("boo");
1503    }
1504 
1505    atomic->cat6.iim_val = 1;
1506    atomic->cat6.d = 1;
1507    atomic->cat6.type = type;
1508    atomic->barrier_class = IR3_BARRIER_SHARED_W;
1509    atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
1510 
1511    /* even if nothing consumes the result, we can't DCE the instruction: */
1512    array_insert(b, b->keeps, atomic);
1513 
1514    return atomic;
1515 }
1516 
1517 static void
1518 stp_ldp_offset(struct ir3_context *ctx, nir_src *src,
1519                struct ir3_instruction **offset, int32_t *base)
1520 {
1521    struct ir3_block *b = ctx->block;
1522 
1523    if (nir_src_is_const(*src)) {
1524       unsigned src_offset = nir_src_as_uint(*src);
1525       /* The base offset field is only 13 bits, and it's signed. Try to make the
1526        * offset constant whenever the original offsets are similar, to avoid
1527        * creating too many constants in the final shader.
1528        */
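      /* For example, src_offset = 0x12345 gives base = 0x345 (the low 13 bits,
       * sign-extended) and offset_val = 0x12000, so offsets sharing the same
       * upper bits all reuse the same 0x12000 immediate.
       */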
1529       *base = ((int32_t) src_offset << (32 - 13)) >> (32 - 13);
1530       uint32_t offset_val = src_offset - *base;
1531       *offset = create_immed(b, offset_val);
1532    } else {
1533       /* TODO: match on nir_iadd with a constant that fits */
1534       *base = 0;
1535       *offset = ir3_get_src(ctx, src)[0];
1536    }
1537 }
1538 
1539 /* src[] = { offset }. */
1540 static void
1541 emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1542                             struct ir3_instruction **dst)
1543 {
1544    struct ir3_block *b = ctx->block;
1545    struct ir3_instruction *ldp, *offset;
1546    int32_t base;
1547 
1548    stp_ldp_offset(ctx, &intr->src[0], &offset, &base);
1549 
1550    ldp = ir3_LDP(b, offset, 0, create_immed(b, base), 0,
1551                  create_immed(b, intr->num_components), 0);
1552 
1553    ldp->cat6.type = utype_def(&intr->def);
1554    ldp->dsts[0]->wrmask = MASK(intr->num_components);
1555 
1556    ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
1557    ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
1558 
1559    ir3_split_dest(b, dst, ldp, 0, intr->num_components);
1560 }
1561 
1562 /* src[] = { value, offset }. const_index[] = { write_mask } */
1563 static void
1564 emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1565 {
1566    struct ir3_block *b = ctx->block;
1567    struct ir3_instruction *stp, *offset;
1568    struct ir3_instruction *const *value;
1569    unsigned wrmask, ncomp;
1570    int32_t base;
1571 
1572    value = ir3_get_src(ctx, &intr->src[0]);
1573 
1574    stp_ldp_offset(ctx, &intr->src[1], &offset, &base);
1575 
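   /* ffs(~wrmask) - 1 counts the contiguous low bits set in the write mask,
    * i.e. the number of components written; the assert below checks that the
    * mask really is dense, starting at component x.
    */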
1576    wrmask = nir_intrinsic_write_mask(intr);
1577    ncomp = ffs(~wrmask) - 1;
1578 
1579    assert(wrmask == BITFIELD_MASK(intr->num_components));
1580 
1581    stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
1582                  create_immed(b, ncomp), 0);
1583    stp->cat6.dst_offset = base;
1584    stp->cat6.type = utype_src(intr->src[0]);
1585    stp->barrier_class = IR3_BARRIER_PRIVATE_W;
1586    stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
1587 
1588    array_insert(b, b->keeps, stp);
1589 }
1590 
1591 struct tex_src_info {
1592    /* For prefetch */
1593    unsigned tex_base, samp_base, tex_idx, samp_idx;
1594    /* For normal tex instructions */
1595    unsigned base, a1_val, flags;
1596    struct ir3_instruction *samp_tex;
1597 };
1598 
1599 /* TODO handle actual indirect/dynamic case.. which is going to be weird
1600  * to handle with the image_mapping table..
1601  */
1602 static struct tex_src_info
1603 get_image_ssbo_samp_tex_src(struct ir3_context *ctx, nir_src *src, bool image)
1604 {
1605    struct ir3_block *b = ctx->block;
1606    struct tex_src_info info = {0};
1607    nir_intrinsic_instr *bindless_tex = ir3_bindless_resource(*src);
1608 
1609    if (bindless_tex) {
1610       /* Bindless case */
1611       ctx->so->bindless_tex = true;
1612       info.flags |= IR3_INSTR_B;
1613 
1614       /* Gather information required to determine which encoding to
1615        * choose as well as for prefetch.
1616        */
1617       info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1618       bool tex_const = nir_src_is_const(bindless_tex->src[0]);
1619       if (tex_const)
1620          info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1621       info.samp_idx = 0;
1622 
1623       /* Choose encoding. */
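      /* Roughly: a small constant index can be encoded directly in the
       * instruction, a larger constant one goes through a1.x (A1EN), and a
       * non-constant index falls back to S2EN with a collected
       * (texture, sampler) source.
       */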
1624       if (tex_const && info.tex_idx < 256) {
1625          if (info.tex_idx < 16) {
1626             /* Everything fits within the instruction */
1627             info.base = info.tex_base;
1628          } else {
1629             info.base = info.tex_base;
1630             if (ctx->compiler->gen <= 6) {
1631                info.a1_val = info.tex_idx << 3;
1632             } else {
1633                info.a1_val = info.samp_idx << 3;
1634             }
1635             info.flags |= IR3_INSTR_A1EN;
1636          }
1637          info.samp_tex = NULL;
1638       } else {
1639          info.flags |= IR3_INSTR_S2EN;
1640          info.base = info.tex_base;
1641 
1642          /* Note: the indirect source is now a vec2 instead of hvec2 */
1643          struct ir3_instruction *texture, *sampler;
1644 
1645          texture = ir3_get_src(ctx, src)[0];
1646          sampler = create_immed(b, 0);
1647          info.samp_tex = ir3_collect(b, texture, sampler);
1648       }
1649    } else {
1650       info.flags |= IR3_INSTR_S2EN;
1651       unsigned slot = nir_src_as_uint(*src);
1652       unsigned tex_idx = image ?
1653             ir3_image_to_tex(&ctx->so->image_mapping, slot) :
1654             ir3_ssbo_to_tex(&ctx->so->image_mapping, slot);
1655       struct ir3_instruction *texture, *sampler;
1656 
1657       ctx->so->num_samp = MAX2(ctx->so->num_samp, tex_idx + 1);
1658 
1659       texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1660       sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
1661 
1662       info.samp_tex = ir3_collect(b, sampler, texture);
1663    }
1664 
1665    return info;
1666 }
1667 
1668 static struct ir3_instruction *
1669 emit_sam(struct ir3_context *ctx, opc_t opc, struct tex_src_info info,
1670          type_t type, unsigned wrmask, struct ir3_instruction *src0,
1671          struct ir3_instruction *src1)
1672 {
1673    struct ir3_instruction *sam, *addr;
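   /* If the texture/sampler index didn't fit in the instruction, the caller
    * stashed it in a1_val; write it to a1.x here and attach it to the sam via
    * ir3_instr_set_address() below.
    */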
1674    if (info.flags & IR3_INSTR_A1EN) {
1675       addr = ir3_get_addr1(ctx, info.a1_val);
1676    }
1677    sam = ir3_SAM(ctx->block, opc, type, wrmask, info.flags, info.samp_tex, src0,
1678                  src1);
1679    if (info.flags & IR3_INSTR_A1EN) {
1680       ir3_instr_set_address(sam, addr);
1681    }
1682    if (info.flags & IR3_INSTR_B) {
1683       sam->cat5.tex_base = info.base;
1684       sam->cat5.samp = info.samp_idx;
1685       sam->cat5.tex  = info.tex_idx;
1686    }
1687    return sam;
1688 }
1689 
1690 /* src[] = { deref, coord, sample_index }. const_index[] = {} */
1691 static void
1692 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
1693                           struct ir3_instruction **dst)
1694 {
1695    /* If the image can be written, we must use LDIB to retrieve data rather
1696     * than ISAM (which uses the texture cache and won't see previous writes).
1697     */
1698    if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER)) {
1699       ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1700       return;
1701    }
1702 
1703    /* The sparse set of texture descriptors for non-coherent load_images
1704     * means we can't do indirection, so fall back to coherent load.
1705     */
1706    if (ctx->compiler->gen >= 5 &&
1707        !ir3_bindless_resource(intr->src[0]) &&
1708        !nir_src_is_const(intr->src[0])) {
1709       ctx->funcs->emit_intrinsic_load_image(ctx, intr, dst);
1710       return;
1711    }
1712 
1713    struct ir3_block *b = ctx->block;
1714    struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], true);
1715    struct ir3_instruction *sam;
1716    struct ir3_instruction *const *src0 = ir3_get_src(ctx, &intr->src[1]);
1717    struct ir3_instruction *coords[4];
1718    unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1719    type_t type = ir3_get_type_for_image_intrinsic(intr);
1720 
1721    info.flags |= flags;
1722 
1723    /* hw doesn't do 1d, so we treat it as 2d with height of 1, and patch up the
1724     * y coord. Note that the array index must come after the fake y coord.
1725     */
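   /* E.g. a 1D-array load with coords (x, layer) is emitted as (x, 0, layer). */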
1726    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1727    if (dim == GLSL_SAMPLER_DIM_1D || dim == GLSL_SAMPLER_DIM_BUF) {
1728       coords[0] = src0[0];
1729       coords[1] = create_immed(b, 0);
1730       for (unsigned i = 1; i < ncoords; i++)
1731          coords[i + 1] = src0[i];
1732       ncoords++;
1733    } else {
1734       for (unsigned i = 0; i < ncoords; i++)
1735          coords[i] = src0[i];
1736    }
1737 
1738    sam = emit_sam(ctx, OPC_ISAM, info, type, 0b1111,
1739                   ir3_create_collect(b, coords, ncoords), NULL);
1740 
1741    ir3_handle_nonuniform(sam, intr);
1742 
1743    sam->barrier_class = IR3_BARRIER_IMAGE_R;
1744    sam->barrier_conflict = IR3_BARRIER_IMAGE_W;
1745 
1746    ir3_split_dest(b, dst, sam, 0, 4);
1747 }
1748 
1749 /* A4xx version of image_size, see ir3_a6xx.c for newer resinfo version. */
1750 void
1751 emit_intrinsic_image_size_tex(struct ir3_context *ctx,
1752                               nir_intrinsic_instr *intr,
1753                               struct ir3_instruction **dst)
1754 {
1755    struct ir3_block *b = ctx->block;
1756    struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], true);
1757    struct ir3_instruction *sam, *lod;
1758    unsigned flags, ncoords = ir3_get_image_coords(intr, &flags);
1759    type_t dst_type = intr->def.bit_size == 16 ? TYPE_U16 : TYPE_U32;
1760 
1761    info.flags |= flags;
1762    assert(nir_src_as_uint(intr->src[1]) == 0);
1763    lod = create_immed(b, 0);
1764    sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
1765 
1766    /* Array size actually ends up in .w rather than .z. This doesn't
1767     * matter for miplevel 0, but for higher mips the value in z is
1768     * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1769     * returned, which means that we have to add 1 to it for arrays for
1770     * a3xx.
1771     *
1772     * Note: use a temporary dst and then copy, since the size of the dst
1773     * array that is passed in is based on nir's understanding of the
1774     * result size, not the hardware's.
1775     */
1776    struct ir3_instruction *tmp[4];
1777 
1778    ir3_split_dest(b, tmp, sam, 0, 4);
1779 
1780    for (unsigned i = 0; i < ncoords; i++)
1781       dst[i] = tmp[i];
1782 
1783    if (flags & IR3_INSTR_A) {
1784       if (ctx->compiler->levels_add_one) {
1785          dst[ncoords - 1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
1786       } else {
1787          dst[ncoords - 1] = ir3_MOV(b, tmp[3], TYPE_U32);
1788       }
1789    }
1790 }
1791 
1792 static struct tex_src_info
1793 get_bindless_samp_src(struct ir3_context *ctx, nir_src *tex,
1794                       nir_src *samp)
1795 {
1796    struct ir3_block *b = ctx->block;
1797    struct tex_src_info info = {0};
1798 
1799    info.flags |= IR3_INSTR_B;
1800 
1801    /* Gather information required to determine which encoding to
1802     * choose as well as for prefetch.
1803     */
1804    nir_intrinsic_instr *bindless_tex = NULL;
1805    bool tex_const;
1806    if (tex) {
1807       ctx->so->bindless_tex = true;
1808       bindless_tex = ir3_bindless_resource(*tex);
1809       assert(bindless_tex);
1810       info.tex_base = nir_intrinsic_desc_set(bindless_tex);
1811       tex_const = nir_src_is_const(bindless_tex->src[0]);
1812       if (tex_const)
1813          info.tex_idx = nir_src_as_uint(bindless_tex->src[0]);
1814    } else {
1815       /* To simplify some of the logic below, assume the index is
1816        * constant 0 when it's not enabled.
1817        */
1818       tex_const = true;
1819       info.tex_idx = 0;
1820    }
1821    nir_intrinsic_instr *bindless_samp = NULL;
1822    bool samp_const;
1823    if (samp) {
1824       ctx->so->bindless_samp = true;
1825       bindless_samp = ir3_bindless_resource(*samp);
1826       assert(bindless_samp);
1827       info.samp_base = nir_intrinsic_desc_set(bindless_samp);
1828       samp_const = nir_src_is_const(bindless_samp->src[0]);
1829       if (samp_const)
1830          info.samp_idx = nir_src_as_uint(bindless_samp->src[0]);
1831    } else {
1832       samp_const = true;
1833       info.samp_idx = 0;
1834    }
1835 
1836    /* Choose encoding. */
1837    if (tex_const && samp_const && info.tex_idx < 256 &&
1838        info.samp_idx < 256) {
1839       if (info.tex_idx < 16 && info.samp_idx < 16 &&
1840           (!bindless_tex || !bindless_samp ||
1841            info.tex_base == info.samp_base)) {
1842          /* Everything fits within the instruction */
1843          info.base = info.tex_base;
1844       } else {
1845          info.base = info.tex_base;
1846          if (ctx->compiler->gen <= 6) {
1847             info.a1_val = info.tex_idx << 3 | info.samp_base;
1848          } else {
1849             info.a1_val = info.samp_idx << 3 | info.samp_base;
1850          }
1851 
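         /* i.e. a1.x packs the descriptor-set base in its low 3 bits with the
          * (texture or sampler, depending on gen) index above them.
          */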
1852          info.flags |= IR3_INSTR_A1EN;
1853       }
1854       info.samp_tex = NULL;
1855    } else {
1856       info.flags |= IR3_INSTR_S2EN;
1857       /* In the indirect case, we only use a1.x to store the sampler
1858        * base if it differs from the texture base.
1859        */
1860       if (!bindless_tex || !bindless_samp ||
1861           info.tex_base == info.samp_base) {
1862          info.base = info.tex_base;
1863       } else {
1864          info.base = info.tex_base;
1865          info.a1_val = info.samp_base;
1866          info.flags |= IR3_INSTR_A1EN;
1867       }
1868 
1869       /* Note: the indirect source is now a vec2 instead of hvec2, and
1870        * for some reason the texture and sampler are swapped.
1871        */
1872       struct ir3_instruction *texture, *sampler;
1873 
1874       if (bindless_tex) {
1875          texture = ir3_get_src(ctx, tex)[0];
1876       } else {
1877          texture = create_immed(b, 0);
1878       }
1879 
1880       if (bindless_samp) {
1881          sampler = ir3_get_src(ctx, samp)[0];
1882       } else {
1883          sampler = create_immed(b, 0);
1884       }
1885       info.samp_tex = ir3_collect(b, texture, sampler);
1886    }
1887 
1888    return info;
1889 }
1890 
1891 /* src[] = { buffer_index, offset }. No const_index */
1892 static void
1893 emit_intrinsic_load_ssbo(struct ir3_context *ctx,
1894                          nir_intrinsic_instr *intr,
1895                          struct ir3_instruction **dst)
1896 {
1897    /* Note: we can only use isam for vectorized loads/stores if isam.v is
1898     * available.
1899     * Note: isam also can't handle 8-bit loads.
1900     */
1901    if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) ||
1902        (intr->def.num_components > 1 && !ctx->compiler->has_isam_v) ||
1903        (ctx->compiler->options.storage_8bit && intr->def.bit_size == 8) ||
1904        !ctx->compiler->has_isam_ssbo) {
1905       ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
1906       return;
1907    }
1908 
1909    struct ir3_block *b = ctx->block;
1910    nir_src *offset_src = &intr->src[2];
1911    struct ir3_instruction *coords = NULL;
1912    unsigned imm_offset = 0;
1913 
1914    if (ctx->compiler->has_isam_v) {
1915       ir3_lower_imm_offset(ctx, intr, offset_src, 8, &coords, &imm_offset);
1916    } else {
1917       coords =
1918          ir3_collect(b, ir3_get_src(ctx, offset_src)[0], create_immed(b, 0));
1919    }
1920 
1921    struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
1922 
1923    unsigned num_components = intr->def.num_components;
1924    assert(num_components == 1 || ctx->compiler->has_isam_v);
1925 
1926    struct ir3_instruction *sam =
1927       emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size),
1928                MASK(num_components), coords, create_immed(b, imm_offset));
1929 
1930    if (ctx->compiler->has_isam_v) {
1931       sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
1932 
1933       if (imm_offset) {
1934          sam->flags |= IR3_INSTR_IMM_OFFSET;
1935       }
1936    }
1937 
1938    ir3_handle_nonuniform(sam, intr);
1939 
1940    sam->barrier_class = IR3_BARRIER_BUFFER_R;
1941    sam->barrier_conflict = IR3_BARRIER_BUFFER_W;
1942 
1943    ir3_split_dest(b, dst, sam, 0, num_components);
1944 }
1945 
1946 static void
1947 emit_control_barrier(struct ir3_context *ctx)
1948 {
1949    /* Hull shaders dispatch 32 wide so an entire patch will always
1950     * fit in a single warp and execute in lock-step. Consequently,
1951     * we don't need to do anything for TCS barriers. Emitting
1952     * we don't need to do anything for TCS barriers. Emitting a
1953     * barrier instruction would deadlock.
1954    if (ctx->so->type == MESA_SHADER_TESS_CTRL)
1955       return;
1956 
1957    struct ir3_block *b = ctx->block;
1958    struct ir3_instruction *barrier = ir3_BAR(b);
1959    barrier->cat7.g = true;
1960    if (ctx->compiler->gen < 6)
1961       barrier->cat7.l = true;
1962    barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
1963    barrier->barrier_class = IR3_BARRIER_EVERYTHING;
1964    array_insert(b, b->keeps, barrier);
1965 
1966    ctx->so->has_barrier = true;
1967 }
1968 
1969 static void
1970 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
1971 {
1972    struct ir3_block *b = ctx->block;
1973    struct ir3_instruction *barrier;
1974 
1975    /* TODO: find out why there is a major difference in .l usage
1976     * between a5xx and a6xx.
1977     */
1978 
1979    mesa_scope exec_scope = nir_intrinsic_execution_scope(intr);
1980    mesa_scope mem_scope = nir_intrinsic_memory_scope(intr);
1981    nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
1982    /* loads/stores are always cache-coherent so we can filter out
1983     * available/visible.
1984     */
1985    nir_memory_semantics semantics =
1986       nir_intrinsic_memory_semantics(intr) & (NIR_MEMORY_ACQUIRE |
1987                                               NIR_MEMORY_RELEASE);
1988 
1989    if (ctx->so->type == MESA_SHADER_TESS_CTRL) {
1990       /* Remove mode corresponding to TCS patch barriers because hull shaders
1991        * dispatch 32 wide so an entire patch will always fit in a single warp
1992        * and execute in lock-step.
1993        *
1994        * TODO: memory barrier also tells us not to reorder stores, this
1995        * information is lost here (backend doesn't reorder stores so we
1996        * are safe for now).
1997        */
1998       modes &= ~nir_var_shader_out;
1999    }
2000 
2001    assert(!(modes & nir_var_shader_out));
2002 
2003    if ((modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_mem_global |
2004                  nir_var_image)) && semantics) {
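      /* fence flags used below: .r/.w select whether reads and/or writes are
       * ordered, while .g and .l appear to control how far the fence reaches
       * (global memory vs. the more local cache level).
       */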
2005       barrier = ir3_FENCE(b);
2006       barrier->cat7.r = true;
2007       barrier->cat7.w = true;
2008 
2009       if (modes & (nir_var_mem_ssbo | nir_var_image | nir_var_mem_global)) {
2010          barrier->cat7.g = true;
2011       }
2012 
2013       if (ctx->compiler->gen >= 6) {
2014          if (modes & (nir_var_mem_ssbo | nir_var_image)) {
2015             barrier->cat7.l = true;
2016          }
2017       } else {
2018          if (modes & (nir_var_mem_shared | nir_var_mem_ssbo | nir_var_image)) {
2019             barrier->cat7.l = true;
2020          }
2021       }
2022 
2023       barrier->barrier_class = 0;
2024       barrier->barrier_conflict = 0;
2025 
2026       if (modes & nir_var_mem_shared) {
2027          barrier->barrier_class |= IR3_BARRIER_SHARED_W;
2028          barrier->barrier_conflict |=
2029             IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;
2030       }
2031 
2032       if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
2033          barrier->barrier_class |= IR3_BARRIER_BUFFER_W;
2034          barrier->barrier_conflict |=
2035             IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
2036       }
2037 
2038       if (modes & nir_var_image) {
2039          barrier->barrier_class |= IR3_BARRIER_IMAGE_W;
2040          barrier->barrier_conflict |=
2041             IR3_BARRIER_IMAGE_W | IR3_BARRIER_IMAGE_R;
2042       }
2043 
2044       /* make sure barrier doesn't get DCE'd */
2045       array_insert(b, b->keeps, barrier);
2046 
2047       if (ctx->compiler->gen >= 7 && mem_scope > SCOPE_WORKGROUP &&
2048           modes & (nir_var_mem_ssbo | nir_var_image) &&
2049           semantics & NIR_MEMORY_ACQUIRE) {
2050          /* "r + l" is not enough to synchronize reads with writes from other
2051           * workgroups, so we can disable them since they are useless here.
2052           */
2053          barrier->cat7.r = false;
2054          barrier->cat7.l = false;
2055 
2056          struct ir3_instruction *ccinv = ir3_CCINV(b);
2057          /* A7XX TODO: ccinv should just stick to the barrier,
2058           * the barrier class/conflict introduces unnecessary waits.
2059           */
2060          ccinv->barrier_class = barrier->barrier_class;
2061          ccinv->barrier_conflict = barrier->barrier_conflict;
2062          array_insert(b, b->keeps, ccinv);
2063       }
2064    }
2065 
2066    if (exec_scope >= SCOPE_WORKGROUP) {
2067       emit_control_barrier(ctx);
2068    }
2069 }
2070 
2071 static void
2072 add_sysval_input_compmask(struct ir3_context *ctx, gl_system_value slot,
2073                           unsigned compmask, struct ir3_instruction *instr)
2074 {
2075    struct ir3_shader_variant *so = ctx->so;
2076    unsigned n = so->inputs_count++;
2077 
2078    assert(instr->opc == OPC_META_INPUT);
2079    instr->input.inidx = n;
2080    instr->input.sysval = slot;
2081 
2082    so->inputs[n].sysval = true;
2083    so->inputs[n].slot = slot;
2084    so->inputs[n].compmask = compmask;
2085    so->total_in++;
2086 
2087    so->sysval_in += util_last_bit(compmask);
2088 }
2089 
2090 static struct ir3_instruction *
2091 create_sysval_input(struct ir3_context *ctx, gl_system_value slot,
2092                     unsigned compmask)
2093 {
2094    assert(compmask);
2095    struct ir3_instruction *sysval = create_input(ctx, compmask);
2096    add_sysval_input_compmask(ctx, slot, compmask, sysval);
2097    return sysval;
2098 }
2099 
2100 static struct ir3_instruction *
2101 get_barycentric(struct ir3_context *ctx, enum ir3_bary bary)
2102 {
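   /* The static asserts below pin down that gl_system_value and ir3_bary are
    * laid out in the same order, which is what makes the
    * "SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + bary" arithmetic in this file
    * valid.
    */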
2103    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_PIXEL ==
2104                  SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL);
2105    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_SAMPLE ==
2106                  SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE);
2107    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTROID ==
2108                  SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID);
2109    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_PERSP_CENTER_RHW ==
2110                  SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW);
2111    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_PIXEL ==
2112                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL);
2113    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_CENTROID ==
2114                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID);
2115    STATIC_ASSERT(SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + IJ_LINEAR_SAMPLE ==
2116                  SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE);
2117 
2118    if (!ctx->ij[bary]) {
2119       struct ir3_instruction *xy[2];
2120       struct ir3_instruction *ij;
2121 
2122       ij = create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL +
2123                                bary, 0x3);
2124       ir3_split_dest(ctx->in_block, xy, ij, 0, 2);
2125 
2126       ctx->ij[bary] = ir3_create_collect(ctx->in_block, xy, 2);
2127    }
2128 
2129    return ctx->ij[bary];
2130 }
2131 
2132 /* TODO: make this a common NIR helper?
2133  * there is a nir_system_value_from_intrinsic but it takes nir_intrinsic_op so
2134  * it can't be extended to work with this
2135  */
2136 static gl_system_value
2137 nir_intrinsic_barycentric_sysval(nir_intrinsic_instr *intr)
2138 {
2139    enum glsl_interp_mode interp_mode = nir_intrinsic_interp_mode(intr);
2140    gl_system_value sysval;
2141 
2142    switch (intr->intrinsic) {
2143    case nir_intrinsic_load_barycentric_pixel:
2144       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2145          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2146       else
2147          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2148       break;
2149    case nir_intrinsic_load_barycentric_centroid:
2150       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2151          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID;
2152       else
2153          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID;
2154       break;
2155    case nir_intrinsic_load_barycentric_sample:
2156       if (interp_mode == INTERP_MODE_NOPERSPECTIVE)
2157          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE;
2158       else
2159          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE;
2160       break;
2161    default:
2162       unreachable("invalid barycentric intrinsic");
2163    }
2164 
2165    return sysval;
2166 }
2167 
2168 static void
2169 emit_intrinsic_barycentric(struct ir3_context *ctx, nir_intrinsic_instr *intr,
2170                            struct ir3_instruction **dst)
2171 {
2172    gl_system_value sysval = nir_intrinsic_barycentric_sysval(intr);
2173 
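   /* With MSAA disabled there is only a single sample at the pixel center, so
    * sample/centroid interpolation can be folded to the plain pixel variants
    * (only needed on pre-a6xx).
    */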
2174    if (!ctx->so->key.msaa && ctx->compiler->gen < 6) {
2175       switch (sysval) {
2176       case SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE:
2177          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2178          break;
2179       case SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID:
2180          sysval = SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2181          break;
2182       case SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE:
2183          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2184          break;
2185       case SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID:
2186          sysval = SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL;
2187          break;
2188       default:
2189          break;
2190       }
2191    }
2192 
2193    enum ir3_bary bary = sysval - SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL;
2194 
2195    struct ir3_instruction *ij = get_barycentric(ctx, bary);
2196    ir3_split_dest(ctx->block, dst, ij, 0, 2);
2197 }
2198 
2199 static struct ir3_instruction *
2200 get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2201 {
2202    if (!ctx->frag_coord) {
2203       struct ir3_block *b = ir3_after_preamble(ctx->ir);
2204       struct ir3_instruction_rpt xyzw;
2205       struct ir3_instruction *hw_frag_coord;
2206 
2207       hw_frag_coord = create_sysval_input(ctx, SYSTEM_VALUE_FRAG_COORD, 0xf);
2208       ir3_split_dest(b, xyzw.rpts, hw_frag_coord, 0, 4);
2209 
2210       /* for frag_coord.xy, we get unsigned values.. we need
2211        * to subtract (integer) 8 and divide by 16 (right-
2212        * shift by 4) then convert to float:
2213        *
2214        *    sub.s tmp, src, 8
2215        *    shr.b tmp, tmp, 4
2216        *    mov.u32f32 dst, tmp
2217        *
2218        */
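      /* In practice the code below only does the conversion and the
       * divide-by-16 (as a multiply by 1.0/16.0), i.e. it treats the hw value
       * as unsigned fixed point with four fractional bits.
       */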
2219       struct ir3_instruction_rpt xy =
2220          ir3_COV_rpt(b, 2, xyzw, TYPE_U32, TYPE_F32);
2221       xy =
2222          ir3_MUL_F_rpt(b, 2, xy, 0, create_immed_rpt(b, 2, fui(1.0 / 16.0)), 0);
2223       cp_instrs(xyzw.rpts, xy.rpts, 2);
2224       ctx->frag_coord = ir3_create_collect(b, xyzw.rpts, 4);
2225    }
2226 
2227    ctx->so->fragcoord_compmask |= nir_def_components_read(&intr->def);
2228 
2229    return ctx->frag_coord;
2230 }
2231 
2232 /* This is a bit of a hack until ir3_context is converted to store SSA values
2233  * as ir3_register's instead of ir3_instruction's. Pick out a given destination
2234  * of an instruction with multiple destinations using a mov that will get folded
2235  * away by ir3_cp.
2236  */
2237 static struct ir3_instruction *
2238 create_multidst_mov(struct ir3_block *block, struct ir3_register *dst)
2239 {
2240    struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
2241    unsigned dst_flags = dst->flags & IR3_REG_HALF;
2242    unsigned src_flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
2243 
2244    __ssa_dst(mov)->flags |= dst_flags;
2245    struct ir3_register *src =
2246       ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | src_flags);
2247    src->wrmask = dst->wrmask;
2248    src->def = dst;
2249    assert(!(dst->flags & IR3_REG_RELATIV));
2250    mov->cat1.src_type = mov->cat1.dst_type =
2251       (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
2252    return mov;
2253 }
2254 
2255 static reduce_op_t
2256 get_reduce_op(nir_op opc)
2257 {
2258    switch (opc) {
2259    case nir_op_iadd: return REDUCE_OP_ADD_U;
2260    case nir_op_fadd: return REDUCE_OP_ADD_F;
2261    case nir_op_imul: return REDUCE_OP_MUL_U;
2262    case nir_op_fmul: return REDUCE_OP_MUL_F;
2263    case nir_op_umin: return REDUCE_OP_MIN_U;
2264    case nir_op_imin: return REDUCE_OP_MIN_S;
2265    case nir_op_fmin: return REDUCE_OP_MIN_F;
2266    case nir_op_umax: return REDUCE_OP_MAX_U;
2267    case nir_op_imax: return REDUCE_OP_MAX_S;
2268    case nir_op_fmax: return REDUCE_OP_MAX_F;
2269    case nir_op_iand: return REDUCE_OP_AND_B;
2270    case nir_op_ior:  return REDUCE_OP_OR_B;
2271    case nir_op_ixor: return REDUCE_OP_XOR_B;
2272    default:
2273       unreachable("unknown NIR reduce op");
2274    }
2275 }
2276 
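/* Identity element for each reduction op at the given bit size (0 for iadd,
 * ~0 for iand/umin, -inf for fmax, ...), used to seed the shared register
 * that accumulates the reduction result.
 */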
2277 static uint32_t
2278 get_reduce_identity(nir_op opc, unsigned size)
2279 {
2280    switch (opc) {
2281    case nir_op_iadd:
2282       return 0;
2283    case nir_op_fadd:
2284       return size == 32 ? fui(0.0f) : _mesa_float_to_half(0.0f);
2285    case nir_op_imul:
2286       return 1;
2287    case nir_op_fmul:
2288       return size == 32 ? fui(1.0f) : _mesa_float_to_half(1.0f);
2289    case nir_op_umax:
2290       return 0;
2291    case nir_op_imax:
2292       return size == 32 ? INT32_MIN : (uint32_t)INT16_MIN;
2293    case nir_op_fmax:
2294       return size == 32 ? fui(-INFINITY) : _mesa_float_to_half(-INFINITY);
2295    case nir_op_umin:
2296       return size == 32 ? UINT32_MAX : UINT16_MAX;
2297    case nir_op_imin:
2298       return size == 32 ? INT32_MAX : (uint32_t)INT16_MAX;
2299    case nir_op_fmin:
2300       return size == 32 ? fui(INFINITY) : _mesa_float_to_half(INFINITY);
2301    case nir_op_iand:
2302       return size == 32 ? ~0 : (size == 16 ? (uint32_t)(uint16_t)~0 : 1);
2303    case nir_op_ior:
2304       return 0;
2305    case nir_op_ixor:
2306       return 0;
2307    default:
2308       unreachable("unknown NIR reduce op");
2309    }
2310 }
2311 
2312 static struct ir3_instruction *
2313 emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2314 {
2315    struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
2316    nir_op nir_reduce_op = (nir_op) nir_intrinsic_reduction_op(intr);
2317    reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
2318    unsigned dst_size = intr->def.bit_size;
2319    unsigned flags = (ir3_bitsize(ctx, dst_size) == 16) ? IR3_REG_HALF : 0;
2320 
2321    /* Note: the shared reg is initialized to the identity, so we need it to
2322     * always be 32-bit even when the source isn't because half shared regs are
2323     * not supported.
2324     */
2325    struct ir3_instruction *identity =
2326       create_immed_shared(ctx->block, get_reduce_identity(nir_reduce_op, dst_size),
2327                           true);
2328 
2329    /* OPC_SCAN_MACRO has the following destinations:
2330     * - Exclusive scan result (interferes with source)
2331     * - Inclusive scan result
2332     * - Shared reg reduction result, must be initialized to the identity
2333     *
2334     * The loop computes all three results at the same time; we just have to
2335     * choose which destination to return.
2336     */
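   /* E.g. GLSL subgroupAdd() arrives as nir_intrinsic_reduce and uses the
    * shared-reg result, while subgroupInclusiveAdd()/subgroupExclusiveAdd()
    * arrive as the inclusive/exclusive scan intrinsics.
    */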
2337    struct ir3_instruction *scan =
2338       ir3_instr_create(ctx->block, OPC_SCAN_MACRO, 3, 2);
2339    scan->cat1.reduce_op = reduce_op;
2340 
2341    struct ir3_register *exclusive = __ssa_dst(scan);
2342    exclusive->flags |= flags | IR3_REG_EARLY_CLOBBER;
2343    struct ir3_register *inclusive = __ssa_dst(scan);
2344    inclusive->flags |= flags;
2345    struct ir3_register *reduce = __ssa_dst(scan);
2346    reduce->flags |= IR3_REG_SHARED;
2347 
2348    /* The 32-bit multiply macro reads its sources after writing a partial result
2349     * to the destination, therefore inclusive also interferes with the source.
2350     */
2351    if (reduce_op == REDUCE_OP_MUL_U && dst_size == 32)
2352       inclusive->flags |= IR3_REG_EARLY_CLOBBER;
2353 
2354    /* Normal source */
2355    __ssa_src(scan, src, 0);
2356 
2357    /* shared reg tied source */
2358    struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2359    ir3_reg_tie(reduce, reduce_init);
2360 
2361    struct ir3_register *dst;
2362    switch (intr->intrinsic) {
2363    case nir_intrinsic_reduce: dst = reduce; break;
2364    case nir_intrinsic_inclusive_scan: dst = inclusive; break;
2365    case nir_intrinsic_exclusive_scan: dst = exclusive; break;
2366    default:
2367       unreachable("unknown reduce intrinsic");
2368    }
2369 
2370    return create_multidst_mov(ctx->block, dst);
2371 }
2372 
2373 static struct ir3_instruction *
2374 emit_intrinsic_reduce_clusters(struct ir3_context *ctx,
2375                                nir_intrinsic_instr *intr)
2376 {
2377    nir_op nir_reduce_op = (nir_op)nir_intrinsic_reduction_op(intr);
2378    reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
2379    unsigned dst_size = intr->def.bit_size;
2380 
2381    bool need_exclusive =
2382       intr->intrinsic == nir_intrinsic_exclusive_scan_clusters_ir3;
2383    bool need_scratch = reduce_op == REDUCE_OP_MUL_U && dst_size == 32;
2384 
2385    /* Note: the shared reg is initialized to the identity, so we need it to
2386     * always be 32-bit even when the source isn't because half shared regs are
2387     * not supported.
2388     */
2389    struct ir3_instruction *identity =
2390       create_immed_shared(ctx->block, get_reduce_identity(nir_reduce_op, dst_size),
2391                           true);
2392 
2393    struct ir3_instruction *inclusive_src = ir3_get_src(ctx, &intr->src[0])[0];
2394 
2395    struct ir3_instruction *exclusive_src = NULL;
2396    if (need_exclusive)
2397       exclusive_src = ir3_get_src(ctx, &intr->src[1])[0];
2398 
2399    /* OPC_SCAN_CLUSTERS_MACRO has the following destinations:
2400     * - Shared reg reduction result, must be initialized to the identity
2401     * - Inclusive scan result
2402     * - (iff exclusive) Exclusive scan result. Conditionally added because
2403     *   calculating the exclusive value is optional (i.e., not a side-effect of
2404     *   calculating the inclusive value) and won't be DCE'd anymore at this
2405     *   point.
2406     * - (iff 32b mul_u) Scratch register. We try to emit "op rx, ry, rx" for
2407     *   most ops but this isn't possible for the 32b mul_u macro since its
2408     *   destination is clobbered. So conditionally allocate an extra
2409     *   register in that case.
2410     *
2411     * Note that the getlast loop this macro expands to iterates over all
2412     * clusters. However, for each iteration, not only the fibers in the current
2413     * cluster are active but all later ones as well. Since they still need their
2414     * sources when their cluster is handled, all destinations interfere with
2415     * the sources.
2416     */
2417    unsigned ndst = 2 + need_exclusive + need_scratch;
2418    unsigned nsrc = 2 + need_exclusive;
2419    struct ir3_instruction *scan =
2420       ir3_instr_create(ctx->block, OPC_SCAN_CLUSTERS_MACRO, ndst, nsrc);
2421    scan->cat1.reduce_op = reduce_op;
2422 
2423    unsigned dst_flags = IR3_REG_EARLY_CLOBBER;
2424    if (ir3_bitsize(ctx, dst_size) == 16)
2425       dst_flags |= IR3_REG_HALF;
2426 
2427    struct ir3_register *reduce = __ssa_dst(scan);
2428    reduce->flags |= IR3_REG_SHARED;
2429    struct ir3_register *inclusive = __ssa_dst(scan);
2430    inclusive->flags |= dst_flags;
2431 
2432    struct ir3_register *exclusive = NULL;
2433    if (need_exclusive) {
2434       exclusive = __ssa_dst(scan);
2435       exclusive->flags |= dst_flags;
2436    }
2437 
2438    if (need_scratch) {
2439       struct ir3_register *scratch = __ssa_dst(scan);
2440       scratch->flags |= dst_flags;
2441    }
2442 
2443    struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
2444    ir3_reg_tie(reduce, reduce_init);
2445 
2446    __ssa_src(scan, inclusive_src, 0);
2447 
2448    if (need_exclusive)
2449       __ssa_src(scan, exclusive_src, 0);
2450 
2451    struct ir3_register *dst;
2452    switch (intr->intrinsic) {
2453    case nir_intrinsic_reduce_clusters_ir3:
2454       dst = reduce;
2455       break;
2456    case nir_intrinsic_inclusive_scan_clusters_ir3:
2457       dst = inclusive;
2458       break;
2459    case nir_intrinsic_exclusive_scan_clusters_ir3: {
2460       assert(exclusive != NULL);
2461       dst = exclusive;
2462       break;
2463    }
2464    default:
2465       unreachable("unknown reduce intrinsic");
2466    }
2467 
2468    return create_multidst_mov(ctx->block, dst);
2469 }
2470 
2471 static struct ir3_instruction *
2472 emit_intrinsic_brcst_active(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2473 {
2474    struct ir3_instruction *default_src = ir3_get_src(ctx, &intr->src[0])[0];
2475    struct ir3_instruction *brcst_val = ir3_get_src(ctx, &intr->src[1])[0];
2476    return ir3_BRCST_ACTIVE(ctx->block, nir_intrinsic_cluster_size(intr),
2477                            brcst_val, default_src);
2478 }
2479 
2480 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2481 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
2482 
2483 static void
2484 emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
2485 {
2486    const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
2487    struct ir3_instruction **dst;
2488    struct ir3_instruction *const *src;
2489    struct ir3_block *b = ctx->block;
2490    unsigned dest_components = nir_intrinsic_dest_components(intr);
2491    int idx;
2492    bool create_rpt = false;
2493 
2494    if (info->has_dest) {
2495       dst = ir3_get_def(ctx, &intr->def, dest_components);
2496    } else {
2497       dst = NULL;
2498    }
2499 
2500    const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
2501    const unsigned primitive_param = const_state->offsets.primitive_param * 4;
2502    const unsigned primitive_map = const_state->offsets.primitive_map * 4;
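   /* The const-file offsets are tracked in vec4 units while create_uniform()
    * takes a scalar component index, hence the multiply by 4.
    */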
2503 
2504    switch (intr->intrinsic) {
2505    case nir_intrinsic_decl_reg:
2506       /* There's logically nothing to do, but this has a destination in NIR so
2507        * plug in something... It will get DCE'd.
2508        */
2509       dst[0] = create_immed(ctx->block, 0);
2510       break;
2511 
2512    case nir_intrinsic_load_reg:
2513    case nir_intrinsic_load_reg_indirect: {
2514       struct ir3_array *arr = ir3_get_array(ctx, intr->src[0].ssa);
2515       struct ir3_instruction *addr = NULL;
2516 
2517       if (intr->intrinsic == nir_intrinsic_load_reg_indirect) {
2518          addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[1])[0],
2519                               dest_components);
2520       }
2521 
2522       ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[0].ssa);
2523       assert(dest_components == nir_intrinsic_num_components(decl));
2524 
2525       for (unsigned i = 0; i < dest_components; i++) {
2526          unsigned n = nir_intrinsic_base(intr) * dest_components + i;
2527          compile_assert(ctx, n < arr->length);
2528          dst[i] = ir3_create_array_load(ctx, arr, n, addr);
2529       }
2530 
2531       break;
2532    }
2533 
2534    case nir_intrinsic_store_reg:
2535    case nir_intrinsic_store_reg_indirect: {
2536       struct ir3_array *arr = ir3_get_array(ctx, intr->src[1].ssa);
2537       unsigned num_components = nir_src_num_components(intr->src[0]);
2538       struct ir3_instruction *addr = NULL;
2539 
2540       ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(intr->src[1].ssa);
2541       assert(num_components == nir_intrinsic_num_components(decl));
2542 
2543       struct ir3_instruction *const *value = ir3_get_src(ctx, &intr->src[0]);
2544 
2545       if (intr->intrinsic == nir_intrinsic_store_reg_indirect) {
2546          addr = ir3_get_addr0(ctx, ir3_get_src(ctx, &intr->src[2])[0],
2547                               num_components);
2548       }
2549 
2550       u_foreach_bit(i, nir_intrinsic_write_mask(intr)) {
2551          assert(i < num_components);
2552 
2553          unsigned n = nir_intrinsic_base(intr) * num_components + i;
2554          compile_assert(ctx, n < arr->length);
2555          if (value[i])
2556             ir3_create_array_store(ctx, arr, n, value[i], addr);
2557       }
2558 
2559       break;
2560    }
2561 
2562    case nir_intrinsic_load_const_ir3:
2563       idx = nir_intrinsic_base(intr);
2564       if (nir_src_is_const(intr->src[0])) {
2565          idx += nir_src_as_uint(intr->src[0]);
2566          for (int i = 0; i < dest_components; i++) {
2567             dst[i] = create_uniform_typed(
2568                b, idx + i,
2569                intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32);
2570          }
2571          create_rpt = true;
2572       } else {
2573          src = ctx->compiler->has_scalar_alu ?
2574             ir3_get_src_maybe_shared(ctx, &intr->src[0]) :
2575             ir3_get_src(ctx, &intr->src[0]);
2576          for (int i = 0; i < dest_components; i++) {
2577             dst[i] = create_uniform_indirect(
2578                b, idx + i,
2579                intr->def.bit_size == 16 ? TYPE_F16 : TYPE_F32,
2580                ir3_get_addr0(ctx, src[0], 1));
2581             /* Since this may not be foldable into conversions into shared
2582              * registers, manually make it shared. Optimizations can undo this if
2583              * the user can't use shared regs.
2584              */
2585             if (ctx->compiler->has_scalar_alu && !intr->def.divergent)
2586                dst[i]->dsts[0]->flags |= IR3_REG_SHARED;
2587          }
2588          /* NOTE: if relative addressing is used, we set
2589           * constlen in the compiler (to worst-case value)
2590           * since we don't know in the assembler what the max
2591           * addr reg value can be:
2592           */
2593          ctx->so->constlen =
2594             MAX2(ctx->so->constlen,
2595                  ctx->so->shader_options.num_reserved_user_consts +
2596                  const_state->ubo_state.size / 16);
2597       }
2598       break;
2599 
2600    case nir_intrinsic_load_vs_primitive_stride_ir3:
2601       dst[0] = create_uniform(b, primitive_param + 0);
2602       break;
2603    case nir_intrinsic_load_vs_vertex_stride_ir3:
2604       dst[0] = create_uniform(b, primitive_param + 1);
2605       break;
2606    case nir_intrinsic_load_hs_patch_stride_ir3:
2607       dst[0] = create_uniform(b, primitive_param + 2);
2608       break;
2609    case nir_intrinsic_load_patch_vertices_in:
2610       dst[0] = create_uniform(b, primitive_param + 3);
2611       break;
2612    case nir_intrinsic_load_tess_param_base_ir3:
2613       dst[0] = create_uniform(b, primitive_param + 4);
2614       dst[1] = create_uniform(b, primitive_param + 5);
2615       break;
2616    case nir_intrinsic_load_tess_factor_base_ir3:
2617       dst[0] = create_uniform(b, primitive_param + 6);
2618       dst[1] = create_uniform(b, primitive_param + 7);
2619       break;
2620 
2621    case nir_intrinsic_load_primitive_location_ir3:
2622       idx = nir_intrinsic_driver_location(intr);
2623       dst[0] = create_uniform(b, primitive_map + idx);
2624       break;
2625 
2626    case nir_intrinsic_load_gs_header_ir3:
2627       dst[0] = ctx->gs_header;
2628       break;
2629    case nir_intrinsic_load_tcs_header_ir3:
2630       dst[0] = ctx->tcs_header;
2631       break;
2632 
2633    case nir_intrinsic_load_rel_patch_id_ir3:
2634       dst[0] = ctx->rel_patch_id;
2635       break;
2636 
2637    case nir_intrinsic_load_primitive_id:
2638       if (!ctx->primitive_id) {
2639          ctx->primitive_id =
2640             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
2641       }
2642       dst[0] = ctx->primitive_id;
2643       break;
2644 
2645    case nir_intrinsic_load_tess_coord_xy:
2646       if (!ctx->tess_coord) {
2647          ctx->tess_coord =
2648             create_sysval_input(ctx, SYSTEM_VALUE_TESS_COORD, 0x3);
2649       }
2650       ir3_split_dest(b, dst, ctx->tess_coord, 0, 2);
2651       break;
2652 
2653    case nir_intrinsic_store_global_ir3:
2654       ctx->funcs->emit_intrinsic_store_global_ir3(ctx, intr);
2655       break;
2656    case nir_intrinsic_load_global_ir3:
2657       ctx->funcs->emit_intrinsic_load_global_ir3(ctx, intr, dst);
2658       break;
2659 
2660    case nir_intrinsic_load_ubo:
2661       emit_intrinsic_load_ubo(ctx, intr, dst);
2662       break;
2663    case nir_intrinsic_load_ubo_vec4:
2664       emit_intrinsic_load_ubo_ldc(ctx, intr, dst);
2665       break;
2666    case nir_intrinsic_copy_ubo_to_uniform_ir3:
2667       emit_intrinsic_copy_ubo_to_uniform(ctx, intr);
2668       break;
2669    case nir_intrinsic_copy_global_to_uniform_ir3:
2670       emit_intrinsic_copy_global_to_uniform(ctx, intr);
2671       break;
2672    case nir_intrinsic_load_frag_coord:
2673    case nir_intrinsic_load_frag_coord_unscaled_ir3:
2674       ir3_split_dest(b, dst, get_frag_coord(ctx, intr), 0, 4);
2675       break;
2676    case nir_intrinsic_load_sample_pos_from_id: {
2677       /* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
2678        * but that doesn't seem necessary.
2679        */
2680       struct ir3_instruction *offset =
2681          ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
2682       offset->dsts[0]->wrmask = 0x3;
2683       offset->cat5.type = TYPE_F32;
2684 
2685       ir3_split_dest(b, dst, offset, 0, 2);
2686 
2687       break;
2688    }
2689    case nir_intrinsic_load_persp_center_rhw_ir3:
2690       if (!ctx->ij[IJ_PERSP_CENTER_RHW]) {
2691          ctx->ij[IJ_PERSP_CENTER_RHW] =
2692             create_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTER_RHW, 0x1);
2693       }
2694       dst[0] = ctx->ij[IJ_PERSP_CENTER_RHW];
2695       break;
2696    case nir_intrinsic_load_barycentric_centroid:
2697    case nir_intrinsic_load_barycentric_sample:
2698    case nir_intrinsic_load_barycentric_pixel:
2699       emit_intrinsic_barycentric(ctx, intr, dst);
2700       break;
2701    case nir_intrinsic_load_interpolated_input:
2702    case nir_intrinsic_load_input:
2703       setup_input(ctx, intr);
2704       break;
2705    case nir_intrinsic_load_kernel_input:
2706       emit_intrinsic_load_kernel_input(ctx, intr, dst);
2707       break;
2708    /* All SSBO intrinsics should have been lowered by the 'lower_io_offsets'
2709     * pass and replaced by an ir3-specific version that adds the
2710     * dword-offset in the last source.
2711     */
2712    case nir_intrinsic_load_ssbo_ir3:
2713       emit_intrinsic_load_ssbo(ctx, intr, dst);
2714       break;
2715    case nir_intrinsic_store_ssbo_ir3:
2716       ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
2717       break;
2718    case nir_intrinsic_get_ssbo_size:
2719       emit_intrinsic_ssbo_size(ctx, intr, dst);
2720       break;
2721    case nir_intrinsic_ssbo_atomic_ir3:
2722    case nir_intrinsic_ssbo_atomic_swap_ir3:
2723       dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
2724       break;
2725    case nir_intrinsic_load_shared:
2726       emit_intrinsic_load_shared(ctx, intr, dst);
2727       break;
2728    case nir_intrinsic_store_shared:
2729       emit_intrinsic_store_shared(ctx, intr);
2730       break;
2731    case nir_intrinsic_shared_atomic:
2732    case nir_intrinsic_shared_atomic_swap:
2733       dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
2734       break;
2735    case nir_intrinsic_load_scratch:
2736       emit_intrinsic_load_scratch(ctx, intr, dst);
2737       break;
2738    case nir_intrinsic_store_scratch:
2739       emit_intrinsic_store_scratch(ctx, intr);
2740       break;
2741    case nir_intrinsic_image_load:
2742    case nir_intrinsic_bindless_image_load:
2743       emit_intrinsic_load_image(ctx, intr, dst);
2744       break;
2745    case nir_intrinsic_image_store:
2746    case nir_intrinsic_bindless_image_store:
2747       ctx->funcs->emit_intrinsic_store_image(ctx, intr);
2748       break;
2749    case nir_intrinsic_image_size:
2750    case nir_intrinsic_bindless_image_size:
2751       ctx->funcs->emit_intrinsic_image_size(ctx, intr, dst);
2752       break;
2753    case nir_intrinsic_image_atomic:
2754    case nir_intrinsic_bindless_image_atomic:
2755    case nir_intrinsic_image_atomic_swap:
2756    case nir_intrinsic_bindless_image_atomic_swap:
2757       dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
2758       break;
2759    case nir_intrinsic_barrier:
2760       emit_intrinsic_barrier(ctx, intr);
2761       /* note that the blk ptr is no longer valid; make that obvious: */
2762       b = NULL;
2763       break;
2764    case nir_intrinsic_store_output:
2765       setup_output(ctx, intr);
2766       break;
2767    case nir_intrinsic_load_base_vertex:
2768    case nir_intrinsic_load_first_vertex:
2769       if (!ctx->basevertex) {
2770          ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
2771       }
2772       dst[0] = ctx->basevertex;
2773       break;
2774    case nir_intrinsic_load_is_indexed_draw:
2775       if (!ctx->is_indexed_draw) {
2776          ctx->is_indexed_draw = create_driver_param(ctx, IR3_DP_IS_INDEXED_DRAW);
2777       }
2778       dst[0] = ctx->is_indexed_draw;
2779       break;
2780    case nir_intrinsic_load_draw_id:
2781       if (!ctx->draw_id) {
2782          ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
2783       }
2784       dst[0] = ctx->draw_id;
2785       break;
2786    case nir_intrinsic_load_base_instance:
2787       if (!ctx->base_instance) {
2788          ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
2789       }
2790       dst[0] = ctx->base_instance;
2791       break;
2792    case nir_intrinsic_load_view_index:
2793       if (!ctx->view_index) {
2794          ctx->view_index =
2795             create_sysval_input(ctx, SYSTEM_VALUE_VIEW_INDEX, 0x1);
2796       }
2797       dst[0] = ctx->view_index;
2798       break;
2799    case nir_intrinsic_load_vertex_id_zero_base:
2800    case nir_intrinsic_load_vertex_id:
2801       if (!ctx->vertex_id) {
2802          gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id)
2803                                  ? SYSTEM_VALUE_VERTEX_ID
2804                                  : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
2805          ctx->vertex_id = create_sysval_input(ctx, sv, 0x1);
2806       }
2807       dst[0] = ctx->vertex_id;
2808       break;
2809    case nir_intrinsic_load_instance_id:
2810       if (!ctx->instance_id) {
2811          ctx->instance_id =
2812             create_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID, 0x1);
2813       }
2814       dst[0] = ctx->instance_id;
2815       break;
2816    case nir_intrinsic_load_sample_id:
2817    case nir_intrinsic_load_sample_id_no_per_sample:
2818       if (!ctx->samp_id) {
2819          ctx->samp_id = create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID, 0x1);
2820          ctx->samp_id->dsts[0]->flags |= IR3_REG_HALF;
2821       }
2822       dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
2823       break;
2824    case nir_intrinsic_load_sample_mask_in:
2825       if (!ctx->samp_mask_in) {
2826          ctx->samp_mask_in =
2827             create_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN, 0x1);
2828       }
2829       dst[0] = ctx->samp_mask_in;
2830       break;
2831    case nir_intrinsic_load_user_clip_plane:
2832       idx = nir_intrinsic_ucp_id(intr);
2833       for (int i = 0; i < dest_components; i++) {
2834          unsigned n = idx * 4 + i;
2835          dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
2836       }
2837       create_rpt = true;
2838       break;
2839    case nir_intrinsic_load_front_face:
2840       if (!ctx->frag_face) {
2841          ctx->so->frag_face = true;
2842          ctx->frag_face =
2843             create_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, 0x1);
2844          ctx->frag_face->dsts[0]->flags |= IR3_REG_HALF;
2845       }
2846       /* for fragface, we get -1 for back and 0 for front. However this is
2847        * the inverse of what nir expects (where ~0 is true).
2848        */
2849       dst[0] = ir3_CMPS_S(b, ctx->frag_face, 0,
2850                           create_immed_typed(b, 0, TYPE_U16), 0);
2851       dst[0]->cat2.condition = IR3_COND_EQ;
2852       break;
2853    case nir_intrinsic_load_local_invocation_id:
2854       if (!ctx->local_invocation_id) {
2855          ctx->local_invocation_id =
2856             create_sysval_input(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID, 0x7);
2857       }
2858       ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
2859       break;
2860    case nir_intrinsic_load_workgroup_id:
2861       if (ctx->compiler->has_shared_regfile) {
2862          if (!ctx->work_group_id) {
2863             ctx->work_group_id =
2864                create_sysval_input(ctx, SYSTEM_VALUE_WORKGROUP_ID, 0x7);
2865             ctx->work_group_id->dsts[0]->flags |= IR3_REG_SHARED;
2866          }
2867          ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
2868       } else {
2869          /* For a3xx/a4xx, this comes in via const injection by the hw */
2870          for (int i = 0; i < dest_components; i++) {
2871             dst[i] = create_driver_param(ctx, IR3_DP_WORKGROUP_ID_X + i);
2872          }
2873       }
2874       break;
2875    case nir_intrinsic_load_base_workgroup_id:
2876       for (int i = 0; i < dest_components; i++) {
2877          dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
2878       }
2879       create_rpt = true;
2880       break;
2881    case nir_intrinsic_load_num_workgroups:
2882       for (int i = 0; i < dest_components; i++) {
2883          dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
2884       }
2885       create_rpt = true;
2886       break;
2887    case nir_intrinsic_load_workgroup_size:
2888       for (int i = 0; i < dest_components; i++) {
2889          dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
2890       }
2891       create_rpt = true;
2892       break;
2893    case nir_intrinsic_load_subgroup_size: {
2894       assert(ctx->so->type == MESA_SHADER_COMPUTE ||
2895              ctx->so->type == MESA_SHADER_FRAGMENT);
2896       enum ir3_driver_param size = ctx->so->type == MESA_SHADER_COMPUTE ?
2897          IR3_DP_CS_SUBGROUP_SIZE : IR3_DP_FS_SUBGROUP_SIZE;
2898       dst[0] = create_driver_param(ctx, size);
2899       break;
2900    }
2901    case nir_intrinsic_load_subgroup_id_shift_ir3:
2902       dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
2903       break;
2904    case nir_intrinsic_load_work_dim:
2905       dst[0] = create_driver_param(ctx, IR3_DP_WORK_DIM);
2906       break;
2907    case nir_intrinsic_load_subgroup_invocation:
2908       assert(ctx->compiler->has_getfiberid);
2909       dst[0] = ir3_GETFIBERID(b);
2910       dst[0]->cat6.type = TYPE_U32;
2911       __ssa_dst(dst[0]);
2912       break;
2913    case nir_intrinsic_load_tess_level_outer_default:
2914       for (int i = 0; i < dest_components; i++) {
2915          dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_OUTER_LEVEL_X + i);
2916       }
2917       create_rpt = true;
2918       break;
2919    case nir_intrinsic_load_tess_level_inner_default:
2920       for (int i = 0; i < dest_components; i++) {
2921          dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_INNER_LEVEL_X + i);
2922       }
2923       create_rpt = true;
2924       break;
2925    case nir_intrinsic_load_frag_invocation_count:
2926       dst[0] = create_driver_param(ctx, IR3_DP_FS_FRAG_INVOCATION_COUNT);
2927       break;
2928    case nir_intrinsic_load_frag_size_ir3:
2929    case nir_intrinsic_load_frag_offset_ir3: {
2930       enum ir3_driver_param param =
2931          intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
2932          IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
2933       if (nir_src_is_const(intr->src[0])) {
2934          uint32_t view = nir_src_as_uint(intr->src[0]);
2935          for (int i = 0; i < dest_components; i++) {
2936             dst[i] = create_driver_param(ctx, param + 4 * view + i);
2937          }
2938          create_rpt = true;
2939       } else {
2940          struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
2941          for (int i = 0; i < dest_components; i++) {
2942             dst[i] = create_driver_param_indirect(ctx, param + i,
2943                                                   ir3_get_addr0(ctx, view, 4));
2944          }
2945          ctx->so->constlen =
2946             MAX2(ctx->so->constlen,
2947                  const_state->offsets.driver_param + param / 4 +
2948                  nir_intrinsic_range(intr));
2949       }
2950       break;
2951    }
2952    case nir_intrinsic_demote:
2953    case nir_intrinsic_demote_if:
2954    case nir_intrinsic_terminate:
2955    case nir_intrinsic_terminate_if: {
2956       struct ir3_instruction *cond, *kill;
2957 
2958       if (intr->intrinsic == nir_intrinsic_demote_if ||
2959           intr->intrinsic == nir_intrinsic_terminate_if) {
2960          /* conditional discard: */
2961          src = ir3_get_src(ctx, &intr->src[0]);
2962          cond = src[0];
2963       } else {
2964          /* unconditional discard: */
2965          cond = create_immed_typed(b, 1, ctx->compiler->bool_type);
2966       }
2967 
2968       /* NOTE: only cmps.*.* can write p0.x: */
2969       struct ir3_instruction *zero =
2970             create_immed_typed(b, 0, is_half(cond) ? TYPE_U16 : TYPE_U32);
2971       cond = ir3_CMPS_S(b, cond, 0, zero, 0);
2972       cond->cat2.condition = IR3_COND_NE;
2973 
2974       /* condition always goes in predicate register: */
2975       cond->dsts[0]->flags |= IR3_REG_PREDICATE;
2976 
2977       if (intr->intrinsic == nir_intrinsic_demote ||
2978           intr->intrinsic == nir_intrinsic_demote_if) {
2979          kill = ir3_DEMOTE(b, cond, 0);
2980       } else {
2981          kill = ir3_KILL(b, cond, 0);
2982       }
2983 
2984       /* - Side-effects should not be moved on a different side of the kill
2985        * - Instructions that depend on active fibers should not be reordered
2986        */
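      /* For instance (illustrative), an image or SSBO store that follows the
       * discard in the NIR must not be scheduled above the kill, and a
       * ballot/subgroup op must not move across it either, since the kill
       * changes which fibers are active.  The barrier classes below encode
       * those constraints.
       */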
2987       kill->barrier_class = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
2988                             IR3_BARRIER_ACTIVE_FIBERS_W;
2989       kill->barrier_conflict = IR3_BARRIER_IMAGE_W | IR3_BARRIER_BUFFER_W |
2990                                IR3_BARRIER_ACTIVE_FIBERS_R;
2991       kill->srcs[0]->flags |= IR3_REG_PREDICATE;
2992 
2993       array_insert(b, b->keeps, kill);
2994       ctx->so->has_kill = true;
2995 
2996       break;
2997    }
2998 
2999    case nir_intrinsic_vote_any:
3000    case nir_intrinsic_vote_all: {
3001       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3002       struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
3003       if (intr->intrinsic == nir_intrinsic_vote_any)
3004          dst[0] = ir3_ANY_MACRO(ctx->block, pred, 0);
3005       else
3006          dst[0] = ir3_ALL_MACRO(ctx->block, pred, 0);
3007       dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE;
3008       break;
3009    }
3010    case nir_intrinsic_elect:
3011       dst[0] = ir3_ELECT_MACRO(ctx->block);
3012       dst[0]->flags |= IR3_INSTR_NEEDS_HELPERS;
3013       break;
3014    case nir_intrinsic_elect_any_ir3:
3015       dst[0] = ir3_ELECT_MACRO(ctx->block);
3016       break;
3017    case nir_intrinsic_preamble_start_ir3:
3018       dst[0] = ir3_SHPS_MACRO(ctx->block);
3019       break;
3020 
3021    case nir_intrinsic_read_invocation_cond_ir3: {
3022       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3023       struct ir3_instruction *cond = ir3_get_src(ctx, &intr->src[1])[0];
3024       dst[0] = ir3_READ_COND_MACRO(ctx->block, ir3_get_predicate(ctx, cond), 0,
3025                                    src, 0);
3026       dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
3027       dst[0]->srcs[0]->flags |= IR3_REG_PREDICATE;
3028       /* Work around a bug with half-register shared -> non-shared moves by
3029        * adding an extra mov here so that the original destination stays full.
3030        */
3031       if (src->dsts[0]->flags & IR3_REG_HALF) {
3032          dst[0] = ir3_MOV(b, dst[0], TYPE_U32);
3033          if (!ctx->compiler->has_scalar_alu)
3034             dst[0]->dsts[0]->flags &= ~IR3_REG_SHARED;
3035       }
3036       break;
3037    }
3038 
3039    case nir_intrinsic_read_first_invocation: {
3040       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3041       dst[0] = ir3_READ_FIRST_MACRO(ctx->block, src, 0);
3042       dst[0]->dsts[0]->flags |= IR3_REG_SHARED;
3043       /* See above. */
3044       if (src->dsts[0]->flags & IR3_REG_HALF) {
3045          dst[0] = ir3_MOV(b, dst[0], TYPE_U32);
3046          if (!ctx->compiler->has_scalar_alu)
3047             dst[0]->dsts[0]->flags &= ~IR3_REG_SHARED;
3048       }
3049       break;
3050    }
3051 
3052    case nir_intrinsic_ballot: {
3053       struct ir3_instruction *ballot;
3054       unsigned components = intr->def.num_components;
3055       if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) {
3056          /* ballot(true) is just MOVMSK */
3057          ballot = ir3_MOVMSK(ctx->block, components);
3058       } else {
3059          struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3060          struct ir3_instruction *pred = ir3_get_predicate(ctx, src);
3061          ballot = ir3_BALLOT_MACRO(ctx->block, pred, components);
3062          ballot->srcs[0]->flags |= IR3_REG_PREDICATE;
3063       }
3064 
3065       ballot->barrier_class = IR3_BARRIER_ACTIVE_FIBERS_R;
3066       ballot->barrier_conflict = IR3_BARRIER_ACTIVE_FIBERS_W;
3067 
3068       ir3_split_dest(ctx->block, dst, ballot, 0, components);
3069       break;
3070    }
3071 
3072    case nir_intrinsic_quad_broadcast: {
3073       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3074       struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[1])[0];
3075 
3076       type_t dst_type = type_uint_size(intr->def.bit_size);
3077 
3078       if (dst_type != TYPE_U32)
3079          idx = ir3_COV(ctx->block, idx, TYPE_U32, dst_type);
3080 
3081       dst[0] = ir3_QUAD_SHUFFLE_BRCST(ctx->block, src, 0, idx, 0);
3082       dst[0]->cat5.type = dst_type;
3083       break;
3084    }
3085 
3086    case nir_intrinsic_quad_swap_horizontal: {
3087       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3088       dst[0] = ir3_QUAD_SHUFFLE_HORIZ(ctx->block, src, 0);
3089       dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3090       break;
3091    }
3092 
3093    case nir_intrinsic_quad_swap_vertical: {
3094       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3095       dst[0] = ir3_QUAD_SHUFFLE_VERT(ctx->block, src, 0);
3096       dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3097       break;
3098    }
3099 
3100    case nir_intrinsic_quad_swap_diagonal: {
3101       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3102       dst[0] = ir3_QUAD_SHUFFLE_DIAG(ctx->block, src, 0);
3103       dst[0]->cat5.type = type_uint_size(intr->def.bit_size);
3104       break;
3105    }
3106    case nir_intrinsic_ddx:
3107    case nir_intrinsic_ddx_coarse: {
3108       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3109       dst[0] = ir3_DSX(b, src, 0);
3110       dst[0]->cat5.type = TYPE_F32;
3111       break;
3112    }
3113    case nir_intrinsic_ddx_fine: {
3114       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3115       dst[0] = ir3_DSXPP_MACRO(b, src, 0);
3116       dst[0]->cat5.type = TYPE_F32;
3117       break;
3118    }
3119    case nir_intrinsic_ddy:
3120    case nir_intrinsic_ddy_coarse: {
3121       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3122       dst[0] = ir3_DSY(b, src, 0);
3123       dst[0]->cat5.type = TYPE_F32;
3124       break;
3125    }
3126    case nir_intrinsic_ddy_fine: {
3127       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
3128       dst[0] = ir3_DSYPP_MACRO(b, src, 0);
3129       dst[0]->cat5.type = TYPE_F32;
3130       break;
3131    }
3132    case nir_intrinsic_load_shared_ir3:
3133       emit_intrinsic_load_shared_ir3(ctx, intr, dst);
3134       break;
3135    case nir_intrinsic_store_shared_ir3:
3136       emit_intrinsic_store_shared_ir3(ctx, intr);
3137       break;
3138    case nir_intrinsic_bindless_resource_ir3:
3139       dst[0] = ir3_get_src(ctx, &intr->src[0])[0];
3140       break;
3141    case nir_intrinsic_global_atomic_ir3:
3142    case nir_intrinsic_global_atomic_swap_ir3: {
3143       dst[0] = ctx->funcs->emit_intrinsic_atomic_global(ctx, intr);
3144       break;
3145    }
3146 
3147    case nir_intrinsic_reduce:
3148    case nir_intrinsic_inclusive_scan:
3149    case nir_intrinsic_exclusive_scan:
3150       dst[0] = emit_intrinsic_reduce(ctx, intr);
3151       break;
3152 
3153    case nir_intrinsic_reduce_clusters_ir3:
3154    case nir_intrinsic_inclusive_scan_clusters_ir3:
3155    case nir_intrinsic_exclusive_scan_clusters_ir3:
3156       dst[0] = emit_intrinsic_reduce_clusters(ctx, intr);
3157       break;
3158 
3159    case nir_intrinsic_brcst_active_ir3:
3160       dst[0] = emit_intrinsic_brcst_active(ctx, intr);
3161       break;
3162 
3163    case nir_intrinsic_preamble_end_ir3: {
3164       struct ir3_instruction *instr = ir3_SHPE(ctx->block);
3165       instr->barrier_class = instr->barrier_conflict = IR3_BARRIER_CONST_W;
3166       array_insert(b, b->keeps, instr);
3167       break;
3168    }
3169    case nir_intrinsic_store_const_ir3: {
3170       unsigned components = nir_src_num_components(intr->src[0]);
3171       unsigned dst = nir_intrinsic_base(intr);
3172       unsigned dst_lo = dst & 0xff;
3173       unsigned dst_hi = dst >> 8;
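      /* Rough example: a base of 0x123 splits into dst_lo = 0x23 and
       * dst_hi = 0x1; the stc encodes only the low 8 bits directly and the
       * high bits, if any, go through a1.x below.
       */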
3174 
3175       struct ir3_instruction *src =
3176          ir3_create_collect(b, ir3_get_src_shared(ctx, &intr->src[0],
3177                                                   ctx->compiler->has_scalar_alu),
3178                             components);
3179       struct ir3_instruction *a1 = NULL;
3180       if (dst_hi) {
3181          /* Encode only the high part of the destination in a1.x to increase the
3182           * chance that we can reuse the a1.x value in subsequent stc
3183           * instructions.
3184           */
3185          a1 = ir3_get_addr1(ctx, dst_hi << 8);
3186       }
3187 
3188       struct ir3_instruction *stc =
3189          ir3_STC(ctx->block, create_immed(b, dst_lo),  0, src, 0);
3190       stc->cat6.iim_val = components;
3191       stc->cat6.type = TYPE_U32;
3192       stc->barrier_conflict = IR3_BARRIER_CONST_W;
3193       if (a1) {
3194          ir3_instr_set_address(stc, a1);
3195          stc->flags |= IR3_INSTR_A1EN;
3196       }
3197       /* The assembler isn't aware of what value a1.x has, so make sure that
3198        * constlen includes the stc here.
3199        */
3200       ctx->so->constlen =
3201          MAX2(ctx->so->constlen, DIV_ROUND_UP(dst + components, 4));
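      /* e.g. (illustrative) dst = 18 with 4 components writes const words
       * 18..21, so constlen has to cover DIV_ROUND_UP(22, 4) = 6 vec4s.
       */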
3202       array_insert(b, b->keeps, stc);
3203       break;
3204    }
3205    case nir_intrinsic_copy_push_const_to_uniform_ir3: {
3206       struct ir3_instruction *load =
3207          ir3_instr_create(ctx->block, OPC_PUSH_CONSTS_LOAD_MACRO, 0, 0);
3208       array_insert(b, b->keeps, load);
3209 
3210       load->push_consts.dst_base = nir_src_as_uint(intr->src[0]);
3211       load->push_consts.src_base = nir_intrinsic_base(intr);
3212       load->push_consts.src_size = nir_intrinsic_range(intr);
3213 
3214       ctx->so->constlen =
3215          MAX2(ctx->so->constlen,
3216               DIV_ROUND_UP(
3217                  load->push_consts.dst_base + load->push_consts.src_size, 4));
3218       break;
3219    }
3220    case nir_intrinsic_prefetch_sam_ir3: {
3221       struct tex_src_info info =
3222          get_bindless_samp_src(ctx, &intr->src[0], &intr->src[1]);
3223       struct ir3_instruction *sam =
3224          emit_sam(ctx, OPC_SAM, info, TYPE_F32, 0b1111, NULL, NULL);
3225 
3226       sam->dsts_count = 0;
3227       array_insert(ctx->block, ctx->block->keeps, sam);
3228       break;
3229    }
3230    case nir_intrinsic_prefetch_tex_ir3: {
3231       struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
3232       struct ir3_instruction *resinfo = ir3_RESINFO(b, idx, 0);
3233       resinfo->cat6.iim_val = 1;
3234       resinfo->cat6.d = 1;
3235       resinfo->cat6.type = TYPE_U32;
3236       resinfo->cat6.typed = false;
3237 
3238       ir3_handle_bindless_cat6(resinfo, intr->src[0]);
3239       if (resinfo->flags & IR3_INSTR_B)
3240          ctx->so->bindless_tex = true;
3241 
3242       resinfo->dsts_count = 0;
3243       array_insert(ctx->block, ctx->block->keeps, resinfo);
3244       break;
3245    }
3246    case nir_intrinsic_prefetch_ubo_ir3: {
3247       struct ir3_instruction *offset = create_immed(ctx->block, 0);
3248       struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
3249       struct ir3_instruction *ldc = ir3_LDC(b, idx, 0, offset, 0);
3250       ldc->cat6.iim_val = 1;
3251       ldc->cat6.type = TYPE_U32;
3252 
3253       ir3_handle_bindless_cat6(ldc, intr->src[0]);
3254       if (ldc->flags & IR3_INSTR_B)
3255          ctx->so->bindless_ubo = true;
3256 
3257       ldc->dsts_count = 0;
3258       array_insert(ctx->block, ctx->block->keeps, ldc);
3259       break;
3260    }
3261    default:
3262       ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
3263                         nir_intrinsic_infos[intr->intrinsic].name);
3264       break;
3265    }
3266 
3267    if (info->has_dest) {
3268       if (create_rpt)
3269          ir3_instr_create_rpt(dst, dest_components);
3270       ir3_put_def(ctx, &intr->def);
3271    }
3272 }
3273 
3274 static void
3275 emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
3276 {
3277    struct ir3_instruction **dst =
3278       ir3_get_dst_ssa(ctx, &instr->def, instr->def.num_components);
3279    unsigned bit_size = ir3_bitsize(ctx, instr->def.bit_size);
3280 
3281    if (bit_size <= 8) {
3282       for (int i = 0; i < instr->def.num_components; i++)
3283          dst[i] = create_immed_typed(ctx->block, instr->value[i].u8, TYPE_U8);
3284    } else if (bit_size <= 16) {
3285       for (int i = 0; i < instr->def.num_components; i++)
3286          dst[i] = create_immed_typed(ctx->block, instr->value[i].u16, TYPE_U16);
3287    } else {
3288       for (int i = 0; i < instr->def.num_components; i++)
3289          dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, TYPE_U32);
3290    }
3291 }
3292 
3293 static void
3294 emit_undef(struct ir3_context *ctx, nir_undef_instr *undef)
3295 {
3296    struct ir3_instruction **dst =
3297       ir3_get_dst_ssa(ctx, &undef->def, undef->def.num_components);
3298    type_t type = utype_for_size(ir3_bitsize(ctx, undef->def.bit_size));
3299 
3300    /* backend doesn't want undefined instructions, so just plug
3301     * in 0.0..
3302     */
3303    for (int i = 0; i < undef->def.num_components; i++)
3304       dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
3305 }
3306 
3307 /*
3308  * texture fetch/sample instructions:
3309  */
3310 
3311 static type_t
3312 get_tex_dest_type(nir_tex_instr *tex)
3313 {
3314    type_t type;
3315 
3316    switch (tex->dest_type) {
3317    case nir_type_float32:
3318       return TYPE_F32;
3319    case nir_type_float16:
3320       return TYPE_F16;
3321    case nir_type_int32:
3322       return TYPE_S32;
3323    case nir_type_int16:
3324       return TYPE_S16;
3325    case nir_type_bool32:
3326    case nir_type_uint32:
3327       return TYPE_U32;
3328    case nir_type_bool16:
3329    case nir_type_uint16:
3330       return TYPE_U16;
3331    case nir_type_invalid:
3332    default:
3333       unreachable("bad dest_type");
3334    }
3335 
3336    return type;
3337 }
3338 
3339 static void
3340 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
3341 {
3342    unsigned coords =
3343       glsl_get_sampler_dim_coordinate_components(tex->sampler_dim);
3344    unsigned flags = 0;
3345 
3346    /* note: would use tex->coord_components.. except txs.. also,
3347     * since array index goes after shadow ref, we don't want to
3348     * count it:
3349     */
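   /* e.g. (illustrative) a shadow sample from a cube array would get
    * coords == 3 (hence IR3_INSTR_3D) plus the S and A flags below.
    */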
3350    if (coords == 3)
3351       flags |= IR3_INSTR_3D;
3352 
3353    if (tex->is_shadow && tex->op != nir_texop_lod)
3354       flags |= IR3_INSTR_S;
3355 
3356    if (tex->is_array && tex->op != nir_texop_lod)
3357       flags |= IR3_INSTR_A;
3358 
3359    *flagsp = flags;
3360    *coordsp = coords;
3361 }
3362 
3363 /* Gets the sampler/texture idx as a hvec2, which could either be dynamic
3364  * or immediate (in which case it will get lowered later to a non-.s2en
3365  * version of the tex instruction which encodes tex/samp as immediates).
3366  */
3367 static struct tex_src_info
3368 get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
3369 {
3370    struct ir3_block *b = ctx->block;
3371    struct tex_src_info info = {0};
3372    int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
3373    int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
3374    struct ir3_instruction *texture, *sampler;
3375 
3376    if (texture_idx >= 0 || sampler_idx >= 0) {
3377       /* Bindless case */
3378       info = get_bindless_samp_src(ctx,
3379                                    texture_idx >= 0 ? &tex->src[texture_idx].src : NULL,
3380                                    sampler_idx >= 0 ? &tex->src[sampler_idx].src : NULL);
3381 
3382       if (tex->texture_non_uniform || tex->sampler_non_uniform)
3383          info.flags |= IR3_INSTR_NONUNIF;
3384    } else {
3385       info.flags |= IR3_INSTR_S2EN;
3386       texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
3387       sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
3388       if (texture_idx >= 0) {
3389          texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
3390          texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
3391       } else {
3392          /* TODO what to do for dynamic case? I guess we only need the
3393           * max index for astc srgb workaround so maybe not a problem
3394           * to worry about if we don't enable indirect samplers for
3395           * a4xx?
3396           */
3397          ctx->max_texture_index =
3398             MAX2(ctx->max_texture_index, tex->texture_index);
3399          texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
3400          info.tex_idx = tex->texture_index;
3401       }
3402 
3403       if (sampler_idx >= 0) {
3404          sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
3405          sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
3406       } else {
3407          sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
3408          info.samp_idx = tex->texture_index;
3409       }
3410 
3411       info.samp_tex = ir3_collect(b, sampler, texture);
3412    }
3413 
3414    return info;
3415 }
3416 
3417 static void
3418 emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
3419 {
3420    struct ir3_block *b = ctx->block;
3421    struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
3422    struct ir3_instruction *const *coord, *const *off, *const *ddx, *const *ddy;
3423    struct ir3_instruction *lod, *compare, *proj, *sample_index;
3424    struct tex_src_info info = {0};
3425    bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
3426    unsigned i, coords, flags, ncomp;
3427    unsigned nsrc0 = 0, nsrc1 = 0;
3428    type_t type;
3429    opc_t opc = 0;
3430 
3431    ncomp = tex->def.num_components;
3432 
3433    coord = off = ddx = ddy = NULL;
3434    lod = proj = compare = sample_index = NULL;
3435 
3436    dst = ir3_get_def(ctx, &tex->def, ncomp);
3437 
3438    for (unsigned i = 0; i < tex->num_srcs; i++) {
3439       switch (tex->src[i].src_type) {
3440       case nir_tex_src_coord:
3441          coord = ir3_get_src(ctx, &tex->src[i].src);
3442          break;
3443       case nir_tex_src_bias:
3444          lod = ir3_get_src(ctx, &tex->src[i].src)[0];
3445          has_bias = true;
3446          break;
3447       case nir_tex_src_lod:
3448          lod = ir3_get_src(ctx, &tex->src[i].src)[0];
3449          has_lod = true;
3450          break;
3451       case nir_tex_src_comparator: /* shadow comparator */
3452          compare = ir3_get_src(ctx, &tex->src[i].src)[0];
3453          break;
3454       case nir_tex_src_projector:
3455          proj = ir3_get_src(ctx, &tex->src[i].src)[0];
3456          has_proj = true;
3457          break;
3458       case nir_tex_src_offset:
3459          off = ir3_get_src(ctx, &tex->src[i].src);
3460          has_off = true;
3461          break;
3462       case nir_tex_src_ddx:
3463          ddx = ir3_get_src(ctx, &tex->src[i].src);
3464          break;
3465       case nir_tex_src_ddy:
3466          ddy = ir3_get_src(ctx, &tex->src[i].src);
3467          break;
3468       case nir_tex_src_ms_index:
3469          sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
3470          break;
3471       case nir_tex_src_texture_offset:
3472       case nir_tex_src_sampler_offset:
3473       case nir_tex_src_texture_handle:
3474       case nir_tex_src_sampler_handle:
3475          /* handled in get_tex_samp_tex_src() */
3476          break;
3477       default:
3478          ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
3479                            tex->src[i].src_type);
3480          return;
3481       }
3482    }
3483 
3484    switch (tex->op) {
3485    case nir_texop_tex_prefetch:
3486       compile_assert(ctx, !has_bias);
3487       compile_assert(ctx, !has_lod);
3488       compile_assert(ctx, !compare);
3489       compile_assert(ctx, !has_proj);
3490       compile_assert(ctx, !has_off);
3491       compile_assert(ctx, !ddx);
3492       compile_assert(ctx, !ddy);
3493       compile_assert(ctx, !sample_index);
3494       compile_assert(
3495          ctx, nir_tex_instr_src_index(tex, nir_tex_src_texture_offset) < 0);
3496       compile_assert(
3497          ctx, nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset) < 0);
3498 
3499       if (ctx->so->num_sampler_prefetch < ctx->prefetch_limit) {
3500          opc = OPC_META_TEX_PREFETCH;
3501          ctx->so->num_sampler_prefetch++;
3502          break;
3503       }
3504       FALLTHROUGH;
3505    case nir_texop_tex:
3506       opc = has_lod ? OPC_SAML : OPC_SAM;
3507       break;
3508    case nir_texop_txb:
3509       opc = OPC_SAMB;
3510       break;
3511    case nir_texop_txl:
3512       opc = OPC_SAML;
3513       break;
3514    case nir_texop_txd:
3515       opc = OPC_SAMGQ;
3516       break;
3517    case nir_texop_txf:
3518       opc = OPC_ISAML;
3519       break;
3520    case nir_texop_lod:
3521       opc = OPC_GETLOD;
3522       break;
3523    case nir_texop_tg4:
3524       switch (tex->component) {
3525       case 0:
3526          opc = OPC_GATHER4R;
3527          break;
3528       case 1:
3529          opc = OPC_GATHER4G;
3530          break;
3531       case 2:
3532          opc = OPC_GATHER4B;
3533          break;
3534       case 3:
3535          opc = OPC_GATHER4A;
3536          break;
3537       }
3538       break;
3539    case nir_texop_txf_ms_fb:
3540    case nir_texop_txf_ms:
3541       opc = OPC_ISAMM;
3542       break;
3543    default:
3544       ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
3545       return;
3546    }
3547 
3548    tex_info(tex, &flags, &coords);
3549 
3550    /*
3551     * lay out the first argument in the proper order:
3552     *  - actual coordinates first
3553     *  - shadow reference
3554     *  - array index
3555     *  - projection w
3556     *  - starting at offset 4, dpdx.xy, dpdy.xy
3557     *
3558     * bias/lod go into the second arg
3559     */
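   /* Illustrative example: a 2D array shadow sample with a bias ends up
    * with src0 = { x, y, compare, array_index } and src1 = { bias }, while
    * a txd on a plain 2D texture pads src0 to 4 and then appends dpdx.xy
    * and dpdy.xy.
    */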
3560 
3561    /* insert tex coords: */
3562    for (i = 0; i < coords; i++)
3563       src0[i] = coord[i];
3564 
3565    nsrc0 = i;
3566 
3567    type_t coord_pad_type = is_half(coord[0]) ? TYPE_U16 : TYPE_U32;
3568    /* scale up integer coords for TXF based on the LOD */
3569    if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
3570       assert(has_lod);
3571       for (i = 0; i < coords; i++)
3572          src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
3573    }
3574 
3575    if (coords == 1) {
3576       /* hw doesn't do 1d, so we treat it as 2d with
3577        * height of 1, and patch up the y coord.
3578        */
3579       if (is_isam(opc)) {
3580          src0[nsrc0++] = create_immed_typed(b, 0, coord_pad_type);
3581       } else if (is_half(coord[0])) {
3582          src0[nsrc0++] = create_immed_typed(b, _mesa_float_to_half(0.5), coord_pad_type);
3583       } else {
3584          src0[nsrc0++] = create_immed_typed(b, fui(0.5), coord_pad_type);
3585       }
3586    }
3587 
3588    if (tex->is_shadow && tex->op != nir_texop_lod)
3589       src0[nsrc0++] = compare;
3590 
3591    if (tex->is_array && tex->op != nir_texop_lod)
3592       src0[nsrc0++] = coord[coords];
3593 
3594    if (has_proj) {
3595       src0[nsrc0++] = proj;
3596       flags |= IR3_INSTR_P;
3597    }
3598 
3599    /* pad to 4, then ddx/ddy: */
3600    if (tex->op == nir_texop_txd) {
3601       while (nsrc0 < 4)
3602          src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3603       for (i = 0; i < coords; i++)
3604          src0[nsrc0++] = ddx[i];
3605       if (coords < 2)
3606          src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3607       for (i = 0; i < coords; i++)
3608          src0[nsrc0++] = ddy[i];
3609       if (coords < 2)
3610          src0[nsrc0++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3611    }
3612 
3613    /* NOTE a3xx (and possibly a4xx?) might be different, using isaml
3614     * with scaled x coord according to requested sample:
3615     */
3616    if (opc == OPC_ISAMM) {
3617       if (ctx->compiler->txf_ms_with_isaml) {
3618          /* the samples are laid out in x dimension as
3619           *     0 1 2 3
3620           * x_ms = (x << ms) + sample_index;
3621           */
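         /* Rough example, assuming ms holds log2 of the sample count: for a
          * 4x MSAA surface ms is 2, so texel x = 5 with sample_index = 3
          * maps to x_ms = (5 << 2) + 3 = 23.
          */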
3622          struct ir3_instruction *ms;
3623          ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
3624 
3625          src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
3626          src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
3627 
3628          opc = OPC_ISAML;
3629       } else {
3630          src0[nsrc0++] = sample_index;
3631       }
3632    }
3633 
3634    /*
3635     * second argument (if applicable):
3636     *  - offsets
3637     *  - lod
3638     *  - bias
3639     */
3640    if (has_off | has_lod | has_bias) {
3641       if (has_off) {
3642          unsigned off_coords = coords;
3643          if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
3644             off_coords--;
3645          for (i = 0; i < off_coords; i++)
3646             src1[nsrc1++] = off[i];
3647          if (off_coords < 2)
3648             src1[nsrc1++] = create_immed_typed(b, fui(0.0), coord_pad_type);
3649          flags |= IR3_INSTR_O;
3650       }
3651 
3652       if (has_lod | has_bias)
3653          src1[nsrc1++] = lod;
3654    }
3655 
3656    type = get_tex_dest_type(tex);
3657 
3658    if (opc == OPC_GETLOD)
3659       type = TYPE_S32;
3660 
3661    if (tex->op == nir_texop_txf_ms_fb) {
3662       compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);
3663 
3664       ctx->so->fb_read = true;
3665       if (ctx->compiler->options.bindless_fb_read_descriptor >= 0) {
3666          ctx->so->bindless_tex = true;
3667          info.flags = IR3_INSTR_B;
3668          info.base = ctx->compiler->options.bindless_fb_read_descriptor;
3669          struct ir3_instruction *texture, *sampler;
3670 
3671          int base_index =
3672             nir_tex_instr_src_index(tex, nir_tex_src_texture_handle);
3673          nir_src tex_src = tex->src[base_index].src;
3674 
3675          if (nir_src_is_const(tex_src)) {
3676             texture = create_immed_typed(b,
3677                nir_src_as_uint(tex_src) + ctx->compiler->options.bindless_fb_read_slot,
3678                TYPE_U32);
3679          } else {
3680             texture = create_immed_typed(
3681                ctx->block, ctx->compiler->options.bindless_fb_read_slot, TYPE_U32);
3682             struct ir3_instruction *base =
3683                ir3_get_src(ctx, &tex->src[base_index].src)[0];
3684             texture = ir3_ADD_U(b, texture, 0, base, 0);
3685          }
3686          sampler = create_immed_typed(ctx->block, 0, TYPE_U32);
3687          info.samp_tex = ir3_collect(b, texture, sampler);
3688          info.flags |= IR3_INSTR_S2EN;
3689          if (tex->texture_non_uniform) {
3690             info.flags |= IR3_INSTR_NONUNIF;
3691          }
3692       } else {
3693          /* Otherwise append a sampler to be patched into the texture
3694           * state:
3695           */
3696          info.samp_tex = ir3_collect(
3697                b, create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
3698                create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16));
3699          info.flags = IR3_INSTR_S2EN;
3700       }
3701 
3702       ctx->so->num_samp++;
3703    } else {
3704       info = get_tex_samp_tex_src(ctx, tex);
3705    }
3706 
3707    bool tg4_swizzle_fixup = false;
3708    if (tex->op == nir_texop_tg4 && ctx->compiler->gen == 4 &&
3709          ctx->sampler_swizzles[tex->texture_index] != 0x688 /* rgba */) {
3710       uint16_t swizzles = ctx->sampler_swizzles[tex->texture_index];
3711       uint16_t swizzle = (swizzles >> (tex->component * 3)) & 7;
3712       if (swizzle > 3) {
3713          /* this would mean that we can just return 0 / 1, no texturing
3714           * necessary
3715           */
3716          struct ir3_instruction *imm = create_immed(b,
3717                type_float(type) ? fui(swizzle - 4) : (swizzle - 4));
3718          for (int i = 0; i < 4; i++)
3719             dst[i] = imm;
3720          ir3_put_def(ctx, &tex->def);
3721          return;
3722       }
3723       opc = OPC_GATHER4R + swizzle;
3724       tg4_swizzle_fixup = true;
3725    }
3726 
3727    struct ir3_instruction *col0 = ir3_create_collect(b, src0, nsrc0);
3728    struct ir3_instruction *col1 = ir3_create_collect(b, src1, nsrc1);
3729 
3730    if (opc == OPC_META_TEX_PREFETCH) {
3731       int idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
3732 
3733 
3734       sam = ir3_SAM(ctx->in_block, opc, type, MASK(ncomp), 0, NULL,
3735                     get_barycentric(ctx, IJ_PERSP_PIXEL), 0);
3736       sam->prefetch.input_offset = ir3_nir_coord_offset(tex->src[idx].src.ssa);
3737       /* make sure not to add irrelevant flags like S2EN */
3738       sam->flags = flags | (info.flags & IR3_INSTR_B);
3739       sam->prefetch.tex = info.tex_idx;
3740       sam->prefetch.samp = info.samp_idx;
3741       sam->prefetch.tex_base = info.tex_base;
3742       sam->prefetch.samp_base = info.samp_base;
3743    } else {
3744       info.flags |= flags;
3745       sam = emit_sam(ctx, opc, info, type, MASK(ncomp), col0, col1);
3746    }
3747 
3748    if (tg4_swizzle_fixup) {
3749       /* TODO: fix-up for ASTC when alpha is selected? */
3750       array_insert(ctx->ir, ctx->ir->tg4, sam);
3751 
3752       ir3_split_dest(b, dst, sam, 0, 4);
3753 
3754       uint8_t tex_bits = ctx->sampler_swizzles[tex->texture_index] >> 12;
3755       if (!type_float(type) && tex_bits != 3 /* 32bpp */ &&
3756             tex_bits != 0 /* key unset */) {
3757          uint8_t bits = 0;
3758          switch (tex_bits) {
3759          case 1: /* 8bpp */
3760             bits = 8;
3761             break;
3762          case 2: /* 16bpp */
3763             bits = 16;
3764             break;
3765          case 4: /* 10bpp or 2bpp for alpha */
3766             if (opc == OPC_GATHER4A)
3767                bits = 2;
3768             else
3769                bits = 10;
3770             break;
3771          default:
3772             assert(0);
3773          }
3774 
3775          sam->cat5.type = TYPE_F32;
3776          for (int i = 0; i < 4; i++) {
3777             /* scale and offset the unorm data */
3778             dst[i] = ir3_MAD_F32(b, dst[i], 0, create_immed(b, fui((1 << bits) - 1)), 0, create_immed(b, fui(0.5f)), 0);
3779             /* convert the scaled value to integer */
3780             dst[i] = ir3_COV(b, dst[i], TYPE_F32, TYPE_U32);
3781             /* sign extend for signed values */
3782             if (type == TYPE_S32) {
3783                dst[i] = ir3_SHL_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3784                dst[i] = ir3_ASHR_B(b, dst[i], 0, create_immed(b, 32 - bits), 0);
3785             }
3786          }
3787       }
3788    } else if ((ctx->astc_srgb & (1 << tex->texture_index)) &&
3789        tex->op != nir_texop_tg4 && /* leave out tg4, unless it's on alpha? */
3790        !nir_tex_instr_is_query(tex)) {
3791       assert(opc != OPC_META_TEX_PREFETCH);
3792 
3793       /* only need first 3 components: */
3794       sam->dsts[0]->wrmask = 0x7;
3795       ir3_split_dest(b, dst, sam, 0, 3);
3796 
3797       /* we need to sample the alpha separately with a non-SRGB
3798        * texture state:
3799        */
3800       sam = ir3_SAM(b, opc, type, 0b1000, flags | info.flags, info.samp_tex,
3801                     col0, col1);
3802 
3803       array_insert(ctx->ir, ctx->ir->astc_srgb, sam);
3804 
3805       /* fixup .w component: */
3806       ir3_split_dest(b, &dst[3], sam, 3, 1);
3807    } else {
3808       /* normal (non-workaround) case: */
3809       ir3_split_dest(b, dst, sam, 0, ncomp);
3810    }
3811 
3812    /* GETLOD returns results in 4.8 fixed point */
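   /* e.g. (illustrative) a raw getlod value of 0x180 (384) corresponds to a
    * LOD of 384 / 256 = 1.5, which the multiply by 1.0/256 below recovers.
    */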
3813    if (opc == OPC_GETLOD) {
3814       bool half = tex->def.bit_size == 16;
3815       struct ir3_instruction *factor =
3816          half ? create_immed_typed(b, _mesa_float_to_half(1.0 / 256), TYPE_F16)
3817               : create_immed(b, fui(1.0 / 256));
3818 
3819       for (i = 0; i < 2; i++) {
3820          dst[i] = ir3_MUL_F(
3821             b, ir3_COV(b, dst[i], TYPE_S32, half ? TYPE_F16 : TYPE_F32), 0,
3822             factor, 0);
3823       }
3824    }
3825 
3826    ir3_put_def(ctx, &tex->def);
3827 }
3828 
3829 static void
3830 emit_tex_info(struct ir3_context *ctx, nir_tex_instr *tex, unsigned idx)
3831 {
3832    struct ir3_block *b = ctx->block;
3833    struct ir3_instruction **dst, *sam;
3834    type_t dst_type = get_tex_dest_type(tex);
3835    struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
3836 
3837    dst = ir3_get_def(ctx, &tex->def, 1);
3838 
3839    sam = emit_sam(ctx, OPC_GETINFO, info, dst_type, 1 << idx, NULL, NULL);
3840 
3841    /* even though there is only one component, since it ends
3842     * up in .y/.z/.w rather than .x, we need a split_dest()
3843     */
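   /* e.g. query_levels uses idx == 2 (.z) and texture_samples uses
    * idx == 3 (.w); see the dispatch in emit_instr() below.
    */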
3844    ir3_split_dest(b, dst, sam, idx, 1);
3845 
3846    /* The # of levels comes from getinfo.z. We need to add 1 to it, since
3847     * the value in TEX_CONST_0 is zero-based.
3848     */
3849    if (ctx->compiler->levels_add_one)
3850       dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
3851 
3852    ir3_put_def(ctx, &tex->def);
3853 }
3854 
3855 static void
3856 emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
3857 {
3858    struct ir3_block *b = ctx->block;
3859    struct ir3_instruction **dst, *sam;
3860    struct ir3_instruction *lod;
3861    unsigned flags, coords;
3862    type_t dst_type = get_tex_dest_type(tex);
3863    struct tex_src_info info = get_tex_samp_tex_src(ctx, tex);
3864 
3865    tex_info(tex, &flags, &coords);
3866    info.flags |= flags;
3867 
3868    /* Actually we want the number of dimensions, not coordinates. This
3869     * distinction only matters for cubes.
3870     */
3871    if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
3872       coords = 2;
3873 
3874    dst = ir3_get_def(ctx, &tex->def, 4);
3875 
3876    int lod_idx = nir_tex_instr_src_index(tex, nir_tex_src_lod);
3877    compile_assert(ctx, lod_idx >= 0);
3878 
3879    lod = ir3_get_src(ctx, &tex->src[lod_idx].src)[0];
3880 
3881    if (tex->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
3882       sam = emit_sam(ctx, OPC_GETSIZE, info, dst_type, 0b1111, lod, NULL);
3883    } else {
3884       /*
3885        * The maximum value which OPC_GETSIZE could return for one dimension
3886        * is 0x007ff0, however a sampler buffer could be much bigger.
3887        * Blob uses OPC_GETBUF for them.
3888        */
3889       sam = emit_sam(ctx, OPC_GETBUF, info, dst_type, 0b1111, NULL, NULL);
3890    }
3891 
3892    ir3_split_dest(b, dst, sam, 0, 4);
3893 
3894    /* Array size actually ends up in .w rather than .z. This doesn't
3895     * matter for miplevel 0, but for higher mips the value in z is
3896     * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
3897     * returned, which means that we have to add 1 to it for arrays.
3898     */
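   /* e.g. (illustrative) for a 2D array, dst[2] (the layer count) is taken
    * from .w below rather than from the minified .z.
    */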
3899    if (tex->is_array) {
3900       if (ctx->compiler->levels_add_one) {
3901          dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
3902       } else {
3903          dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
3904       }
3905    }
3906 
3907    ir3_put_def(ctx, &tex->def);
3908 }
3909 
3910 /* phi instructions are left partially constructed.  We don't resolve
3911  * their srcs until the end of the shader, since (eg. loops) one of
3912  * the phi's srcs might be defined after the phi due to back edges in
3913  * the CFG.
3914  */
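/* A small illustrative case: the phi for a loop counter at the loop header
 * has one src from the preheader and one from the latch, and the latch
 * value does not exist yet when the header is emitted.  So emit_phi() only
 * creates the destinations, and resolve_phis() fills in the srcs once all
 * blocks have been emitted.
 */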
3915 static void
3916 emit_phi(struct ir3_context *ctx, nir_phi_instr *nphi)
3917 {
3918    struct ir3_instruction *phi, **dst;
3919 
3920    unsigned num_components = nphi->def.num_components;
3921    dst = ir3_get_def(ctx, &nphi->def, num_components);
3922 
3923    if (exec_list_is_singular(&nphi->srcs)) {
3924       nir_phi_src *src = list_entry(exec_list_get_head(&nphi->srcs),
3925                                     nir_phi_src, node);
3926       if (nphi->def.divergent == src->src.ssa->divergent) {
3927          struct ir3_instruction *const *srcs =
3928             ir3_get_src_maybe_shared(ctx, &src->src);
3929          memcpy(dst, srcs, num_components * sizeof(struct ir3_instruction *));
3930          ir3_put_def(ctx, &nphi->def);
3931          return;
3932       }
3933    }
3934 
3935    for (unsigned i = 0; i < num_components; i++) {
3936       phi = ir3_instr_create(ctx->block, OPC_META_PHI, 1,
3937                              exec_list_length(&nphi->srcs));
3938       __ssa_dst(phi);
3939       phi->phi.nphi = nphi;
3940       phi->phi.comp = i;
3941 
3942       if (ctx->compiler->has_scalar_alu && !nphi->def.divergent)
3943          phi->dsts[0]->flags |= IR3_REG_SHARED;
3944 
3945       dst[i] = phi;
3946    }
3947 
3948    ir3_put_def(ctx, &nphi->def);
3949 }
3950 
3951 static struct ir3_block *get_block(struct ir3_context *ctx,
3952                                    const nir_block *nblock);
3953 
3954 static struct ir3_instruction *
3955 read_phi_src(struct ir3_context *ctx, struct ir3_block *blk,
3956              struct ir3_instruction *phi, nir_phi_instr *nphi)
3957 {
3958    if (!blk->nblock) {
3959       struct ir3_instruction *continue_phi =
3960          ir3_instr_create(blk, OPC_META_PHI, 1, blk->predecessors_count);
3961       __ssa_dst(continue_phi)->flags = phi->dsts[0]->flags;
3962 
3963       for (unsigned i = 0; i < blk->predecessors_count; i++) {
3964          struct ir3_instruction *src =
3965             read_phi_src(ctx, blk->predecessors[i], phi, nphi);
3966          if (src)
3967             __ssa_src(continue_phi, src, 0);
3968          else
3969             ir3_src_create(continue_phi, INVALID_REG, phi->dsts[0]->flags);
3970       }
3971 
3972       return continue_phi;
3973    }
3974 
3975    nir_foreach_phi_src (nsrc, nphi) {
3976       if (blk->nblock == nsrc->pred) {
3977          if (nsrc->src.ssa->parent_instr->type == nir_instr_type_undef) {
3978             /* Create an ir3 undef */
3979             return NULL;
3980          } else {
3981             /* We need to insert the move at the end of the block */
3982             struct ir3_block *old_block = ctx->block;
3983             ctx->block = blk;
3984             struct ir3_instruction *src = ir3_get_src_shared(
3985                ctx, &nsrc->src,
3986                phi->dsts[0]->flags & IR3_REG_SHARED)[phi->phi.comp];
3987             ctx->block = old_block;
3988             return src;
3989          }
3990       }
3991    }
3992 
3993    unreachable("couldn't find phi node ir3 block");
3994    return NULL;
3995 }
3996 
3997 static void
3998 resolve_phis(struct ir3_context *ctx, struct ir3_block *block)
3999 {
4000    foreach_instr (phi, &block->instr_list) {
4001       if (phi->opc != OPC_META_PHI)
4002          break;
4003 
4004       nir_phi_instr *nphi = phi->phi.nphi;
4005 
4006       if (!nphi) /* skip continue phis created above */
4007          continue;
4008 
4009       for (unsigned i = 0; i < block->predecessors_count; i++) {
4010          struct ir3_block *pred = block->predecessors[i];
4011          struct ir3_instruction *src = read_phi_src(ctx, pred, phi, nphi);
4012          if (src) {
4013             __ssa_src(phi, src, 0);
4014          } else {
4015             /* Create an ir3 undef */
4016             ir3_src_create(phi, INVALID_REG, phi->dsts[0]->flags);
4017          }
4018       }
4019    }
4020 }
4021 
4022 static void
4023 emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
4024 {
4025    switch (jump->type) {
4026    case nir_jump_break:
4027    case nir_jump_continue:
4028    case nir_jump_return:
4029       /* I *think* we can simply just ignore this, and use the
4030        * successor block link to figure out where we need to
4031        * jump to for break/continue
4032        */
4033       break;
4034    default:
4035       ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
4036       break;
4037    }
4038 }
4039 
4040 static void
4041 emit_instr(struct ir3_context *ctx, nir_instr *instr)
4042 {
4043    switch (instr->type) {
4044    case nir_instr_type_alu:
4045       emit_alu(ctx, nir_instr_as_alu(instr));
4046       break;
4047    case nir_instr_type_deref:
4048       /* ignored, handled as part of the intrinsic they are src to */
4049       break;
4050    case nir_instr_type_intrinsic:
4051       emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
4052       break;
4053    case nir_instr_type_load_const:
4054       emit_load_const(ctx, nir_instr_as_load_const(instr));
4055       break;
4056    case nir_instr_type_undef:
4057       emit_undef(ctx, nir_instr_as_undef(instr));
4058       break;
4059    case nir_instr_type_tex: {
4060       nir_tex_instr *tex = nir_instr_as_tex(instr);
4061       /* couple tex instructions get special-cased:
4062        */
4063       switch (tex->op) {
4064       case nir_texop_txs:
4065          emit_tex_txs(ctx, tex);
4066          break;
4067       case nir_texop_query_levels:
4068          emit_tex_info(ctx, tex, 2);
4069          break;
4070       case nir_texop_texture_samples:
4071          emit_tex_info(ctx, tex, 3);
4072          break;
4073       default:
4074          emit_tex(ctx, tex);
4075          break;
4076       }
4077       break;
4078    }
4079    case nir_instr_type_jump:
4080       emit_jump(ctx, nir_instr_as_jump(instr));
4081       break;
4082    case nir_instr_type_phi:
4083       emit_phi(ctx, nir_instr_as_phi(instr));
4084       break;
4085    case nir_instr_type_call:
4086    case nir_instr_type_parallel_copy:
4087    case nir_instr_type_debug_info:
4088       ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n",
4089                         instr->type);
4090       break;
4091    }
4092 }
4093 
4094 static struct ir3_block *
4095 get_block(struct ir3_context *ctx, const nir_block *nblock)
4096 {
4097    struct ir3_block *block;
4098    struct hash_entry *hentry;
4099 
4100    hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
4101    if (hentry)
4102       return hentry->data;
4103 
4104    block = ir3_block_create(ctx->ir);
4105    block->nblock = nblock;
4106    _mesa_hash_table_insert(ctx->block_ht, nblock, block);
4107 
4108    return block;
4109 }
4110 
4111 static struct ir3_block *
4112 get_block_or_continue(struct ir3_context *ctx, const nir_block *nblock)
4113 {
4114    struct hash_entry *hentry;
4115 
4116    hentry = _mesa_hash_table_search(ctx->continue_block_ht, nblock);
4117    if (hentry)
4118       return hentry->data;
4119 
4120    return get_block(ctx, nblock);
4121 }
4122 
4123 static struct ir3_block *
4124 create_continue_block(struct ir3_context *ctx, const nir_block *nblock)
4125 {
4126    struct ir3_block *block = ir3_block_create(ctx->ir);
4127    block->nblock = NULL;
4128    _mesa_hash_table_insert(ctx->continue_block_ht, nblock, block);
4129    return block;
4130 }
4131 
4132 static void
4133 emit_block(struct ir3_context *ctx, nir_block *nblock)
4134 {
4135    ctx->block = get_block(ctx, nblock);
4136 
4137    list_addtail(&ctx->block->node, &ctx->ir->block_list);
4138 
4139    ctx->block->loop_depth = ctx->loop_depth;
4140 
4141    /* re-emit addr register in each block if needed: */
4142    for (int i = 0; i < ARRAY_SIZE(ctx->addr0_ht); i++) {
4143       _mesa_hash_table_destroy(ctx->addr0_ht[i], NULL);
4144       ctx->addr0_ht[i] = NULL;
4145    }
4146 
4147    _mesa_hash_table_u64_destroy(ctx->addr1_ht);
4148    ctx->addr1_ht = NULL;
4149 
4150    nir_foreach_instr (instr, nblock) {
4151       ctx->cur_instr = instr;
4152       emit_instr(ctx, instr);
4153       ctx->cur_instr = NULL;
4154       if (ctx->error)
4155          return;
4156    }
4157 
4158    for (int i = 0; i < ARRAY_SIZE(ctx->block->successors); i++) {
4159       if (nblock->successors[i]) {
4160          ctx->block->successors[i] =
4161             get_block_or_continue(ctx, nblock->successors[i]);
4162       }
4163    }
4164 
4165    /* Emit unconditional branch if we only have one successor. Conditional
4166     * branches are emitted in emit_if.
4167     */
4168    if (ctx->block->successors[0] && !ctx->block->successors[1]) {
4169       if (!ir3_block_get_terminator(ctx->block))
4170          ir3_JUMP(ctx->block);
4171    }
4172 
4173    _mesa_hash_table_clear(ctx->sel_cond_conversions, NULL);
4174 }
4175 
4176 static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);
4177 
4178 /* Get the ir3 branch condition for a given nir source. This will strip any inot
4179  * instructions and set *inv when the condition should be inverted. This
4180  * inversion can be directly folded into branches (in the inv1/inv2 fields)
4181  * instead of adding an explicit not.b/sub.u instruction.
4182  */
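/* Illustrative example: for "if (!(a < b))" the inot is stripped, the
 * predicate is built from the a < b comparison, and *inv comes back true so
 * the branch can set its inv1 bit instead of emitting an extra not.b.
 */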
4183 static struct ir3_instruction *
4184 get_branch_condition(struct ir3_context *ctx, nir_src *src, unsigned comp,
4185                      bool *inv)
4186 {
4187    struct ir3_instruction *condition = ir3_get_src(ctx, src)[comp];
4188 
4189    if (src->ssa->parent_instr->type == nir_instr_type_alu) {
4190       nir_alu_instr *nir_cond = nir_instr_as_alu(src->ssa->parent_instr);
4191 
4192       if (nir_cond->op == nir_op_inot) {
4193          struct ir3_instruction *inv_cond = get_branch_condition(
4194             ctx, &nir_cond->src[0].src, nir_cond->src[0].swizzle[comp], inv);
4195          *inv = !*inv;
4196          return inv_cond;
4197       }
4198    }
4199 
4200    *inv = false;
4201    return ir3_get_predicate(ctx, condition);
4202 }
4203 
4204 /* Try to fold br (and/or cond1, cond2) into braa/brao cond1, cond2.
4205  */
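/* For example (illustrative), "if (a && b)" becomes a single braa with the
 * two predicates as sources, and "if (!a || b)" becomes a brao with inv1
 * set, rather than materializing the iand/ior result first.
 */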
4206 static struct ir3_instruction *
4207 fold_conditional_branch(struct ir3_context *ctx, struct nir_src *nir_cond)
4208 {
4209    if (!ctx->compiler->has_branch_and_or)
4210       return NULL;
4211 
4212    if (nir_cond->ssa->parent_instr->type != nir_instr_type_alu)
4213       return NULL;
4214 
4215    nir_alu_instr *alu_cond = nir_instr_as_alu(nir_cond->ssa->parent_instr);
4216 
4217    if ((alu_cond->op != nir_op_iand) && (alu_cond->op != nir_op_ior))
4218       return NULL;
4219 
4220    /* If the result of the and/or is also used for something else than an if
4221     * condition, the and/or cannot be removed. In that case, we will end up with
4222     * extra predicate conversions for the conditions without actually removing
4223     * any instructions, resulting in an increase of instructions. Let's not fold
4224     * the conditions in the branch in that case.
4225     */
4226    if (!nir_def_only_used_by_if(&alu_cond->def))
4227       return NULL;
4228 
4229    bool inv1, inv2;
4230    struct ir3_instruction *cond1 = get_branch_condition(
4231       ctx, &alu_cond->src[0].src, alu_cond->src[0].swizzle[0], &inv1);
4232    struct ir3_instruction *cond2 = get_branch_condition(
4233       ctx, &alu_cond->src[1].src, alu_cond->src[1].swizzle[0], &inv2);
4234 
4235    struct ir3_instruction *branch;
4236    if (alu_cond->op == nir_op_iand) {
4237       branch = ir3_BRAA(ctx->block, cond1, IR3_REG_PREDICATE, cond2,
4238                         IR3_REG_PREDICATE);
4239    } else {
4240       branch = ir3_BRAO(ctx->block, cond1, IR3_REG_PREDICATE, cond2,
4241                         IR3_REG_PREDICATE);
4242    }
4243 
4244    branch->cat0.inv1 = inv1;
4245    branch->cat0.inv2 = inv2;
4246    return branch;
4247 }
4248 
4249 static bool
4250 instr_can_be_predicated(nir_instr *instr)
4251 {
4252    /* Anything that doesn't expand to control-flow can be predicated. */
4253    switch (instr->type) {
4254    case nir_instr_type_alu:
4255    case nir_instr_type_deref:
4256    case nir_instr_type_tex:
4257    case nir_instr_type_load_const:
4258    case nir_instr_type_undef:
4259    case nir_instr_type_phi:
4260    case nir_instr_type_parallel_copy:
4261       return true;
4262    case nir_instr_type_call:
4263    case nir_instr_type_jump:
4264    case nir_instr_type_debug_info:
4265       return false;
4266    case nir_instr_type_intrinsic: {
4267       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4268       switch (intrin->intrinsic) {
4269       case nir_intrinsic_reduce:
4270       case nir_intrinsic_inclusive_scan:
4271       case nir_intrinsic_exclusive_scan:
4272       case nir_intrinsic_reduce_clusters_ir3:
4273       case nir_intrinsic_inclusive_scan_clusters_ir3:
4274       case nir_intrinsic_exclusive_scan_clusters_ir3:
4275       case nir_intrinsic_brcst_active_ir3:
4276       case nir_intrinsic_ballot:
4277       case nir_intrinsic_elect:
4278       case nir_intrinsic_elect_any_ir3:
4279       case nir_intrinsic_read_invocation_cond_ir3:
4280       case nir_intrinsic_demote:
4281       case nir_intrinsic_demote_if:
4282       case nir_intrinsic_terminate:
4283       case nir_intrinsic_terminate_if:
4284          return false;
4285       default:
4286          return true;
4287       }
4288    }
4289    }
4290 
4291    unreachable("Checked all cases");
4292 }
4293 
4294 static bool
4295 nif_can_be_predicated(nir_if *nif)
4296 {
4297    /* For non-divergent branches, predication is more expensive than a branch
4298     * because the latter can potentially skip all instructions.
4299     */
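   /* e.g. (illustrative) a condition that is uniform across the wave, such
    * as a constant-buffer flag, is better served by a branch since every
    * fiber takes the same path and the not-taken block is skipped entirely,
    * whereas predication would still issue every instruction in it.
    */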
4300    if (!nir_src_is_divergent(nif->condition))
4301       return false;
4302 
4303    /* Although it might be possible to allow a limited form of
4304     * nested predication (e.g., by resetting the predication mask after a nested
4305     * branch), let's avoid this for now and only use predication for leaf
4306     * branches. That is, for ifs that contain exactly one block in both branches
4307     * (note that they always contain at least one block).
4308     */
4309    if (!exec_list_is_singular(&nif->then_list) ||
4310        !exec_list_is_singular(&nif->else_list)) {
4311       return false;
4312    }
4313 
4314    nir_foreach_instr (instr, nir_if_first_then_block(nif)) {
4315       if (!instr_can_be_predicated(instr))
4316          return false;
4317    }
4318 
4319    nir_foreach_instr (instr, nir_if_first_else_block(nif)) {
4320       if (!instr_can_be_predicated(instr))
4321          return false;
4322    }
4323 
4324    return true;
4325 }
4326 
4327 /* A typical if-else block like this:
4328  * if (cond) {
4329  *     tblock;
4330  * } else {
4331  *     fblock;
4332  * }
4333  * Will be emitted as:
4334  *        |-- i --|
4335  *        | ...   |
4336  *        | predt |
4337  *        |-------|
4338  *    succ0 /   \ succ1
4339  * |-- i+1 --| |-- i+2 --|
4340  * | tblock  | | fblock  |
4341  * | predf   | | jump    |
4342  * |---------| |---------|
4343  *    succ0 \   / succ0
4344  *        |-- j --|
4345  *        |  ...  |
4346  *        |-------|
4347  * Where the numbers at the top of blocks are their indices. That is, the true
4348  * block and false block are laid out contiguously after the current block. This
4349  * layout is verified during legalization in prede_sched which also inserts the
4350  * final prede instruction. Note that we don't insert prede right away to allow
4351  * opt_jump to optimize the jump in the false block.
4352  */
4353 static struct ir3_instruction *
4354 emit_predicated_branch(struct ir3_context *ctx, nir_if *nif)
4355 {
4356    if (!ctx->compiler->has_predication)
4357       return NULL;
4358    if (!nif_can_be_predicated(nif))
4359       return NULL;
4360 
4361    struct ir3_block *then_block = get_block(ctx, nir_if_first_then_block(nif));
4362    struct ir3_block *else_block = get_block(ctx, nir_if_first_else_block(nif));
4363    assert(list_is_empty(&then_block->instr_list) &&
4364           list_is_empty(&else_block->instr_list));
4365 
4366    bool inv;
4367    struct ir3_instruction *condition =
4368       get_branch_condition(ctx, &nif->condition, 0, &inv);
4369    struct ir3_instruction *pred, *pred_inv;
4370 
4371    if (!inv) {
4372       pred = ir3_PREDT(ctx->block, condition, IR3_REG_PREDICATE);
4373       pred_inv = ir3_PREDF(then_block, condition, IR3_REG_PREDICATE);
4374    } else {
4375       pred = ir3_PREDF(ctx->block, condition, IR3_REG_PREDICATE);
4376       pred_inv = ir3_PREDT(then_block, condition, IR3_REG_PREDICATE);
4377    }
4378 
4379    pred->srcs[0]->num = REG_P0_X;
4380    pred_inv->srcs[0]->num = REG_P0_X;
4381    return pred;
4382 }
4383 
4384 static struct ir3_instruction *
4385 emit_conditional_branch(struct ir3_context *ctx, nir_if *nif)
4386 {
4387    nir_src *nir_cond = &nif->condition;
4388    struct ir3_instruction *folded = fold_conditional_branch(ctx, nir_cond);
4389    if (folded)
4390       return folded;
4391 
4392    struct ir3_instruction *predicated = emit_predicated_branch(ctx, nif);
4393    if (predicated)
4394       return predicated;
4395 
4396    bool inv1;
4397    struct ir3_instruction *cond1 =
4398       get_branch_condition(ctx, nir_cond, 0, &inv1);
4399    struct ir3_instruction *branch =
4400       ir3_BR(ctx->block, cond1, IR3_REG_PREDICATE);
4401    branch->cat0.inv1 = inv1;
4402    return branch;
4403 }
4404 
4405 static void
4406 emit_if(struct ir3_context *ctx, nir_if *nif)
4407 {
4408    struct ir3_instruction *condition = ir3_get_src_maybe_shared(ctx, &nif->condition)[0];
4409 
4410    if (condition->opc == OPC_ANY_MACRO && condition->block == ctx->block) {
4411       struct ir3_instruction *pred = ssa(condition->srcs[0]);
4412       ir3_BANY(ctx->block, pred, IR3_REG_PREDICATE);
4413    } else if (condition->opc == OPC_ALL_MACRO &&
4414               condition->block == ctx->block) {
4415       struct ir3_instruction *pred = ssa(condition->srcs[0]);
4416       ir3_BALL(ctx->block, pred, IR3_REG_PREDICATE);
4417    } else if (condition->opc == OPC_ELECT_MACRO &&
4418               condition->block == ctx->block) {
4419       struct ir3_instruction *branch = ir3_GETONE(ctx->block);
4420       branch->flags |= condition->flags & IR3_INSTR_NEEDS_HELPERS;
4421    } else if (condition->opc == OPC_SHPS_MACRO &&
4422               condition->block == ctx->block) {
4423       /* TODO: technically this only works if the block is the only user of the
4424        * shps, but we only use it in very constrained scenarios so this should
4425        * be ok.
4426        */
4427       ir3_SHPS(ctx->block);
4428    } else {
4429       emit_conditional_branch(ctx, nif);
4430    }
4431 
4432    ctx->block->divergent_condition = nif->condition.ssa->divergent;
4433 
4434    emit_cf_list(ctx, &nif->then_list);
4435    emit_cf_list(ctx, &nif->else_list);
4436 }
4437 
4438 static void
4439 emit_loop(struct ir3_context *ctx, nir_loop *nloop)
4440 {
4441    assert(!nir_loop_has_continue_construct(nloop));
4442    ctx->loop_depth++;
4443 
4444    struct nir_block *nstart = nir_loop_first_block(nloop);
4445    struct ir3_block *continue_blk = NULL;
4446 
4447    /* There's always one incoming edge from outside the loop, and if there
4448     * is more than one backedge from inside the loop (so more than 2 total
4449     * edges) then we need to create a continue block after the loop to ensure
4450     * that control reconverges at the end of each loop iteration.
4451     */
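   /* Concretely (hypothetical example): a loop whose body ends with the usual
    * backedge and also contains one `continue` gives the first block three
    * predecessors (one entry edge plus two backedges), so a continue block is
    * created; a simple loop with a single backedge (two predecessors) is left
    * alone.
    */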
4452    if (nstart->predecessors->entries > 2) {
4453       continue_blk = create_continue_block(ctx, nstart);
4454    }
4455 
4456    emit_cf_list(ctx, &nloop->body);
4457 
4458    if (continue_blk) {
4459       struct ir3_block *start = get_block(ctx, nstart);
4460       ir3_JUMP(continue_blk);
4461       continue_blk->successors[0] = start;
4462       continue_blk->loop_depth = ctx->loop_depth;
4463       list_addtail(&continue_blk->node, &ctx->ir->block_list);
4464    }
4465 
4466    ctx->so->loops++;
4467    ctx->loop_depth--;
4468 }
4469 
4470 static void
4471 emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
4472 {
4473    foreach_list_typed (nir_cf_node, node, node, list) {
4474       switch (node->type) {
4475       case nir_cf_node_block:
4476          emit_block(ctx, nir_cf_node_as_block(node));
4477          break;
4478       case nir_cf_node_if:
4479          emit_if(ctx, nir_cf_node_as_if(node));
4480          break;
4481       case nir_cf_node_loop:
4482          emit_loop(ctx, nir_cf_node_as_loop(node));
4483          break;
4484       case nir_cf_node_function:
4485          ir3_context_error(ctx, "TODO\n");
4486          break;
4487       }
4488    }
4489 }
4490 
4491 /* emit stream-out code.  At this point, the current block is the original
4492  * (nir) end block, and nir ensures that all flow control paths terminate
4493  * into the end block.  We re-purpose the original end block to generate
4494  * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
4495  * block holding stream-out write instructions, followed by the new end
4496  * block:
4497  *
4498  *   blockOrigEnd {
4499  *      p0.x = (vtxcnt < maxvtxcnt)
4500  *      // succs: blockStreamOut, blockNewEnd
4501  *   }
4502  *   blockStreamOut {
4503  *      // preds: blockOrigEnd
4504  *      ... stream-out instructions ...
4505  *      // succs: blockNewEnd
4506  *   }
4507  *   blockNewEnd {
4508  *      // preds: blockOrigEnd, blockStreamOut
4509  *   }
4510  */
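/* In terms of addresses, the code below ends up computing, for each enabled
 * buffer i (a sketch of the generated arithmetic, not literal IR):
 *
 *    base[i] = c[tfbo + i] + vtxcnt * stride[i] * 4
 *
 * and then storing each output component j with stg to
 *
 *    base[i] + (dst_offset + j) * 4
 *
 * using 32-bit stores.
 */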
4511 static void
4512 emit_stream_out(struct ir3_context *ctx)
4513 {
4514    struct ir3 *ir = ctx->ir;
4515    struct ir3_stream_output_info *strmout = &ctx->so->stream_output;
4516    struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
4517    struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
4518    struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];
4519 
4520    /* create vtxcnt input in input block at top of shader,
4521     * so that it is seen as live over the entire duration
4522     * of the shader:
4523     */
4524    vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
4525    maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
4526 
4527    /* at this point, we are at the original 'end' block,
4528     * re-purpose this block to stream-out condition, then
4529     * append stream-out block and new-end block
4530     */
4531    orig_end_block = ctx->block;
4532 
4533    // maybe w/ store_global intrinsic, we could do this
4534    // stuff in nir->nir pass
4535 
4536    stream_out_block = ir3_block_create(ir);
4537    list_addtail(&stream_out_block->node, &ir->block_list);
4538 
4539    new_end_block = ir3_block_create(ir);
4540    list_addtail(&new_end_block->node, &ir->block_list);
4541 
4542    orig_end_block->successors[0] = stream_out_block;
4543    orig_end_block->successors[1] = new_end_block;
4544 
4545    stream_out_block->successors[0] = new_end_block;
4546 
4547    /* setup 'if (vtxcnt < maxvtxcnt)' condition: */
4548    cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
4549    cond->dsts[0]->flags |= IR3_REG_PREDICATE;
4550    cond->cat2.condition = IR3_COND_LT;
4551 
4552    /* condition goes on previous block to the conditional,
4553     * since it is used to pick which of the two successor
4554     * paths to take:
4555     */
4556    ir3_BR(orig_end_block, cond, IR3_REG_PREDICATE);
4557 
4558    /* switch to stream_out_block to generate the stream-out
4559     * instructions:
4560     */
4561    ctx->block = stream_out_block;
4562 
4563    /* Calculate base addresses based on vtxcnt.  Instructions
4564     * generated for bases not used in following loop will be
4565     * stripped out in the backend.
4566     */
4567    for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
4568       const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
4569       unsigned stride = strmout->stride[i];
4570       struct ir3_instruction *base, *off;
4571 
4572       base = create_uniform(ctx->block, regid(const_state->offsets.tfbo, i));
4573 
4574       /* 24-bit should be enough: */
4575       off = ir3_MUL_U24(ctx->block, vtxcnt, 0,
4576                         create_immed(ctx->block, stride * 4), 0);
4577 
4578       bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
4579    }
4580 
4581    /* Generate the per-output store instructions: */
4582    for (unsigned i = 0; i < strmout->num_outputs; i++) {
4583       for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
4584          unsigned c = j + strmout->output[i].start_component;
4585          struct ir3_instruction *base, *out, *stg;
4586 
4587          base = bases[strmout->output[i].output_buffer];
4588          out = ctx->outputs[regid(strmout->output[i].register_index, c)];
4589 
4590          stg = ir3_STG(
4591             ctx->block, base, 0,
4592             create_immed(ctx->block, (strmout->output[i].dst_offset + j) * 4),
4593             0, out, 0, create_immed(ctx->block, 1), 0);
4594          stg->cat6.type = TYPE_U32;
4595 
4596          array_insert(ctx->block, ctx->block->keeps, stg);
4597       }
4598    }
4599 
4600    ir3_JUMP(ctx->block);
4601 
4602    /* and finally switch to the new_end_block: */
4603    ctx->block = new_end_block;
4604 }
4605 
4606 static void
4607 setup_predecessors(struct ir3 *ir)
4608 {
4609    foreach_block (block, &ir->block_list) {
4610       for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
4611          if (block->successors[i])
4612             ir3_block_add_predecessor(block->successors[i], block);
4613       }
4614    }
4615 }
4616 
4617 static void
4618 emit_function(struct ir3_context *ctx, nir_function_impl *impl)
4619 {
4620    nir_metadata_require(impl, nir_metadata_block_index);
4621 
4622    emit_cf_list(ctx, &impl->body);
4623    emit_block(ctx, impl->end_block);
4624 
4625    /* at this point, we should have a single empty block,
4626     * into which we emit the 'end' instruction.
4627     */
4628    compile_assert(ctx, list_is_empty(&ctx->block->instr_list));
4629 
4630    /* If stream-out (aka transform-feedback) enabled, emit the
4631     * stream-out instructions, followed by a new empty block (into
4632     * which the 'end' instruction lands).
4633     *
4634     * NOTE: it is done in this order, rather than inserting before
4635     * we emit end_block, because NIR guarantees that all blocks
4636     * flow into end_block, and that end_block has no successors.
4637     * So by re-purposing end_block as the first block of stream-
4638     * out, we guarantee that all exit paths flow into the stream-
4639     * out instructions.
4640     */
4641    if ((ctx->compiler->gen < 5) &&
4642        (ctx->so->stream_output.num_outputs > 0) &&
4643        !ctx->so->binning_pass) {
4644       assert(ctx->so->type == MESA_SHADER_VERTEX);
4645       emit_stream_out(ctx);
4646    }
4647 
4648    setup_predecessors(ctx->ir);
4649    foreach_block (block, &ctx->ir->block_list) {
4650       resolve_phis(ctx, block);
4651    }
4652 }
4653 
4654 static void
4655 setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr)
4656 {
4657    struct ir3_shader_variant *so = ctx->so;
4658    struct ir3_instruction *coord = NULL;
4659 
4660    if (intr->intrinsic == nir_intrinsic_load_interpolated_input)
4661       coord = ir3_create_collect(ctx->block, ir3_get_src(ctx, &intr->src[0]), 2);
4662 
4663    compile_assert(ctx, nir_src_is_const(intr->src[coord ? 1 : 0]));
4664 
4665    unsigned frac = nir_intrinsic_component(intr);
4666    unsigned offset = nir_src_as_uint(intr->src[coord ? 1 : 0]);
4667    unsigned ncomp = nir_intrinsic_dest_components(intr);
4668    unsigned n = nir_intrinsic_base(intr) + offset;
4669    unsigned slot = nir_intrinsic_io_semantics(intr).location + offset;
4670    unsigned compmask = BITFIELD_MASK(ncomp + frac);
4671 
4672    /* Inputs for other stages are loaded with ldlw or ldg, so only VS/FS get here. */
4673    compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT ||
4674                           ctx->so->type == MESA_SHADER_VERTEX);
4675 
4676    /* for clip+cull distances, unused components can't be eliminated because
4677     * they're read by fixed-function, even if there's a hole.  Note that
4678     * clip/cull distance arrays must be declared in the FS, so we can just
4679     * use the NIR clip/cull distances to avoid reading ucp_enables in the
4680     * shader key.
4681     */
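   /* For instance (hypothetical counts): with 5 clip distances and no cull
    * distances, clip_cull_mask is 0x1f, so CLIP_DIST0 keeps compmask 0xf and
    * CLIP_DIST1 keeps 0x1 regardless of which components the FS actually
    * reads.
    */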
4682    if (ctx->so->type == MESA_SHADER_FRAGMENT &&
4683        (slot == VARYING_SLOT_CLIP_DIST0 ||
4684         slot == VARYING_SLOT_CLIP_DIST1)) {
4685       unsigned clip_cull_mask = so->clip_mask | so->cull_mask;
4686 
4687       if (slot == VARYING_SLOT_CLIP_DIST0)
4688          compmask = clip_cull_mask & 0xf;
4689       else
4690          compmask = clip_cull_mask >> 4;
4691    }
4692 
4693    /* for a4xx+ rasterflat */
4694    if (so->inputs[n].rasterflat && ctx->so->key.rasterflat)
4695       coord = NULL;
4696 
4697    so->total_in += util_bitcount(compmask & ~so->inputs[n].compmask);
4698 
4699    so->inputs[n].slot = slot;
4700    so->inputs[n].compmask |= compmask;
4701    so->inputs_count = MAX2(so->inputs_count, n + 1);
4702    compile_assert(ctx, so->inputs_count < ARRAY_SIZE(so->inputs));
4703    so->inputs[n].flat = !coord;
4704 
4705    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4706       compile_assert(ctx, slot != VARYING_SLOT_POS);
4707 
4708       so->inputs[n].bary = true;
4709       unsigned idx = (n * 4) + frac;
4710       struct ir3_instruction_rpt instr =
4711          create_frag_input(ctx, coord, idx, ncomp);
4712       cp_instrs(ctx->last_dst, instr.rpts, ncomp);
4713 
4714       if (slot == VARYING_SLOT_PRIMITIVE_ID)
4715          so->reads_primid = true;
4716 
4717       so->inputs[n].inloc = 4 * n;
4718       so->varying_in = MAX2(so->varying_in, 4 * n + 4);
4719    } else {
4720       struct ir3_instruction *input = NULL;
4721 
4722       foreach_input (in, ctx->ir) {
4723          if (in->input.inidx == n) {
4724             input = in;
4725             break;
4726          }
4727       }
4728 
4729       if (!input) {
4730          input = create_input(ctx, compmask);
4731          input->input.inidx = n;
4732       } else {
4733          /* For aliased inputs, just append to the wrmask, i.e. if we
4734           * first see a vec2 index at slot N, and then later a vec4,
4735           * the wrmask of the resulting overlapped vec2 and vec4 is 0xf
4736           */
4737          input->dsts[0]->wrmask |= compmask;
4738       }
4739 
4740       for (int i = 0; i < ncomp + frac; i++) {
4741          unsigned idx = (n * 4) + i;
4742          compile_assert(ctx, idx < ctx->ninputs);
4743 
4744          /* fixup the src wrmask to avoid validation fail */
4745          if (ctx->inputs[idx] && (ctx->inputs[idx] != input)) {
4746             ctx->inputs[idx]->srcs[0]->wrmask = input->dsts[0]->wrmask;
4747             continue;
4748          }
4749 
4750          ir3_split_dest(ctx->block, &ctx->inputs[idx], input, i, 1);
4751       }
4752 
4753       for (int i = 0; i < ncomp; i++) {
4754          unsigned idx = (n * 4) + i + frac;
4755          ctx->last_dst[i] = ctx->inputs[idx];
4756       }
4757    }
4758 }
4759 
4760 /* Initially we assign non-packed inloc's for varyings, as we don't really
4761  * know up-front which components will be unused.  After all the compilation
4762  * stages we scan the shader to see which components are actually used, and
4763  * re-pack the inlocs to eliminate unneeded varyings.
4764  */
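/* For example (hypothetical FS): if only VAR0.x and VAR1.xy survive
 * compilation, the initially assigned inlocs 0 and 4 are re-packed below to
 * 0 and 1, shrinking the consumed varying space from two vec4 slots to three
 * components.
 */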
4765 static void
4766 pack_inlocs(struct ir3_context *ctx)
4767 {
4768    struct ir3_shader_variant *so = ctx->so;
4769    uint8_t used_components[so->inputs_count];
4770 
4771    memset(used_components, 0, sizeof(used_components));
4772 
4773    /*
4774     * First Step: scan shader to find which bary.f/ldlv remain:
4775     */
4776 
4777    foreach_block (block, &ctx->ir->block_list) {
4778       foreach_instr (instr, &block->instr_list) {
4779          if (is_input(instr)) {
4780             unsigned inloc = instr->srcs[0]->iim_val;
4781             unsigned i = inloc / 4;
4782             unsigned j = inloc % 4;
4783 
4784             compile_assert(ctx, instr->srcs[0]->flags & IR3_REG_IMMED);
4785             compile_assert(ctx, i < so->inputs_count);
4786 
4787             used_components[i] |= 1 << j;
4788          } else if (instr->opc == OPC_META_TEX_PREFETCH) {
4789             for (int n = 0; n < 2; n++) {
4790                unsigned inloc = instr->prefetch.input_offset + n;
4791                unsigned i = inloc / 4;
4792                unsigned j = inloc % 4;
4793 
4794                compile_assert(ctx, i < so->inputs_count);
4795 
4796                used_components[i] |= 1 << j;
4797             }
4798          }
4799       }
4800    }
4801 
4802    /*
4803     * Second Step: reassign varying inloc/slots:
4804     */
4805 
4806    unsigned inloc = 0;
4807 
4808    /* for clip+cull distances, unused components can't be eliminated because
4809     * they're read by fixed-function, even if there's a hole.  Note that
4810     * clip/cull distance arrays must be declared in the FS, so we can just
4811     * use the NIR clip/cull distances to avoid reading ucp_enables in the
4812     * shader key.
4813     */
4814    unsigned clip_cull_mask = so->clip_mask | so->cull_mask;
4815 
4816    so->varying_in = 0;
4817 
4818    for (unsigned i = 0; i < so->inputs_count; i++) {
4819       unsigned compmask = 0, maxcomp = 0;
4820 
4821       so->inputs[i].inloc = inloc;
4822       so->inputs[i].bary = false;
4823 
4824       if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0 ||
4825           so->inputs[i].slot == VARYING_SLOT_CLIP_DIST1) {
4826          if (so->inputs[i].slot == VARYING_SLOT_CLIP_DIST0)
4827             compmask = clip_cull_mask & 0xf;
4828          else
4829             compmask = clip_cull_mask >> 4;
4830          used_components[i] = compmask;
4831       }
4832 
4833       for (unsigned j = 0; j < 4; j++) {
4834          if (!(used_components[i] & (1 << j)))
4835             continue;
4836 
4837          compmask |= (1 << j);
4838          maxcomp = j + 1;
4839 
4840          /* at this point, since used_components[i] mask is only
4841           * considering varyings (ie. not sysvals) we know this
4842           * is a varying:
4843           */
4844          so->inputs[i].bary = true;
4845       }
4846 
4847       if (so->inputs[i].bary) {
4848          so->varying_in++;
4849          so->inputs[i].compmask = (1 << maxcomp) - 1;
4850          inloc += maxcomp;
4851       }
4852    }
4853 
4854    /*
4855     * Third Step: reassign packed inloc's:
4856     */
4857 
4858    foreach_block (block, &ctx->ir->block_list) {
4859       foreach_instr (instr, &block->instr_list) {
4860          if (is_input(instr)) {
4861             unsigned inloc = instr->srcs[0]->iim_val;
4862             unsigned i = inloc / 4;
4863             unsigned j = inloc % 4;
4864 
4865             instr->srcs[0]->iim_val = so->inputs[i].inloc + j;
4866             if (instr->opc == OPC_FLAT_B)
4867                instr->srcs[1]->iim_val = instr->srcs[0]->iim_val;
4868          } else if (instr->opc == OPC_META_TEX_PREFETCH) {
4869             unsigned i = instr->prefetch.input_offset / 4;
4870             unsigned j = instr->prefetch.input_offset % 4;
4871             instr->prefetch.input_offset = so->inputs[i].inloc + j;
4872          }
4873       }
4874    }
4875 }
4876 
4877 static void
4878 setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr)
4879 {
4880    struct ir3_shader_variant *so = ctx->so;
4881    nir_io_semantics io = nir_intrinsic_io_semantics(intr);
4882 
4883    compile_assert(ctx, nir_src_is_const(intr->src[1]));
4884 
4885    unsigned offset = nir_src_as_uint(intr->src[1]);
4886    unsigned n = nir_intrinsic_base(intr) + offset;
4887    unsigned frac = nir_intrinsic_component(intr);
4888    unsigned ncomp = nir_intrinsic_src_components(intr, 0);
4889 
4890    /* For per-view variables, each user-facing slot corresponds to multiple
4891     * views, each with a corresponding driver_location, and the offset is for
4892     * the driver_location. To properly figure out the slot, we'd need to
4893     * plumb through the number of views. However, for now we only use
4894     * per-view with gl_Position, so we assume that the variable is not an
4895     * array or matrix (so there are no indirect accesses to the variable
4896     * itself) and the indirect offset corresponds to the view.
4897     */
4898    unsigned slot = io.location + (io.per_view ? 0 : offset);
4899 
4900    if (io.per_view && offset > 0)
4901       so->multi_pos_output = true;
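   /* E.g. (hypothetical two-view case): per-view gl_Position is stored once
    * per view, and a store with offset 1 targets the second view; slot stays
    * VARYING_SLOT_POS while outputs[n].view records the view index below.
    */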
4902 
4903    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
4904       switch (slot) {
4905       case FRAG_RESULT_DEPTH:
4906          so->writes_pos = true;
4907          break;
4908       case FRAG_RESULT_COLOR:
4909          if (!ctx->s->info.fs.color_is_dual_source) {
4910             so->color0_mrt = 1;
4911          } else {
4912             slot = FRAG_RESULT_DATA0 + io.dual_source_blend_index;
4913             if (io.dual_source_blend_index > 0)
4914                so->dual_src_blend = true;
4915          }
4916          break;
4917       case FRAG_RESULT_SAMPLE_MASK:
4918          so->writes_smask = true;
4919          break;
4920       case FRAG_RESULT_STENCIL:
4921          so->writes_stencilref = true;
4922          break;
4923       default:
4924          slot += io.dual_source_blend_index; /* For dual-src blend */
4925          if (io.dual_source_blend_index > 0)
4926             so->dual_src_blend = true;
4927          if (slot >= FRAG_RESULT_DATA0)
4928             break;
4929          ir3_context_error(ctx, "unknown FS output name: %s\n",
4930                            gl_frag_result_name(slot));
4931       }
4932    } else if (ctx->so->type == MESA_SHADER_VERTEX ||
4933               ctx->so->type == MESA_SHADER_TESS_EVAL ||
4934               ctx->so->type == MESA_SHADER_GEOMETRY) {
4935       switch (slot) {
4936       case VARYING_SLOT_POS:
4937          so->writes_pos = true;
4938          break;
4939       case VARYING_SLOT_PSIZ:
4940          so->writes_psize = true;
4941          break;
4942       case VARYING_SLOT_VIEWPORT:
4943          so->writes_viewport = true;
4944          break;
4945       case VARYING_SLOT_PRIMITIVE_ID:
4946       case VARYING_SLOT_GS_VERTEX_FLAGS_IR3:
4947          assert(ctx->so->type == MESA_SHADER_GEOMETRY);
4948          FALLTHROUGH;
4949       case VARYING_SLOT_COL0:
4950       case VARYING_SLOT_COL1:
4951       case VARYING_SLOT_BFC0:
4952       case VARYING_SLOT_BFC1:
4953       case VARYING_SLOT_FOGC:
4954       case VARYING_SLOT_CLIP_DIST0:
4955       case VARYING_SLOT_CLIP_DIST1:
4956       case VARYING_SLOT_CLIP_VERTEX:
4957       case VARYING_SLOT_LAYER:
4958          break;
4959       default:
4960          if (slot >= VARYING_SLOT_VAR0)
4961             break;
4962          if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
4963             break;
4964          ir3_context_error(ctx, "unknown %s shader output name: %s\n",
4965                            _mesa_shader_stage_to_string(ctx->so->type),
4966                            gl_varying_slot_name_for_stage(slot, ctx->so->type));
4967       }
4968    } else {
4969       ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
4970    }
4971 
4972    so->outputs_count = MAX2(so->outputs_count, n + 1);
4973    compile_assert(ctx, so->outputs_count <= ARRAY_SIZE(so->outputs));
4974 
4975    so->outputs[n].slot = slot;
4976    if (io.per_view)
4977       so->outputs[n].view = offset;
4978 
4979    for (int i = 0; i < ncomp; i++) {
4980       unsigned idx = (n * 4) + i + frac;
4981       compile_assert(ctx, idx < ctx->noutputs);
4982       ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
4983    }
4984 
4985    /* if varying packing doesn't happen, we could end up in a situation
4986     * with "holes" in the output, and since the per-generation code that
4987     * sets up varying linkage registers doesn't expect to have more than
4988     * one varying per vec4 slot, pad the holes.
4989     *
4990     * Note that this should probably generate a performance warning of
4991     * some sort.
4992     */
4993    for (int i = 0; i < frac; i++) {
4994       unsigned idx = (n * 4) + i;
4995       if (!ctx->outputs[idx]) {
4996          ctx->outputs[idx] = create_immed(ctx->block, fui(0.0));
4997       }
4998    }
4999 
5000    struct ir3_instruction *const *src = ir3_get_src(ctx, &intr->src[0]);
5001    for (int i = 0; i < ncomp; i++) {
5002       unsigned idx = (n * 4) + i + frac;
5003       ctx->outputs[idx] = src[i];
5004    }
5005 }
5006 
5007 static bool
5008 uses_load_input(struct ir3_shader_variant *so)
5009 {
5010    return so->type == MESA_SHADER_VERTEX || so->type == MESA_SHADER_FRAGMENT;
5011 }
5012 
5013 static bool
5014 uses_store_output(struct ir3_shader_variant *so)
5015 {
5016    switch (so->type) {
5017    case MESA_SHADER_VERTEX:
5018       return !so->key.has_gs && !so->key.tessellation;
5019    case MESA_SHADER_TESS_EVAL:
5020       return !so->key.has_gs;
5021    case MESA_SHADER_GEOMETRY:
5022    case MESA_SHADER_FRAGMENT:
5023       return true;
5024    case MESA_SHADER_TESS_CTRL:
5025    case MESA_SHADER_COMPUTE:
5026    case MESA_SHADER_KERNEL:
5027       return false;
5028    default:
5029       unreachable("unknown stage");
5030    }
5031 }
5032 
5033 static void
5034 emit_instructions(struct ir3_context *ctx)
5035 {
5036    MESA_TRACE_FUNC();
5037 
5038    nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
5039 
5040    /* some varying setup which can't be done in setup_input(): */
5041    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
5042       nir_foreach_shader_in_variable (var, ctx->s) {
5043          /* set rasterflat flag for front/back color */
5044          if (var->data.interpolation == INTERP_MODE_NONE) {
5045             switch (var->data.location) {
5046             case VARYING_SLOT_COL0:
5047             case VARYING_SLOT_COL1:
5048             case VARYING_SLOT_BFC0:
5049             case VARYING_SLOT_BFC1:
5050                ctx->so->inputs[var->data.driver_location].rasterflat = true;
5051                break;
5052             default:
5053                break;
5054             }
5055          }
5056       }
5057    }
5058 
5059    if (uses_load_input(ctx->so)) {
5060       ctx->so->inputs_count = ctx->s->num_inputs;
5061       compile_assert(ctx, ctx->so->inputs_count < ARRAY_SIZE(ctx->so->inputs));
5062       ctx->ninputs = ctx->s->num_inputs * 4;
5063       ctx->inputs = rzalloc_array(ctx, struct ir3_instruction *, ctx->ninputs);
5064    } else {
5065       ctx->ninputs = 0;
5066       ctx->so->inputs_count = 0;
5067    }
5068 
5069    if (uses_store_output(ctx->so)) {
5070       ctx->noutputs = ctx->s->num_outputs * 4;
5071       ctx->outputs =
5072          rzalloc_array(ctx, struct ir3_instruction *, ctx->noutputs);
5073    } else {
5074       ctx->noutputs = 0;
5075    }
5076 
5077    ctx->ir = ir3_create(ctx->compiler, ctx->so);
5078 
5079    /* Create inputs in first block: */
5080    ctx->block = get_block(ctx, nir_start_block(fxn));
5081    ctx->in_block = ctx->block;
5082 
5083    /* for fragment shader, the vcoord input register is used as the
5084     * base for bary.f varying fetch instrs:
5085     *
5086     * TODO defer creating ctx->ij_pixel and corresponding sysvals
5087     * until emit_intrinsic when we know they are actually needed.
5088     * For now, we defer creating ctx->ij_centroid, etc, since we
5089     * only need ij_pixel for "old style" varying inputs (ie.
5090     * tgsi_to_nir)
5091     */
5092    if (ctx->so->type == MESA_SHADER_FRAGMENT) {
5093       ctx->ij[IJ_PERSP_PIXEL] = create_input(ctx, 0x3);
5094    }
5095 
5096    /* Defer add_sysval_input() stuff until after setup_inputs(),
5097     * because sysvals need to be appended after varyings:
5098     */
5099    if (ctx->ij[IJ_PERSP_PIXEL]) {
5100       add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL, 0x3,
5101                                 ctx->ij[IJ_PERSP_PIXEL]);
5102    }
5103 
5104    /* Tessellation shaders always need primitive ID for indexing the
5105     * BO. Geometry shaders don't always need it, but when they do it has to be
5106     * delivered and unclobbered in the VS. To make things easy, we always
5107     * make room for it in VS/DS.
5108     */
5109    bool has_tess = ctx->so->key.tessellation != IR3_TESS_NONE;
5110    bool has_gs = ctx->so->key.has_gs;
5111    switch (ctx->so->type) {
5112    case MESA_SHADER_VERTEX:
5113       if (has_tess) {
5114          ctx->tcs_header =
5115             create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
5116          ctx->rel_patch_id =
5117             create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5118          ctx->primitive_id =
5119             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5120       } else if (has_gs) {
5121          ctx->gs_header =
5122             create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5123          ctx->primitive_id =
5124             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5125       }
5126       break;
5127    case MESA_SHADER_TESS_CTRL:
5128       ctx->tcs_header =
5129          create_sysval_input(ctx, SYSTEM_VALUE_TCS_HEADER_IR3, 0x1);
5130       ctx->rel_patch_id =
5131          create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5132       break;
5133    case MESA_SHADER_TESS_EVAL:
5134       if (has_gs) {
5135          ctx->gs_header =
5136             create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5137          ctx->primitive_id =
5138             create_sysval_input(ctx, SYSTEM_VALUE_PRIMITIVE_ID, 0x1);
5139       }
5140       ctx->rel_patch_id =
5141          create_sysval_input(ctx, SYSTEM_VALUE_REL_PATCH_ID_IR3, 0x1);
5142       break;
5143    case MESA_SHADER_GEOMETRY:
5144       ctx->gs_header =
5145          create_sysval_input(ctx, SYSTEM_VALUE_GS_HEADER_IR3, 0x1);
5146       break;
5147    default:
5148       break;
5149    }
5150 
5151    /* Find # of samplers. Just assume that we'll be reading from images; if
5152     * an image is write-only we wouldn't have to count it, but after lowering
5153     * derefs it is too late to compact indices for that.
5154     */
5155    ctx->so->num_samp =
5156       BITSET_LAST_BIT(ctx->s->info.textures_used) + ctx->s->info.num_images;
5157 
5158    /* Save off clip+cull information. Note that in OpenGL clip planes may
5159     * be individually enabled/disabled, and some gens handle lowering in
5160     * backend, so we also need to consider the shader key:
5161     */
5162    ctx->so->clip_mask = ctx->so->key.ucp_enables |
5163                         MASK(ctx->s->info.clip_distance_array_size);
5164    ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size)
5165                         << ctx->s->info.clip_distance_array_size;
5166 
5167    ctx->so->pvtmem_size = ctx->s->scratch_size;
5168    ctx->so->shared_size = ctx->s->info.shared_size;
5169 
5170    /* NOTE: need to do something more clever when we support >1 fxn */
5171    nir_foreach_reg_decl (decl, fxn) {
5172       ir3_declare_array(ctx, decl);
5173    }
5174 
5175    /* And emit the body: */
5176    ctx->impl = fxn;
5177    emit_function(ctx, fxn);
5178 
5179    if (ctx->so->type == MESA_SHADER_TESS_CTRL &&
5180        ctx->compiler->tess_use_shared) {
5181       /* Anything before shpe seems to be ignored in the main shader when early
5182        * preamble is enabled on a7xx, so we have to put the barrier after.
5183        */
5184       struct ir3_block *block = ir3_after_preamble(ctx->ir);
5185 
5186       struct ir3_instruction *barrier = ir3_BAR(block);
5187       barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
5188       barrier->barrier_class = IR3_BARRIER_EVERYTHING;
5189       array_insert(block, block->keeps, barrier);
5190       ctx->so->has_barrier = true;
5191 
5192       /* Move the barrier to the beginning of the block but after any phi/input
5193        * meta instructions that must be at the beginning. It must be before we
5194        * load VS outputs.
5195        */
5196       foreach_instr (instr, &block->instr_list) {
5197          if (instr->opc != OPC_META_INPUT &&
5198              instr->opc != OPC_META_TEX_PREFETCH &&
5199              instr->opc != OPC_META_PHI) {
5200             ir3_instr_move_before(barrier, instr);
5201             break;
5202          }
5203       }
5204    }
5205 }
5206 
5207 /* Fixup tex sampler state for astc/srgb workaround instructions.  We
5208  * need to assign the tex state indexes for these after we know the
5209  * max tex index.
5210  */
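/* E.g. (hypothetical shader): with max_texture_index == 3 and two distinct
 * ASTC/sRGB textures (tex 0 and tex 2) needing the workaround, they would be
 * assigned alternate slots 4 and 5, with astc_srgb.base == 4 and
 * orig_idx[] == {0, 2}.
 */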
5211 static void
5212 fixup_astc_srgb(struct ir3_context *ctx)
5213 {
5214    struct ir3_shader_variant *so = ctx->so;
5215    /* indexed by original tex idx, value is newly assigned alpha sampler
5216     * state tex idx.  Zero is invalid since there is at least one sampler
5217     * if we get here.
5218     */
5219    unsigned alt_tex_state[16] = {0};
5220    unsigned tex_idx = ctx->max_texture_index + 1;
5221    unsigned idx = 0;
5222 
5223    so->astc_srgb.base = tex_idx;
5224 
5225    for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
5226       struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
5227 
5228       compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
5229 
5230       if (alt_tex_state[sam->cat5.tex] == 0) {
5231          /* assign new alternate/alpha tex state slot: */
5232          alt_tex_state[sam->cat5.tex] = tex_idx++;
5233          so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
5234          so->astc_srgb.count++;
5235       }
5236 
5237       sam->cat5.tex = alt_tex_state[sam->cat5.tex];
5238    }
5239 }
5240 
5241 /* Fixup tex sampler state for tg4 workaround instructions.  We
5242  * need to assign the tex state indexes for these after we know the
5243  * max tex index.
5244  */
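/* The tg4 alternate slots are allocated right after any astc/srgb ones, i.e.
 * starting at max_texture_index + astc_srgb.count + 1, as the base computed
 * below shows.
 */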
5245 static void
5246 fixup_tg4(struct ir3_context *ctx)
5247 {
5248    struct ir3_shader_variant *so = ctx->so;
5249    /* indexed by original tex idx, value is newly assigned alpha sampler
5250     * state tex idx.  Zero is invalid since there is at least one sampler
5251     * if we get here.
5252     */
5253    unsigned alt_tex_state[16] = {0};
5254    unsigned tex_idx = ctx->max_texture_index + so->astc_srgb.count + 1;
5255    unsigned idx = 0;
5256 
5257    so->tg4.base = tex_idx;
5258 
5259    for (unsigned i = 0; i < ctx->ir->tg4_count; i++) {
5260       struct ir3_instruction *sam = ctx->ir->tg4[i];
5261 
5262       compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
5263 
5264       if (alt_tex_state[sam->cat5.tex] == 0) {
5265          /* assign new alternate/alpha tex state slot: */
5266          alt_tex_state[sam->cat5.tex] = tex_idx++;
5267          so->tg4.orig_idx[idx++] = sam->cat5.tex;
5268          so->tg4.count++;
5269       }
5270 
5271       sam->cat5.tex = alt_tex_state[sam->cat5.tex];
5272    }
5273 }
5274 
5275 static struct ir3_instruction *
5276 find_end(struct ir3 *ir)
5277 {
5278    foreach_block_rev (block, &ir->block_list) {
5279       foreach_instr_rev (instr, &block->instr_list) {
5280          if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
5281             return instr;
5282       }
5283    }
5284    unreachable("couldn't find end instruction");
5285 }
5286 
5287 static void
5288 collect_tex_prefetches(struct ir3_context *ctx, struct ir3 *ir)
5289 {
5290    unsigned idx = 0;
5291 
5292    /* Collect sampling instructions eligible for pre-dispatch. */
5293    foreach_block (block, &ir->block_list) {
5294       foreach_instr_safe (instr, &block->instr_list) {
5295          if (instr->opc == OPC_META_TEX_PREFETCH) {
5296             assert(idx < ARRAY_SIZE(ctx->so->sampler_prefetch));
5297             struct ir3_sampler_prefetch *fetch =
5298                &ctx->so->sampler_prefetch[idx];
5299             idx++;
5300 
5301             fetch->bindless = instr->flags & IR3_INSTR_B;
5302             if (fetch->bindless) {
5303                /* In bindless mode, the index is actually the base */
5304                fetch->tex_id = instr->prefetch.tex_base;
5305                fetch->samp_id = instr->prefetch.samp_base;
5306                fetch->tex_bindless_id = instr->prefetch.tex;
5307                fetch->samp_bindless_id = instr->prefetch.samp;
5308             } else {
5309                fetch->tex_id = instr->prefetch.tex;
5310                fetch->samp_id = instr->prefetch.samp;
5311             }
5312             fetch->tex_opc = OPC_SAM;
5313             fetch->wrmask = instr->dsts[0]->wrmask;
5314             fetch->dst = instr->dsts[0]->num;
5315             fetch->src = instr->prefetch.input_offset;
5316 
5317             /* These are the limits on a5xx/a6xx, we might need to
5318              * revisit if SP_FS_PREFETCH[n] changes on later gens:
5319              */
5320             assert(fetch->dst <= 0x3f);
5321             assert(fetch->tex_id <= 0x1f);
5322             assert(fetch->samp_id <= 0xf);
5323 
5324             ctx->so->total_in =
5325                MAX2(ctx->so->total_in, instr->prefetch.input_offset + 2);
5326 
5327             fetch->half_precision = !!(instr->dsts[0]->flags & IR3_REG_HALF);
5328 
5329             /* Remove the prefetch placeholder instruction: */
5330             list_delinit(&instr->node);
5331          }
5332       }
5333    }
5334 }
5335 
5336 int
5337 ir3_compile_shader_nir(struct ir3_compiler *compiler,
5338                        struct ir3_shader *shader,
5339                        struct ir3_shader_variant *so)
5340 {
5341    struct ir3_context *ctx;
5342    struct ir3 *ir;
5343    int ret = 0, max_bary;
5344    bool progress;
5345 
5346    MESA_TRACE_FUNC();
5347 
5348    assert(!so->ir);
5349 
5350    ctx = ir3_context_init(compiler, shader, so);
5351    if (!ctx) {
5352       DBG("INIT failed!");
5353       ret = -1;
5354       goto out;
5355    }
5356 
5357    emit_instructions(ctx);
5358 
5359    if (ctx->error) {
5360       DBG("EMIT failed!");
5361       ret = -1;
5362       goto out;
5363    }
5364 
5365    ir = so->ir = ctx->ir;
5366 
5367    if (gl_shader_stage_is_compute(so->type)) {
5368       so->local_size[0] = ctx->s->info.workgroup_size[0];
5369       so->local_size[1] = ctx->s->info.workgroup_size[1];
5370       so->local_size[2] = ctx->s->info.workgroup_size[2];
5371       so->local_size_variable = ctx->s->info.workgroup_size_variable;
5372    }
5373 
5374    /* Vertex shaders in a tessellation or geometry pipeline treat END as a
5375     * NOP and have an epilogue that writes the VS outputs to local storage, to
5376     * be read by the HS.  Then it resets execution mask (chmask) and chains
5377     * to the next shader (chsh). There are also a few output values which we
5378     * must send to the next stage via registers, and in order for both stages
5379     * to agree on the register used we must force these to be in specific
5380     * registers.
5381     */
5382    if ((so->type == MESA_SHADER_VERTEX &&
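   /* As set up below: the gs/tcs header goes in r0.x, rel_patch_id (when
    * present) in r0.y, and primitive_id in the next free register (r0.y or
    * r0.z), matching the register assignments the consumer stage expects.
    */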
5383         (so->key.has_gs || so->key.tessellation)) ||
5384        (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) {
5385       struct ir3_instruction *outputs[3];
5386       unsigned outidxs[3];
5387       unsigned regids[3];
5388       unsigned outputs_count = 0;
5389 
5390       if (ctx->primitive_id) {
5391          unsigned n = so->outputs_count++;
5392          so->outputs[n].slot = VARYING_SLOT_PRIMITIVE_ID;
5393 
5394          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->primitive_id);
5395          outputs[outputs_count] = out;
5396          outidxs[outputs_count] = n;
5397          if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id)
5398             regids[outputs_count] = regid(0, 2);
5399          else
5400             regids[outputs_count] = regid(0, 1);
5401          outputs_count++;
5402       }
5403 
5404       if (so->type == MESA_SHADER_VERTEX && ctx->rel_patch_id) {
5405          unsigned n = so->outputs_count++;
5406          so->outputs[n].slot = VARYING_SLOT_REL_PATCH_ID_IR3;
5407          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->rel_patch_id);
5408          outputs[outputs_count] = out;
5409          outidxs[outputs_count] = n;
5410          regids[outputs_count] = regid(0, 1);
5411          outputs_count++;
5412       }
5413 
5414       if (ctx->gs_header) {
5415          unsigned n = so->outputs_count++;
5416          so->outputs[n].slot = VARYING_SLOT_GS_HEADER_IR3;
5417          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->gs_header);
5418          outputs[outputs_count] = out;
5419          outidxs[outputs_count] = n;
5420          regids[outputs_count] = regid(0, 0);
5421          outputs_count++;
5422       }
5423 
5424       if (ctx->tcs_header) {
5425          unsigned n = so->outputs_count++;
5426          so->outputs[n].slot = VARYING_SLOT_TCS_HEADER_IR3;
5427          struct ir3_instruction *out = ir3_collect(ctx->block, ctx->tcs_header);
5428          outputs[outputs_count] = out;
5429          outidxs[outputs_count] = n;
5430          regids[outputs_count] = regid(0, 0);
5431          outputs_count++;
5432       }
5433 
5434       struct ir3_instruction *chmask =
5435          ir3_instr_create(ctx->block, OPC_CHMASK, 0, outputs_count);
5436       chmask->barrier_class = IR3_BARRIER_EVERYTHING;
5437       chmask->barrier_conflict = IR3_BARRIER_EVERYTHING;
5438 
5439       for (unsigned i = 0; i < outputs_count; i++)
5440          __ssa_src(chmask, outputs[i], 0)->num = regids[i];
5441 
5442       chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count);
5443       memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
5444 
5445       array_insert(ctx->block, ctx->block->keeps, chmask);
5446 
5447       struct ir3_instruction *chsh = ir3_CHSH(ctx->block);
5448       chsh->barrier_class = IR3_BARRIER_EVERYTHING;
5449       chsh->barrier_conflict = IR3_BARRIER_EVERYTHING;
5450    } else {
5451       assert((ctx->noutputs % 4) == 0);
5452       unsigned outidxs[ctx->noutputs / 4];
5453       struct ir3_instruction *outputs[ctx->noutputs / 4];
5454       unsigned outputs_count = 0;
5455 
5456       struct ir3_block *b = ctx->block;
5457       /* Insert these collect's in the block before the end-block if
5458        * possible, so that any moves they generate can be shuffled around to
5459        * reduce nop's:
5460        */
5461       if (ctx->block->predecessors_count == 1)
5462          b = ctx->block->predecessors[0];
5463 
5464       /* Setup IR level outputs, which are "collects" that gather
5465        * the scalar components of outputs.
5466        */
5467       for (unsigned i = 0; i < ctx->noutputs; i += 4) {
5468          unsigned ncomp = 0;
5469          /* figure out the # of components written:
5470           *
5471           * TODO do we need to handle holes, ie. if .x and .z
5472           * components written, but .y component not written?
5473           */
5474          for (unsigned j = 0; j < 4; j++) {
5475             if (!ctx->outputs[i + j])
5476                break;
5477             ncomp++;
5478          }
5479 
5480          /* Note that in some stages, like TCS, store_output is
5481           * lowered to memory writes, so no components of the
5482           * are "written" from the PoV of traditional store-
5483           * output instructions:
5484           */
5485          if (!ncomp)
5486             continue;
5487 
5488          struct ir3_instruction *out =
5489             ir3_create_collect(b, &ctx->outputs[i], ncomp);
5490 
5491          int outidx = i / 4;
5492          assert(outidx < so->outputs_count);
5493 
5494          outidxs[outputs_count] = outidx;
5495          outputs[outputs_count] = out;
5496          outputs_count++;
5497       }
5498 
5499       /* for a6xx+, binning and draw pass VS use the same VBO state, so we
5500        * need to make sure not to remove any inputs that are used by
5501        * the nonbinning VS.
5502        */
5503       if (ctx->compiler->gen >= 6 && so->binning_pass &&
5504           so->type == MESA_SHADER_VERTEX) {
5505          for (int i = 0; i < ctx->ninputs; i++) {
5506             struct ir3_instruction *in = ctx->inputs[i];
5507 
5508             if (!in)
5509                continue;
5510 
5511             unsigned n = i / 4;
5512             unsigned c = i % 4;
5513 
5514             assert(n < so->nonbinning->inputs_count);
5515 
5516             if (so->nonbinning->inputs[n].sysval)
5517                continue;
5518 
5519             /* be sure to keep inputs, even if only used in VS */
5520             if (so->nonbinning->inputs[n].compmask & (1 << c))
5521                array_insert(in->block, in->block->keeps, in);
5522          }
5523       }
5524 
5525       struct ir3_instruction *end =
5526          ir3_instr_create(ctx->block, OPC_END, 0, outputs_count);
5527 
5528       for (unsigned i = 0; i < outputs_count; i++) {
5529          __ssa_src(end, outputs[i], 0);
5530       }
5531 
5532       end->end.outidxs = ralloc_array(end, unsigned, outputs_count);
5533       memcpy(end->end.outidxs, outidxs, sizeof(unsigned) * outputs_count);
5534 
5535       array_insert(ctx->block, ctx->block->keeps, end);
5536    }
5537 
5538    if (so->type == MESA_SHADER_FRAGMENT &&
5539        ctx->s->info.fs.needs_quad_helper_invocations) {
5540       so->need_pixlod = true;
5541       so->need_full_quad = true;
5542    }
5543 
5544    ir3_debug_print(ir, "AFTER: nir->ir3");
5545    ir3_validate(ir);
5546 
5547    IR3_PASS(ir, ir3_remove_unreachable);
5548 
5549    IR3_PASS(ir, ir3_array_to_ssa);
5550 
5551    ir3_calc_reconvergence(so);
5552 
5553    IR3_PASS(ir, ir3_lower_shared_phis);
5554 
5555    do {
5556       progress = false;
5557 
5558       /* the folding doesn't seem to work reliably on a4xx */
5559       if (ctx->compiler->gen != 4)
5560          progress |= IR3_PASS(ir, ir3_cf);
5561       progress |= IR3_PASS(ir, ir3_cp, so);
5562       progress |= IR3_PASS(ir, ir3_cse);
5563       progress |= IR3_PASS(ir, ir3_dce, so);
5564       progress |= IR3_PASS(ir, ir3_opt_predicates, so);
5565       progress |= IR3_PASS(ir, ir3_shared_fold);
5566    } while (progress);
5567 
5568    IR3_PASS(ir, ir3_sched_add_deps);
5569 
5570    /* At this point, all the dead code should be long gone: */
5571    assert(!IR3_PASS(ir, ir3_dce, so));
5572 
5573    ret = ir3_sched(ir);
5574    if (ret) {
5575       DBG("SCHED failed!");
5576       goto out;
5577    }
5578 
5579    ir3_debug_print(ir, "AFTER: ir3_sched");
5580 
5581    /* Pre-assign VS inputs on a6xx+ binning pass shader, to align
5582     * with draw pass VS, so binning and draw pass can both use the
5583     * same VBO state.
5584     *
5585     * Note that VS inputs are expected to be full precision.
5586     */
5587    bool pre_assign_inputs = (ir->compiler->gen >= 6) &&
5588                             (ir->type == MESA_SHADER_VERTEX) &&
5589                             so->binning_pass;
5590 
5591    if (pre_assign_inputs) {
5592       foreach_input (in, ir) {
5593          assert(in->opc == OPC_META_INPUT);
5594          unsigned inidx = in->input.inidx;
5595 
5596          in->dsts[0]->num = so->nonbinning->inputs[inidx].regid;
5597       }
5598    } else if (ctx->tcs_header) {
5599       /* We need to have these values in the same registers between VS and TCS
5600        * since the VS chains to TCS and doesn't get the sysvals redelivered.
5601        */
5602 
5603       ctx->tcs_header->dsts[0]->num = regid(0, 0);
5604       ctx->rel_patch_id->dsts[0]->num = regid(0, 1);
5605       if (ctx->primitive_id)
5606          ctx->primitive_id->dsts[0]->num = regid(0, 2);
5607    } else if (ctx->gs_header) {
5608       /* We need to have these values in the same registers between producer
5609        * (VS or DS) and GS since the producer chains to GS and doesn't get
5610        * the sysvals redelivered.
5611        */
5612 
5613       ctx->gs_header->dsts[0]->num = regid(0, 0);
5614       if (ctx->primitive_id)
5615          ctx->primitive_id->dsts[0]->num = regid(0, 1);
5616    } else if (so->num_sampler_prefetch) {
5617       assert(so->type == MESA_SHADER_FRAGMENT);
5618       int idx = 0;
5619 
5620       foreach_input (instr, ir) {
5621          if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)
5622             continue;
5623 
5624          assert(idx < 2);
5625          instr->dsts[0]->num = idx;
5626          idx++;
5627       }
5628    }
5629 
5630    IR3_PASS(ir, ir3_cleanup_rpt, so);
5631    ret = ir3_ra(so);
5632 
5633    if (ret) {
5634       mesa_loge("ir3_ra() failed!");
5635       goto out;
5636    }
5637 
5638    IR3_PASS(ir, ir3_merge_rpt, so);
5639    IR3_PASS(ir, ir3_postsched, so);
5640 
5641    IR3_PASS(ir, ir3_legalize_relative);
5642    IR3_PASS(ir, ir3_lower_subgroups);
5643 
5644    /* This isn't valid to do when transform feedback is done in HW, which is
5645     * a4xx onward, because the VS may use components not read by the FS for
5646     * transform feedback. Ideally we'd delete this, but a5xx and earlier seem to
5647     * be broken without it.
5648     */
5649    if (so->type == MESA_SHADER_FRAGMENT && ctx->compiler->gen < 6)
5650       pack_inlocs(ctx);
5651 
5652    /*
5653     * Fixup inputs/outputs to point to the actual registers assigned:
5654     *
5655     * 1) initialize to r63.x (invalid/unused)
5656     * 2) iterate IR level inputs/outputs and update the variants
5657     *    inputs/outputs table based on the assigned registers for
5658     *    the remaining inputs/outputs.
5659     */
5660 
5661    for (unsigned i = 0; i < so->inputs_count; i++)
5662       so->inputs[i].regid = INVALID_REG;
5663    for (unsigned i = 0; i < so->outputs_count; i++)
5664       so->outputs[i].regid = INVALID_REG;
5665 
5666    struct ir3_instruction *end = find_end(so->ir);
5667 
5668    for (unsigned i = 0; i < end->srcs_count; i++) {
5669       unsigned outidx = end->end.outidxs[i];
5670       struct ir3_register *reg = end->srcs[i];
5671 
5672       so->outputs[outidx].regid = reg->num;
5673       so->outputs[outidx].half = !!(reg->flags & IR3_REG_HALF);
5674    }
5675 
5676    foreach_input (in, ir) {
5677       assert(in->opc == OPC_META_INPUT);
5678       unsigned inidx = in->input.inidx;
5679 
5680       if (pre_assign_inputs && !so->inputs[inidx].sysval) {
5681          if (VALIDREG(so->nonbinning->inputs[inidx].regid)) {
5682             compile_assert(
5683                ctx, in->dsts[0]->num == so->nonbinning->inputs[inidx].regid);
5684             compile_assert(ctx, !!(in->dsts[0]->flags & IR3_REG_HALF) ==
5685                                    so->nonbinning->inputs[inidx].half);
5686          }
5687          so->inputs[inidx].regid = so->nonbinning->inputs[inidx].regid;
5688          so->inputs[inidx].half = so->nonbinning->inputs[inidx].half;
5689       } else {
5690          so->inputs[inidx].regid = in->dsts[0]->num;
5691          so->inputs[inidx].half = !!(in->dsts[0]->flags & IR3_REG_HALF);
5692       }
5693    }
5694 
5695    uint8_t clip_cull_mask = ctx->so->clip_mask | ctx->so->cull_mask;
5696    /* Having a non-zero clip/cull mask and not writing the corresponding regs
5697     * leads to a GPU fault on A7XX.
5698     */
5699    if (clip_cull_mask &&
5700        ir3_find_output_regid(ctx->so, VARYING_SLOT_CLIP_DIST0) == regid(63, 0)) {
5701       ctx->so->clip_mask &= 0xf0;
5702       ctx->so->cull_mask &= 0xf0;
5703    }
5704    if ((clip_cull_mask >> 4) &&
5705        ir3_find_output_regid(ctx->so, VARYING_SLOT_CLIP_DIST1) == regid(63, 0)) {
5706       ctx->so->clip_mask &= 0xf;
5707       ctx->so->cull_mask &= 0xf;
5708    }
5709 
5710    if (ctx->astc_srgb)
5711       fixup_astc_srgb(ctx);
5712 
5713    if (ctx->compiler->gen == 4 && ctx->s->info.uses_texture_gather)
5714       fixup_tg4(ctx);
5715 
5716    /* We need to do legalize after (for frag shader's) the "bary.f"
5717     * offsets (inloc) have been assigned.
5718     */
5719    IR3_PASS(ir, ir3_legalize, so, &max_bary);
5720 
5721    /* Set (ss)(sy) on first TCS and GEOMETRY instructions, since we don't
5722     * know what we might have to wait on when coming in from VS chsh.
5723     */
5724    if (so->type == MESA_SHADER_TESS_CTRL || so->type == MESA_SHADER_GEOMETRY) {
5725       foreach_block (block, &ir->block_list) {
5726          foreach_instr (instr, &block->instr_list) {
5727             instr->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
5728             break;
5729          }
5730       }
5731    }
5732 
5733    if (ctx->compiler->gen >= 7 && so->type == MESA_SHADER_COMPUTE) {
5734       struct ir3_instruction *end = find_end(so->ir);
5735       struct ir3_instruction *lock =
5736          ir3_instr_create(ctx->block, OPC_LOCK, 0, 0);
5737       /* TODO: These flags should be set by the scheduler only when needed */
5738       lock->flags = IR3_INSTR_SS | IR3_INSTR_SY | IR3_INSTR_JP;
5739       ir3_instr_move_before(lock, end);
5740       struct ir3_instruction *unlock =
5741          ir3_instr_create(ctx->block, OPC_UNLOCK, 0, 0);
5742       ir3_instr_move_before(unlock, end);
5743    }
5744 
5745    so->pvtmem_size = ALIGN(so->pvtmem_size, compiler->pvtmem_per_fiber_align);
5746 
5747    /* Note that max_bary counts inputs that are not bary.f'd for FS: */
5748    if (so->type == MESA_SHADER_FRAGMENT)
5749       so->total_in = max_bary + 1;
5750 
5751    /* Collect sampling instructions eligible for pre-dispatch. */
5752    collect_tex_prefetches(ctx, ir);
5753 
5754    if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
5755        !ctx->s->info.fs.early_fragment_tests)
5756       ctx->so->no_earlyz |= ctx->s->info.writes_memory;
5757 
5758    if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
5759        ctx->s->info.fs.post_depth_coverage)
5760       so->post_depth_coverage = true;
5761 
5762    ctx->so->per_samp = ctx->s->info.fs.uses_sample_shading;
5763 
5764    if (ctx->so->type == MESA_SHADER_FRAGMENT &&
5765        compiler->fs_must_have_non_zero_constlen_quirk) {
5766       so->constlen = MAX2(so->constlen, 4);
5767    }
5768 
5769 out:
5770    if (ret) {
5771       if (so->ir)
5772          ir3_destroy(so->ir);
5773       so->ir = NULL;
5774    }
5775    ir3_context_free(ctx);
5776 
5777    return ret;
5778 }
5779