xref: /aosp_15_r20/external/mesa3d/src/freedreno/ir3/ir3.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2013 Rob Clark <[email protected]>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #ifndef IR3_H_
7 #define IR3_H_
8 
9 #include <stdbool.h>
10 #include <stdint.h>
11 
12 #include "compiler/shader_enums.h"
13 
14 #include "util/bitscan.h"
15 #include "util/list.h"
16 #include "util/set.h"
17 #include "util/u_debug.h"
18 
19 #include "freedreno_common.h"
20 
21 #include "instr-a3xx.h"
22 
23 /* low level intermediate representation of an adreno shader program */
24 
25 struct ir3_compiler;
26 struct ir3;
27 struct ir3_instruction;
28 struct ir3_block;
29 
/* Post-assembly statistics / metadata about a compiled shader binary. */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   /* counts of mov/cov and stp/ldp instructions: */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg; /* highest half-precision GPR # used by shader */
   int16_t max_const;
   /* This is the maximum # of waves that can be executed at once in one
    * core, assuming that they are all executing this shader.
    */
   int8_t max_waves;
   uint8_t subgroup_size;
   bool double_threadsize;
   bool multi_dword_ldp_stp;
   bool early_preamble;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   uint16_t last_helper; /* last instruction to use helper invocations */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};
77 
/* A set of registers that register allocation tries to place in one
 * contiguous span of physical registers.
 * NOTE(review): the per-field semantics below are partly inferred from the
 * names — confirm against the RA/merge passes.
 */
struct ir3_merge_set {
   uint16_t preferred_reg; /* preferred physical base register */
   uint16_t size;          /* total size of the span */
   uint16_t alignment;     /* required alignment of the base */

   unsigned interval_start; /* start of the merged live interval */
   unsigned spill_slot;     /* assigned slot if the set is spilled */

   /* registers belonging to this set: */
   unsigned regs_count;
   struct ir3_register **regs;
};
89 
/* Per-register modifier and meta flags (see struct ir3_register::flags). */
typedef enum ir3_register_flags {
   IR3_REG_CONST = BIT(0), /* register in the const file */
   IR3_REG_IMMED = BIT(1), /* inline immediate value */
   IR3_REG_HALF = BIT(2),  /* half-precision (16-bit) register */
   /* Shared registers have the same value for all threads when read.
    * They can only be written when one thread is active (that is, inside
    * a "getone" block).
    */
   IR3_REG_SHARED = BIT(3),
   IR3_REG_RELATIV = BIT(4), /* relative (address-register) access */
   /* (r) flag, used on sources of repeated (rptN) instructions —
    * TODO(review): confirm exact per-iteration semantics.
    */
   IR3_REG_R = BIT(5),
   /* Most instructions, it seems, can do float abs/neg but not
    * integer.  The CP pass needs to know what is intended (int or
    * float) in order to do the right thing.  For this reason the
    * abs/neg flags are split out into float and int variants.  In
    * addition, .b (bitwise) operations, the negate is actually a
    * bitwise not, so split that out into a new flag to make it
    * more clear.
    */
   IR3_REG_FNEG = BIT(6),
   IR3_REG_FABS = BIT(7),
   IR3_REG_SNEG = BIT(8),
   IR3_REG_SABS = BIT(9),
   IR3_REG_BNOT = BIT(10),
   /* (ei) flag, end-input?  Set on last bary, presumably to signal
    * that the shader needs no more input:
    *
    * Note: Has different meaning on other instructions like add.s/u
    */
   IR3_REG_EI = BIT(11),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_REG_SSA = BIT(12), /* 'def' is ptr to assigning destination */
   IR3_REG_ARRAY = BIT(13),

   /* Set on a use whenever the SSA value becomes dead after the current
    * instruction.
    */
   IR3_REG_KILL = BIT(14),

   /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
    * same SSA value in a single instruction, this is only set on the first
    * use.
    */
   IR3_REG_FIRST_KILL = BIT(15),

   /* Set when a destination doesn't have any uses and is dead immediately
    * after the instruction. This can happen even after optimizations for
    * corner cases such as destinations of atomic instructions.
    */
   IR3_REG_UNUSED = BIT(16),

   /* "Early-clobber" on a destination means that the destination is
    * (potentially) written before any sources are read and therefore
    * interferes with the sources of the instruction.
    */
   IR3_REG_EARLY_CLOBBER = BIT(17),

   /* If this is the last usage of a specific value in the register, the
    * register cannot be read without being written to first after this.
    * Note: This effectively has the same semantics as IR3_REG_KILL.
    */
   IR3_REG_LAST_USE = BIT(18),

   /* Predicate register (p0.c). Cannot be combined with half or shared. */
   IR3_REG_PREDICATE = BIT(19),
} ir3_register_flags;
158 
/* A single source or destination operand of an ir3_instruction. */
struct ir3_register {
   BITMASK_ENUM(ir3_register_flags) flags;

   /* internal (pass-specific) name, e.g. used during RA — presumably a
    * virtual register name; confirm against the RA passes.
    */
   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   /* spilling state: assigned spill location and next-use distance —
    * presumably maintained by the spill pass; confirm there.
    */
   unsigned spill_slot, next_use;

   /* position of this register within its merge set (if any): */
   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   /* live interval bounds used during RA — presumably; confirm in RA. */
   unsigned interval_start, interval_end;
};
224 
225 /*
226  * Stupid/simple growable array implementation:
227  */
/* Declares the count/capacity/data triple for a growable array member. */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;
231 
/* Append a value to a DECLARE_ARRAY'd array, growing the (ralloc'd)
 * backing store as needed; capacity doubles, with a minimum of 16.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
240 
/* Reduce/scan operations (e.g. for subgroup reduction macros).  The suffix
 * denotes operand interpretation: _U unsigned int, _S signed int, _F float,
 * _B bitwise.
 */
typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;
256 
/* Scope for alias instructions (see cat7 alias_scope).  The explicit,
 * non-contiguous values presumably match the hw encoding — confirm against
 * the encoder.
 */
typedef enum {
   ALIAS_TEX = 0,
   ALIAS_RT = 3,
   ALIAS_MEM = 4,
} ir3_alias_scope;
262 
/* Per-instruction modifier and meta flags (see struct ir3_instruction). */
typedef enum ir3_instruction_flags {
   /* (sy) flag is set on first instruction, and after sample
    * instructions (probably just on RAW hazard).
    */
   IR3_INSTR_SY = BIT(0),
   /* (ss) flag is set on first instruction, and first instruction
    * to depend on the result of "long" instructions (RAW hazard):
    *
    *   rcp, rsq, log2, exp2, sin, cos, sqrt
    *
    * It seems to synchronize until all in-flight instructions are
    * completed, for example:
    *
    *   rsq hr1.w, hr1.w
    *   add.f hr2.z, (neg)hr2.z, hc0.y
    *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    *   rsq hr2.x, hr2.x
    *   (rpt1)nop
    *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    *   nop
    *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    *
    * The last mul.f does not have (ss) set, presumably because the
    * (ss) on the previous instruction does the job.
    *
    * The blob driver also seems to set it on WAR hazards, although
    * not really clear if this is needed or just blob compiler being
    * sloppy.  So far I haven't found a case where removing the (ss)
    * causes problems for WAR hazard, but I could just be getting
    * lucky:
    *
    *   rcp r1.y, r3.y
    *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    *
    */
   IR3_INSTR_SS = BIT(1),
   /* (jp) flag is set on jump targets:
    */
   IR3_INSTR_JP = BIT(2),
   /* (eq) flag kills helper invocations when they are no longer needed */
   IR3_INSTR_EQ = BIT(3),
   /* Opcode-specific encoding bits (mostly cat5/cat6); exact semantics
    * depend on the instruction — see instr-a3xx.h / the encoder.
    */
   IR3_INSTR_UL = BIT(4),
   IR3_INSTR_3D = BIT(5),
   IR3_INSTR_A = BIT(6),
   IR3_INSTR_O = BIT(7),
   IR3_INSTR_P = BIT(8),
   IR3_INSTR_S = BIT(9),
   IR3_INSTR_S2EN = BIT(10),
   IR3_INSTR_SAT = BIT(11),
   /* (cat5/cat6) Bindless */
   IR3_INSTR_B = BIT(12),
   /* (cat5/cat6) nonuniform */
   IR3_INSTR_NONUNIF = BIT(13),
   /* (cat5-only) Get some parts of the encoding from a1.x */
   IR3_INSTR_A1EN = BIT(14),
   /* uniform destination for ldc, which must be set if and only if it has a
    * shared reg destination
    */
   IR3_INSTR_U = BIT(15),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(16),

   /* Used by shared register allocation when creating spill/reload instructions
    * to inform validation that this is created by RA. This also may be set on
    * an instruction where a spill has been folded into it.
    */
   IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,

   IR3_INSTR_UNUSED = BIT(17),

   /* Used to indicate that a mov comes from a lowered READ_FIRST/READ_COND
    * and may broadcast a helper invocation's value from a vector register to a
    * shared register that may be read by other invocations. This factors into
    * (eq) calculations.
    */
   IR3_INSTR_NEEDS_HELPERS = BIT(18),

   /* isam.v */
   IR3_INSTR_V = BIT(19),

   /* isam.1d. Note that .1d is an active-low bit. */
   IR3_INSTR_INV_1D = BIT(20),

   /* isam.v/ldib.b/stib.b can optionally use an immediate offset with one of
    * their sources.
    */
   IR3_INSTR_IMM_OFFSET = BIT(21),
} ir3_instruction_flags;
355 
/* A single IR instruction.  Category-specific payload lives in the anonymous
 * union; meta-instruction payloads (split/end/phi/...) share the same union.
 */
struct ir3_instruction {
   struct ir3_block *block; /* basic block containing this instruction */
   opc_t opc;
   BITMASK_ENUM(ir3_instruction_flags) flags;
   /* (rptN) repeat count — presumably 0 means no repeat; see rpt_node
    * below and the ir3_combine_rpt pass.
    */
   uint8_t repeat;
   /* encoded (nopN) cycles — presumably; confirm against the assembler. */
   uint8_t nop;
#if MESA_DEBUG
   /* debug-only: allocated capacity of srcs/dsts, for overflow asserts */
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts; /* destination registers */
   struct ir3_register **srcs; /* source registers */
   union {
      struct {
         char inv1, inv2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;       /* for ldgb/stgb, # of components */
         unsigned d    : 3; /* for ldc, component offset */
         bool typed    : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */

         ir3_alias_scope alias_scope;
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
         unsigned comp;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base  : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
      struct {
         unsigned src_base, src_size;
         unsigned dst_base;
      } push_consts;
      struct {
         uint64_t value;
      } raw;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * its src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *     memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   /* List of this instruction's repeat group. Vectorized NIR instructions are
    * emitted as multiple scalar instructions that are linked together using
    * this field. After RA, the ir3_combine_rpt pass iterates these groups and,
    * if the register assignment allows it, merges them into a (rptN)
    * instruction.
    *
    * NOTE: this is not a typical list as there is no empty list head. The list
    * head is stored in the first instruction of the repeat group so also refers
    * to a list entry. In order to distinguish the list's first entry, we use
    * serialno: instructions in a repeat group are always emitted consecutively
    * so the first will have the lowest serialno.
    *
    * As this is not a typical list, we have to be careful with using the
    * existing list helper. For example, using list_length on the first
    * instruction will yield one less than the number of instructions in its
    * group.
    */
   struct list_head rpt_node;

   /* Monotonic id assigned at creation; also used to identify the first
    * instruction of a repeat group (see rpt_node above).
    */
   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};
564 
/* Represents repeat groups in return values and arguments of the rpt builder
 * API functions.
 */
struct ir3_instruction_rpt {
   /* up to 4 scalar instructions forming one repeat group */
   struct ir3_instruction *rpts[4];
};
571 
/* Top-level IR container for one shader: blocks, arrays, and various
 * side-tables of instructions that later passes need quick access to.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all of an instruction's other
    * dependencies are scheduled before the instruction that writes
    * the address register.  Having a convenient list of instructions
    * that reference some address register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#if MESA_DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};
622 
/* A register array accessed via relative (a0) addressing. */
struct ir3_array {
   struct list_head node; /* entry in ir3::array_list */
   unsigned length;
   unsigned id; /* lookup key, see ir3_lookup_array() */

   struct nir_def *r;

   /* To avoid array writes from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};
648 
649 struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);
650 
/* A basic block: a list of instructions plus CFG edges (both the per-thread
 * and the "physical" machine-level views).
 */
struct ir3_block {
   struct list_head node; /* entry in ir3::block_list */
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_block *successors[2];

   /* whether this block's branch condition may diverge across threads —
    * presumably set during divergence analysis; confirm there.
    */
   bool divergent_condition;

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_successors);

   uint16_t start_ip, end_ip;

   bool reconvergence_point;

   bool in_early_preamble;

   /* Track instructions which do not write a register but other-
    * wise must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data.  Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   /* dominator-tree info, built by ir3_calc_dominance(): */
   struct ir3_block *imm_dom; /* immediate dominator */
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   /* pre/post-order numbering of the dominator tree, presumably used by
    * ir3_block_dominates() — confirm in the dominance code.
    */
   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_depth;

#if MESA_DEBUG
   uint32_t serialno; /* stable debug id, see block_id() */
#endif
};
711 
/* How an ir3_cursor's position relates to its anchor (block or instr). */
enum ir3_cursor_option {
   IR3_CURSOR_BEFORE_BLOCK,
   IR3_CURSOR_AFTER_BLOCK,
   IR3_CURSOR_BEFORE_INSTR,
   IR3_CURSOR_AFTER_INSTR,
};
718 
/* An insertion point in the IR (see ir3_instr_create_at()):  either
 * before/after a block's instruction list, or before/after a specific
 * instruction.  Which union member is valid depends on 'option'.
 */
struct ir3_cursor {
   enum ir3_cursor_option option;
   union {
      struct ir3_block *block;       /* for the *_BLOCK options */
      struct ir3_instruction *instr; /* for the *_INSTR options */
   };
};
726 
/* Minimal builder state: a cursor tracking where ir3_build_instr()
 * inserts newly created instructions.
 */
struct ir3_builder {
   struct ir3_cursor cursor;
};
730 
/* Returns a (debug-friendly) identifier for a block: the stable serialno in
 * debug builds, otherwise an id derived from the pointer value.
 */
static inline uint32_t
block_id(struct ir3_block *block)
{
#if MESA_DEBUG
   return block->serialno;
#else
   /* Cast through uintptr_t rather than unsigned long: on LLP64 platforms
    * (e.g. 64-bit Windows) unsigned long is only 32 bits, so the old
    * pointer->unsigned long conversion was an out-of-range (implementation-
    * defined) narrowing before the intended truncation to 32 bits.
    */
   return (uint32_t)(uintptr_t)block;
#endif
}
740 
/* Returns the first block of the shader (the CFG entry block). */
static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}
746 
/* Returns the last block in the shader's block list. */
static inline struct ir3_block *
ir3_end_block(struct ir3 *ir)
{
   return list_last_entry(&ir->block_list, struct ir3_block, node);
}
752 
753 struct ir3_instruction *ir3_block_get_terminator(struct ir3_block *block);
754 
755 struct ir3_instruction *ir3_block_take_terminator(struct ir3_block *block);
756 
757 struct ir3_instruction *
758 ir3_block_get_last_non_terminator(struct ir3_block *block);
759 
760 struct ir3_instruction *ir3_block_get_last_phi(struct ir3_block *block);
761 
762 static inline struct ir3_block *
ir3_after_preamble(struct ir3 * ir)763 ir3_after_preamble(struct ir3 *ir)
764 {
765    struct ir3_block *block = ir3_start_block(ir);
766    /* The preamble will have a usually-empty else branch, and we want to skip
767     * that to get to the block after the preamble.
768     */
769    struct ir3_instruction *terminator = ir3_block_get_terminator(block);
770    if (terminator && (terminator->opc == OPC_SHPS))
771       return block->successors[1]->successors[0];
772    else
773       return block;
774 }
775 
776 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
777 void ir3_block_link_physical(struct ir3_block *pred, struct ir3_block *succ);
778 void ir3_block_remove_predecessor(struct ir3_block *block,
779                                   struct ir3_block *pred);
780 unsigned ir3_block_get_pred_index(struct ir3_block *block,
781                                   struct ir3_block *pred);
782 
783 void ir3_calc_dominance(struct ir3 *ir);
784 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);
785 
786 struct ir3_shader_variant;
787 
788 struct ir3 *ir3_create(struct ir3_compiler *compiler,
789                        struct ir3_shader_variant *v);
790 void ir3_destroy(struct ir3 *shader);
791 
792 void ir3_collect_info(struct ir3_shader_variant *v);
793 void *ir3_alloc(struct ir3 *shader, int sz);
794 
795 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
796                                          unsigned reg_count,
797                                          bool double_threadsize);
798 
799 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
800                                            bool double_threadsize);
801 
802 bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
803                                   unsigned regs_count);
804 
805 struct ir3_block *ir3_block_create(struct ir3 *shader);
806 
807 struct ir3_instruction *ir3_build_instr(struct ir3_builder *builder, opc_t opc,
808                                         int ndst, int nsrc);
809 struct ir3_instruction *ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc,
810                                             int ndst, int nsrc);
811 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
812                                          int ndst, int nsrc);
813 struct ir3_instruction *ir3_instr_create_at_end(struct ir3_block *block,
814                                                 opc_t opc, int ndst, int nsrc);
815 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
816 void ir3_instr_add_dep(struct ir3_instruction *instr,
817                        struct ir3_instruction *dep);
818 const char *ir3_instr_name(struct ir3_instruction *instr);
819 void ir3_instr_remove(struct ir3_instruction *instr);
820 
821 void ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n);
822 bool ir3_instr_is_rpt(const struct ir3_instruction *instr);
823 bool ir3_instr_is_first_rpt(const struct ir3_instruction *instr);
824 struct ir3_instruction *ir3_instr_prev_rpt(const struct ir3_instruction *instr);
825 struct ir3_instruction *ir3_instr_first_rpt(struct ir3_instruction *instr);
826 unsigned ir3_instr_rpt_length(const struct ir3_instruction *instr);
827 
828 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
829                                     int flags);
830 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
831                                     int flags);
832 struct ir3_register *ir3_reg_clone(struct ir3 *shader,
833                                    struct ir3_register *reg);
834 
835 static inline void
ir3_reg_tie(struct ir3_register * dst,struct ir3_register * src)836 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
837 {
838    assert(!dst->tied && !src->tied);
839    dst->tied = src;
840    src->tied = dst;
841 }
842 
843 void ir3_reg_set_last_array(struct ir3_instruction *instr,
844                             struct ir3_register *reg,
845                             struct ir3_register *last_write);
846 
847 void ir3_instr_set_address(struct ir3_instruction *instr,
848                            struct ir3_instruction *addr);
849 
850 static inline bool
ir3_instr_check_mark(struct ir3_instruction * instr)851 ir3_instr_check_mark(struct ir3_instruction *instr)
852 {
853    if (instr->flags & IR3_INSTR_MARK)
854       return true; /* already visited */
855    instr->flags |= IR3_INSTR_MARK;
856    return false;
857 }
858 
859 void ir3_block_clear_mark(struct ir3_block *block);
860 void ir3_clear_mark(struct ir3 *shader);
861 
862 unsigned ir3_count_instructions(struct ir3 *ir);
863 unsigned ir3_count_instructions_sched(struct ir3 *ir);
864 unsigned ir3_count_instructions_ra(struct ir3 *ir);
865 
/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   /* unlink from current position, then insert ahead of 'after'
    * (list_addtail inserts before the given node)
    */
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}
876 
/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   /* unlink from current position, then insert following 'before'
    * (list_add inserts after the given node)
    */
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}
887 
/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   list_delinit(&instr->node);
   /* inserting right after the list head makes it the first instruction */
   list_add(&instr->node, &block->instr_list);
}
898 
899 typedef bool (*use_filter_cb)(struct ir3_instruction *use, unsigned src_n);
900 
901 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
902 void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter);
903 
904 void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
905 void ir3_fixup_src_type(struct ir3_instruction *instr);
906 
907 int ir3_flut(struct ir3_register *src_reg);
908 
909 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);
910 
911 bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);
912 
913 /**
914  * Given an instruction whose result we want to test for nonzero, return a
915  * potentially different instruction for which the result would be the same.
916  * This might be one of its sources if instr doesn't change the nonzero-ness.
917  */
918 struct ir3_instruction *
919 ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr);
920 
921 bool ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc);
922 
923 #include "util/set.h"
/* Iterate the SSA uses of __instr; the 'uses' set is presumably populated
 * by ir3_find_ssa_uses() (declared above) — it may be NULL, in which case
 * the body is skipped.  The outer for-loop only exists to declare __use.
 */
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
929 
930 static inline uint32_t
reg_num(const struct ir3_register * reg)931 reg_num(const struct ir3_register *reg)
932 {
933    return reg->num >> 2;
934 }
935 
936 static inline uint32_t
reg_comp(const struct ir3_register * reg)937 reg_comp(const struct ir3_register *reg)
938 {
939    return reg->num & 0x3;
940 }
941 
942 static inline bool
is_flow(struct ir3_instruction * instr)943 is_flow(struct ir3_instruction *instr)
944 {
945    return (opc_cat(instr->opc) == 0);
946 }
947 
948 static inline bool
is_terminator(struct ir3_instruction * instr)949 is_terminator(struct ir3_instruction *instr)
950 {
951    switch (instr->opc) {
952    case OPC_BR:
953    case OPC_JUMP:
954    case OPC_BANY:
955    case OPC_BALL:
956    case OPC_BRAA:
957    case OPC_BRAO:
958    case OPC_SHPS:
959    case OPC_GETONE:
960    case OPC_GETLAST:
961    case OPC_PREDT:
962    case OPC_PREDF:
963       return true;
964    default:
965       return false;
966    }
967 }
968 
969 static inline bool
is_kill_or_demote(struct ir3_instruction * instr)970 is_kill_or_demote(struct ir3_instruction *instr)
971 {
972    return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
973 }
974 
975 static inline bool
is_nop(struct ir3_instruction * instr)976 is_nop(struct ir3_instruction *instr)
977 {
978    return instr->opc == OPC_NOP;
979 }
980 
981 static inline bool
is_same_type_reg(struct ir3_register * dst,struct ir3_register * src)982 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
983 {
984    unsigned dst_type = (dst->flags & IR3_REG_HALF);
985    unsigned src_type = (src->flags & IR3_REG_HALF);
986 
987    /* Treat shared->normal copies and normal->shared copies as same-type. */
988    return dst_type == src_type;
989 }
990 
991 /* Is it a non-transformative (ie. not type changing) mov?  This can
992  * also include absneg.s/absneg.f, which for the most part can be
993  * treated as a mov (single src argument).
994  */
995 static inline bool
is_same_type_mov(struct ir3_instruction * instr)996 is_same_type_mov(struct ir3_instruction *instr)
997 {
998    struct ir3_register *dst;
999 
1000    switch (instr->opc) {
1001    case OPC_MOV:
1002       if (instr->cat1.src_type != instr->cat1.dst_type)
1003          return false;
1004       /* If the type of dest reg and src reg are different,
1005        * it shouldn't be considered as same type mov
1006        */
1007       if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
1008          return false;
1009       break;
1010    case OPC_ABSNEG_F:
1011    case OPC_ABSNEG_S:
1012       if (instr->flags & IR3_INSTR_SAT)
1013          return false;
1014       /* If the type of dest reg and src reg are different,
1015        * it shouldn't be considered as same type mov
1016        */
1017       if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
1018          return false;
1019       break;
1020    default:
1021       return false;
1022    }
1023 
1024    dst = instr->dsts[0];
1025 
1026    /* mov's that write to a0 or p0.x are special: */
1027    if (dst->flags & IR3_REG_PREDICATE)
1028       return false;
1029    if (reg_num(dst) == REG_A0)
1030       return false;
1031 
1032    if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
1033       return false;
1034 
1035    return true;
1036 }
1037 
1038 /* A move from const, which changes size but not type, can also be
1039  * folded into dest instruction in some cases.
1040  */
1041 static inline bool
is_const_mov(struct ir3_instruction * instr)1042 is_const_mov(struct ir3_instruction *instr)
1043 {
1044    if (instr->opc != OPC_MOV)
1045       return false;
1046 
1047    if (!(instr->srcs[0]->flags & IR3_REG_CONST))
1048       return false;
1049 
1050    type_t src_type = instr->cat1.src_type;
1051    type_t dst_type = instr->cat1.dst_type;
1052 
1053    return (type_float(src_type) && type_float(dst_type)) ||
1054           (type_uint(src_type) && type_uint(dst_type)) ||
1055           (type_sint(src_type) && type_sint(dst_type));
1056 }
1057 
1058 static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction * instr)1059 is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
1060 {
1061    switch (instr->opc) {
1062    case OPC_BALLOT_MACRO:
1063    case OPC_ANY_MACRO:
1064    case OPC_ALL_MACRO:
1065    case OPC_ELECT_MACRO:
1066    case OPC_READ_COND_MACRO:
1067    case OPC_READ_FIRST_MACRO:
1068    case OPC_SCAN_MACRO:
1069    case OPC_SCAN_CLUSTERS_MACRO:
1070       return true;
1071    default:
1072       return false;
1073    }
1074 }
1075 
1076 static inline bool
is_alu(struct ir3_instruction * instr)1077 is_alu(struct ir3_instruction *instr)
1078 {
1079    return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
1080 }
1081 
1082 static inline bool
is_sfu(struct ir3_instruction * instr)1083 is_sfu(struct ir3_instruction *instr)
1084 {
1085    return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
1086 }
1087 
1088 static inline bool
is_tex(struct ir3_instruction * instr)1089 is_tex(struct ir3_instruction *instr)
1090 {
1091    return (opc_cat(instr->opc) == 5) && instr->opc != OPC_TCINV;
1092 }
1093 
1094 static inline bool
is_tex_shuffle(struct ir3_instruction * instr)1095 is_tex_shuffle(struct ir3_instruction *instr)
1096 {
1097    switch (instr->opc) {
1098    case OPC_BRCST_ACTIVE:
1099    case OPC_QUAD_SHUFFLE_BRCST:
1100    case OPC_QUAD_SHUFFLE_HORIZ:
1101    case OPC_QUAD_SHUFFLE_VERT:
1102    case OPC_QUAD_SHUFFLE_DIAG:
1103       return true;
1104    default:
1105       return false;
1106    }
1107 }
1108 
1109 static inline bool
is_tex_or_prefetch(struct ir3_instruction * instr)1110 is_tex_or_prefetch(struct ir3_instruction *instr)
1111 {
1112    return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
1113 }
1114 
1115 static inline bool
is_mem(struct ir3_instruction * instr)1116 is_mem(struct ir3_instruction *instr)
1117 {
1118    return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
1119 }
1120 
1121 static inline bool
is_barrier(struct ir3_instruction * instr)1122 is_barrier(struct ir3_instruction *instr)
1123 {
1124    return (opc_cat(instr->opc) == 7);
1125 }
1126 
1127 static inline bool
is_half(struct ir3_instruction * instr)1128 is_half(struct ir3_instruction *instr)
1129 {
1130    return !!(instr->dsts[0]->flags & IR3_REG_HALF);
1131 }
1132 
1133 static inline bool
is_shared(struct ir3_instruction * instr)1134 is_shared(struct ir3_instruction *instr)
1135 {
1136    return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
1137 }
1138 
1139 static inline bool
is_store(struct ir3_instruction * instr)1140 is_store(struct ir3_instruction *instr)
1141 {
1142    /* these instructions, the "destination" register is
1143     * actually a source, the address to store to.
1144     */
1145    switch (instr->opc) {
1146    case OPC_STG:
1147    case OPC_STG_A:
1148    case OPC_STGB:
1149    case OPC_STIB:
1150    case OPC_STP:
1151    case OPC_STL:
1152    case OPC_STLW:
1153    case OPC_L2G:
1154    case OPC_G2L:
1155       return true;
1156    default:
1157       return false;
1158    }
1159 }
1160 
1161 static inline bool
is_load(struct ir3_instruction * instr)1162 is_load(struct ir3_instruction *instr)
1163 {
1164    switch (instr->opc) {
1165    case OPC_LDG:
1166    case OPC_LDG_A:
1167    case OPC_LDGB:
1168    case OPC_LDIB:
1169    case OPC_LDL:
1170    case OPC_LDP:
1171    case OPC_L2G:
1172    case OPC_LDLW:
1173    case OPC_LDLV:
1174       /* probably some others too.. */
1175       return true;
1176    case OPC_LDC:
1177       return instr->dsts_count > 0;
1178    default:
1179       return false;
1180    }
1181 }
1182 
1183 static inline bool
is_input(struct ir3_instruction * instr)1184 is_input(struct ir3_instruction *instr)
1185 {
1186    /* in some cases, ldlv is used to fetch varying without
1187     * interpolation.. fortunately inloc is the first src
1188     * register in either case
1189     */
1190    switch (instr->opc) {
1191    case OPC_LDLV:
1192    case OPC_BARY_F:
1193    case OPC_FLAT_B:
1194       return true;
1195    default:
1196       return false;
1197    }
1198 }
1199 
1200 /* Whether non-helper invocations can read the value of helper invocations. We
1201  * cannot insert (eq) before these instructions.
1202  */
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   /* These require helper invocations to be present */
   case OPC_SAMB:
   case OPC_GETLOD:
   case OPC_DSX:
   case OPC_DSY:
   case OPC_DSXPP_1:
   case OPC_DSYPP_1:
   case OPC_DSXPP_MACRO:
   case OPC_DSYPP_MACRO:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
   case OPC_META_TEX_PREFETCH:
      return true;

   /* sam requires helper invocations except for dummy prefetch instructions */
   case OPC_SAM:
      return instr->dsts_count != 0;

   /* Subgroup operations don't require helper invocations to be present, but
    * will use helper invocations if they are present.
    */
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_MOVMSK:
   case OPC_BRCST_ACTIVE:
      return true;

   /* Catch lowered READ_FIRST/READ_COND. For elect, don't include the getone
    * in the preamble because it doesn't actually matter which fiber is
    * selected.
    */
   case OPC_MOV:
   case OPC_ELECT_MACRO:
      return instr->flags & IR3_INSTR_NEEDS_HELPERS;

   /* everything else neither requires nor observes helper lanes */
   default:
      return false;
   }
}
1251 
1252 static inline bool
is_bool(struct ir3_instruction * instr)1253 is_bool(struct ir3_instruction *instr)
1254 {
1255    switch (instr->opc) {
1256    case OPC_CMPS_F:
1257    case OPC_CMPS_S:
1258    case OPC_CMPS_U:
1259       return true;
1260    default:
1261       return false;
1262    }
1263 }
1264 
1265 static inline opc_t
cat3_half_opc(opc_t opc)1266 cat3_half_opc(opc_t opc)
1267 {
1268    switch (opc) {
1269    case OPC_MAD_F32:
1270       return OPC_MAD_F16;
1271    case OPC_SEL_B32:
1272       return OPC_SEL_B16;
1273    case OPC_SEL_S32:
1274       return OPC_SEL_S16;
1275    case OPC_SEL_F32:
1276       return OPC_SEL_F16;
1277    case OPC_SAD_S32:
1278       return OPC_SAD_S16;
1279    default:
1280       return opc;
1281    }
1282 }
1283 
1284 static inline opc_t
cat3_full_opc(opc_t opc)1285 cat3_full_opc(opc_t opc)
1286 {
1287    switch (opc) {
1288    case OPC_MAD_F16:
1289       return OPC_MAD_F32;
1290    case OPC_SEL_B16:
1291       return OPC_SEL_B32;
1292    case OPC_SEL_S16:
1293       return OPC_SEL_S32;
1294    case OPC_SEL_F16:
1295       return OPC_SEL_F32;
1296    case OPC_SAD_S16:
1297       return OPC_SAD_S32;
1298    default:
1299       return opc;
1300    }
1301 }
1302 
1303 static inline opc_t
cat4_half_opc(opc_t opc)1304 cat4_half_opc(opc_t opc)
1305 {
1306    switch (opc) {
1307    case OPC_RSQ:
1308       return OPC_HRSQ;
1309    case OPC_LOG2:
1310       return OPC_HLOG2;
1311    case OPC_EXP2:
1312       return OPC_HEXP2;
1313    default:
1314       return opc;
1315    }
1316 }
1317 
1318 static inline opc_t
cat4_full_opc(opc_t opc)1319 cat4_full_opc(opc_t opc)
1320 {
1321    switch (opc) {
1322    case OPC_HRSQ:
1323       return OPC_RSQ;
1324    case OPC_HLOG2:
1325       return OPC_LOG2;
1326    case OPC_HEXP2:
1327       return OPC_EXP2;
1328    default:
1329       return opc;
1330    }
1331 }
1332 
1333 static inline bool
is_meta(struct ir3_instruction * instr)1334 is_meta(struct ir3_instruction *instr)
1335 {
1336    return (opc_cat(instr->opc) == OPC_META);
1337 }
1338 
1339 static inline unsigned
reg_elems(const struct ir3_register * reg)1340 reg_elems(const struct ir3_register *reg)
1341 {
1342    if (reg->flags & IR3_REG_ARRAY)
1343       return reg->size;
1344    else
1345       return util_last_bit(reg->wrmask);
1346 }
1347 
1348 static inline unsigned
reg_elem_size(const struct ir3_register * reg)1349 reg_elem_size(const struct ir3_register *reg)
1350 {
1351    return (reg->flags & IR3_REG_HALF) ? 1 : 2;
1352 }
1353 
static inline unsigned
reg_size(const struct ir3_register *reg)
{
   /* total size in half-register units */
   const unsigned elems = reg_elems(reg);
   return elems * reg_elem_size(reg);
}
1359 
1360 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
1361  * and have to handle relative accesses specially.
1362  */
1363 
1364 static inline unsigned
post_ra_reg_elems(struct ir3_register * reg)1365 post_ra_reg_elems(struct ir3_register *reg)
1366 {
1367    if (reg->flags & IR3_REG_RELATIV)
1368       return reg->size;
1369    return reg_elems(reg);
1370 }
1371 
1372 static inline unsigned
post_ra_reg_num(struct ir3_register * reg)1373 post_ra_reg_num(struct ir3_register *reg)
1374 {
1375    if (reg->flags & IR3_REG_RELATIV)
1376       return reg->array.base;
1377    return reg->num;
1378 }
1379 
1380 static inline unsigned
dest_regs(struct ir3_instruction * instr)1381 dest_regs(struct ir3_instruction *instr)
1382 {
1383    if (instr->dsts_count == 0)
1384       return 0;
1385 
1386    assert(instr->dsts_count == 1);
1387    return util_last_bit(instr->dsts[0]->wrmask);
1388 }
1389 
1390 static inline bool
is_reg_gpr(const struct ir3_register * reg)1391 is_reg_gpr(const struct ir3_register *reg)
1392 {
1393    if ((reg_num(reg) == REG_A0) || (reg->flags & IR3_REG_PREDICATE))
1394       return false;
1395    if (!(reg->flags & (IR3_REG_SSA | IR3_REG_RELATIV)) &&
1396        reg->num == INVALID_REG)
1397       return false;
1398    return true;
1399 }
1400 
1401 static inline bool
is_reg_a0(const struct ir3_register * reg)1402 is_reg_a0(const struct ir3_register *reg)
1403 {
1404    if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1405       return false;
1406    return reg->num == regid(REG_A0, 0);
1407 }
1408 
1409 /* is dst a normal temp register: */
1410 static inline bool
is_dest_gpr(const struct ir3_register * dst)1411 is_dest_gpr(const struct ir3_register *dst)
1412 {
1413    if (dst->wrmask == 0)
1414       return false;
1415    return is_reg_gpr(dst);
1416 }
1417 
1418 static inline bool
writes_gpr(struct ir3_instruction * instr)1419 writes_gpr(struct ir3_instruction *instr)
1420 {
1421    if (dest_regs(instr) == 0)
1422       return false;
1423    return is_dest_gpr(instr->dsts[0]);
1424 }
1425 
1426 static inline bool
writes_addr0(struct ir3_instruction * instr)1427 writes_addr0(struct ir3_instruction *instr)
1428 {
1429    /* Note: only the first dest can write to a0.x */
1430    if (instr->dsts_count > 0) {
1431       struct ir3_register *dst = instr->dsts[0];
1432       return dst->num == regid(REG_A0, 0);
1433    }
1434    return false;
1435 }
1436 
1437 static inline bool
writes_addr1(struct ir3_instruction * instr)1438 writes_addr1(struct ir3_instruction *instr)
1439 {
1440    /* Note: only the first dest can write to a1.x */
1441    if (instr->dsts_count > 0) {
1442       struct ir3_register *dst = instr->dsts[0];
1443       return dst->num == regid(REG_A0, 1);
1444    }
1445    return false;
1446 }
1447 
1448 static inline bool
writes_pred(struct ir3_instruction * instr)1449 writes_pred(struct ir3_instruction *instr)
1450 {
1451    /* Note: only the first dest can write to p0 */
1452    if (instr->dsts_count > 0) {
1453       struct ir3_register *dst = instr->dsts[0];
1454       return !!(dst->flags & IR3_REG_PREDICATE);
1455    }
1456    return false;
1457 }
1458 
/* r0.x - r47.w are "normal" registers. r48.x - r55.w are shared registers.
 * Everything above those are non-GPR registers like a0.x and p0.x that aren't
 * assigned by RA.  Sizes/offsets below are in units of components (4 per
 * register).
 */
#define GPR_REG_SIZE (4 * 48)
#define SHARED_REG_START GPR_REG_SIZE
#define SHARED_REG_SIZE (4 * 8)
#define NONGPR_REG_START (SHARED_REG_START + SHARED_REG_SIZE)
#define NONGPR_REG_SIZE (4 * 8)

/* Register-file "namespaces" used when deciding whether two registers can
 * alias (see ir3_reg_file_offset()).
 */
enum ir3_reg_file {
   IR3_FILE_FULL,
   IR3_FILE_HALF,
   IR3_FILE_SHARED,
   IR3_FILE_NONGPR,
};
1475 
1476 /* Return a file + offset that can be used for determining if two registers
1477  * alias. The register is only really used for its flags, the num is taken from
1478  * the parameter. Registers overlap if they are in the same file and have an
1479  * overlapping offset. The offset is multiplied by 2 for full registers to
1480  * handle aliasing half and full registers, that is it's in units of half-regs.
1481  */
static inline unsigned
ir3_reg_file_offset(const struct ir3_register *reg, unsigned num,
                    bool mergedregs, enum ir3_reg_file *file)
{
   /* immediates and consts have separate encodings and no file */
   assert(!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   /* element size in half-reg units (1 for half, 2 for full) */
   unsigned size = reg_elem_size(reg);
   if (!is_reg_gpr(reg)) {
      *file = IR3_FILE_NONGPR;
      return (num - NONGPR_REG_START) * size;
   } else if (reg->flags & IR3_REG_SHARED) {
      *file = IR3_FILE_SHARED;
      return (num - SHARED_REG_START) * size;
   } else if (mergedregs || !(reg->flags & IR3_REG_HALF)) {
      /* with merged registers, half regs overlay the full file, so both
       * half and full regs get reported in IR3_FILE_FULL (in half-reg
       * units)
       */
      *file = IR3_FILE_FULL;
      return num * size;
   } else {
      /* separate half-register file; offset is already in half-reg units */
      *file = IR3_FILE_HALF;
      return num;
   }
}
1502 
1503 /* returns defining instruction for reg */
1504 /* TODO better name */
1505 static inline struct ir3_instruction *
ssa(struct ir3_register * reg)1506 ssa(struct ir3_register *reg)
1507 {
1508    if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
1509       return reg->def->instr;
1510    return NULL;
1511 }
1512 
1513 static inline bool
conflicts(struct ir3_register * a,struct ir3_register * b)1514 conflicts(struct ir3_register *a, struct ir3_register *b)
1515 {
1516    return (a && b) && (a->def != b->def);
1517 }
1518 
1519 static inline bool
reg_gpr(struct ir3_register * r)1520 reg_gpr(struct ir3_register *r)
1521 {
1522    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_PREDICATE))
1523       return false;
1524    if (reg_num(r) == REG_A0)
1525       return false;
1526    return true;
1527 }
1528 
1529 static inline bool
reg_is_addr1(struct ir3_register * r)1530 reg_is_addr1(struct ir3_register *r)
1531 {
1532    if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
1533       return false;
1534    return r->num == regid(REG_A0, 1);
1535 }
1536 
1537 static inline type_t
half_type(type_t type)1538 half_type(type_t type)
1539 {
1540    switch (type) {
1541    case TYPE_F32:
1542       return TYPE_F16;
1543    case TYPE_U32:
1544    case TYPE_U8_32:
1545       return TYPE_U16;
1546    case TYPE_S32:
1547       return TYPE_S16;
1548    case TYPE_F16:
1549    case TYPE_U16:
1550    case TYPE_S16:
1551       return type;
1552    case TYPE_U8:
1553       return type;
1554    default:
1555       assert(0);
1556       return (type_t)~0;
1557    }
1558 }
1559 
1560 static inline type_t
full_type(type_t type)1561 full_type(type_t type)
1562 {
1563    switch (type) {
1564    case TYPE_F16:
1565       return TYPE_F32;
1566    case TYPE_U8:
1567    case TYPE_U8_32:
1568    case TYPE_U16:
1569       return TYPE_U32;
1570    case TYPE_S16:
1571       return TYPE_S32;
1572    case TYPE_F32:
1573    case TYPE_U32:
1574    case TYPE_S32:
1575       return type;
1576    default:
1577       assert(0);
1578       return (type_t)~0;
1579    }
1580 }
1581 
1582 /* some cat2 instructions (ie. those which are not float) can embed an
1583  * immediate:
1584  */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   /* integer arithmetic/compare: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   /* bitwise/shift: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   /* varying fetch: */
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}
1626 
1627 /* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   /* float ops accept (abs)/(neg) src modifiers: */
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   /* integer arithmetic ops take no src modifiers: */
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   /* bitwise ops accept the (not) src modifier: */
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}
1687 
1688 /* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   /* float mad/sel accept (neg): */
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */

   case OPC_SEL_B16:
   case OPC_SEL_B32:

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:

   /* all of the above deliberately fall through: no modifiers */
   default:
      return 0;
   }
}
1726 
1727 /* Return the type (float, int, or uint) the op uses when converting from the
1728  * internal result of the op (which is assumed to be the same size as the
1729  * sources) to the destination when they are not the same size. If F32 it does
1730  * a floating-point conversion, if U32 it does a truncation/zero-extension, if
1731  * S32 it does a truncation/sign-extension. "can_fold" will be false if it
1732  * doesn't do anything sensible or is unknown.
1733  */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   /* float-result ops convert via a float conversion: */
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   /* unsigned/bitwise ops zero-extend or truncate: */
   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   /* signed ops sign-extend or truncate: */
   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
1794 
1795 /* Return the src and dst types for the conversion which is already folded
1796  * into the op. We can assume that instr has folded in a conversion from
1797  * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
1798  * to call if ir3_output_conv_type() returns can_fold = true.
1799  */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   /* normal case: the first source's size determines the src type */
   default:
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}
1834 
1835 static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction * instr,type_t base_type)1836 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
1837 {
1838    return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
1839                                                  : full_type(base_type);
1840 }
1841 
1842 /* Some instructions have signed/unsigned variants which are identical except
1843  * for whether the folded conversion sign-extends or zero-extends, and we can
1844  * fold in a mismatching move by rewriting the opcode. Return the opcode to
1845  * switch signedness, and whether one exists.
1846  */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   /* NOTE(review): *can_swap is only ever written (to false) in the no-swap
    * case; callers are presumably expected to initialize it to true — verify
    * at call sites.
    */
   switch (opc) {
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
1868 
/* bitmask with the low n bits set */
#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instructions's sources (reg), also returns src #:
 * (the dummy outer for-loop only scopes __srcreg; NULL srcs are skipped)
 */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (struct ir3_register *)~0; __srcreg;\
           __srcreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instructions's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

/* as foreach_src, but only visits srcs for which __filter() returns true */
#define foreach_src_if(__srcreg, __instr, __filter)                            \
   foreach_src (__srcreg, __instr)                                             \
      if (__filter(__srcreg))

/* iterator for an instructions's destinations (reg), also returns dst #: */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (struct ir3_register *)~0; __dstreg;\
           __dstreg = NULL)                                                    \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instructions's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)

/* as foreach_dst, but only visits dsts for which __filter() returns true */
#define foreach_dst_if(__dstreg, __instr, __filter)                            \
   foreach_dst (__dstreg, __instr)                                             \
      if (__filter(__dstreg))
1902 
1903 static inline unsigned
__ssa_src_cnt(struct ir3_instruction * instr)1904 __ssa_src_cnt(struct ir3_instruction *instr)
1905 {
1906    return instr->srcs_count + instr->deps_count;
1907 }
1908 
1909 static inline bool
__is_false_dep(struct ir3_instruction * instr,unsigned n)1910 __is_false_dep(struct ir3_instruction *instr, unsigned n)
1911 {
1912    if (n >= instr->srcs_count)
1913       return true;
1914    return false;
1915 }
1916 
1917 static inline struct ir3_instruction **
__ssa_srcp_n(struct ir3_instruction * instr,unsigned n)1918 __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
1919 {
1920    if (__is_false_dep(instr, n))
1921       return &instr->deps[n - instr->srcs_count];
1922    if (ssa(instr->srcs[n]))
1923       return &instr->srcs[n]->def->instr;
1924    return NULL;
1925 }
1926 
/* iterator over pointers to an instruction's SSA sources (including false
 * deps), also returns src #; non-SSA sources are skipped by the innermost
 * if.  The outer single-trip loop only scopes __srcp.
 */
#define foreach_ssa_srcp_n(__srcp, __n, __instr)                               \
   for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL)   \
      for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt;      \
           __n++)                                                              \
         if ((__srcp = __ssa_srcp_n(__instr, __n)))

/* same, without the source index: */
#define foreach_ssa_srcp(__srcp, __instr)                                      \
   foreach_ssa_srcp_n (__srcp, __i, __instr)
1935 
/* iterator for an instruction's SSA sources (instr), also returns src #:
 * dereferences the slots visited by foreach_ssa_srcp_n and skips NULL
 * entries.
 */
#define foreach_ssa_src_n(__srcinst, __n, __instr)                             \
   for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst;             \
        __srcinst = NULL)                                                      \
      foreach_ssa_srcp_n (__srcp, __n, __instr)                                \
         if ((__srcinst = *__srcp))

/* iterator for an instruction's SSA sources (instr): */
#define foreach_ssa_src(__srcinst, __instr)                                    \
   foreach_ssa_src_n (__srcinst, __i, __instr)
1946 
/* iterators for shader inputs: visits non-NULL entries of ir->inputs[],
 * also returning the input index in the _n variant.
 */
#define foreach_input_n(__ininstr, __cnt, __ir)                                \
   for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr;             \
        __ininstr = NULL)                                                      \
      for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++)          \
         if ((__ininstr = (__ir)->inputs[__cnt]))
#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
1954 
/* iterators for instructions: thin wrappers over the util/list.h embedded
 * list iterators.  The _safe variants tolerate removal of the current
 * instruction while iterating; the _from variants start at __start rather
 * than the list head.
 */
#define foreach_instr(__instr, __list)                                         \
   list_for_each_entry (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from(__instr, __start, __list)                           \
   list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
                            __list, node)
#define foreach_instr_rev(__instr, __list)                                     \
   list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_safe(__instr, __list)                                    \
   list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
#define foreach_instr_from_safe(__instr, __start, __list)                      \
   list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start,     \
                                 __list, node)
1968 
/* Iterate over all instructions in a repeat group, including __instr itself.
 * __instr must be the first instruction of the group (asserted).  The
 * __first sentinel lets the circular rpt_node list terminate when the walk
 * comes back around to __instr.
 */
#define foreach_instr_rpt(__rpt, __instr)                                      \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      for (struct ir3_instruction *__rpt = __instr, *__first = __instr;        \
           __first || __rpt != __instr;                                        \
           __first = NULL, __rpt =                                             \
                              list_entry(__rpt->rpt_node.next,                 \
                                         struct ir3_instruction, rpt_node))

/* Iterate over all instructions except the first one in a repeat group. */
#define foreach_instr_rpt_excl(__rpt, __instr)                                 \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      list_for_each_entry (struct ir3_instruction, __rpt, &__instr->rpt_node,  \
                           rpt_node)

/* Same as foreach_instr_rpt_excl, but safe against removal of __rpt. */
#define foreach_instr_rpt_excl_safe(__rpt, __instr)                            \
   if (assert(ir3_instr_is_first_rpt(__instr)), true)                          \
      list_for_each_entry_safe (struct ir3_instruction, __rpt,                 \
                                &__instr->rpt_node, rpt_node)
1988 
/* iterators for blocks (the _safe variant tolerates removing the current
 * block while iterating):
 */
#define foreach_block(__block, __list)                                         \
   list_for_each_entry (struct ir3_block, __block, __list, node)
#define foreach_block_safe(__block, __list)                                    \
   list_for_each_entry_safe (struct ir3_block, __block, __list, node)
#define foreach_block_rev(__block, __list)                                     \
   list_for_each_entry_rev (struct ir3_block, __block, __list, node)

/* iterators for arrays: */
#define foreach_array(__array, __list)                                         \
   list_for_each_entry (struct ir3_array, __array, __list, node)
#define foreach_array_safe(__array, __list)                                    \
   list_for_each_entry_safe (struct ir3_array, __array, __list, node)
2002 
/* Run an ir3 pass and, if it reports progress, dump the IR (under debug)
 * and re-validate it.  The statement-expression evaluates to the pass's
 * progress bool.
 */
#define IR3_PASS(ir, pass, ...)                                                \
   ({                                                                          \
      bool progress = pass(ir, ##__VA_ARGS__);                                 \
      if (progress) {                                                          \
         ir3_debug_print(ir, "AFTER: " #pass);                                 \
         ir3_validate(ir);                                                     \
      }                                                                        \
      progress;                                                                \
   })
2012 
/* validate: consistency-check the IR (typically run after each pass via
 * IR3_PASS):
 */
void ir3_validate(struct ir3 *ir);

/* dump: print IR for debugging: */
void ir3_print(struct ir3 *ir);
void ir3_print_instr(struct ir3_instruction *instr);

struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);

/* delay calculation: number of delay slots needed between 'assigner'
 * (producer) and 'consumer'; 'soft' presumably selects the estimated
 * soft-delay variants — confirm against the definition.
 */
int ir3_delayslots(struct ir3_compiler *compiler,
                   struct ir3_instruction *assigner,
                   struct ir3_instruction *consumer, unsigned n, bool soft);
unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
                                    struct ir3_instruction *assigner,
                                    struct ir3_instruction *consumer,
                                    unsigned assigner_n, unsigned consumer_n);
2031 
2032 /* estimated (ss)/(sy) delay calculation */
2033 
2034 static inline bool
is_local_mem_load(struct ir3_instruction * instr)2035 is_local_mem_load(struct ir3_instruction *instr)
2036 {
2037    return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
2038       instr->opc == OPC_LDLW;
2039 }
2040 
/* Whether instr executes on the scalar ALU (defined out-of-line; depends
 * on the compiler/gen).
 */
bool is_scalar_alu(struct ir3_instruction *instr,
                   const struct ir3_compiler *compiler);
2043 
/* Does this instruction sometimes need (ss) to wait for its result? */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
   /* Writes to shared registers are synchronized with (ss): */
   foreach_dst (dst, instr) {
      if (dst->flags & IR3_REG_SHARED)
         return true;
   }

   /* In the early preamble, a1.x writes also need (ss): */
   if (instr->block->in_early_preamble && writes_addr1(instr))
      return true;

   return is_sfu(instr) || is_local_mem_load(instr);
}
2058 
2059 static inline bool
needs_ss(const struct ir3_compiler * compiler,struct ir3_instruction * producer,struct ir3_instruction * consumer)2060 needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer,
2061          struct ir3_instruction *consumer)
2062 {
2063    if (is_scalar_alu(producer, compiler) &&
2064        is_scalar_alu(consumer, compiler) &&
2065        (producer->dsts[0]->flags & IR3_REG_HALF) ==
2066        (consumer->srcs[0]->flags & IR3_REG_HALF))
2067       return false;
2068 
2069    return is_ss_producer(producer);
2070 }
2071 
/* The soft delay for approximating the cost of (ss). */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
{
   /* On a6xx, the number of delay slots to get an SFU result back (ie.
    * using nop's instead of (ss)) is:
    *
    *     8 - single warp
    *     9 - two warps
    *    10 - four warps
    *
    * and so on.  Not quite sure where it tapers out (ie. how many warps
    * share an SFU unit), but 10 seems like a reasonable # to choose:
    */
   const unsigned sfu_delay = 10;

   /* The blob adds 6 nops between shared producers and consumers, and
    * before we used (ss) this was sufficient in most cases.
    */
   const unsigned shared_delay = 6;

   if (is_sfu(instr) || is_local_mem_load(instr))
      return sfu_delay;

   return shared_delay;
}
2094 
2095 static inline bool
is_sy_producer(struct ir3_instruction * instr)2096 is_sy_producer(struct ir3_instruction *instr)
2097 {
2098    return is_tex_or_prefetch(instr) ||
2099       (is_load(instr) && !is_local_mem_load(instr)) ||
2100       is_atomic(instr->opc);
2101 }
2102 
2103 static inline unsigned
soft_sy_delay(struct ir3_instruction * instr,struct ir3 * shader)2104 soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
2105 {
2106    /* TODO: this is just an optimistic guess, we can do better post-RA.
2107     */
2108    bool double_wavesize =
2109       shader->type == MESA_SHADER_FRAGMENT ||
2110       shader->type == MESA_SHADER_COMPUTE;
2111 
2112    unsigned components = reg_elems(instr->dsts[0]);
2113 
2114    /* These numbers come from counting the number of delay slots to get
2115     * cat5/cat6 results back using nops instead of (sy). Note that these numbers
2116     * are with the result preloaded to cache by loading it before in the same
2117     * shader - uncached results are much larger.
2118     *
2119     * Note: most ALU instructions can't complete at the full doubled rate, so
2120     * they take 2 cycles. The only exception is fp16 instructions with no
2121     * built-in conversions. Therefore divide the latency by 2.
2122     *
2123     * TODO: Handle this properly in the scheduler and remove this.
2124     */
2125    if (instr->opc == OPC_LDC) {
2126       if (double_wavesize)
2127          return (21 + 8 * components) / 2;
2128       else
2129          return 18 + 4 * components;
2130    } else if (is_tex_or_prefetch(instr)) {
2131       if (double_wavesize) {
2132          switch (components) {
2133          case 1: return 58 / 2;
2134          case 2: return 60 / 2;
2135          case 3: return 77 / 2;
2136          case 4: return 79 / 2;
2137          default: unreachable("bad number of components");
2138          }
2139       } else {
2140          switch (components) {
2141          case 1: return 51;
2142          case 2: return 53;
2143          case 3: return 62;
2144          case 4: return 64;
2145          default: unreachable("bad number of components");
2146          }
2147       }
2148    } else {
2149       /* TODO: measure other cat6 opcodes like ldg */
2150       if (double_wavesize)
2151          return (172 + components) / 2;
2152       else
2153          return 109 + components;
2154    }
2155 }
2156 
2157 /* Some instructions don't immediately consume their sources so may introduce a
2158  * WAR hazard.
2159  */
2160 static inline bool
is_war_hazard_producer(struct ir3_instruction * instr)2161 is_war_hazard_producer(struct ir3_instruction *instr)
2162 {
2163    return is_tex(instr) || is_mem(instr) || is_ss_producer(instr) ||
2164           instr->opc == OPC_STC;
2165 }
2166 
/* Pass entry points.  A bool return reports whether the pass made progress
 * (see IR3_PASS above).
 */

/* repeat-group cleanup/merging and predicate optimization: */
bool ir3_cleanup_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
bool ir3_merge_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v);

/* unreachable block elimination: */
bool ir3_remove_unreachable(struct ir3 *ir);

/* calculate reconvergence information: */
void ir3_calc_reconvergence(struct ir3_shader_variant *so);

/* lower invalid shared phis after calculating reconvergence information: */
bool ir3_lower_shared_phis(struct ir3 *ir);

/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);

/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);

/* shared mov folding */
bool ir3_shared_fold(struct ir3 *ir);

/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);

/* common subexpression elimination: */
bool ir3_cse(struct ir3 *ir);

/* Make arrays SSA */
bool ir3_array_to_ssa(struct ir3 *ir);

/* scheduling: */
bool ir3_sched_add_deps(struct ir3 *ir);
int ir3_sched(struct ir3 *ir);

struct ir3_context;
bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);

/* register assignment: */
int ir3_ra(struct ir3_shader_variant *v);
void ir3_ra_predicates(struct ir3_shader_variant *v);

/* lower subgroup ops: */
bool ir3_lower_subgroups(struct ir3 *ir);

/* legalize: */
bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
bool ir3_legalize_relative(struct ir3 *ir);
2216 
2217 static inline bool
ir3_has_latency_to_hide(struct ir3 * ir)2218 ir3_has_latency_to_hide(struct ir3 *ir)
2219 {
2220    /* VS/GS/TCS/TESS  co-exist with frag shader invocations, but we don't
2221     * know the nature of the fragment shader.  Just assume it will have
2222     * latency to hide:
2223     */
2224    if (ir->type != MESA_SHADER_FRAGMENT)
2225       return true;
2226 
2227    foreach_block (block, &ir->block_list) {
2228       foreach_instr (instr, &block->instr_list) {
2229          if (is_tex_or_prefetch(instr))
2230             return true;
2231 
2232          if (is_load(instr)) {
2233             switch (instr->opc) {
2234             case OPC_LDLV:
2235             case OPC_LDL:
2236             case OPC_LDLW:
2237                break;
2238             default:
2239                return true;
2240             }
2241          }
2242       }
2243    }
2244 
2245    return false;
2246 }
2247 
/**
 * Move 'instr' to after the last phi node at the beginning of the block:
 */
static inline void
ir3_instr_move_after_phis(struct ir3_instruction *instr,
                          struct ir3_block *block)
{
   struct ir3_instruction *phi = ir3_block_get_last_phi(block);
   if (!phi) {
      /* No phis: insert at the very start of the block. */
      ir3_instr_move_before_block(instr, block);
   } else {
      ir3_instr_move_after(instr, phi);
   }
}
2261 
2262 static inline struct ir3_cursor
ir3_before_block(struct ir3_block * block)2263 ir3_before_block(struct ir3_block *block)
2264 {
2265    assert(block);
2266    struct ir3_cursor cursor;
2267    cursor.option = IR3_CURSOR_BEFORE_BLOCK;
2268    cursor.block = block;
2269    return cursor;
2270 }
2271 
2272 static inline struct ir3_cursor
ir3_after_block(struct ir3_block * block)2273 ir3_after_block(struct ir3_block *block)
2274 {
2275    assert(block);
2276    struct ir3_cursor cursor;
2277    cursor.option = IR3_CURSOR_AFTER_BLOCK;
2278    cursor.block = block;
2279    return cursor;
2280 }
2281 
2282 static inline struct ir3_cursor
ir3_before_instr(struct ir3_instruction * instr)2283 ir3_before_instr(struct ir3_instruction *instr)
2284 {
2285    assert(instr);
2286    struct ir3_cursor cursor;
2287    cursor.option = IR3_CURSOR_BEFORE_INSTR;
2288    cursor.instr = instr;
2289    return cursor;
2290 }
2291 
2292 static inline struct ir3_cursor
ir3_after_instr(struct ir3_instruction * instr)2293 ir3_after_instr(struct ir3_instruction *instr)
2294 {
2295    assert(instr);
2296    struct ir3_cursor cursor;
2297    cursor.option = IR3_CURSOR_AFTER_INSTR;
2298    cursor.instr = instr;
2299    return cursor;
2300 }
2301 
2302 static inline struct ir3_cursor
ir3_before_terminator(struct ir3_block * block)2303 ir3_before_terminator(struct ir3_block *block)
2304 {
2305    assert(block);
2306    struct ir3_instruction *terminator = ir3_block_get_terminator(block);
2307 
2308    if (terminator)
2309       return ir3_before_instr(terminator);
2310    return ir3_after_block(block);
2311 }
2312 
2313 static inline struct ir3_cursor
ir3_after_phis(struct ir3_block * block)2314 ir3_after_phis(struct ir3_block *block)
2315 {
2316    assert(block);
2317 
2318    foreach_instr (instr, &block->instr_list) {
2319       if (instr->opc != OPC_META_PHI)
2320          return ir3_before_instr(instr);
2321    }
2322 
2323    return ir3_after_block(block);
2324 }
2325 
2326 static inline struct ir3_builder
ir3_builder_at(struct ir3_cursor cursor)2327 ir3_builder_at(struct ir3_cursor cursor)
2328 {
2329    struct ir3_builder builder;
2330    builder.cursor = cursor;
2331    return builder;
2332 }
2333 
2334 
2335 /* ************************************************************************* */
2336 /* instruction helpers */
2337 
/* creates SSA src of correct type (ie. half vs full precision): the new
 * src inherits src's dst HALF/SHARED flags and wrmask, and points its def
 * at src's dst.
 */
static inline struct ir3_register *
__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
          unsigned flags)
{
   struct ir3_register *reg;
   flags |= src->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
   reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
   reg->def = src->dsts[0];
   reg->wrmask = src->dsts[0]->wrmask;
   return reg;
}
2350 
/* creates an SSA dst for instr, with reg->instr pointing back at the
 * defining instruction.
 */
static inline struct ir3_register *
__ssa_dst(struct ir3_instruction *instr)
{
   struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
   reg->instr = instr;
   return reg;
}
2358 
2359 static BITMASK_ENUM(ir3_register_flags)
type_flags(type_t type)2360 type_flags(type_t type)
2361 {
2362    if (type_size(type) < 32)
2363       return IR3_REG_HALF;
2364    return (ir3_register_flags)0;
2365 }
2366 
/* Create a mov of the immediate 'val' with src/dst type 'type', dst
 * optionally in a shared register.
 */
static inline struct ir3_instruction *
create_immed_typed_shared(struct ir3_block *block, uint32_t val, type_t type, bool shared)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   /* dst inherits half-ness from the type, sharedness on request: */
   __ssa_dst(mov)->flags |= flags | (shared ? IR3_REG_SHARED : 0);
   ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;

   return mov;
}
2381 
/* Typed immediate, non-shared dst: */
static inline struct ir3_instruction *
create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
{
   return create_immed_typed_shared(block, val, type, false);
}

/* u32 immediate, optionally shared dst: */
static inline struct ir3_instruction *
create_immed_shared(struct ir3_block *block, uint32_t val, bool shared)
{
   return create_immed_typed_shared(block, val, TYPE_U32, shared);
}

/* u32 immediate, non-shared dst: */
static inline struct ir3_instruction *
create_immed(struct ir3_block *block, uint32_t val)
{
   return create_immed_shared(block, val, false);
}
2399 
/* Create a mov from const register n with src/dst type 'type'. */
static inline struct ir3_instruction *
create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
{
   struct ir3_instruction *mov;
   ir3_register_flags flags = type_flags(type);

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov)->flags |= flags;
   ir3_src_create(mov, n, IR3_REG_CONST | flags);

   return mov;
}

/* f32-typed const-register mov: */
static inline struct ir3_instruction *
create_uniform(struct ir3_block *block, unsigned n)
{
   return create_uniform_typed(block, n, TYPE_F32);
}
2420 
/* Create a relative (a0-indexed) const-register mov, with constant offset
 * 'n' added to the address in 'address'.
 */
static inline struct ir3_instruction *
create_uniform_indirect(struct ir3_block *block, int n, type_t type,
                        struct ir3_instruction *address)
{
   struct ir3_instruction *mov;

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->cat1.src_type = type;
   mov->cat1.dst_type = type;
   __ssa_dst(mov);
   ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;

   ir3_instr_set_address(mov, address);

   return mov;
}
2437 
/* Create a same-type mov of src's result; array-ness and sharedness of the
 * source are propagated.  Relative sources are not supported (asserted).
 */
static inline struct ir3_instruction *
ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   ir3_register_flags flags = type_flags(type) | (src->dsts[0]->flags & IR3_REG_SHARED);

   __ssa_dst(instr)->flags |= flags;
   if (src->dsts[0]->flags & IR3_REG_ARRAY) {
      /* array srcs additionally carry the array info: */
      struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
      src_reg->array = src->dsts[0]->array;
   } else {
      __ssa_src(instr, src, 0);
   }
   assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
   instr->cat1.src_type = type;
   instr->cat1.dst_type = type;
   return instr;
}
2456 
/* Repeat-group variant of ir3_MOV: creates one mov per repeat and links
 * them into a repeat group via ir3_instr_create_rpt().
 */
static inline struct ir3_instruction_rpt
ir3_MOV_rpt(struct ir3_block *block, unsigned nrpt,
            struct ir3_instruction_rpt src, type_t type)
{
   struct ir3_instruction_rpt dst;
   assert(nrpt <= ARRAY_SIZE(dst.rpts));

   for (unsigned rpt = 0; rpt < nrpt; ++rpt)
      dst.rpts[rpt] = ir3_MOV(block, src.rpts[rpt], type);

   ir3_instr_create_rpt(dst.rpts, nrpt);
   return dst;
}
2470 
/* Create a converting mov (cov) from src_type to dst_type.  The source's
 * half-ness must match src_type, and array sources are not supported
 * (both asserted).  Sharedness is propagated to the dst.
 */
static inline struct ir3_instruction *
ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
        type_t dst_type)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
   ir3_register_flags dst_flags = type_flags(dst_type) | (src->dsts[0]->flags & IR3_REG_SHARED);
   ASSERTED ir3_register_flags src_flags = type_flags(src_type);

   assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);

   __ssa_dst(instr)->flags |= dst_flags;
   __ssa_src(instr, src, 0);
   instr->cat1.src_type = src_type;
   instr->cat1.dst_type = dst_type;
   assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
   return instr;
}
2488 
2489 static inline struct ir3_instruction_rpt
ir3_COV_rpt(struct ir3_block * block,unsigned nrpt,struct ir3_instruction_rpt src,type_t src_type,type_t dst_type)2490 ir3_COV_rpt(struct ir3_block *block, unsigned nrpt,
2491             struct ir3_instruction_rpt src, type_t src_type, type_t dst_type)
2492 {
2493    struct ir3_instruction_rpt dst;
2494 
2495    for (unsigned rpt = 0; rpt < nrpt; ++rpt)
2496       dst.rpts[rpt] = ir3_COV(block, src.rpts[rpt], src_type, dst_type);
2497 
2498    ir3_instr_create_rpt(dst.rpts, nrpt);
2499    return dst;
2500 }
2501 
/* Create a movmsk with a shared dst covering 'components' components.
 * NOTE(review): assumes components >= 1 — components == 0 would wrap
 * instr->repeat; confirm callers guarantee this.
 */
static inline struct ir3_instruction *
ir3_MOVMSK(struct ir3_block *block, unsigned components)
{
   struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;
   instr->repeat = components - 1;
   return instr;
}
2513 
/* Create a ballot macro instruction: single src, shared dst covering
 * 'components' components.
 */
static inline struct ir3_instruction *
ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
                 unsigned components)
{
   struct ir3_instruction *instr =
      ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);

   struct ir3_register *dst = __ssa_dst(instr);
   dst->flags |= IR3_REG_SHARED;
   dst->wrmask = (1 << components) - 1;

   __ssa_src(instr, src, 0);

   return instr;
}
2529 
/* Create a nop instruction (no dsts, no srcs). */
static inline struct ir3_instruction *
ir3_NOP(struct ir3_block *block)
{
   return ir3_instr_create(block, OPC_NOP, 0, 0);
}
2535 
/* Generator for zero-source instruction builders: defines ir3_<name>(block)
 * creating the instruction with one dst and ORing in the given flag.
 * INSTR0F names the builder ir3_<name>_<f> and sets IR3_INSTR_<f>.
 */
/* clang-format off */
#define __INSTR0(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(struct ir3_block *block)      \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0);         \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR0(name)     __INSTR0((ir3_instruction_flags)0, name, OPC_##name)
2547 
/* Generator for one-source instruction builders: defines ir3_<name>() plus a
 * repeat-group variant ir3_<name>_rpt().  When scalar_alu is set, the dst
 * inherits IR3_REG_SHARED from the source so scalar-ALU results stay shared.
 * INSTR1S selects scalar_alu; INSTR1NODST emits no dst.
 */
/* clang-format off */
#define __INSTR1(flag, dst_count, name, opc, scalar_alu)                       \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags)        \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 1);                              \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & IR3_REG_SHARED) : 0;  \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr)->flags |= dst_flag;                                     \
   __ssa_src(instr, a, aflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}                                                                              \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
   struct ir3_block *block, unsigned nrpt,                                     \
   struct ir3_instruction_rpt a, unsigned aflags)                              \
{                                                                              \
   struct ir3_instruction_rpt dst;                                             \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                       \
   for (unsigned rpt = 0; rpt < nrpt; rpt++)                                   \
      dst.rpts[rpt] = ir3_##name(block, a.rpts[rpt], aflags);                  \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                       \
   return dst;                                                                 \
}

/* clang-format on */
#define INSTR1F(f, name)  __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                   false)
#define INSTR1(name)      __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR1S(name)     __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR1NODST(name) __INSTR1((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2580 
2581 /* clang-format off */
2582 #define __INSTR2(flag, dst_count, name, opc, scalar_alu)                       \
2583 static inline struct ir3_instruction *ir3_##name(                              \
2584    struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
2585    struct ir3_instruction *b, unsigned bflags)                                 \
2586 {                                                                              \
2587    struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 2); \
2588    unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags &   \
2589                                      IR3_REG_SHARED) : 0;                      \
2590    for (unsigned i = 0; i < dst_count; i++)                                    \
2591       __ssa_dst(instr)->flags |= dst_flag;                                     \
2592    __ssa_src(instr, a, aflags);                                                \
2593    __ssa_src(instr, b, bflags);                                                \
2594    instr->flags |= flag;                                                       \
2595    return instr;                                                               \
2596 }                                                                              \
2597 static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
2598    struct ir3_block *block, unsigned nrpt,                                     \
2599    struct ir3_instruction_rpt a, unsigned aflags,                              \
2600    struct ir3_instruction_rpt b, unsigned bflags)                              \
2601 {                                                                              \
2602    struct ir3_instruction_rpt dst;                                             \
2603    assert(nrpt <= ARRAY_SIZE(dst.rpts));                                       \
2604    for (unsigned rpt = 0; rpt < nrpt; rpt++) {                                 \
2605       dst.rpts[rpt] = ir3_##name(block, a.rpts[rpt], aflags,                   \
2606                                  b.rpts[rpt], bflags);                         \
2607    }                                                                           \
2608    ir3_instr_create_rpt(dst.rpts, nrpt);                                       \
2609    return dst;                                                                 \
2610 }
2611 /* clang-format on */
/* Wrappers over __INSTR2:
 *   INSTR2F(f, name) - bakes in flag IR3_INSTR_##f, emits ir3_<name>_<f>()
 *   INSTR2(name)     - plain two-src, one-dst builder
 *   INSTR2S(name)    - scalar-ALU variant (dst inherits IR3_REG_SHARED when
 *                      all srcs are shared)
 *   INSTR2NODST(name) - two-src builder with no dst
 */
#define INSTR2F(f, name)   __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name,  \
                                    false)
#define INSTR2(name)       __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR2S(name)      __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR2NODST(name)  __INSTR2((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2617 
2618 /* clang-format off */
/* Emits two builders per instruction:
 *   ir3_<name>()      - three-src builder; when scalar_alu is set, the dst
 *                       inherits IR3_REG_SHARED iff all three srcs are shared.
 *   ir3_<name>_rpt()  - builds nrpt copies (one per repeat) and groups them
 *                       via ir3_instr_create_rpt().
 */
#define __INSTR3(flag, dst_count, name, opc, scalar_alu)                       \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 3);                              \
   unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags &   \
                                     c->dsts[0]->flags & IR3_REG_SHARED) : 0;  \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr)->flags |= dst_flag;                                     \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}                                                                              \
static inline struct ir3_instruction_rpt ir3_##name##_rpt(                     \
   struct ir3_block *block, unsigned nrpt,                                     \
   struct ir3_instruction_rpt a, unsigned aflags,                              \
   struct ir3_instruction_rpt b, unsigned bflags,                              \
   struct ir3_instruction_rpt c, unsigned cflags)                              \
{                                                                              \
   struct ir3_instruction_rpt dst;                                             \
   assert(nrpt <= ARRAY_SIZE(dst.rpts));                                       \
   for (unsigned rpt = 0; rpt < nrpt; rpt++) {                                 \
      dst.rpts[rpt] = ir3_##name(block, a.rpts[rpt], aflags,                   \
                                 b.rpts[rpt], bflags,                          \
                                 c.rpts[rpt], cflags);                         \
   }                                                                           \
   ir3_instr_create_rpt(dst.rpts, nrpt);                                       \
   return dst;                                                                 \
}
2653 /* clang-format on */
/* Wrappers over __INSTR3 (same scheme as the INSTR2* family): F = baked-in
 * instruction flag, S = scalar-ALU dst-flag propagation, NODST = no dst.
 */
#define INSTR3F(f, name)  __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name,   \
                                   false)
#define INSTR3(name)      __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, false)
#define INSTR3S(name)     __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, true)
#define INSTR3NODST(name) __INSTR3((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2659 
2660 /* clang-format off */
/* Emits ir3_<name>(): four-src builder.  No scalar-ALU or rpt handling at
 * this arity (unlike __INSTR2/__INSTR3).
 */
#define __INSTR4(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags)                \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 4);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR4F(f, name)  __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR4(name)      __INSTR4((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR4NODST(name) __INSTR4((ir3_instruction_flags)0, 0, name, OPC_##name)
2682 
2683 /* clang-format off */
/* Emits ir3_<name>(): five-src builder.  Always exactly one dst (there is no
 * dst_count parameter / NODST variant at this arity).
 */
#define __INSTR5(flag, name, opc)                                              \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags)                                 \
{                                                                              \
   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5);         \
   __ssa_dst(instr);                                                           \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
#define INSTR5(name)     __INSTR5((ir3_instruction_flags)0, name, OPC_##name)
2704 
2705 /* clang-format off */
/* Emits ir3_<name>(): six-src builder.  No scalar-ALU or rpt handling at
 * this arity.
 *
 * NOTE: dst_count is forwarded to ir3_instr_create() (previously a
 * hard-coded 1), consistent with __INSTR4, so the NODST variant does not
 * allocate a dst register slot it never populates.
 */
#define __INSTR6(flag, dst_count, name, opc)                                   \
static inline struct ir3_instruction *ir3_##name(                              \
   struct ir3_block *block, struct ir3_instruction *a, unsigned aflags,        \
   struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c,      \
   unsigned cflags, struct ir3_instruction *d, unsigned dflags,                \
   struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f,      \
   unsigned fflags)                                                            \
{                                                                              \
   struct ir3_instruction *instr =                                             \
      ir3_instr_create(block, opc, dst_count, 6);                              \
   for (unsigned i = 0; i < dst_count; i++)                                    \
      __ssa_dst(instr);                                                        \
   __ssa_src(instr, a, aflags);                                                \
   __ssa_src(instr, b, bflags);                                                \
   __ssa_src(instr, c, cflags);                                                \
   __ssa_src(instr, d, dflags);                                                \
   __ssa_src(instr, e, eflags);                                                \
   __ssa_src(instr, f, fflags);                                                \
   instr->flags |= flag;                                                       \
   return instr;                                                               \
}
/* clang-format on */
#define INSTR6F(f, name)  __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
#define INSTR6(name)      __INSTR6((ir3_instruction_flags)0, 1, name, OPC_##name)
#define INSTR6NODST(name) __INSTR6((ir3_instruction_flags)0, 0, name, OPC_##name)
2730 
2731 /* cat0 instructions: */
2732 INSTR1NODST(BR)
INSTR1NODST(BALL)2733 INSTR1NODST(BALL)
2734 INSTR1NODST(BANY)
2735 INSTR2NODST(BRAA)
2736 INSTR2NODST(BRAO)
2737 INSTR0(JUMP)
2738 INSTR1NODST(KILL)
2739 INSTR1NODST(DEMOTE)
2740 INSTR0(END)
2741 INSTR0(CHSH)
2742 INSTR0(CHMASK)
2743 INSTR1NODST(PREDT)
2744 INSTR1NODST(PREDF)
2745 INSTR0(PREDE)
2746 INSTR0(GETONE)
2747 INSTR0(GETLAST)
2748 INSTR0(SHPS)
2749 INSTR0(SHPE)
2750 
2751 /* cat1 macros */
2752 INSTR1(ANY_MACRO)
2753 INSTR1(ALL_MACRO)
2754 INSTR1(READ_FIRST_MACRO)
2755 INSTR2(READ_COND_MACRO)
2756 
/* Build an elect pseudo-instruction: one dst, no srcs.  (Cannot use the
 * INSTR0 helpers since those emit zero-src builders with a fixed dst count
 * per macro family; this opc has no INSTR-generated builder.)
 */
static inline struct ir3_instruction *
ir3_ELECT_MACRO(struct ir3_block *block)
{
   struct ir3_instruction *instr =
      ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
   __ssa_dst(instr);
   return instr;
}
2765 
2766 static inline struct ir3_instruction *
ir3_SHPS_MACRO(struct ir3_block * block)2767 ir3_SHPS_MACRO(struct ir3_block *block)
2768 {
2769    struct ir3_instruction *instr =
2770       ir3_instr_create(block, OPC_SHPS_MACRO, 1, 0);
2771    __ssa_dst(instr);
2772    return instr;
2773 }
2774 
2775 /* cat2 instructions, most 2 src but some 1 src: */
2776 INSTR2S(ADD_F)
INSTR2S(MIN_F)2777 INSTR2S(MIN_F)
2778 INSTR2S(MAX_F)
2779 INSTR2S(MUL_F)
2780 INSTR1S(SIGN_F)
2781 INSTR2S(CMPS_F)
2782 INSTR1S(ABSNEG_F)
2783 INSTR2S(CMPV_F)
2784 INSTR1S(FLOOR_F)
2785 INSTR1S(CEIL_F)
2786 INSTR1S(RNDNE_F)
2787 INSTR1S(RNDAZ_F)
2788 INSTR1S(TRUNC_F)
2789 INSTR2S(ADD_U)
2790 INSTR2S(ADD_S)
2791 INSTR2S(SUB_U)
2792 INSTR2S(SUB_S)
2793 INSTR2S(CMPS_U)
2794 INSTR2S(CMPS_S)
2795 INSTR2S(MIN_U)
2796 INSTR2S(MIN_S)
2797 INSTR2S(MAX_U)
2798 INSTR2S(MAX_S)
2799 INSTR1S(ABSNEG_S)
2800 INSTR2S(AND_B)
2801 INSTR2S(OR_B)
2802 INSTR1S(NOT_B)
2803 INSTR2S(XOR_B)
2804 INSTR2S(CMPV_U)
2805 INSTR2S(CMPV_S)
2806 INSTR2S(MUL_U24)
2807 INSTR2S(MUL_S24)
2808 INSTR2S(MULL_U)
2809 INSTR1S(BFREV_B)
2810 INSTR1S(CLZ_S)
2811 INSTR1S(CLZ_B)
2812 INSTR2S(SHL_B)
2813 INSTR2S(SHR_B)
2814 INSTR2S(ASHR_B)
2815 INSTR2(BARY_F)
2816 INSTR2(FLAT_B)
2817 INSTR2S(MGEN_B)
2818 INSTR2S(GETBIT_B)
2819 INSTR1(SETRM)
2820 INSTR1S(CBITS_B)
2821 INSTR2S(SHB)
2822 INSTR2S(MSAD)
2823 
/* cat3 instructions: */
/* (the S variants propagate IR3_REG_SHARED from all srcs to the dst, see
 * __INSTR3 scalar_alu handling)
 */
INSTR3(MAD_U16)
INSTR3(MADSH_U16)
INSTR3(MAD_S16)
INSTR3(MADSH_M16)
INSTR3(MAD_U24)
INSTR3(MAD_S24)
INSTR3(MAD_F16)
INSTR3(MAD_F32)
INSTR3(DP2ACC)
INSTR3(DP4ACC)
/* NOTE: SEL_B32 checks for zero vs nonzero */
INSTR3S(SEL_B16)
INSTR3S(SEL_B32)
INSTR3S(SEL_S16)
INSTR3S(SEL_S32)
INSTR3S(SEL_F16)
INSTR3S(SEL_F32)
INSTR3(SAD_S16)
INSTR3(SAD_S32)

/* cat4 instructions: */
INSTR1S(RCP)
INSTR1S(RSQ)
INSTR1S(HRSQ)
INSTR1S(LOG2)
INSTR1S(HLOG2)
INSTR1S(EXP2)
INSTR1S(HEXP2)
INSTR1S(SIN)
INSTR1S(COS)
INSTR1S(SQRT)

/* cat5 instructions: */
INSTR1(DSX)
INSTR1(DSXPP_MACRO)
INSTR1(DSY)
INSTR1(DSYPP_MACRO)
INSTR1F(3D, DSX)
INSTR1F(3D, DSY)
INSTR1(RGETPOS)
2865 
/* Build a cat5 sample instruction (sam->cat5.type is set from 'type', dst
 * write-mask from 'wrmask').  Srcs must be created in a fixed order that
 * matches the nreg accounting below: optional samp_tex (when S2EN), then the
 * coordinate (src0, or a dummy for the OPC_SAM prefetch case), then optional
 * src1.  The samp_tex src is half unless IR3_INSTR_B is set (presumably the
 * bindless case needs a full reg — confirm against the ISA docs).
 */
static inline struct ir3_instruction *
ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
        ir3_instruction_flags flags, struct ir3_instruction *samp_tex,
        struct ir3_instruction *src0, struct ir3_instruction *src1)
{
   struct ir3_instruction *sam;
   unsigned nreg = 0;

   /* Count the src slots so ir3_instr_create() allocates exactly enough: */
   if (flags & IR3_INSTR_S2EN) {
      nreg++;
   }
   if (src0 || opc == OPC_SAM) {
      nreg++;
   }
   if (src1) {
      nreg++;
   }

   sam = ir3_instr_create(block, opc, 1, nreg);
   sam->flags |= flags;
   __ssa_dst(sam)->wrmask = wrmask;
   if (flags & IR3_INSTR_S2EN) {
      __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
   }
   if (src0) {
      __ssa_src(sam, src0, 0);
   } else if (opc == OPC_SAM) {
      /* Create a dummy shared source for the coordinate, for the prefetch
       * case. It needs to be shared so that we don't accidentally disable early
       * preamble, and this is what the blob does.
       */
      ir3_src_create(sam, regid(48, 0), IR3_REG_SHARED);
   }
   if (src1) {
      __ssa_src(sam, src1, 0);
   }
   sam->cat5.type = type;

   return sam;
}
2906 
2907 /* brcst.active rx, ry behaves like a conditional move: rx either keeps its
2908  * value or is set to ry. In order to model this in SSA form, we add an extra
2909  * argument (the initial value of rx) and tie it to the destination.
2910  */
2911 static inline struct ir3_instruction *
ir3_BRCST_ACTIVE(struct ir3_block * block,unsigned cluster_size,struct ir3_instruction * src,struct ir3_instruction * dst_default)2912 ir3_BRCST_ACTIVE(struct ir3_block *block, unsigned cluster_size,
2913                  struct ir3_instruction *src,
2914                  struct ir3_instruction *dst_default)
2915 {
2916    struct ir3_instruction *brcst =
2917       ir3_instr_create(block, OPC_BRCST_ACTIVE, 1, 2);
2918    brcst->cat5.cluster_size = cluster_size;
2919    brcst->cat5.type = TYPE_U32;
2920    struct ir3_register *brcst_dst = __ssa_dst(brcst);
2921    __ssa_src(brcst, src, 0);
2922    struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
2923    ir3_reg_tie(brcst_dst, default_src);
2924    return brcst;
2925 }
2926 
/* cat6 instructions: */
INSTR0(GETFIBERID)
INSTR2(LDLV)
INSTR3(LDG)
INSTR3(LDL)
INSTR3(LDLW)
INSTR3(LDP)
INSTR4NODST(STG)
INSTR3NODST(STL)
INSTR3NODST(STLW)
INSTR3NODST(STP)
INSTR1(RESINFO)
INSTR1(RESFMT)
INSTR2(ATOMIC_ADD)
INSTR2(ATOMIC_SUB)
INSTR2(ATOMIC_XCHG)
INSTR2(ATOMIC_INC)
INSTR2(ATOMIC_DEC)
INSTR2(ATOMIC_CMPXCHG)
INSTR2(ATOMIC_MIN)
INSTR2(ATOMIC_MAX)
INSTR2(ATOMIC_AND)
INSTR2(ATOMIC_OR)
INSTR2(ATOMIC_XOR)
INSTR2(LDC)
INSTR2(QUAD_SHUFFLE_BRCST)
INSTR1(QUAD_SHUFFLE_HORIZ)
INSTR1(QUAD_SHUFFLE_VERT)
INSTR1(QUAD_SHUFFLE_DIAG)
INSTR2NODST(LDC_K)
INSTR2NODST(STC)
INSTR2NODST(STSC)
/* Generation-specific cat6 builders, keyed off the GPU define.  When GPU is
 * not defined, none of them are emitted (the empty #ifndef branch).
 */
#ifndef GPU
#elif GPU >= 600
INSTR4NODST(STIB);
INSTR3(LDIB);
INSTR5(LDG_A);
INSTR6NODST(STG_A);
INSTR2(ATOMIC_G_ADD)
INSTR2(ATOMIC_G_SUB)
INSTR2(ATOMIC_G_XCHG)
INSTR2(ATOMIC_G_INC)
INSTR2(ATOMIC_G_DEC)
INSTR2(ATOMIC_G_CMPXCHG)
INSTR2(ATOMIC_G_MIN)
INSTR2(ATOMIC_G_MAX)
INSTR2(ATOMIC_G_AND)
INSTR2(ATOMIC_G_OR)
INSTR2(ATOMIC_G_XOR)
INSTR3(ATOMIC_B_ADD)
INSTR3(ATOMIC_B_SUB)
INSTR3(ATOMIC_B_XCHG)
INSTR3(ATOMIC_B_INC)
INSTR3(ATOMIC_B_DEC)
INSTR3(ATOMIC_B_CMPXCHG)
INSTR3(ATOMIC_B_MIN)
INSTR3(ATOMIC_B_MAX)
INSTR3(ATOMIC_B_AND)
INSTR3(ATOMIC_B_OR)
INSTR3(ATOMIC_B_XOR)
#elif GPU >= 400
INSTR3(LDGB)
#if GPU >= 500
INSTR3(LDIB)
#endif
INSTR4NODST(STGB)
INSTR4NODST(STIB)
INSTR4(ATOMIC_S_ADD)
INSTR4(ATOMIC_S_SUB)
INSTR4(ATOMIC_S_XCHG)
INSTR4(ATOMIC_S_INC)
INSTR4(ATOMIC_S_DEC)
INSTR4(ATOMIC_S_CMPXCHG)
INSTR4(ATOMIC_S_MIN)
INSTR4(ATOMIC_S_MAX)
INSTR4(ATOMIC_S_AND)
INSTR4(ATOMIC_S_OR)
INSTR4(ATOMIC_S_XOR)
#endif
INSTR4NODST(LDG_K)

/* cat7 instructions: */
INSTR0(BAR)
INSTR0(FENCE)
INSTR0(CCINV)
3012 
3013 /* ************************************************************************* */
3014 #include "util/bitset.h"
3015 
#define MAX_REG 256

/* Per-register-file liveness/usage bitsets.  The full/shared/nongpr sets are
 * declared with 2x their file size — presumably so full regs can be tracked
 * at half-reg granularity in merged-register mode (see ir3_reg_file_offset
 * usage below) — TODO confirm.
 */
typedef BITSET_DECLARE(fullstate_t, 2 * GPR_REG_SIZE);
typedef BITSET_DECLARE(halfstate_t, GPR_REG_SIZE);
typedef BITSET_DECLARE(sharedstate_t, 2 * SHARED_REG_SIZE);
typedef BITSET_DECLARE(nongprstate_t, 2 * NONGPR_REG_SIZE);

typedef struct {
   /* whether half-regs alias the bottom of the full-reg file (a6xx+ merged
    * register file) — affects how offsets are computed, not the storage here
    */
   bool mergedregs;
   fullstate_t full;
   halfstate_t half;
   sharedstate_t shared;
   nongprstate_t nongpr;
} regmask_t;
3030 
3031 static inline BITSET_WORD *
__regmask_file(regmask_t * regmask,enum ir3_reg_file file)3032 __regmask_file(regmask_t *regmask, enum ir3_reg_file file)
3033 {
3034    switch (file) {
3035    case IR3_FILE_FULL:
3036       return regmask->full;
3037    case IR3_FILE_HALF:
3038       return regmask->half;
3039    case IR3_FILE_SHARED:
3040       return regmask->shared;
3041    case IR3_FILE_NONGPR:
3042       return regmask->nongpr;
3043    }
3044    unreachable("bad file");
3045 }
3046 
3047 static inline bool
__regmask_get(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3048 __regmask_get(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3049 {
3050    BITSET_WORD *regs = __regmask_file(regmask, file);
3051    for (unsigned i = 0; i < size; i++) {
3052       if (BITSET_TEST(regs, n + i))
3053          return true;
3054    }
3055    return false;
3056 }
3057 
3058 static inline void
__regmask_set(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3059 __regmask_set(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3060 {
3061    BITSET_WORD *regs = __regmask_file(regmask, file);
3062    for (unsigned i = 0; i < size; i++)
3063       BITSET_SET(regs, n + i);
3064 }
3065 
3066 static inline void
__regmask_clear(regmask_t * regmask,enum ir3_reg_file file,unsigned n,unsigned size)3067 __regmask_clear(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3068 {
3069    BITSET_WORD *regs = __regmask_file(regmask, file);
3070    for (unsigned i = 0; i < size; i++)
3071       BITSET_CLEAR(regs, n + i);
3072 }
3073 
3074 static inline void
regmask_init(regmask_t * regmask,bool mergedregs)3075 regmask_init(regmask_t *regmask, bool mergedregs)
3076 {
3077    memset(regmask, 0, sizeof(*regmask));
3078    regmask->mergedregs = mergedregs;
3079 }
3080 
3081 static inline void
regmask_or(regmask_t * dst,regmask_t * a,regmask_t * b)3082 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
3083 {
3084    assert(dst->mergedregs == a->mergedregs);
3085    assert(dst->mergedregs == b->mergedregs);
3086 
3087    for (unsigned i = 0; i < ARRAY_SIZE(dst->full); i++)
3088       dst->full[i] = a->full[i] | b->full[i];
3089    for (unsigned i = 0; i < ARRAY_SIZE(dst->half); i++)
3090       dst->half[i] = a->half[i] | b->half[i];
3091    for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
3092       dst->shared[i] = a->shared[i] | b->shared[i];
3093    for (unsigned i = 0; i < ARRAY_SIZE(dst->nongpr); i++)
3094       dst->nongpr[i] = a->nongpr[i] | b->nongpr[i];
3095 }
3096 
3097 static inline void
regmask_or_shared(regmask_t * dst,regmask_t * a,regmask_t * b)3098 regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
3099 {
3100    for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
3101       dst->shared[i] = a->shared[i] | b->shared[i];
3102 }
3103 
3104 static inline void
regmask_set(regmask_t * regmask,struct ir3_register * reg)3105 regmask_set(regmask_t *regmask, struct ir3_register *reg)
3106 {
3107    unsigned size = reg_elem_size(reg);
3108    enum ir3_reg_file file;
3109    unsigned num = post_ra_reg_num(reg);
3110    unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3111    if (reg->flags & IR3_REG_RELATIV) {
3112       __regmask_set(regmask, file, n, size * reg->size);
3113    } else {
3114       for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3115          if (mask & 1)
3116             __regmask_set(regmask, file, n, size);
3117    }
3118 }
3119 
3120 static inline void
regmask_clear(regmask_t * regmask,struct ir3_register * reg)3121 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
3122 {
3123    unsigned size = reg_elem_size(reg);
3124    enum ir3_reg_file file;
3125    unsigned num = post_ra_reg_num(reg);
3126    unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3127    if (reg->flags & IR3_REG_RELATIV) {
3128       __regmask_clear(regmask, file, n, size * reg->size);
3129    } else {
3130       for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3131          if (mask & 1)
3132             __regmask_clear(regmask, file, n, size);
3133    }
3134 }
3135 
3136 static inline bool
regmask_get(regmask_t * regmask,struct ir3_register * reg)3137 regmask_get(regmask_t *regmask, struct ir3_register *reg)
3138 {
3139    unsigned size = reg_elem_size(reg);
3140    enum ir3_reg_file file;
3141    unsigned num = post_ra_reg_num(reg);
3142    unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3143    if (reg->flags & IR3_REG_RELATIV) {
3144       return __regmask_get(regmask, file, n, size * reg->size);
3145    } else {
3146       for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3147          if (mask & 1)
3148             if (__regmask_get(regmask, file, n, size))
3149                return true;
3150    }
3151    return false;
3152 }
3153 /* ************************************************************************* */
3154 
3155 #endif /* IR3_H_ */
3156