/*
 * Copyright © 2013 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#ifndef IR3_H_
#define IR3_H_

#include <stdbool.h>
#include <stdint.h>

#include "compiler/shader_enums.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/set.h"
#include "util/u_debug.h"

#include "freedreno_common.h"

#include "instr-a3xx.h"

/* low level intermediate representation of an adreno shader program */

struct ir3_compiler;
struct ir3;
struct ir3_instruction;
struct ir3_block;

struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can be executed at once in one
    * core, assuming that they are all executing this shader.
    */
   int8_t max_waves;
   uint8_t subgroup_size;
   bool double_threadsize;
   bool multi_dword_ldp_stp;
   bool early_preamble;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;
   /* estimate of number of cycles stalled on (sy) */
   uint16_t systall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   uint16_t last_helper; /* last instruction to use helper invocations */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};

struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   unsigned regs_count;
   struct ir3_register **regs;
};

typedef enum ir3_register_flags {
   IR3_REG_CONST = BIT(0),
   IR3_REG_IMMED = BIT(1),
   IR3_REG_HALF = BIT(2),
   /* Shared registers have the same value for all threads when read.
    * They can only be written when one thread is active (that is, inside
    * a "getone" block).
    */
   IR3_REG_SHARED = BIT(3),
   IR3_REG_RELATIV = BIT(4),
   IR3_REG_R = BIT(5),
   /* Most instructions, it seems, can do float abs/neg but not
    * integer. The CP pass needs to know what is intended (int or
    * float) in order to do the right thing. For this reason the
    * abs/neg flags are split out into float and int variants. In
    * addition, for .b (bitwise) operations the negate is actually a
    * bitwise not, so that is split out into a separate flag to make
    * it clearer.
    */
   IR3_REG_FNEG = BIT(6),
   IR3_REG_FABS = BIT(7),
   IR3_REG_SNEG = BIT(8),
   IR3_REG_SABS = BIT(9),
   IR3_REG_BNOT = BIT(10),
   /* (ei) flag, end-input? Set on last bary, presumably to signal
    * that the shader needs no more input:
    *
    * Note: Has different meaning on other instructions like add.s/u
    */
   IR3_REG_EI = BIT(11),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_REG_SSA = BIT(12), /* 'def' is ptr to assigning destination */
   IR3_REG_ARRAY = BIT(13),

   /* Set on a use whenever the SSA value becomes dead after the current
    * instruction.
    */
   IR3_REG_KILL = BIT(14),

   /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
    * same SSA value in a single instruction, this is only set on the first
    * use.
    */
   IR3_REG_FIRST_KILL = BIT(15),

   /* Set when a destination doesn't have any uses and is dead immediately
    * after the instruction. This can happen even after optimizations for
    * corner cases such as destinations of atomic instructions.
    */
   IR3_REG_UNUSED = BIT(16),

   /* "Early-clobber" on a destination means that the destination is
    * (potentially) written before any sources are read and therefore
    * interferes with the sources of the instruction.
    */
   IR3_REG_EARLY_CLOBBER = BIT(17),

   /* If this is the last usage of a specific value in the register, the
    * register cannot be read without being written to first after this.
    * Note: This effectively has the same semantics as IR3_REG_KILL.
    */
   IR3_REG_LAST_USE = BIT(18),

   /* Predicate register (p0.c). Cannot be combined with half or shared. */
   IR3_REG_PREDICATE = BIT(19),
} ir3_register_flags;

struct ir3_register {
   BITMASK_ENUM(ir3_register_flags) flags;

   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_SSA, dst registers contain pointer back to the instruction
    * containing this register.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};

/*
 * Stupid/simple growable array implementation:
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)
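
/* Example usage (illustrative sketch): ir3_block below declares its "keeps"
 * array via DECLARE_ARRAY(struct ir3_instruction *, keeps), so appending is:
 *
 *    array_insert(block, block->keeps, instr);
 *
 * Note that the macro token-pastes "_count"/"_sz" onto the array argument, so
 * it must be passed the field exactly as declared (the "block->" prefix then
 * applies to keeps_count/keeps_sz as well).
 */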

typedef enum {
   REDUCE_OP_ADD_U,
   REDUCE_OP_ADD_F,
   REDUCE_OP_MUL_U,
   REDUCE_OP_MUL_F,
   REDUCE_OP_MIN_U,
   REDUCE_OP_MIN_S,
   REDUCE_OP_MIN_F,
   REDUCE_OP_MAX_U,
   REDUCE_OP_MAX_S,
   REDUCE_OP_MAX_F,
   REDUCE_OP_AND_B,
   REDUCE_OP_OR_B,
   REDUCE_OP_XOR_B,
} reduce_op_t;

typedef enum {
   ALIAS_TEX = 0,
   ALIAS_RT = 3,
   ALIAS_MEM = 4,
} ir3_alias_scope;

typedef enum ir3_instruction_flags {
   /* (sy) flag is set on first instruction, and after sample
    * instructions (probably just on RAW hazard).
    */
   IR3_INSTR_SY = BIT(0),
   /* (ss) flag is set on first instruction, and first instruction
    * to depend on the result of "long" instructions (RAW hazard):
    *
    *   rcp, rsq, log2, exp2, sin, cos, sqrt
    *
    * It seems to synchronize until all in-flight instructions are
    * completed, for example:
    *
    *   rsq hr1.w, hr1.w
    *   add.f hr2.z, (neg)hr2.z, hc0.y
    *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    *   rsq hr2.x, hr2.x
    *   (rpt1)nop
    *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    *   nop
    *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    *
    * The last mul.f does not have (ss) set, presumably because the
    * (ss) on the previous instruction does the job.
    *
    * The blob driver also seems to set it on WAR hazards, although it is
    * not really clear if this is needed or just the blob compiler being
    * sloppy. So far I haven't found a case where removing the (ss)
    * causes problems for WAR hazard, but I could just be getting
    * lucky:
    *
    *   rcp r1.y, r3.y
    *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    *
    */
   IR3_INSTR_SS = BIT(1),
   /* (jp) flag is set on jump targets:
    */
   IR3_INSTR_JP = BIT(2),
   /* (eq) flag kills helper invocations when they are no longer needed */
   IR3_INSTR_EQ = BIT(3),
   IR3_INSTR_UL = BIT(4),
   IR3_INSTR_3D = BIT(5),
   IR3_INSTR_A = BIT(6),
   IR3_INSTR_O = BIT(7),
   IR3_INSTR_P = BIT(8),
   IR3_INSTR_S = BIT(9),
   IR3_INSTR_S2EN = BIT(10),
   IR3_INSTR_SAT = BIT(11),
   /* (cat5/cat6) Bindless */
   IR3_INSTR_B = BIT(12),
   /* (cat5/cat6) nonuniform */
   IR3_INSTR_NONUNIF = BIT(13),
   /* (cat5-only) Get some parts of the encoding from a1.x */
   IR3_INSTR_A1EN = BIT(14),
   /* uniform destination for ldc, which must be set if and only if it has a
    * shared reg destination
    */
   IR3_INSTR_U = BIT(15),
   /* meta-flags, for intermediate stages of IR, ie.
    * before register assignment is done:
    */
   IR3_INSTR_MARK = BIT(16),

   /* Used by shared register allocation when creating spill/reload
    * instructions to inform validation that this is created by RA. This may
    * also be set on an instruction where a spill has been folded into it.
    */
   IR3_INSTR_SHARED_SPILL = IR3_INSTR_MARK,

   IR3_INSTR_UNUSED = BIT(17),

   /* Used to indicate that a mov comes from a lowered READ_FIRST/READ_COND
    * and may broadcast a helper invocation's value from a vector register to
    * a shared register that may be read by other invocations. This factors
    * into (eq) calculations.
    */
   IR3_INSTR_NEEDS_HELPERS = BIT(18),

   /* isam.v */
   IR3_INSTR_V = BIT(19),

   /* isam.1d. Note that .1d is an active-low bit. */
   IR3_INSTR_INV_1D = BIT(20),

   /* isam.v/ldib.b/stib.b can optionally use an immediate offset with one of
    * their sources.
    */
   IR3_INSTR_IMM_OFFSET = BIT(21),
} ir3_instruction_flags;

struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   BITMASK_ENUM(ir3_instruction_flags) flags;
   uint8_t repeat;
   uint8_t nop;
#if MESA_DEBUG
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   union {
      struct {
         char inv1, inv2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
         reduce_op_t reduce_op;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         enum {
            IR3_SRC_UNSIGNED = 0,
            IR3_SRC_MIXED = 1,
         } signedness;
         enum {
            IR3_SRC_PACKED_LOW = 0,
            IR3_SRC_PACKED_HIGH = 1,
         } packed;
         bool swapped;
      } cat3;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         unsigned cluster_size : 4;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         int iim_val;    /* for ldgb/stgb, # of components */
         unsigned d : 3; /* for ldc, component offset */
         bool typed : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */

         ir3_alias_scope alias_scope;
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
         unsigned comp;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type. Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
      struct {
         unsigned src_base, src_size;
         unsigned dst_base;
      } push_consts;
      struct {
         uint64_t value;
      } raw;
   };

   /* For assigning jump offsets, we need the instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register among
    * its src/dst registers. Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions. Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/               -       R/W    R/W     R/W   X
    *     groupMemoryBarrier()
    *   memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic()    -              R/W
    *   memoryBarrierBuffer()    -                      R/W
    *   memoryBarrierImage()     -       R/W
    *   memoryBarrierShared()    R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
      IR3_BARRIER_CONST_W = 1 << 11,
      IR3_BARRIER_ACTIVE_FIBERS_R = 1 << 12,
      IR3_BARRIER_ACTIVE_FIBERS_W = 1 << 13,
   } barrier_class, barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   /* List of this instruction's repeat group. Vectorized NIR instructions
    * are emitted as multiple scalar instructions that are linked together
    * using this field. After RA, the ir3_combine_rpt pass iterates these
    * groups and, if the register assignment allows it, merges them into a
    * (rptN) instruction.
    *
    * NOTE: this is not a typical list as there is no empty list head. The
    * list head is stored in the first instruction of the repeat group, so it
    * also refers to a list entry. In order to distinguish the list's first
    * entry, we use serialno: instructions in a repeat group are always
    * emitted consecutively so the first will have the lowest serialno.
    *
    * As this is not a typical list, we have to be careful with using the
    * existing list helpers. For example, using list_length on the first
    * instruction will yield one less than the number of instructions in its
    * group.
    */
   struct list_head rpt_node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};

/* Represents repeat groups in return values and arguments of the rpt builder
 * API functions.
 */
struct ir3_instruction_rpt {
   struct ir3_instruction *rpts[4];
};

struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions. The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write). To avoid the
    * deadlock scenario where an address register gets scheduled, but other
    * dependent src instructions cannot be scheduled due to dependency on a
    * *different* address register value, the scheduler needs to ensure that
    * all other dependencies of such instructions are scheduled before the
    * one that writes the address register. Having a convenient list of
    * instructions that reference some address register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* Track tg4 instructions which need texture state patched in (for tg4
    * swizzling workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, tg4);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#if MESA_DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};

struct ir3_array {
   struct list_head node;
   unsigned length;
   unsigned id;

   struct nir_def *r;

   /* To avoid array writes from getting DCE'd, keep track of the
    * most recent write. Any array access depends on the most
    * recent write. This way, nothing depends on writes after the
    * last read. But all the writes that happen before that have
    * something depending on them.
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};

struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);

struct ir3_block {
   struct list_head node;
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow. A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
    */
   struct ir3_block *successors[2];

   bool divergent_condition;

   DECLARE_ARRAY(struct ir3_block *, predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_predecessors);
   DECLARE_ARRAY(struct ir3_block *, physical_successors);

   uint16_t start_ip, end_ip;

   bool reconvergence_point;

   bool in_early_preamble;

   /* Track instructions which do not write a register but otherwise
    * must not be discarded (such as kill, stg, etc)
    */
   DECLARE_ARRAY(struct ir3_instruction *, keeps);

   /* used for per-pass extra block data. Mainly used right
    * now in RA step to track livein/liveout.
    */
   void *data;

   uint32_t index;

   struct ir3_block *imm_dom;
   DECLARE_ARRAY(struct ir3_block *, dom_children);

   uint32_t dom_pre_index;
   uint32_t dom_post_index;

   uint32_t loop_depth;

#if MESA_DEBUG
   uint32_t serialno;
#endif
};

enum ir3_cursor_option {
   IR3_CURSOR_BEFORE_BLOCK,
   IR3_CURSOR_AFTER_BLOCK,
   IR3_CURSOR_BEFORE_INSTR,
   IR3_CURSOR_AFTER_INSTR,
};

struct ir3_cursor {
   enum ir3_cursor_option option;
   union {
      struct ir3_block *block;
      struct ir3_instruction *instr;
   };
};

struct ir3_builder {
   struct ir3_cursor cursor;
};

static inline uint32_t
block_id(struct ir3_block *block)
{
#if MESA_DEBUG
   return block->serialno;
#else
   return (uint32_t)(unsigned long)block;
#endif
}

static inline struct ir3_block *
ir3_start_block(struct ir3 *ir)
{
   return list_first_entry(&ir->block_list, struct ir3_block, node);
}

static inline struct ir3_block *
ir3_end_block(struct ir3 *ir)
{
   return list_last_entry(&ir->block_list, struct ir3_block, node);
}

struct ir3_instruction *ir3_block_get_terminator(struct ir3_block *block);

struct ir3_instruction *ir3_block_take_terminator(struct ir3_block *block);

struct ir3_instruction *
ir3_block_get_last_non_terminator(struct ir3_block *block);

struct ir3_instruction *ir3_block_get_last_phi(struct ir3_block *block);

static inline struct ir3_block *
ir3_after_preamble(struct ir3 *ir)
{
   struct ir3_block *block = ir3_start_block(ir);
   /* The preamble will have a usually-empty else branch, and we want to skip
    * that to get to the block after the preamble.
    */
   struct ir3_instruction *terminator = ir3_block_get_terminator(block);
   if (terminator && (terminator->opc == OPC_SHPS))
      return block->successors[1]->successors[0];
   else
      return block;
}

void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred);
void ir3_block_link_physical(struct ir3_block *pred, struct ir3_block *succ);
void ir3_block_remove_predecessor(struct ir3_block *block,
                                  struct ir3_block *pred);
unsigned ir3_block_get_pred_index(struct ir3_block *block,
                                  struct ir3_block *pred);

void ir3_calc_dominance(struct ir3 *ir);
bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b);

struct ir3_shader_variant;

struct ir3 *ir3_create(struct ir3_compiler *compiler,
                       struct ir3_shader_variant *v);
void ir3_destroy(struct ir3 *shader);

void ir3_collect_info(struct ir3_shader_variant *v);
void *ir3_alloc(struct ir3 *shader, int sz);

unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                         unsigned reg_count,
                                         bool double_threadsize);

unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                           bool double_threadsize);

bool ir3_should_double_threadsize(struct ir3_shader_variant *v,
                                  unsigned regs_count);

struct ir3_block *ir3_block_create(struct ir3 *shader);

struct ir3_instruction *ir3_build_instr(struct ir3_builder *builder, opc_t opc,
                                        int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc,
                                            int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc,
                                         int ndst, int nsrc);
struct ir3_instruction *ir3_instr_create_at_end(struct ir3_block *block,
                                                opc_t opc, int ndst, int nsrc);
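
/* Example (illustrative sketch, not a fixed API pattern): a cursor can be
 * built directly from the struct above to splice a new instruction in at an
 * arbitrary point, e.g. a single-dest, single-src mov right before an
 * existing instruction:
 *
 *    struct ir3_cursor c = {
 *       .option = IR3_CURSOR_BEFORE_INSTR,
 *       .instr = some_instr,   // hypothetical anchor instruction
 *    };
 *    struct ir3_instruction *mov = ir3_instr_create_at(c, OPC_MOV, 1, 1);
 */
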
struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr);
void ir3_instr_add_dep(struct ir3_instruction *instr,
                       struct ir3_instruction *dep);
const char *ir3_instr_name(struct ir3_instruction *instr);
void ir3_instr_remove(struct ir3_instruction *instr);

void ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n);
bool ir3_instr_is_rpt(const struct ir3_instruction *instr);
bool ir3_instr_is_first_rpt(const struct ir3_instruction *instr);
struct ir3_instruction *ir3_instr_prev_rpt(const struct ir3_instruction *instr);
struct ir3_instruction *ir3_instr_first_rpt(struct ir3_instruction *instr);
unsigned ir3_instr_rpt_length(const struct ir3_instruction *instr);
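
/* Example (sketch): when walking a repeat group, prefer these helpers over
 * the generic list helpers; as noted on rpt_node above, list_length() on the
 * first instruction undercounts by one:
 *
 *    if (ir3_instr_is_first_rpt(instr)) {
 *       unsigned n = ir3_instr_rpt_length(instr); // whole group, incl. instr
 *       ...
 *    }
 */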

struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num,
                                    int flags);
struct ir3_register *ir3_reg_clone(struct ir3 *shader,
                                   struct ir3_register *reg);

static inline void
ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src)
{
   assert(!dst->tied && !src->tied);
   dst->tied = src;
   src->tied = dst;
}

void ir3_reg_set_last_array(struct ir3_instruction *instr,
                            struct ir3_register *reg,
                            struct ir3_register *last_write);

void ir3_instr_set_address(struct ir3_instruction *instr,
                           struct ir3_instruction *addr);

static inline bool
ir3_instr_check_mark(struct ir3_instruction *instr)
{
   if (instr->flags & IR3_INSTR_MARK)
      return true; /* already visited */
   instr->flags |= IR3_INSTR_MARK;
   return false;
}

void ir3_block_clear_mark(struct ir3_block *block);
void ir3_clear_mark(struct ir3 *shader);

unsigned ir3_count_instructions(struct ir3 *ir);
unsigned ir3_count_instructions_sched(struct ir3 *ir);
unsigned ir3_count_instructions_ra(struct ir3 *ir);

/**
 * Move 'instr' to just before 'after'
 */
static inline void
ir3_instr_move_before(struct ir3_instruction *instr,
                      struct ir3_instruction *after)
{
   list_delinit(&instr->node);
   list_addtail(&instr->node, &after->node);
}

/**
 * Move 'instr' to just after 'before':
 */
static inline void
ir3_instr_move_after(struct ir3_instruction *instr,
                     struct ir3_instruction *before)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &before->node);
}

/**
 * Move 'instr' to the beginning of the block:
 */
static inline void
ir3_instr_move_before_block(struct ir3_instruction *instr,
                            struct ir3_block *block)
{
   list_delinit(&instr->node);
   list_add(&instr->node, &block->instr_list);
}

typedef bool (*use_filter_cb)(struct ir3_instruction *use, unsigned src_n);

void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps);
void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter);

void ir3_set_dst_type(struct ir3_instruction *instr, bool half);
void ir3_fixup_src_type(struct ir3_instruction *instr);

int ir3_flut(struct ir3_register *src_reg);

bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags);

bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed);

/**
 * Given an instruction whose result we want to test for nonzero, return a
 * potentially different instruction for which the result would be the same.
 * This might be one of its sources if instr doesn't change the nonzero-ness.
 */
struct ir3_instruction *
ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr);

bool ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc);

#include "util/set.h"
#define foreach_ssa_use(__use, __instr)                                        \
   for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses;  \
        __use = NULL)                                                          \
      set_foreach ((__instr)->uses, __entry)                                   \
         if ((__use = (void *)__entry->key))
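
/* Example (sketch): instr->uses is only populated once a pass has computed
 * it, so run ir3_find_ssa_uses() first:
 *
 *    ir3_find_ssa_uses(ir, mem_ctx, false);
 *    foreach_ssa_use (use, instr) {
 *       // 'use' is an instruction consuming one of instr's SSA dests
 *    }
 */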

static inline uint32_t
reg_num(const struct ir3_register *reg)
{
   return reg->num >> 2;
}

static inline uint32_t
reg_comp(const struct ir3_register *reg)
{
   return reg->num & 0x3;
}
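
/* For example, with the (N << 2) | comp encoding described on
 * ir3_register::num, r2.z is encoded as (2 << 2) | 2 = 10, giving
 * reg_num() == 2 and reg_comp() == 2.
 */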

static inline bool
is_flow(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 0);
}

static inline bool
is_terminator(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BR:
   case OPC_JUMP:
   case OPC_BANY:
   case OPC_BALL:
   case OPC_BRAA:
   case OPC_BRAO:
   case OPC_SHPS:
   case OPC_GETONE:
   case OPC_GETLAST:
   case OPC_PREDT:
   case OPC_PREDF:
      return true;
   default:
      return false;
   }
}

static inline bool
is_kill_or_demote(struct ir3_instruction *instr)
{
   return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE;
}

static inline bool
is_nop(struct ir3_instruction *instr)
{
   return instr->opc == OPC_NOP;
}

static inline bool
is_same_type_reg(struct ir3_register *dst, struct ir3_register *src)
{
   unsigned dst_type = (dst->flags & IR3_REG_HALF);
   unsigned src_type = (src->flags & IR3_REG_HALF);

   /* Treat shared->normal copies and normal->shared copies as same-type. */
   return dst_type == src_type;
}

/* Is it a non-transformative (ie. not type changing) mov? This can
 * also include absneg.s/absneg.f, which for the most part can be
 * treated as a mov (single src argument).
 */
static inline bool
is_same_type_mov(struct ir3_instruction *instr)
{
   struct ir3_register *dst;

   switch (instr->opc) {
   case OPC_MOV:
      if (instr->cat1.src_type != instr->cat1.dst_type)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   case OPC_ABSNEG_F:
   case OPC_ABSNEG_S:
      if (instr->flags & IR3_INSTR_SAT)
         return false;
      /* If the type of dest reg and src reg are different,
       * it shouldn't be considered as same type mov
       */
      if (!is_same_type_reg(instr->dsts[0], instr->srcs[0]))
         return false;
      break;
   default:
      return false;
   }

   dst = instr->dsts[0];

   /* mov's that write to a0 or p0.x are special: */
   if (dst->flags & IR3_REG_PREDICATE)
      return false;
   if (reg_num(dst) == REG_A0)
      return false;

   if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
      return false;

   return true;
}

/* A move from const, which changes size but not type, can also be
 * folded into dest instruction in some cases.
 */
static inline bool
is_const_mov(struct ir3_instruction *instr)
{
   if (instr->opc != OPC_MOV)
      return false;

   if (!(instr->srcs[0]->flags & IR3_REG_CONST))
      return false;

   type_t src_type = instr->cat1.src_type;
   type_t dst_type = instr->cat1.dst_type;

   return (type_float(src_type) && type_float(dst_type)) ||
          (type_uint(src_type) && type_uint(dst_type)) ||
          (type_sint(src_type) && type_sint(dst_type));
}

static inline bool
is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_ELECT_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_SCAN_MACRO:
   case OPC_SCAN_CLUSTERS_MACRO:
      return true;
   default:
      return false;
   }
}

static inline bool
is_alu(struct ir3_instruction *instr)
{
   return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
}

static inline bool
is_sfu(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 4) || instr->opc == OPC_GETFIBERID;
}

static inline bool
is_tex(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 5) && instr->opc != OPC_TCINV;
}

static inline bool
is_tex_shuffle(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_BRCST_ACTIVE:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
      return true;
   default:
      return false;
   }
}

static inline bool
is_tex_or_prefetch(struct ir3_instruction *instr)
{
   return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH);
}

static inline bool
is_mem(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 6) && instr->opc != OPC_GETFIBERID;
}

static inline bool
is_barrier(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == 7);
}

static inline bool
is_half(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_HALF);
}

static inline bool
is_shared(struct ir3_instruction *instr)
{
   return !!(instr->dsts[0]->flags & IR3_REG_SHARED);
}

static inline bool
is_store(struct ir3_instruction *instr)
{
   /* For these instructions the "destination" register is
    * actually a source: the address to store to.
    */
   switch (instr->opc) {
   case OPC_STG:
   case OPC_STG_A:
   case OPC_STGB:
   case OPC_STIB:
   case OPC_STP:
   case OPC_STL:
   case OPC_STLW:
   case OPC_L2G:
   case OPC_G2L:
      return true;
   default:
      return false;
   }
}
static inline bool
is_load(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_LDG:
   case OPC_LDG_A:
   case OPC_LDGB:
   case OPC_LDIB:
   case OPC_LDL:
   case OPC_LDP:
   case OPC_L2G:
   case OPC_LDLW:
   case OPC_LDLV:
      /* probably some others too.. */
      return true;
   case OPC_LDC:
      return instr->dsts_count > 0;
   default:
      return false;
   }
}

static inline bool
is_input(struct ir3_instruction *instr)
{
   /* in some cases, ldlv is used to fetch varying without
    * interpolation.. fortunately inloc is the first src
    * register in either case
    */
   switch (instr->opc) {
   case OPC_LDLV:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;
   default:
      return false;
   }
}

/* Whether non-helper invocations can read the value of helper invocations.
 * We cannot insert (eq) before these instructions.
 */
static inline bool
uses_helpers(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   /* These require helper invocations to be present */
   case OPC_SAMB:
   case OPC_GETLOD:
   case OPC_DSX:
   case OPC_DSY:
   case OPC_DSXPP_1:
   case OPC_DSYPP_1:
   case OPC_DSXPP_MACRO:
   case OPC_DSYPP_MACRO:
   case OPC_QUAD_SHUFFLE_BRCST:
   case OPC_QUAD_SHUFFLE_HORIZ:
   case OPC_QUAD_SHUFFLE_VERT:
   case OPC_QUAD_SHUFFLE_DIAG:
   case OPC_META_TEX_PREFETCH:
      return true;

   /* sam requires helper invocations except for dummy prefetch instructions */
   case OPC_SAM:
      return instr->dsts_count != 0;

   /* Subgroup operations don't require helper invocations to be present, but
    * will use helper invocations if they are present.
    */
   case OPC_BALLOT_MACRO:
   case OPC_ANY_MACRO:
   case OPC_ALL_MACRO:
   case OPC_READ_FIRST_MACRO:
   case OPC_READ_COND_MACRO:
   case OPC_MOVMSK:
   case OPC_BRCST_ACTIVE:
      return true;

   /* Catch lowered READ_FIRST/READ_COND. For elect, don't include the getone
    * in the preamble because it doesn't actually matter which fiber is
    * selected.
    */
   case OPC_MOV:
   case OPC_ELECT_MACRO:
      return instr->flags & IR3_INSTR_NEEDS_HELPERS;

   default:
      return false;
   }
}

static inline bool
is_bool(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPS_S:
   case OPC_CMPS_U:
      return true;
   default:
      return false;
   }
}

static inline opc_t
cat3_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F32:
      return OPC_MAD_F16;
   case OPC_SEL_B32:
      return OPC_SEL_B16;
   case OPC_SEL_S32:
      return OPC_SEL_S16;
   case OPC_SEL_F32:
      return OPC_SEL_F16;
   case OPC_SAD_S32:
      return OPC_SAD_S16;
   default:
      return opc;
   }
}

static inline opc_t
cat3_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
      return OPC_MAD_F32;
   case OPC_SEL_B16:
      return OPC_SEL_B32;
   case OPC_SEL_S16:
      return OPC_SEL_S32;
   case OPC_SEL_F16:
      return OPC_SEL_F32;
   case OPC_SAD_S16:
      return OPC_SAD_S32;
   default:
      return opc;
   }
}

static inline opc_t
cat4_half_opc(opc_t opc)
{
   switch (opc) {
   case OPC_RSQ:
      return OPC_HRSQ;
   case OPC_LOG2:
      return OPC_HLOG2;
   case OPC_EXP2:
      return OPC_HEXP2;
   default:
      return opc;
   }
}

static inline opc_t
cat4_full_opc(opc_t opc)
{
   switch (opc) {
   case OPC_HRSQ:
      return OPC_RSQ;
   case OPC_HLOG2:
      return OPC_LOG2;
   case OPC_HEXP2:
      return OPC_EXP2;
   default:
      return opc;
   }
}

static inline bool
is_meta(struct ir3_instruction *instr)
{
   return (opc_cat(instr->opc) == OPC_META);
}

static inline unsigned
reg_elems(const struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_ARRAY)
      return reg->size;
   else
      return util_last_bit(reg->wrmask);
}

static inline unsigned
reg_elem_size(const struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_HALF) ? 1 : 2;
}

static inline unsigned
reg_size(const struct ir3_register *reg)
{
   return reg_elems(reg) * reg_elem_size(reg);
}
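
/* reg_size() is thus in units of half-registers: e.g. a full-precision vec4
 * destination (wrmask = 0xf) is 4 elems * 2 = 8 units, while a half vec4 is
 * 4 units.
 */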

/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
 * and have to handle relative accesses specially.
 */

static inline unsigned
post_ra_reg_elems(struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_RELATIV)
      return reg->size;
   return reg_elems(reg);
}

static inline unsigned
post_ra_reg_num(struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_RELATIV)
      return reg->array.base;
   return reg->num;
}

static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
   if (instr->dsts_count == 0)
      return 0;

   assert(instr->dsts_count == 1);
   return util_last_bit(instr->dsts[0]->wrmask);
}

static inline bool
is_reg_gpr(const struct ir3_register *reg)
{
   if ((reg_num(reg) == REG_A0) || (reg->flags & IR3_REG_PREDICATE))
      return false;
   if (!(reg->flags & (IR3_REG_SSA | IR3_REG_RELATIV)) &&
       reg->num == INVALID_REG)
      return false;
   return true;
}

static inline bool
is_reg_a0(const struct ir3_register *reg)
{
   if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
      return false;
   return reg->num == regid(REG_A0, 0);
}

/* is dst a normal temp register: */
static inline bool
is_dest_gpr(const struct ir3_register *dst)
{
   if (dst->wrmask == 0)
      return false;
   return is_reg_gpr(dst);
}

static inline bool
writes_gpr(struct ir3_instruction *instr)
{
   if (dest_regs(instr) == 0)
      return false;
   return is_dest_gpr(instr->dsts[0]);
}

static inline bool
writes_addr0(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a0.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 0);
   }
   return false;
}

static inline bool
writes_addr1(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to a1.x */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return dst->num == regid(REG_A0, 1);
   }
   return false;
}

static inline bool
writes_pred(struct ir3_instruction *instr)
{
   /* Note: only the first dest can write to p0 */
   if (instr->dsts_count > 0) {
      struct ir3_register *dst = instr->dsts[0];
      return !!(dst->flags & IR3_REG_PREDICATE);
   }
   return false;
}

/* r0.x - r47.w are "normal" registers. r48.x - r55.w are shared registers.
 * Everything above those are non-GPR registers like a0.x and p0.x that aren't
 * assigned by RA.
 */
#define GPR_REG_SIZE     (4 * 48)
#define SHARED_REG_START GPR_REG_SIZE
#define SHARED_REG_SIZE  (4 * 8)
#define NONGPR_REG_START (SHARED_REG_START + SHARED_REG_SIZE)
#define NONGPR_REG_SIZE  (4 * 8)

enum ir3_reg_file {
   IR3_FILE_FULL,
   IR3_FILE_HALF,
   IR3_FILE_SHARED,
   IR3_FILE_NONGPR,
};

/* Return a file + offset that can be used for determining if two registers
 * alias. The register is only really used for its flags, the num is taken
 * from the parameter. Registers overlap if they are in the same file and
 * have an overlapping offset. The offset is multiplied by 2 for full
 * registers to handle aliasing half and full registers, that is it's in
 * units of half-regs.
 */
static inline unsigned
ir3_reg_file_offset(const struct ir3_register *reg, unsigned num,
                    bool mergedregs, enum ir3_reg_file *file)
{
   assert(!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   unsigned size = reg_elem_size(reg);
   if (!is_reg_gpr(reg)) {
      *file = IR3_FILE_NONGPR;
      return (num - NONGPR_REG_START) * size;
   } else if (reg->flags & IR3_REG_SHARED) {
      *file = IR3_FILE_SHARED;
      return (num - SHARED_REG_START) * size;
   } else if (mergedregs || !(reg->flags & IR3_REG_HALF)) {
      *file = IR3_FILE_FULL;
      return num * size;
   } else {
      *file = IR3_FILE_HALF;
      return num;
   }
}
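
/* Example (illustrative sketch, assuming both registers are GPRs and
 * 'mergedregs' matches the compilation mode): two registers alias iff they
 * land in the same file and their half-reg ranges overlap:
 *
 *    enum ir3_reg_file file_a, file_b;
 *    unsigned off_a = ir3_reg_file_offset(a, a->num, mergedregs, &file_a);
 *    unsigned off_b = ir3_reg_file_offset(b, b->num, mergedregs, &file_b);
 *    bool overlap = (file_a == file_b) &&
 *                   (off_a < off_b + reg_size(b)) &&
 *                   (off_b < off_a + reg_size(a));
 */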

/* returns defining instruction for reg */
/* TODO better name */
static inline struct ir3_instruction *
ssa(struct ir3_register *reg)
{
   if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def)
      return reg->def->instr;
   return NULL;
}

static inline bool
conflicts(struct ir3_register *a, struct ir3_register *b)
{
   return (a && b) && (a->def != b->def);
}

static inline bool
reg_gpr(struct ir3_register *r)
{
   if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_PREDICATE))
      return false;
   if (reg_num(r) == REG_A0)
      return false;
   return true;
}

static inline bool
reg_is_addr1(struct ir3_register *r)
{
   if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
      return false;
   return r->num == regid(REG_A0, 1);
}

static inline type_t
half_type(type_t type)
{
   switch (type) {
   case TYPE_F32:
      return TYPE_F16;
   case TYPE_U32:
   case TYPE_U8_32:
      return TYPE_U16;
   case TYPE_S32:
      return TYPE_S16;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return type;
   case TYPE_U8:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}

static inline type_t
full_type(type_t type)
{
   switch (type) {
   case TYPE_F16:
      return TYPE_F32;
   case TYPE_U8:
   case TYPE_U8_32:
   case TYPE_U16:
      return TYPE_U32;
   case TYPE_S16:
      return TYPE_S32;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return type;
   default:
      assert(0);
      return (type_t)~0;
   }
}

/* some cat2 instructions (ie. those which are not float) can embed an
 * immediate:
 */
static inline bool
ir3_cat2_int(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
   case OPC_ABSNEG_S:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
   case OPC_BARY_F:
   case OPC_FLAT_B:
      return true;

   default:
      return false;
   }
}

/* map cat2 instruction to valid abs/neg flags: */
static inline unsigned
ir3_cat2_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_ADD_F:
   case OPC_MIN_F:
   case OPC_MAX_F:
   case OPC_MUL_F:
   case OPC_SIGN_F:
   case OPC_CMPS_F:
   case OPC_ABSNEG_F:
   case OPC_CMPV_F:
   case OPC_FLOOR_F:
   case OPC_CEIL_F:
   case OPC_RNDNE_F:
   case OPC_RNDAZ_F:
   case OPC_TRUNC_F:
   case OPC_BARY_F:
      return IR3_REG_FABS | IR3_REG_FNEG;

   case OPC_ADD_U:
   case OPC_ADD_S:
   case OPC_SUB_U:
   case OPC_SUB_S:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
   case OPC_MIN_U:
   case OPC_MIN_S:
   case OPC_MAX_U:
   case OPC_MAX_S:
   case OPC_CMPV_U:
   case OPC_CMPV_S:
   case OPC_MUL_U24:
   case OPC_MUL_S24:
   case OPC_MULL_U:
   case OPC_CLZ_S:
      return 0;

   case OPC_ABSNEG_S:
      return IR3_REG_SABS | IR3_REG_SNEG;

   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_BFREV_B:
   case OPC_CLZ_B:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MGEN_B:
   case OPC_GETBIT_B:
   case OPC_CBITS_B:
      return IR3_REG_BNOT;

   default:
      return 0;
   }
}

/* map cat3 instructions to valid abs/neg flags: */
static inline unsigned
ir3_cat3_absneg(opc_t opc)
{
   switch (opc) {
   case OPC_MAD_F16:
   case OPC_MAD_F32:
   case OPC_SEL_F16:
   case OPC_SEL_F32:
      return IR3_REG_FNEG;

   case OPC_MAD_U16:
   case OPC_MADSH_U16:
   case OPC_MAD_S16:
   case OPC_MADSH_M16:
   case OPC_MAD_U24:
   case OPC_MAD_S24:
   case OPC_SEL_S16:
   case OPC_SEL_S32:
   case OPC_SAD_S16:
   case OPC_SAD_S32:
      /* neg *may* work on 3rd src.. */

   case OPC_SEL_B16:
   case OPC_SEL_B32:

   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   case OPC_WMM:
   case OPC_WMM_ACCU:

   default:
      return 0;
   }
}

/* Return the type (float, int, or uint) the op uses when converting from the
 * internal result of the op (which is assumed to be the same size as the
 * sources) to the destination when they are not the same size. If F32 it does
 * a floating-point conversion, if U32 it does a truncation/zero-extension, if
 * S32 it does a truncation/sign-extension. "can_fold" will be false if it
 * doesn't do anything sensible or is unknown.
 */
static inline type_t
ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold)
{
   *can_fold = true;
   switch (instr->opc) {
   case OPC_ADD_F:
   case OPC_MUL_F:
   case OPC_BARY_F:
   case OPC_MAD_F32:
   case OPC_MAD_F16:
   case OPC_WMM:
   case OPC_WMM_ACCU:
      return TYPE_F32;

   case OPC_ADD_U:
   case OPC_SUB_U:
   case OPC_MIN_U:
   case OPC_MAX_U:
   case OPC_AND_B:
   case OPC_OR_B:
   case OPC_NOT_B:
   case OPC_XOR_B:
   case OPC_MUL_U24:
   case OPC_MULL_U:
   case OPC_SHL_B:
   case OPC_SHR_B:
   case OPC_ASHR_B:
   case OPC_MAD_U24:
   case OPC_SHRM:
   case OPC_SHLM:
   case OPC_SHRG:
   case OPC_SHLG:
   case OPC_ANDG:
   /* Comparison ops zero-extend/truncate their results, so consider them as
    * unsigned here.
    */
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      return TYPE_U32;

   case OPC_ADD_S:
   case OPC_SUB_S:
   case OPC_MIN_S:
   case OPC_MAX_S:
   case OPC_ABSNEG_S:
   case OPC_MUL_S24:
   case OPC_MAD_S24:
      return TYPE_S32;

   /* We assume that any move->move folding that could be done was done by
    * NIR.
    */
   case OPC_MOV:
   default:
      *can_fold = false;
      return TYPE_U32;
   }
}
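
/* Worked example (sketch): for "add.u hr0.x, r1.x, r2.x" the internal 32-bit
 * result is truncated into the 16-bit destination, so ir3_output_conv_type()
 * returns TYPE_U32 with *can_fold = true, and the helpers below describe the
 * folded u32 -> u16 conversion.
 */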

/* Return the src and dst types for the conversion which is already folded
 * into the op. We can assume that instr has folded in a conversion from
 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense
 * to call if ir3_output_conv_type() returns can_fold = true.
 */
static inline type_t
ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type)
{
   switch (instr->opc) {
   case OPC_CMPS_F:
   case OPC_CMPV_F:
   case OPC_CMPS_U:
   case OPC_CMPS_S:
      /* Comparisons only return 0/1 and the size of the comparison sources
       * is irrelevant, never consider them as having an output conversion
       * by returning a type with the dest size here:
       */
      return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);

   case OPC_BARY_F:
      /* bary.f doesn't have an explicit source, but we can assume here that
       * the varying data it reads is in fp32.
       *
       * This may be fp16 on older gen's depending on some register
       * settings, but it's probably not worth plumbing that through for a
       * small improvement that NIR would hopefully handle for us anyway.
       */
      return TYPE_F32;

   case OPC_FLAT_B:
      /* Treat the input data as u32 if not interpolating. */
      return TYPE_U32;

   default:
      return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                    : full_type(base_type);
   }
}

static inline type_t
ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type)
{
   return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type)
                                                 : full_type(base_type);
}

/* Some instructions have signed/unsigned variants which are identical except
 * for whether the folded conversion sign-extends or zero-extends, and we can
 * fold in a mismatching move by rewriting the opcode. Return the opcode to
 * switch signedness, and whether one exists.
 */
static inline opc_t
ir3_try_swap_signedness(opc_t opc, bool *can_swap)
{
   switch (opc) {
#define PAIR(u, s)                                                             \
   case OPC_##u:                                                               \
      return OPC_##s;                                                          \
   case OPC_##s:                                                               \
      return OPC_##u;
      PAIR(ADD_U, ADD_S)
      PAIR(SUB_U, SUB_S)
      /* Note: these are only identical when the sources are half, but that's
       * the only case we call this function for anyway.
       */
      PAIR(MUL_U24, MUL_S24)

   default:
      *can_swap = false;
      return opc;
   }
}
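
/* Usage note (sketch): *can_swap is only written on failure, so callers are
 * expected to initialize it to true:
 *
 *    bool can_swap = true;
 *    opc_t swapped = ir3_try_swap_signedness(opc, &can_swap);
 *    if (can_swap)
 *       ... 'swapped' is the opposite-signedness variant ...
 */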

#define MASK(n) ((1 << (n)) - 1)

/* iterator for an instruction's sources (reg), also returns src #: */
#define foreach_src_n(__srcreg, __n, __instr)                                  \
   if ((__instr)->srcs_count)                                                  \
      for (struct ir3_register *__srcreg = (struct ir3_register *)~0;          \
           __srcreg; __srcreg = NULL)                                          \
         for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__srcreg = (__instr)->srcs[__n]))

/* iterator for an instruction's sources (reg): */
#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr)

#define foreach_src_if(__srcreg, __instr, __filter)                            \
   foreach_src (__srcreg, __instr)                                             \
      if (__filter(__srcreg))

/* iterator for an instruction's destinations (reg), also returns dst #: */
#define foreach_dst_n(__dstreg, __n, __instr)                                  \
   if ((__instr)->dsts_count)                                                  \
      for (struct ir3_register *__dstreg = (struct ir3_register *)~0;          \
           __dstreg; __dstreg = NULL)                                          \
         for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt;    \
              __n++)                                                           \
            if ((__dstreg = (__instr)->dsts[__n]))

/* iterator for an instruction's destinations (reg): */
#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr)

#define foreach_dst_if(__dstreg, __instr, __filter)                            \
   foreach_dst (__dstreg, __instr)                                             \
      if (__filter(__dstreg))
1902
1903 static inline unsigned
__ssa_src_cnt(struct ir3_instruction * instr)1904 __ssa_src_cnt(struct ir3_instruction *instr)
1905 {
1906 return instr->srcs_count + instr->deps_count;
1907 }
1908
1909 static inline bool
1910 __is_false_dep(struct ir3_instruction *instr, unsigned n)
1911 {
1912 if (n >= instr->srcs_count)
1913 return true;
1914 return false;
1915 }
1916
1917 static inline struct ir3_instruction **
1918 __ssa_srcp_n(struct ir3_instruction *instr, unsigned n)
1919 {
1920 if (__is_false_dep(instr, n))
1921 return &instr->deps[n - instr->srcs_count];
1922 if (ssa(instr->srcs[n]))
1923 return &instr->srcs[n]->def->instr;
1924 return NULL;
1925 }
1926
1927 #define foreach_ssa_srcp_n(__srcp, __n, __instr) \
1928 for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL) \
1929 for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; \
1930 __n++) \
1931 if ((__srcp = __ssa_srcp_n(__instr, __n)))
1932
1933 #define foreach_ssa_srcp(__srcp, __instr) \
1934 foreach_ssa_srcp_n (__srcp, __i, __instr)
1935
1936 /* iterator for an instruction's SSA sources (instr), also returns src #: */
1937 #define foreach_ssa_src_n(__srcinst, __n, __instr) \
1938 for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst; \
1939 __srcinst = NULL) \
1940 foreach_ssa_srcp_n (__srcp, __n, __instr) \
1941 if ((__srcinst = *__srcp))
1942
1943 /* iterator for an instruction's SSA sources (instr): */
1944 #define foreach_ssa_src(__srcinst, __instr) \
1945 foreach_ssa_src_n (__srcinst, __i, __instr)
1946
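/* Example (editorial sketch): walking one level up the use/def graph. Note
 * that this also visits false dependencies (instr->deps), not just real
 * register sources:
 *
 *    foreach_ssa_src (src_instr, instr) {
 *       if (src_instr->block != instr->block) {
 *          // source value is defined in a different block
 *       }
 *    }
 */
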
1947 /* iterators for shader inputs: */
1948 #define foreach_input_n(__ininstr, __cnt, __ir) \
1949 for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr; \
1950 __ininstr = NULL) \
1951 for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \
1952 if ((__ininstr = (__ir)->inputs[__cnt]))
1953 #define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir)
1954
1955 /* iterators for instructions: */
1956 #define foreach_instr(__instr, __list) \
1957 list_for_each_entry (struct ir3_instruction, __instr, __list, node)
1958 #define foreach_instr_from(__instr, __start, __list) \
1959 list_for_each_entry_from(struct ir3_instruction, __instr, &(__start)->node, \
1960 __list, node)
1961 #define foreach_instr_rev(__instr, __list) \
1962 list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node)
1963 #define foreach_instr_safe(__instr, __list) \
1964 list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node)
1965 #define foreach_instr_from_safe(__instr, __start, __list) \
1966 list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start, \
1967 __list, node)
1968
1969 /* Iterate over all instructions in a repeat group. */
1970 #define foreach_instr_rpt(__rpt, __instr) \
1971 if (assert(ir3_instr_is_first_rpt(__instr)), true) \
1972 for (struct ir3_instruction *__rpt = __instr, *__first = __instr; \
1973 __first || __rpt != __instr; \
1974 __first = NULL, __rpt = \
1975 list_entry(__rpt->rpt_node.next, \
1976 struct ir3_instruction, rpt_node))
1977
1978 /* Iterate over all instructions except the first one in a repeat group. */
1979 #define foreach_instr_rpt_excl(__rpt, __instr) \
1980 if (assert(ir3_instr_is_first_rpt(__instr)), true) \
1981 list_for_each_entry (struct ir3_instruction, __rpt, &__instr->rpt_node, \
1982 rpt_node)
1983
1984 #define foreach_instr_rpt_excl_safe(__rpt, __instr) \
1985 if (assert(ir3_instr_is_first_rpt(__instr)), true) \
1986 list_for_each_entry_safe (struct ir3_instruction, __rpt, \
1987 &__instr->rpt_node, rpt_node)
1988
1989 /* iterators for blocks: */
1990 #define foreach_block(__block, __list) \
1991 list_for_each_entry (struct ir3_block, __block, __list, node)
1992 #define foreach_block_safe(__block, __list) \
1993 list_for_each_entry_safe (struct ir3_block, __block, __list, node)
1994 #define foreach_block_rev(__block, __list) \
1995 list_for_each_entry_rev (struct ir3_block, __block, __list, node)
1996
1997 /* iterators for arrays: */
1998 #define foreach_array(__array, __list) \
1999 list_for_each_entry (struct ir3_array, __array, __list, node)
2000 #define foreach_array_safe(__array, __list) \
2001 list_for_each_entry_safe (struct ir3_array, __array, __list, node)
2002
2003 #define IR3_PASS(ir, pass, ...) \
2004 ({ \
2005 bool progress = pass(ir, ##__VA_ARGS__); \
2006 if (progress) { \
2007 ir3_debug_print(ir, "AFTER: " #pass); \
2008 ir3_validate(ir); \
2009 } \
2010 progress; \
2011 })
2012
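/* Example (editorial sketch): a typical way to drive the optimization passes
 * declared below through IR3_PASS, so the IR is printed and validated only
 * when a pass reports progress. The pass order here is illustrative, not the
 * driver's actual pipeline:
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       progress |= IR3_PASS(ir, ir3_cf);
 *       progress |= IR3_PASS(ir, ir3_cp, so);
 *       progress |= IR3_PASS(ir, ir3_dce, so);
 *    } while (progress);
 */
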
2013 /* validate: */
2014 void ir3_validate(struct ir3 *ir);
2015
2016 /* dump: */
2017 void ir3_print(struct ir3 *ir);
2018 void ir3_print_instr(struct ir3_instruction *instr);
2019
2020 struct log_stream;
2021 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
2022
2023 /* delay calculation: */
2024 int ir3_delayslots(struct ir3_compiler *compiler,
2025 struct ir3_instruction *assigner,
2026 struct ir3_instruction *consumer, unsigned n, bool soft);
2027 unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
2028 struct ir3_instruction *assigner,
2029 struct ir3_instruction *consumer,
2030 unsigned assigner_n, unsigned consumer_n);
2031
2032 /* estimated (ss)/(sy) delay calculation */
2033
2034 static inline bool
2035 is_local_mem_load(struct ir3_instruction *instr)
2036 {
2037 return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
2038 instr->opc == OPC_LDLW;
2039 }
2040
2041 bool is_scalar_alu(struct ir3_instruction *instr,
2042 const struct ir3_compiler *compiler);
2043
2044 /* Does this instruction sometimes need (ss) to wait for its result? */
2045 static inline bool
2046 is_ss_producer(struct ir3_instruction *instr)
2047 {
2048 foreach_dst (dst, instr) {
2049 if (dst->flags & IR3_REG_SHARED)
2050 return true;
2051 }
2052
2053 if (instr->block->in_early_preamble && writes_addr1(instr))
2054 return true;
2055
2056 return is_sfu(instr) || is_local_mem_load(instr);
2057 }
2058
2059 static inline bool
2060 needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer,
2061 struct ir3_instruction *consumer)
2062 {
2063 if (is_scalar_alu(producer, compiler) &&
2064 is_scalar_alu(consumer, compiler) &&
2065 (producer->dsts[0]->flags & IR3_REG_HALF) ==
2066 (consumer->srcs[0]->flags & IR3_REG_HALF))
2067 return false;
2068
2069 return is_ss_producer(producer);
2070 }
2071
2072 /* The soft delay for approximating the cost of (ss). */
2073 static inline unsigned
2074 soft_ss_delay(struct ir3_instruction *instr)
2075 {
2076 /* On a6xx, the number of delay slots it takes to get an SFU result back
2077 * (ie. using nops instead of (ss)) is:
2078 *
2079 * 8 - single warp
2080 * 9 - two warps
2081 * 10 - four warps
2082 *
2083 * and so on. It's not quite clear where it tapers out (ie. how many warps
2084 * share an SFU unit), but 10 seems like a reasonable number to choose:
2085 */
2086 if (is_sfu(instr) || is_local_mem_load(instr))
2087 return 10;
2088
2089 /* The blob adds 6 nops between shared producers and consumers, and before we
2090 * used (ss) this was sufficient in most cases.
2091 */
2092 return 6;
2093 }
2094
2095 static inline bool
2096 is_sy_producer(struct ir3_instruction *instr)
2097 {
2098 return is_tex_or_prefetch(instr) ||
2099 (is_load(instr) && !is_local_mem_load(instr)) ||
2100 is_atomic(instr->opc);
2101 }
2102
2103 static inline unsigned
2104 soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
2105 {
2106 /* TODO: this is just an optimistic guess; we can do better post-RA.
2107 */
2108 bool double_wavesize =
2109 shader->type == MESA_SHADER_FRAGMENT ||
2110 shader->type == MESA_SHADER_COMPUTE;
2111
2112 unsigned components = reg_elems(instr->dsts[0]);
2113
2114 /* These numbers come from counting the number of delay slots to get
2115 * cat5/cat6 results back using nops instead of (sy). Note that these numbers
2116 * are with the result preloaded into the cache by loading it earlier in the
2117 * same shader - uncached results are much larger.
2118 *
2119 * Note: at doubled wavesize, most ALU instructions can't complete at the
2120 * full doubled rate, so they take 2 cycles. The only exceptions are fp16
2121 * instructions with no built-in conversions. Therefore divide the latency by 2.
2122 *
2123 * TODO: Handle this properly in the scheduler and remove this.
2124 */
2125 if (instr->opc == OPC_LDC) {
2126 if (double_wavesize)
2127 return (21 + 8 * components) / 2;
2128 else
2129 return 18 + 4 * components;
2130 } else if (is_tex_or_prefetch(instr)) {
2131 if (double_wavesize) {
2132 switch (components) {
2133 case 1: return 58 / 2;
2134 case 2: return 60 / 2;
2135 case 3: return 77 / 2;
2136 case 4: return 79 / 2;
2137 default: unreachable("bad number of components");
2138 }
2139 } else {
2140 switch (components) {
2141 case 1: return 51;
2142 case 2: return 53;
2143 case 3: return 62;
2144 case 4: return 64;
2145 default: unreachable("bad number of components");
2146 }
2147 }
2148 } else {
2149 /* TODO: measure other cat6 opcodes like ldg */
2150 if (double_wavesize)
2151 return (172 + components) / 2;
2152 else
2153 return 109 + components;
2154 }
2155 }
2156
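/* Worked example of the numbers above: an ldc of 4 components gets a soft
 * (sy) delay of (21 + 8 * 4) / 2 = 26 cycles at doubled wavesize, vs
 * 18 + 4 * 4 = 34 cycles at the default wavesize.
 */
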
2157 /* Some instructions don't immediately consume their sources, so they may
2158 * introduce a WAR hazard.
2159 */
2160 static inline bool
2161 is_war_hazard_producer(struct ir3_instruction *instr)
2162 {
2163 return is_tex(instr) || is_mem(instr) || is_ss_producer(instr) ||
2164 instr->opc == OPC_STC;
2165 }
2166
2167 bool ir3_cleanup_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
2168 bool ir3_merge_rpt(struct ir3 *ir, struct ir3_shader_variant *v);
2169 bool ir3_opt_predicates(struct ir3 *ir, struct ir3_shader_variant *v);
2170
2171 /* unreachable block elimination: */
2172 bool ir3_remove_unreachable(struct ir3 *ir);
2173
2174 /* calculate reconvergence information: */
2175 void ir3_calc_reconvergence(struct ir3_shader_variant *so);
2176
2177 /* lower invalid shared phis after calculating reconvergence information: */
2178 bool ir3_lower_shared_phis(struct ir3 *ir);
2179
2180 /* dead code elimination: */
2181 struct ir3_shader_variant;
2182 bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
2183
2184 /* fp16 conversion folding */
2185 bool ir3_cf(struct ir3 *ir);
2186
2187 /* shared mov folding */
2188 bool ir3_shared_fold(struct ir3 *ir);
2189
2190 /* copy-propagate: */
2191 bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
2192
2193 /* common subexpression elimination: */
2194 bool ir3_cse(struct ir3 *ir);
2195
2196 /* Make arrays SSA */
2197 bool ir3_array_to_ssa(struct ir3 *ir);
2198
2199 /* scheduling: */
2200 bool ir3_sched_add_deps(struct ir3 *ir);
2201 int ir3_sched(struct ir3 *ir);
2202
2203 struct ir3_context;
2204 bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v);
2205
2206 /* register assignment: */
2207 int ir3_ra(struct ir3_shader_variant *v);
2208 void ir3_ra_predicates(struct ir3_shader_variant *v);
2209
2210 /* lower subgroup ops: */
2211 bool ir3_lower_subgroups(struct ir3 *ir);
2212
2213 /* legalize: */
2214 bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary);
2215 bool ir3_legalize_relative(struct ir3 *ir);
2216
2217 static inline bool
2218 ir3_has_latency_to_hide(struct ir3 *ir)
2219 {
2220 /* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't
2221 * know the nature of the fragment shader. Just assume it will have
2222 * latency to hide:
2223 */
2224 if (ir->type != MESA_SHADER_FRAGMENT)
2225 return true;
2226
2227 foreach_block (block, &ir->block_list) {
2228 foreach_instr (instr, &block->instr_list) {
2229 if (is_tex_or_prefetch(instr))
2230 return true;
2231
2232 if (is_load(instr)) {
2233 switch (instr->opc) {
2234 case OPC_LDLV:
2235 case OPC_LDL:
2236 case OPC_LDLW:
2237 break;
2238 default:
2239 return true;
2240 }
2241 }
2242 }
2243 }
2244
2245 return false;
2246 }
2247
2248 /**
2249 * Move 'instr' to after the last phi node at the beginning of the block:
2250 */
2251 static inline void
2252 ir3_instr_move_after_phis(struct ir3_instruction *instr,
2253 struct ir3_block *block)
2254 {
2255 struct ir3_instruction *last_phi = ir3_block_get_last_phi(block);
2256 if (last_phi)
2257 ir3_instr_move_after(instr, last_phi);
2258 else
2259 ir3_instr_move_before_block(instr, block);
2260 }
2261
2262 static inline struct ir3_cursor
2263 ir3_before_block(struct ir3_block *block)
2264 {
2265 assert(block);
2266 struct ir3_cursor cursor;
2267 cursor.option = IR3_CURSOR_BEFORE_BLOCK;
2268 cursor.block = block;
2269 return cursor;
2270 }
2271
2272 static inline struct ir3_cursor
2273 ir3_after_block(struct ir3_block *block)
2274 {
2275 assert(block);
2276 struct ir3_cursor cursor;
2277 cursor.option = IR3_CURSOR_AFTER_BLOCK;
2278 cursor.block = block;
2279 return cursor;
2280 }
2281
2282 static inline struct ir3_cursor
2283 ir3_before_instr(struct ir3_instruction *instr)
2284 {
2285 assert(instr);
2286 struct ir3_cursor cursor;
2287 cursor.option = IR3_CURSOR_BEFORE_INSTR;
2288 cursor.instr = instr;
2289 return cursor;
2290 }
2291
2292 static inline struct ir3_cursor
2293 ir3_after_instr(struct ir3_instruction *instr)
2294 {
2295 assert(instr);
2296 struct ir3_cursor cursor;
2297 cursor.option = IR3_CURSOR_AFTER_INSTR;
2298 cursor.instr = instr;
2299 return cursor;
2300 }
2301
2302 static inline struct ir3_cursor
2303 ir3_before_terminator(struct ir3_block *block)
2304 {
2305 assert(block);
2306 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
2307
2308 if (terminator)
2309 return ir3_before_instr(terminator);
2310 return ir3_after_block(block);
2311 }
2312
2313 static inline struct ir3_cursor
2314 ir3_after_phis(struct ir3_block *block)
2315 {
2316 assert(block);
2317
2318 foreach_instr (instr, &block->instr_list) {
2319 if (instr->opc != OPC_META_PHI)
2320 return ir3_before_instr(instr);
2321 }
2322
2323 return ir3_after_block(block);
2324 }
2325
2326 static inline struct ir3_builder
2327 ir3_builder_at(struct ir3_cursor cursor)
2328 {
2329 struct ir3_builder builder;
2330 builder.cursor = cursor;
2331 return builder;
2332 }
2333
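/* Example (editorial sketch): a cursor picks an insertion point and
 * ir3_builder_at() wraps it for the builder-based instruction-creation
 * helpers defined elsewhere in the compiler. E.g. to insert new instructions
 * just before a block's terminator:
 *
 *    struct ir3_builder b = ir3_builder_at(ir3_before_terminator(block));
 */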
2334
2335 /* ************************************************************************* */
2336 /* instruction helpers */
2337
2338 /* creates SSA src of correct type (ie. half vs full precision) */
2339 static inline struct ir3_register *
2340 __ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src,
2341 unsigned flags)
2342 {
2343 struct ir3_register *reg;
2344 flags |= src->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
2345 reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags);
2346 reg->def = src->dsts[0];
2347 reg->wrmask = src->dsts[0]->wrmask;
2348 return reg;
2349 }
2350
2351 static inline struct ir3_register *
2352 __ssa_dst(struct ir3_instruction *instr)
2353 {
2354 struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA);
2355 reg->instr = instr;
2356 return reg;
2357 }
2358
2359 static BITMASK_ENUM(ir3_register_flags)
2360 type_flags(type_t type)
2361 {
2362 if (type_size(type) < 32)
2363 return IR3_REG_HALF;
2364 return (ir3_register_flags)0;
2365 }
2366
2367 static inline struct ir3_instruction *
2368 create_immed_typed_shared(struct ir3_block *block, uint32_t val, type_t type, bool shared)
2369 {
2370 struct ir3_instruction *mov;
2371 ir3_register_flags flags = type_flags(type);
2372
2373 mov = ir3_instr_create(block, OPC_MOV, 1, 1);
2374 mov->cat1.src_type = type;
2375 mov->cat1.dst_type = type;
2376 __ssa_dst(mov)->flags |= flags | (shared ? IR3_REG_SHARED : 0);
2377 ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val;
2378
2379 return mov;
2380 }
2381
2382 static inline struct ir3_instruction *
2383 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type)
2384 {
2385 return create_immed_typed_shared(block, val, type, false);
2386 }
2387
2388 static inline struct ir3_instruction *
2389 create_immed_shared(struct ir3_block *block, uint32_t val, bool shared)
2390 {
2391 return create_immed_typed_shared(block, val, TYPE_U32, shared);
2392 }
2393
2394 static inline struct ir3_instruction *
2395 create_immed(struct ir3_block *block, uint32_t val)
2396 {
2397 return create_immed_shared(block, val, false);
2398 }
2399
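/* Example (editorial sketch): each create_immed* helper materializes a
 * constant as a mov from an immediate, yielding an ordinary SSA value:
 *
 *    struct ir3_instruction *zero = create_immed(block, 0);
 *    struct ir3_instruction *one_h =
 *       create_immed_typed(block, 0x3c00, TYPE_F16);   // 1.0 in fp16
 */
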
2400 static inline struct ir3_instruction *
2401 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type)
2402 {
2403 struct ir3_instruction *mov;
2404 ir3_register_flags flags = type_flags(type);
2405
2406 mov = ir3_instr_create(block, OPC_MOV, 1, 1);
2407 mov->cat1.src_type = type;
2408 mov->cat1.dst_type = type;
2409 __ssa_dst(mov)->flags |= flags;
2410 ir3_src_create(mov, n, IR3_REG_CONST | flags);
2411
2412 return mov;
2413 }
2414
2415 static inline struct ir3_instruction *
2416 create_uniform(struct ir3_block *block, unsigned n)
2417 {
2418 return create_uniform_typed(block, n, TYPE_F32);
2419 }
2420
2421 static inline struct ir3_instruction *
2422 create_uniform_indirect(struct ir3_block *block, int n, type_t type,
2423 struct ir3_instruction *address)
2424 {
2425 struct ir3_instruction *mov;
2426
2427 mov = ir3_instr_create(block, OPC_MOV, 1, 1);
2428 mov->cat1.src_type = type;
2429 mov->cat1.dst_type = type;
2430 __ssa_dst(mov);
2431 ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
2432
2433 ir3_instr_set_address(mov, address);
2434
2435 return mov;
2436 }
2437
2438 static inline struct ir3_instruction *
2439 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
2440 {
2441 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
2442 ir3_register_flags flags = type_flags(type) | (src->dsts[0]->flags & IR3_REG_SHARED);
2443
2444 __ssa_dst(instr)->flags |= flags;
2445 if (src->dsts[0]->flags & IR3_REG_ARRAY) {
2446 struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY);
2447 src_reg->array = src->dsts[0]->array;
2448 } else {
2449 __ssa_src(instr, src, 0);
2450 }
2451 assert(!(src->dsts[0]->flags & IR3_REG_RELATIV));
2452 instr->cat1.src_type = type;
2453 instr->cat1.dst_type = type;
2454 return instr;
2455 }
2456
2457 static inline struct ir3_instruction_rpt
2458 ir3_MOV_rpt(struct ir3_block *block, unsigned nrpt,
2459 struct ir3_instruction_rpt src, type_t type)
2460 {
2461 struct ir3_instruction_rpt dst;
2462 assert(nrpt <= ARRAY_SIZE(dst.rpts));
2463
2464 for (unsigned rpt = 0; rpt < nrpt; ++rpt)
2465 dst.rpts[rpt] = ir3_MOV(block, src.rpts[rpt], type);
2466
2467 ir3_instr_create_rpt(dst.rpts, nrpt);
2468 return dst;
2469 }
2470
2471 static inline struct ir3_instruction *
2472 ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type,
2473 type_t dst_type)
2474 {
2475 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1);
2476 ir3_register_flags dst_flags = type_flags(dst_type) | (src->dsts[0]->flags & IR3_REG_SHARED);
2477 ASSERTED ir3_register_flags src_flags = type_flags(src_type);
2478
2479 assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags);
2480
2481 __ssa_dst(instr)->flags |= dst_flags;
2482 __ssa_src(instr, src, 0);
2483 instr->cat1.src_type = src_type;
2484 instr->cat1.dst_type = dst_type;
2485 assert(!(src->dsts[0]->flags & IR3_REG_ARRAY));
2486 return instr;
2487 }
2488
2489 static inline struct ir3_instruction_rpt
2490 ir3_COV_rpt(struct ir3_block *block, unsigned nrpt,
2491 struct ir3_instruction_rpt src, type_t src_type, type_t dst_type)
2492 {
2493 struct ir3_instruction_rpt dst;
2494
2495 for (unsigned rpt = 0; rpt < nrpt; ++rpt)
2496 dst.rpts[rpt] = ir3_COV(block, src.rpts[rpt], src_type, dst_type);
2497
2498 ir3_instr_create_rpt(dst.rpts, nrpt);
2499 return dst;
2500 }
2501
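/* Example (editorial sketch): ir3_MOV() copies a value at a given type,
 * while ir3_COV() folds in a conversion. E.g. widening a half-precision
 * float SSA value to full precision:
 *
 *    struct ir3_instruction *f32 =
 *       ir3_COV(block, f16_val, TYPE_F16, TYPE_F32);
 */
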
2502 static inline struct ir3_instruction *
2503 ir3_MOVMSK(struct ir3_block *block, unsigned components)
2504 {
2505 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0);
2506
2507 struct ir3_register *dst = __ssa_dst(instr);
2508 dst->flags |= IR3_REG_SHARED;
2509 dst->wrmask = (1 << components) - 1;
2510 instr->repeat = components - 1;
2511 return instr;
2512 }
2513
2514 static inline struct ir3_instruction *
2515 ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src,
2516 unsigned components)
2517 {
2518 struct ir3_instruction *instr =
2519 ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1);
2520
2521 struct ir3_register *dst = __ssa_dst(instr);
2522 dst->flags |= IR3_REG_SHARED;
2523 dst->wrmask = (1 << components) - 1;
2524
2525 __ssa_src(instr, src, 0);
2526
2527 return instr;
2528 }
2529
2530 static inline struct ir3_instruction *
2531 ir3_NOP(struct ir3_block *block)
2532 {
2533 return ir3_instr_create(block, OPC_NOP, 0, 0);
2534 }
2535
2536 /* clang-format off */
2537 #define __INSTR0(flag, name, opc) \
2538 static inline struct ir3_instruction *ir3_##name(struct ir3_block *block) \
2539 { \
2540 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0); \
2541 instr->flags |= flag; \
2542 return instr; \
2543 }
2544 /* clang-format on */
2545 #define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name)
2546 #define INSTR0(name) __INSTR0((ir3_instruction_flags)0, name, OPC_##name)
2547
2548 /* clang-format off */
2549 #define __INSTR1(flag, dst_count, name, opc, scalar_alu) \
2550 static inline struct ir3_instruction *ir3_##name( \
2551 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags) \
2552 { \
2553 struct ir3_instruction *instr = \
2554 ir3_instr_create(block, opc, dst_count, 1); \
2555 unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & IR3_REG_SHARED) : 0; \
2556 for (unsigned i = 0; i < dst_count; i++) \
2557 __ssa_dst(instr)->flags |= dst_flag; \
2558 __ssa_src(instr, a, aflags); \
2559 instr->flags |= flag; \
2560 return instr; \
2561 } \
2562 static inline struct ir3_instruction_rpt ir3_##name##_rpt( \
2563 struct ir3_block *block, unsigned nrpt, \
2564 struct ir3_instruction_rpt a, unsigned aflags) \
2565 { \
2566 struct ir3_instruction_rpt dst; \
2567 assert(nrpt <= ARRAY_SIZE(dst.rpts)); \
2568 for (unsigned rpt = 0; rpt < nrpt; rpt++) \
2569 dst.rpts[rpt] = ir3_##name(block, a.rpts[rpt], aflags); \
2570 ir3_instr_create_rpt(dst.rpts, nrpt); \
2571 return dst; \
2572 }
2573
2574 /* clang-format on */
2575 #define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name, \
2576 false)
2577 #define INSTR1(name) __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, false)
2578 #define INSTR1S(name) __INSTR1((ir3_instruction_flags)0, 1, name, OPC_##name, true)
2579 #define INSTR1NODST(name) __INSTR1((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2580
2581 /* clang-format off */
2582 #define __INSTR2(flag, dst_count, name, opc, scalar_alu) \
2583 static inline struct ir3_instruction *ir3_##name( \
2584 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
2585 struct ir3_instruction *b, unsigned bflags) \
2586 { \
2587 struct ir3_instruction *instr = ir3_instr_create(block, opc, dst_count, 2); \
2588 unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags & \
2589 IR3_REG_SHARED) : 0; \
2590 for (unsigned i = 0; i < dst_count; i++) \
2591 __ssa_dst(instr)->flags |= dst_flag; \
2592 __ssa_src(instr, a, aflags); \
2593 __ssa_src(instr, b, bflags); \
2594 instr->flags |= flag; \
2595 return instr; \
2596 } \
2597 static inline struct ir3_instruction_rpt ir3_##name##_rpt( \
2598 struct ir3_block *block, unsigned nrpt, \
2599 struct ir3_instruction_rpt a, unsigned aflags, \
2600 struct ir3_instruction_rpt b, unsigned bflags) \
2601 { \
2602 struct ir3_instruction_rpt dst; \
2603 assert(nrpt <= ARRAY_SIZE(dst.rpts)); \
2604 for (unsigned rpt = 0; rpt < nrpt; rpt++) { \
2605 dst.rpts[rpt] = ir3_##name(block, a.rpts[rpt], aflags, \
2606 b.rpts[rpt], bflags); \
2607 } \
2608 ir3_instr_create_rpt(dst.rpts, nrpt); \
2609 return dst; \
2610 }
2611 /* clang-format on */
2612 #define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, 1, name##_##f, OPC_##name, \
2613 false)
2614 #define INSTR2(name) __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, false)
2615 #define INSTR2S(name) __INSTR2((ir3_instruction_flags)0, 1, name, OPC_##name, true)
2616 #define INSTR2NODST(name) __INSTR2((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2617
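/* Example (editorial sketch): INSTR2S(ADD_F) below expands to an
 * ir3_ADD_F() builder (plus an ir3_ADD_F_rpt() variant), so a float
 * subtract can be emitted as an add with a negated second source:
 *
 *    struct ir3_instruction *diff =
 *       ir3_ADD_F(block, a, 0, b, IR3_REG_FNEG);
 */
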
2618 /* clang-format off */
2619 #define __INSTR3(flag, dst_count, name, opc, scalar_alu) \
2620 static inline struct ir3_instruction *ir3_##name( \
2621 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
2622 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
2623 unsigned cflags) \
2624 { \
2625 struct ir3_instruction *instr = \
2626 ir3_instr_create(block, opc, dst_count, 3); \
2627 unsigned dst_flag = scalar_alu ? (a->dsts[0]->flags & b->dsts[0]->flags & \
2628 c->dsts[0]->flags & IR3_REG_SHARED) : 0; \
2629 for (unsigned i = 0; i < dst_count; i++) \
2630 __ssa_dst(instr)->flags |= dst_flag; \
2631 __ssa_src(instr, a, aflags); \
2632 __ssa_src(instr, b, bflags); \
2633 __ssa_src(instr, c, cflags); \
2634 instr->flags |= flag; \
2635 return instr; \
2636 } \
2637 static inline struct ir3_instruction_rpt ir3_##name##_rpt( \
2638 struct ir3_block *block, unsigned nrpt, \
2639 struct ir3_instruction_rpt a, unsigned aflags, \
2640 struct ir3_instruction_rpt b, unsigned bflags, \
2641 struct ir3_instruction_rpt c, unsigned cflags) \
2642 { \
2643 struct ir3_instruction_rpt dst; \
2644 assert(nrpt <= ARRAY_SIZE(dst.rpts)); \
2645 for (unsigned rpt = 0; rpt < nrpt; rpt++) { \
2646 dst.rpts[rpt] = ir3_##name(block, a.rpts[rpt], aflags, \
2647 b.rpts[rpt], bflags, \
2648 c.rpts[rpt], cflags); \
2649 } \
2650 ir3_instr_create_rpt(dst.rpts, nrpt); \
2651 return dst; \
2652 }
2653 /* clang-format on */
2654 #define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name, \
2655 false)
2656 #define INSTR3(name) __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, false)
2657 #define INSTR3S(name) __INSTR3((ir3_instruction_flags)0, 1, name, OPC_##name, true)
2658 #define INSTR3NODST(name) __INSTR3((ir3_instruction_flags)0, 0, name, OPC_##name, false)
2659
2660 /* clang-format off */
2661 #define __INSTR4(flag, dst_count, name, opc) \
2662 static inline struct ir3_instruction *ir3_##name( \
2663 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
2664 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
2665 unsigned cflags, struct ir3_instruction *d, unsigned dflags) \
2666 { \
2667 struct ir3_instruction *instr = \
2668 ir3_instr_create(block, opc, dst_count, 4); \
2669 for (unsigned i = 0; i < dst_count; i++) \
2670 __ssa_dst(instr); \
2671 __ssa_src(instr, a, aflags); \
2672 __ssa_src(instr, b, bflags); \
2673 __ssa_src(instr, c, cflags); \
2674 __ssa_src(instr, d, dflags); \
2675 instr->flags |= flag; \
2676 return instr; \
2677 }
2678 /* clang-format on */
2679 #define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
2680 #define INSTR4(name) __INSTR4((ir3_instruction_flags)0, 1, name, OPC_##name)
2681 #define INSTR4NODST(name) __INSTR4((ir3_instruction_flags)0, 0, name, OPC_##name)
2682
2683 /* clang-format off */
2684 #define __INSTR5(flag, name, opc) \
2685 static inline struct ir3_instruction *ir3_##name( \
2686 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
2687 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
2688 unsigned cflags, struct ir3_instruction *d, unsigned dflags, \
2689 struct ir3_instruction *e, unsigned eflags) \
2690 { \
2691 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5); \
2692 __ssa_dst(instr); \
2693 __ssa_src(instr, a, aflags); \
2694 __ssa_src(instr, b, bflags); \
2695 __ssa_src(instr, c, cflags); \
2696 __ssa_src(instr, d, dflags); \
2697 __ssa_src(instr, e, eflags); \
2698 instr->flags |= flag; \
2699 return instr; \
2700 }
2701 /* clang-format on */
2702 #define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name)
2703 #define INSTR5(name) __INSTR5((ir3_instruction_flags)0, name, OPC_##name)
2704
2705 /* clang-format off */
2706 #define __INSTR6(flag, dst_count, name, opc) \
2707 static inline struct ir3_instruction *ir3_##name( \
2708 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \
2709 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \
2710 unsigned cflags, struct ir3_instruction *d, unsigned dflags, \
2711 struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f, \
2712 unsigned fflags) \
2713 { \
2714 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 6); \
2715 for (unsigned i = 0; i < dst_count; i++) \
2716 __ssa_dst(instr); \
2717 __ssa_src(instr, a, aflags); \
2718 __ssa_src(instr, b, bflags); \
2719 __ssa_src(instr, c, cflags); \
2720 __ssa_src(instr, d, dflags); \
2721 __ssa_src(instr, e, eflags); \
2722 __ssa_src(instr, f, fflags); \
2723 instr->flags |= flag; \
2724 return instr; \
2725 }
2726 /* clang-format on */
2727 #define INSTR6F(f, name) __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name)
2728 #define INSTR6(name) __INSTR6((ir3_instruction_flags)0, 1, name, OPC_##name)
2729 #define INSTR6NODST(name) __INSTR6((ir3_instruction_flags)0, 0, name, OPC_##name)
2730
2731 /* cat0 instructions: */
2732 INSTR1NODST(BR)
2733 INSTR1NODST(BALL)
2734 INSTR1NODST(BANY)
2735 INSTR2NODST(BRAA)
2736 INSTR2NODST(BRAO)
2737 INSTR0(JUMP)
2738 INSTR1NODST(KILL)
2739 INSTR1NODST(DEMOTE)
2740 INSTR0(END)
2741 INSTR0(CHSH)
2742 INSTR0(CHMASK)
2743 INSTR1NODST(PREDT)
2744 INSTR1NODST(PREDF)
2745 INSTR0(PREDE)
2746 INSTR0(GETONE)
2747 INSTR0(GETLAST)
2748 INSTR0(SHPS)
2749 INSTR0(SHPE)
2750
2751 /* cat1 macros */
2752 INSTR1(ANY_MACRO)
2753 INSTR1(ALL_MACRO)
2754 INSTR1(READ_FIRST_MACRO)
2755 INSTR2(READ_COND_MACRO)
2756
2757 static inline struct ir3_instruction *
2758 ir3_ELECT_MACRO(struct ir3_block *block)
2759 {
2760 struct ir3_instruction *instr =
2761 ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0);
2762 __ssa_dst(instr);
2763 return instr;
2764 }
2765
2766 static inline struct ir3_instruction *
2767 ir3_SHPS_MACRO(struct ir3_block *block)
2768 {
2769 struct ir3_instruction *instr =
2770 ir3_instr_create(block, OPC_SHPS_MACRO, 1, 0);
2771 __ssa_dst(instr);
2772 return instr;
2773 }
2774
2775 /* cat2 instructions, mostly 2 src but some 1 src: */
2776 INSTR2S(ADD_F)
2777 INSTR2S(MIN_F)
2778 INSTR2S(MAX_F)
2779 INSTR2S(MUL_F)
2780 INSTR1S(SIGN_F)
2781 INSTR2S(CMPS_F)
2782 INSTR1S(ABSNEG_F)
2783 INSTR2S(CMPV_F)
2784 INSTR1S(FLOOR_F)
2785 INSTR1S(CEIL_F)
2786 INSTR1S(RNDNE_F)
2787 INSTR1S(RNDAZ_F)
2788 INSTR1S(TRUNC_F)
2789 INSTR2S(ADD_U)
2790 INSTR2S(ADD_S)
2791 INSTR2S(SUB_U)
2792 INSTR2S(SUB_S)
2793 INSTR2S(CMPS_U)
2794 INSTR2S(CMPS_S)
2795 INSTR2S(MIN_U)
2796 INSTR2S(MIN_S)
2797 INSTR2S(MAX_U)
2798 INSTR2S(MAX_S)
2799 INSTR1S(ABSNEG_S)
2800 INSTR2S(AND_B)
2801 INSTR2S(OR_B)
2802 INSTR1S(NOT_B)
2803 INSTR2S(XOR_B)
2804 INSTR2S(CMPV_U)
2805 INSTR2S(CMPV_S)
2806 INSTR2S(MUL_U24)
2807 INSTR2S(MUL_S24)
2808 INSTR2S(MULL_U)
2809 INSTR1S(BFREV_B)
2810 INSTR1S(CLZ_S)
2811 INSTR1S(CLZ_B)
2812 INSTR2S(SHL_B)
2813 INSTR2S(SHR_B)
2814 INSTR2S(ASHR_B)
2815 INSTR2(BARY_F)
2816 INSTR2(FLAT_B)
2817 INSTR2S(MGEN_B)
2818 INSTR2S(GETBIT_B)
2819 INSTR1(SETRM)
2820 INSTR1S(CBITS_B)
2821 INSTR2S(SHB)
2822 INSTR2S(MSAD)
2823
2824 /* cat3 instructions: */
2825 INSTR3(MAD_U16)
2826 INSTR3(MADSH_U16)
2827 INSTR3(MAD_S16)
2828 INSTR3(MADSH_M16)
2829 INSTR3(MAD_U24)
2830 INSTR3(MAD_S24)
2831 INSTR3(MAD_F16)
2832 INSTR3(MAD_F32)
2833 INSTR3(DP2ACC)
2834 INSTR3(DP4ACC)
2835 /* NOTE: SEL_B32 checks for zero vs nonzero */
2836 INSTR3S(SEL_B16)
2837 INSTR3S(SEL_B32)
2838 INSTR3S(SEL_S16)
2839 INSTR3S(SEL_S32)
2840 INSTR3S(SEL_F16)
2841 INSTR3S(SEL_F32)
2842 INSTR3(SAD_S16)
2843 INSTR3(SAD_S32)
2844
2845 /* cat4 instructions: */
2846 INSTR1S(RCP)
2847 INSTR1S(RSQ)
2848 INSTR1S(HRSQ)
2849 INSTR1S(LOG2)
2850 INSTR1S(HLOG2)
2851 INSTR1S(EXP2)
2852 INSTR1S(HEXP2)
2853 INSTR1S(SIN)
2854 INSTR1S(COS)
2855 INSTR1S(SQRT)
2856
2857 /* cat5 instructions: */
2858 INSTR1(DSX)
2859 INSTR1(DSXPP_MACRO)
2860 INSTR1(DSY)
2861 INSTR1(DSYPP_MACRO)
2862 INSTR1F(3D, DSX)
2863 INSTR1F(3D, DSY)
2864 INSTR1(RGETPOS)
2865
2866 static inline struct ir3_instruction *
2867 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask,
2868 ir3_instruction_flags flags, struct ir3_instruction *samp_tex,
2869 struct ir3_instruction *src0, struct ir3_instruction *src1)
2870 {
2871 struct ir3_instruction *sam;
2872 unsigned nreg = 0;
2873
2874 if (flags & IR3_INSTR_S2EN) {
2875 nreg++;
2876 }
2877 if (src0 || opc == OPC_SAM) {
2878 nreg++;
2879 }
2880 if (src1) {
2881 nreg++;
2882 }
2883
2884 sam = ir3_instr_create(block, opc, 1, nreg);
2885 sam->flags |= flags;
2886 __ssa_dst(sam)->wrmask = wrmask;
2887 if (flags & IR3_INSTR_S2EN) {
2888 __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF);
2889 }
2890 if (src0) {
2891 __ssa_src(sam, src0, 0);
2892 } else if (opc == OPC_SAM) {
2893 /* Create a dummy shared source for the coordinate, for the prefetch
2894 * case. It needs to be shared so that we don't accidentally disable early
2895 * preamble, and this is what the blob does.
2896 */
2897 ir3_src_create(sam, regid(48, 0), IR3_REG_SHARED);
2898 }
2899 if (src1) {
2900 __ssa_src(sam, src1, 0);
2901 }
2902 sam->cat5.type = type;
2903
2904 return sam;
2905 }
2906
2907 /* brcst.active rx, ry behaves like a conditional move: rx either keeps its
2908 * value or is set to ry. In order to model this in SSA form, we add an extra
2909 * argument (the initial value of rx) and tie it to the destination.
2910 */
2911 static inline struct ir3_instruction *
2912 ir3_BRCST_ACTIVE(struct ir3_block *block, unsigned cluster_size,
2913 struct ir3_instruction *src,
2914 struct ir3_instruction *dst_default)
2915 {
2916 struct ir3_instruction *brcst =
2917 ir3_instr_create(block, OPC_BRCST_ACTIVE, 1, 2);
2918 brcst->cat5.cluster_size = cluster_size;
2919 brcst->cat5.type = TYPE_U32;
2920 struct ir3_register *brcst_dst = __ssa_dst(brcst);
2921 __ssa_src(brcst, src, 0);
2922 struct ir3_register *default_src = __ssa_src(brcst, dst_default, 0);
2923 ir3_reg_tie(brcst_dst, default_src);
2924 return brcst;
2925 }
2926
2927 /* cat6 instructions: */
2928 INSTR0(GETFIBERID)
2929 INSTR2(LDLV)
2930 INSTR3(LDG)
2931 INSTR3(LDL)
2932 INSTR3(LDLW)
2933 INSTR3(LDP)
2934 INSTR4NODST(STG)
2935 INSTR3NODST(STL)
2936 INSTR3NODST(STLW)
2937 INSTR3NODST(STP)
2938 INSTR1(RESINFO)
2939 INSTR1(RESFMT)
2940 INSTR2(ATOMIC_ADD)
2941 INSTR2(ATOMIC_SUB)
2942 INSTR2(ATOMIC_XCHG)
2943 INSTR2(ATOMIC_INC)
2944 INSTR2(ATOMIC_DEC)
2945 INSTR2(ATOMIC_CMPXCHG)
2946 INSTR2(ATOMIC_MIN)
2947 INSTR2(ATOMIC_MAX)
2948 INSTR2(ATOMIC_AND)
2949 INSTR2(ATOMIC_OR)
2950 INSTR2(ATOMIC_XOR)
2951 INSTR2(LDC)
2952 INSTR2(QUAD_SHUFFLE_BRCST)
2953 INSTR1(QUAD_SHUFFLE_HORIZ)
2954 INSTR1(QUAD_SHUFFLE_VERT)
2955 INSTR1(QUAD_SHUFFLE_DIAG)
2956 INSTR2NODST(LDC_K)
2957 INSTR2NODST(STC)
2958 INSTR2NODST(STSC)
2959 #ifndef GPU
2960 #elif GPU >= 600
2961 INSTR4NODST(STIB);
2962 INSTR3(LDIB);
2963 INSTR5(LDG_A);
2964 INSTR6NODST(STG_A);
2965 INSTR2(ATOMIC_G_ADD)
2966 INSTR2(ATOMIC_G_SUB)
2967 INSTR2(ATOMIC_G_XCHG)
2968 INSTR2(ATOMIC_G_INC)
2969 INSTR2(ATOMIC_G_DEC)
2970 INSTR2(ATOMIC_G_CMPXCHG)
2971 INSTR2(ATOMIC_G_MIN)
2972 INSTR2(ATOMIC_G_MAX)
2973 INSTR2(ATOMIC_G_AND)
2974 INSTR2(ATOMIC_G_OR)
2975 INSTR2(ATOMIC_G_XOR)
2976 INSTR3(ATOMIC_B_ADD)
2977 INSTR3(ATOMIC_B_SUB)
2978 INSTR3(ATOMIC_B_XCHG)
2979 INSTR3(ATOMIC_B_INC)
2980 INSTR3(ATOMIC_B_DEC)
2981 INSTR3(ATOMIC_B_CMPXCHG)
2982 INSTR3(ATOMIC_B_MIN)
2983 INSTR3(ATOMIC_B_MAX)
2984 INSTR3(ATOMIC_B_AND)
2985 INSTR3(ATOMIC_B_OR)
2986 INSTR3(ATOMIC_B_XOR)
2987 #elif GPU >= 400
2988 INSTR3(LDGB)
2989 #if GPU >= 500
2990 INSTR3(LDIB)
2991 #endif
2992 INSTR4NODST(STGB)
2993 INSTR4NODST(STIB)
2994 INSTR4(ATOMIC_S_ADD)
2995 INSTR4(ATOMIC_S_SUB)
2996 INSTR4(ATOMIC_S_XCHG)
2997 INSTR4(ATOMIC_S_INC)
2998 INSTR4(ATOMIC_S_DEC)
2999 INSTR4(ATOMIC_S_CMPXCHG)
3000 INSTR4(ATOMIC_S_MIN)
3001 INSTR4(ATOMIC_S_MAX)
3002 INSTR4(ATOMIC_S_AND)
3003 INSTR4(ATOMIC_S_OR)
3004 INSTR4(ATOMIC_S_XOR)
3005 #endif
3006 INSTR4NODST(LDG_K)
3007
3008 /* cat7 instructions: */
3009 INSTR0(BAR)
3010 INSTR0(FENCE)
3011 INSTR0(CCINV)
3012
3013 /* ************************************************************************* */
3014 #include "util/bitset.h"
3015
3016 #define MAX_REG 256
3017
3018 typedef BITSET_DECLARE(fullstate_t, 2 * GPR_REG_SIZE);
3019 typedef BITSET_DECLARE(halfstate_t, GPR_REG_SIZE);
3020 typedef BITSET_DECLARE(sharedstate_t, 2 * SHARED_REG_SIZE);
3021 typedef BITSET_DECLARE(nongprstate_t, 2 * NONGPR_REG_SIZE);
3022
3023 typedef struct {
3024 bool mergedregs;
3025 fullstate_t full;
3026 halfstate_t half;
3027 sharedstate_t shared;
3028 nongprstate_t nongpr;
3029 } regmask_t;
3030
3031 static inline BITSET_WORD *
3032 __regmask_file(regmask_t *regmask, enum ir3_reg_file file)
3033 {
3034 switch (file) {
3035 case IR3_FILE_FULL:
3036 return regmask->full;
3037 case IR3_FILE_HALF:
3038 return regmask->half;
3039 case IR3_FILE_SHARED:
3040 return regmask->shared;
3041 case IR3_FILE_NONGPR:
3042 return regmask->nongpr;
3043 }
3044 unreachable("bad file");
3045 }
3046
3047 static inline bool
3048 __regmask_get(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3049 {
3050 BITSET_WORD *regs = __regmask_file(regmask, file);
3051 for (unsigned i = 0; i < size; i++) {
3052 if (BITSET_TEST(regs, n + i))
3053 return true;
3054 }
3055 return false;
3056 }
3057
3058 static inline void
3059 __regmask_set(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3060 {
3061 BITSET_WORD *regs = __regmask_file(regmask, file);
3062 for (unsigned i = 0; i < size; i++)
3063 BITSET_SET(regs, n + i);
3064 }
3065
3066 static inline void
3067 __regmask_clear(regmask_t *regmask, enum ir3_reg_file file, unsigned n, unsigned size)
3068 {
3069 BITSET_WORD *regs = __regmask_file(regmask, file);
3070 for (unsigned i = 0; i < size; i++)
3071 BITSET_CLEAR(regs, n + i);
3072 }
3073
3074 static inline void
3075 regmask_init(regmask_t *regmask, bool mergedregs)
3076 {
3077 memset(regmask, 0, sizeof(*regmask));
3078 regmask->mergedregs = mergedregs;
3079 }
3080
3081 static inline void
3082 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
3083 {
3084 assert(dst->mergedregs == a->mergedregs);
3085 assert(dst->mergedregs == b->mergedregs);
3086
3087 for (unsigned i = 0; i < ARRAY_SIZE(dst->full); i++)
3088 dst->full[i] = a->full[i] | b->full[i];
3089 for (unsigned i = 0; i < ARRAY_SIZE(dst->half); i++)
3090 dst->half[i] = a->half[i] | b->half[i];
3091 for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
3092 dst->shared[i] = a->shared[i] | b->shared[i];
3093 for (unsigned i = 0; i < ARRAY_SIZE(dst->nongpr); i++)
3094 dst->nongpr[i] = a->nongpr[i] | b->nongpr[i];
3095 }
3096
3097 static inline void
3098 regmask_or_shared(regmask_t *dst, regmask_t *a, regmask_t *b)
3099 {
3100 for (unsigned i = 0; i < ARRAY_SIZE(dst->shared); i++)
3101 dst->shared[i] = a->shared[i] | b->shared[i];
3102 }
3103
3104 static inline void
3105 regmask_set(regmask_t *regmask, struct ir3_register *reg)
3106 {
3107 unsigned size = reg_elem_size(reg);
3108 enum ir3_reg_file file;
3109 unsigned num = post_ra_reg_num(reg);
3110 unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3111 if (reg->flags & IR3_REG_RELATIV) {
3112 __regmask_set(regmask, file, n, size * reg->size);
3113 } else {
3114 for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3115 if (mask & 1)
3116 __regmask_set(regmask, file, n, size);
3117 }
3118 }
3119
3120 static inline void
3121 regmask_clear(regmask_t *regmask, struct ir3_register *reg)
3122 {
3123 unsigned size = reg_elem_size(reg);
3124 enum ir3_reg_file file;
3125 unsigned num = post_ra_reg_num(reg);
3126 unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3127 if (reg->flags & IR3_REG_RELATIV) {
3128 __regmask_clear(regmask, file, n, size * reg->size);
3129 } else {
3130 for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3131 if (mask & 1)
3132 __regmask_clear(regmask, file, n, size);
3133 }
3134 }
3135
3136 static inline bool
3137 regmask_get(regmask_t *regmask, struct ir3_register *reg)
3138 {
3139 unsigned size = reg_elem_size(reg);
3140 enum ir3_reg_file file;
3141 unsigned num = post_ra_reg_num(reg);
3142 unsigned n = ir3_reg_file_offset(reg, num, regmask->mergedregs, &file);
3143 if (reg->flags & IR3_REG_RELATIV) {
3144 return __regmask_get(regmask, file, n, size * reg->size);
3145 } else {
3146 for (unsigned mask = reg->wrmask; mask; mask >>= 1, n += size)
3147 if (mask & 1)
3148 if (__regmask_get(regmask, file, n, size))
3149 return true;
3150 }
3151 return false;
3152 }
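
/* Example (editorial sketch): how a legalization-style pass might use the
 * regmask helpers above to track registers with outstanding writes. The
 * "mergedregs" flag is assumed to come from the shader variant being
 * compiled:
 *
 *    regmask_t pending;
 *    regmask_init(&pending, mergedregs);
 *
 *    foreach_dst (dst, producer)
 *       regmask_set(&pending, dst);
 *    ...
 *    foreach_src (src, consumer)
 *       if (regmask_get(&pending, src))
 *          ;   // overlap: a sync is needed before consumer
 */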
3153 /* ************************************************************************* */
3154
3155 #endif /* IR3_H_ */
3156