xref: /aosp_15_r20/external/mesa3d/src/freedreno/ir3/ir3_delay.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2019 Google, Inc.
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Rob Clark <[email protected]>
7  */
8 
9 #include "ir3.h"
10 
11 #include "ir3_compiler.h"
12 
13 /* The maximum number of nop's we may need to insert between two instructions.
14  */
15 #define MAX_NOPS 6
16 
17 /*
18  * Helpers to figure out the necessary delay slots between instructions.  Used
19  * both in scheduling pass(es) and the final pass to insert any required nop's
20  * so that the shader program is valid.
21  *
22  * Note that this needs to work both pre and post RA, so we can't assume ssa
23  * src iterators work.
24  */
25 
26 /* calculate required # of delay slots between the instruction that
27  * assigns a value and the one that consumes
28  */
29 int
ir3_delayslots(struct ir3_compiler * compiler,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned n,bool soft)30 ir3_delayslots(struct ir3_compiler *compiler,
31                struct ir3_instruction *assigner,
32                struct ir3_instruction *consumer, unsigned n, bool soft)
33 {
34    /* generally don't count false dependencies, since this can just be
35     * something like a barrier, or SSBO store.
36     */
37    if (__is_false_dep(consumer, n))
38       return 0;
39 
40    /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
41     * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
42     * handled with sync bits
43     */
44 
45    if (is_meta(assigner) || is_meta(consumer))
46       return 0;
47 
48    if (writes_addr0(assigner) || writes_addr1(assigner))
49       return 6;
50 
51    if (soft && needs_ss(compiler, assigner, consumer))
52       return soft_ss_delay(assigner);
53 
54    /* handled via sync flags: */
55    if (needs_ss(compiler, assigner, consumer) ||
56        is_sy_producer(assigner))
57       return 0;
58 
59    /* scalar ALU -> scalar ALU depdendencies where the source and destination
60     * register sizes match don't require any nops.
61     */
62    if (is_scalar_alu(assigner, compiler)) {
63       assert(is_scalar_alu(consumer, compiler));
64       /* If the sizes don't match then we need (ss) and needs_ss() should've
65        * returned above.
66        */
67       assert((assigner->dsts[0]->flags & IR3_REG_HALF) ==
68              (consumer->srcs[n]->flags & IR3_REG_HALF));
69       return 0;
70    }
71 
72    /* As far as we know, shader outputs don't need any delay. */
73    if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
74       return 0;
75 
76    /* assigner must be alu: */
77    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
78        is_mem(consumer)) {
79       return 6;
80    } else {
81       /* In mergedregs mode, there is an extra 2-cycle penalty when half of
82        * a full-reg is read as a half-reg or when a half-reg is read as a
83        * full-reg.
84        */
85       bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
86                              (consumer->srcs[n]->flags & IR3_REG_HALF);
87       unsigned penalty = mismatched_half ? 3 : 0;
88       if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
89          /* special case, 3rd src to cat3 not required on first cycle */
90          return 1 + penalty;
91       } else {
92          return 3 + penalty;
93       }
94    }
95 }
96 
97 unsigned
ir3_delayslots_with_repeat(struct ir3_compiler * compiler,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned assigner_n,unsigned consumer_n)98 ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
99                            struct ir3_instruction *assigner,
100                            struct ir3_instruction *consumer,
101                            unsigned assigner_n, unsigned consumer_n)
102 {
103    unsigned delay = ir3_delayslots(compiler, assigner, consumer, consumer_n, false);
104 
105    struct ir3_register *src = consumer->srcs[consumer_n];
106    struct ir3_register *dst = assigner->dsts[assigner_n];
107 
108    if (assigner->repeat == 0 && consumer->repeat == 0)
109       return delay;
110 
111    unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
112    unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
113 
114    /* If either side is a relative access, we can't really apply most of the
115     * reasoning below because we don't know which component aliases which.
116     * Just bail in this case.
117     */
118    if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
119       return delay;
120 
121    /* MOVMSK seems to require that all users wait until the entire
122     * instruction is finished, so just bail here.
123     */
124    if (assigner->opc == OPC_MOVMSK)
125       return delay;
126 
127    /* TODO: Handle the combination of (rpt) and different component sizes
128     * better like below. This complicates things significantly because the
129     * components don't line up.
130     */
131    if ((src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF))
132       return delay;
133 
134    /* If an instruction has a (rpt), then it acts as a sequence of
135     * instructions, reading its non-(r) sources at each cycle. First, get the
136     * register num for the first instruction where they interfere:
137     */
138 
139    unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
140 
141    /* Now, for that first conflicting half/full register, figure out the
142     * sub-instruction within assigner/consumer it corresponds to. For (r)
143     * sources, this should already return the correct answer of 0. However we
144     * have to special-case the multi-mov instructions, where the
145     * sub-instructions sometimes come from the src/dst indices instead.
146     */
147    unsigned first_src_instr;
148    if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
149       first_src_instr = consumer_n;
150    else
151       first_src_instr = first_num - src->num;
152 
153    unsigned first_dst_instr;
154    if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
155       first_dst_instr = assigner_n;
156    else
157       first_dst_instr = first_num - dst->num;
158 
159    /* The delay we return is relative to the *end* of assigner and the
160     * *beginning* of consumer, because it's the number of nops (or other
161     * things) needed between them. Any instructions after first_dst_instr
162     * subtract from the delay, and so do any instructions before
163     * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
164     * delay to account for that.
165     *
166     * Now, a priori, we need to go through this process for every
167     * conflicting regnum and take the minimum of the offsets to make sure
168     * that the appropriate number of nop's is inserted for every conflicting
169     * pair of sub-instructions. However, as we go to the next conflicting
170     * regnum (if any), the number of instructions after first_dst_instr
171     * decreases by 1 and the number of source instructions before
172     * first_src_instr correspondingly increases by 1, so the offset stays the
173     * same for all conflicting registers.
174     */
175    unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
176    return offset > delay ? 0 : delay - offset;
177 }
178 
179