xref: /aosp_15_r20/external/mesa3d/src/panfrost/compiler/valhall/va_merge_flow.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2022 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "bi_builder.h"
25 #include "va_compiler.h"
26 #include "valhall_enums.h"
27 
28 /*
29  * Merge NOPs with flow control with nearby instructions to eliminate the NOPs,
30  * according to the following rules:
31  *
32  * 1. Waits may be combined by waiting on a union of the slots.
33  * 2. Waits may be moved up as far as the first (last) asynchronous instruction
34  *    writing to a slot waited on, at a performance cost.
35  * 3. Discard may be moved down, at a performance cost.
36  * 4. Reconverge must be on the last instruction of the block.
37  * 5. End must be on the last instruction of the block.
38  *
39  * For simplicity, this pass only merges within a single basic block. This
40  * should be sufficient. The algorithm works as follows.
41  *
42  * Because reconverge/end must be on the last instruction, there may be only
43  * one of these in a block. First check for a NOP at the end, and if it has
44  * reconverge/end flow control, merge with the penultimate instruction. Now we
45  * only have to worry about waits and discard.
46  *
47  * The cost of moving a wait up is assumed to be greater than the cost of moving
48  * a discard down, so we next move waits while we have more freedom. For each
49  * wait, merge with an instruction above that either is free or has another
50  * wait, and merge the flow control, aborting if we hit a message signaling the
51  * relevant slot. By maintaining the candidate instruction at all steps, this is
52  * implemented in a linear time walk.
53  *
54  * Finally, all that's left are discards. Move discards down to merge with a
55  * free instruction with a similar linear time walk.
56  *
57  * Since discarding helper threads is an optimization (not required for
58  * correctness), we may remove discards if we believe them harmful to the
59  * performance of the shader. Keeping with the local structure of the pass, we
60  * remove discards only if they are at the end of the last block and cannot be
61  * merged with any other instruction.  Such a condition means every following
62  * instruction has incompatible flow control (wait and end). In practice, this
63  * means BLEND and ATEST run for helper threads in certain shaders to save a
64  * NOP, but BLEND is a no-op for helper threads anyway.
65  *
66  * One case we don't handle is merging "foo, bar, wait, reconverge" to
67  * "foo.wait, bar.reconverge". This sequence is rarely generated by the dataflow
68  * analysis, so we don't care for the complexity.
69  */
70 
71 /*
72  * If the last instruction has reconverge or end, try to merge with the
73  * second to last instruction.
74  *
75  * Precondition: block has at least 2 instructions.
76  */
77 static void
merge_end_reconverge(bi_block * block)78 merge_end_reconverge(bi_block *block)
79 {
80    bi_instr *last = list_last_entry(&block->instructions, bi_instr, link);
81    bi_instr *penult = bi_prev_op(last);
82 
83    if (last->op != BI_OPCODE_NOP)
84       return;
85    if (last->flow != VA_FLOW_RECONVERGE && last->flow != VA_FLOW_END)
86       return;
87 
88    /* End implies all other flow control except for waiting on barriers (slot
89     * #7, with VA_FLOW_WAIT), so remove blocking flow control.
90     */
91    if (last->flow == VA_FLOW_END) {
92       while (penult->op == BI_OPCODE_NOP && penult->flow != VA_FLOW_WAIT) {
93          bi_remove_instruction(penult);
94 
95          /* There may be nothing left */
96          if (list_is_singular(&block->instructions))
97             return;
98 
99          penult = bi_prev_op(last);
100       }
101    }
102 
103    /* If there is blocking flow control, we can't merge */
104    if (penult->flow != VA_FLOW_NONE)
105       return;
106 
107    /* Else, merge */
108    penult->flow = last->flow;
109    bi_remove_instruction(last);
110 }
111 
112 /*
113  * Calculate the union of two waits. We may wait on any combination of slots #0,
114  * #1, #2 or the entirety of 0126 and 01267. If we wait on the entirety, the
115  * union is trivial. If we do not wait on slot #6 (by extension slot #7), we
116  * wait only on slots #0, #1, and #2, for which the waits are encoded as a
117  * bitset and the union is just a bitwise OR.
118  */
119 static enum va_flow
union_waits(enum va_flow x,enum va_flow y)120 union_waits(enum va_flow x, enum va_flow y)
121 {
122    assert(va_flow_is_wait_or_none(x) && va_flow_is_wait_or_none(y));
123 
124    if ((x == VA_FLOW_WAIT) || (y == VA_FLOW_WAIT))
125       return VA_FLOW_WAIT;
126    else if ((x == VA_FLOW_WAIT0126) || (y == VA_FLOW_WAIT0126))
127       return VA_FLOW_WAIT0126;
128    else
129       return x | y;
130 }
131 
132 static void
merge_waits(bi_block * block)133 merge_waits(bi_block *block)
134 {
135    /* Most recent instruction with which we can merge, or NULL if none */
136    bi_instr *last_free = NULL;
137 
138    bi_foreach_instr_in_block_safe(block, I) {
139       if (last_free != NULL && I->op == BI_OPCODE_NOP &&
140           va_flow_is_wait_or_none(I->flow)) {
141 
142          /* Merge waits with compatible instructions */
143          last_free->flow = union_waits(last_free->flow, I->flow);
144          bi_remove_instruction(I);
145          continue;
146       }
147 
148       /* Don't move waits past async instructions, since they might be what
149        * we're waiting for. If we wanted to optimize this case, we could check
150        * the signaled slots.
151        */
152       if (bi_opcode_props[I->op].message)
153          last_free = NULL;
154 
155       /* We can only merge with instructions whose flow control is a wait.
156        * This includes such an instruction after merging in a wait. It also
157        * includes async instructions.
158        */
159       if (va_flow_is_wait_or_none(I->flow))
160          last_free = I;
161    }
162 }
163 
164 static bool
bi_is_first_instr(bi_block * block,bi_instr * I)165 bi_is_first_instr(bi_block *block, bi_instr *I)
166 {
167    return block->instructions.next == &I->link;
168 }
169 
170 static void
merge_discard(bi_block * block)171 merge_discard(bi_block *block)
172 {
173    bi_instr *last_free = NULL;
174 
175    bi_foreach_instr_in_block_safe_rev(block, I) {
176       if ((I->op == BI_OPCODE_NOP) && (I->flow == VA_FLOW_DISCARD)) {
177          /* Try to merge with the instruction *preceding* discard, because
178           * because flow control happens at the end of an instruction and
179           * discard is a NOP.
180           */
181          if (!bi_is_first_instr(block, I)) {
182             bi_instr *prev = bi_prev_op(I);
183 
184             if (prev->flow == VA_FLOW_NONE) {
185                prev->flow = VA_FLOW_DISCARD;
186                bi_remove_instruction(I);
187                continue;
188             }
189          }
190 
191          /* Or try to merge with the next instruction with no flow control */
192          if (last_free != NULL) {
193             last_free->flow = VA_FLOW_DISCARD;
194             bi_remove_instruction(I);
195             continue;
196          }
197 
198          /* If there's nowhere to merge and this is the end of the shader, just
199           * remove the discard.
200           */
201          if (bi_num_successors(block) == 0) {
202             bi_remove_instruction(I);
203             continue;
204          }
205       }
206 
207       /* Allow merging into free instructions */
208       if (I->flow == VA_FLOW_NONE)
209          last_free = I;
210    }
211 }
212 
213 void
va_merge_flow(bi_context * ctx)214 va_merge_flow(bi_context *ctx)
215 {
216    bi_foreach_block(ctx, block) {
217       /* If there are less than 2 instructions, there's nothing to merge */
218       if (list_is_empty(&block->instructions))
219          continue;
220       if (list_is_singular(&block->instructions))
221          continue;
222 
223       merge_end_reconverge(block);
224       merge_waits(block);
225 
226       if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
227          merge_discard(block);
228    }
229 }
230