xref: /aosp_15_r20/external/mesa3d/src/panfrost/compiler/bi_helper_invocations.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2019-2021 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *    Alyssa Rosenzweig <[email protected]>
25  */
26 
27 #include "compiler.h"
28 
29 /* Bifrost texture operations have a `skip` bit, instructing helper invocations
30  * to skip execution. Each clause has a `terminate_discarded_threads` bit,
31  * which will terminate helper invocations.
32  *
33  * The terminate bit should be set on the last clause requiring helper
34  * invocations. Without control flow, that's the last source-order instruction;
35  * with control flow, there may be multiple such instructions (with ifs) or no
36  * such instruction (with loops).
37  *
38  * The skip bit should be set unless the value of this instruction is required
39  * by a future instruction requiring helper invocations. Consider:
40  *
41  *      0 = texture ...
42  *      1 = fmul 0, #10
43  *      2 = dfdx 1
44  *      store 2
45  *
46  * Since the derivative calculation 2 requires helper invocations, the value 1
47  * must be calculated by helper invocations, and since it depends on 0, 0 must
48  * be calculated by helpers. Hence the texture op does NOT have the skip bit
49  * set, and the clause containing the derivative has the terminate bit set.
50  *
51  * Calculating the terminate bit occurs by forward dataflow analysis to
52  * determine which blocks require helper invocations. A block requires
53  * invocations if any of its instructions use helper invocations, or if it
54  * depends on a block that requires invocation. With that analysis, the
55  * terminate bit is set on the last instruction using invocations within any
56  * block that does *not* require invocations out.
57  *
58  * Likewise, calculating the execute bit requires backward dataflow analysis
59  * with union as the join operation and the generating set being the union of
60  * sources of instructions writing executed values. The skip bit is the inverse
61  * of the execute bit.
62  */
63 
64 static bool
bi_has_skip_bit(enum bi_opcode op)65 bi_has_skip_bit(enum bi_opcode op)
66 {
67    switch (op) {
68    case BI_OPCODE_TEX_SINGLE:
69    case BI_OPCODE_TEXC:
70    case BI_OPCODE_TEXC_DUAL:
71    case BI_OPCODE_TEXS_2D_F16:
72    case BI_OPCODE_TEXS_2D_F32:
73    case BI_OPCODE_TEXS_CUBE_F16:
74    case BI_OPCODE_TEXS_CUBE_F32:
75    case BI_OPCODE_VAR_TEX_F16:
76    case BI_OPCODE_VAR_TEX_F32:
77       return true;
78    default:
79       return false;
80    }
81 }
82 
83 /* Does a given instruction require helper threads to be active (because it
84  * reads from other subgroup lanes)? This only applies to fragment shaders.
85  * Other shader stages do not have a notion of helper threads. */
86 
87 bool
bi_instr_uses_helpers(bi_instr * I)88 bi_instr_uses_helpers(bi_instr *I)
89 {
90    switch (I->op) {
91    case BI_OPCODE_TEXC:
92    case BI_OPCODE_TEXC_DUAL:
93    case BI_OPCODE_TEXS_2D_F16:
94    case BI_OPCODE_TEXS_2D_F32:
95    case BI_OPCODE_TEXS_CUBE_F16:
96    case BI_OPCODE_TEXS_CUBE_F32:
97    case BI_OPCODE_VAR_TEX_F16:
98    case BI_OPCODE_VAR_TEX_F32:
99       return !I->lod_mode; /* set for zero, clear for computed */
100    case BI_OPCODE_TEX_SINGLE:
101       return (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_LOD) ||
102              (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_BIAS);
103    case BI_OPCODE_CLPER_I32:
104    case BI_OPCODE_CLPER_OLD_I32:
105       /* Fragment shaders require helpers to implement derivatives.
106        * Other shader stages don't have helpers at all */
107       return true;
108    default:
109       return false;
110    }
111 }
112 
113 static void
bi_add_branch_compare_values(const bi_instr * I,BITSET_WORD * deps)114 bi_add_branch_compare_values(const bi_instr *I, BITSET_WORD *deps)
115 {
116    switch (I->op) {
117    case BI_OPCODE_BRANCHZI:
118    case BI_OPCODE_BRANCHC_I16:
119    case BI_OPCODE_BRANCHC_I32:
120       BITSET_SET(deps, I->src[0].value);
121       break;
122    case BI_OPCODE_BRANCH_F16:
123    case BI_OPCODE_BRANCH_F32:
124    case BI_OPCODE_BRANCH_I16:
125    case BI_OPCODE_BRANCH_I32:
126    case BI_OPCODE_BRANCH_S16:
127    case BI_OPCODE_BRANCH_S32:
128    case BI_OPCODE_BRANCH_U16:
129    case BI_OPCODE_BRANCH_U32:
130    case BI_OPCODE_BRANCHZ_F16:
131    case BI_OPCODE_BRANCHZ_F32:
132    case BI_OPCODE_BRANCHZ_I16:
133    case BI_OPCODE_BRANCHZ_I32:
134    case BI_OPCODE_BRANCHZ_S16:
135    case BI_OPCODE_BRANCHZ_S32:
136    case BI_OPCODE_BRANCHZ_U16:
137    case BI_OPCODE_BRANCHZ_U32:
138       BITSET_SET(deps, I->src[0].value);
139       BITSET_SET(deps, I->src[1].value);
140       break;
141    default:
142       break;
143    }
144 }
145 
146 /* Does a block use helpers directly */
147 static bool
bi_block_uses_helpers(bi_block * block)148 bi_block_uses_helpers(bi_block *block)
149 {
150    bi_foreach_instr_in_block(block, I) {
151       if (bi_instr_uses_helpers(I))
152          return true;
153    }
154 
155    return false;
156 }
157 
158 bool
bi_block_terminates_helpers(bi_block * block)159 bi_block_terminates_helpers(bi_block *block)
160 {
161    /* Can't terminate if a successor needs helpers */
162    bi_foreach_successor(block, succ) {
163       if (succ->pass_flags & 1)
164          return false;
165    }
166 
167    /* Otherwise we terminate */
168    return true;
169 }
170 
171 /*
172  * Propagate the pass flag up the control flow graph by performing depth-first
173  * search on the directed control flow graph.
174  */
175 static void
bi_propagate_pass_flag(bi_block * block)176 bi_propagate_pass_flag(bi_block *block)
177 {
178    block->pass_flags = 1;
179 
180    bi_foreach_predecessor(block, pred) {
181       if ((*pred)->pass_flags == 0)
182          bi_propagate_pass_flag(*pred);
183    }
184 }
185 
186 void
bi_analyze_helper_terminate(bi_context * ctx)187 bi_analyze_helper_terminate(bi_context *ctx)
188 {
189    /* Other shader stages do not have a notion of helper threads, so we
190     * can skip the analysis. Don't run for blend shaders, either, since
191     * they run in the context of another shader that we don't see. */
192    if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
193       return;
194 
195    /* Clear flags */
196    bi_foreach_block(ctx, block)
197       block->pass_flags = 0;
198 
199    /* For each block, check if it uses helpers and propagate that fact if
200     * so. We walk in reverse order to minimize the number of blocks tested:
201     * if the (unique) last block uses helpers, only that block is tested.
202     */
203    bi_foreach_block_rev(ctx, block) {
204       if (block->pass_flags == 0 && bi_block_uses_helpers(block))
205          bi_propagate_pass_flag(block);
206    }
207 }
208 
209 void
bi_mark_clauses_td(bi_context * ctx)210 bi_mark_clauses_td(bi_context *ctx)
211 {
212    if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
213       return;
214 
215    /* Finally, mark clauses requiring helpers */
216    bi_foreach_block(ctx, block) {
217       /* At the end, there are helpers iff we don't terminate */
218       bool helpers = !bi_block_terminates_helpers(block);
219 
220       bi_foreach_clause_in_block_rev(block, clause) {
221          bi_foreach_instr_in_clause_rev(block, clause, I) {
222             helpers |= bi_instr_uses_helpers(I);
223          }
224 
225          clause->td = !helpers;
226       }
227    }
228 }
229 
230 static bool
bi_helper_block_update(BITSET_WORD * deps,bi_block * block)231 bi_helper_block_update(BITSET_WORD *deps, bi_block *block)
232 {
233    bool progress = false;
234 
235    bi_foreach_instr_in_block_rev(block, I) {
236       /* If a destination is required by helper invocation... */
237       bi_foreach_dest(I, d) {
238          if (!BITSET_TEST(deps, I->dest[d].value))
239             continue;
240 
241          /* ...so are the sources */
242          bi_foreach_ssa_src(I, s) {
243             progress |= !BITSET_TEST(deps, I->src[s].value);
244             BITSET_SET(deps, I->src[s].value);
245          }
246 
247          break;
248       }
249    }
250 
251    return progress;
252 }
253 
254 void
bi_analyze_helper_requirements(bi_context * ctx)255 bi_analyze_helper_requirements(bi_context *ctx)
256 {
257    BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), ctx->ssa_alloc);
258 
259    /* Initialize with the sources of instructions consuming
260     * derivatives and the sources of conditional branch instructions */
261 
262    bi_foreach_instr_global(ctx, I) {
263       if (bi_instr_uses_helpers(I)) {
264          bi_foreach_ssa_src(I, s)
265             BITSET_SET(deps, I->src[s].value);
266       } else {
267          bi_add_branch_compare_values(I, deps);
268       }
269    }
270 
271    /* Propagate that up */
272    bool progress;
273    do {
274       progress = false;
275       bi_foreach_block_rev(ctx, block)
276          progress |= bi_helper_block_update(deps, block);
277    } while (progress);
278 
279    /* Set the execute bits */
280 
281    bi_foreach_instr_global(ctx, I) {
282       if (!bi_has_skip_bit(I->op))
283          continue;
284 
285       bool exec = false;
286 
287       bi_foreach_dest(I, d)
288          exec |= BITSET_TEST(deps, I->dest[d].value);
289 
290       I->skip = !exec;
291    }
292 
293    free(deps);
294 }
295