xref: /aosp_15_r20/external/mesa3d/src/compiler/nir/nir_use_dominance.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2014 Intel Corporation
3  * Copyright 2023 Advanced Micro Devices, Inc.
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 /* This implements dominance and post-dominance of the SSA use graph where
9  * instructions are vertices and SSA uses are edges (i.e. edges go from
10  * each instruction to all its uses). CF nodes are ignored and irrelevant.
11  * It's different from nir_dominance.c, but the algorithm is the same, which
12  * is from "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy.
13  *
14  * Definitions:
15  * - Instruction A is post-dominated by instruction B if the result of
16  *   instruction A and following intermediate results using the result of
17  *   instruction A only affect the result of instruction B. Consequently,
18  *   if instruction B was removed, instruction A would become dead including
19  *   all instructions computing the intermediate results.
20  *   Example: A(load) -> ... -> B(ALU)
21  *   Note: This is the foundation of inter-shader code motion from later
22  *   shaders to earlier shaders.
23  * - Instruction B is dominated by instruction A if all use paths from
24  *   all loads to instruction B must go through instruction A.
25  *   Note: Unlike post-dominance, dominance is unusable as-is because
26  *   the immediate dominator typically doesn't exist if there are non-unary
27  *   opcodes (i.e. branches of an expression tree following source operands
28  *   don't usually converge to a single instruction unless all instructions
29  *   are unary). The solution is to ignore loads like load_const to allow
30  *   non-unary opcodes, which would be the foundation of inter-shader code
31  *   motion from earlier shaders to later shaders, such as 2 output stores
32  *   having only 1 ALU instruction as their only source at the beginning,
33  *   ignoring constant and uniform operands along the way.
34  *
35  * Interesting cases implied by this (post-)dominator tree:
36  * - load_const, loads without src operands, and undef are not dominated by
37  *   anything because they don't have any src operands.
38  * - No instruction post-dominates store intrinsics (and all other intrinsics
39  *   without a destination) and nir_if nodes (they use a value but don't
40  *   produce any).
41  *
42  * Typical application:
43  * - The immediate post-dominator query returns the solution to the problem of
44  *   how much code we can move into the previous shader or preamble without
45  *   increasing the number of inputs. Example of an SSA-use graph and
46  *   the possible result that a user of this utility can produce:
47  *
48  *          input0 input1             input0 input1
49  *              \   / \                  |      \
50  *    constant   alu  ...    ------>     |     ...
51  *           \   /
52  *            alu
53  * (immediate post-dominator of input0)
54  *
55  * Examples of possible applications:
56  * - Moving load_input+ALU to the previous shader: An immediate post-dominator
57  *   of load_input and all instructions between load_input and the immediate
58  *   post-dominator are a candidate for being moved into the previous shader
59  *   and we only need to check if the post-dominator is movable. Repeat
60  *   the immediate post-dominator query on the accepted post-dominator and see
61  *   if that is also movable. Repeat that until you find the farthest post-
62  *   dominator that is movable.
63  * - Moving load_uniform+ALU to a preamble shader or the CPU: An immediate
64  *   post-dominator of load_uniform is a candidate for being moved into
65  *   the preamble shader or the CPU. Repeat the immediate post-dominator query
66  *   until you find the farthest post-dominator that is movable.
67  * - Replacing a value used to compute 2 shader outputs by only 1 output, and
68  *   moving the computation into the next shader:
69  *   The Lowest Common Ancestor of 2 output stores within the dominator tree
70  *   is a candidate for the new replacement output. Any loads that are
71  *   trivially movable such as load_const should be ignored by this utility,
72  *   otherwise the Lowest Common Ancestor wouldn't exist.
73  *
74  * Queries:
75  * - get the immediate dominator of an instruction
76  * - get the Lowest Common Ancestor of 2 instructions
77  * - whether one instruction dominates another
78  *
79  * Implemenation details:
80  * - Since some instructions are not dominated by anything, a dummy root is
81  *   added into the graph that dominates such instructions, which is required
82  *   by the algorithm.
83  *
84  * TODO: only post-dominance implemented, not dominance
85  */
86 
87 #include "nir.h"
88 
89 struct nir_use_dom_node {
90    nir_instr *instr;
91    uint32_t index;
92 
93    /* The index of this node's immediate dominator in the dominator tree.
94     * The dummy root points it to itself. -1 == unset.
95     */
96    int32_t imm_dom;
97 };
98 
99 struct nir_use_dominance_state {
100    nir_function_impl *impl;
101    struct nir_use_dom_node *dom_nodes;
102    unsigned num_dom_nodes;
103 };
104 
105 static struct nir_use_dom_node *
get_node(struct nir_use_dominance_state * state,nir_instr * instr)106 get_node(struct nir_use_dominance_state *state, nir_instr *instr)
107 {
108    return &state->dom_nodes[instr->index];
109 }
110 
111 static struct nir_use_dom_node *
get_imm_dom(struct nir_use_dominance_state * state,struct nir_use_dom_node * node)112 get_imm_dom(struct nir_use_dominance_state *state,
113             struct nir_use_dom_node *node)
114 {
115    assert(node->imm_dom != -1);
116    return &state->dom_nodes[node->imm_dom];
117 }
118 
119 static bool
init_instr(struct nir_use_dominance_state * state,nir_instr * instr,unsigned * index)120 init_instr(struct nir_use_dominance_state *state, nir_instr *instr,
121            unsigned *index)
122 {
123    assert(*index < state->num_dom_nodes);
124    struct nir_use_dom_node *node = &state->dom_nodes[*index];
125 
126    if (*index == 0) {
127       /* dummy root */
128       node->imm_dom = 0;
129    } else {
130       node->imm_dom = -1;
131       node->instr = instr;
132       instr->index = node->index = *index;
133    }
134    (*index)++;
135 
136    return true;
137 }
138 
139 static struct nir_use_dom_node *
intersect(struct nir_use_dominance_state * state,struct nir_use_dom_node * i1,struct nir_use_dom_node * i2)140 intersect(struct nir_use_dominance_state *state, struct nir_use_dom_node *i1,
141           struct nir_use_dom_node *i2)
142 {
143    while (i1 != i2) {
144       /* Note, the comparisons here are the opposite of what the paper says
145        * because we index instrs from beginning -> end (i.e. reverse
146        * post-order) instead of post-order like they assume.
147        */
148       while (i1->index > i2->index)
149          i1 = get_imm_dom(state, i1);
150       while (i2->index > i1->index)
151          i2 = get_imm_dom(state, i2);
152    }
153 
154    return i1;
155 }
156 
157 static void
update_imm_dom(struct nir_use_dominance_state * state,struct nir_use_dom_node * pred,struct nir_use_dom_node ** new_idom)158 update_imm_dom(struct nir_use_dominance_state *state,
159                struct nir_use_dom_node *pred,
160                struct nir_use_dom_node **new_idom)
161 {
162    if (pred->imm_dom != -1) {
163       if (*new_idom)
164          *new_idom = intersect(state, pred, *new_idom);
165       else
166          *new_idom = pred;
167    }
168 }
169 
170 static bool
calc_dominance(struct nir_use_dominance_state * state,struct nir_use_dom_node * node,bool post_dominance)171 calc_dominance(struct nir_use_dominance_state *state,
172                struct nir_use_dom_node *node, bool post_dominance)
173 {
174    struct nir_use_dom_node *new_idom = NULL;
175 
176    if (post_dominance) {
177       nir_def *def = nir_instr_def(node->instr);
178       bool has_use = false;
179 
180       /* Intrinsics that can't be reordered will get the root node as
181        * the post-dominator.
182        */
183       if (def &&
184           (node->instr->type != nir_instr_type_intrinsic ||
185            nir_intrinsic_can_reorder(nir_instr_as_intrinsic(node->instr)))) {
186          nir_foreach_use_including_if(src, def) {
187             has_use = true;
188 
189             /* Ifs are treated like stores because they don't produce
190              * a value. dom_nodes[0] is the dummy root.
191              */
192             if (nir_src_is_if(src)) {
193                update_imm_dom(state, &state->dom_nodes[0], &new_idom);
194                /* Short-cut because we can't come back from the root node. */
195                break;
196             } else {
197                update_imm_dom(state,
198                               get_node(state, nir_src_parent_instr(src)),
199                               &new_idom);
200             }
201          }
202       }
203 
204       /* No destination (e.g. stores, atomics with an unused result, discard,
205        * dead instructions). dom_nodes[0] is the dummy root.
206        */
207       if (!has_use)
208          update_imm_dom(state, &state->dom_nodes[0], &new_idom);
209    } else {
210       unreachable("TODO: only post-dominance implemented, not dominance");
211    }
212 
213    if (new_idom && node->imm_dom != new_idom->index) {
214       node->imm_dom = new_idom->index;
215       return true;
216    }
217 
218    return false;
219 }
220 
221 /**
222  * Calculate dominance or post-dominance of the SSA use graph.
223  * The returned state must not be freed while dominance queries are being used.
224  * ralloc_free() frees the state.
225  *
226  * It clobbers nir_instr::index, which can't be changed while dominance queries
227  * are being used.
228  *
229  * \param impl             NIR function
230  * \param post_dominance   Whether to compute post-dominance or dominance.
231  */
232 struct nir_use_dominance_state *
nir_calc_use_dominance_impl(nir_function_impl * impl,bool post_dominance)233 nir_calc_use_dominance_impl(nir_function_impl *impl, bool post_dominance)
234 {
235    struct nir_use_dominance_state *state =
236       rzalloc(NULL, struct nir_use_dominance_state);
237    if (!state)
238       return NULL;
239 
240    unsigned num_dom_nodes = 1; /* including the dummy root */
241    nir_foreach_block(block, impl) {
242       num_dom_nodes += exec_list_length(&block->instr_list);
243    }
244 
245    state->impl = impl;
246    state->num_dom_nodes = num_dom_nodes;
247    state->dom_nodes = rzalloc_array(state, struct nir_use_dom_node,
248                                     num_dom_nodes);
249    if (!state->dom_nodes) {
250       ralloc_free(state);
251       return NULL;
252    }
253 
254    unsigned index = 0;
255 
256    /* We need a dummy root node because there are instructions such as
257     * load_const that aren't dominated by anything. If we are calculating
258     * post-dominance, intrinsics without a destination aren't post-dominated
259     * by anything. However, the algorithm requires a common (post-)dominator.
260     */
261    init_instr(state, NULL, &index);
262 
263    /* Post-dominance is identical to dominance, but instructions are added
264     * in the opposite order.
265     */
266    if (post_dominance) {
267       nir_foreach_block_reverse(block, impl) {
268          nir_foreach_instr_reverse(instr, block) {
269             init_instr(state, instr, &index);
270          }
271       }
272    } else {
273       nir_foreach_block(block, impl) {
274          nir_foreach_instr(instr, block) {
275             init_instr(state, instr, &index);
276          }
277       }
278    }
279 
280    bool progress = true;
281    while (progress) {
282       progress = false;
283 
284       /* Skip the dummy root (iterate from 1). */
285       for (unsigned i = 1; i < num_dom_nodes; i++) {
286          progress |= calc_dominance(state, &state->dom_nodes[i],
287                                     post_dominance);
288       }
289    }
290 
291    return state;
292 }
293 
294 nir_instr *
nir_get_immediate_use_dominator(struct nir_use_dominance_state * state,nir_instr * instr)295 nir_get_immediate_use_dominator(struct nir_use_dominance_state *state,
296                                 nir_instr *instr)
297 {
298    struct nir_use_dom_node *node = get_node(state, instr);
299 
300    return get_imm_dom(state, node)->instr;
301 }
302 
303 /**
304  * Computes the least common ancestor of two instructions.
305  */
306 nir_instr *
nir_use_dominance_lca(struct nir_use_dominance_state * state,nir_instr * i1,nir_instr * i2)307 nir_use_dominance_lca(struct nir_use_dominance_state *state,
308                       nir_instr *i1, nir_instr *i2)
309 {
310    assert(i1 && i2);
311    struct nir_use_dom_node *lca = intersect(state, get_node(state, i1),
312                                             get_node(state, i2));
313    assert(lca);
314    /* Might be NULL in case of the dummy root. */
315    return lca->instr;
316 }
317 
318 /**
319  * Returns true if the parent dominates the child in the SSA use graph
320  * described at the beginning.
321  */
322 bool
nir_instr_dominates_use(struct nir_use_dominance_state * state,nir_instr * parent_instr,nir_instr * child_instr)323 nir_instr_dominates_use(struct nir_use_dominance_state *state,
324                         nir_instr *parent_instr, nir_instr *child_instr)
325 {
326    struct nir_use_dom_node *parent = get_node(state, parent_instr);
327    struct nir_use_dom_node *child = get_node(state, child_instr);
328 
329    while (parent->index < child->index)
330       child = get_imm_dom(state, child);
331 
332    return parent == child;
333 }
334 
335 static void
print_instr(struct nir_use_dom_node * node)336 print_instr(struct nir_use_dom_node *node)
337 {
338    if (!node)
339       printf("NULL - bug");
340    else if (node->index == 0)
341       printf("dummy_root");
342    else
343       nir_print_instr(node->instr, stdout);
344 }
345 
346 void
nir_print_use_dominators(struct nir_use_dominance_state * state,nir_instr ** instructions,unsigned num_instructions)347 nir_print_use_dominators(struct nir_use_dominance_state *state,
348                          nir_instr **instructions, unsigned num_instructions)
349 {
350    for (unsigned i = 0; i < num_instructions; i++) {
351       printf("Input idom(\"");
352       nir_print_instr(instructions[i], stdout);
353       printf("\") = \"");
354       print_instr(get_imm_dom(state, get_node(state, instructions[i])));
355       printf("\"\n");
356    }
357    puts("");
358 
359    nir_foreach_block(block, state->impl) {
360       nir_foreach_instr(instr, block) {
361          printf("idom(\"");
362          nir_print_instr(instr, stdout);
363          printf("\") = \"");
364          print_instr(get_imm_dom(state, get_node(state, instr)));
365          printf("\"\n");
366       }
367    }
368    puts("");
369 
370    for (unsigned i = 0; i < num_instructions; i++) {
371       for (unsigned j = i + 1; j < num_instructions; j++) {
372          printf("LCA input 1: ");
373          nir_print_instr(instructions[i], stdout);
374          printf("\nLCA input 2: ");
375          nir_print_instr(instructions[j], stdout);
376          puts("");
377          nir_instr *lca =
378             nir_use_dominance_lca(state, instructions[i], instructions[j]);
379 
380          if (lca) {
381             printf("2 inputs have a common post-dominator: ");
382             nir_print_instr(lca, stdout);
383             printf("\n");
384          }
385          puts("");
386       }
387    }
388 }
389