1 /*
2 * Copyright 2014 Intel Corporation
3 * Copyright 2023 Advanced Micro Devices, Inc.
4 *
5 * SPDX-License-Identifier: MIT
6 */
7
8 /* This implements dominance and post-dominance of the SSA use graph where
9 * instructions are vertices and SSA uses are edges (i.e. edges go from
10 * each instruction to all its uses). CF nodes are ignored and irrelevant.
11 * It's different from nir_dominance.c, but the algorithm is the same, which
12 * is from "A Simple, Fast Dominance Algorithm" by Cooper, Harvey, and Kennedy.
13 *
14 * Definitions:
15 * - Instruction A is post-dominated by instruction B if the result of
16 * instruction A and following intermediate results using the result of
17 * instruction A only affect the result of instruction B. Consequently,
18 * if instruction B was removed, instruction A would become dead including
19 * all instructions computing the intermediate results.
20 * Example: A(load) -> ... -> B(ALU)
21 * Note: This is the foundation of inter-shader code motion from later
22 * shaders to earlier shaders.
23 * - Instruction B is dominated by instruction A if all use paths from
24 * all loads to instruction B must go through instruction A.
25 * Note: Unlike post-dominance, dominance is unusable as-is because
26 * the immediate dominator typically doesn't exist if there are non-unary
27 * opcodes (i.e. branches of an expression tree following source operands
28 * don't usually converge to a single instruction unless all instructions
29 * are unary). The solution is to ignore loads like load_const to allow
30 * non-unary opcodes, which would be the foundation of inter-shader code
31 * motion from earlier shaders to later shaders, such as 2 output stores
32 * having only 1 ALU instruction as their only source at the beginning,
33 * ignoring constant and uniform operands along the way.
34 *
35 * Interesting cases implied by this (post-)dominator tree:
36 * - load_const, loads without src operands, and undef are not dominated by
37 * anything because they don't have any src operands.
38 * - No instruction post-dominates store intrinsics (and all other intrinsics
39 * without a destination) and nir_if nodes (they use a value but don't
40 * produce any).
41 *
42 * Typical application:
43 * - The immediate post-dominator query returns the solution to the problem of
44 * how much code we can move into the previous shader or preamble without
45 * increasing the number of inputs. Example of an SSA-use graph and
46 * the possible result that a user of this utility can produce:
47 *
48 * input0 input1 input0 input1
49 * \ / \ | \
50 * constant alu ... ------> | ...
51 * \ /
52 * alu
53 * (immediate post-dominator of input0)
54 *
55 * Examples of possible applications:
56 * - Moving load_input+ALU to the previous shader: An immediate post-dominator
57 * of load_input and all instructions between load_input and the immediate
58 * post-dominator are a candidate for being moved into the previous shader
59 * and we only need to check if the post-dominator is movable. Repeat
60 * the immediate post-dominator query on the accepted post-dominator and see
61 * if that is also movable. Repeat that until you find the farthest post-
62 * dominator that is movable.
63 * - Moving load_uniform+ALU to a preamble shader or the CPU: An immediate
64 * post-dominator of load_uniform is a candidate for being moved into
65 * the preamble shader or the CPU. Repeat the immediate post-dominator query
66 * until you find the farthest post-dominator that is movable.
67 * - Replacing a value used to compute 2 shader outputs by only 1 output, and
68 * moving the computation into the next shader:
69 * The Lowest Common Ancestor of 2 output stores within the dominator tree
70 * is a candidate for the new replacement output. Any loads that are
71 * trivially movable such as load_const should be ignored by this utility,
72 * otherwise the Lowest Common Ancestor wouldn't exist.
73 *
74 * Queries:
75 * - get the immediate dominator of an instruction
76 * - get the Lowest Common Ancestor of 2 instructions
77 * - whether one instruction dominates another
78 *
79 * Implemenation details:
80 * - Since some instructions are not dominated by anything, a dummy root is
81 * added into the graph that dominates such instructions, which is required
82 * by the algorithm.
83 *
84 * TODO: only post-dominance implemented, not dominance
85 */
86
87 #include "nir.h"
88
89 struct nir_use_dom_node {
90 nir_instr *instr;
91 uint32_t index;
92
93 /* The index of this node's immediate dominator in the dominator tree.
94 * The dummy root points it to itself. -1 == unset.
95 */
96 int32_t imm_dom;
97 };
98
99 struct nir_use_dominance_state {
100 nir_function_impl *impl;
101 struct nir_use_dom_node *dom_nodes;
102 unsigned num_dom_nodes;
103 };
104
105 static struct nir_use_dom_node *
get_node(struct nir_use_dominance_state * state,nir_instr * instr)106 get_node(struct nir_use_dominance_state *state, nir_instr *instr)
107 {
108 return &state->dom_nodes[instr->index];
109 }
110
111 static struct nir_use_dom_node *
get_imm_dom(struct nir_use_dominance_state * state,struct nir_use_dom_node * node)112 get_imm_dom(struct nir_use_dominance_state *state,
113 struct nir_use_dom_node *node)
114 {
115 assert(node->imm_dom != -1);
116 return &state->dom_nodes[node->imm_dom];
117 }
118
119 static bool
init_instr(struct nir_use_dominance_state * state,nir_instr * instr,unsigned * index)120 init_instr(struct nir_use_dominance_state *state, nir_instr *instr,
121 unsigned *index)
122 {
123 assert(*index < state->num_dom_nodes);
124 struct nir_use_dom_node *node = &state->dom_nodes[*index];
125
126 if (*index == 0) {
127 /* dummy root */
128 node->imm_dom = 0;
129 } else {
130 node->imm_dom = -1;
131 node->instr = instr;
132 instr->index = node->index = *index;
133 }
134 (*index)++;
135
136 return true;
137 }
138
139 static struct nir_use_dom_node *
intersect(struct nir_use_dominance_state * state,struct nir_use_dom_node * i1,struct nir_use_dom_node * i2)140 intersect(struct nir_use_dominance_state *state, struct nir_use_dom_node *i1,
141 struct nir_use_dom_node *i2)
142 {
143 while (i1 != i2) {
144 /* Note, the comparisons here are the opposite of what the paper says
145 * because we index instrs from beginning -> end (i.e. reverse
146 * post-order) instead of post-order like they assume.
147 */
148 while (i1->index > i2->index)
149 i1 = get_imm_dom(state, i1);
150 while (i2->index > i1->index)
151 i2 = get_imm_dom(state, i2);
152 }
153
154 return i1;
155 }
156
157 static void
update_imm_dom(struct nir_use_dominance_state * state,struct nir_use_dom_node * pred,struct nir_use_dom_node ** new_idom)158 update_imm_dom(struct nir_use_dominance_state *state,
159 struct nir_use_dom_node *pred,
160 struct nir_use_dom_node **new_idom)
161 {
162 if (pred->imm_dom != -1) {
163 if (*new_idom)
164 *new_idom = intersect(state, pred, *new_idom);
165 else
166 *new_idom = pred;
167 }
168 }
169
170 static bool
calc_dominance(struct nir_use_dominance_state * state,struct nir_use_dom_node * node,bool post_dominance)171 calc_dominance(struct nir_use_dominance_state *state,
172 struct nir_use_dom_node *node, bool post_dominance)
173 {
174 struct nir_use_dom_node *new_idom = NULL;
175
176 if (post_dominance) {
177 nir_def *def = nir_instr_def(node->instr);
178 bool has_use = false;
179
180 /* Intrinsics that can't be reordered will get the root node as
181 * the post-dominator.
182 */
183 if (def &&
184 (node->instr->type != nir_instr_type_intrinsic ||
185 nir_intrinsic_can_reorder(nir_instr_as_intrinsic(node->instr)))) {
186 nir_foreach_use_including_if(src, def) {
187 has_use = true;
188
189 /* Ifs are treated like stores because they don't produce
190 * a value. dom_nodes[0] is the dummy root.
191 */
192 if (nir_src_is_if(src)) {
193 update_imm_dom(state, &state->dom_nodes[0], &new_idom);
194 /* Short-cut because we can't come back from the root node. */
195 break;
196 } else {
197 update_imm_dom(state,
198 get_node(state, nir_src_parent_instr(src)),
199 &new_idom);
200 }
201 }
202 }
203
204 /* No destination (e.g. stores, atomics with an unused result, discard,
205 * dead instructions). dom_nodes[0] is the dummy root.
206 */
207 if (!has_use)
208 update_imm_dom(state, &state->dom_nodes[0], &new_idom);
209 } else {
210 unreachable("TODO: only post-dominance implemented, not dominance");
211 }
212
213 if (new_idom && node->imm_dom != new_idom->index) {
214 node->imm_dom = new_idom->index;
215 return true;
216 }
217
218 return false;
219 }
220
221 /**
222 * Calculate dominance or post-dominance of the SSA use graph.
223 * The returned state must not be freed while dominance queries are being used.
224 * ralloc_free() frees the state.
225 *
226 * It clobbers nir_instr::index, which can't be changed while dominance queries
227 * are being used.
228 *
229 * \param impl NIR function
230 * \param post_dominance Whether to compute post-dominance or dominance.
231 */
232 struct nir_use_dominance_state *
nir_calc_use_dominance_impl(nir_function_impl * impl,bool post_dominance)233 nir_calc_use_dominance_impl(nir_function_impl *impl, bool post_dominance)
234 {
235 struct nir_use_dominance_state *state =
236 rzalloc(NULL, struct nir_use_dominance_state);
237 if (!state)
238 return NULL;
239
240 unsigned num_dom_nodes = 1; /* including the dummy root */
241 nir_foreach_block(block, impl) {
242 num_dom_nodes += exec_list_length(&block->instr_list);
243 }
244
245 state->impl = impl;
246 state->num_dom_nodes = num_dom_nodes;
247 state->dom_nodes = rzalloc_array(state, struct nir_use_dom_node,
248 num_dom_nodes);
249 if (!state->dom_nodes) {
250 ralloc_free(state);
251 return NULL;
252 }
253
254 unsigned index = 0;
255
256 /* We need a dummy root node because there are instructions such as
257 * load_const that aren't dominated by anything. If we are calculating
258 * post-dominance, intrinsics without a destination aren't post-dominated
259 * by anything. However, the algorithm requires a common (post-)dominator.
260 */
261 init_instr(state, NULL, &index);
262
263 /* Post-dominance is identical to dominance, but instructions are added
264 * in the opposite order.
265 */
266 if (post_dominance) {
267 nir_foreach_block_reverse(block, impl) {
268 nir_foreach_instr_reverse(instr, block) {
269 init_instr(state, instr, &index);
270 }
271 }
272 } else {
273 nir_foreach_block(block, impl) {
274 nir_foreach_instr(instr, block) {
275 init_instr(state, instr, &index);
276 }
277 }
278 }
279
280 bool progress = true;
281 while (progress) {
282 progress = false;
283
284 /* Skip the dummy root (iterate from 1). */
285 for (unsigned i = 1; i < num_dom_nodes; i++) {
286 progress |= calc_dominance(state, &state->dom_nodes[i],
287 post_dominance);
288 }
289 }
290
291 return state;
292 }
293
294 nir_instr *
nir_get_immediate_use_dominator(struct nir_use_dominance_state * state,nir_instr * instr)295 nir_get_immediate_use_dominator(struct nir_use_dominance_state *state,
296 nir_instr *instr)
297 {
298 struct nir_use_dom_node *node = get_node(state, instr);
299
300 return get_imm_dom(state, node)->instr;
301 }
302
303 /**
304 * Computes the least common ancestor of two instructions.
305 */
306 nir_instr *
nir_use_dominance_lca(struct nir_use_dominance_state * state,nir_instr * i1,nir_instr * i2)307 nir_use_dominance_lca(struct nir_use_dominance_state *state,
308 nir_instr *i1, nir_instr *i2)
309 {
310 assert(i1 && i2);
311 struct nir_use_dom_node *lca = intersect(state, get_node(state, i1),
312 get_node(state, i2));
313 assert(lca);
314 /* Might be NULL in case of the dummy root. */
315 return lca->instr;
316 }
317
318 /**
319 * Returns true if the parent dominates the child in the SSA use graph
320 * described at the beginning.
321 */
322 bool
nir_instr_dominates_use(struct nir_use_dominance_state * state,nir_instr * parent_instr,nir_instr * child_instr)323 nir_instr_dominates_use(struct nir_use_dominance_state *state,
324 nir_instr *parent_instr, nir_instr *child_instr)
325 {
326 struct nir_use_dom_node *parent = get_node(state, parent_instr);
327 struct nir_use_dom_node *child = get_node(state, child_instr);
328
329 while (parent->index < child->index)
330 child = get_imm_dom(state, child);
331
332 return parent == child;
333 }
334
335 static void
print_instr(struct nir_use_dom_node * node)336 print_instr(struct nir_use_dom_node *node)
337 {
338 if (!node)
339 printf("NULL - bug");
340 else if (node->index == 0)
341 printf("dummy_root");
342 else
343 nir_print_instr(node->instr, stdout);
344 }
345
346 void
nir_print_use_dominators(struct nir_use_dominance_state * state,nir_instr ** instructions,unsigned num_instructions)347 nir_print_use_dominators(struct nir_use_dominance_state *state,
348 nir_instr **instructions, unsigned num_instructions)
349 {
350 for (unsigned i = 0; i < num_instructions; i++) {
351 printf("Input idom(\"");
352 nir_print_instr(instructions[i], stdout);
353 printf("\") = \"");
354 print_instr(get_imm_dom(state, get_node(state, instructions[i])));
355 printf("\"\n");
356 }
357 puts("");
358
359 nir_foreach_block(block, state->impl) {
360 nir_foreach_instr(instr, block) {
361 printf("idom(\"");
362 nir_print_instr(instr, stdout);
363 printf("\") = \"");
364 print_instr(get_imm_dom(state, get_node(state, instr)));
365 printf("\"\n");
366 }
367 }
368 puts("");
369
370 for (unsigned i = 0; i < num_instructions; i++) {
371 for (unsigned j = i + 1; j < num_instructions; j++) {
372 printf("LCA input 1: ");
373 nir_print_instr(instructions[i], stdout);
374 printf("\nLCA input 2: ");
375 nir_print_instr(instructions[j], stdout);
376 puts("");
377 nir_instr *lca =
378 nir_use_dominance_lca(state, instructions[i], instructions[j]);
379
380 if (lca) {
381 printf("2 inputs have a common post-dominator: ");
382 nir_print_instr(lca, stdout);
383 printf("\n");
384 }
385 puts("");
386 }
387 }
388 }
389