/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "bi_builder.h"
#include "compiler.h"

/* This optimization pass, intended to run once after code emission but before
 * copy propagation, analyzes direct word-aligned UBO reads and promotes a
 * subset to moves from FAU. It is the sole populator of the UBO push data
 * structure returned to the command stream. */
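/* For example, a load whose byte offset and UBO index are both compile-time
 * constants, with the offset a multiple of 4 (see bi_is_direct_aligned_ubo
 * below), can be rewritten as moves from FAU once its words are pushed;
 * indirect or unaligned loads are left as real memory accesses. */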

static bool
bi_is_ubo(bi_instr *ins)
{
   return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) &&
          (ins->seg == BI_SEG_UBO);
}

static bool
bi_is_direct_aligned_ubo(bi_instr *ins)
{
   return bi_is_ubo(ins) && (ins->src[0].type == BI_INDEX_CONSTANT) &&
          (ins->src[1].type == BI_INDEX_CONSTANT) &&
          ((ins->src[0].value & 0x3) == 0);
}

/* Represents use data for a single UBO */

#define MAX_UBO_WORDS (65536 / 16)

struct bi_ubo_block {
   BITSET_DECLARE(pushed, MAX_UBO_WORDS);
   uint8_t range[MAX_UBO_WORDS];
};

struct bi_ubo_analysis {
   /* Per block analysis */
   unsigned nr_blocks;
   struct bi_ubo_block *blocks;
};

static struct bi_ubo_analysis
bi_analyze_ranges(bi_context *ctx)
{
   struct bi_ubo_analysis res = {
      .nr_blocks = ctx->nir->info.num_ubos + 1,
   };

   res.blocks = calloc(res.nr_blocks, sizeof(struct bi_ubo_block));

   bi_foreach_instr_global(ctx, ins) {
      if (!bi_is_direct_aligned_ubo(ins))
         continue;

      unsigned ubo = pan_res_handle_get_index(ins->src[1].value);
      unsigned word = ins->src[0].value / 4;
      unsigned channels = bi_opcode_props[ins->op].sr_count;

      assert(ubo < res.nr_blocks);
      assert(channels > 0 && channels <= 4);

      if (word >= MAX_UBO_WORDS)
         continue;

      /* Must use max if the same base is read with different channel
       * counts, which is possible with nir_opt_shrink_vectors */
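      /* (e.g. a vec2 load and a later vec4 load of the same word leave 4) */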
      uint8_t *range = res.blocks[ubo].range;
      range[word] = MAX2(range[word], channels);
   }

   return res;
}

/* Select UBO words to push. A sophisticated implementation would consider the
 * number of uses and perhaps the control flow to estimate benefit. This is not
 * sophisticated. Select from the last UBO first to prioritize sysvals. */
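/* For instance, if a shader uses two API UBOs plus the trailing sysval UBO,
 * the sysval UBO's words are appended to push->words first, so they are the
 * least likely to be cut off by the PAN_MAX_PUSH limit. */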

static void
bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis)
{
   for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
      struct bi_ubo_block *block = &analysis->blocks[ubo];

      for (unsigned r = 0; r < MAX_UBO_WORDS; ++r) {
         unsigned range = block->range[r];

         /* Don't push something we don't access */
         if (range == 0)
            continue;

         /* Don't push more than possible */
         if (push->count > PAN_MAX_PUSH - range)
            return;

         for (unsigned offs = 0; offs < range; ++offs) {
            struct panfrost_ubo_word word = {
               .ubo = ubo,
               .offset = (r + offs) * 4,
            };

            push->words[push->count++] = word;
         }

         /* Mark it as pushed so we can rewrite */
         BITSET_SET(block->pushed, r);
      }
   }
}

void
bi_opt_push_ubo(bi_context *ctx)
{
   struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
   bi_pick_ubo(ctx->info.push, &analysis);

   ctx->ubo_mask = 0;

   bi_foreach_instr_global_safe(ctx, ins) {
      if (!bi_is_ubo(ins))
         continue;

      unsigned ubo = pan_res_handle_get_index(ins->src[1].value);
      unsigned offset = ins->src[0].value;

      if (!bi_is_direct_aligned_ubo(ins)) {
         /* The load can't be pushed, so this UBO needs to be
          * uploaded conventionally */
         if (ins->src[1].type == BI_INDEX_CONSTANT)
            ctx->ubo_mask |= BITSET_BIT(ubo);
         else
            ctx->ubo_mask = ~0;

         continue;
      }

      /* Check if we decided to push this */
      assert(ubo < analysis.nr_blocks);
      if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) {
         ctx->ubo_mask |= BITSET_BIT(ubo);
         continue;
      }

      /* Replace the UBO load with moves from FAU */
      bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));

      unsigned nr = bi_opcode_props[ins->op].sr_count;
      bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0], nr);

      bi_foreach_src(vec, w) {
         /* FAU is grouped in pairs (2 x 4-byte) */
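         /* e.g. pushed word 5 lands in 64-bit FAU slot 2 (5 >> 1) and
          * selects its high 32-bit half (5 & 1) */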
         unsigned base =
            pan_lookup_pushed_ubo(ctx->info.push, ubo, (offset + 4 * w));

         unsigned fau_idx = (base >> 1);
         unsigned fau_hi = (base & 1);

         vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi);
      }

      bi_remove_instruction(ins);
   }

   free(analysis.blocks);
}

typedef struct {
   BITSET_DECLARE(row, PAN_MAX_PUSH);
} adjacency_row;

/* Find the connected component containing `node` with depth-first search */
static void
bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited,
                  unsigned *component, unsigned *size, unsigned node)
{
   unsigned neighbour;

   BITSET_SET(visited, node);
   component[(*size)++] = node;

   BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) {
      if (!BITSET_TEST(visited, neighbour)) {
         bi_find_component(adjacency, visited, component, size, neighbour);
      }
   }
}

static bool
bi_is_uniform(bi_index idx)
{
   return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM);
}

/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */
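/* For example, FAU uniform slot 3 with offset 1 (its high 32-bit half) maps
 * to word (3 << 1) | 1 = 7. */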
static unsigned
bi_uniform_word(bi_index idx)
{
   assert(bi_is_uniform(idx));
   assert(idx.offset <= 1);

   return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset;
}

/*
 * Create an undirected graph where nodes are 32-bit uniform indices and edges
 * represent that two nodes are used in the same instruction.
 *
 * The graph is constructed as an adjacency matrix stored in adjacency.
 */
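/* For instance, an instruction sourcing pushed words 3 and 5 sets bit 5 in
 * adjacency[3].row and bit 3 in adjacency[5].row. */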
static void
bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency)
{
   bi_foreach_instr_global(ctx, I) {
      unsigned nodes[BI_MAX_SRCS] = {};
      unsigned node_count = 0;

      /* Set nodes[] to 32-bit uniforms accessed */
      bi_foreach_src(I, s) {
         if (bi_is_uniform(I->src[s])) {
            unsigned word = bi_uniform_word(I->src[s]);

            if (word >= ctx->info.push_offset)
               nodes[node_count++] = word;
         }
      }

      /* Create clique connecting nodes[] */
      for (unsigned i = 0; i < node_count; ++i) {
         for (unsigned j = 0; j < node_count; ++j) {
            if (i == j)
               continue;

            unsigned x = nodes[i], y = nodes[j];
            assert(MAX2(x, y) < ctx->info.push->count);

            /* Add undirected edge between the nodes */
            BITSET_SET(adjacency[x].row, y);
            BITSET_SET(adjacency[y].row, x);
         }
      }
   }
}

/*
 * Optimization pass to reorder uniforms. The goal is to reduce the number of
 * moves we emit when lowering FAU. The pass groups uniforms used by the same
 * instruction.
 *
 * The pass works by creating a graph of pushed uniforms, where edges denote the
 * "both 32-bit uniforms required by the same instruction" relationship. We
 * perform depth-first search on this graph to find the connected components,
 * where each connected component is a cluster of uniforms that are used
 * together. We then select pairs of uniforms from each connected component.
 * The remaining unpaired uniforms (from components of odd sizes) are paired
 * together arbitrarily.
 *
 * After a new ordering is selected, pushed uniforms in the program and the
 * panfrost_ubo_push data structure must be remapped to use the new ordering.
 */
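/*
 * A small illustrative example: if the connected components are {4, 5, 7} and
 * {6, 8}, then word 7 is deferred as the unpaired leftover of its component,
 * giving the ordering 4, 5, 6, 8, 7 (up to DFS visit order within each
 * component). Words used together therefore land in adjacent push slots, so
 * they can share a 64-bit FAU slot when FAU is lowered.
 */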
void
bi_opt_reorder_push(bi_context *ctx)
{
   adjacency_row adjacency[PAN_MAX_PUSH] = {0};
   BITSET_DECLARE(visited, PAN_MAX_PUSH) = {0};

   unsigned ordering[PAN_MAX_PUSH] = {0};
   unsigned unpaired[PAN_MAX_PUSH] = {0};
   unsigned pushed = 0, unpaired_count = 0;

   struct panfrost_ubo_push *push = ctx->info.push;
   unsigned push_offset = ctx->info.push_offset;

   bi_create_fau_interference_graph(ctx, adjacency);

   for (unsigned i = push_offset; i < push->count; ++i) {
      if (BITSET_TEST(visited, i))
         continue;

      unsigned component[PAN_MAX_PUSH] = {0};
      unsigned size = 0;
      bi_find_component(adjacency, visited, component, &size, i);

      /* If there is an odd number of uses, at least one use must be
       * unpaired. Arbitrarily take the last one.
       */
      if (size % 2)
         unpaired[unpaired_count++] = component[--size];

      /* The rest of the uses are paired */
      assert((size % 2) == 0);

      /* Push the paired uses */
      memcpy(ordering + pushed, component, sizeof(unsigned) * size);
      pushed += size;
   }

   /* Push unpaired nodes at the end */
   memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count);
   pushed += unpaired_count;

   /* Ordering is a permutation. Invert it for O(1) lookup. */
   unsigned old_to_new[PAN_MAX_PUSH] = {0};

   for (unsigned i = 0; i < push_offset; ++i) {
      old_to_new[i] = i;
   }

   for (unsigned i = 0; i < pushed; ++i) {
      assert(ordering[i] >= push_offset);
      old_to_new[ordering[i]] = push_offset + i;
   }

   /* Use new ordering throughout the program */
   bi_foreach_instr_global(ctx, I) {
      bi_foreach_src(I, s) {
         if (bi_is_uniform(I->src[s])) {
            unsigned node = bi_uniform_word(I->src[s]);
            unsigned new_node = old_to_new[node];
            I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1);
            I->src[s].offset = new_node & 1;
         }
      }
   }

   /* Use new ordering for push */
   struct panfrost_ubo_push old = *push;
   for (unsigned i = 0; i < pushed; ++i)
      push->words[push_offset + i] = old.words[ordering[i]];

   push->count = push_offset + pushed;
}
351