/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "bi_builder.h"
#include "compiler.h"

/* This optimization pass, intended to run once after code emission but before
 * copy propagation, analyzes direct word-aligned UBO reads and promotes a
 * subset to moves from FAU. It is the sole populator of the UBO push data
 * structure returned to the command stream. */
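/* As an illustrative sketch (the mnemonics below are invented pseudo-assembly,
 * not real Bifrost disassembly), a direct word-aligned load such as
 *
 *    LOAD.i32 dest, ubo[1], #8      <- 32-bit word 2 of UBO 1
 *
 * becomes, once word 2 of UBO 1 is selected for pushing, a move from the
 * corresponding Fast Access Uniform (FAU) slot:
 *
 *    MOV.i32 dest, u[n]             <- value preloaded by the command stream
 */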

static bool
bi_is_ubo(bi_instr *ins)
{
   return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) &&
          (ins->seg == BI_SEG_UBO);
}

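/* A "direct, aligned" UBO access has a constant byte offset (src[0]) into a
 * statically known UBO index (src[1]), with the offset aligned to a 32-bit
 * word boundary */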
static bool
bi_is_direct_aligned_ubo(bi_instr *ins)
{
   return bi_is_ubo(ins) && (ins->src[0].type == BI_INDEX_CONSTANT) &&
          (ins->src[1].type == BI_INDEX_CONSTANT) &&
          ((ins->src[0].value & 0x3) == 0);
}

/* Represents use data for a single UBO */

#define MAX_UBO_WORDS (65536 / 16)

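/* `pushed` marks the base words that were selected for pushing, and `range`
 * records, per base word, the widest channel count (1-4) of any load reading
 * from that word, or 0 if no load starts there */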
struct bi_ubo_block {
   BITSET_DECLARE(pushed, MAX_UBO_WORDS);
   uint8_t range[MAX_UBO_WORDS];
};

struct bi_ubo_analysis {
   /* Per block analysis */
   unsigned nr_blocks;
   struct bi_ubo_block *blocks;
};

static struct bi_ubo_analysis
bi_analyze_ranges(bi_context *ctx)
{
   struct bi_ubo_analysis res = {
      .nr_blocks = ctx->nir->info.num_ubos + 1,
   };

   res.blocks = calloc(res.nr_blocks, sizeof(struct bi_ubo_block));

   bi_foreach_instr_global(ctx, ins) {
      if (!bi_is_direct_aligned_ubo(ins))
         continue;

      unsigned ubo = pan_res_handle_get_index(ins->src[1].value);
      unsigned word = ins->src[0].value / 4;
      unsigned channels = bi_opcode_props[ins->op].sr_count;

      assert(ubo < res.nr_blocks);
      assert(channels > 0 && channels <= 4);

      if (word >= MAX_UBO_WORDS)
         continue;

      /* Must use the max if the same base is read with different channel
       * counts, which is possible with nir_opt_shrink_vectors */
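      /* e.g. a vec3 load at byte offset 16 of UBO 2 yields word 4 and
       * channels 3, so blocks[2].range[4] becomes at least 3 */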
      uint8_t *range = res.blocks[ubo].range;
      range[word] = MAX2(range[word], channels);
   }

   return res;
}

/* Select UBO words to push. A sophisticated implementation would consider the
 * number of uses and perhaps the control flow to estimate benefit. This is not
 * sophisticated. Select from the last UBO first to prioritize sysvals. */

static void
bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis)
{
   for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
      struct bi_ubo_block *block = &analysis->blocks[ubo];

      for (unsigned r = 0; r < MAX_UBO_WORDS; ++r) {
         unsigned range = block->range[r];

         /* Don't push something we don't access */
         if (range == 0)
            continue;

         /* Don't push more than possible */
         if (push->count > PAN_MAX_PUSH - range)
            return;

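         /* Note that the budget check above returns outright, so once FAU
          * space runs out, nothing further is pushed from any UBO */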
         for (unsigned offs = 0; offs < range; ++offs) {
            struct panfrost_ubo_word word = {
               .ubo = ubo,
               .offset = (r + offs) * 4,
            };

            push->words[push->count++] = word;
         }

         /* Mark it as pushed so we can rewrite */
         BITSET_SET(block->pushed, r);
      }
   }
}

void
bi_opt_push_ubo(bi_context *ctx)
{
   struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
   bi_pick_ubo(ctx->info.push, &analysis);

   ctx->ubo_mask = 0;

   bi_foreach_instr_global_safe(ctx, ins) {
      if (!bi_is_ubo(ins))
         continue;

      unsigned ubo = pan_res_handle_get_index(ins->src[1].value);
      unsigned offset = ins->src[0].value;

      if (!bi_is_direct_aligned_ubo(ins)) {
         /* The load can't be pushed, so this UBO needs to be
          * uploaded conventionally */
         if (ins->src[1].type == BI_INDEX_CONSTANT)
            ctx->ubo_mask |= BITSET_BIT(ubo);
         else
            ctx->ubo_mask = ~0;

         continue;
      }

      /* Check if we decided to push this */
      assert(ubo < analysis.nr_blocks);
      if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) {
         ctx->ubo_mask |= BITSET_BIT(ubo);
         continue;
      }

      /* Replace the UBO load with moves from FAU */
      bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));

      unsigned nr = bi_opcode_props[ins->op].sr_count;
      bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0], nr);

      bi_foreach_src(vec, w) {
         /* FAU is grouped in pairs (2 x 4-byte) */
         unsigned base =
            pan_lookup_pushed_ubo(ctx->info.push, ubo, (offset + 4 * w));

         unsigned fau_idx = (base >> 1);
         unsigned fau_hi = (base & 1);
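         /* e.g. pushed word 5 maps to fau_idx 2, fau_hi 1: the high 32-bit
          * half of the third 64-bit FAU slot */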

         vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi);
      }

      bi_remove_instruction(ins);
   }

   free(analysis.blocks);
}

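/* One row of the adjacency matrix over pushed 32-bit uniform words: bit y set
 * in row x means words x and y are sources of the same instruction */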
typedef struct {
   BITSET_DECLARE(row, PAN_MAX_PUSH);
} adjacency_row;

/* Find the connected component containing `node` with depth-first search */
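/* Recursion depth is bounded by the component size, which is at most
 * PAN_MAX_PUSH */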
static void
bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited,
                  unsigned *component, unsigned *size, unsigned node)
{
   unsigned neighbour;

   BITSET_SET(visited, node);
   component[(*size)++] = node;

   BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) {
      if (!BITSET_TEST(visited, neighbour)) {
         bi_find_component(adjacency, visited, component, size, neighbour);
      }
   }
}

static bool
bi_is_uniform(bi_index idx)
{
   return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM);
}

/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */
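/* e.g. the 64-bit slot (value & ~BIR_FAU_UNIFORM) == 3 with offset 1 (the
 * high half) is 32-bit word 3 * 2 + 1 = 7 */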
static unsigned
bi_uniform_word(bi_index idx)
{
   assert(bi_is_uniform(idx));
   assert(idx.offset <= 1);

   return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset;
}

/*
 * Create an undirected graph where nodes are 32-bit uniform indices and edges
 * represent that two nodes are used in the same instruction.
 *
 * The graph is constructed as an adjacency matrix stored in adjacency.
 */
static void
bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency)
{
   bi_foreach_instr_global(ctx, I) {
      unsigned nodes[BI_MAX_SRCS] = {};
      unsigned node_count = 0;

      /* Set nodes[] to 32-bit uniforms accessed */
      bi_foreach_src(I, s) {
         if (bi_is_uniform(I->src[s])) {
            unsigned word = bi_uniform_word(I->src[s]);

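            /* Words below push_offset stay fixed in place and are excluded
             * from reordering (see bi_opt_reorder_push) */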
            if (word >= ctx->info.push_offset)
               nodes[node_count++] = word;
         }
      }

      /* Create clique connecting nodes[] */
      for (unsigned i = 0; i < node_count; ++i) {
         for (unsigned j = 0; j < node_count; ++j) {
            if (i == j)
               continue;

            unsigned x = nodes[i], y = nodes[j];
            assert(MAX2(x, y) < ctx->info.push->count);

            /* Add undirected edge between the nodes */
            BITSET_SET(adjacency[x].row, y);
            BITSET_SET(adjacency[y].row, x);
         }
      }
   }
}

/*
 * Optimization pass to reorder uniforms. The goal is to reduce the number of
 * moves we emit when lowering FAU. The pass groups uniforms used by the same
 * instruction.
 *
 * The pass works by creating a graph of pushed uniforms, where edges denote the
 * "both 32-bit uniforms required by the same instruction" relationship. We
 * perform depth-first search on this graph to find the connected components,
 * where each connected component is a cluster of uniforms that are used
 * together. We then select pairs of uniforms from each connected component.
 * The remaining unpaired uniforms (from odd-sized components) are paired
 * together arbitrarily.
 *
 * After a new ordering is selected, pushed uniforms in the program and the
 * panfrost_ubo_push data structure must be remapped to use the new ordering.
 */
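/* For example, if the pushed words split into components {4, 6, 9} and
 * {5, 8}, the ordering becomes 4, 6 then 5, 8, with the leftover 9 appended
 * at the end and paired with whatever lands next to it */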
void
bi_opt_reorder_push(bi_context *ctx)
{
   adjacency_row adjacency[PAN_MAX_PUSH] = {0};
   BITSET_DECLARE(visited, PAN_MAX_PUSH) = {0};

   unsigned ordering[PAN_MAX_PUSH] = {0};
   unsigned unpaired[PAN_MAX_PUSH] = {0};
   unsigned pushed = 0, unpaired_count = 0;

   struct panfrost_ubo_push *push = ctx->info.push;
   unsigned push_offset = ctx->info.push_offset;

   bi_create_fau_interference_graph(ctx, adjacency);

   for (unsigned i = push_offset; i < push->count; ++i) {
      if (BITSET_TEST(visited, i))
         continue;

      unsigned component[PAN_MAX_PUSH] = {0};
      unsigned size = 0;
      bi_find_component(adjacency, visited, component, &size, i);

      /* If there is an odd number of uses, at least one use must be
       * unpaired. Arbitrarily take the last one.
       */
      if (size % 2)
         unpaired[unpaired_count++] = component[--size];

      /* The rest of the uses are paired */
      assert((size % 2) == 0);

      /* Push the paired uses */
      memcpy(ordering + pushed, component, sizeof(unsigned) * size);
      pushed += size;
   }

   /* Push unpaired nodes at the end */
   memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count);
   pushed += unpaired_count;

   /* Ordering is a permutation. Invert it for O(1) lookup. */
   unsigned old_to_new[PAN_MAX_PUSH] = {0};

   for (unsigned i = 0; i < push_offset; ++i) {
      old_to_new[i] = i;
   }

   for (unsigned i = 0; i < pushed; ++i) {
      assert(ordering[i] >= push_offset);
      old_to_new[ordering[i]] = push_offset + i;
   }

   /* Use the new ordering throughout the program */
   bi_foreach_instr_global(ctx, I) {
      bi_foreach_src(I, s) {
         if (bi_is_uniform(I->src[s])) {
            unsigned node = bi_uniform_word(I->src[s]);
            unsigned new_node = old_to_new[node];
            I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1);
            I->src[s].offset = new_node & 1;
         }
      }
   }

   /* Use the new ordering for push */
   struct panfrost_ubo_push old = *push;
   for (unsigned i = 0; i < pushed; ++i)
      push->words[push_offset + i] = old.words[ordering[i]];

   push->count = push_offset + pushed;
}