1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors (Collabora):
24 * Alyssa Rosenzweig <[email protected]>
25 */
26
27 #include "compiler.h"
28
/* Assign dependency slots to each clause and calculate dependencies. This pass
 * must be run after scheduling.
31 *
32 * 1. A clause that does not produce a message must use the sentinel slot #0
33 * 2a. A clause that depends on the results of a previous message-passing
34 * instruction must depend on that instruction's dependency slot, unless all
35 * reaching code paths already depended on it.
36 * 2b. More generally, any dependencies must be encoded. This includes
37 * Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
38 * 3. The shader must wait on slot #6 before running BLEND, ATEST
39 * 4. The shader must wait on slot #7 before running BLEND, ST_TILE
40 * 5. ATEST, ZS_EMIT must be issued with slot #0
41 * 6. BARRIER must be issued with slot #7 and wait on every active slot.
42 * 7. Only slots #0 through #5 may be used for clauses not otherwise specified.
43 * 8. If a clause writes to a read staging register of an unresolved
44 * dependency, it must set a staging barrier.
45 *
46 * Note it _is_ legal to reuse slots for multiple message passing instructions
47 * with overlapping liveness, albeit with a slight performance penalty. As such
48 * the problem is significantly easier than register allocation, rather than
 * spilling we may simply reuse slots. (TODO: does this have an optimal
 * linear-time solution?)
51 *
52 * Within these constraints we are free to assign slots as we like. This pass
53 * attempts to minimize stalls (TODO).
54 */
55
56 #define BI_NUM_GENERAL_SLOTS 6
57 #define BI_NUM_SLOTS 8
58 #define BI_NUM_REGISTERS 64
59 #define BI_SLOT_SERIAL 0 /* arbitrary */
60
61 /*
62 * Due to the crude scoreboarding we do, we need to serialize varying loads and
63 * memory access. Identify these instructions here.
64 */
65 static bool
bi_should_serialize(bi_instr * I)66 bi_should_serialize(bi_instr *I)
67 {
68 /* For debug, serialize everything to disable scoreboard opts */
69 if (bifrost_debug & BIFROST_DBG_NOSB)
70 return true;
71
72 /* Although nominally on the attribute unit, image loads have the same
73 * coherency requirements as general memory loads. Serialize them for
74 * now until we can do something more clever.
75 */
76 if (I->op == BI_OPCODE_LD_ATTR_TEX)
77 return true;
78
79 switch (bi_opcode_props[I->op].message) {
80 case BIFROST_MESSAGE_VARYING:
81 case BIFROST_MESSAGE_LOAD:
82 case BIFROST_MESSAGE_STORE:
83 case BIFROST_MESSAGE_ATOMIC:
84 return true;
85 default:
86 return false;
87 }
88 }
89
90 /* Given a scoreboard model, choose a slot for a clause wrapping a given
91 * message passing instruction. No side effects. */
92
93 static unsigned
bi_choose_scoreboard_slot(bi_instr * message)94 bi_choose_scoreboard_slot(bi_instr *message)
95 {
96 /* ATEST, ZS_EMIT must be issued with slot #0 */
97 if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT)
98 return 0;
99
100 /* BARRIER must be issued with slot #7 */
101 if (message->op == BI_OPCODE_BARRIER)
102 return 7;
103
104 /* For now, make serialization is easy */
105 if (bi_should_serialize(message))
106 return BI_SLOT_SERIAL;
107
108 return 0;
109 }
110
111 static uint64_t
bi_read_mask(bi_instr * I,bool staging_only)112 bi_read_mask(bi_instr *I, bool staging_only)
113 {
114 uint64_t mask = 0;
115
116 if (staging_only && !bi_opcode_props[I->op].sr_read)
117 return mask;
118
119 bi_foreach_src(I, s) {
120 if (I->src[s].type == BI_INDEX_REGISTER) {
121 unsigned reg = I->src[s].value;
122 unsigned count = bi_count_read_registers(I, s);
123
124 mask |= (BITFIELD64_MASK(count) << reg);
125 }
126
127 if (staging_only)
128 break;
129 }
130
131 return mask;
132 }
133
134 static uint64_t
bi_write_mask(bi_instr * I)135 bi_write_mask(bi_instr *I)
136 {
137 uint64_t mask = 0;
138
139 bi_foreach_dest(I, d) {
140 if (bi_is_null(I->dest[d]))
141 continue;
142
143 assert(I->dest[d].type == BI_INDEX_REGISTER);
144
145 unsigned reg = I->dest[d].value;
146 unsigned count = bi_count_write_registers(I, d);
147
148 mask |= (BITFIELD64_MASK(count) << reg);
149 }
150
151 /* Instructions like AXCHG.i32 unconditionally both read and write
152 * staging registers. Even if we discard the result, the write still
153 * happens logically and needs to be included in our calculations.
154 * Obscurely, ATOM_CX is sr_write but can ignore the staging register in
155 * certain circumstances; this does not require consideration.
156 */
157 if (bi_opcode_props[I->op].sr_write && I->nr_dests && I->nr_srcs &&
158 bi_is_null(I->dest[0]) && !bi_is_null(I->src[0])) {
159
160 unsigned reg = I->src[0].value;
161 unsigned count = bi_count_write_registers(I, 0);
162
163 mask |= (BITFIELD64_MASK(count) << reg);
164 }
165
166 return mask;
167 }
168
169 /* Update the scoreboard model to assign an instruction to a given slot */
170
171 static void
bi_push_clause(struct bi_scoreboard_state * st,bi_clause * clause)172 bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause)
173 {
174 bi_instr *I = clause->message;
175 unsigned slot = clause->scoreboard_id;
176
177 if (!I)
178 return;
179
180 st->read[slot] |= bi_read_mask(I, true);
181
182 if (bi_opcode_props[I->op].sr_write)
183 st->write[slot] |= bi_write_mask(I);
184 }
185
186 /* Adds a dependency on each slot writing any specified register */
187
188 static void
bi_depend_on_writers(bi_clause * clause,struct bi_scoreboard_state * st,uint64_t regmask)189 bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st,
190 uint64_t regmask)
191 {
192 for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
193 if (!(st->write[slot] & regmask))
194 continue;
195
196 st->write[slot] = 0;
197 st->read[slot] = 0;
198
199 clause->dependencies |= BITFIELD_BIT(slot);
200 }
201 }
202
203 static void
bi_set_staging_barrier(bi_clause * clause,struct bi_scoreboard_state * st,uint64_t regmask)204 bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st,
205 uint64_t regmask)
206 {
207 for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) {
208 if (!(st->read[slot] & regmask))
209 continue;
210
211 st->read[slot] = 0;
212 clause->staging_barrier = true;
213 }
214 }
215
216 /* Sets the dependencies for a given clause, updating the model */
217
218 static void
bi_set_dependencies(bi_block * block,bi_clause * clause,struct bi_scoreboard_state * st)219 bi_set_dependencies(bi_block *block, bi_clause *clause,
220 struct bi_scoreboard_state *st)
221 {
222 bi_foreach_instr_in_clause(block, clause, I) {
223 uint64_t read = bi_read_mask(I, false);
224 uint64_t written = bi_write_mask(I);
225
226 /* Read-after-write; write-after-write */
227 bi_depend_on_writers(clause, st, read | written);
228
229 /* Write-after-read */
230 bi_set_staging_barrier(clause, st, written);
231 }
232
233 /* LD_VAR instructions must be serialized per-quad. Just always depend
234 * on any LD_VAR instructions. This isn't optimal, but doing better
235 * requires divergence-aware data flow analysis.
236 *
237 * Similarly, memory loads/stores need to be synchronized. For now,
238 * force them to be serialized. This is not optimal.
239 */
240 if (clause->message && bi_should_serialize(clause->message))
241 clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL);
242
243 /* Barriers must wait on all slots to flush existing work. It might be
244 * possible to skip this with more information about the barrier. For
245 * now, be conservative.
246 */
247 if (clause->message && clause->message->op == BI_OPCODE_BARRIER)
248 clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS);
249 }
250
251 static bool
scoreboard_block_update(bi_block * blk)252 scoreboard_block_update(bi_block *blk)
253 {
254 bool progress = false;
255
256 /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
257 bi_foreach_predecessor(blk, pred) {
258 for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
259 blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
260 blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
261 }
262 }
263
264 struct bi_scoreboard_state state = blk->scoreboard_in;
265
266 /* Assign locally */
267
268 bi_foreach_clause_in_block(blk, clause) {
269 bi_set_dependencies(blk, clause, &state);
270 bi_push_clause(&state, clause);
271 }
272
273 /* To figure out progress, diff scoreboard_out */
274
275 for (unsigned i = 0; i < BI_NUM_SLOTS; ++i)
276 progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
277
278 blk->scoreboard_out = state;
279
280 return progress;
281 }
282
283 void
bi_assign_scoreboard(bi_context * ctx)284 bi_assign_scoreboard(bi_context *ctx)
285 {
286 u_worklist worklist;
287 bi_worklist_init(ctx, &worklist);
288
289 /* First, assign slots. */
290 bi_foreach_block(ctx, block) {
291 bi_foreach_clause_in_block(block, clause) {
292 if (clause->message) {
293 unsigned slot = bi_choose_scoreboard_slot(clause->message);
294 clause->scoreboard_id = slot;
295 }
296 }
297
298 bi_worklist_push_tail(&worklist, block);
299 }
300
301 /* Next, perform forward data flow analysis to calculate dependencies */
302 while (!u_worklist_is_empty(&worklist)) {
303 /* Pop from the front for forward analysis */
304 bi_block *blk = bi_worklist_pop_head(&worklist);
305
306 if (scoreboard_block_update(blk)) {
307 bi_foreach_successor(blk, succ)
308 bi_worklist_push_tail(&worklist, succ);
309 }
310 }
311
312 u_worklist_fini(&worklist);
313 }
314