/*
 * Copyright © 2019 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#if MESA_DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

#define SCHED_DEBUG_DUMP_DEPTH 1

/*
 * Post RA Instruction Scheduling
 */
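
/*
 * The pass builds a DAG of register dependencies for each block after
 * register allocation and then greedily list-schedules the instructions,
 * picking among the ready DAG heads with the heuristics in choose_instr()
 * while modeling the soft (ss)/(sy) wait costs, so that long-latency
 * producers are started early and fewer nops are needed.
 */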

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   unsigned ip;

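   /* Estimated cycles remaining until the most recently scheduled (ss)/(sy)
    * producers complete; used by node_delay_soft() to avoid picking their
    * consumers too early.
    */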
   int ss_delay;
   int sy_delay;
};

struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   unsigned earliest_ip;

   bool has_sy_src, has_ss_src;

   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_sy_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sy_src;
}

static bool
has_ss_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_ss_src;
}

#ifndef NDEBUG
static void
sched_dag_validate_cb(const struct dag_node *node, void *data)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;

   ir3_print_instr(n->instr);
}
#endif

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

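   /* Only alu/flow instructions advance the ip counter used for the delay
    * accounting below; a (rpt)'d instruction occupies 1 + repeat slots.
    * Other instructions are synchronized via the (ss)/(sy) tracking further
    * down rather than delay slots.
    */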
   bool counts_for_delay = is_alu(instr) || is_flow(instr);

   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;

   struct ir3_postsched_node *n = instr->data;

   /* We insert any nop's needed to get to earliest_ip, then advance
    * delay_cycles by scheduling the instruction.
    */
   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      unsigned delay = (unsigned)(uintptr_t)edge->data;
      struct ir3_postsched_node *child =
         container_of(edge->child, struct ir3_postsched_node, dag);
      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
   }

   list_addtail(&instr->node, &instr->block->instr_list);

   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_ss_producer(instr)) {
      ctx->ss_delay = soft_ss_delay(instr);
   } else if (has_ss_src(instr)) {
      ctx->ss_delay = 0;
   } else if (ctx->ss_delay > 0) {
      ctx->ss_delay--;
   }

   if (is_sy_producer(instr)) {
      ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
   } else if (has_sy_src(instr)) {
      ctx->sy_delay = 0;
   } else if (ctx->sy_delay > 0) {
      ctx->sy_delay--;
   }
}

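/* Number of nops that would be needed before this node could issue at the
 * current ip, based purely on the hard earliest_ip constraint.
 */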
static unsigned
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
}

static unsigned
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   unsigned delay = node_delay(ctx, n);

   /* This takes into account that when we schedule multiple tex or sfu
    * instructions, the first user has to wait for all of them to complete.
    */
   if (n->has_ss_src)
      delay = MAX2(delay, ctx->ss_delay);
   if (n->has_sy_src)
      delay = MAX2(delay, ctx->sy_delay);

   return delay;
}

static void
dump_node(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n,
          int level)
{
   if (level > SCHED_DEBUG_DUMP_DEPTH)
      return;

   di(n->instr, "%*s%smaxdel=%d, node_delay=%d, node_delay_soft=%d, %d parents ",
      level * 2, "", (level > 0 ? "-> " : ""), n->max_delay, node_delay(ctx, n),
      node_delay_soft(ctx, n), n->dag.parent_count);

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;

      dump_node(ctx, child, level + 1);
   }
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      dump_node(ctx, n, 0);
   }
}

/* find instruction to schedule: */
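/*
 * Selection priority, first match wins:
 *
 *  1. meta instructions
 *  2. inputs (the last bary.f frees up varying storage)
 *  3. kills/demotes that are hard-ready (zero nops needed)
 *  4. (ss)/(sy) producers that are soft-ready, to start long-latency work
 *  5. nearly soft-ready nodes (a few nops at most), preferring the smallest
 *     delay and then the largest max_delay
 *  6. otherwise, the leader with the largest max_delay
 */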
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 0)
         continue;

      if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ sync
    * bit (but we probably do want to schedule some other instructions
    * while we wait)).  We also allow a small number of nops, to prefer
    * now-nops over future-nops up to a point, as that gives better
    * results.
    */
   unsigned chosen_delay = 0;
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 3)
         continue;

      if (!chosen || d < chosen_delay) {
         chosen = n;
         chosen_delay = d;
         continue;
      }

      if (d > chosen_delay)
         continue;

      if (chosen->max_delay < n->max_delay) {
         chosen = n;
         chosen_delay = d;
      }
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

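   /* Forward (F) walks the block in program order and produces
    * read-after-write edges annotated with delay info; reverse (R) walks it
    * backwards and produces write-after-read/write-after-write ordering
    * edges.
    */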
   enum { F, R } direction;

   bool merged;

   /* Tracks the sched node (instruction) that last wrote a given register,
    * in whichever direction we are iterating the block.
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether the
    * half and full precision register files are "merged" (conflicting,
    * ie. a6xx+), in which case we use "regs" for both full and half
    * precision dependencies and consider each full precision dep as two
    * half-precision deps, or separate (non-conflicting, older gens), in
    * which case the separate "half_regs" table is used for half-precision
    * deps.  See ir3_reg_file_offset().
    */
   struct ir3_postsched_node *regs[2 * GPR_REG_SIZE];
   unsigned dst_n[2 * GPR_REG_SIZE];
   struct ir3_postsched_node *half_regs[GPR_REG_SIZE];
   unsigned half_dst_n[GPR_REG_SIZE];
   struct ir3_postsched_node *shared_regs[2 * SHARED_REG_SIZE];
   unsigned shared_dst_n[2 * SHARED_REG_SIZE];
   struct ir3_postsched_node *nongpr_regs[2 * NONGPR_REG_SIZE];
   unsigned nongpr_dst_n[2 * NONGPR_REG_SIZE];
};

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after,
        unsigned d)
{
   if (!before || !after)
      return;

   assert(before != after);

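   /* In the forward direction the edge carries the delay-slot count between
    * producer and consumer; reverse (WAR/WAW) edges only enforce ordering,
    * so no delay is attached.
    */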
   if (state->direction == F) {
      dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
   } else {
      dag_add_edge_max_data(&after->dag, &before->dag, 0);
   }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node,
                   struct ir3_postsched_node **dep_ptr,
                   unsigned *dst_n_ptr, unsigned num, int src_n,
                   int dst_n)
{
   struct ir3_postsched_node *dep = *dep_ptr;

   unsigned d = 0;
   if (src_n >= 0 && dep && state->direction == F) {
      struct ir3_compiler *compiler = state->ctx->ir->compiler;
      /* get the dst_n this corresponds to */
      unsigned dst_n = *dst_n_ptr;
      d = ir3_delayslots_with_repeat(compiler, dep->instr, node->instr,
                                     dst_n, src_n);
      if (is_sy_producer(dep->instr))
         node->has_sy_src = true;
      if (needs_ss(compiler, dep->instr, node->instr))
         node->has_ss_src = true;
   }

   if (src_n >= 0 && dep && state->direction == R) {
      /* If node generates a WAR hazard (because it doesn't consume its
       * sources immediately), dep needs (ss) to sync its dest.  Even though
       * this isn't an (ss) source (but rather a dest), the effect is exactly
       * the same so we model it as such.
       */
      if (is_war_hazard_producer(node->instr)) {
         dep->has_ss_src = true;
      }
   }

   add_dep(state, dep, node, d);
   if (src_n < 0) {
      *dep_ptr = node;
      *dst_n_ptr = dst_n;
   }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating the
 * delay: it corresponds to node->instr->srcs[src_n].  If src_n is negative,
 * then this is for the destination register corresponding to dst_n.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n, int dst_n)
{
   struct ir3_postsched_node **regs;
   unsigned *dst_n_ptr;
   enum ir3_reg_file file;
   unsigned size = reg_elem_size(reg);
   unsigned offset = ir3_reg_file_offset(reg, num, state->merged, &file);
   switch (file) {
   case IR3_FILE_FULL:
      assert(offset + size <= ARRAY_SIZE(state->regs));
      regs = state->regs;
      dst_n_ptr = state->dst_n;
      break;
   case IR3_FILE_HALF:
      assert(offset + 1 <= ARRAY_SIZE(state->half_regs));
      regs = state->half_regs;
      dst_n_ptr = state->half_dst_n;
      break;
   case IR3_FILE_SHARED:
      assert(offset + size <= ARRAY_SIZE(state->shared_regs));
      regs = state->shared_regs;
      dst_n_ptr = state->shared_dst_n;
      break;
   case IR3_FILE_NONGPR:
      assert(offset + size <= ARRAY_SIZE(state->nongpr_regs));
      regs = state->nongpr_regs;
      dst_n_ptr = state->nongpr_dst_n;
      break;
   }

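   /* The access covers reg_elem_size() consecutive slots in the chosen
    * file; add a dependency for each of them.
    */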
   for (unsigned i = 0; i < size; i++)
      add_single_reg_dep(state, node, &regs[offset + i],
                         &dst_n_ptr[offset + i], num, src_n, dst_n);
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i, -1);
         }
      }
   }

   /* And then update the state for what this instruction wrote:
    */
   foreach_dst_n (reg, i, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1, i);
         }
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   struct ir3_postsched_ctx *ctx = state;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      unsigned delay = edge->data;
      unsigned sy_delay = 0;
      unsigned ss_delay = 0;

      if (child->has_sy_src && is_sy_producer(n->instr)) {
         sy_delay = soft_sy_delay(n->instr, ctx->block->shader);
      }

      if (child->has_ss_src &&
          needs_ss(ctx->v->compiler, n->instr, child->instr)) {
         ss_delay = soft_ss_delay(n->instr);
      }

      delay = MAX3(delay, sy_delay, ss_delay);
      max_delay = MAX2(child->max_delay + delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To prevent expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry. Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge_max_data(&sn->dag, &n->dag, 0);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge_max_data(&in->dag, &n->dag, 0);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge_max_data(&kn->dag, &n->dag, 0);
         }
      }
   }

#ifndef NDEBUG
   dag_validate(ctx->dag, sched_dag_validate_cb, NULL);
#endif

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, ctx);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->sy_delay = 0;
   ctx->ss_delay = 0;

   /* The terminator has to stay at the end. Instead of trying to set up
    * dependencies to achieve this, it's easier to just remove it now and
    * add it back after scheduling.
    */
   struct ir3_instruction *terminator = ir3_block_take_terminator(block);

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts, to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch).
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay = node_delay(ctx, instr->data);
      d("delay=%u", delay);

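      /* Only the hard (earliest_ip) delay should remain at this point; it is
       * assumed to never exceed the worst-case 6 delay slots between any
       * producer/consumer pair.
       */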
      assert(delay <= 6);

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);

   if (terminator)
      list_addtail(&terminator->node, &block->instr_list);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* Sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y, as a
 * result of places where before RA we are not sure that it is safe to
 * eliminate them.  We could eliminate these earlier, but sometimes they are
 * tangled up in false-deps, etc, so it is easier just to let them exist
 * until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}