/*
 * Copyright © 2019 Google, Inc.
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#if MESA_DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

#define SCHED_DEBUG_DUMP_DEPTH 1

/*
 * Post RA Instruction Scheduling
 */

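/*
 * After RA, each basic block is scheduled independently with a greedy list
 * scheduler: a DAG is built over the block's instructions with edges for the
 * post-RA register dependencies (plus a few false dependencies, such as
 * keeping tex/mem instructions after kills), and choose_instr() repeatedly
 * picks a ready DAG head, trying to cover (ss)/(sy) sync latencies with
 * useful work while inserting as few nops as possible.
 */
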
struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   unsigned ip;

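   /* Estimated cycles remaining until the most recently scheduled (ss)/(sy)
    * producer completes.  Set by schedule() and consumed by node_delay_soft()
    * so that consumers of not-yet-complete results are de-prioritized.
    */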
   int ss_delay;
   int sy_delay;
};

struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

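   /* Earliest ip at which this instruction can be scheduled without extra
    * nops, updated as its parents in the DAG are scheduled (see schedule()).
    */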
   unsigned earliest_ip;

   bool has_sy_src, has_ss_src;

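   /* Longest (delay-weighted) path from this node to a DAG leaf, used as the
    * critical-path priority in choose_instr().
    */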
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

static bool
has_sy_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sy_src;
}

static bool
has_ss_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_ss_src;
}

#ifndef NDEBUG
static void
sched_dag_validate_cb(const struct dag_node *node, void *data)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;

   ir3_print_instr(n->instr);
}
#endif

static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   bool counts_for_delay = is_alu(instr) || is_flow(instr);

   unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;

   struct ir3_postsched_node *n = instr->data;

   /* We insert any nops needed to get to earliest_ip, then advance the ip
    * by delay_cycles for the instruction being scheduled.
    */
   ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      unsigned delay = (unsigned)(uintptr_t)edge->data;
      struct ir3_postsched_node *child =
         container_of(edge->child, struct ir3_postsched_node, dag);
      child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
   }

   list_addtail(&instr->node, &instr->block->instr_list);

   dag_prune_head(ctx->dag, &n->dag);

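   /* Meta instructions (other than tex prefetch, which becomes a real
    * prefetch) don't turn into actual hw instructions, so they shouldn't
    * affect the (ss)/(sy) soft-delay tracking below.
    */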
   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_ss_producer(instr)) {
      ctx->ss_delay = soft_ss_delay(instr);
   } else if (has_ss_src(instr)) {
      ctx->ss_delay = 0;
   } else if (ctx->ss_delay > 0) {
      ctx->ss_delay--;
   }

   if (is_sy_producer(instr)) {
      ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
   } else if (has_sy_src(instr)) {
      ctx->sy_delay = 0;
   } else if (ctx->sy_delay > 0) {
      ctx->sy_delay--;
   }
}

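/* Number of nop cycles that would be needed if this node were scheduled at
 * the current ip, ie. the hard delay still owed to its already-scheduled
 * parents.
 */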
static unsigned
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
}

static unsigned
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
{
   unsigned delay = node_delay(ctx, n);

   /* This takes into account that when we schedule multiple tex or sfu
    * instructions, the first user has to wait for all of them to complete.
    */
   if (n->has_ss_src)
      delay = MAX2(delay, ctx->ss_delay);
   if (n->has_sy_src)
      delay = MAX2(delay, ctx->sy_delay);

   return delay;
}

static void
dump_node(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n,
          int level)
{
   if (level > SCHED_DEBUG_DUMP_DEPTH)
      return;

   di(n->instr, "%*s%smaxdel=%d, node_delay=%d, node_delay_soft=%d, %d parents ",
      level * 2, "", (level > 0 ? "-> " : ""), n->max_delay, node_delay(ctx, n),
      node_delay_soft(ctx, n), n->dag.parent_count);

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;

      dump_node(ctx, child, level + 1);
   }
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      dump_node(ctx, n, 0);
   }
}

/* find instruction to schedule: */
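/* Selection priority, from highest to lowest:
 *   1. meta instructions
 *   2. input loads (the last bary.f unlocks varying storage)
 *   3. kills/demotes that are ready without nops
 *   4. (ss)/(sy) producers that are soft-ready (get long latencies started)
 *   5. soft-ready nodes needing at most a few nops, preferring fewer nops
 *      and then larger max_delay
 *   6. otherwise, the head with the largest max_delay (critical path)
 */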
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay(ctx, n);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 0)
         continue;

      if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ the sync
    * bit, but where we probably do want to schedule some other instructions
    * while we wait).  We also allow a small number of nops, to prefer
    * now-nops over future-nops up to a point, as that gives better results.
    */
   unsigned chosen_delay = 0;
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d = node_delay_soft(ctx, n);

      if (d > 3)
         continue;

      if (!chosen || d < chosen_delay) {
         chosen = n;
         chosen_delay = d;
         continue;
      }

      if (d > chosen_delay)
         continue;

      if (chosen->max_delay < n->max_delay) {
         chosen = n;
         chosen_delay = d;
      }
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

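   /* Direction we walk the block in: forward (F) adds true RAW/WAW
    * dependencies on earlier writers (with the real delay), while reverse
    * (R) adds WAR/WAW ordering edges against later writers.
    */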
   enum { F, R } direction;

   bool merged;

   /* Track the sched node (instruction) that last wrote a given register
    * (in whichever direction we are iterating the block).
    *
    * Note, the "regs" table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether the
    * half and full precision register files are "merged" (conflicting,
    * ie. a6xx+): in that case "regs" is used for both full and half
    * precision dependencies, with each full precision dep counted as two
    * half-precision deps.  On older gens the files are separate
    * (non-conflicting), and the separate "half_regs" table is used for
    * half-precision deps.  See ir3_reg_file_offset().
    */
   struct ir3_postsched_node *regs[2 * GPR_REG_SIZE];
   unsigned dst_n[2 * GPR_REG_SIZE];
   struct ir3_postsched_node *half_regs[GPR_REG_SIZE];
   unsigned half_dst_n[GPR_REG_SIZE];
   struct ir3_postsched_node *shared_regs[2 * SHARED_REG_SIZE];
   unsigned shared_dst_n[2 * SHARED_REG_SIZE];
   struct ir3_postsched_node *nongpr_regs[2 * NONGPR_REG_SIZE];
   unsigned nongpr_dst_n[2 * NONGPR_REG_SIZE];
};

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after,
        unsigned d)
{
   if (!before || !after)
      return;

   assert(before != after);

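   /* Forward (true) dependencies carry the computed delay; reverse edges
    * only enforce ordering, so they don't need a delay.
    */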
   if (state->direction == F) {
      dag_add_edge_max_data(&before->dag, &after->dag, (uintptr_t)d);
   } else {
      dag_add_edge_max_data(&after->dag, &before->dag, 0);
   }
}

static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node,
                   struct ir3_postsched_node **dep_ptr,
                   unsigned *dst_n_ptr, unsigned num, int src_n,
                   int dst_n)
{
   struct ir3_postsched_node *dep = *dep_ptr;

   unsigned d = 0;
   if (src_n >= 0 && dep && state->direction == F) {
      struct ir3_compiler *compiler = state->ctx->ir->compiler;
      /* get the dst_n this corresponds to */
      unsigned dst_n = *dst_n_ptr;
      d = ir3_delayslots_with_repeat(compiler, dep->instr, node->instr, dst_n, src_n);
      if (is_sy_producer(dep->instr))
         node->has_sy_src = true;
      if (needs_ss(compiler, dep->instr, node->instr))
         node->has_ss_src = true;
   }

   if (src_n >= 0 && dep && state->direction == R) {
      /* If node generates a WAR hazard (because it doesn't consume its
       * sources immediately), dep needs (ss) to sync its dest.  Even though
       * this isn't an (ss) source (but rather a dest), the effect is exactly
       * the same, so we model it as such.
       */
      if (is_war_hazard_producer(node->instr)) {
         dep->has_ss_src = true;
      }
   }

   add_dep(state, dep, node, d);
   if (src_n < 0) {
      *dep_ptr = node;
      *dst_n_ptr = dst_n;
   }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source register,
 * and src_n is the index passed into ir3_delayslots() for calculating the
 * delay: it corresponds to node->instr->srcs[src_n]. If src_n is negative,
 * then this is for the destination register corresponding to dst_n.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n, int dst_n)
{
   struct ir3_postsched_node **regs;
   unsigned *dst_n_ptr;
   enum ir3_reg_file file;
   unsigned size = reg_elem_size(reg);
   unsigned offset = ir3_reg_file_offset(reg, num, state->merged, &file);
   switch (file) {
   case IR3_FILE_FULL:
      assert(offset + size <= ARRAY_SIZE(state->regs));
      regs = state->regs;
      dst_n_ptr = state->dst_n;
      break;
   case IR3_FILE_HALF:
      assert(offset + 1 <= ARRAY_SIZE(state->half_regs));
      regs = state->half_regs;
      dst_n_ptr = state->half_dst_n;
      break;
   case IR3_FILE_SHARED:
      assert(offset + size <= ARRAY_SIZE(state->shared_regs));
      regs = state->shared_regs;
      dst_n_ptr = state->shared_dst_n;
      break;
   case IR3_FILE_NONGPR:
      assert(offset + size <= ARRAY_SIZE(state->nongpr_regs));
      regs = state->nongpr_regs;
      dst_n_ptr = state->nongpr_dst_n;
      break;
   }

   for (unsigned i = 0; i < size; i++)
      add_single_reg_dep(state, node, &regs[offset + i], &dst_n_ptr[offset + i], num, src_n, dst_n);
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i, -1);
         }
      }
   }

   /* And then afterwards, update the state for what this instruction
    * wrote:
    */
   foreach_dst_n (reg, i, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, -1, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1, i);
         }
      }
   }
}

static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

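/* Compute each node's max_delay bottom-up: the length of the longest path to
 * a DAG leaf, where edges to consumers of (ss)/(sy) results are weighted with
 * the corresponding soft delay.
 */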
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   struct ir3_postsched_ctx *ctx = state;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      unsigned delay = edge->data;
      unsigned sy_delay = 0;
      unsigned ss_delay = 0;

      if (child->has_sy_src && is_sy_producer(n->instr)) {
         sy_delay = soft_sy_delay(n->instr, ctx->block->shader);
      }

      if (child->has_ss_src &&
          needs_ss(ctx->v->compiler, n->instr, child->instr)) {
         ss_delay = soft_ss_delay(n->instr);
      }

      delay = MAX3(delay, sy_delay, ss_delay);
      max_delay = MAX2(child->max_delay + delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To keep expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions.
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry. Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point; those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge_max_data(&sn->dag, &n->dag, 0);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge_max_data(&in->dag, &n->dag, 0);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge_max_data(&kn->dag, &n->dag, 0);
         }
      }
   }

#ifndef NDEBUG
   dag_validate(ctx->dag, sched_dag_validate_cb, NULL);
#endif

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, ctx);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->sy_delay = 0;
   ctx->ss_delay = 0;

   /* The terminator has to stay at the end. Instead of trying to set up
    * dependencies to achieve this, it's easier to just remove it now and add it
    * back after scheduling.
    */
   struct ir3_instruction *terminator = ir3_block_take_terminator(block);

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to come
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * an FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (though it can be overwritten by a tex prefetch).
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay = node_delay(ctx, instr->data);
      d("delay=%u", delay);

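      /* Hard delays never exceed 6 cycles (the most the hw needs between a
       * producer and consumer), so anything larger would indicate broken
       * dependency tracking.
       */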
      assert(delay <= 6);

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);

   if (terminator)
      list_addtail(&terminator->node, &block->instr_list);
}

static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* Sometimes we end up w/ in-place movs, ie. mov.u32u32 r1.y, r1.y,
 * as a result of cases where before RA we are not sure that it is
 * safe to eliminate them.  We could eliminate these earlier, but
 * sometimes they are tangled up in false-deps, etc, so it is easier
 * just to let them exist until after RA.
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
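         /* This instruction may have false-deps pointing at movs we are
          * about to delete below; clear those so they don't dangle.
          */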
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

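/* Pass entry point: after dropping redundant self-movs, each block is
 * scheduled independently.  Returns true, as the instruction order is always
 * rewritten in place.
 */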
bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}