/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */
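/* Rough shape of the pass, as a sketch (illustrative pseudocode only; the
 * real loop is schedule_instructions() below):
 *
 *     build the dependency DAG for the block;
 *     while (the DAG still has heads) {
 *             chosen = best legal head (priority, then critical-path delay);
 *             merge  = a second head that can pack into the same 64-bit
 *                      instruction as chosen, if any;
 *             emit qpu_merge_inst(chosen, merge), or a NOP if nothing is
 *             legal yet, and promote newly unblocked children to heads;
 *     }
 */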

#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct queued_qpu_inst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;

        /**
         * Which uniform from uniform_data[] this instruction read, or -1 if
         * not reading a uniform.
         */
        int uniform;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        uintptr_t edge_data = write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
              struct schedule_node *before,
              struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

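/* Returns true if this instruction's signal bits imply a write to the r4
 * accumulator (TLB color, TMU result, or alpha mask load).
 */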
static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch(sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
                add_read_dep(state, state->last_uniforms_reset, n);
                break;

        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

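/* Returns true if waddr is one of the TMU0/TMU1 request registers (S/T/R/B),
 * i.e. a write that sets up or kicks off a texture fetch.
 */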
static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

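/* Returns true if the instruction consumes a value from the uniform stream:
 * an explicit read of the unif register address, or a TMU write (which is
 * treated here as an implicit uniform read for the texture parameters).
 */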
static bool
reads_uniform(uint64_t inst)
{
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
                 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

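/* Adds a read dependency on the accumulator (r0-r5) selected by an ALU input
 * mux.  Physical regfile A/B reads are handled by process_raddr_deps().
 */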
static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}


static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
                add_read_dep(state, state->last_uniforms_reset, n);
        } else if (qpu_waddr_is_tlb(waddr) ||
                   waddr == QPU_W_MS_FLAGS) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_VPMVCD_SETUP:
                        if (is_a)
                                add_write_dep(state, &state->last_vpm_read, n);
                        else
                                add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and the TLB_STENCIL writes
                         * have to be scheduled in the same order relative to
                         * each other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_MS_FLAGS:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_UNIFORMS_ADDRESS:
                        add_write_dep(state, &state->last_uniforms_reset, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
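/* For example, in a sequence like
 *
 *     sfu_recip r4, ra0   ; write r4
 *     fadd ra1, r4, r4    ; read r4
 *     sfu_log r4, ra2     ; write r4
 *
 * the forward walk adds a write->read edge from the recip to the fadd, and
 * the reverse walk adds a read->write edge from the fadd to the log, so the
 * read stays pinned between the two writes.  (Illustrative mnemonics only,
 * not exact QPU disassembly.)
 */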
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t raddr_a = sig == QPU_SIG_BRANCH ?
                                QPU_GET_FIELD(inst, QPU_BRANCH_RADDR_A) :
                                QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);

        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                if (sig != QPU_SIG_SMALL_IMM &&
                    sig != QPU_SIG_BRANCH)
                        process_raddr_deps(state, n, raddr_b, false);
        }

        if (sig != QPU_SIG_LOAD_IMM && sig != QPU_SIG_BRANCH) {
                if (add_op != QPU_A_NOP) {
                        process_mux_deps(state, n, add_a);
                        process_mux_deps(state, n, add_b);
                }
                if (mul_op != QPU_M_NOP) {
                        process_mux_deps(state, n, mul_a);
                        process_mux_deps(state, n, mul_b);
                }
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_BRANCH:
                add_read_dep(state, state->last_sf, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_sfu_write_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_a, last_waddr_b;
        bool tlb_locked;
};

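/* Checks the hardware restrictions that a physical regfile A/B location
 * can't be read right after it was written, that r4 can't be read too soon
 * after an SFU write, and that uniforms can't be read too soon after a
 * uniforms address reset.
 */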
static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* Full immediate loads don't read any registers. */
        if (sig == QPU_SIG_LOAD_IMM)
                return false;

        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     sig != QPU_SIG_SMALL_IMM &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        if (sig == QPU_SIG_SMALL_IMM &&
            QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) {
                uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) {
                        return true;
                }
        }

        if (reads_uniform(inst) &&
            scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
                return true;
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

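/* Relative priority used when choosing among schedulable DAG heads: TLB
 * accesses score lowest (lock the scoreboard as late as possible), TMU
 * result loads next, ordinary instructions the baseline, and TMU request
 * writes highest so texture latency gets hidden.
 */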
static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}

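/* Picks the best currently-legal DAG head to schedule next.  When prev_inst
 * is non-NULL we are looking for a second instruction to pack alongside it,
 * so candidates must also merge cleanly with prev_inst.  Ties on priority
 * are broken by the longer critical-path delay.
 */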
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
                                                  QPU_SIG);
                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
                        return NULL;
                }
        }

        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                uint64_t inst = n->inst->inst;
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                /* Don't choose the branch instruction until it's the last one
                 * left.  XXX: We could potentially choose it before it's the
                 * last one, if the remaining instructions fit in the delay
                 * slots.
                 */
                if (sig == QPU_SIG_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                continue;
                        }

                        if (prev_inst->uniform != -1 && n->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        inst = qpu_merge_inst(prev_inst->inst->inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

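/* Records the side effects of an emitted instruction in the scoreboard:
 * which physical regfile locations were just written, when the last SFU
 * write and uniforms address reset happened, and whether the TLB is now
 * locked.
 */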
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }

        if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
            waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
                scoreboard->last_uniforms_reset_tick = scoreboard->tick;
        }

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}

static void
dump_state(struct dag *dag)
{
        list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                        struct schedule_node *child =
                                (struct schedule_node *)edge->child;
                        if (!child)
                                continue;

                        fprintf(stderr, "                 - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->dag.parent_count,
                                edge->data ? 'w' : 'r');
                }
        }
}

static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
{
        if (waddr < 32)
                return 2;

        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.  If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (waddr == QPU_W_TMU0_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
                        return 100;
        }
        if (waddr == QPU_W_TMU1_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
                        return 100;
        }

        switch(waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}

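/* Cycles that must separate "before" from a dependent "after": the worse of
 * the latencies of before's two register writes as seen by after.
 */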
static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        uint64_t before_inst = before->inst->inst;
        uint64_t after_inst = after->inst->inst;

        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
                                  after_inst),
                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
                                  after_inst));
}

/** Recursive computation of the delay member of a node. */
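/* Visited bottom-up over the DAG, so every child's delay is final before the
 * parent's is computed:
 *
 *     delay(n) = MAX2(1, max over children c of
 *                        (delay(c) + instruction_latency(n, c)))
 */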
static void
compute_delay(struct dag_node *node, void *state)
{
        struct schedule_node *n = (struct schedule_node *)node;

        n->delay = 1;

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;
                n->delay = MAX2(n->delay, (child->delay +
                                           instruction_latency(n, child)));
        }
}

/* Removes a DAG head, but only removes its WAR edges.  (dag_prune_head()
 * should be called on it later to finish pruning the other edges).
 */
static void
pre_remove_head(struct dag *dag, struct schedule_node *n)
{
        list_delinit(&n->dag.link);

        util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
                if (edge->data)
                        dag_remove_edge(dag, edge);
        }
}

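/* Marks a node as scheduled at "time": pushes each child's unblocked_time
 * out by the edge latency and prunes the node from the DAG, so newly
 * unblocked children become heads.
 */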
static void
mark_instruction_scheduled(struct dag *dag,
                           uint32_t time,
                           struct schedule_node *node)
{
        if (!node)
                return;

        util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
                struct schedule_node *child =
                        (struct schedule_node *)edge->child;

                if (!child)
                        continue;

                uint32_t latency = instruction_latency(node, child);

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
        }
        dag_prune_head(dag, &node->dag);
}

/**
 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
 * with another instruction.
 */
static void
emit_thrsw(struct vc4_compile *c,
           struct choose_scoreboard *scoreboard,
           uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);

        /* Try to find an earlier scheduled instruction that we can merge the
         * thrsw into.
         */
        int thrsw_ip = c->qpu_inst_count;
        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);

                if (prev_sig == QPU_SIG_NONE)
                        thrsw_ip = c->qpu_inst_count - i;
        }

        if (thrsw_ip != c->qpu_inst_count) {
                /* Merge the thrsw into the existing instruction. */
                c->qpu_insts[thrsw_ip] =
                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
        } else {
                qpu_serialize_one_inst(c, inst);
                update_scoreboard_for_chosen(scoreboard, inst);
        }

        /* Fill the delay slots. */
        while (c->qpu_inst_count < thrsw_ip + 3) {
                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
                qpu_serialize_one_inst(c, qpu_NOP());
        }
}

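/* Core scheduling loop for one basic block: repeatedly picks a legal DAG
 * head (or emits a NOP if none is legal yet), tries to pack a second head
 * into the same instruction, and re-emits the uniforms consumed in the new
 * instruction order.  Returns the estimated cycle count for the block.
 */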
static uint32_t
schedule_instructions(struct vc4_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        uint32_t time = 0;

        while (!list_is_empty(&scoreboard->dag->heads)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(scoreboard->dag);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        pre_remove_head(scoreboard->dag, chosen);
                        if (chosen->uniform != -1) {
                                c->uniform_data[*next_uniform] =
                                        orig_uniform_data[chosen->uniform];
                                c->uniform_contents[*next_uniform] =
                                        orig_uniform_contents[chosen->uniform];
                                (*next_uniform)++;
                        }

                        merge = choose_instruction_to_schedule(scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);
                                if (merge->uniform != -1) {
                                        c->uniform_data[*next_uniform] =
                                                orig_uniform_data[merge->uniform];
                                        c->uniform_contents[*next_uniform] =
                                                orig_uniform_contents[merge->uniform];
                                        (*next_uniform)++;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, "            resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(scoreboard->dag, time, chosen);
                mark_instruction_scheduled(scoreboard->dag, time, merge);

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                    QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
                        emit_thrsw(c, scoreboard, inst);
                } else {
                        qpu_serialize_one_inst(c, inst);
                        update_scoreboard_for_chosen(scoreboard, inst);
                }

                scoreboard->tick++;
                time++;

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
                        block->branch_qpu_ip = c->qpu_inst_count - 1;
                        /* Fill the delay slots.
                         *
                         * We should fill these with actual instructions,
                         * instead, but that will probably need to be done
                         * after this, once we know what the leading
                         * instructions of the successors are (so we can
                         * handle A/B register file write latency)
                         */
                        inst = qpu_NOP();
                        update_scoreboard_for_chosen(scoreboard, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                }
        }

        return time;
}

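/* Schedules one basic block: wraps each queued instruction in a
 * schedule_node, builds the dependency DAG with the forward and reverse
 * passes, computes critical-path delays, and runs the scheduling loop.
 */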
static uint32_t
qpu_schedule_instructions_block(struct vc4_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        scoreboard->dag = dag_create(NULL);
        struct list_head setup_list;

        list_inithead(&setup_list);

        /* Wrap each instruction in a scheduler structure. */
        uint32_t next_sched_uniform = *next_uniform;
        while (!list_is_empty(&block->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)block->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(scoreboard->dag,
                                                  struct schedule_node);

                dag_init_node(scoreboard->dag, &n->dag);
                n->inst = inst;

                if (reads_uniform(inst->inst)) {
                        n->uniform = next_sched_uniform++;
                } else {
                        n->uniform = -1;
                }
                list_del(&inst->link);
                list_addtail(&n->link, &setup_list);
        }

        calculate_forward_deps(c, scoreboard->dag, &setup_list);
        calculate_reverse_deps(c, scoreboard->dag, &setup_list);

        dag_traverse_bottom_up(scoreboard->dag, compute_delay, NULL);

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                &setup_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(scoreboard->dag);
        scoreboard->dag = NULL;

        return cycles;
}

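/* Branch targets are relative: the target field holds the distance in bytes
 * from the end of the branch's three delay slots (branch_qpu_ip + 4
 * instructions) to the first instruction of successors[0].
 */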
static void
qpu_set_branch_targets(struct vc4_compile *c)
{
        qir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Set the branch target for the block that doesn't follow
                 * immediately after ours.
                 */
                uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
                assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
                assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);

                uint32_t branch_target =
                        (block->successors[0]->start_qpu_ip -
                         (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
                *branch_inst = (*branch_inst |
                                QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));

                /* Make sure that the if-we-don't-jump successor was scheduled
                 * just after the delay slots.
                 */
                if (block->successors[1]) {
                        assert(block->successors[1]->start_qpu_ip ==
                               block->branch_qpu_ip + 4);
                }
        }
}

uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct queued_qpu_inst, q,
                                            &block->qpu_inst_list, link) {
                                vc4_qpu_disasm(&q->inst, 1);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        qir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        return cycles;
}