1 /*
2  * Copyright © 2010 Intel Corporation
3  * Copyright © 2014-2017 Broadcom
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 /**
26  * @file
27  *
28  * The basic model of the list scheduler is to take a basic block, compute a
29  * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
30  * pick a DAG head, then put all the children that are now DAG heads into the
31  * list of things to schedule.
32  *
33  * The goal of scheduling here is to pack pairs of operations together in a
34  * single QPU instruction.
35  */
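
/* Rough sketch of the flow below (not an exact call sequence):
 *
 *   for each basic block:
 *     build the dependency DAG with a forward walk and a reverse walk
 *     over the instructions (calculate_forward_deps() /
 *     calculate_reverse_deps());
 *     while DAG heads remain:
 *       pick the best head (priority, critical-path delay, stall checks);
 *       try to merge a second head into the other ALU half of the same
 *       QPU instruction;
 *       advance the scoreboard state and unblock the chosen node's
 *       children.
 */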
36 
37 #include "qpu/qpu_disasm.h"
38 #include "v3d_compiler.h"
39 #include "util/ralloc.h"
40 #include "util/dag.h"
41 
42 static bool debug;
43 
44 struct schedule_node_child;
45 
46 struct schedule_node {
47         struct dag_node dag;
48         struct list_head link;
49         struct qinst *inst;
50 
51         /* Longest cycles + instruction_latency() of any parent of this node. */
52         uint32_t unblocked_time;
53 
54         /**
55          * Minimum number of cycles from scheduling this instruction until the
56          * end of the program, based on the slowest dependency chain through
57          * the children.
58          */
59         uint32_t delay;
60 
61         /**
62          * cycles between this instruction being scheduled and when its result
63          * can be consumed.
64          */
65         uint32_t latency;
66 };
67 
68 /* When walking the instructions in reverse, we need to swap before/after in
69  * add_dep().
70  */
71 enum direction { F, R };
72 
73 struct schedule_state {
74         const struct v3d_device_info *devinfo;
75         struct dag *dag;
76         struct schedule_node *last_r[6];
77         struct schedule_node *last_rf[64];
78         struct schedule_node *last_sf;
79         struct schedule_node *last_vpm_read;
80         struct schedule_node *last_tmu_write;
81         struct schedule_node *last_tmu_config;
82         struct schedule_node *last_tmu_read;
83         struct schedule_node *last_tlb;
84         struct schedule_node *last_vpm;
85         struct schedule_node *last_unif;
86         struct schedule_node *last_rtop;
87         struct schedule_node *last_unifa;
88         struct schedule_node *last_setmsf;
89         enum direction dir;
90         /* Estimated cycle when the current instruction would start. */
91         uint32_t time;
92 };
93 
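/* Note on the helpers below: add_write_dep() orders a node after the
 * previous writer of a resource and records it as the new last writer,
 * while add_read_dep() only orders the reader after that last writer, so
 * independent readers remain free to reorder among themselves.  The same
 * walk runs forward and in reverse (see enum direction above), which is
 * how both read-after-write and write-after-read orderings become DAG
 * edges.
 */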
94 static void
95 add_dep(struct schedule_state *state,
96         struct schedule_node *before,
97         struct schedule_node *after,
98         bool write)
99 {
100         bool write_after_read = !write && state->dir == R;
101         uintptr_t edge_data = write_after_read;
102 
103         if (!before || !after)
104                 return;
105 
106         assert(before != after);
107 
108         if (state->dir == F)
109                 dag_add_edge(&before->dag, &after->dag, edge_data);
110         else
111                 dag_add_edge(&after->dag, &before->dag, edge_data);
112 }
113 
114 static void
115 add_read_dep(struct schedule_state *state,
116               struct schedule_node *before,
117               struct schedule_node *after)
118 {
119         add_dep(state, before, after, false);
120 }
121 
122 static void
123 add_write_dep(struct schedule_state *state,
124               struct schedule_node **before,
125               struct schedule_node *after)
126 {
127         add_dep(state, *before, after, true);
128         *before = after;
129 }
130 
131 static bool
132 qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
133 {
134         if (inst->sig.ldtlb || inst->sig.ldtlbu)
135                 return true;
136 
137         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
138                 return false;
139 
140         if (inst->alu.add.op != V3D_QPU_A_NOP &&
141             inst->alu.add.magic_write &&
142             (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
143              inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
144                 return true;
145 
146         if (inst->alu.mul.op != V3D_QPU_M_NOP &&
147             inst->alu.mul.magic_write &&
148             (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
149              inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
150                 return true;
151 
152         return false;
153 }
154 
155 static void
156 process_mux_deps(struct schedule_state *state, struct schedule_node *n,
157                  enum v3d_qpu_mux mux)
158 {
159         assert(state->devinfo->ver < 71);
160         switch (mux) {
161         case V3D_QPU_MUX_A:
162                 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
163                 break;
164         case V3D_QPU_MUX_B:
165                 if (!n->inst->qpu.sig.small_imm_b) {
166                         add_read_dep(state,
167                                      state->last_rf[n->inst->qpu.raddr_b], n);
168                 }
169                 break;
170         default:
171                 add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
172                 break;
173         }
174 }
175 
176 
177 static void
178 process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
179                    uint8_t raddr, bool is_small_imm)
180 {
181         assert(state->devinfo->ver >= 71);
182 
183         if (!is_small_imm)
184                 add_read_dep(state, state->last_rf[raddr], n);
185 }
186 
187 static bool
188 tmu_write_is_sequence_terminator(uint32_t waddr)
189 {
190         switch (waddr) {
191         case V3D_QPU_WADDR_TMUS:
192         case V3D_QPU_WADDR_TMUSCM:
193         case V3D_QPU_WADDR_TMUSF:
194         case V3D_QPU_WADDR_TMUSLOD:
195         case V3D_QPU_WADDR_TMUA:
196         case V3D_QPU_WADDR_TMUAU:
197                 return true;
198         default:
199                 return false;
200         }
201 }
202 
203 static bool
204 can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
205 {
206         if (tmu_write_is_sequence_terminator(waddr))
207                 return false;
208 
209         if (waddr == V3D_QPU_WADDR_TMUD)
210                 return false;
211 
212         return true;
213 }
214 
215 static void
216 process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
217                    uint32_t waddr, bool magic)
218 {
219         if (!magic) {
220                 add_write_dep(state, &state->last_rf[waddr], n);
221         } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
222                 if (can_reorder_tmu_write(state->devinfo, waddr))
223                         add_read_dep(state, state->last_tmu_write, n);
224                 else
225                         add_write_dep(state, &state->last_tmu_write, n);
226 
227                 if (tmu_write_is_sequence_terminator(waddr))
228                         add_write_dep(state, &state->last_tmu_config, n);
229         } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
230                 /* Handled by v3d_qpu_writes_r4() check. */
231         } else {
232                 switch (waddr) {
233                 case V3D_QPU_WADDR_R0:
234                 case V3D_QPU_WADDR_R1:
235                 case V3D_QPU_WADDR_R2:
236                         add_write_dep(state,
237                                       &state->last_r[waddr - V3D_QPU_WADDR_R0],
238                                       n);
239                         break;
240                 case V3D_QPU_WADDR_R3:
241                 case V3D_QPU_WADDR_R4:
242                 case V3D_QPU_WADDR_R5:
243                         /* Handled by v3d_qpu_writes_r*() checks below. */
244                         break;
245 
246                 case V3D_QPU_WADDR_VPM:
247                 case V3D_QPU_WADDR_VPMU:
248                         add_write_dep(state, &state->last_vpm, n);
249                         break;
250 
251                 case V3D_QPU_WADDR_TLB:
252                 case V3D_QPU_WADDR_TLBU:
253                         add_write_dep(state, &state->last_tlb, n);
254                         break;
255 
256                 case V3D_QPU_WADDR_SYNC:
257                 case V3D_QPU_WADDR_SYNCB:
258                 case V3D_QPU_WADDR_SYNCU:
259                         /* For CS barrier(): Sync against any other memory
260                          * accesses.  There doesn't appear to be any need for
261                          * barriers to affect ALU operations.
262                          */
263                         add_write_dep(state, &state->last_tmu_write, n);
264                         add_write_dep(state, &state->last_tmu_read, n);
265                         break;
266 
267                 case V3D_QPU_WADDR_UNIFA:
268                         add_write_dep(state, &state->last_unifa, n);
269                         break;
270 
271                 case V3D_QPU_WADDR_NOP:
272                         break;
273 
274                 default:
275                         fprintf(stderr, "Unknown waddr %d\n", waddr);
276                         abort();
277                 }
278         }
279 }
280 
281 /**
282  * Common code for dependencies that need to be tracked both forward and
283  * backward.
284  *
285  * This is for things like "all reads of r4 have to happen between the r4
286  * writes that surround them".
287  */
288 static void
289 calculate_deps(struct schedule_state *state, struct schedule_node *n)
290 {
291         const struct v3d_device_info *devinfo = state->devinfo;
292         struct qinst *qinst = n->inst;
293         struct v3d_qpu_instr *inst = &qinst->qpu;
294         /* If the input and output segments are shared, then all VPM reads to
295          * a location need to happen before all writes.  We handle this by
296          * serializing all VPM operations for now.
297          *
298          * FIXME: we are assuming that the segments are shared. That is
299          * correct right now as we are only using shared, but technically you
300          * can choose.
301          */
302         bool separate_vpm_segment = false;
303 
304         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
305                 if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
306                         add_read_dep(state, state->last_sf, n);
307 
308                 /* XXX: BDI */
309                 /* XXX: BDU */
310                 /* XXX: ub */
311                 /* XXX: raddr_a */
312 
313                 add_write_dep(state, &state->last_unif, n);
314                 return;
315         }
316 
317         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
318 
319         /* XXX: LOAD_IMM */
320 
321         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
322                 if (devinfo->ver < 71) {
323                         process_mux_deps(state, n, inst->alu.add.a.mux);
324                 } else {
325                         process_raddr_deps(state, n, inst->alu.add.a.raddr,
326                                            inst->sig.small_imm_a);
327                 }
328         }
329         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
330                 if (devinfo->ver < 71) {
331                         process_mux_deps(state, n, inst->alu.add.b.mux);
332                 } else {
333                         process_raddr_deps(state, n, inst->alu.add.b.raddr,
334                                            inst->sig.small_imm_b);
335                 }
336         }
337 
338         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
339                 if (devinfo->ver < 71) {
340                         process_mux_deps(state, n, inst->alu.mul.a.mux);
341                 } else {
342                         process_raddr_deps(state, n, inst->alu.mul.a.raddr,
343                                            inst->sig.small_imm_c);
344                 }
345         }
346         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
347                 if (devinfo->ver < 71) {
348                         process_mux_deps(state, n, inst->alu.mul.b.mux);
349                 } else {
350                         process_raddr_deps(state, n, inst->alu.mul.b.raddr,
351                                            inst->sig.small_imm_d);
352                 }
353         }
354 
355         switch (inst->alu.add.op) {
356         case V3D_QPU_A_VPMSETUP:
357                 /* Could distinguish read/write by unpacking the uniform. */
358                 add_write_dep(state, &state->last_vpm, n);
359                 add_write_dep(state, &state->last_vpm_read, n);
360                 break;
361 
362         case V3D_QPU_A_STVPMV:
363         case V3D_QPU_A_STVPMD:
364         case V3D_QPU_A_STVPMP:
365                 add_write_dep(state, &state->last_vpm, n);
366                 break;
367 
368         case V3D_QPU_A_LDVPMV_IN:
369         case V3D_QPU_A_LDVPMD_IN:
370         case V3D_QPU_A_LDVPMG_IN:
371         case V3D_QPU_A_LDVPMP:
372                 if (!separate_vpm_segment)
373                         add_write_dep(state, &state->last_vpm, n);
374                 break;
375 
376         case V3D_QPU_A_VPMWT:
377                 add_read_dep(state, state->last_vpm, n);
378                 break;
379 
380         case V3D_QPU_A_MSF:
381                 add_read_dep(state, state->last_tlb, n);
382                 add_read_dep(state, state->last_setmsf, n);
383                 break;
384 
385         case V3D_QPU_A_SETMSF:
386                 add_write_dep(state, &state->last_setmsf, n);
387                 add_write_dep(state, &state->last_tmu_write, n);
388                 FALLTHROUGH;
389         case V3D_QPU_A_SETREVF:
390                 add_write_dep(state, &state->last_tlb, n);
391                 break;
392 
393         case V3D_QPU_A_BALLOT:
394         case V3D_QPU_A_BCASTF:
395         case V3D_QPU_A_ALLEQ:
396         case V3D_QPU_A_ALLFEQ:
397                 add_read_dep(state, state->last_setmsf, n);
398                 break;
399 
400         default:
401                 break;
402         }
403 
404         switch (inst->alu.mul.op) {
405         case V3D_QPU_M_MULTOP:
406         case V3D_QPU_M_UMUL24:
407                 /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
408                  * resets it to 0.  We could possibly reorder umul24s relative
409                  * to each other, but for now just keep all the MUL parts in
410                  * order.
411                  */
412                 add_write_dep(state, &state->last_rtop, n);
413                 break;
414         default:
415                 break;
416         }
417 
418         if (inst->alu.add.op != V3D_QPU_A_NOP) {
419                 process_waddr_deps(state, n, inst->alu.add.waddr,
420                                    inst->alu.add.magic_write);
421         }
422         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
423                 process_waddr_deps(state, n, inst->alu.mul.waddr,
424                                    inst->alu.mul.magic_write);
425         }
426         if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
427                 process_waddr_deps(state, n, inst->sig_addr,
428                                    inst->sig_magic);
429         }
430 
431         if (v3d_qpu_writes_r3(devinfo, inst))
432                 add_write_dep(state, &state->last_r[3], n);
433         if (v3d_qpu_writes_r4(devinfo, inst))
434                 add_write_dep(state, &state->last_r[4], n);
435         if (v3d_qpu_writes_r5(devinfo, inst))
436                 add_write_dep(state, &state->last_r[5], n);
437         if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
438                 add_write_dep(state, &state->last_rf[0], n);
439 
440         /* If we add any more dependencies here we should consider whether we
441          * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
442          */
443         if (inst->sig.thrsw) {
444                 /* All accumulator contents and flags are undefined after the
445                  * switch.
446                  */
447                 for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
448                         add_write_dep(state, &state->last_r[i], n);
449                 add_write_dep(state, &state->last_sf, n);
450                 add_write_dep(state, &state->last_rtop, n);
451 
452                 /* Scoreboard-locking operations have to stay after the last
453                  * thread switch.
454                  */
455                 add_write_dep(state, &state->last_tlb, n);
456 
457                 add_write_dep(state, &state->last_tmu_write, n);
458                 add_write_dep(state, &state->last_tmu_config, n);
459         }
460 
461         if (v3d_qpu_waits_on_tmu(inst)) {
462                 /* TMU loads are coming from a FIFO, so ordering is important.
463                  */
464                 add_write_dep(state, &state->last_tmu_read, n);
465                 /* Keep TMU loads after their TMU lookup terminator */
466                 add_read_dep(state, state->last_tmu_config, n);
467         }
468 
469         /* Allow wrtmuc to be reordered with other instructions in the
470          * same TMU sequence by using a read dependency on the last TMU
471          * sequence terminator.
472          */
473         if (inst->sig.wrtmuc)
474                 add_read_dep(state, state->last_tmu_config, n);
475 
476         if (inst->sig.ldtlb | inst->sig.ldtlbu)
477                 add_write_dep(state, &state->last_tlb, n);
478 
479         if (inst->sig.ldvpm) {
480                 add_write_dep(state, &state->last_vpm_read, n);
481 
482                 /* At least for now, we're doing shared I/O segments, so queue
483                  * all writes after all reads.
484                  */
485                 if (!separate_vpm_segment)
486                         add_write_dep(state, &state->last_vpm, n);
487         }
488 
489         /* inst->sig.ldunif or sideband uniform read */
490         if (vir_has_uniform(qinst))
491                 add_write_dep(state, &state->last_unif, n);
492 
493         /* Both unifa and ldunifa must preserve ordering */
494         if (inst->sig.ldunifa || inst->sig.ldunifarf)
495                 add_write_dep(state, &state->last_unifa, n);
496 
497         if (v3d_qpu_reads_flags(inst))
498                 add_read_dep(state, state->last_sf, n);
499         if (v3d_qpu_writes_flags(inst))
500                 add_write_dep(state, &state->last_sf, n);
501 }
502 
503 static void
504 calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
505                        struct list_head *schedule_list)
506 {
507         struct schedule_state state;
508 
509         memset(&state, 0, sizeof(state));
510         state.dag = dag;
511         state.devinfo = c->devinfo;
512         state.dir = F;
513 
514         list_for_each_entry(struct schedule_node, node, schedule_list, link)
515                 calculate_deps(&state, node);
516 }
517 
518 static void
519 calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
520                        struct list_head *schedule_list)
521 {
522         struct schedule_state state;
523 
524         memset(&state, 0, sizeof(state));
525         state.dag = dag;
526         state.devinfo = c->devinfo;
527         state.dir = R;
528 
529         list_for_each_entry_rev(struct schedule_node, node, schedule_list,
530                                 link) {
531                 calculate_deps(&state, (struct schedule_node *)node);
532         }
533 }
534 
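/* State consulted by the hazard checks while instructions are being
 * emitted: the current tick plus the ticks at which recent events (SFU
 * writes, ldvary, thrsw, branches, ...) happened.
 */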
535 struct choose_scoreboard {
536         struct dag *dag;
537         int tick;
538         int last_magic_sfu_write_tick;
539         int last_stallable_sfu_reg;
540         int last_stallable_sfu_tick;
541         int last_ldvary_tick;
542         int last_unifa_write_tick;
543         int last_uniforms_reset_tick;
544         int last_thrsw_tick;
545         int last_branch_tick;
546         int last_setmsf_tick;
547         bool first_thrsw_emitted;
548         bool last_thrsw_emitted;
549         bool fixup_ldvary;
550         int ldvary_count;
551         int pending_ldtmu_count;
552         bool first_ldtmu_after_thrsw;
553 
554         /* V3D 7.x */
555         int last_implicit_rf0_write_tick;
556         bool has_rf0_flops_conflict;
557 };
558 
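/* r4 only holds an SFU result a couple of instructions after the SFU
 * write, and ldvary's delayed write (to r5, or to rf0 on V3D 7.x) lands
 * one instruction late, so reads issued too close to the producer would
 * see stale data.  The tick comparisons below encode the exact distances.
 */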
559 static bool
560 mux_reads_too_soon(struct choose_scoreboard *scoreboard,
561                    const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
562 {
563         switch (mux) {
564         case V3D_QPU_MUX_R4:
565                 if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
566                         return true;
567                 break;
568 
569         case V3D_QPU_MUX_R5:
570                 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
571                         return true;
572                 break;
573         default:
574                 break;
575         }
576 
577         return false;
578 }
579 
580 static bool
581 reads_too_soon(struct choose_scoreboard *scoreboard,
582                const struct v3d_qpu_instr *inst, uint8_t raddr)
583 {
584         switch (raddr) {
585         case 0: /* ldvary delayed write of C coefficient to rf0 */
586                 if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
587                         return true;
588                 break;
589         default:
590                 break;
591         }
592 
593         return false;
594 }
595 
596 static bool
597 reads_too_soon_after_write(const struct v3d_device_info *devinfo,
598                            struct choose_scoreboard *scoreboard,
599                            struct qinst *qinst)
600 {
601         const struct v3d_qpu_instr *inst = &qinst->qpu;
602 
603         /* XXX: Branching off of raddr. */
604         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
605                 return false;
606 
607         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
608 
609         if (inst->alu.add.op != V3D_QPU_A_NOP) {
610                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
611                         if (devinfo->ver < 71) {
612                                 if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
613                                         return true;
614                         } else {
615                                 if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
616                                         return true;
617                         }
618                 }
619                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
620                         if (devinfo->ver < 71) {
621                                 if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
622                                         return true;
623                         } else {
624                                 if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
625                                         return true;
626                         }
627                 }
628         }
629 
630         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
631                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
632                         if (devinfo->ver < 71) {
633                                 if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
634                                         return true;
635                         } else {
636                                 if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
637                                         return true;
638                         }
639                 }
640                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
641                         if (devinfo->ver < 71) {
642                                 if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
643                                         return true;
644                         } else {
645                                 if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
646                                         return true;
647                         }
648                 }
649         }
650 
651         /* XXX: imm */
652 
653         return false;
654 }
655 
656 static bool
657 writes_too_soon_after_write(const struct v3d_device_info *devinfo,
658                             struct choose_scoreboard *scoreboard,
659                             struct qinst *qinst)
660 {
661         const struct v3d_qpu_instr *inst = &qinst->qpu;
662 
663         /* Don't schedule any other r4 write too soon after an SFU write.
664          * This would normally be prevented by dependency tracking, but might
665          * occur if a dead SFU computation makes it to scheduling.
666          */
667         if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
668             v3d_qpu_writes_r4(devinfo, inst))
669                 return true;
670 
671         if (devinfo->ver == 42)
672                 return false;
673 
674         /* Don't schedule anything that writes rf0 right after ldvary, since
675          * that would clash with the ldvary's delayed rf0 write (the exception
676          * is another ldvary, since its implicit rf0 write would also have
677          * one cycle of delay and would not clash).
678          */
679         if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
680             (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
681              (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
682               !inst->sig.ldvary))) {
683                 return true;
684         }
685 
686         return false;
687 }
688 
689 static bool
690 scoreboard_is_locked(struct choose_scoreboard *scoreboard,
691                      bool lock_scoreboard_on_first_thrsw)
692 {
693         if (lock_scoreboard_on_first_thrsw) {
694                 return scoreboard->first_thrsw_emitted &&
695                        scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
696         }
697 
698         return scoreboard->last_thrsw_emitted &&
699                scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
700 }
701 
702 static bool
703 pixel_scoreboard_too_soon(struct v3d_compile *c,
704                           struct choose_scoreboard *scoreboard,
705                           const struct v3d_qpu_instr *inst)
706 {
707         return qpu_inst_is_tlb(inst) &&
708                !scoreboard_is_locked(scoreboard,
709                                      c->lock_scoreboard_on_first_thrsw);
710 }
711 
712 static bool
713 qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
714                         const struct v3d_qpu_instr *inst,
715                         uint32_t waddr) {
716 
717         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
718                 return false;
719 
720         if (devinfo->ver < 71) {
721                 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
722                     inst->raddr_a == waddr)
723                         return true;
724 
725                 if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
726                     !inst->sig.small_imm_b && (inst->raddr_b == waddr))
727                         return true;
728         } else {
729                 if (v3d71_qpu_reads_raddr(inst, waddr))
730                         return true;
731         }
732 
733         return false;
734 }
735 
736 static bool
737 read_stalls(const struct v3d_device_info *devinfo,
738             struct choose_scoreboard *scoreboard,
739             const struct v3d_qpu_instr *inst)
740 {
741         return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
742                 qpu_instruction_uses_rf(devinfo, inst,
743                                         scoreboard->last_stallable_sfu_reg);
744 }
745 
746 /* We define a max schedule priority to allow negative priorities as a result of
747  * subtracting this max when an instruction stalls. So instructions that
748  * stall have lower priority than regular instructions. */
749 #define MAX_SCHEDULE_PRIORITY 16
750 
751 static int
752 get_instruction_priority(const struct v3d_device_info *devinfo,
753                          const struct v3d_qpu_instr *inst)
754 {
755         uint32_t baseline_score;
756         uint32_t next_score = 0;
757 
758         /* Schedule TLB operations as late as possible, to get more
759          * parallelism between shaders.
760          */
761         if (qpu_inst_is_tlb(inst))
762                 return next_score;
763         next_score++;
764 
765         /* Empirical testing shows that using priorities to hide latency of
766          * TMU operations when scheduling QPU leads to slightly worse
767          * performance, even at 2 threads. We think this is because the thread
768          * switching is already quite effective at hiding latency and NIR
769          * scheduling (and possibly TMU pipelining too) are sufficient to hide
770          * TMU latency, so piling up on that here doesn't provide any benefits
771          * and instead may cause us to postpone critical paths that depend on
772          * the TMU results.
773          */
774 #if 0
775         /* Schedule texture read results collection late to hide latency. */
776         if (v3d_qpu_waits_on_tmu(inst))
777                 return next_score;
778         next_score++;
779 #endif
780 
781         /* Default score for things that aren't otherwise special. */
782         baseline_score = next_score;
783         next_score++;
784 
785 #if 0
786         /* Schedule texture read setup early to hide their latency better. */
787         if (v3d_qpu_writes_tmu(devinfo, inst))
788                 return next_score;
789         next_score++;
790 #endif
791 
792         /* We should increase the maximum if this assertion fires */
793         assert(next_score < MAX_SCHEDULE_PRIORITY);
794 
795         return baseline_score;
796 }
797 
798 enum {
799         V3D_PERIPHERAL_VPM_READ           = (1 << 0),
800         V3D_PERIPHERAL_VPM_WRITE          = (1 << 1),
801         V3D_PERIPHERAL_VPM_WAIT           = (1 << 2),
802         V3D_PERIPHERAL_SFU                = (1 << 3),
803         V3D_PERIPHERAL_TMU_WRITE          = (1 << 4),
804         V3D_PERIPHERAL_TMU_READ           = (1 << 5),
805         V3D_PERIPHERAL_TMU_WAIT           = (1 << 6),
806         V3D_PERIPHERAL_TMU_WRTMUC_SIG     = (1 << 7),
807         V3D_PERIPHERAL_TSY                = (1 << 8),
808         V3D_PERIPHERAL_TLB_READ           = (1 << 9),
809         V3D_PERIPHERAL_TLB_WRITE          = (1 << 10),
810 };
811 
812 static uint32_t
813 qpu_peripherals(const struct v3d_device_info *devinfo,
814                 const struct v3d_qpu_instr *inst)
815 {
816         uint32_t result = 0;
817         if (v3d_qpu_reads_vpm(inst))
818                 result |= V3D_PERIPHERAL_VPM_READ;
819         if (v3d_qpu_writes_vpm(inst))
820                 result |= V3D_PERIPHERAL_VPM_WRITE;
821         if (v3d_qpu_waits_vpm(inst))
822                 result |= V3D_PERIPHERAL_VPM_WAIT;
823 
824         if (v3d_qpu_writes_tmu(devinfo, inst))
825                 result |= V3D_PERIPHERAL_TMU_WRITE;
826         if (inst->sig.ldtmu)
827                 result |= V3D_PERIPHERAL_TMU_READ;
828         if (inst->sig.wrtmuc)
829                 result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;
830 
831         if (v3d_qpu_uses_sfu(inst))
832                 result |= V3D_PERIPHERAL_SFU;
833 
834         if (v3d_qpu_reads_tlb(inst))
835                 result |= V3D_PERIPHERAL_TLB_READ;
836         if (v3d_qpu_writes_tlb(inst))
837                 result |= V3D_PERIPHERAL_TLB_WRITE;
838 
839         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
840                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
841                     inst->alu.add.magic_write &&
842                     v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
843                         result |= V3D_PERIPHERAL_TSY;
844                 }
845 
846                 if (inst->alu.add.op == V3D_QPU_A_TMUWT)
847                         result |= V3D_PERIPHERAL_TMU_WAIT;
848         }
849 
850         return result;
851 }
852 
853 static bool
854 qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
855                                  const struct v3d_qpu_instr *a,
856                                  const struct v3d_qpu_instr *b)
857 {
858         const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
859         const uint32_t b_peripherals = qpu_peripherals(devinfo, b);
860 
861         /* We can always do one peripheral access per instruction. */
862         if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
863                 return true;
864 
865         /* V3D 4.x can't do more than one peripheral access except in a
866          * few cases:
867          */
868         if (devinfo->ver == 42) {
869                 /* WRTMUC signal with TMU register write (other than tmuc). */
870                 if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
871                     b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
872                         return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
873                 }
874                 if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
875                     a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
876                         return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
877                 }
878 
879                 /* TMU read with VPM read/write. */
880                 if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
881                     (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
882                      b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
883                         return true;
884                 }
885                 if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
886                     (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
887                      a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
888                         return true;
889                 }
890 
891                 return false;
892         }
893 
894         /* V3D 7.x can't have more than one of these restricted peripherals */
895         const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
896                                     V3D_PERIPHERAL_TMU_WRTMUC_SIG |
897                                     V3D_PERIPHERAL_TSY |
898                                     V3D_PERIPHERAL_TLB_READ |
899                                     V3D_PERIPHERAL_SFU |
900                                     V3D_PERIPHERAL_VPM_READ |
901                                     V3D_PERIPHERAL_VPM_WRITE;
902 
903         const uint32_t a_restricted = a_peripherals & restricted;
904         const uint32_t b_restricted = b_peripherals & restricted;
905         if (a_restricted && b_restricted) {
906                 /* WRTMUC signal with TMU register write (other than tmuc) is
907                  * allowed though.
908                  */
909                 if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
910                        b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
911                        v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
912                       (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
913                        a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
914                        v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
915                         return false;
916                 }
917         }
918 
919         /* Only one TMU read per instruction */
920         if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
921             (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
922                 return false;
923         }
924 
925         /* Only one TLB access per instruction */
926         if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
927                               V3D_PERIPHERAL_TLB_READ)) &&
928             (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
929                               V3D_PERIPHERAL_TLB_READ))) {
930                 return false;
931         }
932 
933         return true;
934 }
935 
936 /* Compute a bitmask of which rf registers are used between
937  * the two instructions.
938  */
939 static uint64_t
940 qpu_raddrs_used(const struct v3d_qpu_instr *a,
941                 const struct v3d_qpu_instr *b)
942 {
943         assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
944         assert(b->type == V3D_QPU_INSTR_TYPE_ALU);
945 
946         uint64_t raddrs_used = 0;
947         if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
948                 raddrs_used |= (UINT64_C(1) << a->raddr_a);
949         if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
950                 raddrs_used |= (UINT64_C(1) << a->raddr_b);
951         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
952                 raddrs_used |= (UINT64_C(1) << b->raddr_a);
953         if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
954                 raddrs_used |= (UINT64_C(1) << b->raddr_b);
955 
956         return raddrs_used;
957 }
958 
959 /* Takes two instructions and attempts to merge their raddr fields (including
960  * small immediates) into one merged instruction. For V3D 4.x, returns false
961  * if the two instructions access more than two different rf registers between
962  * them, or more than one rf register and one small immediate. For 7.x returns
963  * false if both instructions use small immediates.
964  */
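/* For example, on V3D 4.x an add half reading rf3 and rf5 can merge with
 * a mul half that also reads rf3 (two distinct rf addresses in total),
 * but not with one reading rf7 (three), and using a small immediate
 * leaves room for only a single rf address.  Illustrative only; the
 * checks below are authoritative.
 */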
965 static bool
966 qpu_merge_raddrs(struct v3d_qpu_instr *result,
967                  const struct v3d_qpu_instr *add_instr,
968                  const struct v3d_qpu_instr *mul_instr,
969                  const struct v3d_device_info *devinfo)
970 {
971         if (devinfo->ver >= 71) {
972                 assert(add_instr->sig.small_imm_a +
973                        add_instr->sig.small_imm_b <= 1);
974                 assert(add_instr->sig.small_imm_c +
975                        add_instr->sig.small_imm_d == 0);
976                 assert(mul_instr->sig.small_imm_a +
977                        mul_instr->sig.small_imm_b == 0);
978                 assert(mul_instr->sig.small_imm_c +
979                        mul_instr->sig.small_imm_d <= 1);
980 
981                 result->sig.small_imm_a = add_instr->sig.small_imm_a;
982                 result->sig.small_imm_b = add_instr->sig.small_imm_b;
983                 result->sig.small_imm_c = mul_instr->sig.small_imm_c;
984                 result->sig.small_imm_d = mul_instr->sig.small_imm_d;
985 
986                 return (result->sig.small_imm_a +
987                         result->sig.small_imm_b +
988                         result->sig.small_imm_c +
989                         result->sig.small_imm_d) <= 1;
990         }
991 
992         assert(devinfo->ver == 42);
993 
994         uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
995         int naddrs = util_bitcount64(raddrs_used);
996 
997         if (naddrs > 2)
998                 return false;
999 
1000         if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
1001                 if (naddrs > 1)
1002                         return false;
1003 
1004                 if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
1005                         if (add_instr->raddr_b != mul_instr->raddr_b)
1006                                 return false;
1007 
1008                 result->sig.small_imm_b = true;
1009                 result->raddr_b = add_instr->sig.small_imm_b ?
1010                         add_instr->raddr_b : mul_instr->raddr_b;
1011         }
1012 
1013         if (naddrs == 0)
1014                 return true;
1015 
1016         int raddr_a = ffsll(raddrs_used) - 1;
1017         raddrs_used &= ~(UINT64_C(1) << raddr_a);
1018         result->raddr_a = raddr_a;
1019 
1020         if (!result->sig.small_imm_b) {
1021                 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
1022                     raddr_a == add_instr->raddr_b) {
1023                         if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
1024                                 result->alu.add.a.mux = V3D_QPU_MUX_A;
1025                         if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
1026                             v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
1027                                 result->alu.add.b.mux = V3D_QPU_MUX_A;
1028                         }
1029                 }
1030                 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
1031                     raddr_a == mul_instr->raddr_b) {
1032                         if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
1033                                 result->alu.mul.a.mux = V3D_QPU_MUX_A;
1034                         if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
1035                             v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
1036                                 result->alu.mul.b.mux = V3D_QPU_MUX_A;
1037                         }
1038                 }
1039         }
1040         if (!raddrs_used)
1041                 return true;
1042 
1043         int raddr_b = ffsll(raddrs_used) - 1;
1044         result->raddr_b = raddr_b;
1045         if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
1046             raddr_b == add_instr->raddr_a) {
1047                 if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
1048                         result->alu.add.a.mux = V3D_QPU_MUX_B;
1049                 if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
1050                     v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
1051                         result->alu.add.b.mux = V3D_QPU_MUX_B;
1052                 }
1053         }
1054         if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
1055             raddr_b == mul_instr->raddr_a) {
1056                 if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
1057                         result->alu.mul.a.mux = V3D_QPU_MUX_B;
1058                 if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
1059                     v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
1060                         result->alu.mul.b.mux = V3D_QPU_MUX_B;
1061                 }
1062         }
1063 
1064         return true;
1065 }
1066 
1067 static bool
1068 can_do_add_as_mul(enum v3d_qpu_add_op op)
1069 {
1070         switch (op) {
1071         case V3D_QPU_A_ADD:
1072         case V3D_QPU_A_SUB:
1073                 return true;
1074         default:
1075                 return false;
1076         }
1077 }
1078 
1079 static enum v3d_qpu_mul_op
1080 add_op_as_mul_op(enum v3d_qpu_add_op op)
1081 {
1082         switch (op) {
1083         case V3D_QPU_A_ADD:
1084                 return V3D_QPU_M_ADD;
1085         case V3D_QPU_A_SUB:
1086                 return V3D_QPU_M_SUB;
1087         default:
1088                 unreachable("unexpected add opcode");
1089         }
1090 }
1091 
1092 static void
1093 qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
1094                        struct v3d_qpu_instr *inst)
1095 {
1096         STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
1097         assert(inst->alu.add.op != V3D_QPU_A_NOP);
1098         assert(inst->alu.mul.op == V3D_QPU_M_NOP);
1099 
1100         memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
1101         inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
1102         inst->alu.add.op = V3D_QPU_A_NOP;
1103 
1104         inst->flags.mc = inst->flags.ac;
1105         inst->flags.mpf = inst->flags.apf;
1106         inst->flags.muf = inst->flags.auf;
1107         inst->flags.ac = V3D_QPU_COND_NONE;
1108         inst->flags.apf = V3D_QPU_PF_NONE;
1109         inst->flags.auf = V3D_QPU_UF_NONE;
1110 
1111         inst->alu.mul.output_pack = inst->alu.add.output_pack;
1112 
1113         inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
1114         inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
1115         inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
1116         inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
1117         inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
1118 
1119         if (devinfo->ver >= 71) {
1120                 assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
1121                 assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
1122                 if (inst->sig.small_imm_a) {
1123                         inst->sig.small_imm_c = true;
1124                         inst->sig.small_imm_a = false;
1125                 } else if (inst->sig.small_imm_b) {
1126                         inst->sig.small_imm_d = true;
1127                         inst->sig.small_imm_b = false;
1128                 }
1129         }
1130 }
1131 
1132 static bool
1133 can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
1134 {
1135         switch (op) {
1136         case V3D_QPU_M_MOV:
1137         case V3D_QPU_M_FMOV:
1138                 return devinfo->ver >= 71;
1139         default:
1140                 return false;
1141         }
1142 }
1143 
1144 static enum v3d_qpu_mul_op
1145 mul_op_as_add_op(enum v3d_qpu_mul_op op)
1146 {
1147         switch (op) {
1148         case V3D_QPU_M_MOV:
1149                 return V3D_QPU_A_MOV;
1150         case V3D_QPU_M_FMOV:
1151                 return V3D_QPU_A_FMOV;
1152         default:
1153                 unreachable("unexpected mov opcode");
1154         }
1155 }
1156 
1157 static void
1158 qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
1159 {
1160         STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
1161         assert(inst->alu.mul.op != V3D_QPU_M_NOP);
1162         assert(inst->alu.add.op == V3D_QPU_A_NOP);
1163 
1164         memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
1165         inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
1166         inst->alu.mul.op = V3D_QPU_M_NOP;
1167 
1168         inst->flags.ac = inst->flags.mc;
1169         inst->flags.apf = inst->flags.mpf;
1170         inst->flags.auf = inst->flags.muf;
1171         inst->flags.mc = V3D_QPU_COND_NONE;
1172         inst->flags.mpf = V3D_QPU_PF_NONE;
1173         inst->flags.muf = V3D_QPU_UF_NONE;
1174 
1175         inst->alu.add.output_pack = inst->alu.mul.output_pack;
1176         inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
1177         inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
1178         inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
1179         inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
1180         inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
1181 
1182         assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
1183         assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
1184         if (inst->sig.small_imm_c) {
1185                 inst->sig.small_imm_a = true;
1186                 inst->sig.small_imm_c = false;
1187         } else if (inst->sig.small_imm_d) {
1188                 inst->sig.small_imm_b = true;
1189                 inst->sig.small_imm_d = false;
1190         }
1191 }
1192 
1193 static bool
1194 qpu_merge_inst(const struct v3d_device_info *devinfo,
1195                struct v3d_qpu_instr *result,
1196                const struct v3d_qpu_instr *a,
1197                const struct v3d_qpu_instr *b)
1198 {
1199         if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
1200             b->type != V3D_QPU_INSTR_TYPE_ALU) {
1201                 return false;
1202         }
1203 
1204         if (!qpu_compatible_peripheral_access(devinfo, a, b))
1205                 return false;
1206 
1207         struct v3d_qpu_instr merge = *a;
1208         const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;
1209 
1210         struct v3d_qpu_instr mul_inst;
1211         if (b->alu.add.op != V3D_QPU_A_NOP) {
1212                 if (a->alu.add.op == V3D_QPU_A_NOP) {
1213                         merge.alu.add = b->alu.add;
1214 
1215                         merge.flags.ac = b->flags.ac;
1216                         merge.flags.apf = b->flags.apf;
1217                         merge.flags.auf = b->flags.auf;
1218 
1219                         add_instr = b;
1220                         mul_instr = a;
1221                 }
1222                 /* If a's add op is used but its mul op is not, then see if we
1223                  * can convert either a's add op or b's add op to a mul op
1224                  * so we can merge.
1225                  */
1226                 else if (a->alu.mul.op == V3D_QPU_M_NOP &&
1227                          can_do_add_as_mul(b->alu.add.op)) {
1228                         mul_inst = *b;
1229                         qpu_convert_add_to_mul(devinfo, &mul_inst);
1230 
1231                         merge.alu.mul = mul_inst.alu.mul;
1232 
1233                         merge.flags.mc = mul_inst.flags.mc;
1234                         merge.flags.mpf = mul_inst.flags.mpf;
1235                         merge.flags.muf = mul_inst.flags.muf;
1236 
1237                         add_instr = a;
1238                         mul_instr = &mul_inst;
1239                 } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
1240                            can_do_add_as_mul(a->alu.add.op)) {
1241                         mul_inst = *a;
1242                         qpu_convert_add_to_mul(devinfo, &mul_inst);
1243 
1244                         merge = mul_inst;
1245                         merge.alu.add = b->alu.add;
1246 
1247                         merge.flags.ac = b->flags.ac;
1248                         merge.flags.apf = b->flags.apf;
1249                         merge.flags.auf = b->flags.auf;
1250 
1251                         add_instr = b;
1252                         mul_instr = &mul_inst;
1253                 } else {
1254                         return false;
1255                 }
1256         }
1257 
1258         struct v3d_qpu_instr add_inst;
1259         if (b->alu.mul.op != V3D_QPU_M_NOP) {
1260                 if (a->alu.mul.op == V3D_QPU_M_NOP) {
1261                         merge.alu.mul = b->alu.mul;
1262 
1263                         merge.flags.mc = b->flags.mc;
1264                         merge.flags.mpf = b->flags.mpf;
1265                         merge.flags.muf = b->flags.muf;
1266 
1267                         mul_instr = b;
1268                         add_instr = a;
1269                 }
1270                 /* If a's mul op is used but its add op is not, then see if we
1271                  * can convert either a's mul op or b's mul op to an add op
1272                  * so we can merge.
1273                  */
1274                 else if (a->alu.add.op == V3D_QPU_A_NOP &&
1275                          can_do_mul_as_add(devinfo, b->alu.mul.op)) {
1276                         add_inst = *b;
1277                         qpu_convert_mul_to_add(&add_inst);
1278 
1279                         merge.alu.add = add_inst.alu.add;
1280 
1281                         merge.flags.ac = add_inst.flags.ac;
1282                         merge.flags.apf = add_inst.flags.apf;
1283                         merge.flags.auf = add_inst.flags.auf;
1284 
1285                         mul_instr = a;
1286                         add_instr = &add_inst;
1287                 } else if (a->alu.add.op == V3D_QPU_A_NOP &&
1288                            can_do_mul_as_add(devinfo, a->alu.mul.op)) {
1289                         add_inst = *a;
1290                         qpu_convert_mul_to_add(&add_inst);
1291 
1292                         merge = add_inst;
1293                         merge.alu.mul = b->alu.mul;
1294 
1295                         merge.flags.mc = b->flags.mc;
1296                         merge.flags.mpf = b->flags.mpf;
1297                         merge.flags.muf = b->flags.muf;
1298 
1299                         mul_instr = b;
1300                         add_instr = &add_inst;
1301                 } else {
1302                         return false;
1303                 }
1304         }
1305 
1306         /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
1307          * they have restrictions on the number of raddrs that can be addressed
1308          * in a single instruction. In V3D 7.x, we don't have that restriction,
1309          * but we are still limited to a single small immediate per instruction.
1310          */
1311         if (add_instr && mul_instr &&
1312             !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
1313                 return false;
1314         }
1315 
1316         merge.sig.thrsw |= b->sig.thrsw;
1317         merge.sig.ldunif |= b->sig.ldunif;
1318         merge.sig.ldunifrf |= b->sig.ldunifrf;
1319         merge.sig.ldunifa |= b->sig.ldunifa;
1320         merge.sig.ldunifarf |= b->sig.ldunifarf;
1321         merge.sig.ldtmu |= b->sig.ldtmu;
1322         merge.sig.ldvary |= b->sig.ldvary;
1323         merge.sig.ldvpm |= b->sig.ldvpm;
1324         merge.sig.ldtlb |= b->sig.ldtlb;
1325         merge.sig.ldtlbu |= b->sig.ldtlbu;
1326         merge.sig.ucb |= b->sig.ucb;
1327         merge.sig.rotate |= b->sig.rotate;
1328         merge.sig.wrtmuc |= b->sig.wrtmuc;
1329 
1330         if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
1331             v3d_qpu_sig_writes_address(devinfo, &b->sig))
1332                 return false;
1333         merge.sig_addr |= b->sig_addr;
1334         merge.sig_magic |= b->sig_magic;
1335 
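             /* Try to encode the merged instruction; packing fails when the
              * combined operations, operands and signals cannot be represented
              * in a single QPU instruction.
              */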
1336         uint64_t packed;
1337         bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);
1338 
1339         *result = merge;
1340         /* No modifying the real instructions on failure. */
1341         assert(ok || (a != result && b != result));
1342 
1343         return ok;
1344 }
1345 
1346 static inline bool
1347 try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
1348 {
1349         return inst->sig.ldunif || inst->sig.ldunifrf;
1350 }
1351 
1352 static bool
1353 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
1354                                          struct choose_scoreboard *scoreboard,
1355                                          const struct qinst *qinst);
1356 
1357 static struct schedule_node *
1358 choose_instruction_to_schedule(struct v3d_compile *c,
1359                                struct choose_scoreboard *scoreboard,
1360                                struct schedule_node *prev_inst)
1361 {
1362         struct schedule_node *chosen = NULL;
1363         int chosen_prio = 0;
1364 
1365         /* Don't pair up anything with a thread switch signal -- emit_thrsw()
1366          * will handle pairing it along with filling the delay slots.
1367          */
1368         if (prev_inst) {
1369                 if (prev_inst->inst->qpu.sig.thrsw)
1370                         return NULL;
1371         }
1372 
1373         bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
1374                                  scoreboard->ldvary_count < c->num_inputs;
1375         bool skipped_insts_for_ldvary_pipelining = false;
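             /* While a fragment shader still has varyings left to load, bias
              * scheduling towards ldvary pipelining by skipping over candidates
              * that load uniforms (see try_skip_for_ldvary_pipelining).
              */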
1376 retry:
1377         list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
1378                             dag.link) {
1379                 const struct v3d_qpu_instr *inst = &n->inst->qpu;
1380 
1381                 if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
1382                         skipped_insts_for_ldvary_pipelining = true;
1383                         continue;
1384                 }
1385 
1386                 /* Don't choose the branch instruction until it's the last one
1387                  * left.  We'll move it up to fit its delay slots after we
1388                  * choose it.
1389                  */
1390                 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
1391                     !list_is_singular(&scoreboard->dag->heads)) {
1392                         continue;
1393                 }
1394 
1395                 /* We need to have 3 delay slots between a write to unifa and
1396                  * a follow-up ldunifa.
1397                  */
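                     /* For example, if unifa was written at tick T, an ldunifa
                      * candidate only becomes eligible at tick T+4 or later
                      * (tick - last_unifa_write_tick > 3).
                      */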
1398                 if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
1399                     scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
1400                         continue;
1401 
1402                 /* "An instruction must not read from a location in physical
1403                  *  regfile A or B that was written to by the previous
1404                  *  instruction."
1405                  */
1406                 if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1407                         continue;
1408 
1409                 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
1410                         continue;
1411 
1412                 /* "Before doing a TLB access a scoreboard wait must have been
1413                  *  done. This happens either on the first or last thread
1414                  *  switch, depending on a setting (scb_wait_on_first_thrsw) in
1415                  *  the shader state."
1416                  */
1417                 if (pixel_scoreboard_too_soon(c, scoreboard, inst))
1418                         continue;
1419 
1420                 /* ldunif and ldvary both write the same register (r5 for v42
1421                  * and below, rf0 for v71), but ldunif does so a tick sooner.
1422                  * If the ldvary's register wasn't used, then ldunif might
1423                  * otherwise get scheduled so ldunif and ldvary try to update
1424                  * the register in the same tick.
1425                  */
1426                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
1427                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
1428                         continue;
1429                 }
1430 
1431                 /* If we are in a thrsw delay slot, check that this instruction
1432                  * is valid for that.
1433                  */
1434                 if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
1435                     !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
1436                                                               n->inst)) {
1437                         continue;
1438                 }
1439 
1440                 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
1441                         /* Don't try to put a branch in the delay slots of another
1442                          * branch or a unifa write.
1443                          */
1444                         if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
1445                                 continue;
1446                         if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
1447                                 continue;
1448 
1449                         /* No branch with cond != 0,2,3 and msfign != 0 after
1450                          * setmsf.
1451                          */
1452                         if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
1453                             inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
1454                             inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
1455                             inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
1456                             inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
1457                                 continue;
1458                         }
1459                 }
1460 
1461                 /* If we're trying to pair with another instruction, check
1462                  * that they're compatible.
1463                  */
1464                 if (prev_inst) {
1465                         /* Don't pair up a thread switch signal -- we'll
1466                          * handle pairing it when we pick it on its own.
1467                          */
1468                         if (inst->sig.thrsw)
1469                                 continue;
1470 
1471                         if (prev_inst->inst->uniform != -1 &&
1472                             n->inst->uniform != -1)
1473                                 continue;
1474 
1475                        /* Simulator complains if we have two uniforms loaded in
1476                         * the same instruction, which could happen if we
1477                         * have a ldunif or sideband uniform and we pair that
1478                         * with ldunifa.
1479                         */
1480                         if (vir_has_uniform(prev_inst->inst) &&
1481                             (inst->sig.ldunifa || inst->sig.ldunifarf)) {
1482                                 continue;
1483                         }
1484 
1485                         if ((prev_inst->inst->qpu.sig.ldunifa ||
1486                              prev_inst->inst->qpu.sig.ldunifarf) &&
1487                             vir_has_uniform(n->inst)) {
1488                                 continue;
1489                         }
1490 
1491                         /* Don't merge TLB instructions before we have acquired
1492                          * the scoreboard lock.
1493                          */
1494                         if (pixel_scoreboard_too_soon(c, scoreboard, inst))
1495                                 continue;
1496 
1497                         /* When we successfully pair up an ldvary we then try
1498                          * to merge it into the previous instruction if
1499                          * possible to improve pipelining. Don't pick up the
1500                          * ldvary now if the follow-up fixup would place
1501                          * it in the delay slots of a thrsw, which is not
1502                          * allowed and would prevent the fixup from being
1503                          * successful. In V3D 7.x we can allow this to happen
1504                          * as long as it is not the last delay slot.
1505                          */
1506                         if (inst->sig.ldvary) {
1507                                 if (c->devinfo->ver == 42 &&
1508                                     scoreboard->last_thrsw_tick + 2 >=
1509                                     scoreboard->tick - 1) {
1510                                         continue;
1511                                 }
1512                                 if (c->devinfo->ver >= 71 &&
1513                                     scoreboard->last_thrsw_tick + 2 ==
1514                                     scoreboard->tick - 1) {
1515                                         continue;
1516                                 }
1517                         }
1518 
1519                         /* We can emit a new tmu lookup with a previous ldtmu
1520                          * if doing this would free just enough space in the
1521                          * TMU output fifo so we don't overflow, however, this
1522                          * is only safe if the ldtmu cannot stall.
1523                          *
1524                          * A ldtmu can stall if it is not the first following a
1525                          * thread switch and corresponds to the first word of a
1526                          * read request.
1527                          *
1528                          * FIXME: For now we forbid pairing up a new lookup
1529                          * with a previous ldtmu that is not the first after a
1530                          * thrsw if that could overflow the TMU output fifo
1531                          * regardless of whether the ldtmu is reading the first
1532                          * word of a TMU result or not, since we don't track
1533                          * this aspect in the compiler yet.
1534                          */
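                             /* The 16 / c->threads budget presumably models a
                              * 16-entry TMU output fifo shared between the
                              * threads running on the QPU.
                              */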
1535                         if (prev_inst->inst->qpu.sig.ldtmu &&
1536                             !scoreboard->first_ldtmu_after_thrsw &&
1537                             (scoreboard->pending_ldtmu_count +
1538                              n->inst->ldtmu_count > 16 / c->threads)) {
1539                                 continue;
1540                         }
1541 
1542                         struct v3d_qpu_instr merged_inst;
1543                         if (!qpu_merge_inst(c->devinfo, &merged_inst,
1544                                             &prev_inst->inst->qpu, inst)) {
1545                                 continue;
1546                         }
1547                 }
1548 
1549                 int prio = get_instruction_priority(c->devinfo, inst);
1550 
1551                 if (read_stalls(c->devinfo, scoreboard, inst)) {
1552                         /* Don't merge an instruction that stalls */
1553                         if (prev_inst)
1554                                 continue;
1555                         else {
1556                                 /* Any instruction that doesn't stall will have
1557                                  * higher scheduling priority */
1558                                 prio -= MAX_SCHEDULE_PRIORITY;
1559                                 assert(prio < 0);
1560                         }
1561                 }
1562 
1563                 /* Found a valid instruction.  If nothing better comes along,
1564                  * this one works.
1565                  */
1566                 if (!chosen) {
1567                         chosen = n;
1568                         chosen_prio = prio;
1569                         continue;
1570                 }
1571 
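                     /* Prefer the candidate with the higher priority; on a tie,
                      * fall through and break it on the longest remaining
                      * dependency chain (delay) below.
                      */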
1572                 if (prio > chosen_prio) {
1573                         chosen = n;
1574                         chosen_prio = prio;
1575                 } else if (prio < chosen_prio) {
1576                         continue;
1577                 }
1578 
1579                 if (n->delay > chosen->delay) {
1580                         chosen = n;
1581                         chosen_prio = prio;
1582                 } else if (n->delay < chosen->delay) {
1583                         continue;
1584                 }
1585         }
1586 
1587         /* If we did not find any instruction to schedule but we discarded
1588          * some of them to prioritize ldvary pipelining, try again.
1589          */
1590         if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
1591                 skipped_insts_for_ldvary_pipelining = false;
1592                 ldvary_pipelining = false;
1593                 goto retry;
1594         }
1595 
1596         if (chosen && chosen->inst->qpu.sig.ldvary) {
1597                 scoreboard->ldvary_count++;
1598                 /* If we are pairing an ldvary, flag it so we can fix it up for
1599                  * optimal pipelining of ldvary sequences.
1600                  */
1601                 if (prev_inst)
1602                         scoreboard->fixup_ldvary = true;
1603         }
1604 
1605         return chosen;
1606 }
1607 
1608 static void
1609 update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
1610                                   enum v3d_qpu_waddr waddr,
1611                                   const struct v3d_device_info *devinfo)
1612 {
1613         if (v3d_qpu_magic_waddr_is_sfu(waddr))
1614                 scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
1615         else if (waddr == V3D_QPU_WADDR_UNIFA)
1616                 scoreboard->last_unifa_write_tick = scoreboard->tick;
1617 }
1618 
1619 static void
1620 update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
1621                                       const struct v3d_qpu_instr *inst)
1622 {
1623         if (v3d_qpu_instr_is_sfu(inst)) {
1624                 scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
1625                 scoreboard->last_stallable_sfu_tick = scoreboard->tick;
1626         }
1627 }
1628 
1629 static void
1630 update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
1631                                const struct qinst *inst)
1632 {
1633         /* Track if we have seen any ldtmu after the last thread switch */
1634         if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
1635                 scoreboard->first_ldtmu_after_thrsw = true;
1636 
1637         /* Track the number of pending ldtmu instructions for outstanding
1638          * TMU lookups.
1639          */
1640         scoreboard->pending_ldtmu_count += inst->ldtmu_count;
1641         if (inst->qpu.sig.ldtmu) {
1642                 assert(scoreboard->pending_ldtmu_count > 0);
1643                 scoreboard->pending_ldtmu_count--;
1644                 scoreboard->first_ldtmu_after_thrsw = false;
1645         }
1646 }
1647 
1648 static void
1649 set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
1650                            const struct v3d_qpu_instr *inst,
1651                            const struct v3d_device_info *devinfo)
1652 {
1653         if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
1654             v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
1655             !inst->sig_magic) {
1656                 scoreboard->has_rf0_flops_conflict = true;
1657         }
1658 }
1659 
1660 static void
1661 update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
1662                                 const struct v3d_qpu_instr *inst,
1663                                 const struct v3d_device_info *devinfo)
1664 {
1665         if (devinfo->ver < 71)
1666                 return;
1667 
1668         /* Thread switch restrictions:
1669          *
1670          * At the point of a thread switch or thread end (when the actual
1671          * thread switch or thread end happens, not when the signalling
1672          * instruction is processed):
1673          *
1674          *    - If the most recent write to rf0 was from a ldunif, ldunifa, or
1675          *      ldvary instruction in which another signal also wrote to the
1676          *      register file, and the final instruction of the thread section
1677          *      contained a signal which wrote to the register file, then the
1678          *      value of rf0 is undefined at the start of the new section
1679          *
1680          * Here we use the scoreboard to track if our last rf0 implicit write
1681          * happens at the same time that another signal writes the register
1682          * file (has_rf0_flops_conflict). We will use that information when
1683          * scheduling thrsw instructions to avoid putting anything in their
1684          * last delay slot which has a signal that writes to the register file.
1685          */
1686 
1687         /* Reset tracking if we have an explicit rf0 write or we are starting
1688          * a new thread section.
1689          */
1690         if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
1691             scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
1692                 scoreboard->last_implicit_rf0_write_tick = -10;
1693                 scoreboard->has_rf0_flops_conflict = false;
1694         }
1695 
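             /* ldvary's implicit rf0 write lands one tick after the
              * instruction, while ldunif/ldunifa write in the same tick, so
              * record the tick accordingly.
              */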
1696         if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
1697                 scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
1698                         scoreboard->tick + 1 : scoreboard->tick;
1699         }
1700 
1701         set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
1702 }
1703 
1704 static void
1705 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
1706                              const struct qinst *qinst,
1707                              const struct v3d_device_info *devinfo)
1708 {
1709         const struct v3d_qpu_instr *inst = &qinst->qpu;
1710 
1711         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1712                 return;
1713 
1714         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1715 
1716         if (inst->alu.add.op != V3D_QPU_A_NOP)  {
1717                 if (inst->alu.add.magic_write) {
1718                         update_scoreboard_for_magic_waddr(scoreboard,
1719                                                           inst->alu.add.waddr,
1720                                                           devinfo);
1721                 } else {
1722                         update_scoreboard_for_sfu_stall_waddr(scoreboard,
1723                                                               inst);
1724                 }
1725 
1726                 if (inst->alu.add.op == V3D_QPU_A_SETMSF)
1727                         scoreboard->last_setmsf_tick = scoreboard->tick;
1728         }
1729 
1730         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
1731                 if (inst->alu.mul.magic_write) {
1732                         update_scoreboard_for_magic_waddr(scoreboard,
1733                                                           inst->alu.mul.waddr,
1734                                                           devinfo);
1735                 }
1736         }
1737 
1738         if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) {
1739                 update_scoreboard_for_magic_waddr(scoreboard,
1740                                                   inst->sig_addr,
1741                                                   devinfo);
1742         }
1743 
1744         if (inst->sig.ldvary)
1745                 scoreboard->last_ldvary_tick = scoreboard->tick;
1746 
1747         update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
1748 
1749         update_scoreboard_tmu_tracking(scoreboard, qinst);
1750 }
1751 
1752 static void
1753 dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
1754 {
1755         list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
1756                 fprintf(stderr, "         t=%4d: ", n->unblocked_time);
1757                 v3d_qpu_dump(devinfo, &n->inst->qpu);
1758                 fprintf(stderr, "\n");
1759 
1760                 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1761                         struct schedule_node *child =
1762                                 (struct schedule_node *)edge->child;
1763                         if (!child)
1764                                 continue;
1765 
1766                         fprintf(stderr, "                 - ");
1767                         v3d_qpu_dump(devinfo, &child->inst->qpu);
1768                         fprintf(stderr, " (%d parents, %c)\n",
1769                                 child->dag.parent_count,
1770                                 edge->data ? 'w' : 'r');
1771                 }
1772         }
1773 }
1774 
1775 static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
1776                                     enum v3d_qpu_waddr waddr,
1777                                     const struct v3d_qpu_instr *after)
1778 {
1779         /* Apply some huge latency between texture fetch requests and getting
1780          * their results back.
1781          *
1782          * FIXME: This is actually pretty bogus.  If we do:
1783          *
1784          * mov tmu0_s, a
1785          * <a bit of math>
1786          * mov tmu0_s, b
1787          * load_tmu0
1788          * <more math>
1789          * load_tmu0
1790          *
1791          * we count that as worse than
1792          *
1793          * mov tmu0_s, a
1794          * mov tmu0_s, b
1795          * <lots of math>
1796          * load_tmu0
1797          * <more math>
1798          * load_tmu0
1799          *
1800          * because we associate the first load_tmu0 with the *second* tmu0_s.
1801          */
1802         if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
1803             v3d_qpu_waits_on_tmu(after)) {
1804                 return 100;
1805         }
1806 
1807         /* Assume that anything depending on us is consuming the SFU result. */
1808         if (v3d_qpu_magic_waddr_is_sfu(waddr))
1809                 return 3;
1810 
1811         return 1;
1812 }
1813 
1814 static uint32_t
1815 instruction_latency(const struct v3d_device_info *devinfo,
1816                     struct schedule_node *before, struct schedule_node *after)
1817 {
1818         const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
1819         const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
1820         uint32_t latency = 1;
1821 
1822         if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
1823             after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
1824                 return latency;
1825 
1826         if (v3d_qpu_instr_is_sfu(before_inst))
1827                 return 2;
1828 
1829         if (before_inst->alu.add.op != V3D_QPU_A_NOP &&
1830             before_inst->alu.add.magic_write) {
1831                 latency = MAX2(latency,
1832                                magic_waddr_latency(devinfo,
1833                                                    before_inst->alu.add.waddr,
1834                                                    after_inst));
1835         }
1836 
1837         if (before_inst->alu.mul.op != V3D_QPU_M_NOP &&
1838             before_inst->alu.mul.magic_write) {
1839                 latency = MAX2(latency,
1840                                magic_waddr_latency(devinfo,
1841                                                    before_inst->alu.mul.waddr,
1842                                                    after_inst));
1843         }
1844 
1845         return latency;
1846 }
1847 
1848 /** Recursive computation of the delay member of a node. */
1849 static void
1850 compute_delay(struct dag_node *node, void *state)
1851 {
1852         struct schedule_node *n = (struct schedule_node *)node;
1853         struct v3d_compile *c = (struct v3d_compile *) state;
1854 
1855         n->delay = 1;
1856 
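             /* delay is the longest dependency chain, in estimated cycles,
              * from this node to the end of the block: the maximum over all
              * children of the child's delay plus the latency of this edge.
              */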
1857         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1858                 struct schedule_node *child =
1859                         (struct schedule_node *)edge->child;
1860 
1861                 n->delay = MAX2(n->delay, (child->delay +
1862                                            instruction_latency(c->devinfo, n,
1863                                                                child)));
1864         }
1865 }
1866 
1867 /* Removes a DAG head, removing only the WAR edges. (dag_prune_head()
1868  * should be called on it later to finish pruning the other edges).
1869  */
1870 static void
1871 pre_remove_head(struct dag *dag, struct schedule_node *n)
1872 {
1873         list_delinit(&n->dag.link);
1874 
1875         util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1876                 if (edge->data)
1877                         dag_remove_edge(dag, edge);
1878         }
1879 }
1880 
1881 static void
1882 mark_instruction_scheduled(const struct v3d_device_info *devinfo,
1883                            struct dag *dag,
1884                            uint32_t time,
1885                            struct schedule_node *node)
1886 {
1887         if (!node)
1888                 return;
1889 
1890         util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
1891                 struct schedule_node *child =
1892                         (struct schedule_node *)edge->child;
1893 
1894                 if (!child)
1895                         continue;
1896 
1897                 uint32_t latency = instruction_latency(devinfo, node, child);
1898 
1899                 child->unblocked_time = MAX2(child->unblocked_time,
1900                                              time + latency);
1901         }
1902         dag_prune_head(dag, &node->dag);
1903 }
1904 
1905 static void
1906 insert_scheduled_instruction(struct v3d_compile *c,
1907                              struct qblock *block,
1908                              struct choose_scoreboard *scoreboard,
1909                              struct qinst *inst)
1910 {
1911         list_addtail(&inst->link, &block->instructions);
1912 
1913         update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
1914         c->qpu_inst_count++;
1915         scoreboard->tick++;
1916 }
1917 
1918 static struct qinst *
1919 vir_nop()
1920 {
1921         struct qreg undef = vir_nop_reg();
1922         struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1923 
1924         return qinst;
1925 }
1926 
1927 static void
1928 emit_nop(struct v3d_compile *c, struct qblock *block,
1929          struct choose_scoreboard *scoreboard)
1930 {
1931         insert_scheduled_instruction(c, block, scoreboard, vir_nop());
1932 }
1933 
1934 static bool
1935 qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
1936                               const struct qinst *qinst, int slot)
1937 {
1938         const struct v3d_qpu_instr *inst = &qinst->qpu;
1939 
1940         if (slot == 2 && qinst->is_tlb_z_write)
1941                 return false;
1942 
1943         if (slot > 0 && qinst->uniform != ~0)
1944                 return false;
1945 
1946         if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst))
1947                 return false;
1948 
1949         if (inst->sig.ldvary)
1950                 return false;
1951 
1952         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1953                 /* GFXH-1625: TMUWT not allowed in the final instruction. */
1954                 if (c->devinfo->ver == 42 && slot == 2 &&
1955                     inst->alu.add.op == V3D_QPU_A_TMUWT) {
1956                         return false;
1957                 }
1958 
1959                 if (c->devinfo->ver == 42) {
1960                         /* No writing physical registers at the end. */
1961                         bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
1962                         bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
1963                         if ((!add_is_nop && !inst->alu.add.magic_write) ||
1964                             (!mul_is_nop && !inst->alu.mul.magic_write)) {
1965                                 return false;
1966                         }
1967 
1968                         if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
1969                             !inst->sig_magic) {
1970                                 return false;
1971                         }
1972                 }
1973 
1974                 if (c->devinfo->ver >= 71) {
1975                         /* The thread end instruction must not write to the
1976                          * register file via the add/mul ALUs.
1977                          */
1978                         if (slot == 0 &&
1979                             (!inst->alu.add.magic_write ||
1980                              !inst->alu.mul.magic_write)) {
1981                                 return false;
1982                         }
1983                 }
1984 
1985                 if (c->devinfo->ver == 42) {
1986                         /* RF0-2 might be overwritten during the delay slots by
1987                          * fragment shader setup.
1988                          */
1989                         if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
1990                                 return false;
1991 
1992                         if (inst->raddr_b < 3 &&
1993                             !inst->sig.small_imm_b &&
1994                             v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
1995                                 return false;
1996                         }
1997                 }
1998 
1999                 if (c->devinfo->ver >= 71) {
2000                         /* RF2-3 might be overwritten during the delay slots by
2001                          * fragment shader setup.
2002                          */
2003                         if (v3d71_qpu_reads_raddr(inst, 2) ||
2004                             v3d71_qpu_reads_raddr(inst, 3)) {
2005                                 return false;
2006                         }
2007 
2008                         if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
2009                             v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
2010                                 return false;
2011                         }
2012                 }
2013         }
2014 
2015         return true;
2016 }
2017 
2018 /**
2019  * This is called when trying to merge a thrsw back into the instruction stream
2020  * of instructions that were scheduled *before* the thrsw signal to fill its
2021  * delay slots. Because the actual execution of the thrsw happens after the
2022  * delay slots, it is usually safe to do this, but there are some cases that
2023  * need special care.
2024  */
2025 static bool
2026 qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2027                                           struct choose_scoreboard *scoreboard,
2028                                           const struct qinst *qinst,
2029                                           uint32_t slot)
2030 {
2031         /* No scheduling SFU when the result would land in the other
2032          * thread.  The simulator complains for safety, though it
2033          * would only occur for dead code in our case.
2034          */
2035         if (slot > 0) {
2036                 if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
2037                         return false;
2038                 if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu))
2039                         return false;
2040         }
2041 
2042         if (qinst->qpu.sig.ldvary) {
2043                 if (c->devinfo->ver == 42 && slot > 0)
2044                         return false;
2045                 if (c->devinfo->ver >= 71 && slot == 2)
2046                         return false;
2047         }
2048 
2049         /* unifa and the following 3 instructions can't overlap a
2050          * thread switch/end. The docs further clarify that this means
2051          * the cycle at which the actual thread switch/end happens
2052          * and not when the thrsw instruction is processed, which would
2053          * be after the 2 delay slots following the thrsw instruction.
2054          * This means that we can move a thrsw up to the instruction
2055          * right after unifa:
2056          *
2057          * unifa, r5
2058          * thrsw
2059          * delay slot 1
2060          * delay slot 2
2061          * Thread switch happens here, 4 instructions away from unifa
2062          */
2063         if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
2064                 return false;
2065 
2066         /* See comment when we set has_rf0_flops_conflict for details */
2067         if (c->devinfo->ver >= 71 &&
2068             slot == 2 &&
2069             v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
2070             !qinst->qpu.sig_magic) {
2071                 if (scoreboard->has_rf0_flops_conflict)
2072                         return false;
2073                 if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
2074                         return false;
2075         }
2076 
2077         return true;
2078 }
2079 
2080 /**
2081  * This is called for instructions scheduled *after* a thrsw signal that may
2082  * land in the delay slots of the thrsw. Because these instructions were
2083  * scheduled after the thrsw, we need to be careful when placing them into
2084  * the delay slots, since that means that we are moving them ahead of the
2085  * thread switch and we need to ensure that is not a problem.
2086  */
2087 static bool
2088 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2089                                          struct choose_scoreboard *scoreboard,
2090                                          const struct qinst *qinst)
2091 {
2092         const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
2093         assert(slot <= 2);
2094 
2095         /* We merge thrsw instructions back into the instruction stream
2096          * manually, so any instructions scheduled after a thrsw should be
2097          * in the actual delay slots and not in the same slot as the thrsw.
2098          */
2099         assert(slot >= 1);
2100 
2101         /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
2102         if (qinst->qpu.sig.thrsw)
2103                 return false;
2104 
2105         /* The restrictions for instructions scheduled before the thrsw
2106          * also apply to instructions scheduled after the thrsw that we want
2107          * to place in its delay slots.
2108          */
2109         if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
2110                 return false;
2111 
2112         /* TLB access is disallowed until scoreboard wait is executed, which
2113          * we do on the last thread switch.
2114          */
2115         if (qpu_inst_is_tlb(&qinst->qpu))
2116                 return false;
2117 
2118         /* Instruction sequence restrictions: Branch is not allowed in delay
2119          * slots of a thrsw.
2120          */
2121         if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2122                 return false;
2123 
2124         /* Miscellaneous restrictions: At the point of a thrsw we need to have
2125          * at least one outstanding lookup or TSY wait.
2126          *
2127          * So avoid placing TMU instructions scheduled after the thrsw into
2128          * its delay slots or we may be compromising the integrity of our TMU
2129          * sequences. Also, notice that if we moved these instructions into
2130          * the delay slots of a previous thrsw we could overflow our TMU output
2131          * fifo, since we could be effectively pipelining a lookup scheduled
2132          * after the thrsw into the sequence before the thrsw.
2133          */
2134         if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
2135             qinst->qpu.sig.wrtmuc) {
2136                 return false;
2137         }
2138 
2139         /* Don't move instructions that wait on the TMU before the thread switch
2140          * happens since that would make the current thread stall before the
2141          * switch, which is exactly what we want to avoid with the thrsw
2142          * instruction.
2143          */
2144         if (v3d_qpu_waits_on_tmu(&qinst->qpu))
2145                 return false;
2146 
2147         /* A thread switch invalidates all accumulators, so don't place any
2148          * instructions that write accumulators into the delay slots.
2149          */
2150         if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
2151                 return false;
2152 
2153         /* Multop has an implicit write to the rtop register which is a
2154          * specialized accumulator that is only used with this instruction.
2155          */
2156         if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
2157                 return false;
2158 
2159         /* Flags are invalidated across a thread switch, so don't place
2160          * instructions that write flags into delay slots.
2161          */
2162         if (v3d_qpu_writes_flags(&qinst->qpu))
2163                 return false;
2164 
2165         /* TSY sync ops materialize at the point of the next thread switch,
2166          * therefore, if we have a TSY sync right after a thread switch, we
2167          * cannot place it in its delay slots, or we would be moving the sync
2168          * to the thrsw before it instead.
2169          */
2170         if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
2171                 return false;
2172 
2173         return true;
2174 }
2175 
2176 static bool
2177 valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
2178                      struct qinst *qinst, int instructions_in_sequence,
2179                      bool is_thrend)
2180 {
2181         for (int slot = 0; slot < instructions_in_sequence; slot++) {
2182                 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
2183                                                                qinst, slot)) {
2184                         return false;
2185                 }
2186 
2187                 if (is_thrend &&
2188                     !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
2189                         return false;
2190                 }
2191 
2192                 /* Note that the list is circular, so we can only do this up
2193                  * to instructions_in_sequence.
2194                  */
2195                 qinst = (struct qinst *)qinst->link.next;
2196         }
2197 
2198         return true;
2199 }
2200 
2201 /**
2202  * Emits a THRSW signal in the stream, trying to move it up to pair with
2203  * another instruction.
2204  */
2205 static int
2206 emit_thrsw(struct v3d_compile *c,
2207            struct qblock *block,
2208            struct choose_scoreboard *scoreboard,
2209            struct qinst *inst,
2210            bool is_thrend)
2211 {
2212         int time = 0;
2213 
2214         /* There should be nothing in a thrsw inst being scheduled other than
2215          * the signal bits.
2216          */
2217         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
2218         assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
2219         assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
2220 
2221         /* Don't try to emit a thrsw in the delay slots of a previous thrsw
2222          * or branch.
2223          */
2224         while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
2225                 emit_nop(c, block, scoreboard);
2226                 time++;
2227         }
2228         while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
2229                 emit_nop(c, block, scoreboard);
2230                 time++;
2231         }
2232 
2233         /* Find how far back into previous instructions we can put the THRSW. */
2234         int slots_filled = 0;
2235         int invalid_sig_count = 0;
2236         int invalid_seq_count = 0;
2237         bool last_thrsw_after_invalid_ok = false;
2238         struct qinst *merge_inst = NULL;
2239         vir_for_each_inst_rev(prev_inst, block) {
2240                 /* No emitting our thrsw while the previous thrsw hasn't
2241                  * happened yet.
2242                  */
2243                 if (scoreboard->last_thrsw_tick + 3 >
2244                     scoreboard->tick - (slots_filled + 1)) {
2245                         break;
2246                 }
2247 
2248 
2249                 if (!valid_thrsw_sequence(c, scoreboard,
2250                                           prev_inst, slots_filled + 1,
2251                                           is_thrend)) {
2252                         /* Even if the current sequence isn't valid, we may
2253                          * be able to get a valid sequence by trying to move the
2254                          * thrsw earlier, so keep going.
2255                          */
2256                         invalid_seq_count++;
2257                         goto cont_block;
2258                 }
2259 
2260                 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
2261                 sig.thrsw = true;
2262                 uint32_t packed_sig;
2263                 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
2264                         /* If we can't merge the thrsw here because of signal
2265                          * incompatibility, keep going, we might be able to
2266                          * merge it in an earlier instruction.
2267                          */
2268                         invalid_sig_count++;
2269                         goto cont_block;
2270                 }
2271 
2272                 /* For last thrsw we need 2 consecutive slots that are
2273                  * thrsw compatible, so if we have previously jumped over
2274                  * an incompatible signal, flag that we have found the first
2275                  * valid slot here and keep going.
2276                  */
2277                 if (inst->is_last_thrsw && invalid_sig_count > 0 &&
2278                     !last_thrsw_after_invalid_ok) {
2279                         last_thrsw_after_invalid_ok = true;
2280                         invalid_sig_count++;
2281                         goto cont_block;
2282                 }
2283 
2284                 /* We can merge the thrsw in this instruction */
2285                 last_thrsw_after_invalid_ok = false;
2286                 invalid_sig_count = 0;
2287                 invalid_seq_count = 0;
2288                 merge_inst = prev_inst;
2289 
2290 cont_block:
2291                 if (++slots_filled == 3)
2292                         break;
2293         }
2294 
2295         /* If we jumped over a signal incompatibility and did not manage to
2296          * merge the thrsw in the end, we need to adjust slots filled to match
2297          * the last valid merge point.
2298          */
2299         assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
2300                 slots_filled >= invalid_sig_count + invalid_seq_count);
2301         if (invalid_sig_count > 0)
2302                 slots_filled -= invalid_sig_count;
2303         if (invalid_seq_count > 0)
2304                 slots_filled -= invalid_seq_count;
2305 
2306         bool needs_free = false;
2307         if (merge_inst) {
2308                 merge_inst->qpu.sig.thrsw = true;
2309                 needs_free = true;
2310                 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
2311         } else {
2312                 scoreboard->last_thrsw_tick = scoreboard->tick;
2313                 insert_scheduled_instruction(c, block, scoreboard, inst);
2314                 time++;
2315                 slots_filled++;
2316                 merge_inst = inst;
2317         }
2318 
2319         scoreboard->first_thrsw_emitted = true;
2320 
2321         /* If we're emitting the last THRSW (other than program end), then
2322          * signal that to the HW by emitting two THRSWs in a row.
2323          */
2324         if (inst->is_last_thrsw) {
2325                 if (slots_filled <= 1) {
2326                         emit_nop(c, block, scoreboard);
2327                         time++;
2328                 }
2329                 struct qinst *second_inst =
2330                         (struct qinst *)merge_inst->link.next;
2331                 second_inst->qpu.sig.thrsw = true;
2332                 scoreboard->last_thrsw_emitted = true;
2333         }
2334 
2335         /* Make sure the thread end executes within the program lifespan */
2336         if (is_thrend) {
2337                 for (int i = 0; i < 3 - slots_filled; i++) {
2338                         emit_nop(c, block, scoreboard);
2339                         time++;
2340                 }
2341         }
2342 
2343         /* If we put our THRSW into another instruction, free up the
2344          * instruction that didn't end up scheduled into the list.
2345          */
2346         if (needs_free)
2347                 free(inst);
2348 
2349         return time;
2350 }
2351 
2352 static bool
2353 qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
2354 {
2355         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2356                 return false;
2357 
2358         if (inst->qpu.sig.thrsw)
2359                 return false;
2360 
2361         if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
2362                 return false;
2363 
2364         if (vir_has_uniform(inst))
2365                 return false;
2366 
2367         return true;
2368 }
2369 
2370 static void
2371 emit_branch(struct v3d_compile *c,
2372            struct qblock *block,
2373            struct choose_scoreboard *scoreboard,
2374            struct qinst *inst)
2375 {
2376         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2377 
2378         /* We should not have picked up a branch for the delay slots of a previous
2379          * thrsw, branch or unifa write instruction.
2380          */
2381         int branch_tick = scoreboard->tick;
2382         assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
2383         assert(scoreboard->last_branch_tick + 3 < branch_tick);
2384         assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
2385 
2386         /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
2387          * setmsf.
2388          */
2389         bool is_safe_msf_branch =
2390                 c->devinfo->ver >= 71 ||
2391                 inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
2392                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
2393                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
2394                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
2395         assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
2396                is_safe_msf_branch);
2397 
2398         /* Insert the branch instruction */
2399         insert_scheduled_instruction(c, block, scoreboard, inst);
2400 
2401         /* Now see if we can move the branch instruction back into the
2402          * instruction stream to fill its delay slots
2403          */
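             /* Each iteration moves the instruction immediately preceding the
              * branch to just after it, which effectively moves the branch one
              * slot earlier in the stream.
              */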
2404         int slots_filled = 0;
2405         while (slots_filled < 3 && block->instructions.next != &inst->link) {
2406                 struct qinst *prev_inst = (struct qinst *) inst->link.prev;
2407                 assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
2408 
2409                 /* Can't move the branch instruction if that would place it
2410                  * in the delay slots of other instructions.
2411                  */
2412                 if (scoreboard->last_branch_tick + 3 >=
2413                     branch_tick - slots_filled - 1) {
2414                         break;
2415                 }
2416 
2417                 if (scoreboard->last_thrsw_tick + 2 >=
2418                     branch_tick - slots_filled - 1) {
2419                         break;
2420                 }
2421 
2422                 if (scoreboard->last_unifa_write_tick + 3 >=
2423                     branch_tick - slots_filled - 1) {
2424                         break;
2425                 }
2426 
2427                 /* Do not move up a branch if it can disrupt an ldvary sequence
2428                  * as that can cause stomping of the r5 register.
2429                  */
2430                 if (scoreboard->last_ldvary_tick + 2 >=
2431                     branch_tick - slots_filled) {
2432                        break;
2433                 }
2434 
2435                 /* Can't move a conditional branch before the instruction
2436                  * that writes the flags for its condition.
2437                  */
2438                 if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
2439                     inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
2440                         break;
2441                 }
2442 
2443                 if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
2444                         break;
2445 
2446                 if (!is_safe_msf_branch) {
2447                         struct qinst *prev_prev_inst =
2448                                 (struct qinst *) prev_inst->link.prev;
2449                         if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
2450                             prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
2451                                 break;
2452                         }
2453                 }
2454 
2455                 list_del(&prev_inst->link);
2456                 list_add(&prev_inst->link, &inst->link);
2457                 slots_filled++;
2458         }
2459 
2460         block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
2461         scoreboard->last_branch_tick = branch_tick - slots_filled;
2462 
2463         /* Fill any remaining delay slots.
2464          *
2465          * For unconditional branches we'll try to fill these with the
2466          * first instructions in the successor block after scheduling
2467          * all blocks when setting up branch targets.
2468          */
2469         for (int i = 0; i < 3 - slots_filled; i++)
2470                 emit_nop(c, block, scoreboard);
2471 }
2472 
2473 static bool
2474 alu_reads_register(const struct v3d_device_info *devinfo,
2475                    struct v3d_qpu_instr *inst,
2476                    bool add, bool magic, uint32_t index)
2477 {
2478         uint32_t num_src;
2479         if (add)
2480                 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
2481         else
2482                 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
2483 
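             /* V3D 4.2 selects ALU sources through muxes that may reference
              * raddr_a/raddr_b, while V3D 7.1 encodes a register file address
              * directly for each source.
              */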
2484         if (devinfo->ver == 42) {
2485                 enum v3d_qpu_mux mux_a, mux_b;
2486                 if (add) {
2487                         mux_a = inst->alu.add.a.mux;
2488                         mux_b = inst->alu.add.b.mux;
2489                 } else {
2490                         mux_a = inst->alu.mul.a.mux;
2491                         mux_b = inst->alu.mul.b.mux;
2492                 }
2493 
2494                 for (int i = 0; i < num_src; i++) {
2495                         if (magic) {
2496                                 if (i == 0 && mux_a == index)
2497                                         return true;
2498                                 if (i == 1 && mux_b == index)
2499                                         return true;
2500                         } else {
2501                                 if (i == 0 && mux_a == V3D_QPU_MUX_A &&
2502                                     inst->raddr_a == index) {
2503                                         return true;
2504                                 }
2505                                 if (i == 0 && mux_a == V3D_QPU_MUX_B &&
2506                                     inst->raddr_b == index) {
2507                                         return true;
2508                                 }
2509                                 if (i == 1 && mux_b == V3D_QPU_MUX_A &&
2510                                     inst->raddr_a == index) {
2511                                         return true;
2512                                 }
2513                                 if (i == 1 && mux_b == V3D_QPU_MUX_B &&
2514                                     inst->raddr_b == index) {
2515                                         return true;
2516                                 }
2517                         }
2518                 }
2519 
2520                 return false;
2521         }
2522 
2523         assert(devinfo->ver >= 71);
2524         assert(!magic);
2525 
2526         uint32_t raddr_a, raddr_b;
2527         if (add) {
2528                 raddr_a = inst->alu.add.a.raddr;
2529                 raddr_b = inst->alu.add.b.raddr;
2530         } else {
2531                 raddr_a = inst->alu.mul.a.raddr;
2532                 raddr_b = inst->alu.mul.b.raddr;
2533         }
2534 
2535         for (int i = 0; i < num_src; i++) {
2536                 if (i == 0 && raddr_a == index)
2537                         return true;
2538                 if (i == 1 && raddr_b == index)
2539                         return true;
2540         }
2541 
2542         return false;
2543 }
2544 
2545 /**
2546  * This takes an ldvary signal merged into 'inst' and tries to move it up to
2547  * the previous instruction to get good pipelining of ldvary sequences,
2548  * transforming this:
2549  *
2550  * nop                  ; nop               ; ldvary.r4
2551  * nop                  ; fmul  r0, r4, rf0 ;
2552  * fadd  rf13, r0, r5   ; nop;              ; ldvary.r1  <-- inst
2553  *
2554  * into:
2555  *
2556  * nop                  ; nop               ; ldvary.r4
2557  * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
2558  * fadd  rf13, r0, r5   ; nop;              ;            <-- inst
2559  *
2560  * If we manage to do this successfully (we return true here), then flagging
2561  * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
2562  * we will be able to pick up to merge into 'inst', leading to code like this:
2563  *
2564  * nop                  ; nop               ; ldvary.r4
2565  * nop                  ; fmul  r0, r4, rf0 ; ldvary.r1
2566  * fadd  rf13, r0, r5   ; fmul  r2, r1, rf0 ;            <-- inst
2567  */
2568 static bool
2569 fixup_pipelined_ldvary(struct v3d_compile *c,
2570                        struct choose_scoreboard *scoreboard,
2571                        struct qblock *block,
2572                        struct v3d_qpu_instr *inst)
2573 {
2574         const struct v3d_device_info *devinfo = c->devinfo;
2575 
2576         /* We only call this if we have successfully merged an ldvary into a
2577          * previous instruction.
2578          */
2579         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
2580         assert(inst->sig.ldvary);
2581         uint32_t ldvary_magic = inst->sig_magic;
2582         uint32_t ldvary_index = inst->sig_addr;
2583 
2584         /* The instruction in which we merged the ldvary cannot read
2585          * the ldvary destination: if it did, moving the ldvary to the
2586          * previous instruction would clobber the value it reads.
2587          */
2588         if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
2589                 return false;
2590         if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
2591                 return false;
2592 
2593         /* The implicit ldvary destination may not be written to by a signal
2594          * in the instruction following ldvary. Since we are planning to move
2595          * ldvary to the previous instruction, this means we need to check if
2596          * the current instruction has any other signal that could create this
2597          * conflict. The only other signal that can write to the implicit
2598          * ldvary destination that is compatible with ldvary in the same
2599          * instruction is ldunif.
2600          */
2601         if (inst->sig.ldunif)
2602                 return false;
2603 
2604         /* The previous instruction can't write to the same destination as the
2605          * ldvary.
2606          */
2607         struct qinst *prev = (struct qinst *) block->instructions.prev;
2608         if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
2609                 return false;
2610 
2611         if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
2612                 if (prev->qpu.alu.add.magic_write == ldvary_magic &&
2613                     prev->qpu.alu.add.waddr == ldvary_index) {
2614                         return false;
2615                 }
2616         }
2617 
2618         if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
2619                 if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
2620                     prev->qpu.alu.mul.waddr == ldvary_index) {
2621                         return false;
2622                 }
2623         }
2624 
2625         /* The previous instruction cannot have a conflicting signal */
2626         if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
2627                 return false;
2628 
2629         uint32_t sig;
2630         struct v3d_qpu_sig new_sig = prev->qpu.sig;
2631         new_sig.ldvary = true;
2632         if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
2633                 return false;
2634 
2635         /* The previous instruction cannot use flags since ldvary uses the
2636          * 'cond' instruction field to store the destination.
2637          */
2638         if (v3d_qpu_writes_flags(&prev->qpu))
2639                 return false;
2640         if (v3d_qpu_reads_flags(&prev->qpu))
2641                 return false;
2642 
2643         /* We can't put an ldvary in the delay slots of a thrsw. We should've
2644          * prevented this when pairing up the ldvary with another instruction
2645          * and flagging it for a fixup. In V3D 7.x this is limited only to the
2646          * second delay slot.
2647          */
2648         assert((devinfo->ver == 42 &&
2649                 scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
2650                (devinfo->ver >= 71 &&
2651                 scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
2652 
2653         /* Move the ldvary to the previous instruction and remove it from the
2654          * current one.
2655          */
2656         prev->qpu.sig.ldvary = true;
2657         prev->qpu.sig_magic = ldvary_magic;
2658         prev->qpu.sig_addr = ldvary_index;
2659         scoreboard->last_ldvary_tick = scoreboard->tick - 1;
2660 
2661         inst->sig.ldvary = false;
2662         inst->sig_magic = false;
2663         inst->sig_addr = 0;
2664 
2665         /* Update rf0 flops tracking for the moved ldvary's new delayed rf0 write tick */
2666         if (devinfo->ver >= 71) {
2667                 scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
2668                 set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
2669         }
2670 
2671         /* By moving ldvary to the previous instruction we make it update r5
2672          * (rf0 for ver >= 71) in the current one, so nothing else in it
2673          * should write this register.
2674          *
2675          * This should've been prevented by our dependency tracking, which
2676          * would not allow ldvary to be paired up with an instruction that
2677          * writes r5/rf0 (since our dependency tracking doesn't know that the
2678          * ldvary write to r5/rf0 happens in the next instruction).
2679          */
2680         assert(!v3d_qpu_writes_r5(devinfo, inst));
2681         assert(devinfo->ver == 42 ||
2682                (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
2683                 !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
2684 
2685         return true;
2686 }
2687 
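/**
 * Runs the list scheduler over the DAG built for 'block': repeatedly picks
 * an instruction to schedule, tries to merge a second instruction into the
 * same QPU instruction, and rewrites uniform stream indices to match the new
 * instruction order. Returns the estimated cycle count for the block.
 */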
2688 static uint32_t
2689 schedule_instructions(struct v3d_compile *c,
2690                       struct choose_scoreboard *scoreboard,
2691                       struct qblock *block,
2692                       enum quniform_contents *orig_uniform_contents,
2693                       uint32_t *orig_uniform_data,
2694                       uint32_t *next_uniform)
2695 {
2696         const struct v3d_device_info *devinfo = c->devinfo;
2697         uint32_t time = 0;
2698 
2699         while (!list_is_empty(&scoreboard->dag->heads)) {
2700                 struct schedule_node *chosen =
2701                         choose_instruction_to_schedule(c, scoreboard, NULL);
2702                 struct schedule_node *merge = NULL;
2703 
2704                 /* If there are no valid instructions to schedule, drop a NOP
2705                  * in.
2706                  */
2707                 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
2708                 struct v3d_qpu_instr *inst = &qinst->qpu;
2709 
2710                 if (debug) {
2711                         fprintf(stderr, "t=%4d: current list:\n",
2712                                 time);
2713                         dump_state(devinfo, scoreboard->dag);
2714                         fprintf(stderr, "t=%4d: chose:   ", time);
2715                         v3d_qpu_dump(devinfo, inst);
2716                         fprintf(stderr, "\n");
2717                 }
2718 
2719                 /* We can't mark_instruction_scheduled() the chosen inst until
2720                  * we're done identifying instructions to merge, so put the
2721                  * merged instructions on a list for a moment.
2722                  */
2723                 struct list_head merged_list;
2724                 list_inithead(&merged_list);
2725 
2726                 /* Schedule this instruction onto the QPU list. Also try to
2727                  * find an instruction to pair with it.
2728                  */
2729                 if (chosen) {
2730                         time = MAX2(chosen->unblocked_time, time);
2731                         pre_remove_head(scoreboard->dag, chosen);
2732 
2733                         while ((merge =
2734                                 choose_instruction_to_schedule(c, scoreboard,
2735                                                                chosen))) {
2736                                 time = MAX2(merge->unblocked_time, time);
2737                                 pre_remove_head(scoreboard->dag, merge);
2738                                 list_addtail(&merge->link, &merged_list);
2739                                 (void)qpu_merge_inst(devinfo, inst,
2740                                                      inst, &merge->inst->qpu);
2741                                 if (merge->inst->uniform != -1) {
2742                                         chosen->inst->uniform =
2743                                                 merge->inst->uniform;
2744                                 }
2745 
2746                                 chosen->inst->ldtmu_count +=
2747                                         merge->inst->ldtmu_count;
2748 
2749                                 if (debug) {
2750                                         fprintf(stderr, "t=%4d: merging: ",
2751                                                 time);
2752                                         v3d_qpu_dump(devinfo, &merge->inst->qpu);
2753                                         fprintf(stderr, "\n");
2754                                         fprintf(stderr, "         result: ");
2755                                         v3d_qpu_dump(devinfo, inst);
2756                                         fprintf(stderr, "\n");
2757                                 }
2758 
2759                                 if (scoreboard->fixup_ldvary) {
2760                                         scoreboard->fixup_ldvary = false;
2761                                         if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
2762                                                 /* Flag the ldvary as scheduled
2763                                                  * now so we can try to merge the
2764                                                  * follow-up instruction in the
2765                                                  * ldvary sequence into the
2766                                                  * current instruction.
2767                                                  */
2768                                                 mark_instruction_scheduled(
2769                                                         devinfo, scoreboard->dag,
2770                                                         time, merge);
2771                                         }
2772                                 }
2773                         }
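                        /* Keep count of instructions that stall on a source
                         * that is not ready yet, for compiler statistics.
                         */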
2774                         if (read_stalls(c->devinfo, scoreboard, inst))
2775                                 c->qpu_inst_stalled_count++;
2776                 }
2777 
2778                 /* Update the uniform index for the rewritten location --
2779                  * branch target updating will still need to change
2780                  * c->uniform_data[] using this index.
2781                  */
2782                 if (qinst->uniform != -1) {
2783                         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
2784                                 block->branch_uniform = *next_uniform;
2785 
2786                         c->uniform_data[*next_uniform] =
2787                                 orig_uniform_data[qinst->uniform];
2788                         c->uniform_contents[*next_uniform] =
2789                                 orig_uniform_contents[qinst->uniform];
2790                         qinst->uniform = *next_uniform;
2791                         (*next_uniform)++;
2792                 }
2793 
2794                 if (debug) {
2795                         fprintf(stderr, "\n");
2796                 }
2797 
2798                 /* Now that we've scheduled a new instruction, some of its
2799                  * children can be promoted to the list of instructions ready to
2800                  * be scheduled.  Update the children's unblocked time for this
2801                  * DAG edge as we do so.
2802                  */
2803                 mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
2804                 list_for_each_entry(struct schedule_node, merge, &merged_list,
2805                                     link) {
2806                         mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);
2807 
2808                         /* The merged VIR instruction doesn't get re-added to the
2809                          * block, so free it now.
2810                          */
2811                         free(merge->inst);
2812                 }
2813 
2814                 if (inst->sig.thrsw) {
2815                         time += emit_thrsw(c, block, scoreboard, qinst, false);
2816                 } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
2817                         emit_branch(c, block, scoreboard, qinst);
2818                 } else {
2819                         insert_scheduled_instruction(c, block,
2820                                                      scoreboard, qinst);
2821                 }
2822         }
2823 
2824         return time;
2825 }
2826 
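/**
 * Wraps each instruction of 'block' in a scheduler node, computes the
 * forward/reverse dependency DAG and per-node delays, and then runs the list
 * scheduler on it, returning the block's estimated cycle count.
 */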
2827 static uint32_t
2828 qpu_schedule_instructions_block(struct v3d_compile *c,
2829                                 struct choose_scoreboard *scoreboard,
2830                                 struct qblock *block,
2831                                 enum quniform_contents *orig_uniform_contents,
2832                                 uint32_t *orig_uniform_data,
2833                                 uint32_t *next_uniform)
2834 {
2835         void *mem_ctx = ralloc_context(NULL);
2836         scoreboard->dag = dag_create(mem_ctx);
2837         struct list_head setup_list;
2838 
2839         list_inithead(&setup_list);
2840 
2841         /* Wrap each instruction in a scheduler structure. */
2842         while (!list_is_empty(&block->instructions)) {
2843                 struct qinst *qinst = (struct qinst *)block->instructions.next;
2844                 struct schedule_node *n =
2845                         rzalloc(mem_ctx, struct schedule_node);
2846 
2847                 dag_init_node(scoreboard->dag, &n->dag);
2848                 n->inst = qinst;
2849 
2850                 list_del(&qinst->link);
2851                 list_addtail(&n->link, &setup_list);
2852         }
2853 
2854         calculate_forward_deps(c, scoreboard->dag, &setup_list);
2855         calculate_reverse_deps(c, scoreboard->dag, &setup_list);
2856 
2857         dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
2858 
2859         uint32_t cycles = schedule_instructions(c, scoreboard, block,
2860                                                 orig_uniform_contents,
2861                                                 orig_uniform_data,
2862                                                 next_uniform);
2863 
2864         ralloc_free(mem_ctx);
2865         scoreboard->dag = NULL;
2866 
2867         return cycles;
2868 }
2869 
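/**
 * Once all blocks have been scheduled and have final QPU instruction
 * positions, patches each block's branch instruction with its instruction and
 * uniform stream offsets, and tries to fill the delay slots of unconditional
 * branches with the first instructions of the successor block.
 */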
2870 static void
2871 qpu_set_branch_targets(struct v3d_compile *c)
2872 {
2873         vir_for_each_block(block, c) {
2874                 /* The end block of the program has no branch. */
2875                 if (!block->successors[0])
2876                         continue;
2877 
2878                 /* If there was no branch instruction, then the successor
2879                  * block must follow immediately after this one.
2880                  */
2881                 if (block->branch_qpu_ip == ~0) {
2882                         assert(block->end_qpu_ip + 1 ==
2883                                block->successors[0]->start_qpu_ip);
2884                         continue;
2885                 }
2886 
2887                 /* Walk back through the delay slots to find the branch
2888                  * instr.
2889                  */
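                /* delay_slot_count will be the number of trailing NOPs that
                 * are available as delay slots, with delay_slots_start
                 * pointing at the first of them in program order.
                 */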
2890                 struct qinst *branch = NULL;
2891                 struct list_head *entry = block->instructions.prev;
2892                 int32_t delay_slot_count = -1;
2893                 struct qinst *delay_slots_start = NULL;
2894                 for (int i = 0; i < 3; i++) {
2895                         entry = entry->prev;
2896                         struct qinst *inst =
2897                                 container_of(entry, struct qinst, link);
2898 
2899                         if (delay_slot_count == -1) {
2900                                 if (!v3d_qpu_is_nop(&inst->qpu))
2901                                         delay_slot_count = i;
2902                                 else
2903                                         delay_slots_start = inst;
2904                         }
2905 
2906                         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
2907                                 branch = inst;
2908                                 break;
2909                         }
2910                 }
2911                 assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2912                 assert(delay_slot_count >= 0 && delay_slot_count <= 3);
2913                 assert(delay_slot_count == 0 || delay_slots_start != NULL);
2914 
2915                 /* Make sure that the if-we-don't-jump
2916                  * successor was scheduled just after the
2917                  * delay slots.
2918                  */
2919                 assert(!block->successors[1] ||
2920                        block->successors[1]->start_qpu_ip ==
2921                        block->branch_qpu_ip + 4);
2922 
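                /* The branch offset is a byte offset from the instruction
                 * following the branch's delay slots (branch_qpu_ip + 4) to
                 * the start of the target block.
                 */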
2923                 branch->qpu.branch.offset =
2924                         ((block->successors[0]->start_qpu_ip -
2925                           (block->branch_qpu_ip + 4)) *
2926                          sizeof(uint64_t));
2927 
2928                 /* Set up the relative offset to jump in the
2929                  * uniform stream.
2930                  *
2931                  * Use a temporary here, because
2932                  * uniform_data[inst->uniform] may be shared
2933                  * between multiple instructions.
2934                  */
2935                 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
2936                 c->uniform_data[branch->uniform] =
2937                         (block->successors[0]->start_uniform -
2938                          (block->branch_uniform + 1)) * 4;
2939 
2940                 /* If this is an unconditional branch, try to fill any remaining
2941                  * delay slots with the initial instructions of the successor
2942                  * block.
2943                  *
2944                  * FIXME: we can do the same for conditional branches if we
2945                  * predicate the instructions to match the branch condition.
2946                  */
2947                 if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
2948                         struct list_head *successor_insts =
2949                                 &block->successors[0]->instructions;
2950                         delay_slot_count = MIN2(delay_slot_count,
2951                                                 list_length(successor_insts));
2952                         struct qinst *s_inst =
2953                                 (struct qinst *) successor_insts->next;
2954                         struct qinst *slot = delay_slots_start;
2955                         int slots_filled = 0;
2956                         while (slots_filled < delay_slot_count &&
2957                                qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
2958                                 memcpy(&slot->qpu, &s_inst->qpu,
2959                                        sizeof(slot->qpu));
2960                                 s_inst = (struct qinst *) s_inst->link.next;
2961                                 slot = (struct qinst *) slot->link.next;
2962                                 slots_filled++;
2963                         }
2964                         branch->qpu.branch.offset +=
2965                                 slots_filled * sizeof(uint64_t);
2966                 }
2967         }
2968 }
2969 
2970 uint32_t
2971 v3d_qpu_schedule_instructions(struct v3d_compile *c)
2972 {
2973         const struct v3d_device_info *devinfo = c->devinfo;
2974         struct qblock *end_block = list_last_entry(&c->blocks,
2975                                                    struct qblock, link);
2976 
2977         /* We reorder the uniforms as we schedule instructions, so save the
2978          * old data off and replace it.
2979          */
2980         uint32_t *uniform_data = c->uniform_data;
2981         enum quniform_contents *uniform_contents = c->uniform_contents;
2982         c->uniform_contents = ralloc_array(c, enum quniform_contents,
2983                                            c->num_uniforms);
2984         c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
2985         c->uniform_array_size = c->num_uniforms;
2986         uint32_t next_uniform = 0;
2987 
2988         struct choose_scoreboard scoreboard;
2989         memset(&scoreboard, 0, sizeof(scoreboard));
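        /* Start all the "last event" ticks far enough in the past that they
         * won't trigger any scheduling restrictions on the first instructions
         * of the program.
         */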
2990         scoreboard.last_ldvary_tick = -10;
2991         scoreboard.last_unifa_write_tick = -10;
2992         scoreboard.last_magic_sfu_write_tick = -10;
2993         scoreboard.last_uniforms_reset_tick = -10;
2994         scoreboard.last_thrsw_tick = -10;
2995         scoreboard.last_branch_tick = -10;
2996         scoreboard.last_setmsf_tick = -10;
2997         scoreboard.last_stallable_sfu_tick = -10;
2998         scoreboard.first_ldtmu_after_thrsw = true;
2999         scoreboard.last_implicit_rf0_write_tick = -10;
3000 
3001         if (debug) {
3002                 fprintf(stderr, "Pre-schedule instructions\n");
3003                 vir_for_each_block(block, c) {
3004                         fprintf(stderr, "BLOCK %d\n", block->index);
3005                         list_for_each_entry(struct qinst, qinst,
3006                                             &block->instructions, link) {
3007                                 v3d_qpu_dump(devinfo, &qinst->qpu);
3008                                 fprintf(stderr, "\n");
3009                         }
3010                 }
3011                 fprintf(stderr, "\n");
3012         }
3013 
3014         uint32_t cycles = 0;
3015         vir_for_each_block(block, c) {
3016                 block->start_qpu_ip = c->qpu_inst_count;
3017                 block->branch_qpu_ip = ~0;
3018                 block->start_uniform = next_uniform;
3019 
3020                 cycles += qpu_schedule_instructions_block(c,
3021                                                           &scoreboard,
3022                                                           block,
3023                                                           uniform_contents,
3024                                                           uniform_data,
3025                                                           &next_uniform);
3026 
3027                 block->end_qpu_ip = c->qpu_inst_count - 1;
3028         }
3029 
3030         /* Emit the program-end THRSW instruction. */
3031         struct qinst *thrsw = vir_nop();
3032         thrsw->qpu.sig.thrsw = true;
3033         emit_thrsw(c, end_block, &scoreboard, thrsw, true);
3034 
3035         qpu_set_branch_targets(c);
3036 
3037         assert(next_uniform == c->num_uniforms);
3038 
3039         return cycles;
3040 }
3041