/*
 * Copyright © 2010 Intel Corporation
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
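 *
 * Roughly, as implemented below: dependencies are gathered in two passes
 * over each block (calculate_forward_deps() and calculate_reverse_deps()),
 * so that read-after-write as well as write-after-read/write-after-write
 * orderings become DAG edges.  Scheduling then repeatedly picks a DAG head
 * with choose_instruction_to_schedule() and, where the hardware rules
 * allow, merges an ADD-ALU and a MUL-ALU operation into one instruction
 * via qpu_merge_inst().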
 */

#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"
#include "util/dag.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct dag_node dag;
        struct list_head link;
        struct qinst *inst;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct dag *dag;
        struct schedule_node *last_r[6];
        struct schedule_node *last_rf[64];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tmu_read;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        struct schedule_node *last_rtop;
        struct schedule_node *last_unifa;
        struct schedule_node *last_setmsf;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;
        uintptr_t edge_data = write_after_read;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == F)
                dag_add_edge(&before->dag, &after->dag, edge_data);
        else
                dag_add_edge(&after->dag, &before->dag, edge_data);
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}
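
/* Note (illustrative): a read dep orders a node after the most recent writer
 * of a resource without becoming the new "last writer", so any number of
 * readers can land in any order between two writes.  A write dep both orders
 * the node after the previous writer and records it as the new last writer.
 * For pseudocode like
 *
 *      mov rf3, a
 *      add b, rf3, c
 *      mov rf3, d
 *
 * the forward pass gives the add a read dep on the first mov and the second
 * mov a write dep on it; the write-after-read ordering of the add before the
 * second mov is produced by the reverse pass, as noted above add_dep().
 */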

static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
        if (inst->sig.ldtlb || inst->sig.ldtlbu)
                return true;

        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.op != V3D_QPU_A_NOP &&
            inst->alu.add.magic_write &&
            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        if (inst->alu.mul.op != V3D_QPU_M_NOP &&
            inst->alu.mul.magic_write &&
            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        return false;
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 enum v3d_qpu_mux mux)
{
        assert(state->devinfo->ver < 71);
        switch (mux) {
        case V3D_QPU_MUX_A:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
                break;
        case V3D_QPU_MUX_B:
                if (!n->inst->qpu.sig.small_imm_b) {
                        add_read_dep(state,
                                     state->last_rf[n->inst->qpu.raddr_b], n);
                }
                break;
        default:
                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
                break;
        }
}


static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint8_t raddr, bool is_small_imm)
{
        assert(state->devinfo->ver >= 71);

        if (!is_small_imm)
                add_read_dep(state, state->last_rf[raddr], n);
}

static bool
tmu_write_is_sequence_terminator(uint32_t waddr)
{
        switch (waddr) {
        case V3D_QPU_WADDR_TMUS:
        case V3D_QPU_WADDR_TMUSCM:
        case V3D_QPU_WADDR_TMUSF:
        case V3D_QPU_WADDR_TMUSLOD:
        case V3D_QPU_WADDR_TMUA:
        case V3D_QPU_WADDR_TMUAU:
                return true;
        default:
                return false;
        }
}

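/* Within a TMU lookup sequence, writes to TMU registers that are neither the
 * data register (TMUD) nor a sequence terminator can be reordered relative to
 * each other, so process_waddr_deps() below only gives them a read dep on the
 * last strictly-ordered TMU write.  TMUD writes and terminators keep program
 * order via write deps, and terminators additionally serialize against
 * last_tmu_config so a sequence's configuration stays with it.
 */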
static bool
can_reorder_tmu_write(const struct v3d_device_info *devinfo, uint32_t waddr)
{
        if (tmu_write_is_sequence_terminator(waddr))
                return false;

        if (waddr == V3D_QPU_WADDR_TMUD)
                return false;

        return true;
}

static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(state->devinfo, waddr)) {
                if (can_reorder_tmu_write(state->devinfo, waddr))
                        add_read_dep(state, state->last_tmu_write, n);
                else
                        add_write_dep(state, &state->last_tmu_write, n);

                if (tmu_write_is_sequence_terminator(waddr))
                        add_write_dep(state, &state->last_tmu_config, n);
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_SYNC:
                case V3D_QPU_WADDR_SYNCB:
                case V3D_QPU_WADDR_SYNCU:
                        /* For CS barrier(): Sync against any other memory
                         * accesses. There doesn't appear to be any need for
                         * barriers to affect ALU operations.
                         */
                        add_write_dep(state, &state->last_tmu_write, n);
                        add_write_dep(state, &state->last_tmu_read, n);
                        break;

                case V3D_QPU_WADDR_UNIFA:
                        add_write_dep(state, &state->last_unifa, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;
        /* If the input and output segments are shared, then all VPM reads to
         * a location need to happen before all writes. We handle this by
         * serializing all VPM operations for now.
         *
         * FIXME: we are assuming that the segments are shared. That is
         * correct right now as we are only using shared, but technically you
         * can choose.
         */
        bool separate_vpm_segment = false;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
                if (devinfo->ver < 71) {
                        process_mux_deps(state, n, inst->alu.add.a.mux);
                } else {
                        process_raddr_deps(state, n, inst->alu.add.a.raddr,
                                           inst->sig.small_imm_a);
                }
        }
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
                if (devinfo->ver < 71) {
                        process_mux_deps(state, n, inst->alu.add.b.mux);
                } else {
                        process_raddr_deps(state, n, inst->alu.add.b.raddr,
                                           inst->sig.small_imm_b);
                }
        }

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
                if (devinfo->ver < 71) {
                        process_mux_deps(state, n, inst->alu.mul.a.mux);
                } else {
                        process_raddr_deps(state, n, inst->alu.mul.a.raddr,
                                           inst->sig.small_imm_c);
                }
        }
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
                if (devinfo->ver < 71) {
                        process_mux_deps(state, n, inst->alu.mul.b.mux);
                } else {
                        process_raddr_deps(state, n, inst->alu.mul.b.raddr,
                                           inst->sig.small_imm_d);
                }
        }

        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_LDVPMV_IN:
        case V3D_QPU_A_LDVPMD_IN:
        case V3D_QPU_A_LDVPMG_IN:
        case V3D_QPU_A_LDVPMP:
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                add_read_dep(state, state->last_setmsf, n);
                break;

        case V3D_QPU_A_SETMSF:
                add_write_dep(state, &state->last_setmsf, n);
                add_write_dep(state, &state->last_tmu_write, n);
                FALLTHROUGH;
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        case V3D_QPU_A_BALLOT:
        case V3D_QPU_A_BCASTF:
        case V3D_QPU_A_ALLEQ:
        case V3D_QPU_A_ALLFEQ:
                add_read_dep(state, state->last_setmsf, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0. We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);
        if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
                add_write_dep(state, &state->last_rf[0], n);

        /* If we add any more dependencies here we should consider whether we
         * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
         */
        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);
                add_write_dep(state, &state->last_rtop, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (v3d_qpu_waits_on_tmu(inst)) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_read, n);
                /* Keep TMU loads after their TMU lookup terminator */
                add_read_dep(state, state->last_tmu_config, n);
        }

        /* Allow wrtmuc to be reordered with other instructions in the
         * same TMU sequence by using a read dependency on the last TMU
         * sequence terminator.
         */
        if (inst->sig.wrtmuc)
                add_read_dep(state, state->last_tmu_config, n);

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_write_dep(state, &state->last_tlb, n);

        if (inst->sig.ldvpm) {
                add_write_dep(state, &state->last_vpm_read, n);

                /* At least for now, we're doing shared I/O segments, so queue
                 * all writes after all reads.
                 */
                if (!separate_vpm_segment)
                        add_write_dep(state, &state->last_vpm, n);
        }

        /* inst->sig.ldunif or sideband uniform read */
        if (vir_has_uniform(qinst))
                add_write_dep(state, &state->last_unif, n);

        /* Both unifa and ldunifa must preserve ordering */
        if (inst->sig.ldunifa || inst->sig.ldunifarf)
                add_write_dep(state, &state->last_unifa, n);

        if (v3d_qpu_reads_flags(inst))
                add_read_dep(state, state->last_sf, n);
        if (v3d_qpu_writes_flags(inst))
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct v3d_compile *c, struct dag *dag,
                       struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dag = dag;
        state.devinfo = c->devinfo;
        state.dir = R;

        list_for_each_entry_rev(struct schedule_node, node, schedule_list,
                                link) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

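/* Per-block scheduling state that is not part of the dependency DAG: mostly
 * "what tick did X last happen on" values that the various *_too_soon() and
 * read_stalls() checks below compare against the current tick.
 */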
struct choose_scoreboard {
        struct dag *dag;
        int tick;
        int last_magic_sfu_write_tick;
        int last_stallable_sfu_reg;
        int last_stallable_sfu_tick;
        int last_ldvary_tick;
        int last_unifa_write_tick;
        int last_uniforms_reset_tick;
        int last_thrsw_tick;
        int last_branch_tick;
        int last_setmsf_tick;
        bool first_thrsw_emitted;
        bool last_thrsw_emitted;
        bool fixup_ldvary;
        int ldvary_count;
        int pending_ldtmu_count;
        bool first_ldtmu_after_thrsw;

        /* V3D 7.x */
        int last_implicit_rf0_write_tick;
        bool has_rf0_flops_conflict;
};

static bool
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_R4:
                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
                        return true;
                break;

        case V3D_QPU_MUX_R5:
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon(struct choose_scoreboard *scoreboard,
               const struct v3d_qpu_instr *inst, uint8_t raddr)
{
        switch (raddr) {
        case 0: /* ldvary delayed write of C coefficient to rf0 */
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon_after_write(const struct v3d_device_info *devinfo,
                           struct choose_scoreboard *scoreboard,
                           struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* XXX: Branching off of raddr. */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
                        if (devinfo->ver < 71) {
                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
                                        return true;
                        } else {
                                if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
                                        return true;
                        }
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
                        if (devinfo->ver < 71) {
                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
                                        return true;
                        } else {
                                if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
                                        return true;
                        }
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
                        if (devinfo->ver < 71) {
                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
                                        return true;
                        } else {
                                if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
                                        return true;
                        }
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
                        if (devinfo->ver < 71) {
                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
                                        return true;
                        } else {
                                if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
                                        return true;
                        }
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        if (devinfo->ver == 42)
                return false;

        /* Don't schedule anything that writes rf0 right after ldvary, since
         * that would clash with the ldvary's delayed rf0 write (the exception
         * is another ldvary, since its implicit rf0 write would also have
         * one cycle of delay and would not clash).
         */
        if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
            (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
             (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
              !inst->sig.ldvary))) {
                return true;
        }

        return false;
}

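/* The scoreboard (TLB) lock is taken by a thread switch: on the first thrsw
 * when lock_scoreboard_on_first_thrsw is set in the shader state, otherwise
 * on the last one.  The >= 3 check corresponds to the thrsw's delay slots
 * having passed (elsewhere, last_thrsw_tick + 2 >= tick is treated as still
 * being in a delay slot), so the lock has actually taken effect.
 */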
static bool
scoreboard_is_locked(struct choose_scoreboard *scoreboard,
                     bool lock_scoreboard_on_first_thrsw)
{
        if (lock_scoreboard_on_first_thrsw) {
                return scoreboard->first_thrsw_emitted &&
                       scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
        }

        return scoreboard->last_thrsw_emitted &&
               scoreboard->tick - scoreboard->last_thrsw_tick >= 3;
}

static bool
pixel_scoreboard_too_soon(struct v3d_compile *c,
                          struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return qpu_inst_is_tlb(inst) &&
               !scoreboard_is_locked(scoreboard,
                                     c->lock_scoreboard_on_first_thrsw);
}

static bool
qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
                        const struct v3d_qpu_instr *inst,
                        uint32_t waddr) {

        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (devinfo->ver < 71) {
                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
                    inst->raddr_a == waddr)
                        return true;

                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
                    !inst->sig.small_imm_b && (inst->raddr_b == waddr))
                        return true;
        } else {
                if (v3d71_qpu_reads_raddr(inst, waddr))
                        return true;
        }

        return false;
}

static bool
read_stalls(const struct v3d_device_info *devinfo,
            struct choose_scoreboard *scoreboard,
            const struct v3d_qpu_instr *inst)
{
        return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
               qpu_instruction_uses_rf(devinfo, inst,
                                       scoreboard->last_stallable_sfu_reg);
}

/* We define a max schedule priority to allow negative priorities as a result
 * of subtracting this max when an instruction stalls. So instructions that
 * stall have lower priority than regular instructions. */
#define MAX_SCHEDULE_PRIORITY 16

static int
get_instruction_priority(const struct v3d_device_info *devinfo,
                         const struct v3d_qpu_instr *inst)
{
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Empirical testing shows that using priorities to hide latency of
         * TMU operations when scheduling QPU leads to slightly worse
         * performance, even at 2 threads. We think this is because the thread
         * switching is already quite effective at hiding latency and NIR
         * scheduling (and possibly TMU pipelining too) are sufficient to hide
         * TMU latency, so piling up on that here doesn't provide any benefits
         * and instead may cause us to postpone critical paths that depend on
         * the TMU results.
         */
#if 0
        /* Schedule texture read results collection late to hide latency. */
        if (v3d_qpu_waits_on_tmu(inst))
                return next_score;
        next_score++;
#endif

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

#if 0
        /* Schedule texture read setup early to hide their latency better. */
        if (v3d_qpu_writes_tmu(devinfo, inst))
                return next_score;
        next_score++;
#endif

        /* We should increase the maximum if we assert here */
        assert(next_score < MAX_SCHEDULE_PRIORITY);

        return baseline_score;
}
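
/* With the scoring above and the #if 0 blocks disabled, this effectively
 * gives TLB operations a priority of 0 and everything else a priority of 1;
 * choose_instruction_to_schedule() additionally subtracts
 * MAX_SCHEDULE_PRIORITY when read_stalls() fires, so a stalling instruction
 * always scores below any non-stalling one.
 */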

enum {
        V3D_PERIPHERAL_VPM_READ       = (1 << 0),
        V3D_PERIPHERAL_VPM_WRITE      = (1 << 1),
        V3D_PERIPHERAL_VPM_WAIT       = (1 << 2),
        V3D_PERIPHERAL_SFU            = (1 << 3),
        V3D_PERIPHERAL_TMU_WRITE      = (1 << 4),
        V3D_PERIPHERAL_TMU_READ       = (1 << 5),
        V3D_PERIPHERAL_TMU_WAIT       = (1 << 6),
        V3D_PERIPHERAL_TMU_WRTMUC_SIG = (1 << 7),
        V3D_PERIPHERAL_TSY            = (1 << 8),
        V3D_PERIPHERAL_TLB_READ       = (1 << 9),
        V3D_PERIPHERAL_TLB_WRITE      = (1 << 10),
};

static uint32_t
qpu_peripherals(const struct v3d_device_info *devinfo,
                const struct v3d_qpu_instr *inst)
{
        uint32_t result = 0;
        if (v3d_qpu_reads_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_READ;
        if (v3d_qpu_writes_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_WRITE;
        if (v3d_qpu_waits_vpm(inst))
                result |= V3D_PERIPHERAL_VPM_WAIT;

        if (v3d_qpu_writes_tmu(devinfo, inst))
                result |= V3D_PERIPHERAL_TMU_WRITE;
        if (inst->sig.ldtmu)
                result |= V3D_PERIPHERAL_TMU_READ;
        if (inst->sig.wrtmuc)
                result |= V3D_PERIPHERAL_TMU_WRTMUC_SIG;

        if (v3d_qpu_uses_sfu(inst))
                result |= V3D_PERIPHERAL_SFU;

        if (v3d_qpu_reads_tlb(inst))
                result |= V3D_PERIPHERAL_TLB_READ;
        if (v3d_qpu_writes_tlb(inst))
                result |= V3D_PERIPHERAL_TLB_WRITE;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if (inst->alu.add.op != V3D_QPU_A_NOP &&
                    inst->alu.add.magic_write &&
                    v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr)) {
                        result |= V3D_PERIPHERAL_TSY;
                }

                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
                        result |= V3D_PERIPHERAL_TMU_WAIT;
        }

        return result;
}

static bool
qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
                                 const struct v3d_qpu_instr *a,
                                 const struct v3d_qpu_instr *b)
{
        const uint32_t a_peripherals = qpu_peripherals(devinfo, a);
        const uint32_t b_peripherals = qpu_peripherals(devinfo, b);

        /* We can always do one peripheral access per instruction. */
        if (util_bitcount(a_peripherals) + util_bitcount(b_peripherals) <= 1)
                return true;

        /* V3D 4.x can't do more than one peripheral access except in a
         * few cases:
         */
        if (devinfo->ver == 42) {
                /* WRTMUC signal with TMU register write (other than tmuc). */
                if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
                    b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
                        return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
                }
                if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
                    a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
                        return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
                }

                /* TMU read with VPM read/write. */
                if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
                    (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
                     b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
                        return true;
                }
                if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
                    (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
                     a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
                        return true;
                }

                return false;
        }

        /* V3D 7.x can't have more than one of these restricted peripherals */
        const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
                                    V3D_PERIPHERAL_TMU_WRTMUC_SIG |
                                    V3D_PERIPHERAL_TSY |
                                    V3D_PERIPHERAL_TLB_READ |
                                    V3D_PERIPHERAL_SFU |
                                    V3D_PERIPHERAL_VPM_READ |
                                    V3D_PERIPHERAL_VPM_WRITE;

        const uint32_t a_restricted = a_peripherals & restricted;
        const uint32_t b_restricted = b_peripherals & restricted;
        if (a_restricted && b_restricted) {
                /* WRTMUC signal with TMU register write (other than tmuc) is
                 * allowed though.
                 */
                if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
                       b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
                       v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
                      (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
                       a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
                       v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
                        return false;
                }
        }

        /* Only one TMU read per instruction */
        if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
            (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
                return false;
        }

        /* Only one TLB access per instruction */
        if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
                              V3D_PERIPHERAL_TLB_READ)) &&
            (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
                              V3D_PERIPHERAL_TLB_READ))) {
                return false;
        }

        return true;
}

/* Compute a bitmask of which rf registers are used between
 * the two instructions.
 */
static uint64_t
qpu_raddrs_used(const struct v3d_qpu_instr *a,
                const struct v3d_qpu_instr *b)
{
        assert(a->type == V3D_QPU_INSTR_TYPE_ALU);
        assert(b->type == V3D_QPU_INSTR_TYPE_ALU);

        uint64_t raddrs_used = 0;
        if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
                raddrs_used |= (UINT64_C(1) << a->raddr_a);
        if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
                raddrs_used |= (UINT64_C(1) << a->raddr_b);
        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
                raddrs_used |= (UINT64_C(1) << b->raddr_a);
        if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
                raddrs_used |= (UINT64_C(1) << b->raddr_b);

        return raddrs_used;
}

/* Takes two instructions and attempts to merge their raddr fields (including
 * small immediates) into one merged instruction. For V3D 4.x, returns false
 * if the two instructions access more than two different rf registers between
 * them, or more than one rf register and one small immediate. For 7.x returns
 * false if both instructions use small immediates.
 */
static bool
qpu_merge_raddrs(struct v3d_qpu_instr *result,
                 const struct v3d_qpu_instr *add_instr,
                 const struct v3d_qpu_instr *mul_instr,
                 const struct v3d_device_info *devinfo)
{
        if (devinfo->ver >= 71) {
                assert(add_instr->sig.small_imm_a +
                       add_instr->sig.small_imm_b <= 1);
                assert(add_instr->sig.small_imm_c +
                       add_instr->sig.small_imm_d == 0);
                assert(mul_instr->sig.small_imm_a +
                       mul_instr->sig.small_imm_b == 0);
                assert(mul_instr->sig.small_imm_c +
                       mul_instr->sig.small_imm_d <= 1);

                result->sig.small_imm_a = add_instr->sig.small_imm_a;
                result->sig.small_imm_b = add_instr->sig.small_imm_b;
                result->sig.small_imm_c = mul_instr->sig.small_imm_c;
                result->sig.small_imm_d = mul_instr->sig.small_imm_d;

                return (result->sig.small_imm_a +
                        result->sig.small_imm_b +
                        result->sig.small_imm_c +
                        result->sig.small_imm_d) <= 1;
        }

        assert(devinfo->ver == 42);

        uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
        int naddrs = util_bitcount64(raddrs_used);

        if (naddrs > 2)
                return false;

        if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
                if (naddrs > 1)
                        return false;

                if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
                        if (add_instr->raddr_b != mul_instr->raddr_b)
                                return false;

                result->sig.small_imm_b = true;
                result->raddr_b = add_instr->sig.small_imm_b ?
                        add_instr->raddr_b : mul_instr->raddr_b;
        }

        if (naddrs == 0)
                return true;

        int raddr_a = ffsll(raddrs_used) - 1;
        raddrs_used &= ~(UINT64_C(1) << raddr_a);
        result->raddr_a = raddr_a;

        if (!result->sig.small_imm_b) {
                if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
                    raddr_a == add_instr->raddr_b) {
                        if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
                                result->alu.add.a.mux = V3D_QPU_MUX_A;
                        if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
                            v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                                result->alu.add.b.mux = V3D_QPU_MUX_A;
                        }
                }
                if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
                    raddr_a == mul_instr->raddr_b) {
                        if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
                                result->alu.mul.a.mux = V3D_QPU_MUX_A;
                        if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
                            v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                                result->alu.mul.b.mux = V3D_QPU_MUX_A;
                        }
                }
        }
        if (!raddrs_used)
                return true;

        int raddr_b = ffsll(raddrs_used) - 1;
        result->raddr_b = raddr_b;
        if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
            raddr_b == add_instr->raddr_a) {
                if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
                        result->alu.add.a.mux = V3D_QPU_MUX_B;
                if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
                    v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
                        result->alu.add.b.mux = V3D_QPU_MUX_B;
                }
        }
        if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
            raddr_b == mul_instr->raddr_a) {
                if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
                        result->alu.mul.a.mux = V3D_QPU_MUX_B;
                if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
                    v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
                        result->alu.mul.b.mux = V3D_QPU_MUX_B;
                }
        }

        return true;
}
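
/* Example (V3D 4.x, made-up operands): merging an add that reads rf3 and rf7
 * with a mul that reads rf7 touches only two distinct rf registers, so it
 * fits: the lowest-numbered register (rf3) ends up in the merged raddr_a,
 * rf7 in raddr_b, and the A/B muxes of both halves are fixed up above so
 * they point at whichever read port their register landed in.
 */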

static bool
can_do_add_as_mul(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
        case V3D_QPU_A_SUB:
                return true;
        default:
                return false;
        }
}

static enum v3d_qpu_mul_op
add_op_as_mul_op(enum v3d_qpu_add_op op)
{
        switch (op) {
        case V3D_QPU_A_ADD:
                return V3D_QPU_M_ADD;
        case V3D_QPU_A_SUB:
                return V3D_QPU_M_SUB;
        default:
                unreachable("unexpected add opcode");
        }
}

static void
qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
                       struct v3d_qpu_instr *inst)
{
        STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
        assert(inst->alu.add.op != V3D_QPU_A_NOP);
        assert(inst->alu.mul.op == V3D_QPU_M_NOP);

        memcpy(&inst->alu.mul, &inst->alu.add, sizeof(inst->alu.mul));
        inst->alu.mul.op = add_op_as_mul_op(inst->alu.add.op);
        inst->alu.add.op = V3D_QPU_A_NOP;

        inst->flags.mc = inst->flags.ac;
        inst->flags.mpf = inst->flags.apf;
        inst->flags.muf = inst->flags.auf;
        inst->flags.ac = V3D_QPU_COND_NONE;
        inst->flags.apf = V3D_QPU_PF_NONE;
        inst->flags.auf = V3D_QPU_UF_NONE;

        inst->alu.mul.output_pack = inst->alu.add.output_pack;

        inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
        inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
        inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
        inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
        inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;

        if (devinfo->ver >= 71) {
                assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
                assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
                if (inst->sig.small_imm_a) {
                        inst->sig.small_imm_c = true;
                        inst->sig.small_imm_a = false;
                } else if (inst->sig.small_imm_b) {
                        inst->sig.small_imm_d = true;
                        inst->sig.small_imm_b = false;
                }
        }
}

static bool
can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
{
        switch (op) {
        case V3D_QPU_M_MOV:
        case V3D_QPU_M_FMOV:
                return devinfo->ver >= 71;
        default:
                return false;
        }
}

static enum v3d_qpu_add_op
mul_op_as_add_op(enum v3d_qpu_mul_op op)
{
        switch (op) {
        case V3D_QPU_M_MOV:
                return V3D_QPU_A_MOV;
        case V3D_QPU_M_FMOV:
                return V3D_QPU_A_FMOV;
        default:
                unreachable("unexpected mov opcode");
        }
}

static void
qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
{
        STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
        assert(inst->alu.mul.op != V3D_QPU_M_NOP);
        assert(inst->alu.add.op == V3D_QPU_A_NOP);

        memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
        inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
        inst->alu.mul.op = V3D_QPU_M_NOP;

        inst->flags.ac = inst->flags.mc;
        inst->flags.apf = inst->flags.mpf;
        inst->flags.auf = inst->flags.muf;
        inst->flags.mc = V3D_QPU_COND_NONE;
        inst->flags.mpf = V3D_QPU_PF_NONE;
        inst->flags.muf = V3D_QPU_UF_NONE;

        inst->alu.add.output_pack = inst->alu.mul.output_pack;
        inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
        inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
        inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
        inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
        inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;

        assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
        assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
        if (inst->sig.small_imm_c) {
                inst->sig.small_imm_a = true;
                inst->sig.small_imm_c = false;
        } else if (inst->sig.small_imm_d) {
                inst->sig.small_imm_b = true;
                inst->sig.small_imm_d = false;
        }
}

static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        if (!qpu_compatible_peripheral_access(devinfo, a, b))
                return false;

        struct v3d_qpu_instr merge = *a;
        const struct v3d_qpu_instr *add_instr = NULL, *mul_instr = NULL;

        struct v3d_qpu_instr mul_inst;
        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op == V3D_QPU_A_NOP) {
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = a;
                }
                /* If a's add op is used but its mul op is not, then see if we
                 * can convert either a's add op or b's add op to a mul op
                 * so we can merge.
                 */
                else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                         can_do_add_as_mul(b->alu.add.op)) {
                        mul_inst = *b;
                        qpu_convert_add_to_mul(devinfo, &mul_inst);

                        merge.alu.mul = mul_inst.alu.mul;

                        merge.flags.mc = mul_inst.flags.mc;
                        merge.flags.mpf = mul_inst.flags.mpf;
                        merge.flags.muf = mul_inst.flags.muf;

                        add_instr = a;
                        mul_instr = &mul_inst;
                } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
                           can_do_add_as_mul(a->alu.add.op)) {
                        mul_inst = *a;
                        qpu_convert_add_to_mul(devinfo, &mul_inst);

                        merge = mul_inst;
                        merge.alu.add = b->alu.add;

                        merge.flags.ac = b->flags.ac;
                        merge.flags.apf = b->flags.apf;
                        merge.flags.auf = b->flags.auf;

                        add_instr = b;
                        mul_instr = &mul_inst;
                } else {
                        return false;
                }
        }

        struct v3d_qpu_instr add_inst;
        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                if (a->alu.mul.op == V3D_QPU_M_NOP) {
                        merge.alu.mul = b->alu.mul;

                        merge.flags.mc = b->flags.mc;
                        merge.flags.mpf = b->flags.mpf;
                        merge.flags.muf = b->flags.muf;

                        mul_instr = b;
                        add_instr = a;
                }
                /* If a's mul op is used but its add op is not, then see if we
                 * can convert either a's mul op or b's mul op to an add op
                 * so we can merge.
                 */
                else if (a->alu.add.op == V3D_QPU_A_NOP &&
                         can_do_mul_as_add(devinfo, b->alu.mul.op)) {
                        add_inst = *b;
                        qpu_convert_mul_to_add(&add_inst);

                        merge.alu.add = add_inst.alu.add;

                        merge.flags.ac = add_inst.flags.ac;
                        merge.flags.apf = add_inst.flags.apf;
                        merge.flags.auf = add_inst.flags.auf;

                        mul_instr = a;
                        add_instr = &add_inst;
                } else if (a->alu.add.op == V3D_QPU_A_NOP &&
                           can_do_mul_as_add(devinfo, a->alu.mul.op)) {
                        add_inst = *a;
                        qpu_convert_mul_to_add(&add_inst);

                        merge = add_inst;
                        merge.alu.mul = b->alu.mul;

                        merge.flags.mc = b->flags.mc;
                        merge.flags.mpf = b->flags.mpf;
                        merge.flags.muf = b->flags.muf;

                        mul_instr = b;
                        add_instr = &add_inst;
                } else {
                        return false;
                }
        }

        /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
         * they have restrictions on the number of raddrs that can be addressed
         * in a single instruction. In V3D 7.x, we don't have that restriction,
         * but we are still limited to a single small immediate per instruction.
         */
        if (add_instr && mul_instr &&
            !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
                return false;
        }

        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;

        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure. */
        assert(ok || (a != result && b != result));

        return ok;
}

static inline bool
try_skip_for_ldvary_pipelining(const struct v3d_qpu_instr *inst)
{
        return inst->sig.ldunif || inst->sig.ldunifrf;
}

static bool
qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
                                         struct choose_scoreboard *scoreboard,
                                         const struct qinst *qinst);

static struct schedule_node *
choose_instruction_to_schedule(struct v3d_compile *c,
                               struct choose_scoreboard *scoreboard,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        bool ldvary_pipelining = c->s->info.stage == MESA_SHADER_FRAGMENT &&
                                 scoreboard->ldvary_count < c->num_inputs;
        bool skipped_insts_for_ldvary_pipelining = false;
retry:
        list_for_each_entry(struct schedule_node, n, &scoreboard->dag->heads,
                            dag.link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                if (ldvary_pipelining && try_skip_for_ldvary_pipelining(inst)) {
                        skipped_insts_for_ldvary_pipelining = true;
                        continue;
                }

                /* Don't choose the branch instruction until it's the last one
                 * left. We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(&scoreboard->dag->heads)) {
                        continue;
                }

                /* We need to have 3 delay slots between a write to unifa and
                 * a follow-up ldunifa.
                 */
                if ((inst->sig.ldunifa || inst->sig.ldunifarf) &&
                    scoreboard->tick - scoreboard->last_unifa_write_tick <= 3)
                        continue;

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
                        continue;

                /* "Before doing a TLB access a scoreboard wait must have been
                 * done. This happens either on the first or last thread
                 * switch, depending on a setting (scb_wait_on_first_thrsw) in
                 * the shader state."
                 */
                if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write the same register (r5 for v42
                 * and below, rf0 for v71), but ldunif does so a tick sooner.
                 * If the ldvary's register wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * the register in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we are in a thrsw delay slot check that this instruction
                 * is valid for that.
                 */
                if (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick &&
                    !qpu_inst_after_thrsw_valid_in_delay_slot(c, scoreboard,
                                                              n->inst)) {
                        continue;
                }

                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                        /* Don't try to put a branch in the delay slots of another
                         * branch or a unifa write.
                         */
                        if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
                                continue;
                        if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
                                continue;

                        /* No branch with cond != 0,2,3 and msfign != 0 after
                         * setmsf.
                         */
                        if (scoreboard->last_setmsf_tick == scoreboard->tick - 1 &&
                            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
                            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
                                continue;
                        }
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Simulator complains if we have two uniforms loaded in
                         * the same instruction, which could happen if we
                         * have a ldunif or sideband uniform and we pair that
                         * with ldunifa.
                         */
                        if (vir_has_uniform(prev_inst->inst) &&
                            (inst->sig.ldunifa || inst->sig.ldunifarf)) {
                                continue;
                        }

                        if ((prev_inst->inst->qpu.sig.ldunifa ||
                             prev_inst->inst->qpu.sig.ldunifarf) &&
                            vir_has_uniform(n->inst)) {
                                continue;
                        }

                        /* Don't merge TLB instructions before we have acquired
                         * the scoreboard lock.
                         */
                        if (pixel_scoreboard_too_soon(c, scoreboard, inst))
                                continue;

                        /* When we successfully pair up an ldvary we then try
                         * to merge it into the previous instruction if
                         * possible to improve pipelining. Don't pick up the
                         * ldvary now if the follow-up fixup would place
                         * it in the delay slots of a thrsw, which is not
                         * allowed and would prevent the fixup from being
                         * successful. In V3D 7.x we can allow this to happen
                         * as long as it is not the last delay slot.
                         */
                        if (inst->sig.ldvary) {
                                if (c->devinfo->ver == 42 &&
                                    scoreboard->last_thrsw_tick + 2 >=
                                    scoreboard->tick - 1) {
                                        continue;
                                }
                                if (c->devinfo->ver >= 71 &&
                                    scoreboard->last_thrsw_tick + 2 ==
                                    scoreboard->tick - 1) {
                                        continue;
                                }
                        }

                        /* We can emit a new tmu lookup with a previous ldtmu
                         * if doing this would free just enough space in the
                         * TMU output fifo so we don't overflow, however, this
                         * is only safe if the ldtmu cannot stall.
                         *
                         * A ldtmu can stall if it is not the first following a
                         * thread switch and corresponds to the first word of a
                         * read request.
                         *
                         * FIXME: For now we forbid pairing up a new lookup
                         * with a previous ldtmu that is not the first after a
                         * thrsw if that could overflow the TMU output fifo
                         * regardless of whether the ldtmu is reading the first
                         * word of a TMU result or not, since we don't track
                         * this aspect in the compiler yet.
                         */
                        if (prev_inst->inst->qpu.sig.ldtmu &&
                            !scoreboard->first_ldtmu_after_thrsw &&
                            (scoreboard->pending_ldtmu_count +
                             n->inst->ldtmu_count > 16 / c->threads)) {
                                continue;
                        }

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(c->devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(c->devinfo, inst);

                if (read_stalls(c->devinfo, scoreboard, inst)) {
                        /* Don't merge an instruction that stalls */
                        if (prev_inst)
                                continue;
                        else {
                                /* Any instruction that doesn't stall will have
                                 * a higher scheduling priority */
                                prio -= MAX_SCHEDULE_PRIORITY;
                                assert(prio < 0);
                        }
                }

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        /* If we did not find any instruction to schedule but we discarded
         * some of them to prioritize ldvary pipelining, try again.
         */
        if (!chosen && !prev_inst && skipped_insts_for_ldvary_pipelining) {
                skipped_insts_for_ldvary_pipelining = false;
                ldvary_pipelining = false;
                goto retry;
        }

        if (chosen && chosen->inst->qpu.sig.ldvary) {
                scoreboard->ldvary_count++;
                /* If we are pairing an ldvary, flag it so we can fix it up for
                 * optimal pipelining of ldvary sequences.
                 */
                if (prev_inst)
                        scoreboard->fixup_ldvary = true;
        }

        return chosen;
}

static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr,
                                  const struct v3d_device_info *devinfo)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
        else if (waddr == V3D_QPU_WADDR_UNIFA)
                scoreboard->last_unifa_write_tick = scoreboard->tick;
}

static void
update_scoreboard_for_sfu_stall_waddr(struct choose_scoreboard *scoreboard,
                                      const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_instr_is_sfu(inst)) {
                scoreboard->last_stallable_sfu_reg = inst->alu.add.waddr;
                scoreboard->last_stallable_sfu_tick = scoreboard->tick;
        }
}

static void
update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
                               const struct qinst *inst)
{
        /* Track whether we have seen any ldtmu after the last thread switch */
1634 if (scoreboard->tick == scoreboard->last_thrsw_tick + 2)
1635 scoreboard->first_ldtmu_after_thrsw = true;
1636
1637 /* Track the number of pending ldtmu instructions for outstanding
1638 * TMU lookups.
1639 */
1640 scoreboard->pending_ldtmu_count += inst->ldtmu_count;
1641 if (inst->qpu.sig.ldtmu) {
1642 assert(scoreboard->pending_ldtmu_count > 0);
1643 scoreboard->pending_ldtmu_count--;
1644 scoreboard->first_ldtmu_after_thrsw = false;
1645 }
1646 }
1647
1648 static void
set_has_rf0_flops_conflict(struct choose_scoreboard * scoreboard,const struct v3d_qpu_instr * inst,const struct v3d_device_info * devinfo)1649 set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
1650 const struct v3d_qpu_instr *inst,
1651 const struct v3d_device_info *devinfo)
1652 {
1653 if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
1654 v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
1655 !inst->sig_magic) {
1656 scoreboard->has_rf0_flops_conflict = true;
1657 }
1658 }
1659
1660 static void
update_scoreboard_for_rf0_flops(struct choose_scoreboard * scoreboard,const struct v3d_qpu_instr * inst,const struct v3d_device_info * devinfo)1661 update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
1662 const struct v3d_qpu_instr *inst,
1663 const struct v3d_device_info *devinfo)
1664 {
1665 if (devinfo->ver < 71)
1666 return;
1667
1668 /* Thread switch restrictions:
1669 *
1670 * At the point of a thread switch or thread end (when the actual
1671 * thread switch or thread end happens, not when the signalling
1672 * instruction is processed):
1673 *
1674 * - If the most recent write to rf0 was from a ldunif, ldunifa, or
1675 * ldvary instruction in which another signal also wrote to the
1676 * register file, and the final instruction of the thread section
1677 * contained a signal which wrote to the register file, then the
1678 * value of rf0 is undefined at the start of the new section
1679 *
1680 * Here we use the scoreboard to track if our last rf0 implicit write
1681 * happens at the same time that another signal writes the register
1682 * file (has_rf0_flops_conflict). We will use that information when
1683 * scheduling thrsw instructions to avoid putting anything in their
1684 * last delay slot which has a signal that writes to the register file.
1685 */
1686
1687 /* Reset tracking if we have an explicit rf0 write or we are starting
1688 * a new thread section.
1689 */
1690 if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
1691 scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
1692 scoreboard->last_implicit_rf0_write_tick = -10;
1693 scoreboard->has_rf0_flops_conflict = false;
1694 }
1695
1696 if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
1697 scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
1698 scoreboard->tick + 1 : scoreboard->tick;
1699 }
1700
1701 set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
1702 }
1703
1704 static void
update_scoreboard_for_chosen(struct choose_scoreboard * scoreboard,const struct qinst * qinst,const struct v3d_device_info * devinfo)1705 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
1706 const struct qinst *qinst,
1707 const struct v3d_device_info *devinfo)
1708 {
1709 const struct v3d_qpu_instr *inst = &qinst->qpu;
1710
1711 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
1712 return;
1713
1714 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
1715
1716 if (inst->alu.add.op != V3D_QPU_A_NOP) {
1717 if (inst->alu.add.magic_write) {
1718 update_scoreboard_for_magic_waddr(scoreboard,
1719 inst->alu.add.waddr,
1720 devinfo);
1721 } else {
1722 update_scoreboard_for_sfu_stall_waddr(scoreboard,
1723 inst);
1724 }
1725
1726 if (inst->alu.add.op == V3D_QPU_A_SETMSF)
1727 scoreboard->last_setmsf_tick = scoreboard->tick;
1728 }
1729
1730 if (inst->alu.mul.op != V3D_QPU_M_NOP) {
1731 if (inst->alu.mul.magic_write) {
1732 update_scoreboard_for_magic_waddr(scoreboard,
1733 inst->alu.mul.waddr,
1734 devinfo);
1735 }
1736 }
1737
1738 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) && inst->sig_magic) {
1739 update_scoreboard_for_magic_waddr(scoreboard,
1740 inst->sig_addr,
1741 devinfo);
1742 }
1743
1744 if (inst->sig.ldvary)
1745 scoreboard->last_ldvary_tick = scoreboard->tick;
1746
1747 update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
1748
1749 update_scoreboard_tmu_tracking(scoreboard, qinst);
1750 }
1751
1752 static void
1753 dump_state(const struct v3d_device_info *devinfo, struct dag *dag)
1754 {
1755 list_for_each_entry(struct schedule_node, n, &dag->heads, dag.link) {
1756 fprintf(stderr, " t=%4d: ", n->unblocked_time);
1757 v3d_qpu_dump(devinfo, &n->inst->qpu);
1758 fprintf(stderr, "\n");
1759
1760 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1761 struct schedule_node *child =
1762 (struct schedule_node *)edge->child;
1763 if (!child)
1764 continue;
1765
1766 fprintf(stderr, " - ");
1767 v3d_qpu_dump(devinfo, &child->inst->qpu);
1768 fprintf(stderr, " (%d parents, %c)\n",
1769 child->dag.parent_count,
1770 edge->data ? 'w' : 'r');
1771 }
1772 }
1773 }
1774
1775 static uint32_t magic_waddr_latency(const struct v3d_device_info *devinfo,
1776 enum v3d_qpu_waddr waddr,
1777 const struct v3d_qpu_instr *after)
1778 {
1779 /* Apply some huge latency between texture fetch requests and getting
1780 * their results back.
1781 *
1782 * FIXME: This is actually pretty bogus. If we do:
1783 *
1784 * mov tmu0_s, a
1785 * <a bit of math>
1786 * mov tmu0_s, b
1787 * load_tmu0
1788 * <more math>
1789 * load_tmu0
1790 *
1791 * we count that as worse than
1792 *
1793 * mov tmu0_s, a
1794 * mov tmu0_s, b
1795 * <lots of math>
1796 * load_tmu0
1797 * <more math>
1798 * load_tmu0
1799 *
1800 * because we associate the first load_tmu0 with the *second* tmu0_s.
1801 */
1802 if (v3d_qpu_magic_waddr_is_tmu(devinfo, waddr) &&
1803 v3d_qpu_waits_on_tmu(after)) {
1804 return 100;
1805 }
1806
1807 /* Assume that anything depending on us is consuming the SFU result. */
1808 if (v3d_qpu_magic_waddr_is_sfu(waddr))
1809 return 3;
1810
1811 return 1;
1812 }
1813
1814 static uint32_t
1815 instruction_latency(const struct v3d_device_info *devinfo,
1816 struct schedule_node *before, struct schedule_node *after)
1817 {
1818 const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
1819 const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
1820 uint32_t latency = 1;
1821
1822 if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
1823 after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
1824 return latency;
1825
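/* SFU results take longer to become available than regular ALU results,
 * so give any consumer a larger unblocked-time offset.
 */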
1826 if (v3d_qpu_instr_is_sfu(before_inst))
1827 return 2;
1828
1829 if (before_inst->alu.add.op != V3D_QPU_A_NOP &&
1830 before_inst->alu.add.magic_write) {
1831 latency = MAX2(latency,
1832 magic_waddr_latency(devinfo,
1833 before_inst->alu.add.waddr,
1834 after_inst));
1835 }
1836
1837 if (before_inst->alu.mul.op != V3D_QPU_M_NOP &&
1838 before_inst->alu.mul.magic_write) {
1839 latency = MAX2(latency,
1840 magic_waddr_latency(devinfo,
1841 before_inst->alu.mul.waddr,
1842 after_inst));
1843 }
1844
1845 return latency;
1846 }
1847
1848 /** Recursive computation of the delay member of a node. */
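/* For example (a sketch assuming unit latencies): in a dependency chain
 * A -> B -> C, C gets delay 1, B gets 1 + 1 = 2 and A gets 3, i.e. the
 * length of the slowest chain below each node.
 */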
1849 static void
1850 compute_delay(struct dag_node *node, void *state)
1851 {
1852 struct schedule_node *n = (struct schedule_node *)node;
1853 struct v3d_compile *c = (struct v3d_compile *) state;
1854
1855 n->delay = 1;
1856
1857 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1858 struct schedule_node *child =
1859 (struct schedule_node *)edge->child;
1860
1861 n->delay = MAX2(n->delay, (child->delay +
1862 instruction_latency(c->devinfo, n,
1863 child)));
1864 }
1865 }
1866
1867 /* Removes a DAG head, removing only the WAR edges. (dag_prune_head()
1868 * should be called on it later to finish pruning the other edges).
1869 */
1870 static void
1871 pre_remove_head(struct dag *dag, struct schedule_node *n)
1872 {
1873 list_delinit(&n->dag.link);
1874
1875 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1876 if (edge->data)
1877 dag_remove_edge(dag, edge);
1878 }
1879 }
1880
1881 static void
1882 mark_instruction_scheduled(const struct v3d_device_info *devinfo,
1883 struct dag *dag,
1884 uint32_t time,
1885 struct schedule_node *node)
1886 {
1887 if (!node)
1888 return;
1889
1890 util_dynarray_foreach(&node->dag.edges, struct dag_edge, edge) {
1891 struct schedule_node *child =
1892 (struct schedule_node *)edge->child;
1893
1894 if (!child)
1895 continue;
1896
1897 uint32_t latency = instruction_latency(devinfo, node, child);
1898
1899 child->unblocked_time = MAX2(child->unblocked_time,
1900 time + latency);
1901 }
1902 dag_prune_head(dag, &node->dag);
1903 }
1904
1905 static void
1906 insert_scheduled_instruction(struct v3d_compile *c,
1907 struct qblock *block,
1908 struct choose_scoreboard *scoreboard,
1909 struct qinst *inst)
1910 {
1911 list_addtail(&inst->link, &block->instructions);
1912
1913 update_scoreboard_for_chosen(scoreboard, inst, c->devinfo);
1914 c->qpu_inst_count++;
1915 scoreboard->tick++;
1916 }
1917
1918 static struct qinst *
1919 vir_nop()
1920 {
1921 struct qreg undef = vir_nop_reg();
1922 struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);
1923
1924 return qinst;
1925 }
1926
1927 static void
1928 emit_nop(struct v3d_compile *c, struct qblock *block,
1929 struct choose_scoreboard *scoreboard)
1930 {
1931 insert_scheduled_instruction(c, block, scoreboard, vir_nop());
1932 }
1933
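/* Note: here "slot" counts instructions starting at the one that carries the
 * thrend/thrsw signal (slot 0), so slot 2 is the final instruction of the
 * thread section.
 */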
1934 static bool
1935 qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
1936 const struct qinst *qinst, int slot)
1937 {
1938 const struct v3d_qpu_instr *inst = &qinst->qpu;
1939
1940 if (slot == 2 && qinst->is_tlb_z_write)
1941 return false;
1942
1943 if (slot > 0 && qinst->uniform != ~0)
1944 return false;
1945
1946 if (c->devinfo->ver == 42 && v3d_qpu_waits_vpm(inst))
1947 return false;
1948
1949 if (inst->sig.ldvary)
1950 return false;
1951
1952 if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
1953 /* GFXH-1625: TMUWT not allowed in the final instruction. */
1954 if (c->devinfo->ver == 42 && slot == 2 &&
1955 inst->alu.add.op == V3D_QPU_A_TMUWT) {
1956 return false;
1957 }
1958
1959 if (c->devinfo->ver == 42) {
1960 /* No writing physical registers at the end. */
1961 bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
1962 bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
1963 if ((!add_is_nop && !inst->alu.add.magic_write) ||
1964 (!mul_is_nop && !inst->alu.mul.magic_write)) {
1965 return false;
1966 }
1967
1968 if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
1969 !inst->sig_magic) {
1970 return false;
1971 }
1972 }
1973
1974 if (c->devinfo->ver >= 71) {
1975 /* The thread end instruction must not write to the
1976 * register file via the add/mul ALUs.
1977 */
1978 if (slot == 0 &&
1979 (!inst->alu.add.magic_write ||
1980 !inst->alu.mul.magic_write)) {
1981 return false;
1982 }
1983 }
1984
1985 if (c->devinfo->ver == 42) {
1986 /* RF0-2 might be overwritten during the delay slots by
1987 * fragment shader setup.
1988 */
1989 if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
1990 return false;
1991
1992 if (inst->raddr_b < 3 &&
1993 !inst->sig.small_imm_b &&
1994 v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
1995 return false;
1996 }
1997 }
1998
1999 if (c->devinfo->ver >= 71) {
2000 /* RF2-3 might be overwritten during the delay slots by
2001 * fragment shader setup.
2002 */
2003 if (v3d71_qpu_reads_raddr(inst, 2) ||
2004 v3d71_qpu_reads_raddr(inst, 3)) {
2005 return false;
2006 }
2007
2008 if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
2009 v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
2010 return false;
2011 }
2012 }
2013 }
2014
2015 return true;
2016 }
2017
2018 /**
2019 * This is called when trying to merge a thrsw back into the stream of
2020 * instructions scheduled *before* the thrsw signal, so that they fill its
2021 * delay slots. Because the actual execution of the thrsw happens after the
2022 * delay slots, it is usually safe to do this, but there are some cases that
2023 * need special care.
2024 */
2025 static bool
2026 qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2027 struct choose_scoreboard *scoreboard,
2028 const struct qinst *qinst,
2029 uint32_t slot)
2030 {
2031 /* No scheduling SFU when the result would land in the other
2032 * thread. The simulator complains for safety, though it
2033 * would only occur for dead code in our case.
2034 */
2035 if (slot > 0) {
2036 if (c->devinfo->ver == 42 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
2037 return false;
2038 if (c->devinfo->ver >= 71 && v3d_qpu_instr_is_sfu(&qinst->qpu))
2039 return false;
2040 }
2041
2042 if (qinst->qpu.sig.ldvary) {
2043 if (c->devinfo->ver == 42 && slot > 0)
2044 return false;
2045 if (c->devinfo->ver >= 71 && slot == 2)
2046 return false;
2047 }
2048
2049 /* unifa and the following 3 instructions can't overlap a
2050 * thread switch/end. The docs further clarify that this means
2051 * the cycle at which the actual thread switch/end happens
2052 * and not when the thrsw instruction is processed, which would
2053 * be after the 2 delay slots following the thrsw instruction.
2054 * This means that we can move a thrsw up to the instruction
2055 * right after unifa:
2056 *
2057 * unifa, r5
2058 * thrsw
2059 * delay slot 1
2060 * delay slot 2
2061 * Thread switch happens here, 4 instructions away from unifa
2062 */
2063 if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
2064 return false;
2065
2066 /* See comment when we set has_rf0_flops_conflict for details */
2067 if (c->devinfo->ver >= 71 &&
2068 slot == 2 &&
2069 v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
2070 !qinst->qpu.sig_magic) {
2071 if (scoreboard->has_rf0_flops_conflict)
2072 return false;
2073 if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
2074 return false;
2075 }
2076
2077 return true;
2078 }
2079
2080 /**
2081 * This is called for instructions scheduled *after* a thrsw signal that may
2082 * land in the delay slots of the thrsw. Because these instructions were
2083 * scheduled after the thrsw, we need to be careful when placing them into
2084 * the delay slots, since that means that we are moving them ahead of the
2085 * thread switch and we need to ensure that is not a problem.
2086 */
2087 static bool
2088 qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
2089 struct choose_scoreboard *scoreboard,
2090 const struct qinst *qinst)
2091 {
2092 const uint32_t slot = scoreboard->tick - scoreboard->last_thrsw_tick;
2093 assert(slot <= 2);
2094
2095 /* We merge thrsw instructions back into the instruction stream
2096 * manually, so any instructions scheduled after a thrsw should be
2097 * in the actual delay slots and not in the same slot as the thrsw.
2098 */
2099 assert(slot >= 1);
2100
2101 /* No emitting a thrsw while the previous thrsw hasn't happened yet. */
2102 if (qinst->qpu.sig.thrsw)
2103 return false;
2104
2105 /* The restrictions for instructions scheduled before the thrsw
2106 * also apply to instructions scheduled after the thrsw that we want
2107 * to place in its delay slots.
2108 */
2109 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
2110 return false;
2111
2112 /* TLB access is disallowed until scoreboard wait is executed, which
2113 * we do on the last thread switch.
2114 */
2115 if (qpu_inst_is_tlb(&qinst->qpu))
2116 return false;
2117
2118 /* Instruction sequence restrictions: Branch is not allowed in delay
2119 * slots of a thrsw.
2120 */
2121 if (qinst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2122 return false;
2123
2124 /* Miscellaneous restrictions: At the point of a thrsw we need to have
2125 * at least one outstanding lookup or TSY wait.
2126 *
2127 * So avoid placing TMU instructions scheduled after the thrsw into
2128 * its delay slots or we may be compromising the integrity of our TMU
2129 * sequences. Also, notice that if we moved these instructions into
2130 * the delay slots of a previous thrsw we could overflow our TMU output
2131 * fifo, since we could be effectively pipelining a lookup scheduled
2132 * after the thrsw into the sequence before the thrsw.
2133 */
2134 if (v3d_qpu_writes_tmu(c->devinfo, &qinst->qpu) ||
2135 qinst->qpu.sig.wrtmuc) {
2136 return false;
2137 }
2138
2139 /* Don't move instructions that wait on the TMU before the thread switch
2140 * happens since that would make the current thread stall before the
2141 * switch, which is exactly what we want to avoid with the thrsw
2142 * instruction.
2143 */
2144 if (v3d_qpu_waits_on_tmu(&qinst->qpu))
2145 return false;
2146
2147 /* A thread switch invalidates all accumulators, so don't place any
2148 * instructions that write accumulators into the delay slots.
2149 */
2150 if (v3d_qpu_writes_accum(c->devinfo, &qinst->qpu))
2151 return false;
2152
2153 /* Multop has an implicit write to the rtop register, which is a
2154 * specialized accumulator that is only used with this instruction.
2155 */
2156 if (qinst->qpu.alu.mul.op == V3D_QPU_M_MULTOP)
2157 return false;
2158
2159 /* Flags are invalidated across a thread switch, so don't place
2160 * instructions that write flags into delay slots.
2161 */
2162 if (v3d_qpu_writes_flags(&qinst->qpu))
2163 return false;
2164
2165 /* TSY sync ops materialize at the point of the next thread switch,
2166 * therefore, if we have a TSY sync right after a thread switch, we
2167 * cannot place it in its delay slots, or we would be moving the sync
2168 * to the thrsw before it instead.
2169 */
2170 if (qinst->qpu.alu.add.op == V3D_QPU_A_BARRIERID)
2171 return false;
2172
2173 return true;
2174 }
2175
2176 static bool
2177 valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
2178 struct qinst *qinst, int instructions_in_sequence,
2179 bool is_thrend)
2180 {
2181 for (int slot = 0; slot < instructions_in_sequence; slot++) {
2182 if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
2183 qinst, slot)) {
2184 return false;
2185 }
2186
2187 if (is_thrend &&
2188 !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
2189 return false;
2190 }
2191
2192 /* Note that the list is circular, so we can only do this up
2193 * to instructions_in_sequence.
2194 */
2195 qinst = (struct qinst *)qinst->link.next;
2196 }
2197
2198 return true;
2199 }
2200
2201 /**
2202 * Emits a THRSW signal in the stream, trying to move it up to pair with
2203 * another instruction.
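 *
 * For example (a sketch): given already-scheduled instructions A, B and C,
 * the thrsw signal may get merged into A so that B and C become its delay
 * slots:
 *
 *     A                A ; thrsw
 *     B        -->     B            (delay slot 1)
 *     C                C            (delay slot 2)
 *     thrsw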
2204 */
2205 static int
2206 emit_thrsw(struct v3d_compile *c,
2207 struct qblock *block,
2208 struct choose_scoreboard *scoreboard,
2209 struct qinst *inst,
2210 bool is_thrend)
2211 {
2212 int time = 0;
2213
2214 /* There should be nothing in a thrsw inst being scheduled other than
2215 * the signal bits.
2216 */
2217 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
2218 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
2219 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
2220
2221 /* Don't try to emit a thrsw in the delay slots of a previous thrsw
2222 * or branch.
2223 */
2224 while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
2225 emit_nop(c, block, scoreboard);
2226 time++;
2227 }
2228 while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
2229 emit_nop(c, block, scoreboard);
2230 time++;
2231 }
2232
2233 /* Find how far back into previous instructions we can put the THRSW. */
2234 int slots_filled = 0;
2235 int invalid_sig_count = 0;
2236 int invalid_seq_count = 0;
2237 bool last_thrsw_after_invalid_ok = false;
2238 struct qinst *merge_inst = NULL;
2239 vir_for_each_inst_rev(prev_inst, block) {
2240 /* No emitting our thrsw while the previous thrsw hasn't
2241 * happened yet.
2242 */
2243 if (scoreboard->last_thrsw_tick + 3 >
2244 scoreboard->tick - (slots_filled + 1)) {
2245 break;
2246 }
2247
2249 if (!valid_thrsw_sequence(c, scoreboard,
2250 prev_inst, slots_filled + 1,
2251 is_thrend)) {
2252 /* Even if the current sequence isn't valid, we may
2253 * be able to get a valid sequence by trying to move the
2254 * thrsw earlier, so keep going.
2255 */
2256 invalid_seq_count++;
2257 goto cont_block;
2258 }
2259
2260 struct v3d_qpu_sig sig = prev_inst->qpu.sig;
2261 sig.thrsw = true;
2262 uint32_t packed_sig;
2263 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig)) {
2264 /* If we can't merge the thrsw here because of signal
2265 * incompatibility, keep going, we might be able to
2266 * merge it in an earlier instruction.
2267 */
2268 invalid_sig_count++;
2269 goto cont_block;
2270 }
2271
2272 /* For the last thrsw we need 2 consecutive slots that are
2273 * thrsw compatible, so if we have previously jumped over
2274 * an incompatible signal, flag that we have found the first
2275 * valid slot here and keep going.
2276 */
2277 if (inst->is_last_thrsw && invalid_sig_count > 0 &&
2278 !last_thrsw_after_invalid_ok) {
2279 last_thrsw_after_invalid_ok = true;
2280 invalid_sig_count++;
2281 goto cont_block;
2282 }
2283
2284 /* We can merge the thrsw in this instruction */
2285 last_thrsw_after_invalid_ok = false;
2286 invalid_sig_count = 0;
2287 invalid_seq_count = 0;
2288 merge_inst = prev_inst;
2289
2290 cont_block:
2291 if (++slots_filled == 3)
2292 break;
2293 }
2294
2295 /* If we jumped over a signal incompatibility and did not manage to
2296 * merge the thrsw in the end, we need to adjust slots filled to match
2297 * the last valid merge point.
2298 */
2299 assert((invalid_sig_count == 0 && invalid_seq_count == 0) ||
2300 slots_filled >= invalid_sig_count + invalid_seq_count);
2301 if (invalid_sig_count > 0)
2302 slots_filled -= invalid_sig_count;
2303 if (invalid_seq_count > 0)
2304 slots_filled -= invalid_seq_count;
2305
2306 bool needs_free = false;
2307 if (merge_inst) {
2308 merge_inst->qpu.sig.thrsw = true;
2309 needs_free = true;
2310 scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
2311 } else {
2312 scoreboard->last_thrsw_tick = scoreboard->tick;
2313 insert_scheduled_instruction(c, block, scoreboard, inst);
2314 time++;
2315 slots_filled++;
2316 merge_inst = inst;
2317 }
2318
2319 scoreboard->first_thrsw_emitted = true;
2320
2321 /* If we're emitting the last THRSW (other than program end), then
2322 * signal that to the HW by emitting two THRSWs in a row.
2323 */
2324 if (inst->is_last_thrsw) {
2325 if (slots_filled <= 1) {
2326 emit_nop(c, block, scoreboard);
2327 time++;
2328 }
2329 struct qinst *second_inst =
2330 (struct qinst *)merge_inst->link.next;
2331 second_inst->qpu.sig.thrsw = true;
2332 scoreboard->last_thrsw_emitted = true;
2333 }
2334
2335 /* Make sure the thread end executes within the program lifespan */
2336 if (is_thrend) {
2337 for (int i = 0; i < 3 - slots_filled; i++) {
2338 emit_nop(c, block, scoreboard);
2339 time++;
2340 }
2341 }
2342
2343 /* If we put our THRSW into another instruction, free up the
2344 * instruction that didn't end up scheduled into the list.
2345 */
2346 if (needs_free)
2347 free(inst);
2348
2349 return time;
2350 }
2351
2352 static bool
2353 qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
2354 {
2355 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
2356 return false;
2357
2358 if (inst->qpu.sig.thrsw)
2359 return false;
2360
2361 if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
2362 return false;
2363
2364 if (vir_has_uniform(inst))
2365 return false;
2366
2367 return true;
2368 }
2369
2370 static void
2371 emit_branch(struct v3d_compile *c,
2372 struct qblock *block,
2373 struct choose_scoreboard *scoreboard,
2374 struct qinst *inst)
2375 {
2376 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2377
2378 /* We should not have picked up a branch for the delay slots of a previous
2379 * thrsw, branch or unifa write instruction.
2380 */
2381 int branch_tick = scoreboard->tick;
2382 assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
2383 assert(scoreboard->last_branch_tick + 3 < branch_tick);
2384 assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
2385
2386 /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
2387 * setmsf.
2388 */
2389 bool is_safe_msf_branch =
2390 c->devinfo->ver >= 71 ||
2391 inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
2392 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
2393 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
2394 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_NA0;
2395 assert(scoreboard->last_setmsf_tick != branch_tick - 1 ||
2396 is_safe_msf_branch);
2397
2398 /* Insert the branch instruction */
2399 insert_scheduled_instruction(c, block, scoreboard, inst);
2400
2401 /* Now see if we can move the branch instruction back into the
2402 * instruction stream to fill its delay slots
2403 */
2404 int slots_filled = 0;
2405 while (slots_filled < 3 && block->instructions.next != &inst->link) {
2406 struct qinst *prev_inst = (struct qinst *) inst->link.prev;
2407 assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
2408
2409 /* Can't move the branch instruction if that would place it
2410 * in the delay slots of other instructions.
2411 */
2412 if (scoreboard->last_branch_tick + 3 >=
2413 branch_tick - slots_filled - 1) {
2414 break;
2415 }
2416
2417 if (scoreboard->last_thrsw_tick + 2 >=
2418 branch_tick - slots_filled - 1) {
2419 break;
2420 }
2421
2422 if (scoreboard->last_unifa_write_tick + 3 >=
2423 branch_tick - slots_filled - 1) {
2424 break;
2425 }
2426
2427 /* Do not move up a branch if it can disrupt an ldvary sequence
2428 * as that can cause stomping of the r5 register.
2429 */
2430 if (scoreboard->last_ldvary_tick + 2 >=
2431 branch_tick - slots_filled) {
2432 break;
2433 }
2434
2435 /* Can't move a conditional branch before the instruction
2436 * that writes the flags for its condition.
2437 */
2438 if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
2439 inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
2440 break;
2441 }
2442
2443 if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
2444 break;
2445
2446 if (!is_safe_msf_branch) {
2447 struct qinst *prev_prev_inst =
2448 (struct qinst *) prev_inst->link.prev;
2449 if (prev_prev_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
2450 prev_prev_inst->qpu.alu.add.op == V3D_QPU_A_SETMSF) {
2451 break;
2452 }
2453 }
2454
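/* prev_inst is safe to run after the branch is issued, so relink it after
 * the branch to fill one of its delay slots.
 */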
2455 list_del(&prev_inst->link);
2456 list_add(&prev_inst->link, &inst->link);
2457 slots_filled++;
2458 }
2459
2460 block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
2461 scoreboard->last_branch_tick = branch_tick - slots_filled;
2462
2463 /* Fill any remaining delay slots.
2464 *
2465 * For unconditional branches we'll try to fill these with the
2466 * first instructions in the successor block after scheduling
2467 * all blocks when setting up branch targets.
2468 */
2469 for (int i = 0; i < 3 - slots_filled; i++)
2470 emit_nop(c, block, scoreboard);
2471 }
2472
2473 static bool
2474 alu_reads_register(const struct v3d_device_info *devinfo,
2475 struct v3d_qpu_instr *inst,
2476 bool add, bool magic, uint32_t index)
2477 {
2478 uint32_t num_src;
2479 if (add)
2480 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
2481 else
2482 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
2483
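/* On v3d 4.x sources go through muxes: when "magic" is set we compare the
 * mux selection itself against the index (accumulator-style reads), while
 * register file reads are checked through raddr_a/raddr_b.
 */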
2484 if (devinfo->ver == 42) {
2485 enum v3d_qpu_mux mux_a, mux_b;
2486 if (add) {
2487 mux_a = inst->alu.add.a.mux;
2488 mux_b = inst->alu.add.b.mux;
2489 } else {
2490 mux_a = inst->alu.mul.a.mux;
2491 mux_b = inst->alu.mul.b.mux;
2492 }
2493
2494 for (int i = 0; i < num_src; i++) {
2495 if (magic) {
2496 if (i == 0 && mux_a == index)
2497 return true;
2498 if (i == 1 && mux_b == index)
2499 return true;
2500 } else {
2501 if (i == 0 && mux_a == V3D_QPU_MUX_A &&
2502 inst->raddr_a == index) {
2503 return true;
2504 }
2505 if (i == 0 && mux_a == V3D_QPU_MUX_B &&
2506 inst->raddr_b == index) {
2507 return true;
2508 }
2509 if (i == 1 && mux_b == V3D_QPU_MUX_A &&
2510 inst->raddr_a == index) {
2511 return true;
2512 }
2513 if (i == 1 && mux_b == V3D_QPU_MUX_B &&
2514 inst->raddr_b == index) {
2515 return true;
2516 }
2517 }
2518 }
2519
2520 return false;
2521 }
2522
2523 assert(devinfo->ver >= 71);
2524 assert(!magic);
2525
2526 uint32_t raddr_a, raddr_b;
2527 if (add) {
2528 raddr_a = inst->alu.add.a.raddr;
2529 raddr_b = inst->alu.add.b.raddr;
2530 } else {
2531 raddr_a = inst->alu.mul.a.raddr;
2532 raddr_b = inst->alu.mul.b.raddr;
2533 }
2534
2535 for (int i = 0; i < num_src; i++) {
2536 if (i == 0 && raddr_a == index)
2537 return true;
2538 if (i == 1 && raddr_b == index)
2539 return true;
2540 }
2541
2542 return false;
2543 }
2544
2545 /**
2546 * This takes an ldvary signal merged into 'inst' and tries to move it up to
2547 * the previous instruction to get good pipelining of ldvary sequences,
2548 * transforming this:
2549 *
2550 * nop ; nop ; ldvary.r4
2551 * nop ; fmul r0, r4, rf0 ;
2552 * fadd rf13, r0, r5 ; nop; ; ldvary.r1 <-- inst
2553 *
2554 * into:
2555 *
2556 * nop ; nop ; ldvary.r4
2557 * nop ; fmul r0, r4, rf0 ; ldvary.r1
2558 * fadd rf13, r0, r5 ; nop; ; <-- inst
2559 *
2560 * If we manage to do this successfully (we return true here), then flagging
2561 * the ldvary as "scheduled" may promote the follow-up fmul to a DAG head that
2562 * we will be able to pick up to merge into 'inst', leading to code like this:
2563 *
2564 * nop ; nop ; ldvary.r4
2565 * nop ; fmul r0, r4, rf0 ; ldvary.r1
2566 * fadd rf13, r0, r5 ; fmul r2, r1, rf0 ; <-- inst
2567 */
2568 static bool
2569 fixup_pipelined_ldvary(struct v3d_compile *c,
2570 struct choose_scoreboard *scoreboard,
2571 struct qblock *block,
2572 struct v3d_qpu_instr *inst)
2573 {
2574 const struct v3d_device_info *devinfo = c->devinfo;
2575
2576 /* We only call this if we have successfully merged an ldvary into a
2577 * previous instruction.
2578 */
2579 assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
2580 assert(inst->sig.ldvary);
2581 uint32_t ldvary_magic = inst->sig_magic;
2582 uint32_t ldvary_index = inst->sig_addr;
2583
2584 /* The instruction in which we merged the ldvary cannot read
2585 * the ldvary destination; if it did, moving the ldvary before
2586 * it would overwrite the value it reads.
2587 */
2588 if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
2589 return false;
2590 if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
2591 return false;
2592
2593 /* The implicit ldvary destination may not be written to by a signal
2594 * in the instruction following ldvary. Since we are planning to move
2595 * ldvary to the previous instruction, this means we need to check if
2596 * the current instruction has any other signal that could create this
2597 * conflict. The only other signal that can write to the implicit
2598 * ldvary destination that is compatible with ldvary in the same
2599 * instruction is ldunif.
2600 */
2601 if (inst->sig.ldunif)
2602 return false;
2603
2604 /* The previous instruction can't write to the same destination as the
2605 * ldvary.
2606 */
2607 struct qinst *prev = (struct qinst *) block->instructions.prev;
2608 if (!prev || prev->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
2609 return false;
2610
2611 if (prev->qpu.alu.add.op != V3D_QPU_A_NOP) {
2612 if (prev->qpu.alu.add.magic_write == ldvary_magic &&
2613 prev->qpu.alu.add.waddr == ldvary_index) {
2614 return false;
2615 }
2616 }
2617
2618 if (prev->qpu.alu.mul.op != V3D_QPU_M_NOP) {
2619 if (prev->qpu.alu.mul.magic_write == ldvary_magic &&
2620 prev->qpu.alu.mul.waddr == ldvary_index) {
2621 return false;
2622 }
2623 }
2624
2625 /* The previous instruction cannot have a conflicting signal */
2626 if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
2627 return false;
2628
2629 uint32_t sig;
2630 struct v3d_qpu_sig new_sig = prev->qpu.sig;
2631 new_sig.ldvary = true;
2632 if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
2633 return false;
2634
2635 /* The previous instruction cannot use flags since ldvary uses the
2636 * 'cond' instruction field to store the destination.
2637 */
2638 if (v3d_qpu_writes_flags(&prev->qpu))
2639 return false;
2640 if (v3d_qpu_reads_flags(&prev->qpu))
2641 return false;
2642
2643 /* We can't put an ldvary in the delay slots of a thrsw. We should've
2644 * prevented this when pairing up the ldvary with another instruction
2645 * and flagging it for a fixup. In V3D 7.x this is limited only to the
2646 * second delay slot.
2647 */
2648 assert((devinfo->ver == 42 &&
2649 scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
2650 (devinfo->ver >= 71 &&
2651 scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
2652
2653 /* Move the ldvary to the previous instruction and remove it from the
2654 * current one.
2655 */
2656 prev->qpu.sig.ldvary = true;
2657 prev->qpu.sig_magic = ldvary_magic;
2658 prev->qpu.sig_addr = ldvary_index;
2659 scoreboard->last_ldvary_tick = scoreboard->tick - 1;
2660
2661 inst->sig.ldvary = false;
2662 inst->sig_magic = false;
2663 inst->sig_addr = 0;
2664
2665 /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
2666 if (devinfo->ver >= 71) {
2667 scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
2668 set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
2669 }
2670
2671 /* By moving ldvary to the previous instruction we make it update r5
2672 * (rf0 for ver >= 71) in the current one, so nothing else in it
2673 * should write this register.
2674 *
2675 * This should've been prevented by our dependency tracking, which
2676 * would not allow ldvary to be paired up with an instruction that
2677 * writes r5/rf0 (since our dependency tracking doesn't know that the
2678 * ldvary write to r5/rf0 happens in the next instruction).
2679 */
2680 assert(!v3d_qpu_writes_r5(devinfo, inst));
2681 assert(devinfo->ver == 42 ||
2682 (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
2683 !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
2684
2685 return true;
2686 }
2687
2688 static uint32_t
2689 schedule_instructions(struct v3d_compile *c,
2690 struct choose_scoreboard *scoreboard,
2691 struct qblock *block,
2692 enum quniform_contents *orig_uniform_contents,
2693 uint32_t *orig_uniform_data,
2694 uint32_t *next_uniform)
2695 {
2696 const struct v3d_device_info *devinfo = c->devinfo;
2697 uint32_t time = 0;
2698
2699 while (!list_is_empty(&scoreboard->dag->heads)) {
2700 struct schedule_node *chosen =
2701 choose_instruction_to_schedule(c, scoreboard, NULL);
2702 struct schedule_node *merge = NULL;
2703
2704 /* If there are no valid instructions to schedule, drop a NOP
2705 * in.
2706 */
2707 struct qinst *qinst = chosen ? chosen->inst : vir_nop();
2708 struct v3d_qpu_instr *inst = &qinst->qpu;
2709
2710 if (debug) {
2711 fprintf(stderr, "t=%4d: current list:\n",
2712 time);
2713 dump_state(devinfo, scoreboard->dag);
2714 fprintf(stderr, "t=%4d: chose: ", time);
2715 v3d_qpu_dump(devinfo, inst);
2716 fprintf(stderr, "\n");
2717 }
2718
2719 /* We can't mark_instruction_scheduled() the chosen inst until
2720 * we're done identifying instructions to merge, so put the
2721 * merged instructions on a list for a moment.
2722 */
2723 struct list_head merged_list;
2724 list_inithead(&merged_list);
2725
2726 /* Schedule this instruction onto the QPU list. Also try to
2727 * find an instruction to pair with it.
2728 */
2729 if (chosen) {
2730 time = MAX2(chosen->unblocked_time, time);
2731 pre_remove_head(scoreboard->dag, chosen);
2732
2733 while ((merge =
2734 choose_instruction_to_schedule(c, scoreboard,
2735 chosen))) {
2736 time = MAX2(merge->unblocked_time, time);
2737 pre_remove_head(scoreboard->dag, merge);
2738 list_addtail(&merge->link, &merged_list);
2739 (void)qpu_merge_inst(devinfo, inst,
2740 inst, &merge->inst->qpu);
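/* Carry over the uniform stream index from the instruction we merged in. */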
2741 if (merge->inst->uniform != -1) {
2742 chosen->inst->uniform =
2743 merge->inst->uniform;
2744 }
2745
2746 chosen->inst->ldtmu_count +=
2747 merge->inst->ldtmu_count;
2748
2749 if (debug) {
2750 fprintf(stderr, "t=%4d: merging: ",
2751 time);
2752 v3d_qpu_dump(devinfo, &merge->inst->qpu);
2753 fprintf(stderr, "\n");
2754 fprintf(stderr, " result: ");
2755 v3d_qpu_dump(devinfo, inst);
2756 fprintf(stderr, "\n");
2757 }
2758
2759 if (scoreboard->fixup_ldvary) {
2760 scoreboard->fixup_ldvary = false;
2761 if (fixup_pipelined_ldvary(c, scoreboard, block, inst)) {
2762 /* Flag the ldvary as scheduled
2763 * now so we can try to merge the
2764 * follow-up instruction in the
2765 * ldvary sequence into the
2766 * current instruction.
2767 */
2768 mark_instruction_scheduled(
2769 devinfo, scoreboard->dag,
2770 time, merge);
2771 }
2772 }
2773 }
2774 if (read_stalls(c->devinfo, scoreboard, inst))
2775 c->qpu_inst_stalled_count++;
2776 }
2777
2778 /* Update the uniform index for the rewritten location --
2779 * branch target updating will still need to change
2780 * c->uniform_data[] using this index.
2781 */
2782 if (qinst->uniform != -1) {
2783 if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
2784 block->branch_uniform = *next_uniform;
2785
2786 c->uniform_data[*next_uniform] =
2787 orig_uniform_data[qinst->uniform];
2788 c->uniform_contents[*next_uniform] =
2789 orig_uniform_contents[qinst->uniform];
2790 qinst->uniform = *next_uniform;
2791 (*next_uniform)++;
2792 }
2793
2794 if (debug) {
2795 fprintf(stderr, "\n");
2796 }
2797
2798 /* Now that we've scheduled a new instruction, some of its
2799 * children can be promoted to the list of instructions ready to
2800 * be scheduled. Update the children's unblocked time for this
2801 * DAG edge as we do so.
2802 */
2803 mark_instruction_scheduled(devinfo, scoreboard->dag, time, chosen);
2804 list_for_each_entry(struct schedule_node, merge, &merged_list,
2805 link) {
2806 mark_instruction_scheduled(devinfo, scoreboard->dag, time, merge);
2807
2808 /* The merged VIR instruction doesn't get re-added to the
2809 * block, so free it now.
2810 */
2811 free(merge->inst);
2812 }
2813
2814 if (inst->sig.thrsw) {
2815 time += emit_thrsw(c, block, scoreboard, qinst, false);
2816 } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
2817 emit_branch(c, block, scoreboard, qinst);
2818 } else {
2819 insert_scheduled_instruction(c, block,
2820 scoreboard, qinst);
2821 }
2822 }
2823
2824 return time;
2825 }
2826
2827 static uint32_t
2828 qpu_schedule_instructions_block(struct v3d_compile *c,
2829 struct choose_scoreboard *scoreboard,
2830 struct qblock *block,
2831 enum quniform_contents *orig_uniform_contents,
2832 uint32_t *orig_uniform_data,
2833 uint32_t *next_uniform)
2834 {
2835 void *mem_ctx = ralloc_context(NULL);
2836 scoreboard->dag = dag_create(mem_ctx);
2837 struct list_head setup_list;
2838
2839 list_inithead(&setup_list);
2840
2841 /* Wrap each instruction in a scheduler structure. */
2842 while (!list_is_empty(&block->instructions)) {
2843 struct qinst *qinst = (struct qinst *)block->instructions.next;
2844 struct schedule_node *n =
2845 rzalloc(mem_ctx, struct schedule_node);
2846
2847 dag_init_node(scoreboard->dag, &n->dag);
2848 n->inst = qinst;
2849
2850 list_del(&qinst->link);
2851 list_addtail(&n->link, &setup_list);
2852 }
2853
2854 calculate_forward_deps(c, scoreboard->dag, &setup_list);
2855 calculate_reverse_deps(c, scoreboard->dag, &setup_list);
2856
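/* Compute each node's delay (the slowest dependency chain through its
 * children) bottom-up before scheduling.
 */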
2857 dag_traverse_bottom_up(scoreboard->dag, compute_delay, c);
2858
2859 uint32_t cycles = schedule_instructions(c, scoreboard, block,
2860 orig_uniform_contents,
2861 orig_uniform_data,
2862 next_uniform);
2863
2864 ralloc_free(mem_ctx);
2865 scoreboard->dag = NULL;
2866
2867 return cycles;
2868 }
2869
2870 static void
2871 qpu_set_branch_targets(struct v3d_compile *c)
2872 {
2873 vir_for_each_block(block, c) {
2874 /* The end block of the program has no branch. */
2875 if (!block->successors[0])
2876 continue;
2877
2878 /* If there was no branch instruction, then the successor
2879 * block must follow immediately after this one.
2880 */
2881 if (block->branch_qpu_ip == ~0) {
2882 assert(block->end_qpu_ip + 1 ==
2883 block->successors[0]->start_qpu_ip);
2884 continue;
2885 }
2886
2887 /* Walk back through the delay slots to find the branch
2888 * instr.
2889 */
2890 struct qinst *branch = NULL;
2891 struct list_head *entry = block->instructions.prev;
2892 int32_t delay_slot_count = -1;
2893 struct qinst *delay_slots_start = NULL;
2894 for (int i = 0; i < 3; i++) {
2895 entry = entry->prev;
2896 struct qinst *inst =
2897 container_of(entry, struct qinst, link);
2898
2899 if (delay_slot_count == -1) {
2900 if (!v3d_qpu_is_nop(&inst->qpu))
2901 delay_slot_count = i;
2902 else
2903 delay_slots_start = inst;
2904 }
2905
2906 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
2907 branch = inst;
2908 break;
2909 }
2910 }
2911 assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
2912 assert(delay_slot_count >= 0 && delay_slot_count <= 3);
2913 assert(delay_slot_count == 0 || delay_slots_start != NULL);
2914
2915 /* Make sure that the if-we-don't-jump
2916 * successor was scheduled just after the
2917 * delay slots.
2918 */
2919 assert(!block->successors[1] ||
2920 block->successors[1]->start_qpu_ip ==
2921 block->branch_qpu_ip + 4);
2922
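/* The branch offset is measured in bytes (64-bit instructions) relative to
 * branch_qpu_ip + 4, i.e. the instruction that follows the branch and its
 * three delay slots.
 */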
2923 branch->qpu.branch.offset =
2924 ((block->successors[0]->start_qpu_ip -
2925 (block->branch_qpu_ip + 4)) *
2926 sizeof(uint64_t));
2927
2928 /* Set up the relative offset to jump in the
2929 * uniform stream.
2930 *
2931 * Use a temporary here, because
2932 * uniform_data[inst->uniform] may be shared
2933 * between multiple instructions.
2934 */
2935 assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
2936 c->uniform_data[branch->uniform] =
2937 (block->successors[0]->start_uniform -
2938 (block->branch_uniform + 1)) * 4;
2939
2940 /* If this is an unconditional branch, try to fill any remaining
2941 * delay slots with the initial instructions of the successor
2942 * block.
2943 *
2944 * FIXME: we can do the same for conditional branches if we
2945 * predicate the instructions to match the branch condition.
2946 */
2947 if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
2948 struct list_head *successor_insts =
2949 &block->successors[0]->instructions;
2950 delay_slot_count = MIN2(delay_slot_count,
2951 list_length(successor_insts));
2952 struct qinst *s_inst =
2953 (struct qinst *) successor_insts->next;
2954 struct qinst *slot = delay_slots_start;
2955 int slots_filled = 0;
2956 while (slots_filled < delay_slot_count &&
2957 qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
2958 memcpy(&slot->qpu, &s_inst->qpu,
2959 sizeof(slot->qpu));
2960 s_inst = (struct qinst *) s_inst->link.next;
2961 slot = (struct qinst *) slot->link.next;
2962 slots_filled++;
2963 }
2964 branch->qpu.branch.offset +=
2965 slots_filled * sizeof(uint64_t);
2966 }
2967 }
2968 }
2969
2970 uint32_t
2971 v3d_qpu_schedule_instructions(struct v3d_compile *c)
2972 {
2973 const struct v3d_device_info *devinfo = c->devinfo;
2974 struct qblock *end_block = list_last_entry(&c->blocks,
2975 struct qblock, link);
2976
2977 /* We reorder the uniforms as we schedule instructions, so save the
2978 * old data off and replace it.
2979 */
2980 uint32_t *uniform_data = c->uniform_data;
2981 enum quniform_contents *uniform_contents = c->uniform_contents;
2982 c->uniform_contents = ralloc_array(c, enum quniform_contents,
2983 c->num_uniforms);
2984 c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
2985 c->uniform_array_size = c->num_uniforms;
2986 uint32_t next_uniform = 0;
2987
2988 struct choose_scoreboard scoreboard;
2989 memset(&scoreboard, 0, sizeof(scoreboard));
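/* Initialize the "last event" ticks far enough in the past that they do not
 * trigger any scheduling restrictions at the start of the program.
 */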
2990 scoreboard.last_ldvary_tick = -10;
2991 scoreboard.last_unifa_write_tick = -10;
2992 scoreboard.last_magic_sfu_write_tick = -10;
2993 scoreboard.last_uniforms_reset_tick = -10;
2994 scoreboard.last_thrsw_tick = -10;
2995 scoreboard.last_branch_tick = -10;
2996 scoreboard.last_setmsf_tick = -10;
2997 scoreboard.last_stallable_sfu_tick = -10;
2998 scoreboard.first_ldtmu_after_thrsw = true;
2999 scoreboard.last_implicit_rf0_write_tick = -10;
3000
3001 if (debug) {
3002 fprintf(stderr, "Pre-schedule instructions\n");
3003 vir_for_each_block(block, c) {
3004 fprintf(stderr, "BLOCK %d\n", block->index);
3005 list_for_each_entry(struct qinst, qinst,
3006 &block->instructions, link) {
3007 v3d_qpu_dump(devinfo, &qinst->qpu);
3008 fprintf(stderr, "\n");
3009 }
3010 }
3011 fprintf(stderr, "\n");
3012 }
3013
3014 uint32_t cycles = 0;
3015 vir_for_each_block(block, c) {
3016 block->start_qpu_ip = c->qpu_inst_count;
3017 block->branch_qpu_ip = ~0;
3018 block->start_uniform = next_uniform;
3019
3020 cycles += qpu_schedule_instructions_block(c,
3021 &scoreboard,
3022 block,
3023 uniform_contents,
3024 uniform_data,
3025 &next_uniform);
3026
3027 block->end_qpu_ip = c->qpu_inst_count - 1;
3028 }
3029
3030 /* Emit the program-end THRSW instruction. */
3031 struct qinst *thrsw = vir_nop();
3032 thrsw->qpu.sig.thrsw = true;
3033 emit_thrsw(c, end_block, &scoreboard, thrsw, true);
3034
3035 qpu_set_branch_targets(c);
3036
3037 assert(next_uniform == c->num_uniforms);
3038
3039 return cycles;
3040 }
3041