1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "util/ralloc.h"
25 #include "util/register_allocate.h"
26 #include "common/v3d_device_info.h"
27 #include "v3d_compiler.h"
28
29 #define ACC_INDEX 0
30 #define ACC_COUNT 6
31
32 /* RA nodes used to track RF registers with implicit writes */
33 #define IMPLICIT_RF_COUNT 1
34
35 #define PHYS_COUNT 64
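/* Editor's sketch (not in the original source) of the RA register-index
 * layout implied by the defines above:
 *
 *   with accumulators (e.g. V3D 4.2):   [0 .. 5]  -> r0..r5 (ACC)
 *                                       [6 .. 69] -> rf0..rf63 (PHYS)
 *   without accumulators (V3D 7.x):     [0 .. 63] -> rf0..rf63 (PHYS)
 *
 * get_phys_index() below returns the first physical-register index of
 * this space for the current device.
 */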
36
37 static uint8_t
38 get_phys_index(const struct v3d_device_info *devinfo)
39 {
40 if (devinfo->has_accumulators)
41 return ACC_INDEX + ACC_COUNT;
42 else
43 return 0;
44 }
45
46 /* ACC as accumulator */
47 #define CLASS_BITS_PHYS (1 << 0)
48 #define CLASS_BITS_ACC (1 << 1)
49 #define CLASS_BITS_R5 (1 << 4)
50
51 static inline bool
52 stage_has_payload(struct v3d_compile *c)
53 {
54 return c->s->info.stage == MESA_SHADER_FRAGMENT ||
55 c->s->info.stage == MESA_SHADER_COMPUTE;
56 }
57
58 static uint8_t
59 get_class_bit_any(const struct v3d_device_info *devinfo)
60 {
61 if (devinfo->has_accumulators)
62 return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
63 else
64 return CLASS_BITS_PHYS;
65 }
66
67 static uint8_t
68 filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
69 {
70 if (!devinfo->has_accumulators) {
71 assert(class_bits & CLASS_BITS_PHYS);
72 class_bits = CLASS_BITS_PHYS;
73 }
74 return class_bits;
75 }
76
77 static inline uint32_t
78 temp_to_node(struct v3d_compile *c, uint32_t temp)
79 {
80 return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
81 IMPLICIT_RF_COUNT);
82 }
83
84 static inline uint32_t
85 node_to_temp(struct v3d_compile *c, uint32_t node)
86 {
87 assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
88 (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
89 return node - (c->devinfo->has_accumulators ? ACC_COUNT :
90 IMPLICIT_RF_COUNT);
91 }
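/* Editor's note: the mapping above mirrors the fixed nodes created in
 * v3d_register_allocate(): with accumulators, nodes [0 .. ACC_COUNT-1]
 * are reserved for r0..r5 and temp T lives at node T + 6; without
 * accumulators, node [0] tracks the implicitly-written rf register and
 * temp T lives at node T + 1.
 */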
92
93 static inline uint8_t
94 get_temp_class_bits(struct v3d_compile *c,
95 uint32_t temp)
96 {
97 return c->nodes.info[temp_to_node(c, temp)].class_bits;
98 }
99
100 static inline void
101 set_temp_class_bits(struct v3d_compile *c,
102 uint32_t temp, uint8_t class_bits)
103 {
104 c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
105 }
106
107 static struct ra_class *
108 choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
109 {
110 if (class_bits == CLASS_BITS_PHYS) {
111 return c->compiler->reg_class_phys[c->thread_index];
112 } else if (class_bits == (CLASS_BITS_R5)) {
113 assert(c->devinfo->has_accumulators);
114 return c->compiler->reg_class_r5[c->thread_index];
115 } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
116 assert(c->devinfo->has_accumulators);
117 return c->compiler->reg_class_phys_or_acc[c->thread_index];
118 } else {
119 assert(class_bits == get_class_bit_any(c->devinfo));
120 return c->compiler->reg_class_any[c->thread_index];
121 }
122 }
123
124 static inline struct ra_class *
125 choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
126 {
127 assert(temp < c->num_temps && temp < c->nodes.alloc_count);
128 return choose_reg_class(c, get_temp_class_bits(c, temp));
129 }
130
131 static inline bool
132 qinst_writes_tmu(const struct v3d_device_info *devinfo,
133 struct qinst *inst)
134 {
135 return (inst->dst.file == QFILE_MAGIC &&
136 v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index)) ||
137 inst->qpu.sig.wrtmuc;
138 }
139
140 static bool
141 is_end_of_tmu_sequence(const struct v3d_device_info *devinfo,
142 struct qinst *inst, struct qblock *block)
143 {
144 /* Only tmuwt and ldtmu can finish TMU sequences */
145 bool is_tmuwt = inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
146 inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
147 bool is_ldtmu = inst->qpu.sig.ldtmu;
148 if (!is_tmuwt && !is_ldtmu)
149 return false;
150
151 /* Check if this is the last tmuwt or ldtmu in the sequence */
152 list_for_each_entry_from(struct qinst, scan_inst, inst->link.next,
153 &block->instructions, link) {
154 is_tmuwt = scan_inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
155 scan_inst->qpu.alu.add.op == V3D_QPU_A_TMUWT;
156 is_ldtmu = scan_inst->qpu.sig.ldtmu;
157
158 if (is_tmuwt || is_ldtmu)
159 return false;
160
161 if (qinst_writes_tmu(devinfo, scan_inst))
162 return true;
163 }
164
165 return true;
166 }
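/* Editor's sketch of a typical TMU sequence as seen by the helpers above
 * (illustrative only):
 *
 *     mov  tmud, data          ; TMU write setup
 *     add  tmua, base, offset  ; starts the sequence
 *     ...
 *     ldtmu result             ; last ldtmu/tmuwt before the next TMU
 *                              ; write -> ends the sequence
 *
 * A tmuwt or ldtmu only ends the sequence if no further tmuwt/ldtmu shows
 * up before the next instruction that writes the TMU.
 */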
167
168 static bool
169 vir_is_mov_uniform(struct v3d_compile *c, int temp)
170 {
171 struct qinst *def = c->defs[temp];
172
173 return def && def->qpu.sig.ldunif;
174 }
175
176 static bool
177 can_reconstruct_inst(struct qinst *inst)
178 {
179 assert(inst);
180
181 if (vir_is_add(inst)) {
182 switch (inst->qpu.alu.add.op) {
183 case V3D_QPU_A_FXCD:
184 case V3D_QPU_A_FYCD:
185 case V3D_QPU_A_XCD:
186 case V3D_QPU_A_YCD:
187 case V3D_QPU_A_IID:
188 case V3D_QPU_A_EIDX:
189 case V3D_QPU_A_TIDX:
190 case V3D_QPU_A_SAMPID:
191 /* No need to check input unpacks because none of these
192 * opcodes read sources. FXCD, FYCD have pack variants.
193 */
194 return inst->qpu.flags.ac == V3D_QPU_COND_NONE &&
195 inst->qpu.flags.auf == V3D_QPU_UF_NONE &&
196 inst->qpu.flags.apf == V3D_QPU_PF_NONE &&
197 inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE;
198 default:
199 return false;
200 }
201 }
202
203 return false;
204 }
205
206 static bool
207 can_reconstruct_temp(struct v3d_compile *c, int temp)
208 {
209 struct qinst *def = c->defs[temp];
210 return def && can_reconstruct_inst(def);
211 }
212
213 static struct qreg
214 reconstruct_temp(struct v3d_compile *c, enum v3d_qpu_add_op op)
215 {
216 struct qreg dest;
217 switch (op) {
218 case V3D_QPU_A_FXCD:
219 dest = vir_FXCD(c);
220 break;
221 case V3D_QPU_A_FYCD:
222 dest = vir_FYCD(c);
223 break;
224 case V3D_QPU_A_XCD:
225 dest = vir_XCD(c);
226 break;
227 case V3D_QPU_A_YCD:
228 dest = vir_YCD(c);
229 break;
230 case V3D_QPU_A_IID:
231 dest = vir_IID(c);
232 break;
233 case V3D_QPU_A_EIDX:
234 dest = vir_EIDX(c);
235 break;
236 case V3D_QPU_A_TIDX:
237 dest = vir_TIDX(c);
238 break;
239 case V3D_QPU_A_SAMPID:
240 dest = vir_SAMPID(c);
241 break;
242 default:
243 unreachable("Unexpected opcode for reconstruction");
244 }
245
246 return dest;
247 }
248
249 enum temp_spill_type {
250 SPILL_TYPE_UNIFORM,
251 SPILL_TYPE_RECONSTRUCT,
252 SPILL_TYPE_TMU
253 };
254
255 static enum temp_spill_type
256 get_spill_type_for_temp(struct v3d_compile *c, int temp)
257 {
258 if (vir_is_mov_uniform(c, temp))
259 return SPILL_TYPE_UNIFORM;
260
261 if (can_reconstruct_temp(c, temp))
262 return SPILL_TYPE_RECONSTRUCT;
263
264 return SPILL_TYPE_TMU;
265 }
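/* Editor's note: the three strategies differ a lot in cost.
 * SPILL_TYPE_UNIFORM re-emits an ldunif at each use and
 * SPILL_TYPE_RECONSTRUCT re-emits the original ALU op (FXCD, TIDX, ...),
 * neither of which touches memory. SPILL_TYPE_TMU stores the value to
 * scratch and fills it back through the TMU with an extra thread switch,
 * which is why v3d_choose_spill_node() weighs it with tmu_scale below.
 */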
266
267 static int
268 v3d_choose_spill_node(struct v3d_compile *c)
269 {
270 const float tmu_scale = 10;
271 float block_scale = 1.0;
272 float spill_costs[c->num_temps];
273 bool in_tmu_operation = false;
274 bool rtop_hazard = false;
275 bool started_last_seg = false;
276
277 for (unsigned i = 0; i < c->num_temps; i++)
278 spill_costs[i] = 0.0;
279
280 /* XXX: Scale the cost up when inside of a loop. */
281 vir_for_each_block(block, c) {
282 vir_for_each_inst(inst, block) {
283 /* RTOP is not preserved across thread switches, so
284 * we can't spill in the middle of multop + umul24.
285 */
286 bool is_multop = false;
287 bool is_umul24 = false;
288 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
289 if (inst->qpu.alu.mul.op == V3D_QPU_M_MULTOP) {
290 is_multop = true;
291 rtop_hazard = true;
292 } else if (inst->qpu.alu.mul.op == V3D_QPU_M_UMUL24) {
293 is_umul24 = true;
294 }
295 }
296
297 /* We can't insert new thread switches after
298 * starting output writes.
299 */
300 bool no_spilling =
301 (c->threads > 1 && started_last_seg) ||
302 (c->max_tmu_spills == 0);
303
304 /* Discourage spilling of TMU operations */
305 for (int i = 0; i < vir_get_nsrc(inst); i++) {
306 if (inst->src[i].file != QFILE_TEMP)
307 continue;
308
309 int temp = inst->src[i].index;
310 enum temp_spill_type spill_type =
311 get_spill_type_for_temp(c, temp);
312
313 if (spill_type != SPILL_TYPE_TMU) {
314 spill_costs[temp] += block_scale;
315 } else if (!no_spilling && (!rtop_hazard || is_multop)) {
316 float tmu_op_scale = in_tmu_operation ?
317 3.0 : 1.0;
318 spill_costs[temp] += (block_scale *
319 tmu_scale *
320 tmu_op_scale);
321 } else {
322 BITSET_CLEAR(c->spillable, temp);
323 }
324 }
325
326 if (inst->dst.file == QFILE_TEMP) {
327 int temp = inst->dst.index;
328 enum temp_spill_type spill_type =
329 get_spill_type_for_temp(c, temp);
330
331 if (spill_type != SPILL_TYPE_TMU) {
332 /* We just rematerialize it later */
333 } else if (!no_spilling && (!rtop_hazard || is_umul24)) {
334 spill_costs[temp] += (block_scale *
335 tmu_scale);
336 } else {
337 BITSET_CLEAR(c->spillable, temp);
338 }
339 }
340
341 /* Refuse to spill a ldvary's dst, because that means
342 * that ldvary's r5 would end up being used across a
343 * thrsw.
344 */
345 if (inst->qpu.sig.ldvary) {
346 assert(inst->dst.file == QFILE_TEMP);
347 BITSET_CLEAR(c->spillable, inst->dst.index);
348 }
349
350 if (inst->is_last_thrsw)
351 started_last_seg = true;
352
353 /* Track when we're in between a TMU setup and the
354 * final LDTMU or TMUWT from that TMU setup. We
355 * penalize spills during that time.
356 */
357 if (is_end_of_tmu_sequence(c->devinfo, inst, block))
358 in_tmu_operation = false;
359
360 if (qinst_writes_tmu(c->devinfo, inst))
361 in_tmu_operation = true;
362
363 if (is_umul24)
364 rtop_hazard = false;
365 }
366 }
367
368 /* We always emit a "last thrsw" to ensure all our spilling occurs
369 * before the last thread section. See vir_emit_last_thrsw.
370 */
371 assert(started_last_seg);
372
373 for (unsigned i = 0; i < c->num_temps; i++) {
374 if (BITSET_TEST(c->spillable, i)) {
375 ra_set_node_spill_cost(c->g, temp_to_node(c, i),
376 spill_costs[i]);
377 }
378 }
379
380 return ra_get_best_spill_node(c->g);
381 }
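/* Editor's illustration of the cost model above (assuming block_scale is
 * 1.0): a rematerializable or uniform temp accumulates roughly 1 per read,
 * while a temp that would need real TMU traffic accumulates roughly 10 per
 * access (about 30 for reads inside a TMU sequence), so the cheaper
 * rematerializable temps are preferred as spill candidates.
 */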
382
383 static void
384 ensure_nodes(struct v3d_compile *c)
385 {
386 if (c->num_temps < c->nodes.alloc_count)
387 return;
388
389 c->nodes.alloc_count *= 2;
390 c->nodes.info = reralloc_array_size(c,
391 c->nodes.info,
392 sizeof(c->nodes.info[0]),
393 c->nodes.alloc_count +
394 MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
395 }
396
397 /* Creates the interference node for a new temp. We use this to keep the node
398 * list updated during the spilling process, which generates new temps/nodes.
399 */
400 static int
401 add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
402 {
403 ensure_nodes(c);
404
405 int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
406 assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
407 node == temp + IMPLICIT_RF_COUNT);
408
409 /* We fill the node priority after we are done inserting spills */
410 c->nodes.info[node].class_bits = class_bits;
411 c->nodes.info[node].priority = 0;
412 c->nodes.info[node].is_ldunif_dst = false;
413 c->nodes.info[node].is_program_end = false;
414 c->nodes.info[node].unused = false;
415 c->nodes.info[node].payload_conflict = false;
416
417 return node;
418 }
419
420 /* The spill offset for this thread takes a bit of setup, so do it once at
421 * program start.
422 */
423 void
424 v3d_setup_spill_base(struct v3d_compile *c)
425 {
426 /* Setting up the spill base is done in the entry block, so change
427 * both the current block to emit and the cursor.
428 */
429 struct qblock *current_block = c->cur_block;
430 c->cur_block = vir_entry_block(c);
431 c->cursor = vir_before_block(c->cur_block);
432
433 int start_num_temps = c->num_temps;
434
435 /* Each thread wants to be in a separate region of the scratch space
436 * so that the QPUs aren't fighting over cache lines. We have the
437 * driver keep a single global spill BO rather than
438 * per-spilling-program BOs, so we need a uniform from the driver for
439 * what the per-thread scale is.
440 */
441 struct qreg thread_offset =
442 vir_UMUL(c,
443 vir_TIDX(c),
444 vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));
445
446 /* Each channel in a reg is 4 bytes, so scale them up by that. */
447 struct qreg element_offset = vir_SHL(c, vir_EIDX(c),
448 vir_uniform_ui(c, 2));
449
450 c->spill_base = vir_ADD(c,
451 vir_ADD(c, thread_offset, element_offset),
452 vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
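/* Editor's note: the per-channel scratch address therefore ends up
 * roughly as
 *
 *   spill_base = tidx * spill_size_per_thread + eidx * 4 + offset
 *
 * and each spilled temp later adds its own spill_offset on top of this,
 * one V3D_CHANNELS * 4 byte slot per temp (see v3d_spill_reg).
 */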
453
454 /* Make sure that we don't spill the spilling setup instructions. */
455 for (int i = start_num_temps; i < c->num_temps; i++) {
456 BITSET_CLEAR(c->spillable, i);
457
458 /* If we are spilling, update the RA map with the temps added
459 * by the spill setup. Our spill_base register can never be an
460 * accumulator because it is used for TMU spill/fill and thus
461 * needs to persist across thread switches.
462 */
463 if (c->spilling) {
464 int temp_class = CLASS_BITS_PHYS;
465 if (c->devinfo->has_accumulators &&
466 i != c->spill_base.index) {
467 temp_class |= CLASS_BITS_ACC;
468 }
469 int node = add_node(c, i, temp_class);
470 c->nodes.info[node].payload_conflict =
471 stage_has_payload(c);
472 }
473 }
474
475 /* Restore the current block. */
476 c->cur_block = current_block;
477 c->cursor = vir_after_block(c->cur_block);
478 }
479
480 /**
481 * Computes the address for a spill/fill sequence and completes the spill/fill
482 * sequence by emitting the following code:
483 *
484 * ldunif.spill_offset
485 * add tmua spill_base spill_offset
486 * thrsw
487 *
488 * If the sequence is for a spill, then it will emit a tmuwt after the thrsw,
489 * otherwise it will emit an ldtmu to load the fill result into 'fill_dst'.
490 *
491 * The parameter 'ip' represents the ip at which the spill/fill is happening.
492 * This is used to disallow accumulators on temps that cross this ip boundary
493 * due to the new thrsw introduced in the sequence above.
494 */
495 static void
496 v3d_emit_spill_tmua(struct v3d_compile *c,
497 uint32_t spill_offset,
498 enum v3d_qpu_cond cond,
499 int32_t ip,
500 struct qreg *fill_dst)
501 {
502 assert(ip >= 0);
503
504 /* Load a uniform with the spill offset and add it to the spill base
505 * to obtain the TMUA address. It can be of class ANY because we know
506 * we are consuming it immediately without thrsw in between.
507 */
508 assert(c->disable_ldunif_opt);
509 struct qreg offset = vir_uniform_ui(c, spill_offset);
510 add_node(c, offset.index, get_class_bit_any(c->devinfo));
511
512 /* We always enable per-quad on spills/fills to ensure we spill
513 * any channels involved with helper invocations, but only if
514 * the spill is not conditional, since otherwise we may be spilling
515 * invalid lanes and overwriting valid data from a previous spill
516 * to the same address.
517 */
518 struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
519 struct qinst *inst = vir_ADD_dest(c, tmua, c->spill_base, offset);
520 inst->qpu.flags.ac = cond;
521 inst->ldtmu_count = 1;
522 inst->uniform =
523 vir_get_uniform_index(c, QUNIFORM_CONSTANT,
524 cond != V3D_QPU_COND_NONE ?
525 0xffffffff : 0xffffff7f /* per-quad */);
526
527 vir_emit_thrsw(c);
528
529 /* If this is for a spill, emit a TMUWT otherwise a LDTMU to load the
530 * result of the fill. The TMUWT temp is not really read, the ldtmu
531 * temp will be used immediately so just like the uniform above we
532 * can allow accumulators.
533 */
534 int temp_class =
535 filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
536 if (!fill_dst) {
537 struct qreg dst = vir_TMUWT(c);
538 assert(dst.file == QFILE_TEMP);
539 add_node(c, dst.index, temp_class);
540 } else {
541 *fill_dst = vir_LDTMU(c);
542 assert(fill_dst->file == QFILE_TEMP);
543 add_node(c, fill_dst->index, temp_class);
544 }
545
546 /* Temps across the thread switch we injected can't be assigned to
547 * accumulators.
548 *
549 * Fills inject code before ip, so anything that starts at ip or later
550 * is not affected by the thrsw. Something that ends at ip will be
551 * affected though.
552 *
553 * Spills inject code after ip, so anything that starts strictly later
554 * than ip is not affected (the temp starting at ip is usually the
555 * spilled temp except for postponed spills). Something that ends at ip
556 * won't be affected either.
557 */
558 for (int i = 0; i < c->spill_start_num_temps; i++) {
559 bool thrsw_cross = fill_dst ?
560 c->temp_start[i] < ip && c->temp_end[i] >= ip :
561 c->temp_start[i] <= ip && c->temp_end[i] > ip;
562 if (thrsw_cross) {
563 ra_set_node_class(c->g, temp_to_node(c, i),
564 choose_reg_class(c, CLASS_BITS_PHYS));
565 }
566 }
567 }
568
569 static void
570 v3d_emit_tmu_spill(struct v3d_compile *c,
571 struct qinst *inst,
572 struct qreg spill_temp,
573 struct qinst *position,
574 uint32_t ip,
575 uint32_t spill_offset)
576 {
577 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
578 assert(inst->dst.file == QFILE_TEMP);
579
580 c->cursor = vir_after_inst(position);
581
582 enum v3d_qpu_cond cond = vir_get_cond(inst);
583
584 /* If inst and position don't match, this is a postponed spill,
585 * in which case we have already allocated the temp for the spill
586 * and we should use that, otherwise create a new temp with the
587 * same register class bits as the original.
588 */
589 if (inst == position) {
590 uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
591 inst->dst = vir_get_temp(c);
592 add_node(c, inst->dst.index, class_bits);
593 } else {
594 inst->dst = spill_temp;
595
596 /* If this is a postponed spill the register being spilled may
597 * have been written more than once including conditional
598 * writes, so ignore predication on the spill instruction and
599 * always spill the full register.
600 */
601 cond = V3D_QPU_COND_NONE;
602 }
603
604 struct qinst *tmp =
605 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
606 inst->dst);
607 tmp->qpu.flags.mc = cond;
608
609 v3d_emit_spill_tmua(c, spill_offset, cond, ip, NULL);
610
611 c->spills++;
612 c->tmu_dirty_rcl = true;
613 }
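/* Editor's sketch of the code the two helpers above emit for a TMU spill
 * (fills call v3d_emit_spill_tmua directly, skip the tmud mov and end in
 * an ldtmu instead of a tmuwt):
 *
 *     mov    tmud, spilled_temp
 *     ldunif spill_offset
 *     add    tmuau, spill_base, spill_offset  ; per-quad unless conditional
 *     thrsw
 *     tmuwt
 */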
614
615 static inline bool
616 interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
617 {
618 return !(t0_start >= t1_end || t1_start >= t0_end);
619 }
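/* Editor's example: live ranges are effectively half-open here, so a temp
 * ending at ip N and another starting at ip N do not interfere
 * (t1_start >= t0_end), while any real overlap does.
 */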
620
621 static void
622 v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
623 int spill_temp)
624 {
625 c->spill_start_num_temps = c->num_temps;
626 c->spilling = true;
627
628 enum temp_spill_type spill_type = get_spill_type_for_temp(c, spill_temp);
629
630 uint32_t spill_offset = 0;
631 if (spill_type == SPILL_TYPE_TMU) {
632 spill_offset = c->spill_size;
633 c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
634
635 if (spill_offset == 0) {
636 v3d_setup_spill_base(c);
637
638 /* Don't allocate our spill base to rf0 to avoid
639 * conflicts with instructions doing implicit writes
640 * to that register.
641 */
642 if (!c->devinfo->has_accumulators) {
643 ra_add_node_interference(
644 c->g,
645 temp_to_node(c, c->spill_base.index),
646 implicit_rf_nodes[0]);
647 }
648 }
649 }
650
651 struct qinst *last_thrsw = c->last_thrsw;
652 assert(last_thrsw && last_thrsw->is_last_thrsw);
653
654 int uniform_index = ~0;
655 if (spill_type == SPILL_TYPE_UNIFORM) {
656 struct qinst *orig_unif = c->defs[spill_temp];
657 uniform_index = orig_unif->uniform;
658 }
659
660 enum v3d_qpu_add_op reconstruct_op = V3D_QPU_A_NOP;
661 if (spill_type == SPILL_TYPE_RECONSTRUCT) {
662 struct qinst *orig_def = c->defs[spill_temp];
663 assert(vir_is_add(orig_def));
664 reconstruct_op = orig_def->qpu.alu.add.op;
665 }
666
667 uint32_t spill_node = temp_to_node(c, spill_temp);
668
669 /* We must disable the ldunif optimization if we are spilling uniforms */
670 bool had_disable_ldunif_opt = c->disable_ldunif_opt;
671 c->disable_ldunif_opt = true;
672
673 struct qinst *start_of_tmu_sequence = NULL;
674 struct qinst *postponed_spill = NULL;
675 struct qreg postponed_spill_temp = { 0 };
676 vir_for_each_block(block, c) {
677 vir_for_each_inst_safe(inst, block) {
678 int32_t ip = inst->ip;
679
680 /* Track when we're in between a TMU setup and the final
681 * LDTMU or TMUWT from that TMU setup. We can't spill/fill any
682 * temps during that time, because that involves inserting a
683 * new TMU setup/LDTMU sequence, so we postpone the spill or
684 * move the fill up to not intrude in the middle of the TMU
685 * sequence.
686 */
687 if (is_end_of_tmu_sequence(c->devinfo, inst, block)) {
688 if (postponed_spill) {
689 v3d_emit_tmu_spill(c, postponed_spill,
690 postponed_spill_temp,
691 inst, ip, spill_offset);
692 }
693
694 start_of_tmu_sequence = NULL;
695 postponed_spill = NULL;
696 }
697
698 if (!start_of_tmu_sequence &&
699 qinst_writes_tmu(c->devinfo, inst)) {
700 start_of_tmu_sequence = inst;
701 }
702
703 /* fills */
704 int filled_src = -1;
705 for (int i = 0; i < vir_get_nsrc(inst); i++) {
706 if (inst->src[i].file != QFILE_TEMP ||
707 inst->src[i].index != spill_temp) {
708 continue;
709 }
710
711 if (filled_src >= 0) {
712 inst->src[i] = inst->src[filled_src];
713 continue;
714 }
715
716 c->cursor = vir_before_inst(inst);
717
718 if (spill_type == SPILL_TYPE_UNIFORM) {
719 struct qreg unif =
720 vir_uniform(c,
721 c->uniform_contents[uniform_index],
722 c->uniform_data[uniform_index]);
723 inst->src[i] = unif;
724 /* We are using the uniform in the
725 * instruction immediately after, so
726 * we can use any register class for it.
727 */
728 add_node(c, unif.index,
729 get_class_bit_any(c->devinfo));
730 } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
731 struct qreg temp =
732 reconstruct_temp(c, reconstruct_op);
733 inst->src[i] = temp;
734 /* We are using the temp in the
735 * instruction immediately after so we
736 * can use ACC.
737 */
738 int temp_class =
739 filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
740 CLASS_BITS_ACC);
741 add_node(c, temp.index, temp_class);
742 } else {
743 /* If we have a postponed spill, we
744 * don't need a fill as the temp would
745 * not have been spilled yet, however,
746 * we need to update the temp index.
747 */
748 if (postponed_spill) {
749 inst->src[i] =
750 postponed_spill_temp;
751 } else {
752 int32_t fill_ip = ip;
753 if (start_of_tmu_sequence) {
754 c->cursor = vir_before_inst(start_of_tmu_sequence);
755 fill_ip = start_of_tmu_sequence->ip;
756 }
757
758 v3d_emit_spill_tmua(c, spill_offset,
759 V3D_QPU_COND_NONE,
760 fill_ip, &inst->src[i]);
761 c->fills++;
762 }
763 }
764
765 filled_src = i;
766 }
767
768 /* spills */
769 if (inst->dst.file == QFILE_TEMP &&
770 inst->dst.index == spill_temp) {
771 if (spill_type != SPILL_TYPE_TMU) {
772 c->cursor.link = NULL;
773 vir_remove_instruction(c, inst);
774 } else {
775 /* If we are in the middle of a TMU
776 * sequence, we postpone the actual
777 * spill until we have finished it. We
778 * still need to replace the spill temp
779 * with a new temp though.
780 */
781 if (start_of_tmu_sequence) {
782 if (postponed_spill) {
783 postponed_spill->dst =
784 postponed_spill_temp;
785 }
786 if (!postponed_spill ||
787 vir_get_cond(inst) == V3D_QPU_COND_NONE) {
788 postponed_spill_temp =
789 vir_get_temp(c);
790 add_node(c,
791 postponed_spill_temp.index,
792 c->nodes.info[spill_node].class_bits);
793 }
794 postponed_spill = inst;
795 } else {
796 v3d_emit_tmu_spill(c, inst,
797 postponed_spill_temp,
798 inst, ip,
799 spill_offset);
800 }
801 }
802 }
803 }
804 }
805
806 /* Make sure c->last_thrsw is the actual last thrsw, not just one we
807 * inserted in our most recent unspill.
808 */
809 c->last_thrsw = last_thrsw;
810
811 /* Don't allow spilling of our spilling instructions. There's no way
812 * they can help get things colored.
813 */
814 for (int i = c->spill_start_num_temps; i < c->num_temps; i++)
815 BITSET_CLEAR(c->spillable, i);
816
817 /* Reset interference for spilled node */
818 ra_set_node_spill_cost(c->g, spill_node, 0);
819 ra_reset_node_interference(c->g, spill_node);
820 BITSET_CLEAR(c->spillable, spill_temp);
821
822 /* Rebuild program ips */
823 int32_t ip = 0;
824 vir_for_each_inst_inorder(inst, c)
825 inst->ip = ip++;
826
827 /* Rebuild liveness */
828 vir_calculate_live_intervals(c);
829
830 /* Add interferences for the new spilled temps and update interferences
831 * for c->spill_base (since we may have modified its liveness). Also,
832 * update node priorities based on new liveness data.
833 */
834 uint32_t sb_temp = c->spill_base.index;
835 uint32_t sb_node = temp_to_node(c, sb_temp);
836 for (uint32_t i = 0; i < c->num_temps; i++) {
837 if (c->temp_end[i] == -1)
838 continue;
839
840 uint32_t node_i = temp_to_node(c, i);
841 c->nodes.info[node_i].priority =
842 c->temp_end[i] - c->temp_start[i];
843
844 for (uint32_t j = MAX2(i + 1, c->spill_start_num_temps);
845 j < c->num_temps; j++) {
846 if (interferes(c->temp_start[i], c->temp_end[i],
847 c->temp_start[j], c->temp_end[j])) {
848 uint32_t node_j = temp_to_node(c, j);
849 ra_add_node_interference(c->g, node_i, node_j);
850 }
851 }
852
853 if (spill_type == SPILL_TYPE_TMU) {
854 if (i != sb_temp &&
855 interferes(c->temp_start[i], c->temp_end[i],
856 c->temp_start[sb_temp], c->temp_end[sb_temp])) {
857 ra_add_node_interference(c->g, node_i, sb_node);
858 }
859 }
860 }
861
862 c->disable_ldunif_opt = had_disable_ldunif_opt;
863 c->spilling = false;
864 }
865
866 struct v3d_ra_select_callback_data {
867 uint32_t phys_index;
868 uint32_t next_acc;
869 uint32_t next_phys;
870 struct v3d_ra_node_info *nodes;
871 const struct v3d_device_info *devinfo;
872 };
873
874 /* Choosing accumulators improves chances of merging QPU instructions
875 * due to these merges requiring that at most 2 rf registers are used
876 * by the add and mul instructions.
877 */
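/* Editor's example (simplified): an add and a mul such as
 * "add rf1, rf2, rf3" + "mul rf4, rf5, rf6" reference too many regfile
 * registers to be packed into one QPU instruction, whereas sourcing some
 * of those operands from r0-r5 keeps the pair within the two-rf limit
 * mentioned above.
 */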
878 static bool
879 v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
880 BITSET_WORD *regs,
881 int priority)
882 {
883 if (!v3d_ra->devinfo->has_accumulators)
884 return false;
885
886 /* Favor accumulators if we have fewer than this number of physical
887 * registers. Accumulators have more restrictions (like being
888 * invalidated through thrsw), so running out of physical registers
889 * even if we have accumulators available can lead to register
890 * allocation failures.
891 */
892 static const int available_rf_threshold = 5;
893 int available_rf = 0;
894 for (int i = 0; i < PHYS_COUNT; i++) {
895 if (BITSET_TEST(regs, v3d_ra->phys_index + i))
896 available_rf++;
897 if (available_rf >= available_rf_threshold)
898 break;
899 }
900 if (available_rf < available_rf_threshold)
901 return true;
902
903 /* Favor accumulators for short-lived temps (our priority represents
904 * liveness), to prevent long-lived temps from grabbing accumulators
905 * and preventing follow-up instructions from using them, potentially
906 * leading to large portions of the shader being unable to use
907 * accumulators and therefore merge instructions successfully.
908 */
909 static const int priority_threshold = 20;
910 if (priority <= priority_threshold)
911 return true;
912
913 return false;
914 }
915
916 static bool
917 v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
918 BITSET_WORD *regs,
919 unsigned int *out)
920 {
921 if (!v3d_ra->devinfo->has_accumulators)
922 return false;
923
924 /* Choose r5 for our ldunifs if possible (nobody else can load to that
925 * reg, and it keeps the QPU cond field free from being occupied by
926 * ldunifrf).
927 */
928 int r5 = ACC_INDEX + 5;
929 if (BITSET_TEST(regs, r5)) {
930 *out = r5;
931 return true;
932 }
933
934 /* Round-robin through our accumulators to give post-RA instruction
935 * selection more options.
936 */
937 for (int i = 0; i < ACC_COUNT; i++) {
938 int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
939 int acc = ACC_INDEX + acc_off;
940
941 if (BITSET_TEST(regs, acc)) {
942 v3d_ra->next_acc = acc_off + 1;
943 *out = acc;
944 return true;
945 }
946 }
947
948 return false;
949 }
950
951 static bool
952 v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
953 unsigned int node,
954 BITSET_WORD *regs,
955 unsigned int *out)
956 {
957 /* If this node is for an unused temp, ignore. */
958 if (v3d_ra->nodes->info[node].unused) {
959 *out = 0;
960 return true;
961 }
962
963 /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
964 * so we can avoid turning them into ldunifrf (which uses the
965 * cond field to encode the dst and would prevent merge with
966 * instructions that use cond flags).
967 */
968 if (v3d_ra->nodes->info[node].is_ldunif_dst &&
969 BITSET_TEST(regs, v3d_ra->phys_index)) {
970 assert(v3d_ra->devinfo->ver >= 71);
971 *out = v3d_ra->phys_index;
972 return true;
973 }
974
975 /* The last 3 instructions in a shader can't use some specific registers
976 * (usually early rf registers, depends on v3d version) so try to
977 * avoid allocating these to registers used by the last instructions
978 * in the shader. Do the same for spilling setup instructions that
979 * may conflict with payload registers.
980 */
981 const uint32_t safe_rf_start = v3d_ra->devinfo->ver == 42 ? 3 : 4;
982 if ((v3d_ra->nodes->info[node].is_program_end ||
983 v3d_ra->nodes->info[node].payload_conflict) &&
984 v3d_ra->next_phys < safe_rf_start) {
985 v3d_ra->next_phys = safe_rf_start;
986 }
987
988 for (int i = 0; i < PHYS_COUNT; i++) {
989 int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
990
991 /* Try to keep rf0 available for ldunif in 7.x (see above). */
992 if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
993 continue;
994
995 int phys = v3d_ra->phys_index + phys_off;
996
997 if (BITSET_TEST(regs, phys)) {
998 v3d_ra->next_phys = phys_off + 1;
999 *out = phys;
1000 return true;
1001 }
1002 }
1003
1004 /* If we couldn't allocate, do try to assign rf0 if it is available. */
1005 if (v3d_ra->devinfo->ver >= 71 &&
1006 BITSET_TEST(regs, v3d_ra->phys_index)) {
1007 v3d_ra->next_phys = 1;
1008 *out = v3d_ra->phys_index;
1009 return true;
1010 }
1011
1012 return false;
1013 }
1014
1015 static unsigned int
1016 v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
1017 {
1018 struct v3d_ra_select_callback_data *v3d_ra = data;
1019
1020 unsigned int reg;
1021 if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->nodes->info[n].priority) &&
1022 v3d_ra_select_accum(v3d_ra, regs, &reg)) {
1023 return reg;
1024 }
1025
1026 if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
1027 return reg;
1028
1029 /* If we ran out of physical registers try to assign an accumulator
1030 * if we didn't favor that option earlier.
1031 */
1032 if (v3d_ra_select_accum(v3d_ra, regs, &reg))
1033 return reg;
1034
1035 unreachable("RA must pass us at least one possible reg.");
1036 }
1037
1038 bool
1039 vir_init_reg_sets(struct v3d_compiler *compiler)
1040 {
1041 /* Allocate up to 3 regfile classes, for the ways the physical
1042 * register file can be divided up for fragment shader threading.
1043 */
1044 int max_thread_index = 2;
1045 uint8_t phys_index = get_phys_index(compiler->devinfo);
1046
1047 compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
1048 false);
1049 if (!compiler->regs)
1050 return false;
1051
1052 for (int threads = 0; threads < max_thread_index; threads++) {
1053 compiler->reg_class_any[threads] =
1054 ra_alloc_contig_reg_class(compiler->regs, 1);
1055 if (compiler->devinfo->has_accumulators) {
1056 compiler->reg_class_r5[threads] =
1057 ra_alloc_contig_reg_class(compiler->regs, 1);
1058 compiler->reg_class_phys_or_acc[threads] =
1059 ra_alloc_contig_reg_class(compiler->regs, 1);
1060 }
1061 compiler->reg_class_phys[threads] =
1062 ra_alloc_contig_reg_class(compiler->regs, 1);
1063
1064 /* Init physical regs */
1065 for (int i = phys_index;
1066 i < phys_index + (PHYS_COUNT >> threads); i++) {
1067 if (compiler->devinfo->has_accumulators)
1068 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
1069 ra_class_add_reg(compiler->reg_class_phys[threads], i);
1070 ra_class_add_reg(compiler->reg_class_any[threads], i);
1071 }
1072
1073 /* Init accumulator regs */
1074 if (compiler->devinfo->has_accumulators) {
1075 for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
1076 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
1077 ra_class_add_reg(compiler->reg_class_any[threads], i);
1078 }
1079 /* r5 can only store a single 32-bit value, so not much can
1080 * use it.
1081 */
1082 ra_class_add_reg(compiler->reg_class_r5[threads],
1083 ACC_INDEX + 5);
1084 ra_class_add_reg(compiler->reg_class_any[threads],
1085 ACC_INDEX + 5);
1086 }
1087 }
1088
1089 ra_set_finalize(compiler->regs, NULL);
1090
1091 return true;
1092 }
1093
1094 static inline bool
1095 tmu_spilling_allowed(struct v3d_compile *c)
1096 {
1097 return c->spills + c->fills < c->max_tmu_spills;
1098 }
1099
1100 static bool
1101 reg_is_payload(struct v3d_compile *c, struct qreg reg)
1102 {
1103 if (reg.file != QFILE_REG)
1104 return false;
1105
1106 if (c->devinfo->ver >= 71) {
1107 if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1108 return reg.index >= 1 && reg.index <= 3;
1109 if (c->s->info.stage == MESA_SHADER_COMPUTE)
1110 return reg.index == 2 || reg.index == 3;
1111 return false;
1112 }
1113
1114 assert(c->devinfo->ver == 42);
1115 if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1116 return reg.index <= 2;
1117 if (c->s->info.stage == MESA_SHADER_COMPUTE)
1118 return reg.index == 0 || reg.index == 2;
1119 return false;
1120 }
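/* Editor's summary of the payload registers checked above:
 *
 *              fragment       compute
 *   V3D 4.2    rf0-rf2        rf0, rf2
 *   V3D 7.x    rf1-rf3        rf2, rf3
 */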
1121
1122 static bool
1123 inst_reads_payload(struct v3d_compile *c, struct qinst *inst)
1124 {
1125 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1126 return false;
1127
1128 if (reg_is_payload(c, inst->dst))
1129 return true;
1130
1131 if (reg_is_payload(c, inst->src[0]))
1132 return true;
1133
1134 if (vir_get_nsrc(inst) > 1 && reg_is_payload(c, inst->src[1]))
1135 return true;
1136
1137 return false;
1138 }
1139
1140 static void
1141 update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
1142 int *acc_nodes,
1143 int *implicit_rf_nodes,
1144 int last_ldvary_ip,
1145 bool has_payload,
1146 struct qinst *inst)
1147 {
1148 int32_t ip = inst->ip;
1149 assert(ip >= 0);
1150
1151 /* If the instruction writes r4 (and optionally moves its
1152 * result to a temp), nothing else can be stored in r4 across
1153 * it.
1154 */
1155 if (vir_writes_r4_implicitly(c->devinfo, inst)) {
1156 for (int i = 0; i < c->num_temps; i++) {
1157 if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1158 ra_add_node_interference(c->g,
1159 temp_to_node(c, i),
1160 acc_nodes[4]);
1161 }
1162 }
1163 }
1164
1165 /* If any instruction writes to a physical register implicitly
1166 * nothing else can write the same register across it.
1167 */
1168 if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
1169 for (int i = 0; i < c->num_temps; i++) {
1170 if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1171 ra_add_node_interference(c->g,
1172 temp_to_node(c, i),
1173 implicit_rf_nodes[0]);
1174 }
1175 }
1176 }
1177
1178 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
1179 switch (inst->qpu.alu.add.op) {
1180 case V3D_QPU_A_LDVPMV_IN:
1181 case V3D_QPU_A_LDVPMV_OUT:
1182 case V3D_QPU_A_LDVPMD_IN:
1183 case V3D_QPU_A_LDVPMD_OUT:
1184 case V3D_QPU_A_LDVPMP:
1185 case V3D_QPU_A_LDVPMG_IN:
1186 case V3D_QPU_A_LDVPMG_OUT: {
1187 /* LDVPMs only store to temps (the MA flag
1188 * decides whether the LDVPM is in or out)
1189 */
1190 assert(inst->dst.file == QFILE_TEMP);
1191 set_temp_class_bits(c, inst->dst.index,
1192 CLASS_BITS_PHYS);
1193 break;
1194 }
1195
1196 case V3D_QPU_A_RECIP:
1197 case V3D_QPU_A_RSQRT:
1198 case V3D_QPU_A_EXP:
1199 case V3D_QPU_A_LOG:
1200 case V3D_QPU_A_SIN:
1201 case V3D_QPU_A_RSQRT2: {
1202 /* The SFU instructions write directly to the
1203 * phys regfile.
1204 */
1205 assert(inst->dst.file == QFILE_TEMP);
1206 set_temp_class_bits(c, inst->dst.index,
1207 CLASS_BITS_PHYS);
1208 break;
1209 }
1210
1211 default:
1212 break;
1213 }
1214 }
1215
1216 if (inst->src[0].file == QFILE_REG) {
1217 switch (inst->src[0].index) {
1218 case 0:
1219 /* V3D 7.x doesn't use rf0 for thread payload */
1220 if (c->devinfo->ver >= 71)
1221 break;
1222 else
1223 FALLTHROUGH;
1224 case 1:
1225 case 2:
1226 case 3: {
1227 /* Payload setup instructions: Force allocate
1228 * the dst to the given register (so the MOV
1229 * will disappear).
1230 */
1231 assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
1232 assert(inst->dst.file == QFILE_TEMP);
1233 uint32_t node = temp_to_node(c, inst->dst.index);
1234 ra_set_node_reg(c->g, node,
1235 get_phys_index(c->devinfo) +
1236 inst->src[0].index);
1237 break;
1238 }
1239 }
1240 }
1241
1242 /* Don't allocate rf0 to temps that cross ranges where we have
1243 * live implicit rf0 writes from ldvary. We can identify these
1244 * by tracking the last ldvary instruction and explicit reads
1245 * of rf0.
1246 */
1247 if (c->devinfo->ver >= 71 &&
1248 ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
1249 (vir_get_nsrc(inst) > 1 &&
1250 inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
1251 for (int i = 0; i < c->num_temps; i++) {
1252 if (c->temp_start[i] < ip &&
1253 c->temp_end[i] > last_ldvary_ip) {
1254 ra_add_node_interference(c->g,
1255 temp_to_node(c, i),
1256 implicit_rf_nodes[0]);
1257 }
1258 }
1259 }
1260
1261 /* Spill setup instructions are the only ones that we emit before
1262 * reading payload registers so we want to flag their temps so we
1263 * don't assign them to payload registers and stomp them before we
1264 * can read them. For the case where we may have emitted spill setup
1265 * before RA (i.e. for scratch), we need to do this now.
1266 */
1267 if (c->spill_size > 0 && has_payload && inst_reads_payload(c, inst)) {
1268 struct qblock *first_block = vir_entry_block(c);
1269 list_for_each_entry_from_rev(struct qinst, _i, inst->link.prev,
1270 &first_block->instructions, link) {
1271 if (_i->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1272 continue;
1273 if (_i->dst.file == QFILE_TEMP) {
1274 int node = temp_to_node(c, _i->dst.index);
1275 c->nodes.info[node].payload_conflict = true;
1276 }
1277 if (_i->src[0].file == QFILE_TEMP) {
1278 int node = temp_to_node(c, _i->src[0].index);
1279 c->nodes.info[node].payload_conflict = true;
1280 }
1281 if (vir_get_nsrc(_i) > 1 && _i->src[1].file == QFILE_TEMP) {
1282 int node = temp_to_node(c, _i->src[1].index);
1283 c->nodes.info[node].payload_conflict = true;
1284 }
1285 }
1286 }
1287
1288 if (inst->dst.file == QFILE_TEMP) {
1289 /* Only a ldunif gets to write to R5, which only has a
1290 * single 32-bit channel of storage.
1291 *
1292 * NOTE: ldunifa could use r5 as well; however, going by
1293 * shader-db it is best to keep r5 exclusive to ldunif, probably
1294 * because ldunif usually has a shorter lifespan, allowing for
1295 * more accumulator reuse and QPU merges.
1296 */
1297 if (c->devinfo->has_accumulators) {
1298 if (!inst->qpu.sig.ldunif) {
1299 uint8_t class_bits =
1300 get_temp_class_bits(c, inst->dst.index) &
1301 ~CLASS_BITS_R5;
1302 set_temp_class_bits(c, inst->dst.index,
1303 class_bits);
1304
1305 }
1306 } else {
1307 /* Make sure we don't allocate the ldvary's
1308 * destination to rf0, since it would clash
1309 * with its implicit write to that register.
1310 */
1311 if (inst->qpu.sig.ldvary) {
1312 ra_add_node_interference(c->g,
1313 temp_to_node(c, inst->dst.index),
1314 implicit_rf_nodes[0]);
1315 }
1316 /* Flag dst temps from ldunif(a) instructions
1317 * so we can try to assign rf0 to them and avoid
1318 * converting these to ldunif(a)rf.
1319 */
1320 if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
1321 const uint32_t dst_n =
1322 temp_to_node(c, inst->dst.index);
1323 c->nodes.info[dst_n].is_ldunif_dst = true;
1324 }
1325 }
1326 }
1327
1328 /* All accumulators are invalidated across a thread switch. */
1329 if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
1330 for (int i = 0; i < c->num_temps; i++) {
1331 if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
1332 set_temp_class_bits(c, i,
1333 CLASS_BITS_PHYS);
1334 }
1335 }
1336 }
1337 }
1338
1339 static void
1340 flag_program_end_nodes(struct v3d_compile *c)
1341 {
1342 /* Only look for registers used in this many instructions */
1343 uint32_t last_set_count = 6;
1344
1345 struct qblock *last_block = vir_exit_block(c);
1346 list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
1347 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
1348 continue;
1349
1350 int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
1351 for (int i = 0; i < num_src; i++) {
1352 if (inst->src[i].file == QFILE_TEMP) {
1353 int node = temp_to_node(c, inst->src[i].index);
1354 c->nodes.info[node].is_program_end = true;
1355 }
1356 }
1357
1358 num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
1359 for (int i = 0; i < num_src; i++) {
1360 if (inst->src[i].file == QFILE_TEMP) {
1361 int node = temp_to_node(c, inst->src[i].index);
1362 c->nodes.info[node].is_program_end = true;
1363
1364 }
1365 }
1366
1367 if (inst->dst.file == QFILE_TEMP) {
1368 int node = temp_to_node(c, inst->dst.index);
1369 c->nodes.info[node].is_program_end = true;
1370 }
1371
1372 if (--last_set_count == 0)
1373 break;
1374 }
1375 }
1376
1377 /**
1378 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
1379 *
1380 * The return value should be freed by the caller.
1381 */
1382 struct qpu_reg *
1383 v3d_register_allocate(struct v3d_compile *c)
1384 {
1385 int acc_nodes[ACC_COUNT];
1386 int implicit_rf_nodes[IMPLICIT_RF_COUNT];
1387
1388 unsigned num_ra_nodes = c->num_temps;
1389 if (c->devinfo->has_accumulators)
1390 num_ra_nodes += ARRAY_SIZE(acc_nodes);
1391 else
1392 num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
1393
1394 c->nodes = (struct v3d_ra_node_info) {
1395 .alloc_count = c->num_temps,
1396 .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
1397 num_ra_nodes),
1398 };
1399
1400 uint32_t phys_index = get_phys_index(c->devinfo);
1401
1402 struct v3d_ra_select_callback_data callback_data = {
1403 .phys_index = phys_index,
1404 .next_acc = 0,
1405 /* Start at RF3, to try to keep the TLB writes from using
1406 * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
1407 * using RF2-3.
1408 */
1409 .next_phys = c->devinfo->ver == 42 ? 3 : 4,
1410 .nodes = &c->nodes,
1411 .devinfo = c->devinfo,
1412 };
1413
1414 vir_calculate_live_intervals(c);
1415
1416 /* Convert 1, 2, 4 threads to 0, 1, 2 index.
1417 *
1418 * V3D 4.x has double the physical register space, so 64 physical regs
1419 * are available at both 1x and 2x threading, and 4x has 32.
1420 */
1421 c->thread_index = ffs(c->threads) - 1;
1422 if (c->thread_index >= 1)
1423 c->thread_index--;
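/* Editor's note: the net mapping is 1x -> 0, 2x -> 0 and 4x -> 1,
 * matching vir_init_reg_sets() where class index 0 holds all 64
 * physical registers and class index 1 holds 32 (PHYS_COUNT >> threads).
 */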
1424
1425 c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
1426 ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
1427
1428 /* Make some fixed nodes for the accumulators, which we will need to
1429 * interfere with when ops have implied r3/r4 writes or for the thread
1430 * switches. We could represent these as classes for the nodes to
1431 * live in, but the classes take up a lot of memory to set up, so we
1432 * don't want to make too many. We use the same mechanism on platforms
1433 * without accumulators that can have implicit writes to phys regs.
1434 */
1435 for (uint32_t i = 0; i < num_ra_nodes; i++) {
1436 c->nodes.info[i].is_ldunif_dst = false;
1437 c->nodes.info[i].is_program_end = false;
1438 c->nodes.info[i].unused = false;
1439 c->nodes.info[i].priority = 0;
1440 c->nodes.info[i].class_bits = 0;
1441 c->nodes.info[i].payload_conflict = false;
1442 if (c->devinfo->has_accumulators && i < ACC_COUNT) {
1443 acc_nodes[i] = i;
1444 ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
1445 } else if (!c->devinfo->has_accumulators &&
1446 i < ARRAY_SIZE(implicit_rf_nodes)) {
1447 implicit_rf_nodes[i] = i;
1448 ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
1449 } else {
1450 uint32_t t = node_to_temp(c, i);
1451 c->nodes.info[i].priority =
1452 c->temp_end[t] - c->temp_start[t];
1453 c->nodes.info[i].class_bits =
1454 get_class_bit_any(c->devinfo);
1455 }
1456 }
1457
1458 /* Walk the instructions adding register class restrictions and
1459 * interferences.
1460 */
1461 int ip = 0;
1462 int last_ldvary_ip = -1;
1463 bool has_payload = stage_has_payload(c);
1464 vir_for_each_inst_inorder(inst, c) {
1465 inst->ip = ip++;
1466
1467 /* ldunif(a) always write to a temporary, so we have
1468 * liveness info available to decide if rf0 is
1469 * available for them, however, ldvary is different:
1470 * it always writes to rf0 directly so we don't have
1471 * liveness information for its implicit rf0 write.
1472 *
1473 * That means the allocator may assign rf0 to a temp
1474 * that is defined while an implicit rf0 write from
1475 * ldvary is still live. We fix that by manually
1476 * tracking rf0 live ranges from ldvary instructions.
1477 */
1478 if (inst->qpu.sig.ldvary)
1479 last_ldvary_ip = ip;
1480
1481 update_graph_and_reg_classes_for_inst(c, acc_nodes,
1482 implicit_rf_nodes,
1483 last_ldvary_ip,
1484 has_payload,
1485 inst);
1486 }
1487
1488 /* Flag the nodes that are used in the last instructions of the program
1489 * (there are some registers that cannot be used in the last 3
1490 * instructions). We only do this for fragment shaders, because the idea
1491 * is that by avoiding this conflict we may be able to emit the last
1492 * thread switch earlier in some cases. In non-fragment shaders, however,
1493 * this won't happen because the last instructions are always VPM stores
1494 * with a small immediate, which conflicts with other signals,
1495 * preventing us from ever moving the thrsw earlier.
1496 */
1497 if (c->s->info.stage == MESA_SHADER_FRAGMENT)
1498 flag_program_end_nodes(c);
1499
1500 /* Set the register classes for all our temporaries in the graph */
1501 for (uint32_t i = 0; i < c->num_temps; i++) {
1502 ra_set_node_class(c->g, temp_to_node(c, i),
1503 choose_reg_class_for_temp(c, i));
1504 }
1505
1506 /* Add register interferences based on liveness data */
1507 for (uint32_t i = 0; i < c->num_temps; i++) {
1508 /* And while we are here, let's also flag nodes for
1509 * unused temps.
1510 */
1511 if (c->temp_start[i] > c->temp_end[i])
1512 c->nodes.info[temp_to_node(c, i)].unused = true;
1513
1514 for (uint32_t j = i + 1; j < c->num_temps; j++) {
1515 if (interferes(c->temp_start[i], c->temp_end[i],
1516 c->temp_start[j], c->temp_end[j])) {
1517 ra_add_node_interference(c->g,
1518 temp_to_node(c, i),
1519 temp_to_node(c, j));
1520 }
1521 }
1522 }
1523
1524 /* Debug option to force a bit of TMU spilling, for running
1525 * across conformance tests to make sure that spilling works.
1526 */
1527 const int force_register_spills = 0;
1528 if (force_register_spills > 0)
1529 c->max_tmu_spills = UINT32_MAX;
1530
1531 struct qpu_reg *temp_registers = NULL;
1532 while (true) {
1533 if (c->spill_size <
1534 V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
1535 int node = v3d_choose_spill_node(c);
1536 uint32_t temp = node_to_temp(c, node);
1537 if (node != -1) {
1538 v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
1539 continue;
1540 }
1541 }
1542
1543 if (ra_allocate(c->g))
1544 break;
1545
1546 /* Failed allocation, try to spill */
1547 int node = v3d_choose_spill_node(c);
1548 if (node == -1)
1549 goto spill_fail;
1550
1551 uint32_t temp = node_to_temp(c, node);
1552 enum temp_spill_type spill_type =
1553 get_spill_type_for_temp(c, temp);
1554 if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
1555 v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
1556 if (c->spills + c->fills > c->max_tmu_spills)
1557 goto spill_fail;
1558 } else {
1559 goto spill_fail;
1560 }
1561 }
1562
1563 /* Allocation was successful, build the 'temp -> reg' map */
1564 temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
1565 for (uint32_t i = 0; i < c->num_temps; i++) {
1566 int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
1567 if (ra_reg < phys_index) {
1568 temp_registers[i].magic = true;
1569 temp_registers[i].index = (V3D_QPU_WADDR_R0 +
1570 ra_reg - ACC_INDEX);
1571 } else {
1572 temp_registers[i].magic = false;
1573 temp_registers[i].index = ra_reg - phys_index;
1574 }
1575 }
1576
1577 spill_fail:
1578 ralloc_free(c->nodes.info);
1579 c->nodes.info = NULL;
1580 c->nodes.alloc_count = 0;
1581 ralloc_free(c->g);
1582 c->g = NULL;
1583 return temp_registers;
1584 }
1585