1 /*
2 * Copyright © 2016-2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "broadcom/common/v3d_device_info.h"
25 #include "v3d_compiler.h"
26 #include "compiler/nir/nir_schedule.h"
27 #include "compiler/nir/nir_builder.h"
28
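/**
 * Returns the number of sources used by the instruction: 0 for branches,
 * otherwise the source count of whichever ALU op (add or mul) is not a NOP.
 */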
29 int
30 vir_get_nsrc(struct qinst *inst)
31 {
32 switch (inst->qpu.type) {
33 case V3D_QPU_INSTR_TYPE_BRANCH:
34 return 0;
35 case V3D_QPU_INSTR_TYPE_ALU:
36 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
37 return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
38 else
39 return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
40 }
41
42 return 0;
43 }
44
45 /**
46 * Returns whether the instruction has any side effects that must be
47 * preserved.
48 */
49 bool
50 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
51 {
52 switch (inst->qpu.type) {
53 case V3D_QPU_INSTR_TYPE_BRANCH:
54 return true;
55 case V3D_QPU_INSTR_TYPE_ALU:
56 switch (inst->qpu.alu.add.op) {
57 case V3D_QPU_A_SETREVF:
58 case V3D_QPU_A_SETMSF:
59 case V3D_QPU_A_VPMSETUP:
60 case V3D_QPU_A_STVPMV:
61 case V3D_QPU_A_STVPMD:
62 case V3D_QPU_A_STVPMP:
63 case V3D_QPU_A_VPMWT:
64 case V3D_QPU_A_TMUWT:
65 return true;
66 default:
67 break;
68 }
69
70 switch (inst->qpu.alu.mul.op) {
71 case V3D_QPU_M_MULTOP:
72 return true;
73 default:
74 break;
75 }
76 }
77
78 if (inst->qpu.sig.ldtmu ||
79 inst->qpu.sig.ldvary ||
80 inst->qpu.sig.ldtlbu ||
81 inst->qpu.sig.ldtlb ||
82 inst->qpu.sig.wrtmuc ||
83 inst->qpu.sig.thrsw) {
84 return true;
85 }
86
87 /* ldunifa works like ldunif: it reads an element and advances the
88 * pointer, so each read has a side effect (this doesn't matter for ldunif
89 * because we reconstruct the uniform stream buffer after compiling
90 * with the surviving uniforms), and allowing DCE to remove
91 * one would break follow-up loads. We could fix this by emitting a
92 * unifa for each ldunifa, but each unifa requires 3 delay slots
93 * before a ldunifa, so that would be quite expensive.
94 */
95 if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
96 return true;
97
98 return false;
99 }
100
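/**
 * Returns whether the instruction is a MOV/FMOV in the mul ALU with no
 * pack/unpack modifiers and no condition flags, i.e. something later passes
 * can treat as a plain copy of its source.
 */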
101 bool
102 vir_is_raw_mov(struct qinst *inst)
103 {
104 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
105 (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
106 inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
107 return false;
108 }
109
110 if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
111 inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
112 return false;
113 }
114
115 if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
116 inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
117 inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
118 inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
119 return false;
120 }
121
122 if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
123 inst->qpu.flags.mc != V3D_QPU_COND_NONE)
124 return false;
125
126 return true;
127 }
128
129 bool
130 vir_is_add(struct qinst *inst)
131 {
132 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
133 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
134 }
135
136 bool
137 vir_is_mul(struct qinst *inst)
138 {
139 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
140 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
141 }
142
143 bool
144 vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
145 {
146 if (inst->dst.file == QFILE_MAGIC)
147 return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);
148
149 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
150 inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
151 return true;
152 }
153
154 return false;
155 }
156
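/**
 * The SFU operations (RECIP, RSQRT, EXP, LOG, SIN) deliver their result
 * through the r4 accumulator rather than the written address, so on devices
 * with accumulators any instruction targeting those magic waddrs is an
 * implicit write of r4.
 */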
157 bool
158 vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
159 struct qinst *inst)
160 {
161 if (!devinfo->has_accumulators)
162 return false;
163
164 switch (inst->dst.file) {
165 case QFILE_MAGIC:
166 switch (inst->dst.index) {
167 case V3D_QPU_WADDR_RECIP:
168 case V3D_QPU_WADDR_RSQRT:
169 case V3D_QPU_WADDR_EXP:
170 case V3D_QPU_WADDR_LOG:
171 case V3D_QPU_WADDR_SIN:
172 return true;
173 }
174 break;
175 default:
176 break;
177 }
178
179 return false;
180 }
181
182 void
183 vir_set_unpack(struct qinst *inst, int src,
184 enum v3d_qpu_input_unpack unpack)
185 {
186 assert(src == 0 || src == 1);
187
188 if (vir_is_add(inst)) {
189 if (src == 0)
190 inst->qpu.alu.add.a.unpack = unpack;
191 else
192 inst->qpu.alu.add.b.unpack = unpack;
193 } else {
194 assert(vir_is_mul(inst));
195 if (src == 0)
196 inst->qpu.alu.mul.a.unpack = unpack;
197 else
198 inst->qpu.alu.mul.b.unpack = unpack;
199 }
200 }
201
202 void
203 vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
204 {
205 if (vir_is_add(inst)) {
206 inst->qpu.alu.add.output_pack = pack;
207 } else {
208 assert(vir_is_mul(inst));
209 inst->qpu.alu.mul.output_pack = pack;
210 }
211 }
212
213 void
214 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
215 {
216 if (vir_is_add(inst)) {
217 inst->qpu.flags.ac = cond;
218 } else {
219 assert(vir_is_mul(inst));
220 inst->qpu.flags.mc = cond;
221 }
222 }
223
224 enum v3d_qpu_cond
225 vir_get_cond(struct qinst *inst)
226 {
227 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
228
229 if (vir_is_add(inst))
230 return inst->qpu.flags.ac;
231 else if (vir_is_mul(inst))
232 return inst->qpu.flags.mc;
233 else /* NOP */
234 return V3D_QPU_COND_NONE;
235 }
236
237 void
238 vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
239 {
240 c->flags_temp = -1;
241 if (vir_is_add(inst)) {
242 inst->qpu.flags.apf = pf;
243 } else {
244 assert(vir_is_mul(inst));
245 inst->qpu.flags.mpf = pf;
246 }
247 }
248
249 void
250 vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
251 {
252 c->flags_temp = -1;
253 if (vir_is_add(inst)) {
254 inst->qpu.flags.auf = uf;
255 } else {
256 assert(vir_is_mul(inst));
257 inst->qpu.flags.muf = uf;
258 }
259 }
260
261 #if 0
262 uint8_t
263 vir_channels_written(struct qinst *inst)
264 {
265 if (vir_is_mul(inst)) {
266 switch (inst->dst.pack) {
267 case QPU_PACK_MUL_NOP:
268 case QPU_PACK_MUL_8888:
269 return 0xf;
270 case QPU_PACK_MUL_8A:
271 return 0x1;
272 case QPU_PACK_MUL_8B:
273 return 0x2;
274 case QPU_PACK_MUL_8C:
275 return 0x4;
276 case QPU_PACK_MUL_8D:
277 return 0x8;
278 }
279 } else {
280 switch (inst->dst.pack) {
281 case QPU_PACK_A_NOP:
282 case QPU_PACK_A_8888:
283 case QPU_PACK_A_8888_SAT:
284 case QPU_PACK_A_32_SAT:
285 return 0xf;
286 case QPU_PACK_A_8A:
287 case QPU_PACK_A_8A_SAT:
288 return 0x1;
289 case QPU_PACK_A_8B:
290 case QPU_PACK_A_8B_SAT:
291 return 0x2;
292 case QPU_PACK_A_8C:
293 case QPU_PACK_A_8C_SAT:
294 return 0x4;
295 case QPU_PACK_A_8D:
296 case QPU_PACK_A_8D_SAT:
297 return 0x8;
298 case QPU_PACK_A_16A:
299 case QPU_PACK_A_16A_SAT:
300 return 0x3;
301 case QPU_PACK_A_16B:
302 case QPU_PACK_A_16B_SAT:
303 return 0xc;
304 }
305 }
306 unreachable("Bad pack field");
307 }
308 #endif
309
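/**
 * Allocates a fresh temporary, growing the defs[] array and the spillable
 * bitset as needed. New temporaries start out marked as spillable.
 */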
310 struct qreg
311 vir_get_temp(struct v3d_compile *c)
312 {
313 struct qreg reg;
314
315 reg.file = QFILE_TEMP;
316 reg.index = c->num_temps++;
317
318 if (c->num_temps > c->defs_array_size) {
319 uint32_t old_size = c->defs_array_size;
320 c->defs_array_size = MAX2(old_size * 2, 16);
321
322 c->defs = reralloc(c, c->defs, struct qinst *,
323 c->defs_array_size);
324 memset(&c->defs[old_size], 0,
325 sizeof(c->defs[0]) * (c->defs_array_size - old_size));
326
327 c->spillable = reralloc(c, c->spillable,
328 BITSET_WORD,
329 BITSET_WORDS(c->defs_array_size));
330 for (int i = old_size; i < c->defs_array_size; i++)
331 BITSET_SET(c->spillable, i);
332 }
333
334 return reg;
335 }
336
337 struct qinst *
338 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
339 {
340 struct qinst *inst = calloc(1, sizeof(*inst));
341
342 inst->qpu = v3d_qpu_nop();
343 inst->qpu.alu.add.op = op;
344
345 inst->dst = dst;
346 inst->src[0] = src0;
347 inst->src[1] = src1;
348 inst->uniform = ~0;
349
350 inst->ip = -1;
351
352 return inst;
353 }
354
355 struct qinst *
356 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
357 {
358 struct qinst *inst = calloc(1, sizeof(*inst));
359
360 inst->qpu = v3d_qpu_nop();
361 inst->qpu.alu.mul.op = op;
362
363 inst->dst = dst;
364 inst->src[0] = src0;
365 inst->src[1] = src1;
366 inst->uniform = ~0;
367
368 inst->ip = -1;
369
370 return inst;
371 }
372
373 struct qinst *
374 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
375 {
376 struct qinst *inst = calloc(1, sizeof(*inst));
377
378 inst->qpu = v3d_qpu_nop();
379 inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
380 inst->qpu.branch.cond = cond;
381 inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
382 inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
383 inst->qpu.branch.ub = true;
384 inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
385
386 inst->dst = vir_nop_reg();
387 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
388
389 inst->ip = -1;
390
391 return inst;
392 }
393
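/* Inserts the instruction at the current cursor position, advances the
 * cursor past it and invalidates any previously computed live intervals.
 */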
394 static void
395 vir_emit(struct v3d_compile *c, struct qinst *inst)
396 {
397 inst->ip = -1;
398
399 switch (c->cursor.mode) {
400 case vir_cursor_add:
401 list_add(&inst->link, c->cursor.link);
402 break;
403 case vir_cursor_addtail:
404 list_addtail(&inst->link, c->cursor.link);
405 break;
406 }
407
408 c->cursor = vir_after_inst(inst);
409 c->live_intervals_valid = false;
410 }
411
412 /* Updates inst to write to a new temporary, emits it, and notes the def. */
413 struct qreg
414 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
415 {
416 assert(inst->dst.file == QFILE_NULL);
417
418 /* If we're emitting an instruction that's a def, it had better be
419 * writing a register.
420 */
421 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
422 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
423 v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
424 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
425 v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
426 }
427
428 inst->dst = vir_get_temp(c);
429
430 if (inst->dst.file == QFILE_TEMP)
431 c->defs[inst->dst.index] = inst;
432
433 vir_emit(c, inst);
434
435 return inst->dst;
436 }
437
438 struct qinst *
439 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
440 {
441 if (inst->dst.file == QFILE_TEMP)
442 c->defs[inst->dst.index] = NULL;
443
444 vir_emit(c, inst);
445
446 return inst;
447 }
448
449 struct qblock *
450 vir_new_block(struct v3d_compile *c)
451 {
452 struct qblock *block = rzalloc(c, struct qblock);
453
454 list_inithead(&block->instructions);
455
456 block->predecessors = _mesa_set_create(block,
457 _mesa_hash_pointer,
458 _mesa_key_pointer_equal);
459
460 block->index = c->next_block_index++;
461
462 return block;
463 }
464
465 void
466 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
467 {
468 c->cur_block = block;
469 c->cursor = vir_after_block(block);
470 list_addtail(&block->link, &c->blocks);
471 }
472
473 struct qblock *
474 vir_entry_block(struct v3d_compile *c)
475 {
476 return list_first_entry(&c->blocks, struct qblock, link);
477 }
478
479 struct qblock *
480 vir_exit_block(struct v3d_compile *c)
481 {
482 return list_last_entry(&c->blocks, struct qblock, link);
483 }
484
485 void
486 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
487 {
488 _mesa_set_add(successor->predecessors, predecessor);
489 if (predecessor->successors[0]) {
490 assert(!predecessor->successors[1]);
491 predecessor->successors[1] = successor;
492 } else {
493 predecessor->successors[0] = successor;
494 }
495 }
496
497 const struct v3d_compiler *
498 v3d_compiler_init(const struct v3d_device_info *devinfo,
499 uint32_t max_inline_uniform_buffers)
500 {
501 struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
502 if (!compiler)
503 return NULL;
504
505 compiler->devinfo = devinfo;
506 compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
507
508 if (!vir_init_reg_sets(compiler)) {
509 ralloc_free(compiler);
510 return NULL;
511 }
512
513 return compiler;
514 }
515
516 void
517 v3d_compiler_free(const struct v3d_compiler *compiler)
518 {
519 ralloc_free((void *)compiler);
520 }
521
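/* One entry per compile attempt: if a strategy fails (typically because
 * register allocation cannot fit the program), the next one trades some
 * scheduling or optimization aggressiveness for lower register pressure.
 * See the strategies table at the end of this file.
 */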
522 struct v3d_compiler_strategy {
523 const char *name;
524 uint32_t max_threads;
525 uint32_t min_threads;
526 bool disable_general_tmu_sched;
527 bool disable_gcm;
528 bool disable_loop_unrolling;
529 bool disable_ubo_load_sorting;
530 bool move_buffer_loads;
531 bool disable_tmu_pipelining;
532 uint32_t max_tmu_spills;
533 };
534
535 static struct v3d_compile *
536 vir_compile_init(const struct v3d_compiler *compiler,
537 struct v3d_key *key,
538 nir_shader *s,
539 void (*debug_output)(const char *msg,
540 void *debug_output_data),
541 void *debug_output_data,
542 int program_id, int variant_id,
543 uint32_t compile_strategy_idx,
544 const struct v3d_compiler_strategy *strategy,
545 bool fallback_scheduler)
546 {
547 struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
548
549 c->compiler = compiler;
550 c->devinfo = compiler->devinfo;
551 c->key = key;
552 c->program_id = program_id;
553 c->variant_id = variant_id;
554 c->compile_strategy_idx = compile_strategy_idx;
555 c->threads = strategy->max_threads;
556 c->debug_output = debug_output;
557 c->debug_output_data = debug_output_data;
558 c->compilation_result = V3D_COMPILATION_SUCCEEDED;
559 c->min_threads_for_reg_alloc = strategy->min_threads;
560 c->max_tmu_spills = strategy->max_tmu_spills;
561 c->fallback_scheduler = fallback_scheduler;
562 c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
563 c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
564 c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
565 c->move_buffer_loads = strategy->move_buffer_loads;
566 c->disable_gcm = strategy->disable_gcm;
567 c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
568 ? true : strategy->disable_loop_unrolling;
569
570
571 s = nir_shader_clone(c, s);
572 c->s = s;
573
574 list_inithead(&c->blocks);
575 vir_set_emit_block(c, vir_new_block(c));
576
577 c->output_position_index = -1;
578 c->output_sample_mask_index = -1;
579
580 c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
581 _mesa_key_pointer_equal);
582
583 c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
584 c->flags_temp = -1;
585
586 return c;
587 }
588
589 static int
590 type_size_vec4(const struct glsl_type *type, bool bindless)
591 {
592 return glsl_count_attribute_slots(type, false);
593 }
594
595 static enum nir_lower_tex_packing
596 lower_tex_packing_cb(const nir_tex_instr *tex, const void *data)
597 {
598 struct v3d_compile *c = (struct v3d_compile *) data;
599
600 int sampler_index = nir_tex_instr_need_sampler(tex) ?
601 tex->sampler_index : tex->backend_flags;
602
603 assert(sampler_index < c->key->num_samplers_used);
604 return c->key->sampler[sampler_index].return_size == 16 ?
605 nir_lower_tex_packing_16 : nir_lower_tex_packing_none;
606 }
607
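/* Sketch of the rewrite performed below (SSA names are illustrative only):
 *
 *   before:  ssa_3 = bcsel ssa_0, ssa_2, 0xffffffff
 *            ssa_4 = load_ssbo ssa_3, ...
 *
 *   after:   ssa_5 = mov ssa_2
 *            ssa_3 = bcsel ssa_0, ssa_2, ssa_5
 *            ssa_4 = load_ssbo ssa_3, ...
 *
 * Once the optimizer folds the bcsel (both sources are now the same value),
 * the load uses the non-null branch's buffer index directly.
 */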
608 static bool
609 v3d_nir_lower_null_pointers_cb(nir_builder *b,
610 nir_intrinsic_instr *intr,
611 void *_state)
612 {
613 uint32_t buffer_src_idx;
614
615 switch (intr->intrinsic) {
616 case nir_intrinsic_load_ubo:
617 case nir_intrinsic_load_ssbo:
618 buffer_src_idx = 0;
619 break;
620 case nir_intrinsic_store_ssbo:
621 buffer_src_idx = 1;
622 break;
623 default:
624 return false;
625 }
626
627 /* If the index is constant we are good */
628 nir_src *src = &intr->src[buffer_src_idx];
629 if (nir_src_is_const(*src))
630 return false;
631
632 /* Otherwise, see if it comes from a bcsel including a null pointer */
633 if (src->ssa->parent_instr->type != nir_instr_type_alu)
634 return false;
635
636 nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr);
637 if (alu->op != nir_op_bcsel)
638 return false;
639
640 /* A null pointer is specified using block index 0xffffffff */
641 int32_t null_src_idx = -1;
642 for (int i = 1; i < 3; i++) {
643 /* FIXME: since we are running this before optimization, maybe we
644 * also need to handle the case where there is a chain of bcsels
645 * that we would need to recurse into?
646 */
647 if (!nir_src_is_const(alu->src[i].src))
648 continue;
649 if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff)
650 continue;
651
652 /* One of the bcsel srcs is a null pointer reference */
653 null_src_idx = i;
654 break;
655 }
656
657 if (null_src_idx < 0)
658 return false;
659
660 assert(null_src_idx == 1 || null_src_idx == 2);
661 int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1;
662
663 /* Rewrite the null pointer reference so we use the same buffer index
664 * as the other bcsel branch. This will allow optimization to remove
665 * the bcsel and we should then end up with a constant buffer index
666 * like we need.
667 */
668 b->cursor = nir_before_instr(&alu->instr);
669 nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa);
670 nir_src_rewrite(&alu->src[null_src_idx].src, copy);
671
672 return true;
673 }
674
675 static bool
676 v3d_nir_lower_null_pointers(nir_shader *s)
677 {
678 return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb,
679 nir_metadata_control_flow, NULL);
680 }
681
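/* Callback for nir_lower_bit_size: other than moves, vecs, packs and
 * conversions, our ALU operations only work on 32-bit values, so request
 * that any 8/16-bit (non-boolean) sources be widened to 32 bits.
 */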
682 static unsigned
683 lower_bit_size_cb(const nir_instr *instr, void *_data)
684 {
685 if (instr->type != nir_instr_type_alu)
686 return 0;
687
688 nir_alu_instr *alu = nir_instr_as_alu(instr);
689
690 switch (alu->op) {
691 case nir_op_mov:
692 case nir_op_vec2:
693 case nir_op_vec3:
694 case nir_op_vec4:
695 case nir_op_vec5:
696 case nir_op_vec8:
697 case nir_op_vec16:
698 case nir_op_b2i8:
699 case nir_op_b2f16:
700 case nir_op_b2i16:
701 case nir_op_b2f32:
702 case nir_op_b2i32:
703 case nir_op_f2f16:
704 case nir_op_f2f16_rtne:
705 case nir_op_f2f16_rtz:
706 case nir_op_f2f32:
707 case nir_op_f2i32:
708 case nir_op_f2u32:
709 case nir_op_i2i8:
710 case nir_op_i2i16:
711 case nir_op_i2f16:
712 case nir_op_i2f32:
713 case nir_op_i2i32:
714 case nir_op_u2u8:
715 case nir_op_u2u16:
716 case nir_op_u2f16:
717 case nir_op_u2f32:
718 case nir_op_u2u32:
719 case nir_op_pack_32_2x16_split:
720 case nir_op_pack_32_4x8_split:
721 case nir_op_pack_half_2x16_split:
722 return 0;
723
724 /* All remaining ALU ops only work with 32 bits, so handle them here */
725 default:
726 if (alu->src[0].src.ssa->bit_size != 1 && alu->src[0].src.ssa->bit_size < 32)
727 return 32;
728 return 0;
729 }
730 }
731
732 static void
733 v3d_lower_nir(struct v3d_compile *c)
734 {
735 struct nir_lower_tex_options tex_options = {
736 .lower_txd = true,
737 .lower_tg4_offsets = true,
738 .lower_tg4_broadcom_swizzle = true,
739
740 .lower_rect = false, /* XXX: Use this on V3D 3.x */
741 .lower_txp = ~0,
742 /* Apply swizzles to all samplers. */
743 .swizzle_result = ~0,
744 .lower_invalid_implicit_lod = true,
745 };
746
747 /* Lower the format swizzle and (for 32-bit returns)
748 * ARB_texture_swizzle-style swizzle.
749 */
750 assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
751 for (int i = 0; i < c->key->num_tex_used; i++) {
752 for (int j = 0; j < 4; j++)
753 tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
754 }
755
756 tex_options.lower_tex_packing_cb = lower_tex_packing_cb;
757 tex_options.lower_tex_packing_data = c;
758
759 NIR_PASS(_, c->s, nir_lower_tex, &tex_options);
760 NIR_PASS(_, c->s, nir_lower_system_values);
761
762 if (c->s->info.zero_initialize_shared_memory &&
763 c->s->info.shared_size > 0) {
764 /* All our BOs allocate full pages, so the underlying allocation
765 * for shared memory will always be a multiple of 4KB. This
766 * ensures that we can do an exact number of full chunk_size
767 * writes to initialize the memory independently of the actual
768 * shared_size used by the shader, which is a requirement of
769 * the initialization pass.
770 */
771 const unsigned chunk_size = 16; /* max single store size */
772 NIR_PASS(_, c->s, nir_zero_initialize_shared_memory,
773 align(c->s->info.shared_size, chunk_size), chunk_size);
774 }
775
776 NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL);
777 NIR_PASS(_, c->s, nir_lower_is_helper_invocation);
778 NIR_PASS(_, c->s, v3d_nir_lower_null_pointers);
779 NIR_PASS(_, c->s, nir_lower_bit_size, lower_bit_size_cb, NULL);
780 }
781
782 static void
783 v3d_set_prog_data_uniforms(struct v3d_compile *c,
784 struct v3d_prog_data *prog_data)
785 {
786 int count = c->num_uniforms;
787 struct v3d_uniform_list *ulist = &prog_data->uniforms;
788
789 ulist->count = count;
790 ulist->data = ralloc_array(prog_data, uint32_t, count);
791 memcpy(ulist->data, c->uniform_data,
792 count * sizeof(*ulist->data));
793 ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
794 memcpy(ulist->contents, c->uniform_contents,
795 count * sizeof(*ulist->contents));
796 }
797
798 static void
799 v3d_vs_set_prog_data(struct v3d_compile *c,
800 struct v3d_vs_prog_data *prog_data)
801 {
802 /* The vertex data gets format converted by the VPM so that
803 * each attribute channel takes up a VPM column. Precompute
804 * the sizes for the shader record.
805 */
806 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
807 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
808 prog_data->vpm_input_size += c->vattr_sizes[i];
809 }
810
811 memset(prog_data->driver_location_map, -1,
812 sizeof(prog_data->driver_location_map));
813
814 nir_foreach_shader_in_variable(var, c->s) {
815 prog_data->driver_location_map[var->data.location] =
816 var->data.driver_location;
817 }
818
819 prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
820 SYSTEM_VALUE_VERTEX_ID) ||
821 BITSET_TEST(c->s->info.system_values_read,
822 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
823
824 prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
825 SYSTEM_VALUE_BASE_INSTANCE);
826
827 prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
828 SYSTEM_VALUE_INSTANCE_ID) ||
829 BITSET_TEST(c->s->info.system_values_read,
830 SYSTEM_VALUE_INSTANCE_INDEX);
831
832 if (prog_data->uses_vid)
833 prog_data->vpm_input_size++;
834 if (prog_data->uses_biid)
835 prog_data->vpm_input_size++;
836 if (prog_data->uses_iid)
837 prog_data->vpm_input_size++;
838
839 prog_data->writes_psiz =
840 c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
841
842 /* Input/output segment sizes are in sectors (8 rows of 32 bits per
843 * channel).
844 */
845 prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
846 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
847
848 /* Set us up for shared input/output segments. This is apparently
849 * necessary for our VCM setup to avoid varying corruption.
850 *
851 * FIXME: initial testing on V3D 7.1 seems to work fine when using
852 * separate segments. So we could try to reevaluate in the future, if
853 * there is any advantage of using separate segments.
854 */
855 prog_data->separate_segments = false;
856 prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
857 prog_data->vpm_input_size);
858 prog_data->vpm_input_size = 0;
859
860 /* Compute VCM cache size. We set up our program to take up less than
861 * half of the VPM, so that any set of bin and render programs won't
862 * run out of space. We need space for at least one input segment,
863 * and then allocate the rest to output segments (one for the current
864 * program, the rest to VCM). The valid range of the VCM cache size
865 * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
866 * batches.
867 */
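/* Worked example with illustrative numbers: with a 16 KB VPM, sector_size
 * is 16 channels * 4 bytes * 8 rows = 512 bytes, so vpm_size_in_sectors is
 * 32 and half_vpm is 16. With vpm_input_size = 0 and a 2-sector output
 * segment that gives 16 / 2 = 8 output batches, and CLAMP(8 - 1, 2, 4)
 * selects the maximum VCM cache size of 4.
 */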
868 assert(c->devinfo->vpm_size);
869 int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
870 int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
871 int half_vpm = vpm_size_in_sectors / 2;
872 int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
873 int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
874 assert(vpm_output_batches >= 2);
875 prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
876 }
877
878 static void
879 v3d_gs_set_prog_data(struct v3d_compile *c,
880 struct v3d_gs_prog_data *prog_data)
881 {
882 prog_data->num_inputs = c->num_inputs;
883 memcpy(prog_data->input_slots, c->input_slots,
884 c->num_inputs * sizeof(*c->input_slots));
885
886 /* gl_PrimitiveIdIn is written by the GBG into the first word of the
887 * VPM output header automatically and the shader will overwrite
888 * it after reading it if necessary, so it doesn't add to the VPM
889 * size requirements.
890 */
891 prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
892 SYSTEM_VALUE_PRIMITIVE_ID);
893
894 /* Output segment size is in sectors (8 rows of 32 bits per channel) */
895 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
896
897 /* Compute SIMD dispatch width and update VPM output size accordingly
898 * to ensure we can fit our program in memory. Available widths are
899 * 16, 8, 4, 1.
900 *
901 * Notice that at draw time we will have to consider VPM memory
902 * requirements from other stages and choose a smaller dispatch
903 * width if needed to fit the program in VPM memory.
904 */
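/* For example (illustrative sizes): a 20-sector output segment does not
 * fit at SIMD width 16, so we drop to width 8 and halve the segment to
 * align(20, 2) / 2 = 10 sectors, which fits.
 */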
905 prog_data->simd_width = 16;
906 while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
907 prog_data->simd_width == 2) {
908 prog_data->simd_width >>= 1;
909 prog_data->vpm_output_size =
910 align(prog_data->vpm_output_size, 2) / 2;
911 }
912 assert(prog_data->vpm_output_size <= 16);
913 assert(prog_data->simd_width != 2);
914
915 prog_data->out_prim_type = c->s->info.gs.output_primitive;
916 prog_data->num_invocations = c->s->info.gs.invocations;
917
918 prog_data->writes_psiz =
919 c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
920 }
921
922 static void
923 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
924 struct v3d_fs_prog_data *prog_data)
925 {
926 prog_data->num_inputs = c->num_inputs;
927 memcpy(prog_data->input_slots, c->input_slots,
928 c->num_inputs * sizeof(*c->input_slots));
929
930 STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
931 (V3D_MAX_FS_INPUTS - 1) / 24);
932 for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
933 if (BITSET_TEST(c->flat_shade_flags, i))
934 prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
935
936 if (BITSET_TEST(c->noperspective_flags, i))
937 prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
938
939 if (BITSET_TEST(c->centroid_flags, i))
940 prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
941 }
942 }
943
944 static void
945 v3d_fs_set_prog_data(struct v3d_compile *c,
946 struct v3d_fs_prog_data *prog_data)
947 {
948 v3d_set_fs_prog_data_inputs(c, prog_data);
949 prog_data->writes_z = c->writes_z;
950 prog_data->writes_z_from_fep = c->writes_z_from_fep;
951 prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
952 prog_data->uses_center_w = c->uses_center_w;
953 prog_data->uses_implicit_point_line_varyings =
954 c->uses_implicit_point_line_varyings;
955 prog_data->lock_scoreboard_on_first_thrsw =
956 c->lock_scoreboard_on_first_thrsw;
957 prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading;
958 prog_data->uses_pid = c->fs_uses_primitive_id;
959 }
960
961 static void
962 v3d_cs_set_prog_data(struct v3d_compile *c,
963 struct v3d_compute_prog_data *prog_data)
964 {
965 prog_data->shared_size = c->s->info.shared_size;
966
967 prog_data->local_size[0] = c->s->info.workgroup_size[0];
968 prog_data->local_size[1] = c->s->info.workgroup_size[1];
969 prog_data->local_size[2] = c->s->info.workgroup_size[2];
970
971 prog_data->has_subgroups = c->has_subgroups;
972 }
973
974 static void
975 v3d_set_prog_data(struct v3d_compile *c,
976 struct v3d_prog_data *prog_data)
977 {
978 prog_data->threads = c->threads;
979 prog_data->single_seg = !c->last_thrsw;
980 prog_data->spill_size = c->spill_size;
981 prog_data->tmu_spills = c->spills;
982 prog_data->tmu_fills = c->fills;
983 prog_data->tmu_count = c->tmu.total_count;
984 prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
985 prog_data->compile_strategy_idx = c->compile_strategy_idx;
986 prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
987 prog_data->has_control_barrier = c->s->info.uses_control_barrier;
988 prog_data->has_global_address = c->has_global_address;
989
990 v3d_set_prog_data_uniforms(c, prog_data);
991
992 switch (c->s->info.stage) {
993 case MESA_SHADER_VERTEX:
994 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
995 break;
996 case MESA_SHADER_GEOMETRY:
997 v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
998 break;
999 case MESA_SHADER_FRAGMENT:
1000 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
1001 break;
1002 case MESA_SHADER_COMPUTE:
1003 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
1004 break;
1005 default:
1006 unreachable("unsupported shader stage");
1007 }
1008 }
1009
1010 static uint64_t *
1011 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
1012 {
1013 *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
1014
1015 uint64_t *qpu_insts = malloc(*final_assembly_size);
1016 if (!qpu_insts)
1017 return NULL;
1018
1019 memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
1020
1021 vir_compile_destroy(c);
1022
1023 return qpu_insts;
1024 }
1025
1026 static void
1027 v3d_nir_lower_vs_early(struct v3d_compile *c)
1028 {
1029 /* Split our I/O vars and dead code eliminate the unused
1030 * components.
1031 */
1032 NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
1033 nir_var_shader_in | nir_var_shader_out);
1034 uint64_t used_outputs[4] = {0};
1035 for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
1036 int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
1037 int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
1038 used_outputs[comp] |= 1ull << slot;
1039 }
1040 NIR_PASS(_, c->s, nir_remove_unused_io_vars,
1041 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
1042 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1043 v3d_optimize_nir(c, c->s);
1044 NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1045
1046 /* This must go before nir_lower_io */
1047 if (c->vs_key->per_vertex_point_size)
1048 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1049
1050 NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1051 type_size_vec4,
1052 (nir_lower_io_options)0);
1053 /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1054 * on the code it generated.
1055 */
1056 NIR_PASS(_, c->s, nir_opt_dce);
1057 NIR_PASS(_, c->s, nir_opt_constant_folding);
1058 }
1059
1060 static void
1061 v3d_nir_lower_gs_early(struct v3d_compile *c)
1062 {
1063 /* Split our I/O vars and dead code eliminate the unused
1064 * components.
1065 */
1066 NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
1067 nir_var_shader_in | nir_var_shader_out);
1068 uint64_t used_outputs[4] = {0};
1069 for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
1070 int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
1071 int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
1072 used_outputs[comp] |= 1ull << slot;
1073 }
1074 NIR_PASS(_, c->s, nir_remove_unused_io_vars,
1075 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
1076 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1077 v3d_optimize_nir(c, c->s);
1078 NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1079
1080 /* This must go before nir_lower_io */
1081 if (c->gs_key->per_vertex_point_size)
1082 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1083
1084 NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1085 type_size_vec4,
1086 (nir_lower_io_options)0);
1087 /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1088 * on the code it generated.
1089 */
1090 NIR_PASS(_, c->s, nir_opt_dce);
1091 NIR_PASS(_, c->s, nir_opt_constant_folding);
1092 }
1093
1094 static void
1095 v3d_fixup_fs_output_types(struct v3d_compile *c)
1096 {
1097 nir_foreach_shader_out_variable(var, c->s) {
1098 uint32_t mask = 0;
1099
1100 switch (var->data.location) {
1101 case FRAG_RESULT_COLOR:
1102 mask = ~0;
1103 break;
1104 case FRAG_RESULT_DATA0:
1105 case FRAG_RESULT_DATA1:
1106 case FRAG_RESULT_DATA2:
1107 case FRAG_RESULT_DATA3:
1108 mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
1109 break;
1110 }
1111
1112 if (c->fs_key->int_color_rb & mask) {
1113 var->type =
1114 glsl_vector_type(GLSL_TYPE_INT,
1115 glsl_get_components(var->type));
1116 } else if (c->fs_key->uint_color_rb & mask) {
1117 var->type =
1118 glsl_vector_type(GLSL_TYPE_UINT,
1119 glsl_get_components(var->type));
1120 }
1121 }
1122 }
1123
1124 static void
1125 v3d_nir_lower_fs_early(struct v3d_compile *c)
1126 {
1127 if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
1128 v3d_fixup_fs_output_types(c);
1129
1130 NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);
1131
1132 if (c->fs_key->line_smoothing) {
1133 NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
1134 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1135 /* The lowering pass can introduce new sysval reads */
1136 nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
1137 }
1138 }
1139
1140 static void
1141 v3d_nir_lower_gs_late(struct v3d_compile *c)
1142 {
1143 if (c->key->ucp_enables) {
1144 NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
1145 true, NULL);
1146 }
1147
1148 /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
1149 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1150 }
1151
1152 static void
1153 v3d_nir_lower_vs_late(struct v3d_compile *c)
1154 {
1155 if (c->key->ucp_enables) {
1156 NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
1157 false, true, NULL);
1158 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
1159 nir_var_shader_out, NULL, NULL);
1160 }
1161
1162 /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
1163 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1164 }
1165
1166 static void
1167 v3d_nir_lower_fs_late(struct v3d_compile *c)
1168 {
1169 /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
1170 * Vulkan allows it, in which case the SPIR-V compiler will declare
1171 * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
1172 * the last parameter to always operate with a compact array in both
1173 * OpenGL and Vulkan so we don't have to care about the API we
1174 * are using.
1175 */
1176 if (c->key->ucp_enables)
1177 NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
1178
1179 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
1180 }
1181
1182 static uint32_t
1183 vir_get_max_temps(struct v3d_compile *c)
1184 {
1185 int max_ip = 0;
1186 vir_for_each_inst_inorder(inst, c)
1187 max_ip++;
1188
1189 uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
1190
1191 for (int t = 0; t < c->num_temps; t++) {
1192 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
1193 i < max_ip); i++) {
1194 if (i > max_ip)
1195 break;
1196 pressure[i]++;
1197 }
1198 }
1199
1200 uint32_t max_temps = 0;
1201 for (int i = 0; i < max_ip; i++)
1202 max_temps = MAX2(max_temps, pressure[i]);
1203
1204 ralloc_free(pressure);
1205
1206 return max_temps;
1207 }
1208
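/* Dependency classes for the NIR scheduler callback below: used to keep
 * stores to the first word of the GS VPM output (which holds the
 * gl_PrimitiveIDIn value) ordered against reads of gl_PrimitiveID.
 */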
1209 enum v3d_dependency_class {
1210 V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
1211 };
1212
1213 static bool
1214 v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
1215 nir_schedule_dependency *dep,
1216 void *user_data)
1217 {
1218 struct v3d_compile *c = user_data;
1219
1220 switch (intr->intrinsic) {
1221 case nir_intrinsic_store_output:
1222 /* Writing to location 0 overwrites the value passed in for
1223 * gl_PrimitiveID on geometry shaders
1224 */
1225 if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
1226 nir_intrinsic_base(intr) != 0)
1227 break;
1228
1229 nir_const_value *const_value =
1230 nir_src_as_const_value(intr->src[1]);
1231
1232 if (const_value == NULL)
1233 break;
1234
1235 uint64_t offset =
1236 nir_const_value_as_uint(*const_value,
1237 nir_src_bit_size(intr->src[1]));
1238 if (offset != 0)
1239 break;
1240
1241 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1242 dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
1243 return true;
1244
1245 case nir_intrinsic_load_primitive_id:
1246 if (c->s->info.stage != MESA_SHADER_GEOMETRY)
1247 break;
1248
1249 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1250 dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
1251 return true;
1252
1253 default:
1254 break;
1255 }
1256
1257 return false;
1258 }
1259
1260 static unsigned
1261 v3d_instr_delay_cb(nir_instr *instr, void *data)
1262 {
1263 struct v3d_compile *c = (struct v3d_compile *) data;
1264
1265 switch (instr->type) {
1266 case nir_instr_type_undef:
1267 case nir_instr_type_load_const:
1268 case nir_instr_type_alu:
1269 case nir_instr_type_deref:
1270 case nir_instr_type_jump:
1271 case nir_instr_type_parallel_copy:
1272 case nir_instr_type_call:
1273 case nir_instr_type_phi:
1274 return 1;
1275
1276 /* We should not use very large delays for TMU instructions. Typically,
1277 * thread switches will be sufficient to hide all or most of the latency,
1278 * so we only need a little bit of extra room. If we over-estimate
1279 * the latency here we may end up unnecessarily delaying the critical path in
1280 * the shader, which would have a negative effect on performance, so here
1281 * we are trying to strike a balance based on empirical testing.
1282 */
1283 case nir_instr_type_intrinsic: {
1284 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1285 if (!c->disable_general_tmu_sched) {
1286 switch (intr->intrinsic) {
1287 case nir_intrinsic_decl_reg:
1288 case nir_intrinsic_load_reg:
1289 case nir_intrinsic_store_reg:
1290 return 0;
1291 case nir_intrinsic_load_ssbo:
1292 case nir_intrinsic_load_scratch:
1293 case nir_intrinsic_load_shared:
1294 case nir_intrinsic_image_load:
1295 return 3;
1296 case nir_intrinsic_load_ubo:
1297 if (nir_src_is_divergent(intr->src[1]))
1298 return 3;
1299 FALLTHROUGH;
1300 default:
1301 return 1;
1302 }
1303 } else {
1304 switch (intr->intrinsic) {
1305 case nir_intrinsic_decl_reg:
1306 case nir_intrinsic_load_reg:
1307 case nir_intrinsic_store_reg:
1308 return 0;
1309 default:
1310 return 1;
1311 }
1312 }
1313 break;
1314 }
1315
1316 case nir_instr_type_tex:
1317 return 5;
1318
1319 case nir_instr_type_debug_info:
1320 return 0;
1321 }
1322
1323 return 0;
1324 }
1325
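/* Callback for nir_lower_wrmasks: stores to memory with non-contiguous
 * write masks are split so each resulting store writes a contiguous range
 * of components, which is what our TMU store path expects.
 */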
1326 static bool
1327 should_split_wrmask(const nir_instr *instr, const void *data)
1328 {
1329 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1330 switch (intr->intrinsic) {
1331 case nir_intrinsic_store_ssbo:
1332 case nir_intrinsic_store_shared:
1333 case nir_intrinsic_store_global:
1334 case nir_intrinsic_store_scratch:
1335 return true;
1336 default:
1337 return false;
1338 }
1339 }
1340
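/* Returns the intrinsic if inst is a load_ubo with a constant UBO index and
 * a constant offset (the kind we can turn into a unifa sequence), NULL
 * otherwise.
 */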
1341 static nir_intrinsic_instr *
1342 nir_instr_as_constant_ubo_load(nir_instr *inst)
1343 {
1344 if (inst->type != nir_instr_type_intrinsic)
1345 return NULL;
1346
1347 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1348 if (intr->intrinsic != nir_intrinsic_load_ubo)
1349 return NULL;
1350
1351 assert(nir_src_is_const(intr->src[0]));
1352 if (!nir_src_is_const(intr->src[1]))
1353 return NULL;
1354
1355 return intr;
1356 }
1357
1358 static bool
1359 v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
1360 {
1361 bool progress = false;
1362
1363 nir_instr *ref_inst = &ref->instr;
1364 uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
1365 uint32_t ref_index = nir_src_as_uint(ref->src[0]);
1366
1367 /* Go through all instructions after ref searching for constant UBO
1368 * loads for the same UBO index.
1369 */
1370 bool seq_break = false;
1371 nir_instr *inst = &ref->instr;
1372 nir_instr *next_inst = NULL;
1373 while (true) {
1374 inst = next_inst ? next_inst : nir_instr_next(inst);
1375 if (!inst)
1376 break;
1377
1378 next_inst = NULL;
1379
1380 if (inst->type != nir_instr_type_intrinsic)
1381 continue;
1382
1383 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1384 if (intr->intrinsic != nir_intrinsic_load_ubo)
1385 continue;
1386
1387 /* We only produce unifa sequences for non-divergent loads */
1388 if (nir_src_is_divergent(intr->src[1]))
1389 continue;
1390
1391 /* If there are any UBO loads that are not constant or that
1392 * use a different UBO index in between the reference load and
1393 * any other constant load for the same index, they would break
1394 * the unifa sequence. We will flag that so we can then move
1395 * all constant UBO loads for the reference index before these
1396 * and not just the ones that are not ordered to avoid breaking
1397 * the sequence and reduce unifa writes.
1398 */
1399 if (!nir_src_is_const(intr->src[1])) {
1400 seq_break = true;
1401 continue;
1402 }
1403 uint32_t offset = nir_src_as_uint(intr->src[1]);
1404
1405 assert(nir_src_is_const(intr->src[0]));
1406 uint32_t index = nir_src_as_uint(intr->src[0]);
1407 if (index != ref_index) {
1408 seq_break = true;
1409 continue;
1410 }
1411
1412 /* Only move loads with an offset that is close enough to the
1413 * reference offset, since otherwise we would not be able to
1414 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
1415 */
1416 if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
1417 continue;
1418
1419 /* We will move this load if its offset is smaller than ref's
1420 * (in which case we will move it before ref) or if the offset
1421 * is larger than ref's but there are sequence breakers in
1422 * between (in which case we will move it after ref and
1423 * before the sequence breakers).
1424 */
1425 if (!seq_break && offset >= ref_offset)
1426 continue;
1427
1428 /* Find where exactly we want to move this load:
1429 *
1430 * If we are moving it before ref, we want to check any other
1431 * UBO loads we placed before ref and make sure we insert this
1432 * one properly ordered with them. Likewise, if we are moving
1433 * it after ref.
1434 */
1435 nir_instr *pos = ref_inst;
1436 nir_instr *tmp = pos;
1437 do {
1438 if (offset < ref_offset)
1439 tmp = nir_instr_prev(tmp);
1440 else
1441 tmp = nir_instr_next(tmp);
1442
1443 if (!tmp || tmp == inst)
1444 break;
1445
1446 /* Ignore non-unifa UBO loads */
1447 if (tmp->type != nir_instr_type_intrinsic)
1448 continue;
1449
1450 nir_intrinsic_instr *tmp_intr =
1451 nir_instr_as_intrinsic(tmp);
1452 if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
1453 continue;
1454
1455 if (nir_src_is_divergent(tmp_intr->src[1]))
1456 continue;
1457
1458 /* Stop if we find a unifa UBO load that breaks the
1459 * sequence.
1460 */
1461 if (!nir_src_is_const(tmp_intr->src[1]))
1462 break;
1463
1464 if (nir_src_as_uint(tmp_intr->src[0]) != index)
1465 break;
1466
1467 uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
1468 if (offset < ref_offset) {
1469 if (tmp_offset < offset ||
1470 tmp_offset >= ref_offset) {
1471 break;
1472 } else {
1473 pos = tmp;
1474 }
1475 } else {
1476 if (tmp_offset > offset ||
1477 tmp_offset <= ref_offset) {
1478 break;
1479 } else {
1480 pos = tmp;
1481 }
1482 }
1483 } while (true);
1484
1485 /* We can't move the UBO load before the instruction that
1486 * defines its constant offset. If that instruction is placed
1487 * in between the new location (pos) and the current location
1488 * of this load, we will have to move that instruction too.
1489 *
1490 * We don't care about the UBO index definition because that
1491 * is optimized to be reused by all UBO loads for the same
1492 * index and therefore is certain to be defined before the
1493 * first UBO load that uses it.
1494 */
1495 nir_instr *offset_inst = NULL;
1496 tmp = inst;
1497 while ((tmp = nir_instr_prev(tmp)) != NULL) {
1498 if (pos == tmp) {
1499 /* We reached the target location without
1500 * finding the instruction that defines the
1501 * offset, so that instruction must be before
1502 * the new position and we don't have to fix it.
1503 */
1504 break;
1505 }
1506 if (intr->src[1].ssa->parent_instr == tmp) {
1507 offset_inst = tmp;
1508 break;
1509 }
1510 }
1511
1512 if (offset_inst) {
1513 exec_node_remove(&offset_inst->node);
1514 exec_node_insert_node_before(&pos->node,
1515 &offset_inst->node);
1516 }
1517
1518 /* Since we are moving the instruction before its current
1519 * location, grab its successor before the move so that
1520 * we can continue the next iteration of the main loop from
1521 * that instruction.
1522 */
1523 next_inst = nir_instr_next(inst);
1524
1525 /* Move this load to the selected location */
1526 exec_node_remove(&inst->node);
1527 if (offset < ref_offset)
1528 exec_node_insert_node_before(&pos->node, &inst->node);
1529 else
1530 exec_node_insert_after(&pos->node, &inst->node);
1531
1532 progress = true;
1533 }
1534
1535 return progress;
1536 }
1537
1538 static bool
1539 v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
1540 nir_block *block)
1541 {
1542 bool progress = false;
1543 bool local_progress;
1544 do {
1545 local_progress = false;
1546 nir_foreach_instr_safe(inst, block) {
1547 nir_intrinsic_instr *intr =
1548 nir_instr_as_constant_ubo_load(inst);
1549 if (intr) {
1550 local_progress |=
1551 v3d_nir_sort_constant_ubo_load(block, intr);
1552 }
1553 }
1554 progress |= local_progress;
1555 } while (local_progress);
1556
1557 return progress;
1558 }
1559
1560 /**
1561 * Sorts constant UBO loads in each block by offset to maximize chances of
1562 * skipping unifa writes when converting to VIR. This can increase register
1563 * pressure.
1564 */
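/* For example (offsets are illustrative), constant loads from the same UBO
 * appearing with offsets 16, 0, 32 are reordered to 0, 16, 32, so the second
 * and third loads can reuse the unifa pointer set up by the first one
 * instead of writing unifa again.
 */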
1565 static bool
1566 v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
1567 {
1568 nir_foreach_function_impl(impl, s) {
1569 nir_foreach_block(block, impl) {
1570 c->sorted_any_ubo_loads |=
1571 v3d_nir_sort_constant_ubo_loads_block(c, block);
1572 }
1573 nir_metadata_preserve(impl,
1574 nir_metadata_control_flow);
1575 }
1576 return c->sorted_any_ubo_loads;
1577 }
1578
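/* The number of subgroups is a compile-time constant because the workgroup
 * size is known: e.g. an 8x8x1 workgroup with 16-wide subgroups
 * (V3D_CHANNELS) yields DIV_ROUND_UP(64, 16) = 4 subgroups.
 */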
1579 static void
1580 lower_load_num_subgroups(struct v3d_compile *c,
1581 nir_builder *b,
1582 nir_intrinsic_instr *intr)
1583 {
1584 assert(c->s->info.stage == MESA_SHADER_COMPUTE);
1585 assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
1586
1587 b->cursor = nir_after_instr(&intr->instr);
1588 uint32_t num_subgroups =
1589 DIV_ROUND_UP(c->s->info.workgroup_size[0] *
1590 c->s->info.workgroup_size[1] *
1591 c->s->info.workgroup_size[2], V3D_CHANNELS);
1592 nir_def *result = nir_imm_int(b, num_subgroups);
1593 nir_def_replace(&intr->def, result);
1594 }
1595
1596 static bool
1597 lower_subgroup_intrinsics(struct v3d_compile *c,
1598 nir_block *block, nir_builder *b)
1599 {
1600 bool progress = false;
1601 nir_foreach_instr_safe(inst, block) {
1602 if (inst->type != nir_instr_type_intrinsic)
1603 continue;
1604
1605 nir_intrinsic_instr *intr =
1606 nir_instr_as_intrinsic(inst);
1607 if (!intr)
1608 continue;
1609
1610 switch (intr->intrinsic) {
1611 case nir_intrinsic_load_num_subgroups:
1612 lower_load_num_subgroups(c, b, intr);
1613 progress = true;
1614 FALLTHROUGH;
1615 case nir_intrinsic_load_subgroup_id:
1616 case nir_intrinsic_load_subgroup_size:
1617 case nir_intrinsic_load_subgroup_invocation:
1618 case nir_intrinsic_elect:
1619 case nir_intrinsic_ballot:
1620 case nir_intrinsic_inverse_ballot:
1621 case nir_intrinsic_ballot_bitfield_extract:
1622 case nir_intrinsic_ballot_bit_count_reduce:
1623 case nir_intrinsic_ballot_find_lsb:
1624 case nir_intrinsic_ballot_find_msb:
1625 case nir_intrinsic_ballot_bit_count_exclusive:
1626 case nir_intrinsic_ballot_bit_count_inclusive:
1627 case nir_intrinsic_reduce:
1628 case nir_intrinsic_inclusive_scan:
1629 case nir_intrinsic_exclusive_scan:
1630 case nir_intrinsic_read_invocation:
1631 case nir_intrinsic_read_first_invocation:
1632 case nir_intrinsic_load_subgroup_eq_mask:
1633 case nir_intrinsic_load_subgroup_ge_mask:
1634 case nir_intrinsic_load_subgroup_gt_mask:
1635 case nir_intrinsic_load_subgroup_le_mask:
1636 case nir_intrinsic_load_subgroup_lt_mask:
1637 case nir_intrinsic_shuffle:
1638 case nir_intrinsic_shuffle_xor:
1639 case nir_intrinsic_shuffle_up:
1640 case nir_intrinsic_shuffle_down:
1641 case nir_intrinsic_vote_all:
1642 case nir_intrinsic_vote_any:
1643 case nir_intrinsic_vote_feq:
1644 case nir_intrinsic_vote_ieq:
1645 case nir_intrinsic_quad_broadcast:
1646 case nir_intrinsic_quad_swap_horizontal:
1647 case nir_intrinsic_quad_swap_vertical:
1648 case nir_intrinsic_quad_swap_diagonal:
1649 c->has_subgroups = true;
1650 break;
1651 default:
1652 break;
1653 }
1654 }
1655
1656 return progress;
1657 }
1658
1659 static bool
1660 v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
1661 {
1662 bool progress = false;
1663 nir_foreach_function_impl(impl, s) {
1664 nir_builder b = nir_builder_create(impl);
1665
1666 nir_foreach_block(block, impl)
1667 progress |= lower_subgroup_intrinsics(c, block, &b);
1668
1669 nir_metadata_preserve(impl,
1670 nir_metadata_control_flow);
1671 }
1672 return progress;
1673 }
1674
1675 static void
1676 v3d_attempt_compile(struct v3d_compile *c)
1677 {
1678 switch (c->s->info.stage) {
1679 case MESA_SHADER_VERTEX:
1680 c->vs_key = (struct v3d_vs_key *) c->key;
1681 break;
1682 case MESA_SHADER_GEOMETRY:
1683 c->gs_key = (struct v3d_gs_key *) c->key;
1684 break;
1685 case MESA_SHADER_FRAGMENT:
1686 c->fs_key = (struct v3d_fs_key *) c->key;
1687 break;
1688 case MESA_SHADER_COMPUTE:
1689 break;
1690 default:
1691 unreachable("unsupported shader stage");
1692 }
1693
1694 switch (c->s->info.stage) {
1695 case MESA_SHADER_VERTEX:
1696 v3d_nir_lower_vs_early(c);
1697 break;
1698 case MESA_SHADER_GEOMETRY:
1699 v3d_nir_lower_gs_early(c);
1700 break;
1701 case MESA_SHADER_FRAGMENT:
1702 v3d_nir_lower_fs_early(c);
1703 break;
1704 default:
1705 break;
1706 }
1707
1708 v3d_lower_nir(c);
1709
1710 switch (c->s->info.stage) {
1711 case MESA_SHADER_VERTEX:
1712 v3d_nir_lower_vs_late(c);
1713 break;
1714 case MESA_SHADER_GEOMETRY:
1715 v3d_nir_lower_gs_late(c);
1716 break;
1717 case MESA_SHADER_FRAGMENT:
1718 v3d_nir_lower_fs_late(c);
1719 break;
1720 default:
1721 break;
1722 }
1723
1724 NIR_PASS(_, c->s, v3d_nir_lower_io, c);
1725 NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
1726 NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
1727
1728 NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
1729 nir_lower_idiv_options idiv_options = {
1730 .allow_fp16 = true,
1731 };
1732 NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options);
1733 NIR_PASS(_, c->s, nir_lower_alu);
1734
1735 if (c->key->robust_uniform_access || c->key->robust_storage_access ||
1736 c->key->robust_image_access) {
1737 /* nir_lower_robust_access assumes constant buffer
1738 * indices on ubo/ssbo intrinsics so run copy propagation and
1739 * constant folding passes before we run the lowering to guarantee
1740 * this. We also want to run the lowering before v3d_optimize to
1741 * clean-up redundant get_buffer_size calls produced in the pass.
1742 */
1743 NIR_PASS(_, c->s, nir_copy_prop);
1744 NIR_PASS(_, c->s, nir_opt_constant_folding);
1745
1746 nir_lower_robust_access_options opts = {
1747 .lower_image = c->key->robust_image_access,
1748 .lower_ssbo = c->key->robust_storage_access,
1749 .lower_ubo = c->key->robust_uniform_access,
1750 };
1751
1752 NIR_PASS(_, c->s, nir_lower_robust_access, &opts);
1753 }
1754
1755 NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
1756 nir_var_function_temp,
1757 0,
1758 glsl_get_natural_size_align_bytes,
1759 glsl_get_natural_size_align_bytes);
1760
1761 NIR_PASS(_, c->s, v3d_nir_lower_global_2x32);
1762 NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
1763 NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
1764 NIR_PASS(_, c->s, v3d_nir_lower_scratch);
1765
1766 /* needs to run after load_store_bitsize */
1767 NIR_PASS(_, c->s, nir_lower_pack);
1768
1769 NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
1770
1771 const nir_lower_subgroups_options subgroup_opts = {
1772 .subgroup_size = V3D_CHANNELS,
1773 .ballot_components = 1,
1774 .ballot_bit_size = 32,
1775 .lower_to_scalar = true,
1776 .lower_inverse_ballot = true,
1777 .lower_subgroup_masks = true,
1778 .lower_relative_shuffle = true,
1779 .lower_quad = true,
1780 };
1781 NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
1782
        v3d_optimize_nir(c, c->s);

        /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic. Note that it may
         * produce fnegs, and if so then we need to keep running to squash
         * fneg(fneg(a)).
         */
        bool more_late_algebraic = true;
        while (more_late_algebraic) {
                more_late_algebraic = false;
                NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
                NIR_PASS(_, c->s, nir_opt_constant_folding);
                NIR_PASS(_, c->s, nir_copy_prop);
                NIR_PASS(_, c->s, nir_opt_dce);
                NIR_PASS(_, c->s, nir_opt_cse);
        }

        NIR_PASS(_, c->s, nir_lower_bool_to_int32);
        NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true);
        NIR_PASS_V(c->s, nir_divergence_analysis);
        NIR_PASS(_, c->s, nir_convert_from_ssa, true);

        struct nir_schedule_options schedule_options = {
                /* Schedule for about half our register space, to enable more
                 * shaders to hit 4 threads.
                 */
                .threshold = c->threads == 4 ? 24 : 48,

                /* Vertex shaders share the same memory for inputs and outputs,
                 * fragment and geometry shaders do not.
                 */
                .stages_with_shared_io_memory =
                        (((1 << MESA_ALL_SHADER_STAGES) - 1) &
                         ~((1 << MESA_SHADER_FRAGMENT) |
                           (1 << MESA_SHADER_GEOMETRY))),

                .fallback = c->fallback_scheduler,

                .intrinsic_cb = v3d_intrinsic_dependency_cb,
                .intrinsic_cb_data = c,

                .instr_delay_cb = v3d_instr_delay_cb,
                .instr_delay_cb_data = c,
        };
        NIR_PASS_V(c->s, nir_schedule, &schedule_options);

        if (!c->disable_constant_ubo_load_sorting)
                NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);

        const nir_move_options buffer_opts = c->move_buffer_loads ?
                (nir_move_load_ubo | nir_move_load_ssbo) : 0;
        NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
                                        nir_move_const_undef |
                                        buffer_opts);

        NIR_PASS_V(c->s, nir_trivialize_registers);

        v3d_nir_to_vir(c);
}

uint32_t
v3d_prog_data_size(gl_shader_stage stage)
{
        static const int prog_data_size[] = {
                [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
                [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
                [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
                [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
        };

        assert(stage >= 0 &&
               stage < ARRAY_SIZE(prog_data_size) &&
               prog_data_size[stage]);

        return prog_data_size[stage];
}

int v3d_shaderdb_dump(struct v3d_compile *c,
                      char **shaderdb_str)
{
        if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
                return -1;

        return asprintf(shaderdb_str,
                        "%s shader: %d inst, %d threads, %d loops, "
                        "%d uniforms, %d max-temps, %d:%d spills:fills, "
                        "%d sfu-stalls, %d inst-and-stalls, %d nops",
                        vir_get_stage_name(c),
                        c->qpu_inst_count,
                        c->threads,
                        c->loops,
                        c->num_uniforms,
                        vir_get_max_temps(c),
                        c->spills,
                        c->fills,
                        c->qpu_inst_stalled_count,
                        c->qpu_inst_count + c->qpu_inst_stalled_count,
                        c->nop_count);
}

/* This is a list of incremental changes to the compilation strategy
 * that will be used to try to compile the shader successfully. The
 * default strategy enables all optimizations, which gives the highest
 * register pressure but is expected to produce the best code. The
 * following strategies incrementally disable specific optimizations
 * that are known to contribute to register pressure, so that we can
 * compile the shader successfully while meeting thread count
 * requirements.
 *
 * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
 * cover previous hardware as well (meaning that we are not limiting
 * register allocation to any particular thread count). This is fine
 * because v3d_nir_to_vir will cap this to the actual minimum.
 */
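/* Each entry is, in order: name, maximum and minimum thread count, flags
 * that disable general TMU scheduling, GCM, loop unrolling and constant UBO
 * load sorting, a flag that enables moving buffer loads, a flag that
 * disables TMU pipelining, and the maximum number of TMU spills allowed
 * (field meanings inferred from how the flags line up with the strategy
 * names and the checks in skip_compile_strategy below).
 */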
static const struct v3d_compiler_strategy strategies[] = {
        /*0*/  { "default",                        4, 4, false, false, false, false, false, false,  0 },
        /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false, false, false,  0 },
        /*2*/  { "disable gcm",                    4, 4, true,  true,  false, false, false, false,  0 },
        /*3*/  { "disable loop unrolling",         4, 4, true,  true,  true,  false, false, false,  0 },
        /*4*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  true,  false, false,  0 },
        /*5*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,  false, true,   0 },
        /*6*/  { "lower thread count",             2, 1, false, false, false, false, false, false, -1 },
        /*7*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, false, false, -1 },
        /*8*/  { "disable gcm (2t)",               2, 1, true,  true,  false, false, false, false, -1 },
        /*9*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  true,  false, false, false, -1 },
        /*10*/ { "Move buffer loads (2t)",         2, 1, true,  true,  true,  true,  true,  false, -1 },
        /*11*/ { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  true,  true,  -1 },
        /*12*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  true,  true,  -1 }
};

/**
 * If a particular optimization didn't make any progress during a compile
 * attempt, disabling it alone won't allow us to compile the shader
 * successfully, since we'll end up with the same code. Detect these scenarios
 * so we can avoid wasting time with useless compiles. We should also consider
 * whether the strategy changes other aspects of the compilation process
 * though, like spilling, and not skip it in that case.
 */
static bool
skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
{
        /* We decide if we can skip a strategy based on the optimizations that
         * were active in the previous strategy, so we should only be calling
         * this for strategies after the first.
         */
        assert(idx > 0);

        /* Don't skip a strategy that changes spilling behavior */
        if (strategies[idx].max_tmu_spills !=
            strategies[idx - 1].max_tmu_spills) {
                return false;
        }

        switch (idx) {
        /* General TMU sched.: skip if we didn't emit any TMU loads */
        case 1:
        case 7:
                return !c->has_general_tmu_load;
        /* Global code motion: skip if nir_opt_gcm didn't make any progress */
        case 2:
        case 8:
                return !c->gcm_progress;
        /* Loop unrolling: skip if we didn't unroll any loops */
        case 3:
        case 9:
                return !c->unrolled_any_loops;
        /* UBO load sorting: skip if we didn't sort any loads */
        case 4:
                return !c->sorted_any_ubo_loads;
        /* Move buffer loads: we assume any shader with difficult RA
         * most likely has UBO / SSBO loads so we never try to skip.
         * For now, we only try this for 2-thread compiles since it
         * is expected to impact instruction counts and latency.
         */
        case 10:
                assert(c->threads < 4);
                return false;
        /* TMU pipelining: skip if we didn't pipeline any TMU ops */
        case 5:
        case 11:
                return !c->pipelined_any_tmu;
        /* Lower thread count: skip if we already tried fewer than 4 threads */
        case 6:
                return c->threads < 4;
        default:
                return false;
        }
}

static inline void
set_best_compile(struct v3d_compile **best, struct v3d_compile *c)
{
        if (*best)
                vir_compile_destroy(*best);
        *best = c;
}

uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                      struct v3d_key *key,
                      struct v3d_prog_data **out_prog_data,
                      nir_shader *s,
                      void (*debug_output)(const char *msg,
                                           void *debug_output_data),
                      void *debug_output_data,
                      int program_id, int variant_id,
                      uint32_t *final_assembly_size)
{
        struct v3d_compile *c = NULL;

        uint32_t best_spill_fill_count = UINT32_MAX;
        struct v3d_compile *best_c = NULL;
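        /* Try each strategy in turn until one compiles without spills (or is
         * an acceptable 4-thread compile), keeping the best spilling compile
         * found so far as a fallback.
         */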
        for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
                /* Fallback strategy */
                if (strat > 0) {
                        assert(c);
                        if (skip_compile_strategy(c, strat))
                                continue;

                        char *debug_msg;
                        int ret = asprintf(&debug_msg,
                                           "Falling back to strategy '%s' "
                                           "for %s prog %d/%d",
                                           strategies[strat].name,
                                           vir_get_stage_name(c),
                                           c->program_id, c->variant_id);

                        if (ret >= 0) {
                                if (V3D_DBG(PERF))
                                        fprintf(stderr, "%s\n", debug_msg);

                                c->debug_output(debug_msg, c->debug_output_data);
                                free(debug_msg);
                        }

                        if (c != best_c)
                                vir_compile_destroy(c);
                }

                c = vir_compile_init(compiler, key, s,
                                     debug_output, debug_output_data,
                                     program_id, variant_id,
                                     strat, &strategies[strat],
                                     strat == ARRAY_SIZE(strategies) - 1);

                v3d_attempt_compile(c);

                /* Broken shader or driver bug */
                if (c->compilation_result == V3D_COMPILATION_FAILED)
                        break;

                /* If we compiled without spills, choose this.
                 * Otherwise if this is a 4-thread compile, choose this (these
                 * have a very low cap on the allowed TMU spills so we assume
                 * it will be better than a 2-thread compile without spills).
                 * Otherwise, keep going while tracking the strategy with the
                 * lowest spill count.
                 */
                if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
                        if (c->spills == 0 ||
                            strategies[strat].min_threads == 4 ||
                            V3D_DBG(OPT_COMPILE_TIME)) {
                                set_best_compile(&best_c, c);
                                break;
                        } else if (c->spills + c->fills <
                                   best_spill_fill_count) {
                                set_best_compile(&best_c, c);
                                best_spill_fill_count = c->spills + c->fills;
                        }

                        if (V3D_DBG(PERF)) {
                                char *debug_msg;
                                int ret = asprintf(&debug_msg,
                                                   "Compiled %s prog %d/%d with %d "
                                                   "spills and %d fills. Will try "
                                                   "more strategies.",
                                                   vir_get_stage_name(c),
                                                   c->program_id, c->variant_id,
                                                   c->spills, c->fills);
                                if (ret >= 0) {
                                        fprintf(stderr, "%s\n", debug_msg);
                                        c->debug_output(debug_msg, c->debug_output_data);
                                        free(debug_msg);
                                }
                        }
                }

                /* Only try the next strategy if we failed to register allocate
                 * or we had to spill.
                 */
                assert(c->compilation_result ==
                       V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
                       c->spills > 0);
        }

        /* If the best strategy was not the last, choose that */
        if (best_c && c != best_c)
                set_best_compile(&c, best_c);

        if (V3D_DBG(PERF) &&
            c->compilation_result !=
            V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
            c->spills > 0) {
                char *debug_msg;
                int ret = asprintf(&debug_msg,
                                   "Compiled %s prog %d/%d with %d "
                                   "spills and %d fills",
                                   vir_get_stage_name(c),
                                   c->program_id, c->variant_id,
                                   c->spills, c->fills);

                if (ret >= 0) {
                        fprintf(stderr, "%s\n", debug_msg);
                        c->debug_output(debug_msg, c->debug_output_data);
                        free(debug_msg);
                }
        }

        if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
                fprintf(stderr, "Failed to compile %s prog %d/%d "
                        "with any strategy.\n",
                        vir_get_stage_name(c), c->program_id, c->variant_id);

                vir_compile_destroy(c);
                return NULL;
        }

        struct v3d_prog_data *prog_data;

        prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));

        v3d_set_prog_data(c, prog_data);

        *out_prog_data = prog_data;

        char *shaderdb;
        int ret = v3d_shaderdb_dump(c, &shaderdb);
        if (ret >= 0) {
                if (V3D_DBG(SHADERDB))
                        fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);

                c->debug_output(shaderdb, c->debug_output_data);
                free(shaderdb);
        }

        return v3d_return_qpu_insts(c, final_assembly_size);
}

void
vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
{
        if (qinst->dst.file == QFILE_TEMP)
                c->defs[qinst->dst.index] = NULL;

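        /* Removing the instruction the cursor points at would leave the
         * cursor dangling (vir_compile_destroy clears the cursor before
         * tearing down blocks for this reason).
         */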
        assert(&qinst->link != c->cursor.link);

        list_del(&qinst->link);
        free(qinst);

        c->live_intervals_valid = false;
}

struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
        /* XXX
        int pack = reg.pack;

        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
               (c->defs[reg.index]->op == QOP_MOV ||
                c->defs[reg.index]->op == QOP_FMOV) &&
               !c->defs[reg.index]->dst.pack &&
               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

        reg.pack = pack;
        */
        return reg;
}

void
vir_compile_destroy(struct v3d_compile *c)
{
        /* Defuse the assert that we aren't removing the cursor's instruction.
         */
        c->cursor.link = NULL;

        vir_for_each_block(block, c) {
                while (!list_is_empty(&block->instructions)) {
                        struct qinst *qinst =
                                list_first_entry(&block->instructions,
                                                 struct qinst, link);
                        vir_remove_instruction(c, qinst);
                }
        }

        ralloc_free(c);
}

uint32_t
vir_get_uniform_index(struct v3d_compile *c,
                      enum quniform_contents contents,
                      uint32_t data)
{
        for (int i = 0; i < c->num_uniforms; i++) {
                if (c->uniform_contents[i] == contents &&
                    c->uniform_data[i] == data) {
                        return i;
                }
        }

        uint32_t uniform = c->num_uniforms++;

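        /* Grow the uniform arrays if needed: start at 16 entries and double
         * from there.
         */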
        if (uniform >= c->uniform_array_size) {
                c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
                                             c->uniform_array_size * 2);

                c->uniform_data = reralloc(c, c->uniform_data,
                                           uint32_t,
                                           c->uniform_array_size);
                c->uniform_contents = reralloc(c, c->uniform_contents,
                                               enum quniform_contents,
                                               c->uniform_array_size);
        }

        c->uniform_contents[uniform] = contents;
        c->uniform_data[uniform] = data;

        return uniform;
}

/* Looks back into the current block to find the ldunif that wrote the uniform
 * at the requested index. If it finds it, it returns true and writes the
 * destination register of the ldunif instruction to 'unif'.
 *
 * This can impact register pressure and end up leading to worse code, so we
 * limit the number of instructions we are willing to look back through to
 * strike a good balance.
 */
static bool
try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
{
        uint32_t count = 20;
        struct qinst *prev_inst = NULL;
        assert(c->cur_block);

#if MESA_DEBUG
        /* We can only reuse a uniform if it was emitted in the same block,
         * so callers must make sure the current instruction is being emitted
         * in the current block.
         */
        bool found = false;
        vir_for_each_inst(inst, c->cur_block) {
                if (&inst->link == c->cursor.link) {
                        found = true;
                        break;
                }
        }

        assert(found || &c->cur_block->instructions == c->cursor.link);
#endif

        list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
                                     &c->cur_block->instructions, link) {
                if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
                    inst->uniform == index) {
                        prev_inst = inst;
                        break;
                }

                if (--count == 0)
                        break;
        }

        if (!prev_inst)
                return false;

        /* Only reuse the ldunif result if it was written to a temp register,
         * otherwise there may be special restrictions (for example, ldunif
         * may write directly to unifa, which is a write-only register).
         */
        if (prev_inst->dst.file != QFILE_TEMP)
                return false;

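        /* Make sure nothing after the ldunif in this block overwrites its
         * destination before we reuse it.
         */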
        list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
                                 &c->cur_block->instructions, link) {
                if (inst->dst.file == prev_inst->dst.file &&
                    inst->dst.index == prev_inst->dst.index) {
                        return false;
                }
        }

        *unif = prev_inst->dst;
        return true;
}

struct qreg
vir_uniform(struct v3d_compile *c,
            enum quniform_contents contents,
            uint32_t data)
{
        const int num_uniforms = c->num_uniforms;
        const int index = vir_get_uniform_index(c, contents, data);

        /* If this is not the first time we see this uniform try to reuse the
         * result of the last ldunif that loaded it.
         */
        const bool is_new_uniform = num_uniforms != c->num_uniforms;
        if (!is_new_uniform && !c->disable_ldunif_opt) {
                struct qreg ldunif_dst;
                if (try_opt_ldunif(c, index, &ldunif_dst))
                        return ldunif_dst;
        }

        struct qinst *inst = vir_NOP(c);
        inst->qpu.sig.ldunif = true;
        inst->uniform = index;
        inst->dst = vir_get_temp(c);
        c->defs[inst->dst.index] = inst;
        return inst->dst;
}

#define OPTPASS(func)                                                   \
        do {                                                            \
                bool stage_progress = func(c);                          \
                if (stage_progress) {                                   \
                        progress = true;                                \
                        if (print_opt_debug) {                          \
                                fprintf(stderr,                         \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                   \
                        }                                               \
                        /*XXX vir_validate(c);*/                        \
                }                                                       \
        } while (0)

void
vir_optimize(struct v3d_compile *c)
{
        bool print_opt_debug = false;
        int pass = 1;

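        /* Run the peephole passes repeatedly until they stop making
         * progress.
         */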
        while (true) {
                bool progress = false;

                OPTPASS(vir_opt_copy_propagate);
                OPTPASS(vir_opt_redundant_flags);
                OPTPASS(vir_opt_dead_code);
                OPTPASS(vir_opt_small_immediates);
                OPTPASS(vir_opt_constant_alu);

                if (!progress)
                        break;

                pass++;
        }
}

const char *
vir_get_stage_name(struct v3d_compile *c)
{
        if (c->vs_key && c->vs_key->is_coord)
                return "MESA_SHADER_VERTEX_BIN";
        else if (c->gs_key && c->gs_key->is_coord)
                return "MESA_SHADER_GEOMETRY_BIN";
        else
                return gl_shader_stage_name(c->s->info.stage);
}

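/* One VPM sector here is V3D_CHANNELS x 8 32-bit words (512 bytes). */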
static inline uint32_t
compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
{
        assert(devinfo->vpm_size > 0);
        const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
        return devinfo->vpm_size / sector_size;
}

/* Computes various parameters affecting VPM memory configuration for programs
 * involving geometry shaders to ensure the program fits in memory and honors
 * requirements described in section "VPM usage" of the programming manual.
 */
static bool
compute_vpm_config_gs(struct v3d_device_info *devinfo,
                      struct v3d_vs_prog_data *vs,
                      struct v3d_gs_prog_data *gs,
                      struct vpm_config *vpm_cfg_out)
{
        const uint32_t A = vs->separate_segments ? 1 : 0;
        const uint32_t Ad = vs->vpm_input_size;
        const uint32_t Vd = vs->vpm_output_size;

        const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);

        /* Try to fit the program into our VPM memory budget by adjusting
         * configurable parameters iteratively. We do this in two phases:
         * the first phase tries to fit the program into the total available
         * VPM memory. If we succeed at that, then the second phase attempts
         * to fit the program into half of that budget so we can run bin and
         * render programs in parallel.
         */
        struct vpm_config vpm_cfg[2];
        struct vpm_config *final_vpm_cfg = NULL;
        uint32_t phase = 0;

        vpm_cfg[phase].As = 1;
        vpm_cfg[phase].Gs = 1;
        vpm_cfg[phase].Gd = gs->vpm_output_size;
        vpm_cfg[phase].gs_width = gs->simd_width;

        /* While there is a requirement that Vc >= [Vn / 16], this is
         * always the case when tessellation is not present because in that
         * case Vn can only be 6 at most (when input primitive is triangles
         * with adjacency).
         *
         * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
         * and Broadcom has not found it worth it to increase it beyond this
         * in general. Increasing Vc also increases VPM memory pressure which
         * can end up being detrimental for performance in some scenarios.
         */
        vpm_cfg[phase].Vc = 2;

        /* Gv is a constraint on the hardware to not exceed the
         * specified number of vertex segments per GS batch. If adding a
         * new primitive to a GS batch would result in a range of more
         * than Gv vertex segments being referenced by the batch, then
         * the hardware will flush the batch and start a new one. This
         * means that we can choose any value we want, we just need to
         * be aware that larger values improve GS batch utilization
         * at the expense of more VPM memory pressure (which can affect
         * other performance aspects, such as GS dispatch width).
         * We start with the largest value, and will reduce it if we
         * find that total memory pressure is too high.
         */
        vpm_cfg[phase].Gv = 3;
        do {
                /* When GS is present in the absence of TES, then we need to
                 * satisfy that Ve >= Gv. We go with the smallest value of Ve
                 * to avoid increasing memory pressure.
                 */
                vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;

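                /* VPM sectors used: attribute (input) segments
                 * (A * As * Ad) + vertex (output) segments ((Vc + Ve) * Vd) +
                 * GS output segments (Gs * Gd).
                 */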
                uint32_t vpm_sectors =
                        A * vpm_cfg[phase].As * Ad +
                        (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
                        vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;

                /* Ideally we want to use no more than half of the available
                 * memory so we can execute a bin and render program in
                 * parallel without stalls. If we achieved that then we are
                 * done.
                 */
                if (vpm_sectors <= vpm_size / 2) {
                        final_vpm_cfg = &vpm_cfg[phase];
                        break;
                }

                /* At the very least, we should not allocate more than the
                 * total available VPM memory. If we have a configuration that
                 * succeeds at this we save it and continue to see if we can
                 * meet the half-memory-use criteria too.
                 */
                if (phase == 0 && vpm_sectors <= vpm_size) {
                        vpm_cfg[1] = vpm_cfg[0];
                        phase = 1;
                }

                /* Try lowering Gv */
                if (vpm_cfg[phase].Gv > 0) {
                        vpm_cfg[phase].Gv--;
                        continue;
                }

                /* Try lowering GS dispatch width */
                if (vpm_cfg[phase].gs_width > 1) {
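                        /* Halving the dispatch width also halves the GS
                         * output segment size (Gd). A dispatch width of 2 is
                         * not a valid configuration (see the asserts below),
                         * so in that case we halve again down to 1.
                         */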
                        do {
                                vpm_cfg[phase].gs_width >>= 1;
                                vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
                        } while (vpm_cfg[phase].gs_width == 2);

                        /* Reset Gv to max after dropping dispatch width */
                        vpm_cfg[phase].Gv = 3;
                        continue;
                }

                /* We ran out of options to reduce memory pressure. If we
                 * are at phase 1 we have at least a valid configuration, so
                 * we use that.
                 */
                if (phase == 1)
                        final_vpm_cfg = &vpm_cfg[0];
                break;
        } while (true);

        if (!final_vpm_cfg)
                return false;

        assert(final_vpm_cfg);
        assert(final_vpm_cfg->Gd <= 16);
        assert(final_vpm_cfg->Gv < 4);
        assert(final_vpm_cfg->Ve < 4);
        assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
        assert(final_vpm_cfg->gs_width == 1 ||
               final_vpm_cfg->gs_width == 4 ||
               final_vpm_cfg->gs_width == 8 ||
               final_vpm_cfg->gs_width == 16);

        *vpm_cfg_out = *final_vpm_cfg;
        return true;
}

bool
v3d_compute_vpm_config(struct v3d_device_info *devinfo,
                       struct v3d_vs_prog_data *vs_bin,
                       struct v3d_vs_prog_data *vs,
                       struct v3d_gs_prog_data *gs_bin,
                       struct v3d_gs_prog_data *gs,
                       struct vpm_config *vpm_cfg_bin,
                       struct vpm_config *vpm_cfg)
{
        assert(vs && vs_bin);
        assert((gs != NULL) == (gs_bin != NULL));

        if (!gs) {
                vpm_cfg_bin->As = 1;
                vpm_cfg_bin->Ve = 0;
                vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;

                vpm_cfg->As = 1;
                vpm_cfg->Ve = 0;
                vpm_cfg->Vc = vs->vcm_cache_size;
        } else {
                if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
                        return false;

                if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
                        return false;
        }

        return true;
}
