/*
 * Copyright © 2016-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "broadcom/common/v3d_device_info.h"
#include "v3d_compiler.h"
#include "compiler/nir/nir_schedule.h"
#include "compiler/nir/nir_builder.h"

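/**
 * Returns the number of sources the instruction reads: 0 for branches,
 * otherwise the source count of whichever ALU op (add or mul) is not a NOP.
 */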
int
vir_get_nsrc(struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return 0;
        case V3D_QPU_INSTR_TYPE_ALU:
                if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
                        return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
                else
                        return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
        }

        return 0;
}

/**
 * Returns whether the instruction has any side effects that must be
 * preserved.
 */
bool
vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return true;
        case V3D_QPU_INSTR_TYPE_ALU:
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_SETREVF:
                case V3D_QPU_A_SETMSF:
                case V3D_QPU_A_VPMSETUP:
                case V3D_QPU_A_STVPMV:
                case V3D_QPU_A_STVPMD:
                case V3D_QPU_A_STVPMP:
                case V3D_QPU_A_VPMWT:
                case V3D_QPU_A_TMUWT:
                        return true;
                default:
                        break;
                }

                switch (inst->qpu.alu.mul.op) {
                case V3D_QPU_M_MULTOP:
                        return true;
                default:
                        break;
                }
        }

        if (inst->qpu.sig.ldtmu ||
            inst->qpu.sig.ldvary ||
            inst->qpu.sig.ldtlbu ||
            inst->qpu.sig.ldtlb ||
            inst->qpu.sig.wrtmuc ||
            inst->qpu.sig.thrsw) {
                return true;
        }

        /* ldunifa works like ldunif: it reads an element and advances the
         * pointer, so each read has a side effect and allowing DCE to remove
         * one would break follow-up loads. (We don't care about ldunif
         * because we reconstruct the uniform stream buffer after compiling
         * with only the surviving uniforms.) We could fix this by emitting a
         * unifa for each ldunifa, but each unifa requires 3 delay slots
         * before a ldunifa, so that would be quite expensive.
         */
        if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
                return true;

        return false;
}

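/**
 * Returns whether the instruction is a MOV/FMOV with no pack/unpack modes
 * and no condition, i.e. a raw copy of its source.
 */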
bool
vir_is_raw_mov(struct qinst *inst)
{
        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
            (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
             inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
                return false;
        }

        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
                return false;
        }

        if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
                return false;
        }

        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
            inst->qpu.flags.mc != V3D_QPU_COND_NONE)
                return false;

        return true;
}

bool
vir_is_add(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.add.op != V3D_QPU_A_NOP);
}

bool
vir_is_mul(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
}

bool
vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        if (inst->dst.file == QFILE_MAGIC)
                return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);

        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                return true;
        }

        return false;
}

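/**
 * Returns whether the instruction implicitly writes the r4 accumulator:
 * on devices with accumulators, the SFU writes (RECIP, RSQRT, EXP, LOG, SIN)
 * land their result in r4.
 */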
bool
vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
                         struct qinst *inst)
{
        if (!devinfo->has_accumulators)
                return false;

        switch (inst->dst.file) {
        case QFILE_MAGIC:
                switch (inst->dst.index) {
                case V3D_QPU_WADDR_RECIP:
                case V3D_QPU_WADDR_RSQRT:
                case V3D_QPU_WADDR_EXP:
                case V3D_QPU_WADDR_LOG:
                case V3D_QPU_WADDR_SIN:
                        return true;
                }
                break;
        default:
                break;
        }

        return false;
}

void
vir_set_unpack(struct qinst *inst, int src,
               enum v3d_qpu_input_unpack unpack)
{
        assert(src == 0 || src == 1);

        if (vir_is_add(inst)) {
                if (src == 0)
                        inst->qpu.alu.add.a.unpack = unpack;
                else
                        inst->qpu.alu.add.b.unpack = unpack;
        } else {
                assert(vir_is_mul(inst));
                if (src == 0)
                        inst->qpu.alu.mul.a.unpack = unpack;
                else
                        inst->qpu.alu.mul.b.unpack = unpack;
        }
}

void
vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
{
        if (vir_is_add(inst)) {
                inst->qpu.alu.add.output_pack = pack;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.alu.mul.output_pack = pack;
        }
}

void
vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.ac = cond;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mc = cond;
        }
}

enum v3d_qpu_cond
vir_get_cond(struct qinst *inst)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);

        if (vir_is_add(inst))
                return inst->qpu.flags.ac;
        else if (vir_is_mul(inst))
                return inst->qpu.flags.mc;
        else /* NOP */
                return V3D_QPU_COND_NONE;
}

void
vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
{
        c->flags_temp = -1;
        if (vir_is_add(inst)) {
                inst->qpu.flags.apf = pf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mpf = pf;
        }
}

void
vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
{
        c->flags_temp = -1;
        if (vir_is_add(inst)) {
                inst->qpu.flags.auf = uf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.muf = uf;
        }
}

#if 0
uint8_t
vir_channels_written(struct qinst *inst)
{
        if (vir_is_mul(inst)) {
                switch (inst->dst.pack) {
                case QPU_PACK_MUL_NOP:
                case QPU_PACK_MUL_8888:
                        return 0xf;
                case QPU_PACK_MUL_8A:
                        return 0x1;
                case QPU_PACK_MUL_8B:
                        return 0x2;
                case QPU_PACK_MUL_8C:
                        return 0x4;
                case QPU_PACK_MUL_8D:
                        return 0x8;
                }
        } else {
                switch (inst->dst.pack) {
                case QPU_PACK_A_NOP:
                case QPU_PACK_A_8888:
                case QPU_PACK_A_8888_SAT:
                case QPU_PACK_A_32_SAT:
                        return 0xf;
                case QPU_PACK_A_8A:
                case QPU_PACK_A_8A_SAT:
                        return 0x1;
                case QPU_PACK_A_8B:
                case QPU_PACK_A_8B_SAT:
                        return 0x2;
                case QPU_PACK_A_8C:
                case QPU_PACK_A_8C_SAT:
                        return 0x4;
                case QPU_PACK_A_8D:
                case QPU_PACK_A_8D_SAT:
                        return 0x8;
                case QPU_PACK_A_16A:
                case QPU_PACK_A_16A_SAT:
                        return 0x3;
                case QPU_PACK_A_16B:
                case QPU_PACK_A_16B_SAT:
                        return 0xc;
                }
        }
        unreachable("Bad pack field");
}
#endif

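/**
 * Allocates a new temporary register index, growing the defs and spillable
 * arrays as needed. New temps start out marked as spillable.
 */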
struct qreg
vir_get_temp(struct v3d_compile *c)
{
        struct qreg reg;

        reg.file = QFILE_TEMP;
        reg.index = c->num_temps++;

        if (c->num_temps > c->defs_array_size) {
                uint32_t old_size = c->defs_array_size;
                c->defs_array_size = MAX2(old_size * 2, 16);

                c->defs = reralloc(c, c->defs, struct qinst *,
                                   c->defs_array_size);
                memset(&c->defs[old_size], 0,
                       sizeof(c->defs[0]) * (c->defs_array_size - old_size));

                c->spillable = reralloc(c, c->spillable,
                                        BITSET_WORD,
                                        BITSET_WORDS(c->defs_array_size));
                for (int i = old_size; i < c->defs_array_size; i++)
                        BITSET_SET(c->spillable, i);
        }

        return reg;
}

struct qinst *
vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.add.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        inst->ip = -1;

        return inst;
}

struct qinst *
vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.mul.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        inst->ip = -1;

        return inst;
}

struct qinst *
vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
        inst->qpu.branch.cond = cond;
        inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
        inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
        inst->qpu.branch.ub = true;
        inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;

        inst->dst = vir_nop_reg();
        inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);

        inst->ip = -1;

        return inst;
}

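/* Inserts the instruction at the current cursor position, leaves the cursor
 * pointing right after it, and invalidates the live intervals.
 */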
static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
        inst->ip = -1;

        switch (c->cursor.mode) {
        case vir_cursor_add:
                list_add(&inst->link, c->cursor.link);
                break;
        case vir_cursor_addtail:
                list_addtail(&inst->link, c->cursor.link);
                break;
        }

        c->cursor = vir_after_inst(inst);
        c->live_intervals_valid = false;
}

/* Updates inst to write to a new temporary, emits it, and notes the def. */
struct qreg
vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
        assert(inst->dst.file == QFILE_NULL);

        /* If we're emitting an instruction that's a def, it had better be
         * writing a register.
         */
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
        }

        inst->dst = vir_get_temp(c);

        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = inst;

        vir_emit(c, inst);

        return inst->dst;
}

struct qinst *
vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = NULL;

        vir_emit(c, inst);

        return inst;
}

struct qblock *
vir_new_block(struct v3d_compile *c)
{
        struct qblock *block = rzalloc(c, struct qblock);

        list_inithead(&block->instructions);

        block->predecessors = _mesa_set_create(block,
                                               _mesa_hash_pointer,
                                               _mesa_key_pointer_equal);

        block->index = c->next_block_index++;

        return block;
}

void
vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
{
        c->cur_block = block;
        c->cursor = vir_after_block(block);
        list_addtail(&block->link, &c->blocks);
}

struct qblock *
vir_entry_block(struct v3d_compile *c)
{
        return list_first_entry(&c->blocks, struct qblock, link);
}

struct qblock *
vir_exit_block(struct v3d_compile *c)
{
        return list_last_entry(&c->blocks, struct qblock, link);
}

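/* Adds a CFG edge from predecessor to successor. A block can have at most
 * two successors.
 */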
void
vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
{
        _mesa_set_add(successor->predecessors, predecessor);
        if (predecessor->successors[0]) {
                assert(!predecessor->successors[1]);
                predecessor->successors[1] = successor;
        } else {
                predecessor->successors[0] = successor;
        }
}

const struct v3d_compiler *
v3d_compiler_init(const struct v3d_device_info *devinfo,
                  uint32_t max_inline_uniform_buffers)
{
        struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
        if (!compiler)
                return NULL;

        compiler->devinfo = devinfo;
        compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;

        if (!vir_init_reg_sets(compiler)) {
                ralloc_free(compiler);
                return NULL;
        }

        return compiler;
}

void
v3d_compiler_free(const struct v3d_compiler *compiler)
{
        ralloc_free((void *)compiler);
}

struct v3d_compiler_strategy {
        const char *name;
        uint32_t max_threads;
        uint32_t min_threads;
        bool disable_general_tmu_sched;
        bool disable_gcm;
        bool disable_loop_unrolling;
        bool disable_ubo_load_sorting;
        bool move_buffer_loads;
        bool disable_tmu_pipelining;
        uint32_t max_tmu_spills;
};

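/* Creates the per-shader compile context: clones the NIR shader into the
 * context and applies the knobs selected by the given compiler strategy.
 */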
static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
                 struct v3d_key *key,
                 nir_shader *s,
                 void (*debug_output)(const char *msg,
                                      void *debug_output_data),
                 void *debug_output_data,
                 int program_id, int variant_id,
                 uint32_t compile_strategy_idx,
                 const struct v3d_compiler_strategy *strategy,
                 bool fallback_scheduler)
{
        struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);

        c->compiler = compiler;
        c->devinfo = compiler->devinfo;
        c->key = key;
        c->program_id = program_id;
        c->variant_id = variant_id;
        c->compile_strategy_idx = compile_strategy_idx;
        c->threads = strategy->max_threads;
        c->debug_output = debug_output;
        c->debug_output_data = debug_output_data;
        c->compilation_result = V3D_COMPILATION_SUCCEEDED;
        c->min_threads_for_reg_alloc = strategy->min_threads;
        c->max_tmu_spills = strategy->max_tmu_spills;
        c->fallback_scheduler = fallback_scheduler;
        c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
        c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
        c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
        c->move_buffer_loads = strategy->move_buffer_loads;
        c->disable_gcm = strategy->disable_gcm;
        c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
                ? true : strategy->disable_loop_unrolling;

        s = nir_shader_clone(c, s);
        c->s = s;

        list_inithead(&c->blocks);
        vir_set_emit_block(c, vir_new_block(c));

        c->output_position_index = -1;
        c->output_sample_mask_index = -1;

        c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
                                            _mesa_key_pointer_equal);

        c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
        c->flags_temp = -1;

        return c;
}

588 
589 static int
type_size_vec4(const struct glsl_type * type,bool bindless)590 type_size_vec4(const struct glsl_type *type, bool bindless)
591 {
592         return glsl_count_attribute_slots(type, false);
593 }
594 
595 static enum nir_lower_tex_packing
lower_tex_packing_cb(const nir_tex_instr * tex,const void * data)596 lower_tex_packing_cb(const nir_tex_instr *tex, const void *data)
597 {
598    struct v3d_compile *c = (struct v3d_compile *) data;
599 
600    int sampler_index = nir_tex_instr_need_sampler(tex) ?
601       tex->sampler_index : tex->backend_flags;
602 
603    assert(sampler_index < c->key->num_samplers_used);
604    return c->key->sampler[sampler_index].return_size == 16 ?
605       nir_lower_tex_packing_16 : nir_lower_tex_packing_none;
606 }
607 
static bool
v3d_nir_lower_null_pointers_cb(nir_builder *b,
                               nir_intrinsic_instr *intr,
                               void *_state)
{
        uint32_t buffer_src_idx;

        switch (intr->intrinsic) {
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_ssbo:
                buffer_src_idx = 0;
                break;
        case nir_intrinsic_store_ssbo:
                buffer_src_idx = 1;
                break;
        default:
                return false;
        }

        /* If the index is constant we are good */
        nir_src *src = &intr->src[buffer_src_idx];
        if (nir_src_is_const(*src))
                return false;

        /* Otherwise, see if it comes from a bcsel including a null pointer */
        if (src->ssa->parent_instr->type != nir_instr_type_alu)
                return false;

        nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr);
        if (alu->op != nir_op_bcsel)
                return false;

        /* A null pointer is specified using block index 0xffffffff */
        int32_t null_src_idx = -1;
        for (int i = 1; i < 3; i++) {
                /* FIXME: since we are running this before optimization, maybe
                 * we also need to handle the case where there is a bcsel
                 * chain that we would have to recurse into?
                 */
                if (!nir_src_is_const(alu->src[i].src))
                        continue;
                if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff)
                        continue;

                /* One of the bcsel srcs is a null pointer reference */
                null_src_idx = i;
                break;
        }

        if (null_src_idx < 0)
                return false;

        assert(null_src_idx == 1 || null_src_idx == 2);
        int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1;

        /* Rewrite the null pointer reference so we use the same buffer index
         * as the other bcsel branch. This will allow optimization to remove
         * the bcsel and we should then end up with a constant buffer index
         * like we need.
         */
        b->cursor = nir_before_instr(&alu->instr);
        nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa);
        nir_src_rewrite(&alu->src[null_src_idx].src, copy);

        return true;
}

static bool
v3d_nir_lower_null_pointers(nir_shader *s)
{
        return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb,
                                          nir_metadata_control_flow, NULL);
}

static unsigned
lower_bit_size_cb(const nir_instr *instr, void *_data)
{
        if (instr->type != nir_instr_type_alu)
                return 0;

        nir_alu_instr *alu = nir_instr_as_alu(instr);

        switch (alu->op) {
        case nir_op_mov:
        case nir_op_vec2:
        case nir_op_vec3:
        case nir_op_vec4:
        case nir_op_vec5:
        case nir_op_vec8:
        case nir_op_vec16:
        case nir_op_b2i8:
        case nir_op_b2f16:
        case nir_op_b2i16:
        case nir_op_b2f32:
        case nir_op_b2i32:
        case nir_op_f2f16:
        case nir_op_f2f16_rtne:
        case nir_op_f2f16_rtz:
        case nir_op_f2f32:
        case nir_op_f2i32:
        case nir_op_f2u32:
        case nir_op_i2i8:
        case nir_op_i2i16:
        case nir_op_i2f16:
        case nir_op_i2f32:
        case nir_op_i2i32:
        case nir_op_u2u8:
        case nir_op_u2u16:
        case nir_op_u2f16:
        case nir_op_u2f32:
        case nir_op_u2u32:
        case nir_op_pack_32_2x16_split:
        case nir_op_pack_32_4x8_split:
        case nir_op_pack_half_2x16_split:
                return 0;

        /* Any other ALU op only works with 32 bits, so promote smaller
         * (non-boolean) bit sizes here.
         */
        default:
                if (alu->src[0].src.ssa->bit_size != 1 && alu->src[0].src.ssa->bit_size < 32)
                        return 32;
                return 0;
        }
}

static void
v3d_lower_nir(struct v3d_compile *c)
{
        struct nir_lower_tex_options tex_options = {
                .lower_txd = true,
                .lower_tg4_offsets = true,
                .lower_tg4_broadcom_swizzle = true,

                .lower_rect = false, /* XXX: Use this on V3D 3.x */
                .lower_txp = ~0,
                /* Apply swizzles to all samplers. */
                .swizzle_result = ~0,
                .lower_invalid_implicit_lod = true,
        };

        /* Lower the format swizzle and (for 32-bit returns)
         * ARB_texture_swizzle-style swizzle.
         */
        assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
        for (int i = 0; i < c->key->num_tex_used; i++) {
                for (int j = 0; j < 4; j++)
                        tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
        }

        tex_options.lower_tex_packing_cb = lower_tex_packing_cb;
        tex_options.lower_tex_packing_data = c;

        NIR_PASS(_, c->s, nir_lower_tex, &tex_options);
        NIR_PASS(_, c->s, nir_lower_system_values);

        if (c->s->info.zero_initialize_shared_memory &&
            c->s->info.shared_size > 0) {
                /* All our BOs allocate full pages, so the underlying allocation
                 * for shared memory will always be a multiple of 4KB. This
                 * ensures that we can do an exact number of full chunk_size
                 * writes to initialize the memory independently of the actual
                 * shared_size used by the shader, which is a requirement of
                 * the initialization pass.
                 */
                const unsigned chunk_size = 16; /* max single store size */
                NIR_PASS(_, c->s, nir_zero_initialize_shared_memory,
                         align(c->s->info.shared_size, chunk_size), chunk_size);
        }

        NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL);
        NIR_PASS(_, c->s, nir_lower_is_helper_invocation);
        NIR_PASS(_, c->s, v3d_nir_lower_null_pointers);
        NIR_PASS(_, c->s, nir_lower_bit_size, lower_bit_size_cb, NULL);
}

static void
v3d_set_prog_data_uniforms(struct v3d_compile *c,
                           struct v3d_prog_data *prog_data)
{
        int count = c->num_uniforms;
        struct v3d_uniform_list *ulist = &prog_data->uniforms;

        ulist->count = count;
        ulist->data = ralloc_array(prog_data, uint32_t, count);
        memcpy(ulist->data, c->uniform_data,
               count * sizeof(*ulist->data));
        ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
        memcpy(ulist->contents, c->uniform_contents,
               count * sizeof(*ulist->contents));
}

static void
v3d_vs_set_prog_data(struct v3d_compile *c,
                     struct v3d_vs_prog_data *prog_data)
{
        /* The vertex data gets format converted by the VPM so that
         * each attribute channel takes up a VPM column.  Precompute
         * the sizes for the shader record.
         */
        for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
                prog_data->vattr_sizes[i] = c->vattr_sizes[i];
                prog_data->vpm_input_size += c->vattr_sizes[i];
        }

        memset(prog_data->driver_location_map, -1,
               sizeof(prog_data->driver_location_map));

        nir_foreach_shader_in_variable(var, c->s) {
                prog_data->driver_location_map[var->data.location] =
                        var->data.driver_location;
        }

        prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_VERTEX_ID) ||
                              BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);

        prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
                                           SYSTEM_VALUE_BASE_INSTANCE);

        prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_INSTANCE_ID) ||
                              BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_INSTANCE_INDEX);

        if (prog_data->uses_vid)
                prog_data->vpm_input_size++;
        if (prog_data->uses_biid)
                prog_data->vpm_input_size++;
        if (prog_data->uses_iid)
                prog_data->vpm_input_size++;

        prog_data->writes_psiz =
            c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);

        /* Input/output segment sizes are in sectors (8 rows of 32 bits per
         * channel).
         */
        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;

        /* Set us up for shared input/output segments.  This is apparently
         * necessary for our VCM setup to avoid varying corruption.
         *
         * FIXME: initial testing on V3D 7.1 seems to work fine when using
         * separate segments. So we could try to reevaluate in the future, if
         * there is any advantage of using separate segments.
         */
        prog_data->separate_segments = false;
        prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
                                          prog_data->vpm_input_size);
        prog_data->vpm_input_size = 0;

        /* Compute VCM cache size.  We set up our program to take up less than
         * half of the VPM, so that any set of bin and render programs won't
         * run out of space.  We need space for at least one input segment,
         * and then allocate the rest to output segments (one for the current
         * program, the rest to VCM).  The valid range of the VCM cache size
         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
         * batches.
         */
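        /* As a hypothetical worked example (numbers not taken from a real
         * part): with V3D_CHANNELS == 16, a sector is 16 * 4 * 8 = 512
         * bytes, so an 8 KB VPM gives 16 sectors and half_vpm = 8. With the
         * input segment folded into the output above, a 1-sector output
         * segment yields 8 output batches and a VCM cache size of
         * CLAMP(8 - 1, 2, 4) = 4.
         */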
        assert(c->devinfo->vpm_size);
        int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
        int half_vpm = vpm_size_in_sectors / 2;
        int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
        int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
        assert(vpm_output_batches >= 2);
        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
}

static void
v3d_gs_set_prog_data(struct v3d_compile *c,
                     struct v3d_gs_prog_data *prog_data)
{
        prog_data->num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        /* gl_PrimitiveIdIn is written by the GBG into the first word of the
         * VPM output header automatically and the shader will overwrite
         * it after reading it if necessary, so it doesn't add to the VPM
         * size requirements.
         */
        prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_PRIMITIVE_ID);

        /* Output segment size is in sectors (8 rows of 32 bits per channel) */
        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;

        /* Compute SIMD dispatch width and update VPM output size accordingly
         * to ensure we can fit our program in memory. Available widths are
         * 16, 8, 4, 1.
         *
         * Notice that at draw time we will have to consider VPM memory
         * requirements from other stages and choose a smaller dispatch
         * width if needed to fit the program in VPM memory.
         */
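        /* For example, with hypothetical numbers: a vpm_output_size of 24
         * sectors is over the 16-sector limit at width 16, so the first
         * iteration drops to width 8 and halves the size to 12 sectors,
         * which fits (and 8 is not the disallowed width of 2), so the loop
         * stops there.
         */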
        prog_data->simd_width = 16;
        while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
               prog_data->simd_width == 2) {
                prog_data->simd_width >>= 1;
                prog_data->vpm_output_size =
                        align(prog_data->vpm_output_size, 2) / 2;
        }
        assert(prog_data->vpm_output_size <= 16);
        assert(prog_data->simd_width != 2);

        prog_data->out_prim_type = c->s->info.gs.output_primitive;
        prog_data->num_invocations = c->s->info.gs.invocations;

        prog_data->writes_psiz =
            c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
}

static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                            struct v3d_fs_prog_data *prog_data)
{
        prog_data->num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
                      (V3D_MAX_FS_INPUTS - 1) / 24);
        for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
                if (BITSET_TEST(c->flat_shade_flags, i))
                        prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->noperspective_flags, i))
                        prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->centroid_flags, i))
                        prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
        }
}

static void
v3d_fs_set_prog_data(struct v3d_compile *c,
                     struct v3d_fs_prog_data *prog_data)
{
        v3d_set_fs_prog_data_inputs(c, prog_data);
        prog_data->writes_z = c->writes_z;
        prog_data->writes_z_from_fep = c->writes_z_from_fep;
        prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
        prog_data->uses_center_w = c->uses_center_w;
        prog_data->uses_implicit_point_line_varyings =
                c->uses_implicit_point_line_varyings;
        prog_data->lock_scoreboard_on_first_thrsw =
                c->lock_scoreboard_on_first_thrsw;
        prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading;
        prog_data->uses_pid = c->fs_uses_primitive_id;
}

static void
v3d_cs_set_prog_data(struct v3d_compile *c,
                     struct v3d_compute_prog_data *prog_data)
{
        prog_data->shared_size = c->s->info.shared_size;

        prog_data->local_size[0] = c->s->info.workgroup_size[0];
        prog_data->local_size[1] = c->s->info.workgroup_size[1];
        prog_data->local_size[2] = c->s->info.workgroup_size[2];

        prog_data->has_subgroups = c->has_subgroups;
}

static void
v3d_set_prog_data(struct v3d_compile *c,
                  struct v3d_prog_data *prog_data)
{
        prog_data->threads = c->threads;
        prog_data->single_seg = !c->last_thrsw;
        prog_data->spill_size = c->spill_size;
        prog_data->tmu_spills = c->spills;
        prog_data->tmu_fills = c->fills;
        prog_data->tmu_count = c->tmu.total_count;
        prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
        prog_data->compile_strategy_idx = c->compile_strategy_idx;
        prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
        prog_data->has_control_barrier = c->s->info.uses_control_barrier;
        prog_data->has_global_address = c->has_global_address;

        v3d_set_prog_data_uniforms(c, prog_data);

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
                break;
        case MESA_SHADER_COMPUTE:
                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
                break;
        default:
                unreachable("unsupported shader stage");
        }
}

static uint64_t *
v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
{
        *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);

        uint64_t *qpu_insts = malloc(*final_assembly_size);
        if (!qpu_insts)
                return NULL;

        memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);

        vir_compile_destroy(c);

        return qpu_insts;
}

static void
v3d_nir_lower_vs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
                 nir_var_shader_in | nir_var_shader_out);
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
                int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
                int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS(_, c->s, nir_remove_unused_io_vars,
                 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c, c->s);
        NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
        if (c->vs_key->per_vertex_point_size)
                NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                 type_size_vec4,
                 (nir_lower_io_options)0);
        /* Clean up nir_lower_io's deref_var leftovers and do a constant
         * folding pass on the code it generated.
         */
        NIR_PASS(_, c->s, nir_opt_dce);
        NIR_PASS(_, c->s, nir_opt_constant_folding);
}

static void
v3d_nir_lower_gs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
                 nir_var_shader_in | nir_var_shader_out);
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
                int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
                int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS(_, c->s, nir_remove_unused_io_vars,
                 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c, c->s);
        NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
        if (c->gs_key->per_vertex_point_size)
                NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                 type_size_vec4,
                 (nir_lower_io_options)0);
        /* Clean up nir_lower_io's deref_var leftovers and do a constant
         * folding pass on the code it generated.
         */
        NIR_PASS(_, c->s, nir_opt_dce);
        NIR_PASS(_, c->s, nir_opt_constant_folding);
}

static void
v3d_fixup_fs_output_types(struct v3d_compile *c)
{
        nir_foreach_shader_out_variable(var, c->s) {
                uint32_t mask = 0;

                switch (var->data.location) {
                case FRAG_RESULT_COLOR:
                        mask = ~0;
                        break;
                case FRAG_RESULT_DATA0:
                case FRAG_RESULT_DATA1:
                case FRAG_RESULT_DATA2:
                case FRAG_RESULT_DATA3:
                        mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
                        break;
                }

                if (c->fs_key->int_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_INT,
                                                 glsl_get_components(var->type));
                } else if (c->fs_key->uint_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_UINT,
                                                 glsl_get_components(var->type));
                }
        }
}

static void
v3d_nir_lower_fs_early(struct v3d_compile *c)
{
        if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
                v3d_fixup_fs_output_types(c);

        NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);

        if (c->fs_key->line_smoothing) {
                NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
                NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
                /* The lowering pass can introduce new sysval reads */
                nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
        }
}

static void
v3d_nir_lower_gs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
                         true, NULL);
        }

        /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}

static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
                         false, true, NULL);
                NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                           nir_var_shader_out, NULL, NULL);
        }

        /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}

static void
v3d_nir_lower_fs_late(struct v3d_compile *c)
{
        /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
         * Vulkan allows it, in which case the SPIR-V compiler will declare
         * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
         * the last parameter to always operate with a compact array in both
         * OpenGL and Vulkan so we don't have to care about which API we
         * are using.
         */
        if (c->key->ucp_enables)
                NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true);

        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
}

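/**
 * Returns the maximum number of temporaries simultaneously live at any
 * instruction, computed from the temp_start/temp_end live intervals.
 */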
static uint32_t
vir_get_max_temps(struct v3d_compile *c)
{
        int max_ip = 0;
        vir_for_each_inst_inorder(inst, c)
                max_ip++;

        uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);

        for (int t = 0; t < c->num_temps; t++) {
                for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
                                                i < max_ip); i++) {
                        if (i > max_ip)
                                break;
                        pressure[i]++;
                }
        }

        uint32_t max_temps = 0;
        for (int i = 0; i < max_ip; i++)
                max_temps = MAX2(max_temps, pressure[i]);

        ralloc_free(pressure);

        return max_temps;
}

enum v3d_dependency_class {
        V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
};

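/* nir_schedule dependency callback: on geometry shaders, keeps stores to
 * output offset 0 ordered against load_primitive_id, since writing that
 * location overwrites the gl_PrimitiveID value passed in to the shader.
 */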
static bool
v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
                            nir_schedule_dependency *dep,
                            void *user_data)
{
        struct v3d_compile *c = user_data;

        switch (intr->intrinsic) {
        case nir_intrinsic_store_output:
                /* Writing to location 0 overwrites the value passed in for
                 * gl_PrimitiveID on geometry shaders
                 */
                if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
                    nir_intrinsic_base(intr) != 0)
                        break;

                nir_const_value *const_value =
                        nir_src_as_const_value(intr->src[1]);

                if (const_value == NULL)
                        break;

                uint64_t offset =
                        nir_const_value_as_uint(*const_value,
                                                nir_src_bit_size(intr->src[1]));
                if (offset != 0)
                        break;

                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
                dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
                return true;

        case nir_intrinsic_load_primitive_id:
                if (c->s->info.stage != MESA_SHADER_GEOMETRY)
                        break;

                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
                dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
                return true;

        default:
                break;
        }

        return false;
}

static unsigned
v3d_instr_delay_cb(nir_instr *instr, void *data)
{
   struct v3d_compile *c = (struct v3d_compile *) data;

   switch (instr->type) {
   case nir_instr_type_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
      return 1;

   /* We should not use very large delays for TMU instructions. Typically,
    * thread switches will be sufficient to hide all or most of the latency,
    * so we only need a little bit of extra room. If we over-estimate the
    * latency here we may end up unnecessarily delaying the critical path in
    * the shader, which would have a negative effect on performance, so here
    * we are trying to strike a balance based on empirical testing.
    */
   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
      if (!c->disable_general_tmu_sched) {
         switch (intr->intrinsic) {
         case nir_intrinsic_decl_reg:
         case nir_intrinsic_load_reg:
         case nir_intrinsic_store_reg:
            return 0;
         case nir_intrinsic_load_ssbo:
         case nir_intrinsic_load_scratch:
         case nir_intrinsic_load_shared:
         case nir_intrinsic_image_load:
            return 3;
         case nir_intrinsic_load_ubo:
            if (nir_src_is_divergent(intr->src[1]))
               return 3;
            FALLTHROUGH;
         default:
            return 1;
         }
      } else {
         switch (intr->intrinsic) {
         case nir_intrinsic_decl_reg:
         case nir_intrinsic_load_reg:
         case nir_intrinsic_store_reg:
            return 0;
         default:
            return 1;
         }
      }
      break;
   }

   case nir_instr_type_tex:
      return 5;

   case nir_instr_type_debug_info:
      return 0;
   }

   return 0;
}

static bool
should_split_wrmask(const nir_instr *instr, const void *data)
{
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
        switch (intr->intrinsic) {
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_global:
        case nir_intrinsic_store_scratch:
                return true;
        default:
                return false;
        }
}

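/* Returns the intrinsic if inst is a load_ubo with a constant offset (its
 * UBO index is asserted to be constant), or NULL otherwise.
 */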
static nir_intrinsic_instr *
nir_instr_as_constant_ubo_load(nir_instr *inst)
{
        if (inst->type != nir_instr_type_intrinsic)
                return NULL;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
        if (intr->intrinsic != nir_intrinsic_load_ubo)
                return NULL;

        assert(nir_src_is_const(intr->src[0]));
        if (!nir_src_is_const(intr->src[1]))
                return NULL;

        return intr;
}

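/* Moves constant UBO loads that use the same UBO index as 'ref' next to it,
 * ordered by offset, so the backend can emit longer unifa sequences and skip
 * redundant unifa writes.
 */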
1358 static bool
v3d_nir_sort_constant_ubo_load(nir_block * block,nir_intrinsic_instr * ref)1359 v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
1360 {
1361         bool progress = false;
1362 
1363         nir_instr *ref_inst = &ref->instr;
1364         uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
1365         uint32_t ref_index = nir_src_as_uint(ref->src[0]);
1366 
1367         /* Go through all instructions after ref searching for constant UBO
1368          * loads for the same UBO index.
1369          */
1370         bool seq_break = false;
1371         nir_instr *inst = &ref->instr;
1372         nir_instr *next_inst = NULL;
1373         while (true) {
1374                 inst = next_inst ? next_inst : nir_instr_next(inst);
1375                 if (!inst)
1376                         break;
1377 
1378                 next_inst = NULL;
1379 
1380                 if (inst->type != nir_instr_type_intrinsic)
1381                         continue;
1382 
1383                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1384                 if (intr->intrinsic != nir_intrinsic_load_ubo)
1385                         continue;
1386 
1387                 /* We only produce unifa sequences for non-divergent loads */
1388                 if (nir_src_is_divergent(intr->src[1]))
1389                         continue;
1390 
1391                 /* Any UBO load that is not constant, or that uses a
1392                  * different UBO index, in between the reference load and
1393                  * another constant load for the same index would break the
1394                  * unifa sequence. We flag that case so that we can move all
1395                  * constant UBO loads for the reference index before these
1396                  * breakers, not just the out-of-order ones, to avoid
1397                  * breaking the sequence and to reduce unifa writes.
1398                  */
1399                 if (!nir_src_is_const(intr->src[1])) {
1400                         seq_break = true;
1401                         continue;
1402                 }
1403                 uint32_t offset = nir_src_as_uint(intr->src[1]);
1404 
1405                 assert(nir_src_is_const(intr->src[0]));
1406                 uint32_t index = nir_src_as_uint(intr->src[0]);
1407                 if (index != ref_index) {
1408                        seq_break = true;
1409                        continue;
1410                 }
1411 
1412                 /* Only move loads with an offset that is close enough to the
1413                  * reference offset, since otherwise we would not be able to
1414                  * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
1415                  */
1416                 if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
1417                         continue;
1418 
1419                 /* We will move this load if its offset is smaller than ref's
1420                  * (in which case we will move it before ref) or if the offset
1421                  * is larger than ref's but there are sequence breakers
1422                  * in between (in which case we will move it after ref and
1423                  * before the sequence breakers).
1424                  */
1425                 if (!seq_break && offset >= ref_offset)
1426                         continue;
1427 
1428                 /* Find where exactly we want to move this load:
1429                  *
1430                  * If we are moving it before ref, we want to check any other
1431                  * UBO loads we placed before ref and make sure we insert this
1432                  * one properly ordered with them. Likewise, if we are moving
1433                  * it after ref.
1434                  */
1435                 nir_instr *pos = ref_inst;
1436                 nir_instr *tmp = pos;
1437                 do {
1438                         if (offset < ref_offset)
1439                                 tmp = nir_instr_prev(tmp);
1440                         else
1441                                 tmp = nir_instr_next(tmp);
1442 
1443                         if (!tmp || tmp == inst)
1444                                 break;
1445 
1446                         /* Ignore non-unifa UBO loads */
1447                         if (tmp->type != nir_instr_type_intrinsic)
1448                                 continue;
1449 
1450                         nir_intrinsic_instr *tmp_intr =
1451                                 nir_instr_as_intrinsic(tmp);
1452                         if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
1453                                 continue;
1454 
1455                         if (nir_src_is_divergent(tmp_intr->src[1]))
1456                                 continue;
1457 
1458                         /* Stop if we find a unifa UBO load that breaks the
1459                          * sequence.
1460                          */
1461                         if (!nir_src_is_const(tmp_intr->src[1]))
1462                                 break;
1463 
1464                         if (nir_src_as_uint(tmp_intr->src[0]) != index)
1465                                 break;
1466 
1467                         uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
1468                         if (offset < ref_offset) {
1469                                 if (tmp_offset < offset ||
1470                                     tmp_offset >= ref_offset) {
1471                                         break;
1472                                 } else {
1473                                         pos = tmp;
1474                                 }
1475                         } else {
1476                                 if (tmp_offset > offset ||
1477                                     tmp_offset <= ref_offset) {
1478                                         break;
1479                                 } else {
1480                                         pos = tmp;
1481                                 }
1482                         }
1483                 } while (true);
1484 
1485                 /* We can't move the UBO load before the instruction that
1486                  * defines its constant offset. If that instruction is placed
1487                  * in between the new location (pos) and the current location
1488                  * of this load, we will have to move that instruction too.
1489                  *
1490                  * We don't care about the UBO index definition because that
1491                  * is optimized to be reused by all UBO loads for the same
1492                  * index and therefore is certain to be defined before the
1493                  * first UBO load that uses it.
1494                  */
1495                 nir_instr *offset_inst = NULL;
1496                 tmp = inst;
1497                 while ((tmp = nir_instr_prev(tmp)) != NULL) {
1498                         if (pos == tmp) {
1499                                 /* We reached the target location without
1500                                  * finding the instruction that defines the
1501                                  * offset, so that instruction must be before
1502                                  * the new position and we don't have to fix it.
1503                                  */
1504                                 break;
1505                         }
1506                         if (intr->src[1].ssa->parent_instr == tmp) {
1507                                 offset_inst = tmp;
1508                                 break;
1509                         }
1510                 }
1511 
1512                 if (offset_inst) {
1513                         exec_node_remove(&offset_inst->node);
1514                         exec_node_insert_node_before(&pos->node,
1515                                                      &offset_inst->node);
1516                 }
1517 
1518                 /* Since we are moving the instruction before its current
1519                  * location, grab its successor before the move so that
1520                  * we can continue the next iteration of the main loop from
1521                  * that instruction.
1522                  */
1523                 next_inst = nir_instr_next(inst);
1524 
1525                 /* Move this load to the selected location */
1526                 exec_node_remove(&inst->node);
1527                 if (offset < ref_offset)
1528                         exec_node_insert_node_before(&pos->node, &inst->node);
1529                 else
1530                         exec_node_insert_after(&pos->node, &inst->node);
1531 
1532                 progress = true;
1533         }
1534 
1535         return progress;
1536 }
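
/* A minimal sketch of the effect of this pass (hypothetical offsets, all
 * loads using constant UBO index 0):
 *
 *    before:  load_ubo(0, 16)   load_ubo(0, 0)   load_ubo(0, 8)
 *    after:   load_ubo(0, 0)    load_ubo(0, 8)   load_ubo(0, 16)
 *
 * With the loads ordered by increasing offset, ntq_emit_load_ubo_unifa can
 * keep reading from the current unifa stream instead of writing unifa again
 * for every load, as long as consecutive offsets stay within
 * MAX_UNIFA_SKIP_DISTANCE.
 */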
1537 
1538 static bool
1539 v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
1540                                       nir_block *block)
1541 {
1542         bool progress = false;
1543         bool local_progress;
1544         do {
1545                 local_progress = false;
1546                 nir_foreach_instr_safe(inst, block) {
1547                         nir_intrinsic_instr *intr =
1548                                 nir_instr_as_constant_ubo_load(inst);
1549                         if (intr) {
1550                                 local_progress |=
1551                                         v3d_nir_sort_constant_ubo_load(block, intr);
1552                         }
1553                 }
1554                 progress |= local_progress;
1555         } while (local_progress);
1556 
1557         return progress;
1558 }
1559 
1560 /**
1561  * Sorts constant UBO loads in each block by offset to maximize chances of
1562  * skipping unifa writes when converting to VIR. This can increase register
1563  * pressure.
1564  */
1565 static bool
1566 v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
1567 {
1568         nir_foreach_function_impl(impl, s) {
1569                 nir_foreach_block(block, impl) {
1570                         c->sorted_any_ubo_loads |=
1571                                 v3d_nir_sort_constant_ubo_loads_block(c, block);
1572                 }
1573                 nir_metadata_preserve(impl,
1574                                       nir_metadata_control_flow);
1575         }
1576         return c->sorted_any_ubo_loads;
1577 }
1578 
1579 static void
1580 lower_load_num_subgroups(struct v3d_compile *c,
1581                          nir_builder *b,
1582                          nir_intrinsic_instr *intr)
1583 {
1584         assert(c->s->info.stage == MESA_SHADER_COMPUTE);
1585         assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
1586 
1587         b->cursor = nir_after_instr(&intr->instr);
1588         uint32_t num_subgroups =
1589                 DIV_ROUND_UP(c->s->info.workgroup_size[0] *
1590                              c->s->info.workgroup_size[1] *
1591                              c->s->info.workgroup_size[2], V3D_CHANNELS);
1592         nir_def *result = nir_imm_int(b, num_subgroups);
1593         nir_def_replace(&intr->def, result);
1594 }
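
/* Worked example for the lowering above (hypothetical workgroup size): an
 * 8x8x1 compute workgroup on 16-wide QPUs (V3D_CHANNELS == 16) gives
 *
 *    num_subgroups = DIV_ROUND_UP(8 * 8 * 1, 16) == 4
 *
 * so the load_num_subgroups intrinsic is simply replaced by
 * nir_imm_int(b, 4).
 */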
1595 
1596 static bool
1597 lower_subgroup_intrinsics(struct v3d_compile *c,
1598                           nir_block *block, nir_builder *b)
1599 {
1600         bool progress = false;
1601         nir_foreach_instr_safe(inst, block) {
1602                 if (inst->type != nir_instr_type_intrinsic)
1603                         continue;
1604 
1605                 nir_intrinsic_instr *intr =
1606                         nir_instr_as_intrinsic(inst);
1607                 if (!intr)
1608                         continue;
1609 
1610                 switch (intr->intrinsic) {
1611                 case nir_intrinsic_load_num_subgroups:
1612                         lower_load_num_subgroups(c, b, intr);
1613                         progress = true;
1614                         FALLTHROUGH;
1615                 case nir_intrinsic_load_subgroup_id:
1616                 case nir_intrinsic_load_subgroup_size:
1617                 case nir_intrinsic_load_subgroup_invocation:
1618                 case nir_intrinsic_elect:
1619                 case nir_intrinsic_ballot:
1620                 case nir_intrinsic_inverse_ballot:
1621                 case nir_intrinsic_ballot_bitfield_extract:
1622                 case nir_intrinsic_ballot_bit_count_reduce:
1623                 case nir_intrinsic_ballot_find_lsb:
1624                 case nir_intrinsic_ballot_find_msb:
1625                 case nir_intrinsic_ballot_bit_count_exclusive:
1626                 case nir_intrinsic_ballot_bit_count_inclusive:
1627                 case nir_intrinsic_reduce:
1628                 case nir_intrinsic_inclusive_scan:
1629                 case nir_intrinsic_exclusive_scan:
1630                 case nir_intrinsic_read_invocation:
1631                 case nir_intrinsic_read_first_invocation:
1632                 case nir_intrinsic_load_subgroup_eq_mask:
1633                 case nir_intrinsic_load_subgroup_ge_mask:
1634                 case nir_intrinsic_load_subgroup_gt_mask:
1635                 case nir_intrinsic_load_subgroup_le_mask:
1636                 case nir_intrinsic_load_subgroup_lt_mask:
1637                 case nir_intrinsic_shuffle:
1638                 case nir_intrinsic_shuffle_xor:
1639                 case nir_intrinsic_shuffle_up:
1640                 case nir_intrinsic_shuffle_down:
1641                 case nir_intrinsic_vote_all:
1642                 case nir_intrinsic_vote_any:
1643                 case nir_intrinsic_vote_feq:
1644                 case nir_intrinsic_vote_ieq:
1645                 case nir_intrinsic_quad_broadcast:
1646                 case nir_intrinsic_quad_swap_horizontal:
1647                 case nir_intrinsic_quad_swap_vertical:
1648                 case nir_intrinsic_quad_swap_diagonal:
1649                         c->has_subgroups = true;
1650                         break;
1651                 default:
1652                         break;
1653                 }
1654         }
1655 
1656         return progress;
1657 }
1658 
1659 static bool
1660 v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
1661 {
1662         bool progress = false;
1663         nir_foreach_function_impl(impl, s) {
1664                 nir_builder b = nir_builder_create(impl);
1665 
1666                 nir_foreach_block(block, impl)
1667                         progress |= lower_subgroup_intrinsics(c, block, &b);
1668 
1669                 nir_metadata_preserve(impl,
1670                                       nir_metadata_control_flow);
1671         }
1672         return progress;
1673 }
1674 
1675 static void
1676 v3d_attempt_compile(struct v3d_compile *c)
1677 {
1678         switch (c->s->info.stage) {
1679         case MESA_SHADER_VERTEX:
1680                 c->vs_key = (struct v3d_vs_key *) c->key;
1681                 break;
1682         case MESA_SHADER_GEOMETRY:
1683                 c->gs_key = (struct v3d_gs_key *) c->key;
1684                 break;
1685         case MESA_SHADER_FRAGMENT:
1686                 c->fs_key = (struct v3d_fs_key *) c->key;
1687                 break;
1688         case MESA_SHADER_COMPUTE:
1689                 break;
1690         default:
1691                 unreachable("unsupported shader stage");
1692         }
1693 
1694         switch (c->s->info.stage) {
1695         case MESA_SHADER_VERTEX:
1696                 v3d_nir_lower_vs_early(c);
1697                 break;
1698         case MESA_SHADER_GEOMETRY:
1699                 v3d_nir_lower_gs_early(c);
1700                 break;
1701         case MESA_SHADER_FRAGMENT:
1702                 v3d_nir_lower_fs_early(c);
1703                 break;
1704         default:
1705                 break;
1706         }
1707 
1708         v3d_lower_nir(c);
1709 
1710         switch (c->s->info.stage) {
1711         case MESA_SHADER_VERTEX:
1712                 v3d_nir_lower_vs_late(c);
1713                 break;
1714         case MESA_SHADER_GEOMETRY:
1715                 v3d_nir_lower_gs_late(c);
1716                 break;
1717         case MESA_SHADER_FRAGMENT:
1718                 v3d_nir_lower_fs_late(c);
1719                 break;
1720         default:
1721                 break;
1722         }
1723 
1724         NIR_PASS(_, c->s, v3d_nir_lower_io, c);
1725         NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
1726         NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
1727 
1728         NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
1729         nir_lower_idiv_options idiv_options = {
1730                 .allow_fp16 = true,
1731         };
1732         NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options);
1733         NIR_PASS(_, c->s, nir_lower_alu);
1734 
1735         if (c->key->robust_uniform_access || c->key->robust_storage_access ||
1736             c->key->robust_image_access) {
1737                 /* nir_lower_robust_access assumes constant buffer
1738                  * indices on ubo/ssbo intrinsics, so run copy propagation and
1739                  * constant folding before the lowering to ensure this holds.
1740                  * We also want to run the lowering before v3d_optimize to
1741                  * clean up redundant get_buffer_size calls produced by the pass.
1742                  */
1743                 NIR_PASS(_, c->s, nir_copy_prop);
1744                 NIR_PASS(_, c->s, nir_opt_constant_folding);
1745 
1746                 nir_lower_robust_access_options opts = {
1747                    .lower_image = c->key->robust_image_access,
1748                    .lower_ssbo = c->key->robust_storage_access,
1749                    .lower_ubo = c->key->robust_uniform_access,
1750                 };
1751 
1752                 NIR_PASS(_, c->s, nir_lower_robust_access, &opts);
1753         }
1754 
1755         NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
1756                  nir_var_function_temp,
1757                  0,
1758                  glsl_get_natural_size_align_bytes,
1759                  glsl_get_natural_size_align_bytes);
1760 
1761         NIR_PASS(_, c->s, v3d_nir_lower_global_2x32);
1762         NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
1763         NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
1764         NIR_PASS(_, c->s, v3d_nir_lower_scratch);
1765 
1766         /* needs to run after load_store_bitsize */
1767         NIR_PASS(_, c->s, nir_lower_pack);
1768 
1769         NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
1770 
1771         const nir_lower_subgroups_options subgroup_opts = {
1772                 .subgroup_size = V3D_CHANNELS,
1773                 .ballot_components = 1,
1774                 .ballot_bit_size = 32,
1775                 .lower_to_scalar = true,
1776                 .lower_inverse_ballot = true,
1777                 .lower_subgroup_masks = true,
1778                 .lower_relative_shuffle = true,
1779                 .lower_quad = true,
1780         };
1781         NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
1782 
1783         v3d_optimize_nir(c, c->s);
1784 
1785         /* Do late algebraic optimization to turn add(a, neg(b)) back into
1786          * subs, then the mandatory cleanup after algebraic.  Note that it may
1787          * produce fnegs, and if so then we need to keep running to squash
1788          * fneg(fneg(a)).
1789          */
1790         bool more_late_algebraic = true;
1791         while (more_late_algebraic) {
1792                 more_late_algebraic = false;
1793                 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
1794                 NIR_PASS(_, c->s, nir_opt_constant_folding);
1795                 NIR_PASS(_, c->s, nir_copy_prop);
1796                 NIR_PASS(_, c->s, nir_opt_dce);
1797                 NIR_PASS(_, c->s, nir_opt_cse);
1798         }
1799 
1800         NIR_PASS(_, c->s, nir_lower_bool_to_int32);
1801         NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true);
1802         NIR_PASS_V(c->s, nir_divergence_analysis);
1803         NIR_PASS(_, c->s, nir_convert_from_ssa, true);
1804 
1805         struct nir_schedule_options schedule_options = {
1806                 /* Schedule for about half our register space, to enable more
1807                  * shaders to hit 4 threads.
1808                  */
1809                 .threshold = c->threads == 4 ? 24 : 48,
1810 
1811                 /* Vertex shaders share the same memory for inputs and outputs,
1812                  * fragment and geometry shaders do not.
1813                  */
1814                 .stages_with_shared_io_memory =
1815                 (((1 << MESA_ALL_SHADER_STAGES) - 1) &
1816                  ~((1 << MESA_SHADER_FRAGMENT) |
1817                    (1 << MESA_SHADER_GEOMETRY))),
1818 
1819                 .fallback = c->fallback_scheduler,
1820 
1821                 .intrinsic_cb = v3d_intrinsic_dependency_cb,
1822                 .intrinsic_cb_data = c,
1823 
1824                 .instr_delay_cb = v3d_instr_delay_cb,
1825                 .instr_delay_cb_data = c,
1826         };
1827         NIR_PASS_V(c->s, nir_schedule, &schedule_options);
1828 
1829         if (!c->disable_constant_ubo_load_sorting)
1830                 NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);
1831 
1832         const nir_move_options buffer_opts = c->move_buffer_loads ?
1833                 (nir_move_load_ubo | nir_move_load_ssbo) : 0;
1834         NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
1835                                         nir_move_const_undef |
1836                                         buffer_opts);
1837 
1838         NIR_PASS_V(c->s, nir_trivialize_registers);
1839 
1840         v3d_nir_to_vir(c);
1841 }
1842 
1843 uint32_t
1844 v3d_prog_data_size(gl_shader_stage stage)
1845 {
1846         static const int prog_data_size[] = {
1847                 [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
1848                 [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
1849                 [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
1850                 [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
1851         };
1852 
1853         assert(stage >= 0 &&
1854                stage < ARRAY_SIZE(prog_data_size) &&
1855                prog_data_size[stage]);
1856 
1857         return prog_data_size[stage];
1858 }
1859 
1860 int v3d_shaderdb_dump(struct v3d_compile *c,
1861 		      char **shaderdb_str)
1862 {
1863         if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
1864                 return -1;
1865 
1866         return asprintf(shaderdb_str,
1867                         "%s shader: %d inst, %d threads, %d loops, "
1868                         "%d uniforms, %d max-temps, %d:%d spills:fills, "
1869                         "%d sfu-stalls, %d inst-and-stalls, %d nops",
1870                         vir_get_stage_name(c),
1871                         c->qpu_inst_count,
1872                         c->threads,
1873                         c->loops,
1874                         c->num_uniforms,
1875                         vir_get_max_temps(c),
1876                         c->spills,
1877                         c->fills,
1878                         c->qpu_inst_stalled_count,
1879                         c->qpu_inst_count + c->qpu_inst_stalled_count,
1880                         c->nop_count);
1881 }
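
/* Example of the string produced above (hypothetical counts):
 *
 *    MESA_SHADER_FRAGMENT shader: 124 inst, 4 threads, 0 loops, 12 uniforms,
 *    28 max-temps, 0:0 spills:fills, 3 sfu-stalls, 127 inst-and-stalls, 5 nops
 *
 * Note that inst-and-stalls is qpu_inst_count + qpu_inst_stalled_count, so
 * it is never smaller than the instruction count.
 */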
1882 
1883 /* This is a list of incremental changes to the compilation strategy
1884  * that will be used to try to compile the shader successfully. The
1885  * default strategy enables all optimizations; it has the highest
1886  * register pressure but is expected to produce the best code. The
1887  * following strategies incrementally disable specific optimizations
1888  * that are known to contribute to register pressure, in order to
1889  * compile the shader successfully while meeting thread count
1890  * requirements.
1891  *
1892  * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
1893  * cover previous hardware (meaning that we are not limiting register
1894  * allocation to any particular thread count). This is fine
1895  * because v3d_nir_to_vir will cap this to the actual minimum.
1896  */
1897 static const struct v3d_compiler_strategy strategies[] = {
1898         /*0*/  { "default",                        4, 4, false, false, false, false, false, false,  0 },
1899         /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false, false, false,  0 },
1900         /*2*/  { "disable gcm",                    4, 4, true,  true,  false, false, false, false,  0 },
1901         /*3*/  { "disable loop unrolling",         4, 4, true,  true,  true,  false, false, false,  0 },
1902         /*4*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  true,  false, false,  0 },
1903         /*5*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,  false, true,   0 },
1904         /*6*/  { "lower thread count",             2, 1, false, false, false, false, false, false, -1 },
1905         /*7*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, false, false, -1 },
1906         /*8*/  { "disable gcm (2t)",               2, 1, true,  true,  false, false, false, false, -1 },
1907         /*9*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  true,  false, false, false, -1 },
1908         /*10*/ { "Move buffer loads (2t)",         2, 1, true,  true,  true,  true,  true,  false, -1 },
1909         /*11*/ { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  true,  true,  -1 },
1910         /*12*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  true,  true,  -1 }
1911 };
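
/* Reading the table above: after the strategy name come the maximum and
 * minimum thread counts, then the optimization toggles in the order in
 * which the fallback strategies disable them (general TMU scheduling, GCM,
 * loop unrolling, UBO load sorting, buffer load moving, TMU pipelining),
 * and finally the TMU spill cap consulted by skip_compile_strategy below.
 */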
1912 
1913 /**
1914  * If a particular optimization didn't make any progress during a compile
1915  * attempt, disabling it alone won't allow us to compile the shader
1916  * successfully, since we'll end up with the same code. Detect these
1917  * scenarios so we can avoid wasting time with useless compiles. We should
1918  * also consider whether the strategy changes other aspects of the
1919  * compilation process, like spilling, and not skip it in that case.
1920  */
1921 static bool
1922 skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
1923 {
1924    /* We decide if we can skip a strategy based on the optimizations that
1925     * were active in the previous strategy, so we should only be calling this
1926     * for strategies after the first.
1927     */
1928    assert(idx > 0);
1929 
1930    /* Don't skip a strategy that changes spilling behavior */
1931    if (strategies[idx].max_tmu_spills !=
1932        strategies[idx - 1].max_tmu_spills) {
1933            return false;
1934    }
1935 
1936    switch (idx) {
1937    /* General TMU sched.: skip if we didn't emit any TMU loads */
1938    case 1:
1939    case 7:
1940            return !c->has_general_tmu_load;
1941    /* Global code motion: skip if nir_opt_gcm didn't make any progress */
1942    case 2:
1943    case 8:
1944            return !c->gcm_progress;
1945    /* Loop unrolling: skip if we didn't unroll any loops */
1946    case 3:
1947    case 9:
1948            return !c->unrolled_any_loops;
1949    /* UBO load sorting: skip if we didn't sort any loads */
1950    case 4:
1951            return !c->sorted_any_ubo_loads;
1952    /* Move buffer loads: we assume any shader with difficult RA
1953     * most likely has UBO / SSBO loads, so we never try to skip it.
1954     * For now, we only try this for 2-thread compiles since it
1955     * is expected to impact instruction counts and latency.
1956     */
1957    case 10:
1958           assert(c->threads < 4);
1959           return false;
1960    /* TMU pipelining: skip if we didn't pipeline any TMU ops */
1961    case 5:
1962    case 11:
1963            return !c->pipelined_any_tmu;
1964    /* Lower thread count: skip if we already tried fewer than 4 threads */
1965    case 6:
1966           return c->threads < 4;
1967    default:
1968            return false;
1969    };
1970 }
1971 
1972 static inline void
1973 set_best_compile(struct v3d_compile **best, struct v3d_compile *c)
1974 {
1975    if (*best)
1976       vir_compile_destroy(*best);
1977    *best = c;
1978 }
1979 
1980 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
1981                       struct v3d_key *key,
1982                       struct v3d_prog_data **out_prog_data,
1983                       nir_shader *s,
1984                       void (*debug_output)(const char *msg,
1985                                            void *debug_output_data),
1986                       void *debug_output_data,
1987                       int program_id, int variant_id,
1988                       uint32_t *final_assembly_size)
1989 {
1990         struct v3d_compile *c = NULL;
1991 
1992         uint32_t best_spill_fill_count = UINT32_MAX;
1993         struct v3d_compile *best_c = NULL;
1994         for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
1995                 /* Fallback strategy */
1996                 if (strat > 0) {
1997                         assert(c);
1998                         if (skip_compile_strategy(c, strat))
1999                                 continue;
2000 
2001                         char *debug_msg;
2002                         int ret = asprintf(&debug_msg,
2003                                            "Falling back to strategy '%s' "
2004                                            "for %s prog %d/%d",
2005                                            strategies[strat].name,
2006                                            vir_get_stage_name(c),
2007                                            c->program_id, c->variant_id);
2008 
2009                         if (ret >= 0) {
2010                                 if (V3D_DBG(PERF))
2011                                         fprintf(stderr, "%s\n", debug_msg);
2012 
2013                                 c->debug_output(debug_msg, c->debug_output_data);
2014                                 free(debug_msg);
2015                         }
2016 
2017                         if (c != best_c)
2018                                 vir_compile_destroy(c);
2019                 }
2020 
2021                 c = vir_compile_init(compiler, key, s,
2022                                      debug_output, debug_output_data,
2023                                      program_id, variant_id,
2024                                      strat, &strategies[strat],
2025                                      strat == ARRAY_SIZE(strategies) - 1);
2026 
2027                 v3d_attempt_compile(c);
2028 
2029                 /* Broken shader or driver bug */
2030                 if (c->compilation_result == V3D_COMPILATION_FAILED)
2031                         break;
2032 
2033                 /* If we compiled without spills, choose this.
2034                  * Otherwise if this is a 4-thread compile, choose this (these
2035                  * have a very low cap on the allowed TMU spills so we assume
2036                  * it will be better than a 2-thread compile without spills).
2037                  * Otherwise, keep going while tracking the strategy with the
2038                  * lowest spill count.
2039                  */
2040                 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
2041                         if (c->spills == 0 ||
2042                             strategies[strat].min_threads == 4 ||
2043                             V3D_DBG(OPT_COMPILE_TIME)) {
2044                                 set_best_compile(&best_c, c);
2045                                 break;
2046                         } else if (c->spills + c->fills <
2047                                    best_spill_fill_count) {
2048                                 set_best_compile(&best_c, c);
2049                                 best_spill_fill_count = c->spills + c->fills;
2050                         }
2051 
2052                         if (V3D_DBG(PERF)) {
2053                                 char *debug_msg;
2054                                 int ret = asprintf(&debug_msg,
2055                                                    "Compiled %s prog %d/%d with %d "
2056                                                    "spills and %d fills. Will try "
2057                                                    "more strategies.",
2058                                                    vir_get_stage_name(c),
2059                                                    c->program_id, c->variant_id,
2060                                                    c->spills, c->fills);
2061                                 if (ret >= 0) {
2062                                         fprintf(stderr, "%s\n", debug_msg);
2063                                         c->debug_output(debug_msg, c->debug_output_data);
2064                                         free(debug_msg);
2065                                 }
2066                         }
2067                 }
2068 
2069                 /* Only try the next strategy if we failed to register allocate
2070                  * or we had to spill.
2071                  */
2072                 assert(c->compilation_result ==
2073                        V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
2074                        c->spills > 0);
2075         }
2076 
2077         /* If the best strategy was not the last, choose that */
2078         if (best_c && c != best_c)
2079                 set_best_compile(&c, best_c);
2080 
2081         if (V3D_DBG(PERF) &&
2082             c->compilation_result !=
2083             V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
2084             c->spills > 0) {
2085                 char *debug_msg;
2086                 int ret = asprintf(&debug_msg,
2087                                    "Compiled %s prog %d/%d with %d "
2088                                    "spills and %d fills",
2089                                    vir_get_stage_name(c),
2090                                    c->program_id, c->variant_id,
2091                                    c->spills, c->fills);
2092                 fprintf(stderr, "%s\n", debug_msg);
2093 
2094                 if (ret >= 0) {
2095                         c->debug_output(debug_msg, c->debug_output_data);
2096                         free(debug_msg);
2097                 }
2098         }
2099 
2100         if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
2101                 fprintf(stderr, "Failed to compile %s prog %d/%d "
2102                         "with any strategy.\n",
2103                         vir_get_stage_name(c), c->program_id, c->variant_id);
2104 
2105                 vir_compile_destroy(c);
2106                 return NULL;
2107         }
2108 
2109         struct v3d_prog_data *prog_data;
2110 
2111         prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
2112 
2113         v3d_set_prog_data(c, prog_data);
2114 
2115         *out_prog_data = prog_data;
2116 
2117         char *shaderdb;
2118         int ret = v3d_shaderdb_dump(c, &shaderdb);
2119         if (ret >= 0) {
2120                 if (V3D_DBG(SHADERDB))
2121                         fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);
2122 
2123                 c->debug_output(shaderdb, c->debug_output_data);
2124                 free(shaderdb);
2125         }
2126 
2127         return v3d_return_qpu_insts(c, final_assembly_size);
2128 }
2129 
2130 void
2131 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
2132 {
2133         if (qinst->dst.file == QFILE_TEMP)
2134                 c->defs[qinst->dst.index] = NULL;
2135 
2136         assert(&qinst->link != c->cursor.link);
2137 
2138         list_del(&qinst->link);
2139         free(qinst);
2140 
2141         c->live_intervals_valid = false;
2142 }
2143 
2144 struct qreg
2145 vir_follow_movs(struct v3d_compile *c, struct qreg reg)
2146 {
2147         /* XXX
2148         int pack = reg.pack;
2149 
2150         while (reg.file == QFILE_TEMP &&
2151                c->defs[reg.index] &&
2152                (c->defs[reg.index]->op == QOP_MOV ||
2153                 c->defs[reg.index]->op == QOP_FMOV) &&
2154                !c->defs[reg.index]->dst.pack &&
2155                !c->defs[reg.index]->src[0].pack) {
2156                 reg = c->defs[reg.index]->src[0];
2157         }
2158 
2159         reg.pack = pack;
2160         */
2161         return reg;
2162 }
2163 
2164 void
2165 vir_compile_destroy(struct v3d_compile *c)
2166 {
2167         /* Defuse the assert that we aren't removing the cursor's instruction.
2168          */
2169         c->cursor.link = NULL;
2170 
2171         vir_for_each_block(block, c) {
2172                 while (!list_is_empty(&block->instructions)) {
2173                         struct qinst *qinst =
2174                                 list_first_entry(&block->instructions,
2175                                                  struct qinst, link);
2176                         vir_remove_instruction(c, qinst);
2177                 }
2178         }
2179 
2180         ralloc_free(c);
2181 }
2182 
2183 uint32_t
2184 vir_get_uniform_index(struct v3d_compile *c,
2185                       enum quniform_contents contents,
2186                       uint32_t data)
2187 {
2188         for (int i = 0; i < c->num_uniforms; i++) {
2189                 if (c->uniform_contents[i] == contents &&
2190                     c->uniform_data[i] == data) {
2191                         return i;
2192                 }
2193         }
2194 
2195         uint32_t uniform = c->num_uniforms++;
2196 
2197         if (uniform >= c->uniform_array_size) {
2198                 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
2199                                              c->uniform_array_size * 2);
2200 
2201                 c->uniform_data = reralloc(c, c->uniform_data,
2202                                            uint32_t,
2203                                            c->uniform_array_size);
2204                 c->uniform_contents = reralloc(c, c->uniform_contents,
2205                                                enum quniform_contents,
2206                                                c->uniform_array_size);
2207         }
2208 
2209         c->uniform_contents[uniform] = contents;
2210         c->uniform_data[uniform] = data;
2211 
2212         return uniform;
2213 }
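
/* Sketch of the growth policy above, assuming uniform_array_size starts at
 * 0: the first uniform grows the arrays to MAX2(MAX2(16, 1), 0) == 16
 * entries, and once those are in use the 17th uniform grows them to
 * MAX2(MAX2(16, 17), 32) == 32, i.e. the arrays at least double on every
 * reallocation after the first.
 */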
2214 
2215 /* Looks back into the current block to find the ldunif that wrote the uniform
2216  * at the requested index. If it finds it, it returns true and writes the
2217  * destination register of the ldunif instruction to 'unif'.
2218  *
2219  * This can impact register pressure and end up leading to worse code, so we
2220  * limit the number of instructions we are willing to look back through to
2221  * strike a good balance.
2222  */
2223 static bool
2224 try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
2225 {
2226         uint32_t count = 20;
2227         struct qinst *prev_inst = NULL;
2228         assert(c->cur_block);
2229 
2230 #if MESA_DEBUG
2231         /* We can only reuse a uniform if it was emitted in the same block,
2232          * so callers must make sure the current instruction is being emitted
2233          * in the current block.
2234          */
2235         bool found = false;
2236         vir_for_each_inst(inst, c->cur_block) {
2237                 if (&inst->link == c->cursor.link) {
2238                         found = true;
2239                         break;
2240                 }
2241         }
2242 
2243         assert(found || &c->cur_block->instructions == c->cursor.link);
2244 #endif
2245 
2246         list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
2247                                      &c->cur_block->instructions, link) {
2248                 if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
2249                     inst->uniform == index) {
2250                         prev_inst = inst;
2251                         break;
2252                 }
2253 
2254                 if (--count == 0)
2255                         break;
2256         }
2257 
2258         if (!prev_inst)
2259                 return false;
2260 
2261         /* Only reuse the ldunif result if it was written to a temp register,
2262          * otherwise there may be special restrictions (for example, ldunif
2263          * may write directly to unifa, which is a write-only register).
2264          */
2265         if (prev_inst->dst.file != QFILE_TEMP)
2266                 return false;
2267 
2268         list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
2269                                  &c->cur_block->instructions, link) {
2270                 if (inst->dst.file == prev_inst->dst.file &&
2271                     inst->dst.index == prev_inst->dst.index) {
2272                         return false;
2273                 }
2274         }
2275 
2276         *unif = prev_inst->dst;
2277         return true;
2278 }
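
/* Sketch of the reuse this enables (hypothetical temp register): if the
 * current block recently emitted an ldunif for uniform index N into temp
 * t5, and t5 has not been overwritten since, vir_uniform below returns t5
 * directly instead of emitting another ldunif for index N.
 */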
2279 
2280 struct qreg
2281 vir_uniform(struct v3d_compile *c,
2282             enum quniform_contents contents,
2283             uint32_t data)
2284 {
2285         const int num_uniforms = c->num_uniforms;
2286         const int index = vir_get_uniform_index(c, contents, data);
2287 
2288         /* If this is not the first time we see this uniform try to reuse the
2289          * result of the last ldunif that loaded it.
2290          */
2291         const bool is_new_uniform = num_uniforms != c->num_uniforms;
2292         if (!is_new_uniform && !c->disable_ldunif_opt) {
2293                 struct qreg ldunif_dst;
2294                 if (try_opt_ldunif(c, index, &ldunif_dst))
2295                         return ldunif_dst;
2296         }
2297 
2298         struct qinst *inst = vir_NOP(c);
2299         inst->qpu.sig.ldunif = true;
2300         inst->uniform = index;
2301         inst->dst = vir_get_temp(c);
2302         c->defs[inst->dst.index] = inst;
2303         return inst->dst;
2304 }
2305 
2306 #define OPTPASS(func)                                                   \
2307         do {                                                            \
2308                 bool stage_progress = func(c);                          \
2309                 if (stage_progress) {                                   \
2310                         progress = true;                                \
2311                         if (print_opt_debug) {                          \
2312                                 fprintf(stderr,                         \
2313                                         "VIR opt pass %2d: %s progress\n", \
2314                                         pass, #func);                   \
2315                         }                                               \
2316                         /*XXX vir_validate(c);*/                        \
2317                 }                                                       \
2318         } while (0)
2319 
2320 void
2321 vir_optimize(struct v3d_compile *c)
2322 {
2323         bool print_opt_debug = false;
2324         int pass = 1;
2325 
2326         while (true) {
2327                 bool progress = false;
2328 
2329                 OPTPASS(vir_opt_copy_propagate);
2330                 OPTPASS(vir_opt_redundant_flags);
2331                 OPTPASS(vir_opt_dead_code);
2332                 OPTPASS(vir_opt_small_immediates);
2333                 OPTPASS(vir_opt_constant_alu);
2334 
2335                 if (!progress)
2336                         break;
2337 
2338                 pass++;
2339         }
2340 }
2341 
2342 const char *
2343 vir_get_stage_name(struct v3d_compile *c)
2344 {
2345         if (c->vs_key && c->vs_key->is_coord)
2346                 return "MESA_SHADER_VERTEX_BIN";
2347         else if (c->gs_key && c->gs_key->is_coord)
2348                 return "MESA_SHADER_GEOMETRY_BIN";
2349         else
2350                 return gl_shader_stage_name(c->s->info.stage);
2351 }
2352 
2353 static inline uint32_t
2354 compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
2355 {
2356    assert(devinfo->vpm_size > 0);
2357    const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
2358    return devinfo->vpm_size / sector_size;
2359 }
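
/* Worked example (hypothetical VPM size): with V3D_CHANNELS == 16, a sector
 * is 16 * sizeof(uint32_t) * 8 == 512 bytes, so a device reporting a 16 KB
 * VPM has 16384 / 512 == 32 sectors for the configuration code below.
 */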
2360 
2361 /* Computes various parameters affecting VPM memory configuration for programs
2362  * involving geometry shaders to ensure the program fits in memory and honors
2363  * requirements described in section "VPM usage" of the programming manual.
2364  */
2365 static bool
2366 compute_vpm_config_gs(struct v3d_device_info *devinfo,
2367                       struct v3d_vs_prog_data *vs,
2368                       struct v3d_gs_prog_data *gs,
2369                       struct vpm_config *vpm_cfg_out)
2370 {
2371    const uint32_t A = vs->separate_segments ? 1 : 0;
2372    const uint32_t Ad = vs->vpm_input_size;
2373    const uint32_t Vd = vs->vpm_output_size;
2374 
2375    const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
2376 
2377    /* Try to fit program into our VPM memory budget by adjusting
2378     * configurable parameters iteratively. We do this in two phases:
2379     * the first phase tries to fit the program into the total available
2380     * VPM memory. If we succeed at that, then the second phase attempts
2381     * to fit the program into half of that budget so we can run bin and
2382     * render programs in parallel.
2383     */
2384    struct vpm_config vpm_cfg[2];
2385    struct vpm_config *final_vpm_cfg = NULL;
2386    uint32_t phase = 0;
2387 
2388    vpm_cfg[phase].As = 1;
2389    vpm_cfg[phase].Gs = 1;
2390    vpm_cfg[phase].Gd = gs->vpm_output_size;
2391    vpm_cfg[phase].gs_width = gs->simd_width;
2392 
2393    /* While there is a requirement that Vc >= [Vn / 16], this is
2394     * always the case when tessellation is not present because in that
2395     * case Vn can only be 6 at most (when input primitive is triangles
2396     * with adjacency).
2397     *
2398     * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
2399     * and Broadcom has not found it worth it to increase it beyond this
2400     * in general. Increasing Vc also increases VPM memory pressure which
2401     * can end up being detrimental to performance in some scenarios.
2402     */
2403    vpm_cfg[phase].Vc = 2;
2404 
2405    /* Gv is a constraint on the hardware to not exceed the
2406     * specified number of vertex segments per GS batch. If adding a
2407     * new primitive to a GS batch would result in a range of more
2408     * than Gv vertex segments being referenced by the batch, then
2409     * the hardware will flush the batch and start a new one. This
2410     * means that we can choose any value we want, we just need to
2411     * be aware that larger values improve GS batch utilization
2412     * at the expense of more VPM memory pressure (which can affect
2413     * other performance aspects, such as GS dispatch width).
2414     * We start with the largest value, and will reduce it if we
2415     * find that total memory pressure is too high.
2416     */
2417    vpm_cfg[phase].Gv = 3;
2418    do {
2419       /* When GS is present in absence of TES, then we need to satisfy
2420        * that Ve >= Gv. We go with the smallest value of Ve to avoid
2421        * increasing memory pressure.
2422        */
2423       vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
2424 
2425       uint32_t vpm_sectors =
2426          A * vpm_cfg[phase].As * Ad +
2427          (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
2428          vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
2429 
2430       /* Ideally we want to use no more than half of the available
2431        * memory so we can execute a bin and render program in parallel
2432        * without stalls. If we achieved that then we are done.
2433        */
2434       if (vpm_sectors <= vpm_size / 2) {
2435          final_vpm_cfg = &vpm_cfg[phase];
2436          break;
2437       }
2438 
2439       /* At the very least, we should not allocate more than the
2440        * total available VPM memory. If we have a configuration that
2441        * succeeds at this we save it and continue to see if we can
2442        * meet the half-memory-use criterion too.
2443        */
2444       if (phase == 0 && vpm_sectors <= vpm_size) {
2445          vpm_cfg[1] = vpm_cfg[0];
2446          phase = 1;
2447       }
2448 
2449       /* Try lowering Gv */
2450       if (vpm_cfg[phase].Gv > 0) {
2451          vpm_cfg[phase].Gv--;
2452          continue;
2453       }
2454 
2455       /* Try lowering GS dispatch width */
2456       if (vpm_cfg[phase].gs_width > 1) {
2457          do {
2458             vpm_cfg[phase].gs_width >>= 1;
2459             vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
2460          } while (vpm_cfg[phase].gs_width == 2);
2461 
2462          /* Reset Gv to max after dropping dispatch width */
2463          vpm_cfg[phase].Gv = 3;
2464          continue;
2465       }
2466 
2467       /* We ran out of options to reduce memory pressure. If we
2468        * are at phase 1 we have at least a valid configuration, so we
2469        * use that.
2470        */
2471       if (phase == 1)
2472          final_vpm_cfg = &vpm_cfg[0];
2473       break;
2474    } while (true);
2475 
2476    if (!final_vpm_cfg)
2477       return false;
2478 
2479    assert(final_vpm_cfg);
2480    assert(final_vpm_cfg->Gd <= 16);
2481    assert(final_vpm_cfg->Gv < 4);
2482    assert(final_vpm_cfg->Ve < 4);
2483    assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
2484    assert(final_vpm_cfg->gs_width == 1 ||
2485           final_vpm_cfg->gs_width == 4 ||
2486           final_vpm_cfg->gs_width == 8 ||
2487           final_vpm_cfg->gs_width == 16);
2488 
2489    *vpm_cfg_out = *final_vpm_cfg;
2490    return true;
2491 }
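
/* Illustrative iteration of the budget check above (hypothetical program
 * sizes): with separate VS segments (A == 1), Ad == 2, Vd == 2, Gd == 4,
 * As == Gs == 1, Vc == 2 and the initial Gv == 3 (hence Ve == 3):
 *
 *    vpm_sectors = 1 * 1 * 2 + (2 + 3) * 2 + 1 * 4 == 16
 *
 * which already fits in half of a 32-sector VPM, so this first
 * configuration would be accepted in phase 0.
 */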
2492 
2493 bool
2494 v3d_compute_vpm_config(struct v3d_device_info *devinfo,
2495                        struct v3d_vs_prog_data *vs_bin,
2496                        struct v3d_vs_prog_data *vs,
2497                        struct v3d_gs_prog_data *gs_bin,
2498                        struct v3d_gs_prog_data *gs,
2499                        struct vpm_config *vpm_cfg_bin,
2500                        struct vpm_config *vpm_cfg)
2501 {
2502    assert(vs && vs_bin);
2503    assert((gs != NULL) == (gs_bin != NULL));
2504 
2505    if (!gs) {
2506       vpm_cfg_bin->As = 1;
2507       vpm_cfg_bin->Ve = 0;
2508       vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;
2509 
2510       vpm_cfg->As = 1;
2511       vpm_cfg->Ve = 0;
2512       vpm_cfg->Vc = vs->vcm_cache_size;
2513    } else {
2514       if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
2515          return false;
2516 
2517       if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
2518          return false;
2519    }
2520 
2521    return true;
2522 }
2523