1 /*
2 * Copyright © 2016 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25 #include "util/format/u_format.h"
26 #include "util/u_helpers.h"
27 #include "util/u_math.h"
28 #include "util/u_memory.h"
29 #include "util/ralloc.h"
30 #include "util/hash_table.h"
31 #include "compiler/nir/nir.h"
32 #include "compiler/nir/nir_builder.h"
33 #include "common/v3d_device_info.h"
34 #include "v3d_compiler.h"
35
36 /* We don't do any address packing. */
37 #define __gen_user_data void
38 #define __gen_address_type uint32_t
39 #define __gen_address_offset(reloc) (*reloc)
40 #define __gen_emit_reloc(cl, reloc)
41 #include "cle/v3d_packet_v42_pack.h"
42
43 #define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7)
44 #define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7)
45 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0)
46 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0)
47 #define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0)
48 #define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0)
49 #define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0)
50 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0)
51 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0)
52 #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)
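/* These values are OR'ed together to build the 32-bit lookup config that
 * ntq_emit_tmu_general() below supplies as a uniform alongside the TMUAU
 * address write, roughly as:
 *
 *    config = 0xffffff00 | tmu_op << 3 | perquad | lookup_type;
 */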
53
54 #define V3D_TSY_SET_QUORUM 0
55 #define V3D_TSY_INC_WAITERS 1
56 #define V3D_TSY_DEC_WAITERS 2
57 #define V3D_TSY_INC_QUORUM 3
58 #define V3D_TSY_DEC_QUORUM 4
59 #define V3D_TSY_FREE_ALL 5
60 #define V3D_TSY_RELEASE 6
61 #define V3D_TSY_ACQUIRE 7
62 #define V3D_TSY_WAIT 8
63 #define V3D_TSY_WAIT_INC 9
64 #define V3D_TSY_WAIT_CHECK 10
65 #define V3D_TSY_WAIT_INC_CHECK 11
66 #define V3D_TSY_WAIT_CV 12
67 #define V3D_TSY_INC_SEMAPHORE 13
68 #define V3D_TSY_DEC_SEMAPHORE 14
69 #define V3D_TSY_SET_QUORUM_FREE_ALL 15
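/* TSY (synchronization unit) operation codes, presumably consumed when
 * emitting barrier-style intrinsics; only the raw opcode values are listed
 * here.
 */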
70
71 enum v3d_tmu_op_type
72 {
73 V3D_TMU_OP_TYPE_REGULAR,
74 V3D_TMU_OP_TYPE_ATOMIC,
75 V3D_TMU_OP_TYPE_CACHE
76 };
77
78 static enum v3d_tmu_op_type
79 v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
80 {
81 switch(tmu_op) {
82 case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
83 case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
84 case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
85 case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
86 case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
87 return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
88 case V3D_TMU_OP_WRITE_UMAX:
89 case V3D_TMU_OP_WRITE_SMIN:
90 case V3D_TMU_OP_WRITE_SMAX:
91 assert(is_write);
92 FALLTHROUGH;
93 case V3D_TMU_OP_WRITE_AND_READ_INC:
94 case V3D_TMU_OP_WRITE_OR_READ_DEC:
95 case V3D_TMU_OP_WRITE_XOR_READ_NOT:
96 return V3D_TMU_OP_TYPE_ATOMIC;
97 case V3D_TMU_OP_REGULAR:
98 return V3D_TMU_OP_TYPE_REGULAR;
99
100 default:
101 unreachable("Unknown tmu_op\n");
102 }
103 }
104 static void
105 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
106
107 static void
108 resize_qreg_array(struct v3d_compile *c,
109 struct qreg **regs,
110 uint32_t *size,
111 uint32_t decl_size)
112 {
113 if (*size >= decl_size)
114 return;
115
116 uint32_t old_size = *size;
117 *size = MAX2(*size * 2, decl_size);
118 *regs = reralloc(c, *regs, struct qreg, *size);
119 if (!*regs) {
120 fprintf(stderr, "Malloc failure\n");
121 abort();
122 }
123
124 for (uint32_t i = old_size; i < *size; i++)
125 (*regs)[i] = c->undef;
126 }
127
128 static void
129 resize_interp_array(struct v3d_compile *c,
130 struct v3d_interp_input **regs,
131 uint32_t *size,
132 uint32_t decl_size)
133 {
134 if (*size >= decl_size)
135 return;
136
137 uint32_t old_size = *size;
138 *size = MAX2(*size * 2, decl_size);
139 *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
140 if (!*regs) {
141 fprintf(stderr, "Malloc failure\n");
142 abort();
143 }
144
145 for (uint32_t i = old_size; i < *size; i++) {
146 (*regs)[i].vp = c->undef;
147 (*regs)[i].C = c->undef;
148 }
149 }
150
151 void
152 vir_emit_thrsw(struct v3d_compile *c)
153 {
154 if (c->threads == 1)
155 return;
156
157 /* Always thread switch after each texture operation for now.
158 *
159 * We could do better by batching a bunch of texture fetches up and
160 * then doing one thread switch and collecting all their results
161 * afterward.
162 */
163 c->last_thrsw = vir_NOP(c);
164 c->last_thrsw->qpu.sig.thrsw = true;
165 c->last_thrsw_at_top_level = !c->in_control_flow;
166
167 /* We need to lock the scoreboard before any tlb access happens. If this
168 * thread switch comes after we have emitted a tlb load, then it means
169 * that we can't lock on the last thread switch any more.
170 */
171 if (c->emitted_tlb_load)
172 c->lock_scoreboard_on_first_thrsw = true;
173 }
174
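/* Returns the TMU op to use for an atomic add: when the operand is a
 * compile-time constant +1 or -1 we can use the dedicated increment/decrement
 * ops, which take the amount to add/sub implicitly instead of from TMUD.
 */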
175 uint32_t
176 v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
177 {
178 if (nir_src_is_const(instr->src[src])) {
179 int64_t add_val = nir_src_as_int(instr->src[src]);
180 if (add_val == 1)
181 return V3D_TMU_OP_WRITE_AND_READ_INC;
182 else if (add_val == -1)
183 return V3D_TMU_OP_WRITE_OR_READ_DEC;
184 }
185
186 return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
187 }
188
189 static uint32_t
190 v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr)
191 {
192 nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
193 switch (atomic_op) {
194 case nir_atomic_op_iadd:
195 return instr->intrinsic == nir_intrinsic_ssbo_atomic ?
196 v3d_get_op_for_atomic_add(instr, 2) :
197 v3d_get_op_for_atomic_add(instr, 1);
198 case nir_atomic_op_imin: return V3D_TMU_OP_WRITE_SMIN;
199 case nir_atomic_op_umin: return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
200 case nir_atomic_op_imax: return V3D_TMU_OP_WRITE_SMAX;
201 case nir_atomic_op_umax: return V3D_TMU_OP_WRITE_UMAX;
202 case nir_atomic_op_iand: return V3D_TMU_OP_WRITE_AND_READ_INC;
203 case nir_atomic_op_ior: return V3D_TMU_OP_WRITE_OR_READ_DEC;
204 case nir_atomic_op_ixor: return V3D_TMU_OP_WRITE_XOR_READ_NOT;
205 case nir_atomic_op_xchg: return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
206 case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
207 default: unreachable("unknown atomic op");
208 }
209 }
210
211 static uint32_t
212 v3d_general_tmu_op(nir_intrinsic_instr *instr)
213 {
214 switch (instr->intrinsic) {
215 case nir_intrinsic_load_ssbo:
216 case nir_intrinsic_load_ubo:
217 case nir_intrinsic_load_uniform:
218 case nir_intrinsic_load_shared:
219 case nir_intrinsic_load_scratch:
220 case nir_intrinsic_load_global:
221 case nir_intrinsic_load_global_constant:
222 case nir_intrinsic_store_ssbo:
223 case nir_intrinsic_store_shared:
224 case nir_intrinsic_store_scratch:
225 case nir_intrinsic_store_global:
226 return V3D_TMU_OP_REGULAR;
227
228 case nir_intrinsic_ssbo_atomic:
229 case nir_intrinsic_ssbo_atomic_swap:
230 case nir_intrinsic_shared_atomic:
231 case nir_intrinsic_shared_atomic_swap:
232 case nir_intrinsic_global_atomic:
233 case nir_intrinsic_global_atomic_swap:
234 return v3d_general_tmu_op_for_atomic(instr);
235
236 default:
237 unreachable("unknown intrinsic op");
238 }
239 }
240
241 /**
242 * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
243 * would overflow the Output TMU fifo.
244 *
245 * Overflowing the Output fifo is not allowed, but the Input and Config
246 * fifos may overflow. That only stalls the shader for as long as it needs
247 * to make progress again, so for pipelining it is better to let the QPU
248 * stall on those when needed than to try to emit TMU flushes in the driver.
249 * Note that the 16-entry Output fifo is shared among all threads in a QPU.
250 */
251 bool
252 ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
253 {
254 if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
255 return true;
256
257 return components > 0 &&
258 c->tmu.output_fifo_size + components > 16 / c->threads;
259 }
260
261 /**
262 * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
263 * popping all TMU fifo entries.
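 *
 * Reads pop their results from the Output fifo with one LDTMU per pending
 * component, while writes that return no data are retired with a single
 * TMUWT.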
264 */
265 void
266 ntq_flush_tmu(struct v3d_compile *c)
267 {
268 if (c->tmu.flush_count == 0)
269 return;
270
271 vir_emit_thrsw(c);
272
273 bool emitted_tmuwt = false;
274 for (int i = 0; i < c->tmu.flush_count; i++) {
275 if (c->tmu.flush[i].component_mask > 0) {
276 nir_def *def = c->tmu.flush[i].def;
277 assert(def);
278
279 for (int j = 0; j < 4; j++) {
280 if (c->tmu.flush[i].component_mask & (1 << j)) {
281 ntq_store_def(c, def, j,
282 vir_MOV(c, vir_LDTMU(c)));
283 }
284 }
285 } else if (!emitted_tmuwt) {
286 vir_TMUWT(c);
287 emitted_tmuwt = true;
288 }
289 }
290
291 c->tmu.output_fifo_size = 0;
292 c->tmu.flush_count = 0;
293 _mesa_set_clear(c->tmu.outstanding_regs, NULL);
294 }
295
296 /**
297 * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
298 * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
299 * and more specifically, the output fifo, since that can't stall.
300 */
301 void
302 ntq_add_pending_tmu_flush(struct v3d_compile *c,
303 nir_def *def,
304 uint32_t component_mask)
305 {
306 const uint32_t num_components = util_bitcount(component_mask);
307 assert(!ntq_tmu_fifo_overflow(c, num_components));
308
309 if (num_components > 0) {
310 c->tmu.output_fifo_size += num_components;
311
312 nir_intrinsic_instr *store = nir_store_reg_for_def(def);
313 if (store != NULL) {
314 nir_def *reg = store->src[1].ssa;
315 _mesa_set_add(c->tmu.outstanding_regs, reg);
316 }
317 }
318
319 c->tmu.flush[c->tmu.flush_count].def = def;
320 c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
321 c->tmu.flush_count++;
322 c->tmu.total_count++;
323
324 if (c->disable_tmu_pipelining)
325 ntq_flush_tmu(c);
326 else if (c->tmu.flush_count > 1)
327 c->pipelined_any_tmu = true;
328 }
329
330 enum emit_mode {
331 MODE_COUNT = 0,
332 MODE_EMIT,
333 MODE_LAST,
334 };
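/* TMU register writes for a general access are emitted in two passes over
 * the same code (see ntq_emit_tmu_general() below):
 *
 *    for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
 *            ...MODE_COUNT: count register writes, drop the thread count or
 *               flush outstanding TMU work if needed...
 *            ...MODE_EMIT: emit the actual TMUD/TMUA register writes...
 *    }
 *
 * Counting first ensures any required flush happens before the first write
 * of the new TMU sequence rather than in the middle of it.
 */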
335
336 /**
337 * For a TMU general store instruction:
338 *
339 * In MODE_COUNT mode, records the number of TMU writes required and flushes
340 * any outstanding TMU operations the instruction depends on, but it doesn't
341 * emit any actual register writes.
342 *
343 * In MODE_EMIT mode, emits the data register writes required by the
344 * instruction.
345 */
346 static void
347 emit_tmu_general_store_writes(struct v3d_compile *c,
348 enum emit_mode mode,
349 nir_intrinsic_instr *instr,
350 uint32_t base_const_offset,
351 uint32_t *writemask,
352 uint32_t *const_offset,
353 uint32_t *type_size,
354 uint32_t *tmu_writes)
355 {
356 struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
357
358 /* Find the first set of consecutive components that
359 * are enabled in the writemask and emit the TMUD
360 * instructions for them.
361 */
362 assert(*writemask != 0);
363 uint32_t first_component = ffs(*writemask) - 1;
364 uint32_t last_component = first_component;
365 while (*writemask & BITFIELD_BIT(last_component + 1))
366 last_component++;
367
368 assert(first_component <= last_component &&
369 last_component < instr->num_components);
370
371 for (int i = first_component; i <= last_component; i++) {
372 struct qreg data = ntq_get_src(c, instr->src[0], i);
373 if (mode == MODE_COUNT)
374 (*tmu_writes)++;
375 else
376 vir_MOV_dest(c, tmud, data);
377 }
378
379 if (mode == MODE_EMIT) {
380 /* Update the offset for the TMU write based on the
381 * first component we are writing.
382 */
383 *type_size = nir_src_bit_size(instr->src[0]) / 8;
384 *const_offset =
385 base_const_offset + first_component * (*type_size);
386
387 /* Clear these components from the writemask */
388 uint32_t written_mask =
389 BITFIELD_RANGE(first_component, *tmu_writes);
390 (*writemask) &= ~written_mask;
391 }
392 }
393
394 /**
395 * For a TMU general atomic instruction:
396 *
397 * In MODE_COUNT mode, records the number of TMU writes required and flushes
398 * any outstanding TMU operations the instruction depends on, but it doesn't
399 * emit any actual register writes.
400 *
401 * In MODE_EMIT mode, emits the data register writes required by the
402 * instruction.
403 */
404 static void
405 emit_tmu_general_atomic_writes(struct v3d_compile *c,
406 enum emit_mode mode,
407 nir_intrinsic_instr *instr,
408 uint32_t tmu_op,
409 bool has_index,
410 uint32_t *tmu_writes)
411 {
412 struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
413
414 struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
415 if (mode == MODE_COUNT)
416 (*tmu_writes)++;
417 else
418 vir_MOV_dest(c, tmud, data);
419
420 if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
421 data = ntq_get_src(c, instr->src[2 + has_index], 0);
422 if (mode == MODE_COUNT)
423 (*tmu_writes)++;
424 else
425 vir_MOV_dest(c, tmud, data);
426 }
427 }
428
429 /**
430 * For any TMU general instruction:
431 *
432 * In MODE_COUNT mode, records the number of TMU writes required to emit the
433 * address parameter and flushes any outstanding TMU operations the instruction
434 * depends on, but it doesn't emit any actual register writes.
435 *
436 * In MODE_EMIT mode, emits register writes required to emit the address.
437 */
438 static void
439 emit_tmu_general_address_write(struct v3d_compile *c,
440 enum emit_mode mode,
441 nir_intrinsic_instr *instr,
442 uint32_t config,
443 bool dynamic_src,
444 int offset_src,
445 struct qreg base_offset,
446 uint32_t const_offset,
447 uint32_t dest_components,
448 uint32_t *tmu_writes)
449 {
450 if (mode == MODE_COUNT) {
451 (*tmu_writes)++;
452 if (dynamic_src)
453 ntq_get_src(c, instr->src[offset_src], 0);
454 return;
455 }
456
457 if (vir_in_nonuniform_control_flow(c)) {
458 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
459 V3D_QPU_PF_PUSHZ);
460 }
461
462 struct qreg tmua;
463 if (config == ~0)
464 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
465 else
466 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
467
468 struct qinst *tmu;
469 if (dynamic_src) {
470 struct qreg offset = base_offset;
471 if (const_offset != 0) {
472 offset = vir_ADD(c, offset,
473 vir_uniform_ui(c, const_offset));
474 }
475 struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
476 tmu = vir_ADD_dest(c, tmua, offset, data);
477 } else {
478 if (const_offset != 0) {
479 tmu = vir_ADD_dest(c, tmua, base_offset,
480 vir_uniform_ui(c, const_offset));
481 } else {
482 tmu = vir_MOV_dest(c, tmua, base_offset);
483 }
484 }
485
486 if (config != ~0) {
487 tmu->uniform =
488 vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
489 }
490
491 if (vir_in_nonuniform_control_flow(c))
492 vir_set_cond(tmu, V3D_QPU_COND_IFA);
493
494 tmu->ldtmu_count = dest_components;
495 }
496
497 /**
498 * Implements indirect uniform loads and SSBO accesses through the TMU general
499 * memory access interface.
500 */
501 static void
502 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
503 bool is_shared_or_scratch, bool is_global)
504 {
505 uint32_t tmu_op = v3d_general_tmu_op(instr);
506
507 /* If we were able to replace an atomic_add with an inc/dec, then we
508 * need to do things slightly differently, like not loading the
509 * amount to add/sub, as that is implicit.
510 */
511 bool atomic_add_replaced =
512 (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
513 instr->intrinsic == nir_intrinsic_shared_atomic ||
514 instr->intrinsic == nir_intrinsic_global_atomic) &&
515 nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
516 (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
517 tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);
518
519 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
520 instr->intrinsic == nir_intrinsic_store_scratch ||
521 instr->intrinsic == nir_intrinsic_store_shared ||
522 instr->intrinsic == nir_intrinsic_store_global);
523
524 bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
525 instr->intrinsic == nir_intrinsic_load_ubo ||
526 instr->intrinsic == nir_intrinsic_load_ssbo ||
527 instr->intrinsic == nir_intrinsic_load_scratch ||
528 instr->intrinsic == nir_intrinsic_load_shared ||
529 instr->intrinsic == nir_intrinsic_load_global ||
530 instr->intrinsic == nir_intrinsic_load_global_constant);
531
532 if (!is_load)
533 c->tmu_dirty_rcl = true;
534
535 if (is_global)
536 c->has_global_address = true;
537
538 bool has_index = !is_shared_or_scratch && !is_global;
539
540 int offset_src;
541 if (instr->intrinsic == nir_intrinsic_load_uniform) {
542 offset_src = 0;
543 } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
544 instr->intrinsic == nir_intrinsic_load_ubo ||
545 instr->intrinsic == nir_intrinsic_load_scratch ||
546 instr->intrinsic == nir_intrinsic_load_shared ||
547 instr->intrinsic == nir_intrinsic_load_global ||
548 instr->intrinsic == nir_intrinsic_load_global_constant ||
549 atomic_add_replaced) {
550 offset_src = 0 + has_index;
551 } else if (is_store) {
552 offset_src = 1 + has_index;
553 } else {
554 offset_src = 0 + has_index;
555 }
556
557 bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
558 uint32_t const_offset = 0;
559 if (!dynamic_src)
560 const_offset = nir_src_as_uint(instr->src[offset_src]);
561
562 struct qreg base_offset;
563 if (instr->intrinsic == nir_intrinsic_load_uniform) {
564 const_offset += nir_intrinsic_base(instr);
565 base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
566 v3d_unit_data_create(0, const_offset));
567 const_offset = 0;
568 } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
569 /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
570 * is gallium's constant buffer 0 in GL and push constants
571 * in Vulkan).
572 */
573 uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
574 base_offset =
575 vir_uniform(c, QUNIFORM_UBO_ADDR,
576 v3d_unit_data_create(index, const_offset));
577 const_offset = 0;
578 } else if (is_shared_or_scratch) {
579 /* Shared and scratch variables have no buffer index, and all
580 * start from a common base that we set up at the start of
581 * dispatch.
582 */
583 if (instr->intrinsic == nir_intrinsic_load_scratch ||
584 instr->intrinsic == nir_intrinsic_store_scratch) {
585 base_offset = c->spill_base;
586 } else {
587 base_offset = c->cs_shared_offset;
588 const_offset += nir_intrinsic_base(instr);
589 }
590 } else if (is_global) {
591 /* Global load/store intrinsics use global addresses, so the
592 * offset is the target address and we don't need to add it
593 * to a base offset.
594 */
595 base_offset = vir_uniform_ui(c, 0);
596 } else {
597 uint32_t idx = is_store ? 1 : 0;
598 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
599 nir_src_comp_as_uint(instr->src[idx], 0));
600 }
601
602 /* We are ready to emit TMU register writes now, but before we actually
603 * emit them we need to flush any outstanding TMU operations whose
604 * results our writes read from, and we must do so before we start the
605 * TMU sequence for this operation, since otherwise the flush could
606 * happen in the middle of the TMU sequence we are about to emit, which
607 * is illegal. To do this we run this logic twice: the first pass
608 * counts the required register writes and flushes pending TMU requests
609 * if necessary due to a dependency, and the second pass emits the
610 * actual TMU writes.
611 */
612 const uint32_t dest_components = nir_intrinsic_dest_components(instr);
613 uint32_t base_const_offset = const_offset;
614 uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
615 uint32_t tmu_writes = 0;
616 for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
617 assert(mode == MODE_COUNT || tmu_writes > 0);
618
619 uint32_t type_size = 4;
620
621 if (is_store) {
622 emit_tmu_general_store_writes(c, mode, instr,
623 base_const_offset,
624 &writemask,
625 &const_offset,
626 &type_size,
627 &tmu_writes);
628 } else if (!is_load && !atomic_add_replaced) {
629 emit_tmu_general_atomic_writes(c, mode, instr,
630 tmu_op, has_index,
631 &tmu_writes);
632 } else if (is_load) {
633 type_size = instr->def.bit_size / 8;
634 }
635
636 /* For atomics we use 32-bit, except for CMPXCHG, which needs
637 * to use VEC2. For the rest of the cases we use the number of
638 * TMUD writes we did to decide the type. For cache operations
639 * the type is ignored.
640 */
641 uint32_t config = 0;
642 if (mode == MODE_EMIT) {
643 uint32_t num_components;
644 if (is_load || atomic_add_replaced) {
645 num_components = instr->num_components;
646 } else {
647 assert(tmu_writes > 0);
648 num_components = tmu_writes - 1;
649 }
650 bool is_atomic =
651 v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
652 V3D_TMU_OP_TYPE_ATOMIC;
653
654 /* Only load per-quad if we can be certain that all
655 * lanes in the quad are active. Notice that demoted
656 * invocations, unlike terminated ones, are still
657 * active: we want to skip memory writes for them but
658 * loads should still work.
659 */
660 uint32_t perquad =
661 is_load && !vir_in_nonuniform_control_flow(c) &&
662 ((c->s->info.stage == MESA_SHADER_FRAGMENT &&
663 c->s->info.fs.needs_quad_helper_invocations &&
664 !c->emitted_discard) ||
665 c->s->info.uses_wide_subgroup_intrinsics) ?
666 GENERAL_TMU_LOOKUP_PER_QUAD :
667 GENERAL_TMU_LOOKUP_PER_PIXEL;
668 config = 0xffffff00 | tmu_op << 3 | perquad;
669
670 if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
671 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
672 } else if (is_atomic || num_components == 1) {
673 switch (type_size) {
674 case 4:
675 config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
676 break;
677 case 2:
678 config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
679 break;
680 case 1:
681 config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
682 break;
683 default:
684 unreachable("Unsupported bitsize");
685 }
686 } else {
687 assert(type_size == 4);
688 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
689 num_components - 2;
690 }
691 }
692
693 emit_tmu_general_address_write(c, mode, instr, config,
694 dynamic_src, offset_src,
695 base_offset, const_offset,
696 dest_components, &tmu_writes);
697
698 assert(tmu_writes > 0);
699 if (mode == MODE_COUNT) {
700 /* Make sure we won't exceed the 16-entry TMU
701 * fifo if each thread is storing at the same
702 * time.
703 */
704 while (tmu_writes > 16 / c->threads)
705 c->threads /= 2;
706
707 /* If pipelining this TMU operation would
708 * overflow TMU fifos, we need to flush.
709 */
710 if (ntq_tmu_fifo_overflow(c, dest_components))
711 ntq_flush_tmu(c);
712 } else {
713 /* Delay emission of the thread switch and
714 * LDTMU/TMUWT until we really need to do it to
715 * improve pipelining.
716 */
717 const uint32_t component_mask =
718 (1 << dest_components) - 1;
719 ntq_add_pending_tmu_flush(c, &instr->def,
720 component_mask);
721 }
722 }
723
724 /* nir_lower_wrmasks should've ensured that any writemask on a store
725 * operation only has consecutive bits set, in which case we should've
726 * processed the full writemask above.
727 */
728 assert(writemask == 0);
729 }
730
731 static struct qreg *
732 ntq_init_ssa_def(struct v3d_compile *c, nir_def *def)
733 {
734 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
735 def->num_components);
736 _mesa_hash_table_insert(c->def_ht, def, qregs);
737 return qregs;
738 }
739
740 static bool
741 is_ld_signal(const struct v3d_qpu_sig *sig)
742 {
743 return (sig->ldunif ||
744 sig->ldunifa ||
745 sig->ldunifrf ||
746 sig->ldunifarf ||
747 sig->ldtmu ||
748 sig->ldvary ||
749 sig->ldvpm ||
750 sig->ldtlb ||
751 sig->ldtlbu);
752 }
753
754 static inline bool
755 is_ldunif_signal(const struct v3d_qpu_sig *sig)
756 {
757 return sig->ldunif || sig->ldunifrf;
758 }
759
760 /**
761 * This function is responsible for getting VIR results into the associated
762 * storage for a NIR instruction.
763 *
764 * If it's a NIR SSA def, then we just set the associated hash table entry to
765 * the new result.
766 *
767 * If it's a NIR reg, then we need to update the existing qreg assigned to the
768 * NIR destination with the incoming value. To do that without introducing
769 * new MOVs, we require that the incoming qreg either be a uniform, or be
770 * SSA-defined by the previous VIR instruction in the block and rewritable by
771 * this function. That lets us sneak ahead and insert the SF flag beforehand
772 * (knowing that the previous instruction doesn't depend on flags) and rewrite
773 * its destination to be the NIR reg's destination.
774 */
775 void
776 ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
777 struct qreg result)
778 {
779 struct qinst *last_inst = NULL;
780 if (!list_is_empty(&c->cur_block->instructions))
781 last_inst = (struct qinst *)c->cur_block->instructions.prev;
782
783 bool is_reused_uniform =
784 is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
785 last_inst != c->defs[result.index];
786
787 assert(result.file == QFILE_TEMP && last_inst &&
788 (last_inst == c->defs[result.index] || is_reused_uniform));
789
790 nir_intrinsic_instr *store = nir_store_reg_for_def(def);
791 if (store == NULL) {
792 assert(chan < def->num_components);
793
794 struct qreg *qregs;
795 struct hash_entry *entry =
796 _mesa_hash_table_search(c->def_ht, def);
797
798 if (entry)
799 qregs = entry->data;
800 else
801 qregs = ntq_init_ssa_def(c, def);
802
803 qregs[chan] = result;
804 } else {
805 nir_def *reg = store->src[1].ssa;
806 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
807 assert(nir_intrinsic_base(store) == 0);
808 assert(nir_intrinsic_num_array_elems(decl) == 0);
809 struct hash_entry *entry =
810 _mesa_hash_table_search(c->def_ht, reg);
811 struct qreg *qregs = entry->data;
812
813 /* If the previous instruction can't be predicated for
814 * the store into the nir_register, then emit a MOV
815 * that can be.
816 */
817 if (is_reused_uniform ||
818 (vir_in_nonuniform_control_flow(c) &&
819 is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
820 result = vir_MOV(c, result);
821 last_inst = c->defs[result.index];
822 }
823
824 /* We know they're both temps, so just rewrite index. */
825 c->defs[last_inst->dst.index] = NULL;
826 last_inst->dst.index = qregs[chan].index;
827
828 /* If we're in control flow, then make this update of the reg
829 * conditional on the execution mask.
830 */
831 if (vir_in_nonuniform_control_flow(c)) {
832 last_inst->dst.index = qregs[chan].index;
833
834 /* Set the flags to the current exec mask.
835 */
836 c->cursor = vir_before_inst(last_inst);
837 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
838 V3D_QPU_PF_PUSHZ);
839 c->cursor = vir_after_inst(last_inst);
840
841 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
842 }
843 }
844 }
845
846 /**
847 * This looks up the qreg associated with a particular ssa/reg used as a source
848 * in any instruction.
849 *
850 * The definition for any NIR value read as a source is expected to have
851 * been emitted by a previous instruction. However, in the case of TMU
852 * operations, to improve pipelining we may have postponed emission of the
853 * thread switch and LDTMUs required to read the TMU results until the
854 * results are actually used. That would lead to us not finding them here
855 * (for SSA defs) or finding them in the list of registers awaiting a TMU
856 * flush (for registers), meaning that we need to flush outstanding TMU
857 * operations to read the correct value.
858 */
859 struct qreg
860 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
861 {
862 struct hash_entry *entry;
863
864 nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
865 if (load == NULL) {
866 assert(i < src.ssa->num_components);
867
868 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
869 if (!entry) {
870 ntq_flush_tmu(c);
871 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
872 }
873 } else {
874 nir_def *reg = load->src[0].ssa;
875 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
876 assert(nir_intrinsic_base(load) == 0);
877 assert(nir_intrinsic_num_array_elems(decl) == 0);
878 assert(i < nir_intrinsic_num_components(decl));
879
880 if (_mesa_set_search(c->tmu.outstanding_regs, reg))
881 ntq_flush_tmu(c);
882 entry = _mesa_hash_table_search(c->def_ht, reg);
883 }
884 assert(entry);
885
886 struct qreg *qregs = entry->data;
887 return qregs[i];
888 }
889
890 static struct qreg
891 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
892 unsigned src)
893 {
894 struct qreg r = ntq_get_src(c, instr->src[src].src,
895 instr->src[src].swizzle[0]);
896
897 return r;
898 };
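/* Note: only swizzle[0] is used above because ALU operations are scalarized
 * before they reach this backend, so each source reads a single channel.
 */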
899
900 static struct qreg
901 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
902 {
903 return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
904 }
905
906 static void
907 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
908 {
909 unsigned unit = instr->texture_index;
910 int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
911 int dest_size = nir_tex_instr_dest_size(instr);
912
913 struct qreg lod = c->undef;
914 if (lod_index != -1)
915 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
916
917 for (int i = 0; i < dest_size; i++) {
918 assert(i < 3);
919 enum quniform_contents contents;
920
921 if (instr->is_array && i == dest_size - 1)
922 contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
923 else
924 contents = QUNIFORM_TEXTURE_WIDTH + i;
925
926 struct qreg size = vir_uniform(c, contents, unit);
927
928 switch (instr->sampler_dim) {
929 case GLSL_SAMPLER_DIM_1D:
930 case GLSL_SAMPLER_DIM_2D:
931 case GLSL_SAMPLER_DIM_MS:
932 case GLSL_SAMPLER_DIM_3D:
933 case GLSL_SAMPLER_DIM_CUBE:
934 case GLSL_SAMPLER_DIM_BUF:
935 case GLSL_SAMPLER_DIM_EXTERNAL:
936 /* Don't minify the array size. */
937 if (!(instr->is_array && i == dest_size - 1)) {
938 size = ntq_minify(c, size, lod);
939 }
940 break;
941
942 case GLSL_SAMPLER_DIM_RECT:
943 /* There's no LOD field for rects */
944 break;
945
946 default:
947 unreachable("Bad sampler type");
948 }
949
950 ntq_store_def(c, &instr->def, i, size);
951 }
952 }
953
954 static void
955 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
956 {
957 unsigned unit = instr->texture_index;
958
959 /* Since each texture sampling op requires uploading uniforms to
960 * reference the texture anyway, there's no HW support for texture
961 * size queries; we just upload uniforms containing the size.
962 */
963 switch (instr->op) {
964 case nir_texop_query_levels:
965 ntq_store_def(c, &instr->def, 0,
966 vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
967 return;
968 case nir_texop_texture_samples:
969 ntq_store_def(c, &instr->def, 0,
970 vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
971 return;
972 case nir_texop_txs:
973 ntq_emit_txs(c, instr);
974 return;
975 default:
976 break;
977 }
978
979 v3d_vir_emit_tex(c, instr);
980 }
981
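/* Computes sin(x) or cos(x) using the SFU SIN op, which operates on a single
 * period in units of pi: the input is scaled by 1/pi (cosine adds a half
 * period first), range-reduced by subtracting the rounded period count, and
 * the result's sign is flipped for odd periods by XORing in the sign bit
 * (the shift amount of -1 below is taken modulo 32, i.e. a shift by 31).
 */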
982 static struct qreg
983 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
984 {
985 struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
986 if (is_cos)
987 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
988
989 struct qreg periods = vir_FROUND(c, input);
990 struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
991 return vir_XOR(c, sin_output, vir_SHL(c,
992 vir_FTOIN(c, periods),
993 vir_uniform_ui(c, -1)));
994 }
995
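/* Computes sign(x) with condition flags: start from 0.0, overwrite with 1.0
 * if x is non-zero, then with -1.0 if x is negative.
 */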
996 static struct qreg
997 ntq_fsign(struct v3d_compile *c, struct qreg src)
998 {
999 struct qreg t = vir_get_temp(c);
1000
1001 vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
1002 vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
1003 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
1004 vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
1005 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
1006 return vir_MOV(c, t);
1007 }
1008
1009 static void
1010 emit_fragcoord_input(struct v3d_compile *c, int attr)
1011 {
1012 c->inputs[attr * 4 + 0] = vir_FXCD(c);
1013 c->inputs[attr * 4 + 1] = vir_FYCD(c);
1014 c->inputs[attr * 4 + 2] = c->payload_z;
1015 c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
1016 }
1017
1018 static struct qreg
1019 emit_smooth_varying(struct v3d_compile *c,
1020 struct qreg vary, struct qreg w, struct qreg c_reg)
1021 {
1022 return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
1023 }
1024
1025 static struct qreg
1026 emit_noperspective_varying(struct v3d_compile *c,
1027 struct qreg vary, struct qreg c_reg)
1028 {
1029 return vir_FADD(c, vir_MOV(c, vary), c_reg);
1030 }
1031
1032 static struct qreg
1033 emit_flat_varying(struct v3d_compile *c,
1034 struct qreg vary, struct qreg c_reg)
1035 {
1036 vir_MOV_dest(c, c->undef, vary);
1037 return vir_MOV(c, c_reg);
1038 }
1039
1040 static struct qreg
1041 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
1042 int8_t input_idx, uint8_t swizzle, int array_index)
1043 {
1044 struct qreg c_reg; /* C coefficient */
1045
1046 if (c->devinfo->has_accumulators)
1047 c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
1048 else
1049 c_reg = vir_reg(QFILE_REG, 0);
1050
1051 struct qinst *ldvary = NULL;
1052 struct qreg vary;
1053 ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
1054 c->undef, c->undef);
1055 ldvary->qpu.sig.ldvary = true;
1056 vary = vir_emit_def(c, ldvary);
1057
1058 /* Store the input value before interpolation so we can implement
1059 * GLSL's interpolateAt functions if the shader uses them.
1060 */
1061 if (input_idx >= 0) {
1062 assert(var);
1063 c->interp[input_idx].vp = vary;
1064 c->interp[input_idx].C = vir_MOV(c, c_reg);
1065 c->interp[input_idx].mode = var->data.interpolation;
1066 }
1067
1068 /* For gl_PointCoord input or distance along a line, we'll be called
1069 * with no nir_variable, and these don't count toward VPM size so we
1070 * don't track an input slot.
1071 */
1072 if (!var) {
1073 assert(input_idx < 0);
1074 return emit_smooth_varying(c, vary, c->payload_w, c_reg);
1075 }
1076
1077 int i = c->num_inputs++;
1078 c->input_slots[i] =
1079 v3d_slot_from_slot_and_component(var->data.location +
1080 array_index, swizzle);
1081
1082 struct qreg result;
1083 switch (var->data.interpolation) {
1084 case INTERP_MODE_NONE:
1085 case INTERP_MODE_SMOOTH:
1086 if (var->data.centroid) {
1087 BITSET_SET(c->centroid_flags, i);
1088 result = emit_smooth_varying(c, vary,
1089 c->payload_w_centroid, c_reg);
1090 } else {
1091 result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
1092 }
1093 break;
1094
1095 case INTERP_MODE_NOPERSPECTIVE:
1096 BITSET_SET(c->noperspective_flags, i);
1097 result = emit_noperspective_varying(c, vary, c_reg);
1098 break;
1099
1100 case INTERP_MODE_FLAT:
1101 BITSET_SET(c->flat_shade_flags, i);
1102 result = emit_flat_varying(c, vary, c_reg);
1103 break;
1104
1105 default:
1106 unreachable("Bad interp mode");
1107 }
1108
1109 if (input_idx >= 0)
1110 c->inputs[input_idx] = result;
1111 return result;
1112 }
1113
1114 static void
1115 emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
1116 int array_index, unsigned nelem)
1117 {
1118 for (int i = 0; i < nelem ; i++) {
1119 int chan = var->data.location_frac + i;
1120 int input_idx = (base_attr + array_index) * 4 + chan;
1121 emit_fragment_varying(c, var, input_idx, chan, array_index);
1122 }
1123 }
1124
1125 static void
1126 emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
1127 int array_index)
1128 {
1129 /* Compact variables are scalar arrays where each set of 4 elements
1130 * consumes a single location.
1131 */
1132 int loc_offset = array_index / 4;
1133 int chan = var->data.location_frac + array_index % 4;
1134 int input_idx = (attr + loc_offset) * 4 + chan;
1135 emit_fragment_varying(c, var, input_idx, chan, loc_offset);
1136 }
1137
1138 static void
1139 add_output(struct v3d_compile *c,
1140 uint32_t decl_offset,
1141 uint8_t slot,
1142 uint8_t swizzle)
1143 {
1144 uint32_t old_array_size = c->outputs_array_size;
1145 resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
1146 decl_offset + 1);
1147
1148 if (old_array_size != c->outputs_array_size) {
1149 c->output_slots = reralloc(c,
1150 c->output_slots,
1151 struct v3d_varying_slot,
1152 c->outputs_array_size);
1153 }
1154
1155 c->output_slots[decl_offset] =
1156 v3d_slot_from_slot_and_component(slot, swizzle);
1157 }
1158
1159 /**
1160 * If compare_instr is a supported comparison instruction, emits the
1161 * comparison, stores the condition code that predicates on its result in
1162 * *out_cond and returns true; otherwise returns false.
1163 */
1164 static bool
1165 ntq_emit_comparison(struct v3d_compile *c,
1166 nir_alu_instr *compare_instr,
1167 enum v3d_qpu_cond *out_cond)
1168 {
1169 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
1170 struct qreg src1;
1171 if (nir_op_infos[compare_instr->op].num_inputs > 1)
1172 src1 = ntq_get_alu_src(c, compare_instr, 1);
1173 bool cond_invert = false;
1174 struct qreg nop = vir_nop_reg();
1175
1176 switch (compare_instr->op) {
1177 case nir_op_feq32:
1178 case nir_op_seq:
1179 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1180 break;
1181 case nir_op_ieq32:
1182 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1183 break;
1184
1185 case nir_op_fneu32:
1186 case nir_op_sne:
1187 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1188 cond_invert = true;
1189 break;
1190 case nir_op_ine32:
1191 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1192 cond_invert = true;
1193 break;
1194
1195 case nir_op_fge32:
1196 case nir_op_sge:
1197 vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1198 break;
1199 case nir_op_ige32:
1200 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1201 cond_invert = true;
1202 break;
1203 case nir_op_uge32:
1204 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1205 cond_invert = true;
1206 break;
1207
1208 case nir_op_slt:
1209 case nir_op_flt32:
1210 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
1211 break;
1212 case nir_op_ilt32:
1213 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1214 break;
1215 case nir_op_ult32:
1216 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1217 break;
1218
1219 default:
1220 return false;
1221 }
1222
1223 *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
1224
1225 return true;
1226 }
1227
1228 /* Finds an ALU instruction that generates our src value that could
1229 * (potentially) be greedily emitted in the consuming instruction.
1230 */
1231 static struct nir_alu_instr *
1232 ntq_get_alu_parent(nir_src src)
1233 {
1234 if (src.ssa->parent_instr->type != nir_instr_type_alu)
1235 return NULL;
1236 nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
1237 if (!instr)
1238 return NULL;
1239
1240 /* If the ALU instr's srcs are non-SSA, then we would have to avoid
1241 * moving emission of the ALU instr down past another write of the
1242 * src.
1243 */
1244 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1245 if (nir_load_reg_for_def(instr->src[i].src.ssa))
1246 return NULL;
1247 }
1248
1249 return instr;
1250 }
1251
1252 /* Turns a NIR bool into a condition code to predicate on. */
1253 static enum v3d_qpu_cond
1254 ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
1255 {
1256 struct qreg qsrc = ntq_get_src(c, src, 0);
1257 /* skip if we already have src in the flags */
1258 if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
1259 return c->flags_cond;
1260
1261 nir_alu_instr *compare = ntq_get_alu_parent(src);
1262 if (!compare)
1263 goto out;
1264
1265 enum v3d_qpu_cond cond;
1266 if (ntq_emit_comparison(c, compare, &cond))
1267 return cond;
1268
1269 out:
1270
1271 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
1272 V3D_QPU_PF_PUSHZ);
1273 return V3D_QPU_COND_IFNA;
1274 }
1275
1276 static struct qreg
1277 ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1278 {
1279 struct qreg result =
1280 vir_MOV(c, vir_SEL(c, cond,
1281 vir_uniform_ui(c, ~0),
1282 vir_uniform_ui(c, 0)));
1283 c->flags_temp = result.index;
1284 c->flags_cond = cond;
1285 return result;
1286 }
1287
1288 static struct qreg
1289 ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond)
1290 {
1291 struct qreg result =
1292 vir_MOV(c, vir_SEL(c, cond,
1293 vir_uniform_ui(c, 1),
1294 vir_uniform_ui(c, 0)));
1295 c->flags_temp = result.index;
1296 c->flags_cond = cond;
1297 return result;
1298 }
1299
1300 static struct qreg
1301 f2f16_rtz(struct v3d_compile *c, struct qreg f32)
1302 {
1303 /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
1304 * method and seems to be using RTE by default, so we need to implement
1305 * RTZ rounding in software.
1306 */
1307 struct qreg rf16 = vir_FMOV(c, f32);
1308 vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L);
1309
1310 struct qreg rf32 = vir_FMOV(c, rf16);
1311 vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L);
1312
1313 struct qreg f32_abs = vir_FMOV(c, f32);
1314 vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1315
1316 struct qreg rf32_abs = vir_FMOV(c, rf32);
1317 vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1318
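/* If the rounded value has a larger magnitude than the original, RTE
 * rounded away from zero, so step the f16 bit pattern one ulp back
 * towards zero to get the RTZ result.
 */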
1319 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs),
1320 V3D_QPU_PF_PUSHN);
1321 return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
1322 vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16));
1323 }
1324
1325 /**
1326 * Takes the result value of a signed integer width conversion from a smaller
1327 * type to a larger type and if needed, it applies sign extension to it.
1328 */
1329 static struct qreg
1330 sign_extend(struct v3d_compile *c,
1331 struct qreg value,
1332 uint32_t src_bit_size,
1333 uint32_t dst_bit_size)
1334 {
1335 assert(src_bit_size < dst_bit_size);
1336
1337 struct qreg tmp = vir_MOV(c, value);
1338
1339 /* Do we need to sign-extend? */
1340 uint32_t sign_mask = 1 << (src_bit_size - 1);
1341 struct qinst *sign_check =
1342 vir_AND_dest(c, vir_nop_reg(),
1343 tmp, vir_uniform_ui(c, sign_mask));
1344 vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);
1345
1346 /* If so, fill in leading sign bits */
1347 uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
1348 ((1ull << dst_bit_size) - 1);
1349 struct qinst *extend_inst =
1350 vir_OR_dest(c, tmp, tmp,
1351 vir_uniform_ui(c, extend_bits));
1352 vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);
1353
1354 return tmp;
1355 }
1356
1357 static void
1358 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1359 {
1360 /* Vectors are special in that they have non-scalarized writemasks,
1361 * and just take the first swizzle channel for each argument in order
1362 * into each writemask channel.
1363 */
1364 if (instr->op == nir_op_vec2 ||
1365 instr->op == nir_op_vec3 ||
1366 instr->op == nir_op_vec4) {
1367 struct qreg srcs[4];
1368 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1369 srcs[i] = ntq_get_src(c, instr->src[i].src,
1370 instr->src[i].swizzle[0]);
1371 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1372 ntq_store_def(c, &instr->def, i,
1373 vir_MOV(c, srcs[i]));
1374 return;
1375 }
1376
1377 /* General case: We can just grab the one used channel per src. */
1378 struct qreg src[nir_op_infos[instr->op].num_inputs];
1379 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1380 src[i] = ntq_get_alu_src(c, instr, i);
1381 }
1382
1383 struct qreg result;
1384
1385 switch (instr->op) {
1386 case nir_op_mov:
1387 result = vir_MOV(c, src[0]);
1388 break;
1389
1390 case nir_op_fneg:
1391 result = vir_XOR(c, src[0], vir_uniform_ui(c, UINT32_C(1) << 31));
1392 break;
1393 case nir_op_ineg:
1394 result = vir_NEG(c, src[0]);
1395 break;
1396
1397 case nir_op_fmul:
1398 result = vir_FMUL(c, src[0], src[1]);
1399 break;
1400 case nir_op_fadd:
1401 result = vir_FADD(c, src[0], src[1]);
1402 break;
1403 case nir_op_fsub:
1404 result = vir_FSUB(c, src[0], src[1]);
1405 break;
1406 case nir_op_fmin:
1407 result = vir_FMIN(c, src[0], src[1]);
1408 break;
1409 case nir_op_fmax:
1410 result = vir_FMAX(c, src[0], src[1]);
1411 break;
1412
1413 case nir_op_f2i32: {
1414 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1415 if (src0_alu && src0_alu->op == nir_op_fround_even) {
1416 result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1417 } else {
1418 result = vir_FTOIZ(c, src[0]);
1419 }
1420 if (nir_src_bit_size(instr->src[0].src) == 16)
1421 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1422 break;
1423 }
1424
1425 case nir_op_f2u32:
1426 result = vir_FTOUZ(c, src[0]);
1427 if (nir_src_bit_size(instr->src[0].src) == 16)
1428 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1429 break;
1430 case nir_op_i2f32: {
1431 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1432 assert(bit_size <= 32);
1433 result = src[0];
1434 if (bit_size < 32) {
1435 uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1436 result = vir_AND(c, result, vir_uniform_ui(c, mask));
1437 result = sign_extend(c, result, bit_size, 32);
1438 }
1439 result = vir_ITOF(c, result);
1440 break;
1441 }
1442 case nir_op_u2f32: {
1443 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1444 assert(bit_size <= 32);
1445 result = src[0];
1446 if (bit_size < 32) {
1447 uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1448 result = vir_AND(c, result, vir_uniform_ui(c, mask));
1449 }
1450 result = vir_UTOF(c, result);
1451 break;
1452 }
1453 case nir_op_b2f16:
1454 result = vir_AND(c, src[0], vir_uniform_ui(c, 0x3c00));
1455 break;
1456 case nir_op_b2f32:
1457 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1458 break;
1459 case nir_op_b2i8:
1460 case nir_op_b2i16:
1461 case nir_op_b2i32:
1462 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1463 break;
1464
1465 case nir_op_i2f16: {
1466 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1467 assert(bit_size <= 32);
1468 if (bit_size < 32) {
1469 uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1470 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1471 result = sign_extend(c, result, bit_size, 32);
1472 }
1473 result = vir_ITOF(c, result);
1474 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1475 break;
1476 }
1477
1478 case nir_op_u2f16: {
1479 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1480 assert(bit_size <= 32);
1481 if (bit_size < 32) {
1482 uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1483 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1484 }
1485 result = vir_UTOF(c, result);
1486 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1487 break;
1488 }
1489
1490 case nir_op_f2f16:
1491 case nir_op_f2f16_rtne:
1492 assert(nir_src_bit_size(instr->src[0].src) == 32);
1493 result = vir_FMOV(c, src[0]);
1494 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1495 break;
1496
1497 case nir_op_f2f16_rtz:
1498 assert(nir_src_bit_size(instr->src[0].src) == 32);
1499 result = f2f16_rtz(c, src[0]);
1500 break;
1501
1502 case nir_op_f2f32:
1503 assert(nir_src_bit_size(instr->src[0].src) == 16);
1504 result = vir_FMOV(c, src[0]);
1505 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1506 break;
1507
1508 case nir_op_i2i16: {
1509 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1510 assert(bit_size == 32 || bit_size == 8);
1511 if (bit_size == 32) {
1512 /* We don't have integer pack/unpack methods for
1513 * converting between 16-bit and 32-bit, so we implement
1514 * the conversion manually by truncating the src.
1515 */
1516 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1517 } else {
1518 struct qreg tmp = vir_AND(c, src[0],
1519 vir_uniform_ui(c, 0xff));
1520 result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
1521 }
1522 break;
1523 }
1524
1525 case nir_op_u2u16: {
1526 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1527 assert(bit_size == 32 || bit_size == 8);
1528
1529 /* We don't have integer pack/unpack methods for converting
1530 * between 16-bit and 32-bit, so we implement the conversion
1531 * manually by truncating the src. For the 8-bit case, we
1532 * want to make sure we don't copy garbage from any of the
1533 * 24 most significant bits.
1534 */
1535 if (bit_size == 32)
1536 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1537 else
1538 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1539 break;
1540 }
1541
1542 case nir_op_i2i8:
1543 case nir_op_u2u8:
1544 assert(nir_src_bit_size(instr->src[0].src) == 32 ||
1545 nir_src_bit_size(instr->src[0].src) == 16);
1546 /* We don't have integer pack/unpack methods for converting
1547 * between 8-bit and 32-bit, so we implement the conversion
1548 * manually by truncating the src.
1549 */
1550 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1551 break;
1552
1553 case nir_op_u2u32: {
1554 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1555 assert(bit_size == 16 || bit_size == 8);
1556
1557 /* We don't have a native 8-bit/16-bit MOV, so we copy all 32 bits
1558 * from the src but make sure to clear any garbage bits that
1559 * may be present in the invalid src bits.
1560 */
1561 uint32_t mask = (1 << bit_size) - 1;
1562 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1563 break;
1564 }
1565
1566 case nir_op_i2i32: {
1567 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1568 assert(bit_size == 16 || bit_size == 8);
1569
1570 uint32_t mask = (1 << bit_size) - 1;
1571 struct qreg tmp = vir_AND(c, src[0],
1572 vir_uniform_ui(c, mask));
1573
1574 result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
1575 break;
1576 }
1577
1578 case nir_op_iadd:
1579 result = vir_ADD(c, src[0], src[1]);
1580 break;
1581 case nir_op_ushr:
1582 result = vir_SHR(c, src[0], src[1]);
1583 break;
1584 case nir_op_isub:
1585 result = vir_SUB(c, src[0], src[1]);
1586 break;
1587 case nir_op_ishr:
1588 result = vir_ASR(c, src[0], src[1]);
1589 break;
1590 case nir_op_ishl:
1591 result = vir_SHL(c, src[0], src[1]);
1592 break;
1593 case nir_op_imin:
1594 result = vir_MIN(c, src[0], src[1]);
1595 break;
1596 case nir_op_umin:
1597 result = vir_UMIN(c, src[0], src[1]);
1598 break;
1599 case nir_op_imax:
1600 result = vir_MAX(c, src[0], src[1]);
1601 break;
1602 case nir_op_umax:
1603 result = vir_UMAX(c, src[0], src[1]);
1604 break;
1605 case nir_op_iand:
1606 result = vir_AND(c, src[0], src[1]);
1607 break;
1608 case nir_op_ior:
1609 result = vir_OR(c, src[0], src[1]);
1610 break;
1611 case nir_op_ixor:
1612 result = vir_XOR(c, src[0], src[1]);
1613 break;
1614 case nir_op_inot:
1615 result = vir_NOT(c, src[0]);
1616 break;
1617
1618 case nir_op_uclz:
1619 result = vir_CLZ(c, src[0]);
1620 break;
1621
1622 case nir_op_imul:
1623 result = vir_UMUL(c, src[0], src[1]);
1624 break;
1625
1626 case nir_op_seq:
1627 case nir_op_sne:
1628 case nir_op_sge:
1629 case nir_op_slt: {
1630 enum v3d_qpu_cond cond;
1631 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1632 assert(ok);
1633 result = vir_MOV(c, vir_SEL(c, cond,
1634 vir_uniform_f(c, 1.0),
1635 vir_uniform_f(c, 0.0)));
1636 c->flags_temp = result.index;
1637 c->flags_cond = cond;
1638 break;
1639 }
1640
1641 case nir_op_feq32:
1642 case nir_op_fneu32:
1643 case nir_op_fge32:
1644 case nir_op_flt32:
1645 case nir_op_ieq32:
1646 case nir_op_ine32:
1647 case nir_op_ige32:
1648 case nir_op_uge32:
1649 case nir_op_ilt32:
1650 case nir_op_ult32: {
1651 enum v3d_qpu_cond cond;
1652 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1653 assert(ok);
1654 result = ntq_emit_cond_to_bool(c, cond);
1655 break;
1656 }
1657
1658 case nir_op_b32csel:
1659 result = vir_MOV(c,
1660 vir_SEL(c,
1661 ntq_emit_bool_to_cond(c, instr->src[0].src),
1662 src[1], src[2]));
1663 break;
1664
1665 case nir_op_fcsel:
1666 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1667 V3D_QPU_PF_PUSHZ);
1668 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1669 src[1], src[2]));
1670 break;
1671
1672 case nir_op_frcp:
1673 result = vir_RECIP(c, src[0]);
1674 break;
1675 case nir_op_frsq:
1676 result = vir_RSQRT(c, src[0]);
1677 break;
1678 case nir_op_fexp2:
1679 result = vir_EXP(c, src[0]);
1680 break;
1681 case nir_op_flog2:
1682 result = vir_LOG(c, src[0]);
1683 break;
1684
1685 case nir_op_fceil:
1686 result = vir_FCEIL(c, src[0]);
1687 break;
1688 case nir_op_ffloor:
1689 result = vir_FFLOOR(c, src[0]);
1690 break;
1691 case nir_op_fround_even:
1692 result = vir_FROUND(c, src[0]);
1693 break;
1694 case nir_op_ftrunc:
1695 result = vir_FTRUNC(c, src[0]);
1696 break;
1697
1698 case nir_op_fsin:
1699 result = ntq_fsincos(c, src[0], false);
1700 break;
1701 case nir_op_fcos:
1702 result = ntq_fsincos(c, src[0], true);
1703 break;
1704
1705 case nir_op_fsign:
1706 result = ntq_fsign(c, src[0]);
1707 break;
1708
1709 case nir_op_fabs: {
1710 result = vir_FMOV(c, src[0]);
1711 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1712 break;
1713 }
1714
1715 case nir_op_iabs:
1716 result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1717 break;
1718
1719 case nir_op_uadd_carry:
1720 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1721 V3D_QPU_PF_PUSHC);
1722 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1723 break;
1724
1725 case nir_op_usub_borrow:
1726 vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]),
1727 V3D_QPU_PF_PUSHC);
1728 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1729 break;
1730
1731 case nir_op_pack_half_2x16_split:
1732 result = vir_VFPACK(c, src[0], src[1]);
1733 break;
1734
1735 case nir_op_pack_2x32_to_2x16_v3d:
1736 result = vir_VPACK(c, src[0], src[1]);
1737 break;
1738
1739 case nir_op_pack_32_to_r11g11b10_v3d:
1740 result = vir_V11FPACK(c, src[0], src[1]);
1741 break;
1742
1743 case nir_op_pack_uint_32_to_r10g10b10a2_v3d:
1744 result = vir_V10PACK(c, src[0], src[1]);
1745 break;
1746
1747 case nir_op_pack_4x16_to_4x8_v3d:
1748 result = vir_V8PACK(c, src[0], src[1]);
1749 break;
1750
1751 case nir_op_unpack_half_2x16_split_x:
1752 result = vir_FMOV(c, src[0]);
1753 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1754 break;
1755
1756 case nir_op_unpack_half_2x16_split_y:
1757 result = vir_FMOV(c, src[0]);
1758 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1759 break;
1760
1761 case nir_op_pack_2x16_to_unorm_2x8_v3d:
1762 result = vir_VFTOUNORM8(c, src[0]);
1763 break;
1764
1765 case nir_op_pack_2x16_to_snorm_2x8_v3d:
1766 result = vir_VFTOSNORM8(c, src[0]);
1767 break;
1768
1769 case nir_op_pack_2x16_to_unorm_2x10_v3d:
1770 result = vir_VFTOUNORM10LO(c, src[0]);
1771 break;
1772
1773 case nir_op_pack_2x16_to_unorm_10_2_v3d:
1774 result = vir_VFTOUNORM10HI(c, src[0]);
1775 break;
1776
1777 case nir_op_f2unorm_16_v3d:
1778 result = vir_FTOUNORM16(c, src[0]);
1779 break;
1780
1781 case nir_op_f2snorm_16_v3d:
1782 result = vir_FTOSNORM16(c, src[0]);
1783 break;
1784
1785 case nir_op_fsat:
1786 assert(c->devinfo->ver >= 71);
1787 result = vir_FMOV(c, src[0]);
1788 vir_set_unpack(c->defs[result.index], 0, V3D71_QPU_UNPACK_SAT);
1789 break;
1790
1791 default:
1792 fprintf(stderr, "unknown NIR ALU inst: ");
1793 nir_print_instr(&instr->instr, stderr);
1794 fprintf(stderr, "\n");
1795 abort();
1796 }
1797
1798 ntq_store_def(c, &instr->def, 0, result);
1799 }
1800
1801 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1802 * specifier. They come from a register that's preloaded with 0xffffffff
1803 * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1804 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1805 */
1806 #define TLB_TYPE_F16_COLOR (3 << 6)
1807 #define TLB_TYPE_I32_COLOR (1 << 6)
1808 #define TLB_TYPE_F32_COLOR (0 << 6)
1809 #define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
1810 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1811 #define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
1812 #define TLB_F16_SWAP_HI_LO (1 << 1)
1813 #define TLB_VEC_SIZE_4_F16 (1 << 0)
1814 #define TLB_VEC_SIZE_2_F16 (0 << 0)
1815 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
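/* Worked example of the packing above (added for clarity): a per-pixel,
 * 4-component f16 write to RT 0 is
 *   TLB_TYPE_F16_COLOR (0xc0) | (7 - 0) << TLB_RENDER_TARGET_SHIFT (0x38) |
 *   TLB_SAMPLE_MODE_PER_PIXEL (0x04) | TLB_F16_SWAP_HI_LO (0x02) |
 *   TLB_VEC_SIZE_4_F16 (0x01) = 0xff,
 * which matches the "0xff gets you normal vec4 f16 RT0 writes" default noted
 * in the comment above.
 */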
1816
1817 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1818 * flag is set.
1819 */
1820 #define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
1821 #define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
1822 #define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
1823 #define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */
1824 #define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */
1825
1826 /* Stencil is a single 32-bit write. */
1827 #define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
1828
1829 static void
1830 vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1831 {
1832 if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1833 return;
1834
1835 struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1836 struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1837
1838 nir_variable *var = c->output_color_var[rt];
1839 int num_components = glsl_get_vector_elements(var->type);
1840 uint32_t conf = 0xffffff00;
1841 struct qinst *inst;
1842
1843 conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1844 TLB_SAMPLE_MODE_PER_PIXEL;
1845 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1846
1847 if (c->fs_key->swap_color_rb & (1 << rt))
1848 num_components = MAX2(num_components, 3);
1849 assert(num_components != 0);
1850
1851 enum glsl_base_type type = glsl_get_base_type(var->type);
1852 bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1853 bool is_32b_tlb_format = is_int_format ||
1854 (c->fs_key->f32_color_rb & (1 << rt));
1855
1856 if (is_int_format) {
1857 /* The F32 vs I32 distinction was dropped in 4.2. */
1858 if (c->devinfo->ver < 42)
1859 conf |= TLB_TYPE_I32_COLOR;
1860 else
1861 conf |= TLB_TYPE_F32_COLOR;
1862 conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1863 } else {
1864 if (c->fs_key->f32_color_rb & (1 << rt)) {
1865 conf |= TLB_TYPE_F32_COLOR;
1866 conf |= ((num_components - 1) <<
1867 TLB_VEC_SIZE_MINUS_1_SHIFT);
1868 } else {
1869 conf |= TLB_TYPE_F16_COLOR;
1870 conf |= TLB_F16_SWAP_HI_LO;
1871 if (num_components >= 3)
1872 conf |= TLB_VEC_SIZE_4_F16;
1873 else
1874 conf |= TLB_VEC_SIZE_2_F16;
1875 }
1876 }
1877
1878 int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1879 for (int i = 0; i < num_samples; i++) {
1880 struct qreg *color = c->msaa_per_sample_output ?
1881 &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1882 &c->outputs[var->data.driver_location * 4];
1883
1884 struct qreg r = color[0];
1885 struct qreg g = color[1];
1886 struct qreg b = color[2];
1887 struct qreg a = color[3];
1888
1889 if (c->fs_key->swap_color_rb & (1 << rt)) {
1890 r = color[2];
1891 b = color[0];
1892 }
1893
1894 if (c->fs_key->sample_alpha_to_one)
1895 a = vir_uniform_f(c, 1.0);
1896
1897 if (is_32b_tlb_format) {
1898 if (i == 0) {
1899 inst = vir_MOV_dest(c, tlbu_reg, r);
1900 inst->uniform =
1901 vir_get_uniform_index(c,
1902 QUNIFORM_CONSTANT,
1903 conf);
1904 } else {
1905 vir_MOV_dest(c, tlb_reg, r);
1906 }
1907
1908 if (num_components >= 2)
1909 vir_MOV_dest(c, tlb_reg, g);
1910 if (num_components >= 3)
1911 vir_MOV_dest(c, tlb_reg, b);
1912 if (num_components >= 4)
1913 vir_MOV_dest(c, tlb_reg, a);
1914 } else {
1915 inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1916 if (conf != ~0 && i == 0) {
1917 inst->dst = tlbu_reg;
1918 inst->uniform =
1919 vir_get_uniform_index(c,
1920 QUNIFORM_CONSTANT,
1921 conf);
1922 }
1923
1924 if (num_components >= 3)
1925 vir_VFPACK_dest(c, tlb_reg, b, a);
1926 }
1927 }
1928 }
1929
1930 static void
1931 emit_frag_end(struct v3d_compile *c)
1932 {
1933 if (c->output_sample_mask_index != -1) {
1934 vir_SETMSF_dest(c, vir_nop_reg(),
1935 vir_AND(c,
1936 vir_MSF(c),
1937 c->outputs[c->output_sample_mask_index]));
1938 }
1939
1940 bool has_any_tlb_color_write = false;
1941 for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1942 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1943 has_any_tlb_color_write = true;
1944 }
1945
1946 if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1947 struct nir_variable *var = c->output_color_var[0];
1948 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1949
1950 vir_SETMSF_dest(c, vir_nop_reg(),
1951 vir_AND(c,
1952 vir_MSF(c),
1953 vir_FTOC(c, color[3])));
1954 }
1955
1956 struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1957
1958 /* If the shader has no non-TLB side effects and doesn't write Z
1959 * we can promote it to enabling early_fragment_tests even
1960 * if the user didn't.
1961 */
1962 if (c->output_position_index == -1 &&
1963 !(c->s->info.num_images || c->s->info.num_ssbos) &&
1964 !c->s->info.fs.uses_discard &&
1965 !c->fs_key->sample_alpha_to_coverage &&
1966 c->output_sample_mask_index == -1 &&
1967 has_any_tlb_color_write) {
1968 c->s->info.fs.early_fragment_tests = true;
1969 }
1970
1971 /* By default, Z buffer writes are implicit using the Z values produced
1972 * from FEP (Z value produced from rasterization). When this is not
1973 * desirable (shader writes Z explicitly, has discards, etc) we need
1974 * to let the hardware know by setting c->writes_z to true, in which
1975 * case we always need to write a Z value from the QPU, even if it is
1976 * just the passthrough Z value produced from FEP.
1977 *
1978 * Also, from the V3D 4.2 spec:
1979 *
1980 * "If a shader performs a Z read the “Fragment shader does Z writes”
1981 * bit in the shader record must be enabled to ensure deterministic
1982 * results"
1983 *
1984 * So if c->reads_z is set we always need to write Z, even if it is
1985 * a passthrough from the Z value produced from FEP.
1986 */
1987 if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
1988 c->writes_z = true;
1989 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1990 struct qinst *inst;
1991
1992 if (c->output_position_index != -1) {
1993 /* Shader writes to gl_FragDepth, use that */
1994 inst = vir_MOV_dest(c, tlbu_reg,
1995 c->outputs[c->output_position_index]);
1996
1997 tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1998 TLB_SAMPLE_MODE_PER_PIXEL);
1999 } else {
2000 /* Shader doesn't write to gl_FragDepth, take Z from
2001 * FEP.
2002 */
2003 c->writes_z_from_fep = true;
2004 inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());
2005
2006 /* The spec says the PER_PIXEL flag is ignored for
2007 * invariant writes, but the simulator demands it.
2008 */
2009 tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
2010 TLB_SAMPLE_MODE_PER_PIXEL);
2011
2012 /* Since (single-threaded) fragment shaders always need
2013 * a TLB write, if we don't have any we emit a
2014 * passthrough Z and flag us as potentially discarding,
2015 * so that we can use Z as the required TLB write.
2016 */
2017 if (!has_any_tlb_color_write)
2018 c->s->info.fs.uses_discard = true;
2019 }
2020
2021 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
2022 tlb_specifier |
2023 0xffffff00);
2024 inst->is_tlb_z_write = true;
2025 }
2026
2027 /* XXX: Performance improvement: Merge Z write and color writes TLB
2028 * uniform setup
2029 */
2030 for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
2031 vir_emit_tlb_color_write(c, rt);
2032 }
2033
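/* Writes "val" to the VPM at "vpm_index". When the index is known to be
 * dynamically uniform we can use STVPMV; otherwise STVPMD is used so each
 * lane can use its own index (see the scatter-write comment in
 * emit_store_output_gs()).
 */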
2034 static inline void
2035 vir_VPM_WRITE_indirect(struct v3d_compile *c,
2036 struct qreg val,
2037 struct qreg vpm_index,
2038 bool uniform_vpm_index)
2039 {
2040 if (uniform_vpm_index)
2041 vir_STVPMV(c, vpm_index, val);
2042 else
2043 vir_STVPMD(c, vpm_index, val);
2044 }
2045
2046 static void
2047 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
2048 {
2049 vir_VPM_WRITE_indirect(c, val,
2050 vir_uniform_ui(c, vpm_index), true);
2051 }
2052
2053 static void
2054 emit_vert_end(struct v3d_compile *c)
2055 {
2056 /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2057 */
2058 if (c->devinfo->ver == 42)
2059 vir_VPMWT(c);
2060 }
2061
2062 static void
2063 emit_geom_end(struct v3d_compile *c)
2064 {
2065 /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2066 */
2067 if (c->devinfo->ver == 42)
2068 vir_VPMWT(c);
2069 }
2070
2071 static bool
2072 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
2073 unsigned bit_size,
2074 unsigned num_components,
2075 nir_intrinsic_instr *low,
2076 nir_intrinsic_instr *high,
2077 void *data)
2078 {
2079 /* TMU general access only supports 32-bit vectors */
2080 if (bit_size > 32)
2081 return false;
2082
2083 if ((bit_size == 8 || bit_size == 16) && num_components > 1)
2084 return false;
2085
2086 if (align_mul % 4 != 0 || align_offset % 4 != 0)
2087 return false;
2088
2089 /* Vector accesses wrap at 16-byte boundaries so we can't vectorize
2090 * if the resulting vector crosses a 16-byte boundary.
2091 */
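        /* For example, align_mul = 16, align_offset = 8 and
         * num_components = 3 gives 16 - 16 + 8 + 12 = 20 > 16 in the check
         * below, so the merge is rejected because the 12-byte access could
         * start at byte 8 of a 16-byte window.
         */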
2092 assert(util_is_power_of_two_nonzero(align_mul));
2093 align_mul = MIN2(align_mul, 16);
2094 align_offset &= 0xf;
2095 if (16 - align_mul + align_offset + num_components * 4 > 16)
2096 return false;
2097
2098 return true;
2099 }
2100
2101 void
2102 v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
2103 {
2104 bool progress;
2105 unsigned lower_flrp =
2106 (s->options->lower_flrp16 ? 16 : 0) |
2107 (s->options->lower_flrp32 ? 32 : 0) |
2108 (s->options->lower_flrp64 ? 64 : 0);
2109
2110 do {
2111 progress = false;
2112
2113 NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp);
2114 NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp);
2115 NIR_PASS(progress, s, nir_opt_deref);
2116
2117 NIR_PASS(progress, s, nir_lower_vars_to_ssa);
2118 if (!s->info.var_copies_lowered) {
2119 /* Only run this pass if nir_lower_var_copies was not called
2120 * yet. That would lower away any copy_deref instructions and we
2121 * don't want to introduce any more.
2122 */
2123 NIR_PASS(progress, s, nir_opt_find_array_copies);
2124 }
2125
2126 NIR_PASS(progress, s, nir_opt_copy_prop_vars);
2127 NIR_PASS(progress, s, nir_opt_dead_write_vars);
2128 NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all);
2129
2130 NIR_PASS(progress, s, nir_remove_dead_variables,
2131 (nir_variable_mode)(nir_var_function_temp |
2132 nir_var_shader_temp |
2133 nir_var_mem_shared),
2134 NULL);
2135
2136 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2137 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
2138 NIR_PASS(progress, s, nir_copy_prop);
2139 NIR_PASS(progress, s, nir_opt_remove_phis);
2140 NIR_PASS(progress, s, nir_opt_dce);
2141 NIR_PASS(progress, s, nir_opt_dead_cf);
2142 NIR_PASS(progress, s, nir_opt_cse);
2143 /* before peephole_select as it can generate 64 bit bcsels */
2144 NIR_PASS(progress, s, nir_lower_64bit_phis);
2145 NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false);
2146 NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true);
2147 NIR_PASS(progress, s, nir_opt_algebraic);
2148 NIR_PASS(progress, s, nir_opt_constant_folding);
2149
2150 NIR_PASS(progress, s, nir_opt_intrinsics);
2151 NIR_PASS(progress, s, nir_opt_idiv_const, 32);
2152 NIR_PASS(progress, s, nir_lower_alu);
2153
2154 if (nir_opt_loop(s)) {
2155 progress = true;
2156 NIR_PASS(progress, s, nir_copy_prop);
2157 NIR_PASS(progress, s, nir_opt_dce);
2158 }
2159
2160 NIR_PASS(progress, s, nir_opt_conditional_discard);
2161
2162 NIR_PASS(progress, s, nir_opt_remove_phis);
2163 NIR_PASS(progress, s, nir_opt_if, false);
2164 if (c && !c->disable_gcm) {
2165 bool local_progress = false;
2166 NIR_PASS(local_progress, s, nir_opt_gcm, false);
2167 c->gcm_progress |= local_progress;
2168 progress |= local_progress;
2169 }
2170
2171 /* Note that vectorization may undo the load/store scalarization
2172 * pass we run for non 32-bit TMU general load/store by
2173 * converting, for example, 2 consecutive 16-bit loads into a
2174 * single 32-bit load. This is fine (and desirable) as long as
2175 * the resulting 32-bit load meets 32-bit alignment requirements,
2176 * which mem_vectorize_callback() should be enforcing.
2177 */
2178 nir_load_store_vectorize_options vectorize_opts = {
2179 .modes = nir_var_mem_ssbo | nir_var_mem_ubo |
2180 nir_var_mem_push_const | nir_var_mem_shared |
2181 nir_var_mem_global,
2182 .callback = mem_vectorize_callback,
2183 .robust_modes = 0,
2184 };
2185 bool vectorize_progress = false;
2186
2187
2188 /* This requires that we have called
2189 * nir_lower_vars_to_explicit_types / nir_lower_explicit_io
2190 * first, which we may not have done yet if we call here too
2191 * early during NIR pre-processing. We can detect this because
2192 * in that case we won't have a compile object.
2193 */
2194 if (c) {
2195 NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize,
2196 &vectorize_opts);
2197 if (vectorize_progress) {
2198 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2199 NIR_PASS(progress, s, nir_lower_pack);
2200 progress = true;
2201 }
2202 }
2203
2204 if (lower_flrp != 0) {
2205 bool lower_flrp_progress = false;
2206
2207 NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
2208 lower_flrp,
2209 false /* always_precise */);
2210 if (lower_flrp_progress) {
2211 NIR_PASS(progress, s, nir_opt_constant_folding);
2212 progress = true;
2213 }
2214
2215 /* Nothing should rematerialize any flrps, so we only
2216 * need to do this lowering once.
2217 */
2218 lower_flrp = 0;
2219 }
2220
2221 NIR_PASS(progress, s, nir_opt_undef);
2222 NIR_PASS(progress, s, nir_lower_undef_to_zero);
2223
2224 if (c && !c->disable_loop_unrolling &&
2225 s->options->max_unroll_iterations > 0) {
2226 bool local_progress = false;
2227 NIR_PASS(local_progress, s, nir_opt_loop_unroll);
2228 c->unrolled_any_loops |= local_progress;
2229 progress |= local_progress;
2230 }
2231 } while (progress);
2232
2233 /* needs to be outside of optimization loop, otherwise it fights with
2234 * opt_algebraic optimizing the conversion lowering
2235 */
2236 NIR_PASS(progress, s, v3d_nir_lower_algebraic);
2237 NIR_PASS(progress, s, nir_opt_cse);
2238
2239 nir_move_options sink_opts =
2240 nir_move_const_undef | nir_move_comparisons | nir_move_copies |
2241 nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform;
2242 NIR_PASS(progress, s, nir_opt_sink, sink_opts);
2243 }
2244
2245 static int
2246 driver_location_compare(const nir_variable *a, const nir_variable *b)
2247 {
2248 return a->data.driver_location == b->data.driver_location ?
2249 a->data.location_frac - b->data.location_frac :
2250 a->data.driver_location - b->data.driver_location;
2251 }
2252
2253 static struct qreg
2254 ntq_emit_vpm_read(struct v3d_compile *c, uint32_t num_components)
2255 {
2256 return vir_LDVPMV_IN(c,
2257 vir_uniform_ui(c, num_components));
2258 }
2259
2260 static void
2261 ntq_setup_vs_inputs(struct v3d_compile *c)
2262 {
2263 /* Figure out how many components of each vertex attribute the shader
2264 * uses. Each variable should have been split to individual
2265 * components and unused ones DCEed. The vertex fetcher will load
2266 * from the start of the attribute to the number of components we
2267 * declare we need in c->vattr_sizes[].
2268 *
2269 * BGRA vertex attributes are a bit special: since we implement these
2270 * as RGBA swapping R/B components we always need at least 3 components
2271 * if component 0 is read.
2272 */
2273 nir_foreach_shader_in_variable(var, c->s) {
2274 /* No VS attribute array support. */
2275 assert(MAX2(glsl_get_length(var->type), 1) == 1);
2276
2277 unsigned loc = var->data.driver_location;
2278 int start_component = var->data.location_frac;
2279 int num_components = glsl_get_components(var->type);
2280
2281 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
2282 start_component + num_components);
2283
2284 /* Handle BGRA inputs */
2285 if (start_component == 0 &&
2286 c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
2287 c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
2288 }
2289 }
2290
2291 uint32_t vpm_components = 0;
2292 bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
2293 SYSTEM_VALUE_INSTANCE_ID) ||
2294 BITSET_TEST(c->s->info.system_values_read,
2295 SYSTEM_VALUE_INSTANCE_INDEX);
2296 bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
2297 SYSTEM_VALUE_BASE_INSTANCE);
2298 bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
2299 SYSTEM_VALUE_VERTEX_ID) ||
2300 BITSET_TEST(c->s->info.system_values_read,
2301 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
2302
2303 if (uses_iid)
2304 c->iid = ntq_emit_vpm_read(c, vpm_components++);
2305
2306 if (uses_biid)
2307 c->biid = ntq_emit_vpm_read(c, vpm_components++);
2308
2309 if (uses_vid)
2310 c->vid = ntq_emit_vpm_read(c, vpm_components++);
2311
2312 /* The actual loads will happen directly in nir_intrinsic_load_input
2313 */
2314 return;
2315 }
2316
2317 static bool
2318 program_reads_point_coord(struct v3d_compile *c)
2319 {
2320 nir_foreach_shader_in_variable(var, c->s) {
2321 if (util_varying_is_point_coord(var->data.location,
2322 c->fs_key->point_sprite_mask)) {
2323 return true;
2324 }
2325 }
2326
2327 return false;
2328 }
2329
2330 static void
2331 ntq_setup_gs_inputs(struct v3d_compile *c)
2332 {
2333 nir_sort_variables_with_modes(c->s, driver_location_compare,
2334 nir_var_shader_in);
2335
2336 nir_foreach_shader_in_variable(var, c->s) {
2337 /* All GS inputs are arrays with as many entries as vertices
2338 * in the input primitive, but here we only care about the
2339 * per-vertex input type.
2340 */
2341 assert(glsl_type_is_array(var->type));
2342 const struct glsl_type *type = glsl_get_array_element(var->type);
2343 unsigned var_len = glsl_count_vec4_slots(type, false, false);
2344 unsigned loc = var->data.driver_location;
2345
2346 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2347 (loc + var_len) * 4);
2348
2349 if (var->data.compact) {
2350 for (unsigned j = 0; j < var_len; j++) {
2351 unsigned input_idx = c->num_inputs++;
2352 unsigned loc_frac = var->data.location_frac + j;
2353 unsigned loc = var->data.location + loc_frac / 4;
2354 unsigned comp = loc_frac % 4;
2355 c->input_slots[input_idx] =
2356 v3d_slot_from_slot_and_component(loc, comp);
2357 }
2358 continue;
2359 }
2360
2361 for (unsigned j = 0; j < var_len; j++) {
2362 unsigned num_elements =
2363 glsl_type_is_struct(glsl_without_array(type)) ?
2364 4 : glsl_get_vector_elements(type);
2365 for (unsigned k = 0; k < num_elements; k++) {
2366 unsigned chan = var->data.location_frac + k;
2367 unsigned input_idx = c->num_inputs++;
2368 struct v3d_varying_slot slot =
2369 v3d_slot_from_slot_and_component(var->data.location + j, chan);
2370 c->input_slots[input_idx] = slot;
2371 }
2372 }
2373 }
2374 }
2375
2376
2377 static void
2378 ntq_setup_fs_inputs(struct v3d_compile *c)
2379 {
2380 nir_sort_variables_with_modes(c->s, driver_location_compare,
2381 nir_var_shader_in);
2382
2383 nir_foreach_shader_in_variable(var, c->s) {
2384 unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2385 unsigned loc = var->data.driver_location;
2386
2387 uint32_t inputs_array_size = c->inputs_array_size;
2388 uint32_t inputs_array_required_size = (loc + var_len) * 4;
2389 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2390 inputs_array_required_size);
2391 resize_interp_array(c, &c->interp, &inputs_array_size,
2392 inputs_array_required_size);
2393
2394 if (var->data.location == VARYING_SLOT_POS) {
2395 emit_fragcoord_input(c, loc);
2396 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2397 !c->fs_key->has_gs) {
2398 /* If the fragment shader reads gl_PrimitiveID and we
2399 * don't have a geometry shader in the pipeline to write
2400 * it then we program the hardware to inject it as
2401 * an implicit varying. Take it from there.
2402 */
2403 c->inputs[loc * 4] = c->primitive_id;
2404 } else if (util_varying_is_point_coord(var->data.location,
2405 c->fs_key->point_sprite_mask)) {
2406 c->inputs[loc * 4 + 0] = c->point_x;
2407 c->inputs[loc * 4 + 1] = c->point_y;
2408 } else if (var->data.compact) {
2409 for (int j = 0; j < var_len; j++)
2410 emit_compact_fragment_input(c, loc, var, j);
2411 } else if (glsl_type_is_struct(glsl_without_array(var->type))) {
2412 for (int j = 0; j < var_len; j++) {
2413 emit_fragment_input(c, loc, var, j, 4);
2414 }
2415 } else {
2416 for (int j = 0; j < var_len; j++) {
2417 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2418 }
2419 }
2420 }
2421 }
2422
2423 static void
2424 ntq_setup_outputs(struct v3d_compile *c)
2425 {
2426 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2427 return;
2428
2429 nir_foreach_shader_out_variable(var, c->s) {
2430 assert(glsl_type_is_vector_or_scalar(var->type));
2431 unsigned loc = var->data.driver_location * 4;
2432
2433 for (int i = 0; i < 4 - var->data.location_frac; i++) {
2434 add_output(c, loc + var->data.location_frac + i,
2435 var->data.location,
2436 var->data.location_frac + i);
2437 }
2438
2439 switch (var->data.location) {
2440 case FRAG_RESULT_COLOR:
2441 for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
2442 c->output_color_var[i] = var;
2443 break;
2444 case FRAG_RESULT_DATA0:
2445 case FRAG_RESULT_DATA1:
2446 case FRAG_RESULT_DATA2:
2447 case FRAG_RESULT_DATA3:
2448 case FRAG_RESULT_DATA4:
2449 case FRAG_RESULT_DATA5:
2450 case FRAG_RESULT_DATA6:
2451 case FRAG_RESULT_DATA7:
2452 c->output_color_var[var->data.location -
2453 FRAG_RESULT_DATA0] = var;
2454 break;
2455 case FRAG_RESULT_DEPTH:
2456 c->output_position_index = loc;
2457 break;
2458 case FRAG_RESULT_SAMPLE_MASK:
2459 c->output_sample_mask_index = loc;
2460 break;
2461 }
2462 }
2463 }
2464
2465 /**
2466 * Sets up the mapping from nir_register to struct qreg *.
2467 *
2468 * Each nir_register gets a struct qreg per 32-bit component being stored.
2469 */
2470 static void
2471 ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl)
2472 {
2473 nir_foreach_reg_decl(decl, impl) {
2474 unsigned num_components = nir_intrinsic_num_components(decl);
2475 unsigned array_len = nir_intrinsic_num_array_elems(decl);
2476 array_len = MAX2(array_len, 1);
2477 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2478 array_len * num_components);
2479
2480 nir_def *nir_reg = &decl->def;
2481 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2482
2483 for (int i = 0; i < array_len * num_components; i++)
2484 qregs[i] = vir_get_temp(c);
2485 }
2486 }
2487
2488 static void
2489 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2490 {
2491 /* XXX perf: Experiment with using immediate loads to avoid having
2492 * these end up in the uniform stream. Watch out for breaking the
2493 * small immediates optimization in the process!
2494 */
2495 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2496 for (int i = 0; i < instr->def.num_components; i++)
2497 qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2498
2499 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2500 }
2501
2502 static void
2503 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2504 {
2505 unsigned image_index = nir_src_as_uint(instr->src[0]);
2506 bool is_array = nir_intrinsic_image_array(instr);
2507
2508 assert(nir_src_as_uint(instr->src[1]) == 0);
2509
2510 ntq_store_def(c, &instr->def, 0,
2511 vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2512 if (instr->num_components > 1) {
2513 ntq_store_def(c, &instr->def, 1,
2514 vir_uniform(c,
2515 instr->num_components == 2 && is_array ?
2516 QUNIFORM_IMAGE_ARRAY_SIZE :
2517 QUNIFORM_IMAGE_HEIGHT,
2518 image_index));
2519 }
2520 if (instr->num_components > 2) {
2521 ntq_store_def(c, &instr->def, 2,
2522 vir_uniform(c,
2523 is_array ?
2524 QUNIFORM_IMAGE_ARRAY_SIZE :
2525 QUNIFORM_IMAGE_DEPTH,
2526 image_index));
2527 }
2528 }
2529
2530 static void
2531 vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2532 {
2533 assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2534
2535 int rt = nir_src_as_uint(instr->src[0]);
2536 assert(rt < V3D_MAX_DRAW_BUFFERS);
2537
2538 int sample_index = nir_intrinsic_base(instr);
2539 assert(sample_index < V3D_MAX_SAMPLES);
2540
2541 int component = nir_intrinsic_component(instr);
2542 assert(component < 4);
2543
2544 /* We need to emit our TLB reads after we have acquired the scoreboard
2545 * lock, or the GPU will hang. Usually, we do our scoreboard locking on
2546 * the last thread switch to improve parallelism, however, that is only
2547 * guaranteed to happen before the tlb color writes.
2548 *
2549 * To fix that, we make sure we always emit a thread switch before the
2550 * first tlb color read. If that happens to be the last thread switch
2551 * we emit, then everything is fine, but otherwise, if any code after
2552 * this point needs to emit additional thread switches, then we will
2553 * switch the strategy to locking the scoreboard on the first thread
2554 * switch instead -- see vir_emit_thrsw().
2555 */
2556 if (!c->emitted_tlb_load) {
2557 if (!c->last_thrsw_at_top_level)
2558 vir_emit_thrsw(c);
2559
2560 c->emitted_tlb_load = true;
2561 }
2562
2563 struct qreg *color_reads_for_sample =
2564 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2565
2566 if (color_reads_for_sample[component].file == QFILE_NULL) {
2567 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2568 int num_components =
2569 util_format_get_nr_components(rt_format);
2570
2571 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2572 if (swap_rb)
2573 num_components = MAX2(num_components, 3);
2574
2575 nir_variable *var = c->output_color_var[rt];
2576 enum glsl_base_type type = glsl_get_base_type(var->type);
2577
2578 bool is_int_format = type == GLSL_TYPE_INT ||
2579 type == GLSL_TYPE_UINT;
2580
2581 bool is_32b_tlb_format = is_int_format ||
2582 (c->fs_key->f32_color_rb & (1 << rt));
2583
2584 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2585
2586 uint32_t conf = 0xffffff00;
2587 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2588 TLB_SAMPLE_MODE_PER_PIXEL;
2589 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
2590
2591 if (is_32b_tlb_format) {
2592 /* The F32 vs I32 distinction was dropped in 4.2. */
2593 conf |= (c->devinfo->ver < 42 && is_int_format) ?
2594 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2595
2596 conf |= ((num_components - 1) <<
2597 TLB_VEC_SIZE_MINUS_1_SHIFT);
2598 } else {
2599 conf |= TLB_TYPE_F16_COLOR;
2600 conf |= TLB_F16_SWAP_HI_LO;
2601
2602 if (num_components >= 3)
2603 conf |= TLB_VEC_SIZE_4_F16;
2604 else
2605 conf |= TLB_VEC_SIZE_2_F16;
2606 }
2607
2608
2609 for (int i = 0; i < num_samples; i++) {
2610 struct qreg r, g, b, a;
2611 if (is_32b_tlb_format) {
2612 r = conf != 0xffffffff && i == 0 ?
2613 vir_TLBU_COLOR_READ(c, conf) :
2614 vir_TLB_COLOR_READ(c);
2615 if (num_components >= 2)
2616 g = vir_TLB_COLOR_READ(c);
2617 if (num_components >= 3)
2618 b = vir_TLB_COLOR_READ(c);
2619 if (num_components >= 4)
2620 a = vir_TLB_COLOR_READ(c);
2621 } else {
2622 struct qreg rg = conf != 0xffffffff && i == 0 ?
2623 vir_TLBU_COLOR_READ(c, conf) :
2624 vir_TLB_COLOR_READ(c);
2625 r = vir_FMOV(c, rg);
2626 vir_set_unpack(c->defs[r.index], 0,
2627 V3D_QPU_UNPACK_L);
2628 g = vir_FMOV(c, rg);
2629 vir_set_unpack(c->defs[g.index], 0,
2630 V3D_QPU_UNPACK_H);
2631
2632 if (num_components > 2) {
2633 struct qreg ba = vir_TLB_COLOR_READ(c);
2634 b = vir_FMOV(c, ba);
2635 vir_set_unpack(c->defs[b.index], 0,
2636 V3D_QPU_UNPACK_L);
2637 a = vir_FMOV(c, ba);
2638 vir_set_unpack(c->defs[a.index], 0,
2639 V3D_QPU_UNPACK_H);
2640 }
2641 }
2642
2643 struct qreg *color_reads =
2644 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2645
2646 color_reads[0] = swap_rb ? b : r;
2647 if (num_components >= 2)
2648 color_reads[1] = g;
2649 if (num_components >= 3)
2650 color_reads[2] = swap_rb ? r : b;
2651 if (num_components >= 4)
2652 color_reads[3] = a;
2653 }
2654 }
2655
2656 assert(color_reads_for_sample[component].file != QFILE_NULL);
2657 ntq_store_def(c, &instr->def, 0,
2658 vir_MOV(c, color_reads_for_sample[component]));
2659 }
2660
2661 static bool
2662 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr);
2663
2664 static bool
2665 try_emit_uniform(struct v3d_compile *c,
2666 int offset,
2667 int num_components,
2668 nir_def *def,
2669 enum quniform_contents contents)
2670 {
2671 /* Even though ldunif is strictly 32-bit we can still use it
2672 * to load scalar 8-bit/16-bit uniforms so long as their offset
2673 * is 32-bit aligned. In this case, ldunif would still load
2674 * 32-bit into the destination with the 8-bit/16-bit uniform
2675 * data in the LSB and garbage in the MSB, but that is fine
2676 * because we should only be accessing the valid bits of the
2677 * destination.
2678 *
2679 * FIXME: if in the future we improve our register allocator to
2680 * pack 2 16-bit variables in the MSB and LSB of the same
2681 * register then this optimization would not be valid as is,
2682 * since the load clobbers the MSB.
2683 */
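        /* Illustrative example: a 16-bit scalar uniform at byte offset 8
         * passes the check below and is fetched as dword 2, with the caller
         * consuming only the low 16 bits; byte offset 6 is rejected here and
         * handled by the general TMU path instead.
         */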
2684 if (offset % 4 != 0)
2685 return false;
2686
2687 /* We need dwords */
2688 offset = offset / 4;
2689
2690 for (int i = 0; i < num_components; i++) {
2691 ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i));
2692 }
2693
2694 return true;
2695 }
2696
2697 static void
2698 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2699 {
2700 /* We scalarize general TMU access for anything that is not 32-bit. */
2701 assert(instr->def.bit_size == 32 ||
2702 instr->num_components == 1);
2703
2704 /* Try to emit ldunif if possible, otherwise fallback to general TMU */
2705 if (nir_src_is_const(instr->src[0])) {
2706 int offset = (nir_intrinsic_base(instr) +
2707 nir_src_as_uint(instr->src[0]));
2708
2709 if (try_emit_uniform(c, offset, instr->num_components,
2710 &instr->def, QUNIFORM_UNIFORM)) {
2711 return;
2712 }
2713 }
2714
2715 if (!ntq_emit_load_unifa(c, instr)) {
2716 ntq_emit_tmu_general(c, instr, false, false);
2717 c->has_general_tmu_load = true;
2718 }
2719 }
2720
2721 static bool
2722 ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
2723 {
2724 if (c->compiler->max_inline_uniform_buffers <= 0)
2725 return false;
2726
2727 /* Regular UBOs start after inline UBOs */
2728 uint32_t index = nir_src_as_uint(instr->src[0]);
2729 if (index >= c->compiler->max_inline_uniform_buffers)
2730 return false;
2731
2732 /* We scalarize general TMU access for anything that is not 32-bit */
2733 assert(instr->def.bit_size == 32 ||
2734 instr->num_components == 1);
2735
2736 if (nir_src_is_const(instr->src[1])) {
2737 int offset = nir_src_as_uint(instr->src[1]);
2738 if (try_emit_uniform(c, offset, instr->num_components,
2739 &instr->def,
2740 QUNIFORM_INLINE_UBO_0 + index)) {
2741 return true;
2742 }
2743 }
2744
2745 /* Fallback to regular UBO load */
2746 return false;
2747 }
2748
2749 static void
2750 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2751 {
2752 /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2753 *
2754 * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even
2755 * if we don't support non-uniform offsets because we also set the
2756 * lower_all_io_to_temps option in the NIR compiler. This ensures that
2757 * any indirect indexing on in/out variables is turned into indirect
2758 * indexing on temporary variables instead, that we handle by lowering
2759 * to scratch. If we implement non-uniform offset here we might be able
2760 * to avoid the temp and scratch lowering, which involves copying from
2761 * the input to the temp variable, possibly making code more optimal.
2762 */
2763 unsigned offset =
2764 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2765
2766 if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2767 /* Emit the LDVPM directly now, rather than at the top
2768 * of the shader like we did for V3D 3.x (which needs
2769 * vpmsetup when not just taking the next offset).
2770 *
2771 * Note that delaying like this may introduce stalls,
2772 * as LDVPMV takes a minimum of 1 instruction but may
2773 * be slower if the VPM unit is busy with another QPU.
2774 */
2775 int index = 0;
2776 if (BITSET_TEST(c->s->info.system_values_read,
2777 SYSTEM_VALUE_INSTANCE_ID)) {
2778 index++;
2779 }
2780 if (BITSET_TEST(c->s->info.system_values_read,
2781 SYSTEM_VALUE_BASE_INSTANCE)) {
2782 index++;
2783 }
2784 if (BITSET_TEST(c->s->info.system_values_read,
2785 SYSTEM_VALUE_VERTEX_ID)) {
2786 index++;
2787 }
2788
2789 for (int i = 0; i < offset; i++) {
2790 /* GFXH-1602: if any builtins (vid, iid, etc) are read then
2791 * attribute 0 must be active (size > 0). When we hit this,
2792 * the driver is expected to program attribute 0 to have a
2793 * size of 1, so here we need to add that.
2794 */
2795 if (i == 0 && c->vs_key->is_coord &&
2796 c->vattr_sizes[i] == 0 && index > 0) {
2797 index++;
2798 } else {
2799 index += c->vattr_sizes[i];
2800 }
2801 }
2802
2803 index += nir_intrinsic_component(instr);
2804 for (int i = 0; i < instr->num_components; i++) {
2805 struct qreg vpm_offset = vir_uniform_ui(c, index++);
2806 ntq_store_def(c, &instr->def, i,
2807 vir_LDVPMV_IN(c, vpm_offset));
2808 }
2809 } else {
2810 for (int i = 0; i < instr->num_components; i++) {
2811 int comp = nir_intrinsic_component(instr) + i;
2812 struct qreg input = c->inputs[offset * 4 + comp];
2813 ntq_store_def(c, &instr->def, i, vir_MOV(c, input));
2814
2815 if (c->s->info.stage == MESA_SHADER_FRAGMENT &&
2816 input.file == c->payload_z.file &&
2817 input.index == c->payload_z.index) {
2818 c->reads_z = true;
2819 }
2820 }
2821 }
2822 }
2823
2824 static void
2825 ntq_emit_per_sample_color_write(struct v3d_compile *c,
2826 nir_intrinsic_instr *instr)
2827 {
2828 assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2829
2830 unsigned rt = nir_src_as_uint(instr->src[1]);
2831 assert(rt < V3D_MAX_DRAW_BUFFERS);
2832
2833 unsigned sample_idx = nir_intrinsic_base(instr);
2834 assert(sample_idx < V3D_MAX_SAMPLES);
2835
2836 unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2837 for (int i = 0; i < instr->num_components; i++) {
2838 c->sample_colors[offset + i] =
2839 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2840 }
2841 }
2842
2843 static void
2844 ntq_emit_color_write(struct v3d_compile *c,
2845 nir_intrinsic_instr *instr)
2846 {
2847 unsigned offset = (nir_intrinsic_base(instr) +
2848 nir_src_as_uint(instr->src[1])) * 4 +
2849 nir_intrinsic_component(instr);
2850 for (int i = 0; i < instr->num_components; i++) {
2851 c->outputs[offset + i] =
2852 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2853 }
2854 }
2855
2856 static void
2857 emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2858 {
2859 assert(instr->num_components == 1);
2860
2861 struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2862
2863 uint32_t base_offset = nir_intrinsic_base(instr);
2864
2865 if (base_offset)
2866 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2867
2868 /* Usually, for VS or FS, we only emit outputs once at program end so
2869 * our VPM writes are never in non-uniform control flow, but this
2870 * is not true for GS, where we are emitting multiple vertices.
2871 */
2872 if (vir_in_nonuniform_control_flow(c)) {
2873 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2874 V3D_QPU_PF_PUSHZ);
2875 }
2876
2877 struct qreg val = ntq_get_src(c, instr->src[0], 0);
2878
2879 /* The offset isn’t necessarily dynamically uniform for a geometry
2880 * shader. This can happen if the shader sometimes doesn’t emit one of
2881 * the vertices. In that case subsequent vertices will be written to
2882 * different offsets in the VPM and we need to use the scatter write
2883 * instruction to have a different offset for each lane.
2884 */
2885 bool is_uniform_offset =
2886 !vir_in_nonuniform_control_flow(c) &&
2887 !nir_src_is_divergent(instr->src[1]);
2888 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2889
2890 if (vir_in_nonuniform_control_flow(c)) {
2891 struct qinst *last_inst =
2892 (struct qinst *)c->cur_block->instructions.prev;
2893 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2894 }
2895 }
2896
2897 static void
2898 emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2899 {
2900 assert(c->s->info.stage == MESA_SHADER_VERTEX);
2901 assert(instr->num_components == 1);
2902
2903 uint32_t base = nir_intrinsic_base(instr);
2904 struct qreg val = ntq_get_src(c, instr->src[0], 0);
2905
2906 if (nir_src_is_const(instr->src[1])) {
2907 vir_VPM_WRITE(c, val,
2908 base + nir_src_as_uint(instr->src[1]));
2909 } else {
2910 struct qreg offset = vir_ADD(c,
2911 ntq_get_src(c, instr->src[1], 1),
2912 vir_uniform_ui(c, base));
2913 bool is_uniform_offset =
2914 !vir_in_nonuniform_control_flow(c) &&
2915 !nir_src_is_divergent(instr->src[1]);
2916 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2917 }
2918 }
2919
2920 static void
2921 ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2922 {
2923 if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2924 ntq_emit_color_write(c, instr);
2925 else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2926 emit_store_output_gs(c, instr);
2927 else
2928 emit_store_output_vs(c, instr);
2929 }
2930
2931 /**
2932 * This implementation is based on v3d_sample_{x,y}_offset() from
2933 * v3d_sample_offset.h.
2934 */
2935 static void
2936 ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2937 struct qreg *sx, struct qreg *sy)
2938 {
2939 sample_idx = vir_ITOF(c, sample_idx);
2940
2941 struct qreg offset_x =
2942 vir_FADD(c, vir_uniform_f(c, -0.125f),
2943 vir_FMUL(c, sample_idx,
2944 vir_uniform_f(c, 0.5f)));
2945 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2946 vir_uniform_f(c, 2.0f), sample_idx),
2947 V3D_QPU_PF_PUSHC);
2948 offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2949 vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2950 offset_x);
2951
2952 struct qreg offset_y =
2953 vir_FADD(c, vir_uniform_f(c, -0.375f),
2954 vir_FMUL(c, sample_idx,
2955 vir_uniform_f(c, 0.25f)));
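        /* With the formulas above, sample 0 lands at (-0.125, -0.375) and
         * sample 1 at (0.375, -0.125); samples 2 and 3 additionally have 1.25
         * subtracted from their x offset by the conditional select above,
         * matching the positions in v3d_sample_offset.h.
         */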
2956 *sx = offset_x;
2957 *sy = offset_y;
2958 }
2959
2960 /**
2961 * This implementation is based on get_centroid_offset() from fep.c.
2962 */
2963 static void
2964 ntq_get_barycentric_centroid(struct v3d_compile *c,
2965 struct qreg *out_x,
2966 struct qreg *out_y)
2967 {
2968 struct qreg sample_mask;
2969 if (c->output_sample_mask_index != -1)
2970 sample_mask = c->outputs[c->output_sample_mask_index];
2971 else
2972 sample_mask = vir_MSF(c);
2973
2974 struct qreg i0 = vir_uniform_ui(c, 0);
2975 struct qreg i1 = vir_uniform_ui(c, 1);
2976 struct qreg i2 = vir_uniform_ui(c, 2);
2977 struct qreg i3 = vir_uniform_ui(c, 3);
2978 struct qreg i4 = vir_uniform_ui(c, 4);
2979 struct qreg i8 = vir_uniform_ui(c, 8);
2980
2981 /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
2982 struct qreg F = vir_uniform_ui(c, 0);
2983 struct qreg T = vir_uniform_ui(c, ~0);
2984 struct qreg s0 = vir_AND(c, sample_mask, i1);
2985 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2986 s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2987 struct qreg s1 = vir_AND(c, sample_mask, i2);
2988 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2989 s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2990 struct qreg s2 = vir_AND(c, sample_mask, i4);
2991 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2992 s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2993 struct qreg s3 = vir_AND(c, sample_mask, i8);
2994 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
2995 s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2996
2997 /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
2998 struct qreg sample_idx = i3;
2999 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
3000 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
3001 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
3002 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
3003 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
3004 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
3005
3006 /* Get offset at selected sample index */
3007 struct qreg offset_x, offset_y;
3008 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3009
3010 /* Select pixel center [offset=(0,0)] if two opposing samples (or none)
3011 * are selected.
3012 */
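        /* For example, a sample mask covering only s1 and s2 (an opposing
         * pair) falls back to the pixel center, while a mask with just s2
         * enabled uses that sample's offset computed above.
         */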
3013 struct qreg s0_and_s3 = vir_AND(c, s0, s3);
3014 struct qreg s1_and_s2 = vir_AND(c, s1, s2);
3015
3016 struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
3017 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
3018 use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
3019 use_center = vir_OR(c, use_center, s0_and_s3);
3020 use_center = vir_OR(c, use_center, s1_and_s2);
3021
3022 struct qreg zero = vir_uniform_f(c, 0.0f);
3023 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
3024 offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
3025 offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
3026
3027 *out_x = offset_x;
3028 *out_y = offset_y;
3029 }
3030
3031 static struct qreg
3032 ntq_emit_load_interpolated_input(struct v3d_compile *c,
3033 struct qreg p,
3034 struct qreg C,
3035 struct qreg offset_x,
3036 struct qreg offset_y,
3037 unsigned mode)
3038 {
3039 if (mode == INTERP_MODE_FLAT)
3040 return C;
3041
3042 struct qreg sample_offset_x =
3043 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3044 struct qreg sample_offset_y =
3045 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3046
3047 struct qreg scaleX =
3048 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
3049 offset_x);
3050 struct qreg scaleY =
3051 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
3052 offset_y);
3053
3054 struct qreg pInterp =
3055 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
3056 vir_FMUL(c, vir_FDY(c, p), scaleY)));
3057
3058 if (mode == INTERP_MODE_NOPERSPECTIVE)
3059 return vir_FADD(c, pInterp, C);
3060
3061 struct qreg w = c->payload_w;
3062 struct qreg wInterp =
3063 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
3064 vir_FMUL(c, vir_FDY(c, w), scaleY)));
3065
3066 return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
3067 }
3068
3069 static void
3070 emit_ldunifa(struct v3d_compile *c, struct qreg *result)
3071 {
3072 struct qinst *ldunifa =
3073 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
3074 ldunifa->qpu.sig.ldunifa = true;
3075 if (result)
3076 *result = vir_emit_def(c, ldunifa);
3077 else
3078 vir_emit_nondef(c, ldunifa);
3079 c->current_unifa_offset += 4;
3080 }
3081
3082 /* Checks if the value of a nir src is derived from a nir register */
3083 static bool
3084 nir_src_derived_from_reg(nir_src src)
3085 {
3086 nir_def *def = src.ssa;
3087 if (nir_load_reg_for_def(def))
3088 return true;
3089
3090 nir_instr *parent = def->parent_instr;
3091 switch (parent->type) {
3092 case nir_instr_type_alu: {
3093 nir_alu_instr *alu = nir_instr_as_alu(parent);
3094 int num_srcs = nir_op_infos[alu->op].num_inputs;
3095 for (int i = 0; i < num_srcs; i++) {
3096 if (nir_src_derived_from_reg(alu->src[i].src))
3097 return true;
3098 }
3099 return false;
3100 }
3101 case nir_instr_type_intrinsic: {
3102 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
3103 int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs;
3104 for (int i = 0; i < num_srcs; i++) {
3105 if (nir_src_derived_from_reg(intr->src[i]))
3106 return true;
3107 }
3108 return false;
3109 }
3110 case nir_instr_type_load_const:
3111 case nir_instr_type_undef:
3112 return false;
3113 default:
3114 /* By default we assume it may come from a register; the above
3115 * cases should be able to handle the majority of situations
3116 * though.
3117 */
3118 return true;
3119 };
3120 }
3121
3122 static bool
3123 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
3124 {
3125 assert(instr->intrinsic == nir_intrinsic_load_ubo ||
3126 instr->intrinsic == nir_intrinsic_load_ssbo ||
3127 instr->intrinsic == nir_intrinsic_load_uniform);
3128
3129 bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
3130 bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
3131 bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo;
3132
3133 /* Every ldunifa auto-increments the unifa address by 4 bytes, so our
3134 * current unifa offset is 4 bytes ahead of the offset of the last load.
3135 */
3136 static const int32_t max_unifa_skip_dist =
3137 MAX_UNIFA_SKIP_DISTANCE - 4;
3138
3139 /* We can only use unifa if the offset is uniform */
3140 nir_src offset = is_uniform ? instr->src[0] : instr->src[1];
3141 if (nir_src_is_divergent(offset))
3142 return false;
3143
3144 /* Emitting loads from unifa may not be safe under non-uniform control
3145 * flow. It seems the address that is used to write to the unifa
3146 * register is taken from the first lane and if that lane is disabled
3147 * by control flow then the value we read may be bogus and lead to
3148 * invalid memory accesses on follow-up ldunifa instructions. However,
3149 * ntq_store_def only emits conditional writes for nir registersas long
3150 * we can be certain that the offset isn't derived from a load_reg we
3151 * should be fine.
3152 *
3153 * The following CTS test can be used to trigger the problem, which
3154 * causes a GMP violation in the sim without this check:
3155 * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int
3156 */
3157 if (vir_in_nonuniform_control_flow(c) &&
3158 nir_src_derived_from_reg(offset)) {
3159 return false;
3160 }
3161
3162 /* We can only use unifa with SSBOs if they are read-only. Otherwise
3163 * ldunifa won't see the shader writes to that address (possibly
3164 * because ldunifa doesn't read from the L2T cache).
3165 */
3166 if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
3167 return false;
3168
3169 /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
3170 * that we may have been written to scratch using the TMU.
3171 */
3172 bool dynamic_src = !nir_src_is_const(offset);
3173 if (is_uniform && dynamic_src && c->s->scratch_size > 0)
3174 return false;
3175
3176 uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
3177 if (is_uniform)
3178 const_offset += nir_intrinsic_base(instr);
3179
3180 /* ldunifa is a 32-bit load instruction so we can only use it with
3181 * 32-bit aligned addresses. We always produce 32-bit aligned addresses
3182 * except for types smaller than 32-bit, so in these cases we can only
3183 * use ldunifa if we can verify alignment, which we can only do for
3184 * loads with a constant offset.
3185 */
3186 uint32_t bit_size = instr->def.bit_size;
3187 uint32_t value_skips = 0;
3188 if (bit_size < 32) {
3189 if (dynamic_src) {
3190 return false;
3191 } else if (const_offset % 4 != 0) {
3192 /* If we are loading from an unaligned offset, fix
3193 * alignment and skip over unused elements in result.
3194 */
3195 value_skips = (const_offset % 4) / (bit_size / 8);
3196 const_offset &= ~0x3;
3197 }
3198 }
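        /* Example of the fixup above: a 16-bit load at const_offset 6 becomes
         * a dword load at offset 4 with value_skips = 1, so the low 16 bits of
         * the ldunifa result are shifted out before the wanted value is read.
         */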
3199
3200 assert((bit_size == 32 && value_skips == 0) ||
3201 (bit_size == 16 && value_skips <= 1) ||
3202 (bit_size == 8 && value_skips <= 3));
3203
3204 /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
3205 * constants.
3206 */
3207 uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);
3208
3209 /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use
3210 * index 0 for Gallium's constant buffer (GL) or push constants
3211 * (Vulkan).
3212 */
3213 if (is_ubo)
3214 index++;
3215
3216 /* We can only keep track of the last unifa address we used with
3217 * constant offset loads. If the new load targets the same buffer and
3218 * is close enough to the previous load, we can skip the unifa register
3219 * write by emitting dummy ldunifa instructions to update the unifa
3220 * address.
3221 */
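        /* For instance, after a single-component 32-bit load at offset 0 the
         * unifa address sits at offset 4, so a follow-up constant load from
         * the same buffer at offset 12 in the same block only needs
         * (12 - 4) / 4 = 2 dummy ldunifa instructions instead of a new unifa
         * setup.
         */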
3222 bool skip_unifa = false;
3223 uint32_t ldunifa_skips = 0;
3224 if (dynamic_src) {
3225 c->current_unifa_block = NULL;
3226 } else if (c->cur_block == c->current_unifa_block &&
3227 c->current_unifa_is_ubo == !is_ssbo &&
3228 c->current_unifa_index == index &&
3229 c->current_unifa_offset <= const_offset &&
3230 c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
3231 skip_unifa = true;
3232 ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
3233 } else {
3234 c->current_unifa_block = c->cur_block;
3235 c->current_unifa_is_ubo = !is_ssbo;
3236 c->current_unifa_index = index;
3237 c->current_unifa_offset = const_offset;
3238 }
3239
3240 if (!skip_unifa) {
3241 struct qreg base_offset = !is_ssbo ?
3242 vir_uniform(c, QUNIFORM_UBO_ADDR,
3243 v3d_unit_data_create(index, const_offset)) :
3244 vir_uniform(c, QUNIFORM_SSBO_OFFSET, index);
3245
3246 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
3247 if (!dynamic_src) {
3248 if (!is_ssbo) {
3249 /* Avoid the extra MOV to UNIFA by making
3250 * ldunif load directly into it. We can't
3251 * do this if we have not actually emitted
3252 * ldunif and are instead reusing a previous
3253 * one.
3254 */
3255 struct qinst *inst =
3256 (struct qinst *)c->cur_block->instructions.prev;
3257 if (inst == c->defs[base_offset.index]) {
3258 inst->dst = unifa;
3259 c->defs[base_offset.index] = NULL;
3260 } else {
3261 vir_MOV_dest(c, unifa, base_offset);
3262 }
3263 } else {
3264 vir_ADD_dest(c, unifa, base_offset,
3265 vir_uniform_ui(c, const_offset));
3266 }
3267 } else {
3268 vir_ADD_dest(c, unifa, base_offset,
3269 ntq_get_src(c, offset, 0));
3270 }
3271 } else {
3272 for (int i = 0; i < ldunifa_skips; i++)
3273 emit_ldunifa(c, NULL);
3274 }
3275
3276 uint32_t num_components = nir_intrinsic_dest_components(instr);
3277 for (uint32_t i = 0; i < num_components; ) {
3278 struct qreg data;
3279 emit_ldunifa(c, &data);
3280
3281 if (bit_size == 32) {
3282 assert(value_skips == 0);
3283 ntq_store_def(c, &instr->def, i, vir_MOV(c, data));
3284 i++;
3285 } else {
3286 assert((bit_size == 16 && value_skips <= 1) ||
3287 (bit_size == 8 && value_skips <= 3));
3288
3289 /* If we have any values to skip, shift to the first
3290 * valid value in the ldunifa result.
3291 */
3292 if (value_skips > 0) {
3293 data = vir_SHR(c, data,
3294 vir_uniform_ui(c, bit_size *
3295 value_skips));
3296 }
3297
3298 /* Check how many valid components we have discounting
3299 * read components to skip.
3300 */
3301 uint32_t valid_count = (32 / bit_size) - value_skips;
3302 assert((bit_size == 16 && valid_count <= 2) ||
3303 (bit_size == 8 && valid_count <= 4));
3304 assert(valid_count > 0);
3305
3306 /* Process the valid components */
3307 do {
3308 struct qreg tmp;
3309 uint32_t mask = (1 << bit_size) - 1;
3310 tmp = vir_AND(c, vir_MOV(c, data),
3311 vir_uniform_ui(c, mask));
3312 ntq_store_def(c, &instr->def, i,
3313 vir_MOV(c, tmp));
3314 i++;
3315 valid_count--;
3316
3317 /* Shift to next component */
3318 if (i < num_components && valid_count > 0) {
3319 data = vir_SHR(c, data,
3320 vir_uniform_ui(c, bit_size));
3321 }
3322 } while (i < num_components && valid_count > 0);
3323 }
3324 }
3325
3326 return true;
3327 }
3328
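/* The local invocation index is packed into the top bits of the second
 * compute payload word, so shifting right by (32 - index bits) leaves it in
 * the LSBs.
 */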
3329 static inline struct qreg
3330 emit_load_local_invocation_index(struct v3d_compile *c)
3331 {
3332 return vir_SHR(c, c->cs_payload[1],
3333 vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
3334 }
3335
3336 /* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
3337 * fragment shaders a lane is considered active if any sample flags are set
3338 * for *any* lane in the same quad, however, we still need to ensure that
3339 * terminated lanes (OpTerminate) are not included. Further, we also need to
3340 * disable lanes that may be disabled because of non-uniform control
3341 * flow.
3342 */
3343 static enum v3d_qpu_cond
3344 setup_subgroup_control_flow_condition(struct v3d_compile *c)
3345 {
3346 assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
3347 c->s->info.stage == MESA_SHADER_COMPUTE);
3348
3349 enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
3350
3351 /* We need to make sure that terminated lanes in fragment shaders are
3352          * not included. We can identify these lanes by comparing the initial
3353 * sample mask with the current. This fixes:
3354 * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
3355 */
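        /* Note: ANDing start_msf with ~(start_msf ^ MSF) keeps only the
         * sample bits that were covered at shader start and are still
         * covered now, so PUSHZ sets the A flag for fully terminated lanes
         * and IFNA selects the lanes that still have live samples.
         */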
3356 if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
3357 vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
3358 vir_NOT(c, vir_XOR(c, c->start_msf,
3359 vir_MSF(c)))),
3360 V3D_QPU_PF_PUSHZ);
3361 cond = V3D_QPU_COND_IFNA;
3362 }
3363
3364 /* If we are in non-uniform control-flow update the condition to
3365 * also limit lanes to those in the current execution mask.
3366 */
3367 if (vir_in_nonuniform_control_flow(c)) {
3368 if (cond == V3D_QPU_COND_IFNA) {
3369 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3370 V3D_QPU_UF_NORNZ);
3371 } else {
3372 assert(cond == V3D_QPU_COND_NONE);
3373 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3374 V3D_QPU_PF_PUSHZ);
3375 }
3376 cond = V3D_QPU_COND_IFA;
3377 }
3378
3379 return cond;
3380 }
3381
3382 static void
3383 emit_compute_barrier(struct v3d_compile *c)
3384 {
3385 /* Ensure we flag the use of the control barrier. NIR's
3386 * gather info pass usually takes care of this, but that
3387 * requires that we call that pass after any other pass
3388          * requires that we call that pass after any other pass that
3389 */
3390 c->s->info.uses_control_barrier = true;
3391
3392 /* Emit a TSY op to get all invocations in the workgroup
3393 * (actually supergroup) to block until the last
3394 * invocation reaches the TSY op.
3395 */
3396 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
3397 }
3398
3399 static void
3400 emit_barrier(struct v3d_compile *c)
3401 {
3402 struct qreg eidx = vir_EIDX(c);
3403
3404         /* The config for the TSY op should be set up like this:
3405 * - Lane 0: Quorum
3406 * - Lane 2: TSO id
3407 * - Lane 3: TSY opcode
3408 */
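        /* Each per-lane value below is written by XORing EIDX against the
         * lane number (so PUSHZ sets the A flag only for that lane) and then
         * doing a conditional MOV, as done for lanes 2 and 3.
         */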
3409
3410 /* Lane 0: we want to synchronize across one subgroup. Here we write to
3411 * all lanes unconditionally and will overwrite other lanes below.
3412 */
3413 struct qreg tsy_conf = vir_uniform_ui(c, 1);
3414
3415         /* Lane 2: TSO id. We choose a general purpose TSO (id=0..63) using the
3416          * current QPU index and thread index to ensure we get a unique one for
3417 * this group of invocations in this core.
3418 */
3419 struct qreg tso_id =
3420 vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
3421 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
3422 V3D_QPU_PF_PUSHZ);
3423 vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
3424
3425 /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
3426 struct qreg tsy_op = vir_uniform_ui(c, 16);
3427 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
3428 V3D_QPU_PF_PUSHZ);
3429 vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
3430
3431 /* Emit TSY sync */
3432 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
3433 }
3434
3435 static void
3436 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
3437 {
3438 switch (instr->intrinsic) {
3439 case nir_intrinsic_decl_reg:
3440 case nir_intrinsic_load_reg:
3441 case nir_intrinsic_store_reg:
3442 break; /* Ignore these */
3443
3444 case nir_intrinsic_load_uniform:
3445 ntq_emit_load_uniform(c, instr);
3446 break;
3447
3448 case nir_intrinsic_load_global:
3449 case nir_intrinsic_load_global_constant:
3450 ntq_emit_tmu_general(c, instr, false, true);
3451 c->has_general_tmu_load = true;
3452 break;
3453
3454 case nir_intrinsic_load_ubo:
3455 if (ntq_emit_inline_ubo_load(c, instr))
3456 break;
3457 FALLTHROUGH;
3458 case nir_intrinsic_load_ssbo:
3459 if (!ntq_emit_load_unifa(c, instr)) {
3460 ntq_emit_tmu_general(c, instr, false, false);
3461 c->has_general_tmu_load = true;
3462 }
3463 break;
3464
3465 case nir_intrinsic_store_ssbo:
3466 case nir_intrinsic_ssbo_atomic:
3467 case nir_intrinsic_ssbo_atomic_swap:
3468 ntq_emit_tmu_general(c, instr, false, false);
3469 break;
3470
3471 case nir_intrinsic_store_global:
3472 case nir_intrinsic_global_atomic:
3473 case nir_intrinsic_global_atomic_swap:
3474 ntq_emit_tmu_general(c, instr, false, true);
3475 break;
3476
3477 case nir_intrinsic_shared_atomic:
3478 case nir_intrinsic_shared_atomic_swap:
3479 case nir_intrinsic_store_shared:
3480 case nir_intrinsic_store_scratch:
3481 ntq_emit_tmu_general(c, instr, true, false);
3482 break;
3483
3484 case nir_intrinsic_load_scratch:
3485 case nir_intrinsic_load_shared:
3486 ntq_emit_tmu_general(c, instr, true, false);
3487 c->has_general_tmu_load = true;
3488 break;
3489
3490 case nir_intrinsic_image_store:
3491 case nir_intrinsic_image_atomic:
3492 case nir_intrinsic_image_atomic_swap:
3493 v3d_vir_emit_image_load_store(c, instr);
3494 break;
3495
3496 case nir_intrinsic_image_load:
3497 v3d_vir_emit_image_load_store(c, instr);
3498 /* Not really a general TMU load, but we only use this flag
3499 * for NIR scheduling and we do schedule these under the same
3500 * policy as general TMU.
3501 */
3502 c->has_general_tmu_load = true;
3503 break;
3504
3505 case nir_intrinsic_get_ssbo_size:
3506 ntq_store_def(c, &instr->def, 0,
3507 vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
3508 nir_src_comp_as_uint(instr->src[0], 0)));
3509 break;
3510
3511 case nir_intrinsic_get_ubo_size:
3512 ntq_store_def(c, &instr->def, 0,
3513 vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
3514 nir_src_comp_as_uint(instr->src[0], 0)));
3515 break;
3516
3517 case nir_intrinsic_load_user_clip_plane:
3518 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
3519 ntq_store_def(c, &instr->def, i,
3520 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
3521 nir_intrinsic_ucp_id(instr) *
3522 4 + i));
3523 }
3524 break;
3525
3526 case nir_intrinsic_load_viewport_x_scale:
3527 ntq_store_def(c, &instr->def, 0,
3528 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
3529 break;
3530
3531 case nir_intrinsic_load_viewport_y_scale:
3532 ntq_store_def(c, &instr->def, 0,
3533 vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
3534 break;
3535
3536 case nir_intrinsic_load_viewport_z_scale:
3537 ntq_store_def(c, &instr->def, 0,
3538 vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
3539 break;
3540
3541 case nir_intrinsic_load_viewport_z_offset:
3542 ntq_store_def(c, &instr->def, 0,
3543 vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
3544 break;
3545
3546 case nir_intrinsic_load_line_coord:
3547 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x));
3548 break;
3549
3550 case nir_intrinsic_load_line_width:
3551 ntq_store_def(c, &instr->def, 0,
3552 vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
3553 break;
3554
3555 case nir_intrinsic_load_aa_line_width:
3556 ntq_store_def(c, &instr->def, 0,
3557 vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
3558 break;
3559
3560 case nir_intrinsic_load_sample_mask_in:
3561 ntq_store_def(c, &instr->def, 0, vir_MSF(c));
3562 break;
3563
3564 case nir_intrinsic_load_helper_invocation:
3565 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
3566 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3567 ntq_store_def(c, &instr->def, 0, qdest);
3568 break;
3569
3570 case nir_intrinsic_load_front_face:
3571 /* The register contains 0 (front) or 1 (back), and we need to
3572 * turn it into a NIR bool where true means front.
3573 */
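        /* ADD(-1, REVF) yields ~0 (NIR true) when REVF reads 0 (front) and
         * 0 (NIR false) when it reads 1 (back).
         */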
3574 ntq_store_def(c, &instr->def, 0,
3575 vir_ADD(c,
3576 vir_uniform_ui(c, -1),
3577 vir_REVF(c)));
3578 break;
3579
3580 case nir_intrinsic_load_base_instance:
3581 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid));
3582 break;
3583
3584 case nir_intrinsic_load_instance_id:
3585 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid));
3586 break;
3587
3588 case nir_intrinsic_load_vertex_id:
3589 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid));
3590 break;
3591
3592 case nir_intrinsic_load_draw_id:
3593 ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0));
3594 break;
3595
3596 case nir_intrinsic_load_tlb_color_brcm:
3597 vir_emit_tlb_color_read(c, instr);
3598 break;
3599
3600 case nir_intrinsic_load_fep_w_v3d:
3601 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w));
3602 break;
3603
3604 case nir_intrinsic_load_input:
3605 ntq_emit_load_input(c, instr);
3606 break;
3607
3608 case nir_intrinsic_store_tlb_sample_color_v3d:
3609 ntq_emit_per_sample_color_write(c, instr);
3610 break;
3611
3612 case nir_intrinsic_store_output:
3613 ntq_emit_store_output(c, instr);
3614 break;
3615
3616 case nir_intrinsic_image_size:
3617 ntq_emit_image_size(c, instr);
3618 break;
3619
3620 /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which
3621 * is intended to match the semantics of GLSL's discard) should
3622 * terminate the invocation immediately. Our implementation doesn't
3623 * do that. What we do is actually a demote by removing the invocations
3624 * from the sample mask. Maybe we could be more strict and force an
3625 * early termination by emitting a (maybe conditional) jump to the
3626 * end section of the fragment shader for affected invocations.
3627 */
3628 case nir_intrinsic_terminate:
3629 c->emitted_discard = true;
3630 FALLTHROUGH;
3631 case nir_intrinsic_demote:
3632 ntq_flush_tmu(c);
3633
3634 if (vir_in_nonuniform_control_flow(c)) {
3635 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3636 V3D_QPU_PF_PUSHZ);
3637 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3638 vir_uniform_ui(c, 0)),
3639 V3D_QPU_COND_IFA);
3640 } else {
3641 vir_SETMSF_dest(c, vir_nop_reg(),
3642 vir_uniform_ui(c, 0));
3643 }
3644 break;
3645
3646 case nir_intrinsic_terminate_if:
3647 c->emitted_discard = true;
3648 FALLTHROUGH;
3649 case nir_intrinsic_demote_if: {
3650 ntq_flush_tmu(c);
3651
3652 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
3653
3654 if (vir_in_nonuniform_control_flow(c)) {
3655 struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
3656 c->execute);
3657 if (cond == V3D_QPU_COND_IFA) {
3658 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
3659 } else {
3660 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3661 cond = V3D_QPU_COND_IFA;
3662 }
3663 }
3664
3665 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3666 vir_uniform_ui(c, 0)), cond);
3667 break;
3668 }
3669
3670 case nir_intrinsic_barrier:
3671 /* Ensure that the TMU operations before the barrier are flushed
3672 * before the ones after the barrier.
3673 */
3674 ntq_flush_tmu(c);
3675
3676 if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
3677 if (c->s->info.stage == MESA_SHADER_COMPUTE)
3678 emit_compute_barrier(c);
3679 else
3680 emit_barrier(c);
3681
3682 /* The blocking of a TSY op only happens at the next
3683 * thread switch. No texturing may be outstanding at the
3684 * time of a TSY blocking operation.
3685 */
3686 vir_emit_thrsw(c);
3687 }
3688 break;
3689
3690 case nir_intrinsic_load_num_workgroups:
3691 for (int i = 0; i < 3; i++) {
3692 ntq_store_def(c, &instr->def, i,
3693 vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3694 i));
3695 }
3696 break;
3697
3698 case nir_intrinsic_load_workgroup_id: {
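                /* The unpacking below assumes the workgroup ID is packed in
                 * the compute payload as X in cs_payload[0][15:0], Y in
                 * cs_payload[0][31:16] and Z in cs_payload[1][15:0].
                 */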
3699 struct qreg x = vir_AND(c, c->cs_payload[0],
3700 vir_uniform_ui(c, 0xffff));
3701 ntq_store_def(c, &instr->def, 0, x);
3702
3703 struct qreg y = vir_SHR(c, c->cs_payload[0],
3704 vir_uniform_ui(c, 16));
3705 ntq_store_def(c, &instr->def, 1, y);
3706
3707 struct qreg z = vir_AND(c, c->cs_payload[1],
3708 vir_uniform_ui(c, 0xffff));
3709 ntq_store_def(c, &instr->def, 2, z);
3710 break;
3711 }
3712
3713 case nir_intrinsic_load_base_workgroup_id: {
3714 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0);
3715 ntq_store_def(c, &instr->def, 0, x);
3716
3717 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1);
3718 ntq_store_def(c, &instr->def, 1, y);
3719
3720 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2);
3721 ntq_store_def(c, &instr->def, 2, z);
3722 break;
3723 }
3724
3725 case nir_intrinsic_load_workgroup_size: {
3726 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 0);
3727 ntq_store_def(c, &instr->def, 0, x);
3728
3729 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 1);
3730 ntq_store_def(c, &instr->def, 1, y);
3731
3732 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 2);
3733 ntq_store_def(c, &instr->def, 2, z);
3734 break;
3735 }
3736
3737 case nir_intrinsic_load_local_invocation_index:
3738 ntq_store_def(c, &instr->def, 0,
3739 emit_load_local_invocation_index(c));
3740 break;
3741
3742 case nir_intrinsic_load_subgroup_id: {
3743 /* This is basically the batch index, which is the Local
3744                  * Invocation Index divided by the SIMD width.
3745 */
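                /* For example, with V3D_CHANNELS == 16 this divide is just a
                 * right shift by 4.
                 */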
3746 STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0);
3747 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
3748 struct qreg lii = emit_load_local_invocation_index(c);
3749 ntq_store_def(c, &instr->def, 0,
3750 vir_SHR(c, lii,
3751 vir_uniform_ui(c, divide_shift)));
3752 break;
3753 }
3754
3755 case nir_intrinsic_load_per_vertex_input: {
3756 /* The vertex shader writes all its used outputs into
3757 * consecutive VPM offsets, so if any output component is
3758 * unused, its VPM offset is used by the next used
3759 * component. This means that we can't assume that each
3760 * location will use 4 consecutive scalar offsets in the VPM
3761 * and we need to compute the VPM offset for each input by
3762 * going through the inputs and finding the one that matches
3763 * our location and component.
3764 *
3765 * col: vertex index, row = varying index
3766 */
3767 assert(nir_src_is_const(instr->src[1]));
3768 uint32_t location =
3769 nir_intrinsic_io_semantics(instr).location +
3770 nir_src_as_uint(instr->src[1]);
3771 uint32_t component = nir_intrinsic_component(instr);
3772
3773 int32_t row_idx = -1;
3774 for (int i = 0; i < c->num_inputs; i++) {
3775 struct v3d_varying_slot slot = c->input_slots[i];
3776 if (v3d_slot_get_slot(slot) == location &&
3777 v3d_slot_get_component(slot) == component) {
3778 row_idx = i;
3779 break;
3780 }
3781 }
3782
3783 assert(row_idx != -1);
3784
3785 struct qreg col = ntq_get_src(c, instr->src[0], 0);
3786 for (int i = 0; i < instr->num_components; i++) {
3787 struct qreg row = vir_uniform_ui(c, row_idx++);
3788 ntq_store_def(c, &instr->def, i,
3789 vir_LDVPMG_IN(c, row, col));
3790 }
3791 break;
3792 }
3793
3794 case nir_intrinsic_emit_vertex:
3795 case nir_intrinsic_end_primitive:
3796 unreachable("Should have been lowered in v3d_nir_lower_io");
3797 break;
3798
3799 case nir_intrinsic_load_primitive_id: {
3800 /* gl_PrimitiveIdIn is written by the GBG in the first word of
3801                  * the VPM output header. According to the docs, we should read this
3802 * using ldvpm(v,d)_in (See Table 71).
3803 */
3804 assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3805 ntq_store_def(c, &instr->def, 0,
3806 vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3807 break;
3808 }
3809
3810 case nir_intrinsic_load_invocation_id:
3811 ntq_store_def(c, &instr->def, 0, vir_IID(c));
3812 break;
3813
3814 case nir_intrinsic_load_fb_layers_v3d:
3815 ntq_store_def(c, &instr->def, 0,
3816 vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3817 break;
3818
3819 case nir_intrinsic_load_sample_id:
3820 ntq_store_def(c, &instr->def, 0, vir_SAMPID(c));
3821 break;
3822
3823 case nir_intrinsic_load_sample_pos:
3824 ntq_store_def(c, &instr->def, 0,
3825 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3826 ntq_store_def(c, &instr->def, 1,
3827 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3828 break;
3829
3830 case nir_intrinsic_load_barycentric_at_offset:
3831 ntq_store_def(c, &instr->def, 0,
3832 vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3833 ntq_store_def(c, &instr->def, 1,
3834 vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3835 break;
3836
3837 case nir_intrinsic_load_barycentric_pixel:
3838 ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3839 ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3840 break;
3841
3842 case nir_intrinsic_load_barycentric_at_sample: {
3843 if (!c->fs_key->msaa) {
3844 ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3845 ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3846 return;
3847 }
3848
3849 struct qreg offset_x, offset_y;
3850 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3851 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3852
3853 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3854 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3855 break;
3856 }
3857
3858 case nir_intrinsic_load_barycentric_sample: {
3859 struct qreg offset_x =
3860 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3861 struct qreg offset_y =
3862 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3863
3864 ntq_store_def(c, &instr->def, 0,
3865 vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3866 ntq_store_def(c, &instr->def, 1,
3867 vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3868 break;
3869 }
3870
3871 case nir_intrinsic_load_barycentric_centroid: {
3872 struct qreg offset_x, offset_y;
3873 ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3874 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3875 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3876 break;
3877 }
3878
3879 case nir_intrinsic_load_interpolated_input: {
3880 assert(nir_src_is_const(instr->src[1]));
3881 const uint32_t offset = nir_src_as_uint(instr->src[1]);
3882
3883 for (int i = 0; i < instr->num_components; i++) {
3884 const uint32_t input_idx =
3885 (nir_intrinsic_base(instr) + offset) * 4 +
3886 nir_intrinsic_component(instr) + i;
3887
3888 /* If we are not in MSAA or if we are not interpolating
3889 * a user varying, just return the pre-computed
3890 * interpolated input.
3891 */
3892 if (!c->fs_key->msaa ||
3893 c->interp[input_idx].vp.file == QFILE_NULL) {
3894 ntq_store_def(c, &instr->def, i,
3895 vir_MOV(c, c->inputs[input_idx]));
3896 continue;
3897 }
3898
3899 /* Otherwise compute interpolation at the specified
3900 * offset.
3901 */
3902 struct qreg p = c->interp[input_idx].vp;
3903 struct qreg C = c->interp[input_idx].C;
3904 unsigned interp_mode = c->interp[input_idx].mode;
3905
3906 struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3907 struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3908
3909 struct qreg result =
3910 ntq_emit_load_interpolated_input(c, p, C,
3911 offset_x, offset_y,
3912 interp_mode);
3913 ntq_store_def(c, &instr->def, i, result);
3914 }
3915 break;
3916 }
3917
3918 case nir_intrinsic_load_subgroup_size:
3919 ntq_store_def(c, &instr->def, 0,
3920 vir_uniform_ui(c, V3D_CHANNELS));
3921 break;
3922
3923 case nir_intrinsic_load_subgroup_invocation:
3924 ntq_store_def(c, &instr->def, 0, vir_EIDX(c));
3925 break;
3926
3927 case nir_intrinsic_ddx:
3928 case nir_intrinsic_ddx_coarse:
3929 case nir_intrinsic_ddx_fine: {
3930 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3931 ntq_store_def(c, &instr->def, 0, vir_FDX(c, value));
3932 break;
3933 }
3934
3935 case nir_intrinsic_ddy:
3936 case nir_intrinsic_ddy_coarse:
3937 case nir_intrinsic_ddy_fine: {
3938 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3939 ntq_store_def(c, &instr->def, 0, vir_FDY(c, value));
3940 break;
3941 }
3942
3943 case nir_intrinsic_elect: {
3944 struct qreg first;
3945 if (vir_in_nonuniform_control_flow(c)) {
3946 /* Sets A=1 for lanes enabled in the execution mask */
3947 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3948 V3D_QPU_PF_PUSHZ);
3949 /* Updates A ANDing with lanes enabled in MSF */
3950 vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()),
3951 V3D_QPU_UF_ANDNZ);
3952 first = vir_FLAFIRST(c);
3953 } else {
3954 /* Sets A=1 for inactive lanes */
3955 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()),
3956 V3D_QPU_PF_PUSHZ);
3957 first = vir_FLNAFIRST(c);
3958 }
3959
3960 /* Produce a boolean result */
3961 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3962 first, vir_uniform_ui(c, 1)),
3963 V3D_QPU_PF_PUSHZ);
3964 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3965 ntq_store_def(c, &instr->def, 0, result);
3966 break;
3967 }
3968
3969 case nir_intrinsic_ballot: {
3970 assert(c->devinfo->ver >= 71);
3971 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3972 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3973 struct qreg res = vir_get_temp(c);
3974 vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
3975 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3976 break;
3977 }
3978
3979 case nir_intrinsic_read_invocation: {
3980 assert(c->devinfo->ver >= 71);
3981 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3982 struct qreg index = ntq_get_src(c, instr->src[1], 0);
3983 struct qreg res = vir_SHUFFLE(c, value, index);
3984 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3985 break;
3986 }
3987
3988 case nir_intrinsic_read_first_invocation: {
3989 assert(c->devinfo->ver >= 71);
3990 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3991 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3992 struct qreg res = vir_get_temp(c);
3993 vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
3994 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3995 break;
3996 }
3997
3998 case nir_intrinsic_shuffle: {
3999 assert(c->devinfo->ver >= 71);
4000 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4001 struct qreg indices = ntq_get_src(c, instr->src[1], 0);
4002 struct qreg res = vir_SHUFFLE(c, value, indices);
4003 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4004 break;
4005 }
4006
4007 case nir_intrinsic_vote_feq:
4008 case nir_intrinsic_vote_ieq: {
4009 assert(c->devinfo->ver >= 71);
4010 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4011 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4012 struct qreg res = vir_get_temp(c);
4013 vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
4014 vir_ALLEQ_dest(c, res, value) :
4015 vir_ALLFEQ_dest(c, res, value),
4016 cond);
4017
4018 /* Produce boolean result */
4019 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4020 V3D_QPU_PF_PUSHZ);
4021 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
4022 ntq_store_def(c, &instr->def, 0, result);
4023 break;
4024 }
4025
4026 case nir_intrinsic_vote_all: {
4027 assert(c->devinfo->ver >= 71);
4028 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4029 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4030 struct qreg res = vir_get_temp(c);
4031 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
4032
4033 /* We want to check if 'all lanes are equal (alleq != 0) and
4034 * their value is True (value != 0)'.
4035 *
4036                  * The first MOV.pushz generates the predicate for 'alleq == 0'.
4037                  * The second MOV.NORZ generates the predicate for:
4038                  * '!(alleq == 0) & !(value == 0)'.
4039 */
4040 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4041 V3D_QPU_PF_PUSHZ);
4042 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
4043 V3D_QPU_UF_NORZ);
4044 struct qreg result =
4045 ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
4046 ntq_store_def(c, &instr->def, 0, result);
4047 break;
4048 }
4049
4050 case nir_intrinsic_vote_any: {
4051 assert(c->devinfo->ver >= 71);
4052 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4053 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4054 struct qreg res = vir_get_temp(c);
4055 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
4056
4057                 /* We want to check 'not (all lanes are equal (alleq != 0)
4058                  * and their value is False (value == 0))'.
4059                  *
4060                  * The first MOV.pushz generates the predicate for 'alleq == 0'.
4061                  * The second MOV.NORNZ generates the predicate for:
4062                  * '!(alleq == 0) & (value == 0)'.
4063                  * The IFNA condition negates the predicate when evaluated:
4064                  * '!(!(alleq == 0) & (value == 0))'.
4065 */
4066 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4067 V3D_QPU_PF_PUSHZ);
4068 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
4069 V3D_QPU_UF_NORNZ);
4070 struct qreg result =
4071 ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
4072 ntq_store_def(c, &instr->def, 0, result);
4073 break;
4074 }
4075
4076 case nir_intrinsic_load_num_subgroups:
4077 unreachable("Should have been lowered");
4078 break;
4079
4080 case nir_intrinsic_load_view_index:
4081 ntq_store_def(c, &instr->def, 0,
4082 vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
4083 break;
4084
4085 default:
4086 fprintf(stderr, "Unknown intrinsic: ");
4087 nir_print_instr(&instr->instr, stderr);
4088 fprintf(stderr, "\n");
4089 abort();
4090 }
4091 }
4092
4093 /* Clears (activates) the execute flags for any channels whose jump target
4094 * matches this block.
4095 *
4096 * XXX perf: Could we be using flpush/flpop somehow for our execution channel
4097 * enabling?
4098 *
4099 */
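/* (Convention used by the non-uniform control flow code: c->execute == 0
 * means the channel is active, while a non-zero value is the index of the
 * block the channel is waiting to reach.)
 */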
4100 static void
4101 ntq_activate_execute_for_block(struct v3d_compile *c)
4102 {
4103 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4104 c->execute, vir_uniform_ui(c, c->cur_block->index)),
4105 V3D_QPU_PF_PUSHZ);
4106
4107 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4108 }
4109
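/* Returns true if the block only contains register access intrinsics (which
 * are free) and at most a couple of trivial ALU/undef/load_const
 * instructions, so falling through it is cheaper than branching over it.
 */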
4110 static bool
4111 is_cheap_block(nir_block *block)
4112 {
4113 int32_t cost = 3;
4114 nir_foreach_instr(instr, block) {
4115 switch (instr->type) {
4116 case nir_instr_type_alu:
4117 case nir_instr_type_undef:
4118 case nir_instr_type_load_const:
4119 if (--cost <= 0)
4120 return false;
4121 break;
4122 case nir_instr_type_intrinsic: {
4123 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4124 switch (intr->intrinsic) {
4125 case nir_intrinsic_decl_reg:
4126 case nir_intrinsic_load_reg:
4127 case nir_intrinsic_store_reg:
4128 continue;
4129 default:
4130 return false;
4131 }
4132 }
4133 default:
4134 return false;
4135 }
4136 }
4137 return true;
4138 }
4139
4140 static void
4141 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
4142 {
4143 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4144 bool empty_else_block =
4145 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4146 exec_list_is_empty(&nir_else_block->instr_list));
4147
4148 struct qblock *then_block = vir_new_block(c);
4149 struct qblock *after_block = vir_new_block(c);
4150 struct qblock *else_block;
4151 if (empty_else_block)
4152 else_block = after_block;
4153 else
4154 else_block = vir_new_block(c);
4155
4156 /* Check if this if statement is really just a conditional jump with
4157 * the form:
4158 *
4159 * if (cond) {
4160 * break/continue;
4161 * } else {
4162 * }
4163 *
4164 * In which case we can skip the jump to ELSE we emit before the THEN
4165 * block and instead just emit the break/continue directly.
4166 */
4167 nir_jump_instr *conditional_jump = NULL;
4168 if (empty_else_block) {
4169 nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
4170 struct nir_instr *inst = nir_block_first_instr(nir_then_block);
4171 if (inst && inst->type == nir_instr_type_jump)
4172 conditional_jump = nir_instr_as_jump(inst);
4173 }
4174
4175 /* Set up the flags for the IF condition (taking the THEN branch). */
4176 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4177
4178 if (!conditional_jump) {
4179 /* Jump to ELSE. */
4180 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4181 V3D_QPU_BRANCH_COND_ANYNA :
4182 V3D_QPU_BRANCH_COND_ANYA);
4183 /* Pixels that were not dispatched or have been discarded
4184 * should not contribute to the ANYA/ANYNA condition.
4185 */
4186 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4187
4188 vir_link_blocks(c->cur_block, else_block);
4189 vir_link_blocks(c->cur_block, then_block);
4190
4191 /* Process the THEN block. */
4192 vir_set_emit_block(c, then_block);
4193 ntq_emit_cf_list(c, &if_stmt->then_list);
4194
4195 if (!empty_else_block) {
4196 /* At the end of the THEN block, jump to ENDIF, unless
4197 * the block ended in a break or continue.
4198 */
4199 if (!c->cur_block->branch_emitted) {
4200 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4201 vir_link_blocks(c->cur_block, after_block);
4202 }
4203
4204 /* Emit the else block. */
4205 vir_set_emit_block(c, else_block);
4206 ntq_emit_cf_list(c, &if_stmt->else_list);
4207 }
4208 } else {
4209 /* Emit the conditional jump directly.
4210 *
4211 * Use ALL with breaks and ANY with continues to ensure that
4212 * we always break and never continue when all lanes have been
4213 * disabled (for example because of discards) to prevent
4214 * infinite loops.
4215 */
4216 assert(conditional_jump &&
4217 (conditional_jump->type == nir_jump_continue ||
4218 conditional_jump->type == nir_jump_break));
4219
4220 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4221 (conditional_jump->type == nir_jump_break ?
4222 V3D_QPU_BRANCH_COND_ALLA :
4223 V3D_QPU_BRANCH_COND_ANYA) :
4224 (conditional_jump->type == nir_jump_break ?
4225 V3D_QPU_BRANCH_COND_ALLNA :
4226 V3D_QPU_BRANCH_COND_ANYNA));
4227 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4228
4229 vir_link_blocks(c->cur_block,
4230 conditional_jump->type == nir_jump_break ?
4231 c->loop_break_block :
4232 c->loop_cont_block);
4233 }
4234
4235 vir_link_blocks(c->cur_block, after_block);
4236
4237 vir_set_emit_block(c, after_block);
4238 }
4239
4240 static void
4241 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
4242 {
4243 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4244 bool empty_else_block =
4245 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4246 exec_list_is_empty(&nir_else_block->instr_list));
4247
4248 struct qblock *then_block = vir_new_block(c);
4249 struct qblock *after_block = vir_new_block(c);
4250 struct qblock *else_block;
4251 if (empty_else_block)
4252 else_block = after_block;
4253 else
4254 else_block = vir_new_block(c);
4255
4256 bool was_uniform_control_flow = false;
4257 if (!vir_in_nonuniform_control_flow(c)) {
4258 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4259 was_uniform_control_flow = true;
4260 }
4261
4262 /* Set up the flags for the IF condition (taking the THEN branch). */
4263 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4264
4265 /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
4266          * was previously active (execute Z)" for updating the exec flags.
4267 */
4268 if (was_uniform_control_flow) {
4269 cond = v3d_qpu_cond_invert(cond);
4270 } else {
4271 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
4272 if (cond == V3D_QPU_COND_IFA) {
4273 vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
4274 } else {
4275 vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
4276 cond = V3D_QPU_COND_IFA;
4277 }
4278 }
4279
4280 vir_MOV_cond(c, cond,
4281 c->execute,
4282 vir_uniform_ui(c, else_block->index));
4283
4284 /* Set the flags for taking the THEN block */
4285 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4286 V3D_QPU_PF_PUSHZ);
4287
4288 /* Jump to ELSE if nothing is active for THEN (unless THEN block is
4289 * so small it won't pay off), otherwise fall through.
4290 */
4291 bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
4292 is_cheap_block(nir_if_first_then_block(if_stmt));
4293 if (!is_cheap) {
4294 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
4295 vir_link_blocks(c->cur_block, else_block);
4296 }
4297 vir_link_blocks(c->cur_block, then_block);
4298
4299 /* Process the THEN block.
4300 *
4301 * Notice we don't call ntq_activate_execute_for_block here on purpose:
4302 * c->execute is already set up to be 0 for lanes that must take the
4303 * THEN block.
4304 */
4305 vir_set_emit_block(c, then_block);
4306 ntq_emit_cf_list(c, &if_stmt->then_list);
4307
4308 if (!empty_else_block) {
4309 /* Handle the end of the THEN block. First, all currently
4310 * active channels update their execute flags to point to
4311 * ENDIF
4312 */
4313 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4314 V3D_QPU_PF_PUSHZ);
4315 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4316 vir_uniform_ui(c, after_block->index));
4317
4318 /* If everything points at ENDIF, then jump there immediately
4319 * (unless ELSE block is so small it won't pay off).
4320 */
4321 bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
4322 is_cheap_block(nir_else_block);
4323 if (!is_cheap) {
4324 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4325 c->execute,
4326 vir_uniform_ui(c, after_block->index)),
4327 V3D_QPU_PF_PUSHZ);
4328 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
4329 vir_link_blocks(c->cur_block, after_block);
4330 }
4331 vir_link_blocks(c->cur_block, else_block);
4332
4333 vir_set_emit_block(c, else_block);
4334 ntq_activate_execute_for_block(c);
4335 ntq_emit_cf_list(c, &if_stmt->else_list);
4336 }
4337
4338 vir_link_blocks(c->cur_block, after_block);
4339
4340 vir_set_emit_block(c, after_block);
4341 if (was_uniform_control_flow)
4342 c->execute = c->undef;
4343 else
4344 ntq_activate_execute_for_block(c);
4345 }
4346
4347 static void
4348 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
4349 {
4350 bool was_in_control_flow = c->in_control_flow;
4351 c->in_control_flow = true;
4352 if (!vir_in_nonuniform_control_flow(c) &&
4353 !nir_src_is_divergent(nif->condition)) {
4354 ntq_emit_uniform_if(c, nif);
4355 } else {
4356 ntq_emit_nonuniform_if(c, nif);
4357 }
4358 c->in_control_flow = was_in_control_flow;
4359 }
4360
4361 static void
4362 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
4363 {
4364 switch (jump->type) {
4365 case nir_jump_break:
4366 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4367 V3D_QPU_PF_PUSHZ);
4368 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4369 vir_uniform_ui(c, c->loop_break_block->index));
4370 break;
4371
4372 case nir_jump_continue:
4373 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4374 V3D_QPU_PF_PUSHZ);
4375 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4376 vir_uniform_ui(c, c->loop_cont_block->index));
4377 break;
4378
4379 case nir_jump_return:
4380 unreachable("All returns should be lowered\n");
4381 break;
4382
4383 case nir_jump_halt:
4384 case nir_jump_goto:
4385 case nir_jump_goto_if:
4386 unreachable("not supported\n");
4387 break;
4388 }
4389 }
4390
4391 static void
4392 ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
4393 {
4394 switch (jump->type) {
4395 case nir_jump_break:
4396 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4397 vir_link_blocks(c->cur_block, c->loop_break_block);
4398 c->cur_block->branch_emitted = true;
4399 break;
4400 case nir_jump_continue:
4401 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4402 vir_link_blocks(c->cur_block, c->loop_cont_block);
4403 c->cur_block->branch_emitted = true;
4404 break;
4405
4406 case nir_jump_return:
4407 unreachable("All returns should be lowered\n");
4408 break;
4409
4410 case nir_jump_halt:
4411 case nir_jump_goto:
4412 case nir_jump_goto_if:
4413 unreachable("not supported\n");
4414 break;
4415 }
4416 }
4417
4418 static void
4419 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
4420 {
4421 switch (instr->type) {
4422 case nir_instr_type_alu:
4423 ntq_emit_alu(c, nir_instr_as_alu(instr));
4424 break;
4425
4426 case nir_instr_type_intrinsic:
4427 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
4428 break;
4429
4430 case nir_instr_type_load_const:
4431 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
4432 break;
4433
4434 case nir_instr_type_undef:
4435 unreachable("Should've been lowered by nir_lower_undef_to_zero");
4436 break;
4437
4438 case nir_instr_type_tex:
4439 ntq_emit_tex(c, nir_instr_as_tex(instr));
4440 break;
4441
4442 case nir_instr_type_jump:
4443 /* Always flush TMU before jumping to another block, for the
4444 * same reasons as in ntq_emit_block.
4445 */
4446 ntq_flush_tmu(c);
4447 if (vir_in_nonuniform_control_flow(c))
4448 ntq_emit_jump(c, nir_instr_as_jump(instr));
4449 else
4450 ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
4451 break;
4452
4453 default:
4454 fprintf(stderr, "Unknown NIR instr type: ");
4455 nir_print_instr(instr, stderr);
4456 fprintf(stderr, "\n");
4457 abort();
4458 }
4459 }
4460
4461 static void
4462 ntq_emit_block(struct v3d_compile *c, nir_block *block)
4463 {
4464 nir_foreach_instr(instr, block) {
4465 ntq_emit_instr(c, instr);
4466 }
4467
4468 /* Always process pending TMU operations in the same block they were
4469 * emitted: we can't emit TMU operations in a block and then emit a
4470 * thread switch and LDTMU/TMUWT for them in another block, possibly
4471 * under control flow.
4472 */
4473 ntq_flush_tmu(c);
4474 }
4475
4476 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
4477
4478 static void
4479 ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
4480 {
4481 bool was_uniform_control_flow = false;
4482 if (!vir_in_nonuniform_control_flow(c)) {
4483 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4484 was_uniform_control_flow = true;
4485 }
4486
4487 c->loop_cont_block = vir_new_block(c);
4488 c->loop_break_block = vir_new_block(c);
4489
4490 vir_link_blocks(c->cur_block, c->loop_cont_block);
4491 vir_set_emit_block(c, c->loop_cont_block);
4492 ntq_activate_execute_for_block(c);
4493
4494 ntq_emit_cf_list(c, &loop->body);
4495
4496 /* Re-enable any previous continues now, so our ANYA check below
4497 * works.
4498 *
4499 * XXX: Use the .ORZ flags update, instead.
4500 */
4501 vir_set_pf(c, vir_XOR_dest(c,
4502 vir_nop_reg(),
4503 c->execute,
4504 vir_uniform_ui(c, c->loop_cont_block->index)),
4505 V3D_QPU_PF_PUSHZ);
4506 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4507
4508 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
4509
4510 struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
4511 /* Pixels that were not dispatched or have been discarded should not
4512 * contribute to looping again.
4513 */
4514 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4515 vir_link_blocks(c->cur_block, c->loop_cont_block);
4516 vir_link_blocks(c->cur_block, c->loop_break_block);
4517
4518 vir_set_emit_block(c, c->loop_break_block);
4519 if (was_uniform_control_flow)
4520 c->execute = c->undef;
4521 else
4522 ntq_activate_execute_for_block(c);
4523 }
4524
4525 static void
4526 ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
4527 {
4528 c->loop_cont_block = vir_new_block(c);
4529 c->loop_break_block = vir_new_block(c);
4530
4531 vir_link_blocks(c->cur_block, c->loop_cont_block);
4532 vir_set_emit_block(c, c->loop_cont_block);
4533
4534 ntq_emit_cf_list(c, &loop->body);
4535
4536 if (!c->cur_block->branch_emitted) {
4537 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4538 vir_link_blocks(c->cur_block, c->loop_cont_block);
4539 }
4540
4541 vir_set_emit_block(c, c->loop_break_block);
4542 }
4543
4544 static void
4545 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
4546 {
4547 assert(!nir_loop_has_continue_construct(loop));
4548
4549 /* Disable flags optimization for loop conditions. The problem here is
4550 * that we can have code like this:
4551 *
4552 * // block_0
4553 * vec1 32 con ssa_9 = ine32 ssa_8, ssa_2
4554 * loop {
4555 * // block_1
4556 * if ssa_9 {
4557 *
4558 * In this example we emit flags to compute ssa_9 and the optimization
4559          * will skip regenerating them for the loop condition in the
4560 * loop continue block (block_1). However, this is not safe after the
4561 * first iteration because the loop body can stomp the flags if it has
4562 * any conditionals.
4563 */
4564 c->flags_temp = -1;
4565
4566 bool was_in_control_flow = c->in_control_flow;
4567 c->in_control_flow = true;
4568
4569 struct qblock *save_loop_cont_block = c->loop_cont_block;
4570 struct qblock *save_loop_break_block = c->loop_break_block;
4571
4572 if (vir_in_nonuniform_control_flow(c) || loop->divergent) {
4573 ntq_emit_nonuniform_loop(c, loop);
4574 } else {
4575 ntq_emit_uniform_loop(c, loop);
4576 }
4577
4578 c->loop_break_block = save_loop_break_block;
4579 c->loop_cont_block = save_loop_cont_block;
4580
4581 c->loops++;
4582
4583 c->in_control_flow = was_in_control_flow;
4584 }
4585
4586 static void
4587 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
4588 {
4589 fprintf(stderr, "FUNCTIONS not handled.\n");
4590 abort();
4591 }
4592
4593 static void
4594 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
4595 {
4596 foreach_list_typed(nir_cf_node, node, node, list) {
4597 switch (node->type) {
4598 case nir_cf_node_block:
4599 ntq_emit_block(c, nir_cf_node_as_block(node));
4600 break;
4601
4602 case nir_cf_node_if:
4603 ntq_emit_if(c, nir_cf_node_as_if(node));
4604 break;
4605
4606 case nir_cf_node_loop:
4607 ntq_emit_loop(c, nir_cf_node_as_loop(node));
4608 break;
4609
4610 case nir_cf_node_function:
4611 ntq_emit_function(c, nir_cf_node_as_function(node));
4612 break;
4613
4614 default:
4615 fprintf(stderr, "Unknown NIR node type\n");
4616 abort();
4617 }
4618 }
4619 }
4620
4621 static void
4622 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
4623 {
4624 ntq_setup_registers(c, impl);
4625 ntq_emit_cf_list(c, &impl->body);
4626 }
4627
4628 static bool
4629 vir_inst_reads_reg(struct qinst *inst, struct qreg r)
4630 {
4631 for (int i = 0; i < vir_get_nsrc(inst); i++) {
4632 if (inst->src[i].file == r.file && inst->src[i].index == r.index)
4633 return true;
4634 }
4635 return false;
4636 }
4637
4638 static void
4639 sched_flags_in_block(struct v3d_compile *c, struct qblock *block)
4640 {
4641 struct qinst *flags_inst = NULL;
4642 list_for_each_entry_safe_rev(struct qinst, inst, &block->instructions, link) {
4643 /* Check for cases that would prevent us from moving a flags
4644 * instruction any earlier than this instruction:
4645 *
4646 * - The flags instruction reads the result of this instr.
4647 * - The instruction reads or writes flags.
4648 */
4649 if (flags_inst) {
4650 if (vir_inst_reads_reg(flags_inst, inst->dst) ||
4651 v3d_qpu_writes_flags(&inst->qpu) ||
4652 v3d_qpu_reads_flags(&inst->qpu)) {
4653 list_move_to(&flags_inst->link, &inst->link);
4654 flags_inst = NULL;
4655 }
4656 }
4657
4658 /* Skip if this instruction does more than just write flags */
4659 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
4660 inst->dst.file != QFILE_NULL ||
4661 !v3d_qpu_writes_flags(&inst->qpu)) {
4662 continue;
4663 }
4664
4665 /* If we already had a flags_inst we should've moved it after
4666 * this instruction in the if (flags_inst) above.
4667 */
4668 assert(!flags_inst);
4669 flags_inst = inst;
4670 }
4671
4672 /* If we reached the beginning of the block and we still have a flags
4673 * instruction selected we can put it at the top of the block.
4674 */
4675 if (flags_inst) {
4676 list_move_to(&flags_inst->link, &block->instructions);
4677 flags_inst = NULL;
4678 }
4679 }
4680
4681 /**
4682  * The purpose of this pass is to schedule instructions that are only
4683  * concerned with producing flags as early as possible, to hopefully reduce
4684  * the liveness of their source arguments.
4685 */
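/* It does this by walking each block in reverse, picking up instructions that
 * do nothing but write flags, and floating them upwards until they hit an
 * instruction that defines one of their sources or that reads or writes flags
 * itself.
 */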
4686 static void
4687 sched_flags(struct v3d_compile *c)
4688 {
4689 vir_for_each_block(block, c)
4690 sched_flags_in_block(c, block);
4691 }
4692
4693 static void
4694 nir_to_vir(struct v3d_compile *c)
4695 {
4696 switch (c->s->info.stage) {
4697 case MESA_SHADER_FRAGMENT:
4698 c->start_msf = vir_MSF(c);
4699 if (c->devinfo->ver < 71)
4700 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
4701 else
4702 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
4703
4704 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
4705 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
4706
4707 /* V3D 4.x can disable implicit varyings if they are not used */
4708 c->fs_uses_primitive_id =
4709 nir_find_variable_with_location(c->s, nir_var_shader_in,
4710 VARYING_SLOT_PRIMITIVE_ID);
4711 if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
4712 c->primitive_id =
4713 emit_fragment_varying(c, NULL, -1, 0, 0);
4714 }
4715
4716 if (c->fs_key->is_points && program_reads_point_coord(c)) {
4717 c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4718 c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
4719 c->uses_implicit_point_line_varyings = true;
4720 } else if (c->fs_key->is_lines &&
4721 (BITSET_TEST(c->s->info.system_values_read,
4722 SYSTEM_VALUE_LINE_COORD))) {
4723 c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4724 c->uses_implicit_point_line_varyings = true;
4725 }
4726 break;
4727 case MESA_SHADER_COMPUTE:
4728 /* Set up the TSO for barriers, assuming we do some. */
4729 if (c->devinfo->ver < 42) {
4730 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
4731 V3D_QPU_WADDR_SYNC));
4732 }
4733
4734 if (c->devinfo->ver == 42) {
4735 c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
4736 c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4737 } else if (c->devinfo->ver >= 71) {
4738 c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
4739 c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4740 }
4741
4742 /* Set up the division between gl_LocalInvocationIndex and
4743 * wg_in_mem in the payload reg.
4744 */
4745 int wg_size = (c->s->info.workgroup_size[0] *
4746 c->s->info.workgroup_size[1] *
4747 c->s->info.workgroup_size[2]);
4748 c->local_invocation_index_bits =
4749 ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
4750 assert(c->local_invocation_index_bits <= 8);
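                /* Illustration: for a 64-invocation workgroup this is 6 bits,
                 * so payload[1] carries the local invocation index in its top
                 * 6 bits, wg_in_mem below that down to bit 16, and the Z
                 * workgroup ID in the low 16 bits.
                 */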
4751
4752 if (c->s->info.shared_size || c->s->info.cs.has_variable_shared_mem) {
4753 struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
4754 vir_uniform_ui(c, 16));
4755 if (c->s->info.workgroup_size[0] != 1 ||
4756 c->s->info.workgroup_size[1] != 1 ||
4757 c->s->info.workgroup_size[2] != 1) {
4758 int wg_bits = (16 -
4759 c->local_invocation_index_bits);
4760 int wg_mask = (1 << wg_bits) - 1;
4761 wg_in_mem = vir_AND(c, wg_in_mem,
4762 vir_uniform_ui(c, wg_mask));
4763 }
4764
4765 struct qreg shared_per_wg;
4766 if (c->s->info.cs.has_variable_shared_mem) {
4767 shared_per_wg = vir_uniform(c, QUNIFORM_SHARED_SIZE, 0);
4768 } else {
4769 shared_per_wg = vir_uniform_ui(c, c->s->info.shared_size);
4770 }
4771
4772 c->cs_shared_offset =
4773 vir_ADD(c,
4774 vir_uniform(c, QUNIFORM_SHARED_OFFSET,0),
4775 vir_UMUL(c, wg_in_mem, shared_per_wg));
4776 }
4777 break;
4778 default:
4779 break;
4780 }
4781
4782 if (c->s->scratch_size) {
4783 v3d_setup_spill_base(c);
4784 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
4785 }
4786
4787 switch (c->s->info.stage) {
4788 case MESA_SHADER_VERTEX:
4789 ntq_setup_vs_inputs(c);
4790 break;
4791 case MESA_SHADER_GEOMETRY:
4792 ntq_setup_gs_inputs(c);
4793 break;
4794 case MESA_SHADER_FRAGMENT:
4795 ntq_setup_fs_inputs(c);
4796 break;
4797 case MESA_SHADER_COMPUTE:
4798 break;
4799 default:
4800 unreachable("unsupported shader stage");
4801 }
4802
4803 ntq_setup_outputs(c);
4804
4805 /* Find the main function and emit the body. */
4806 nir_foreach_function(function, c->s) {
4807 assert(function->is_entrypoint);
4808 assert(function->impl);
4809 ntq_emit_impl(c, function->impl);
4810 }
4811 }
4812
4813 /**
4814 * When demoting a shader down to single-threaded, removes the THRSW
4815 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
4816 * program end).
4817 */
4818 static void
4819 vir_remove_thrsw(struct v3d_compile *c)
4820 {
4821 vir_for_each_block(block, c) {
4822 vir_for_each_inst_safe(inst, block) {
4823 if (inst->qpu.sig.thrsw)
4824 vir_remove_instruction(c, inst);
4825 }
4826 }
4827
4828 c->last_thrsw = NULL;
4829 }
4830
4831 /**
4832 * This makes sure we have a top-level last thread switch which signals the
4833 * start of the last thread section, which may include adding a new thrsw
4834 * instruction if needed. We don't allow spilling in the last thread section, so
4835 * if we need to do any spills that inject additional thread switches later on,
4836 * we ensure this thread switch will still be the last thread switch in the
4837 * program, which makes last thread switch signalling a lot easier when we have
4838 * spilling. If in the end we don't need to spill to compile the program and we
4839 * injected a new thread switch instruction here only for that, we will
4840 * eventually restore the previous last thread switch and remove the one we
4841 * added here.
4842 */
4843 static void
4844 vir_emit_last_thrsw(struct v3d_compile *c,
4845 struct qinst **restore_last_thrsw,
4846 bool *restore_scoreboard_lock)
4847 {
4848 *restore_last_thrsw = c->last_thrsw;
4849
4850 /* If we're threaded and the last THRSW was in conditional code, then
4851 * we need to emit another one so that we can flag it as the last
4852 * thrsw.
4853 */
4854 if (c->last_thrsw && !c->last_thrsw_at_top_level)
4855 vir_emit_thrsw(c);
4856
4857 /* If we're threaded, then we need to mark the last THRSW instruction
4858 * so we can emit a pair of them at QPU emit time.
4859 *
4860 * For V3D 4.x, we can spawn the non-fragment shaders already in the
4861 * post-last-THRSW state, so we can skip this.
4862 */
4863 if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT)
4864 vir_emit_thrsw(c);
4865
4866 /* If we have not inserted a last thread switch yet, do it now to ensure
4867 * any potential spilling we do happens before this. If we don't spill
4868 * in the end, we will restore the previous one.
4869 */
4870 if (*restore_last_thrsw == c->last_thrsw) {
4871 if (*restore_last_thrsw)
4872 (*restore_last_thrsw)->is_last_thrsw = false;
4873 *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw;
4874 vir_emit_thrsw(c);
4875 } else {
4876 *restore_last_thrsw = c->last_thrsw;
4877 }
4878
4879 assert(c->last_thrsw);
4880 c->last_thrsw->is_last_thrsw = true;
4881 }
4882
4883 static void
4884 vir_restore_last_thrsw(struct v3d_compile *c,
4885 struct qinst *thrsw,
4886 bool scoreboard_lock)
4887 {
4888 assert(c->last_thrsw);
4889 vir_remove_instruction(c, c->last_thrsw);
4890 c->last_thrsw = thrsw;
4891 if (c->last_thrsw)
4892 c->last_thrsw->is_last_thrsw = true;
4893 c->lock_scoreboard_on_first_thrsw = scoreboard_lock;
4894 }
4895
4896 /* There's a flag in the shader for "center W is needed for reasons other than
4897 * non-centroid varyings", so we just walk the program after VIR optimization
4898 * to see if it's used. It should be harmless to set even if we only use
4899 * center W for varyings.
4900 */
4901 static void
4902 vir_check_payload_w(struct v3d_compile *c)
4903 {
4904 if (c->s->info.stage != MESA_SHADER_FRAGMENT)
4905 return;
4906
4907 vir_for_each_inst_inorder(inst, c) {
4908 for (int i = 0; i < vir_get_nsrc(inst); i++) {
4909 if (inst->src[i].file == c->payload_w.file &&
4910 inst->src[i].index == c->payload_w.index) {
4911 c->uses_center_w = true;
4912 return;
4913 }
4914 }
4915 }
4916 }
4917
4918 void
4919 v3d_nir_to_vir(struct v3d_compile *c)
4920 {
4921 if (V3D_DBG(NIR) ||
4922 v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4923 fprintf(stderr, "%s prog %d/%d NIR:\n",
4924 vir_get_stage_name(c),
4925 c->program_id, c->variant_id);
4926 nir_print_shader(c->s, stderr);
4927 }
4928
4929 nir_to_vir(c);
4930
4931 bool restore_scoreboard_lock = false;
4932 struct qinst *restore_last_thrsw;
4933
4934 /* Emit the last THRSW before STVPM and TLB writes. */
4935 vir_emit_last_thrsw(c,
4936 &restore_last_thrsw,
4937 &restore_scoreboard_lock);
4938
4939
4940 switch (c->s->info.stage) {
4941 case MESA_SHADER_FRAGMENT:
4942 emit_frag_end(c);
4943 break;
4944 case MESA_SHADER_GEOMETRY:
4945 emit_geom_end(c);
4946 break;
4947 case MESA_SHADER_VERTEX:
4948 emit_vert_end(c);
4949 break;
4950 case MESA_SHADER_COMPUTE:
4951 break;
4952 default:
4953 unreachable("bad stage");
4954 }
4955
4956 if (V3D_DBG(VIR) ||
4957 v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4958 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
4959 vir_get_stage_name(c),
4960 c->program_id, c->variant_id);
4961 vir_dump(c);
4962 fprintf(stderr, "\n");
4963 }
4964
4965 vir_optimize(c);
4966 sched_flags(c);
4967
4968 vir_check_payload_w(c);
4969
4970 /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
4971 * We used that on that platform to pipeline TMU writes and reduce the
4972 * number of thread switches, as well as try (mostly successfully) to
4973 * reduce maximum register pressure to allow more threads. We should
4974 * do something of that sort for V3D -- either instruction scheduling
4975          * here, or delay the THRSW and LDTMUs from our texture
4976 * instructions until the results are needed.
4977 */
4978
4979 if (V3D_DBG(VIR) ||
4980 v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4981 fprintf(stderr, "%s prog %d/%d VIR:\n",
4982 vir_get_stage_name(c),
4983 c->program_id, c->variant_id);
4984 vir_dump(c);
4985 fprintf(stderr, "\n");
4986 }
4987
4988 /* Attempt to allocate registers for the temporaries. If we fail,
4989 * reduce thread count and try again.
4990 */
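        /* Each failed attempt halves c->threads before retrying; if we ever
         * drop to a single thread, the THRSW instructions are removed since
         * there is no other thread to switch to.
         */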
4991 int min_threads = 2;
4992 struct qpu_reg *temp_registers;
4993 while (true) {
4994 temp_registers = v3d_register_allocate(c);
4995 if (temp_registers) {
4996 assert(c->spills + c->fills <= c->max_tmu_spills);
4997 break;
4998 }
4999
5000 if (c->threads == min_threads &&
5001 V3D_DBG(RA)) {
5002 fprintf(stderr,
5003 "Failed to register allocate using %s\n",
5004                                 c->fallback_scheduler ? "the fallback scheduler" :
5005                                 "the normal scheduler");
5006
5007 vir_dump(c);
5008
5009 char *shaderdb;
5010 int ret = v3d_shaderdb_dump(c, &shaderdb);
5011 if (ret > 0) {
5012 fprintf(stderr, "%s\n", shaderdb);
5013 free(shaderdb);
5014 }
5015 }
5016
5017 if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
5018 if (V3D_DBG(PERF)) {
5019 fprintf(stderr,
5020 "Failed to register allocate %s "
5021 "prog %d/%d at %d threads.\n",
5022 vir_get_stage_name(c),
5023 c->program_id, c->variant_id, c->threads);
5024 }
5025 c->compilation_result =
5026 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
5027 return;
5028 }
5029
5030 c->spills = 0;
5031 c->fills = 0;
5032 c->threads /= 2;
5033
5034 if (c->threads == 1)
5035 vir_remove_thrsw(c);
5036 }
5037
5038 /* If we didn't spill, then remove the last thread switch we injected
5039 * artificially (if any) and restore the previous one.
5040 */
5041 if (!c->spills && c->last_thrsw != restore_last_thrsw)
5042 vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);
5043
5044 if (c->spills &&
5045 (V3D_DBG(VIR) ||
5046 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
5047 fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
5048 vir_get_stage_name(c),
5049 c->program_id, c->variant_id);
5050 vir_dump(c);
5051 fprintf(stderr, "\n");
5052 }
5053
5054 v3d_vir_to_qpu(c, temp_registers);
5055 }
5056