xref: /aosp_15_r20/external/mesa3d/src/broadcom/compiler/nir_to_vir.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2016 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <inttypes.h>
25 #include "util/format/u_format.h"
26 #include "util/u_helpers.h"
27 #include "util/u_math.h"
28 #include "util/u_memory.h"
29 #include "util/ralloc.h"
30 #include "util/hash_table.h"
31 #include "compiler/nir/nir.h"
32 #include "compiler/nir/nir_builder.h"
33 #include "common/v3d_device_info.h"
34 #include "v3d_compiler.h"
35 
36 /* We don't do any address packing. */
37 #define __gen_user_data void
38 #define __gen_address_type uint32_t
39 #define __gen_address_offset(reloc) (*reloc)
40 #define __gen_emit_reloc(cl, reloc)
41 #include "cle/v3d_packet_v42_pack.h"
42 
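/* Fields of the TMU general lookup config word: the data type lives in the
 * low bits and the per-quad/per-pixel select is bit 7, with the TMU op
 * shifted into place when the config is assembled in ntq_emit_tmu_general()
 * (e.g. a two-component 32-bit load ends up as
 * 0xffffff00 | tmu_op << 3 | GENERAL_TMU_LOOKUP_PER_PIXEL |
 * GENERAL_TMU_LOOKUP_TYPE_VEC2).
 */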
43 #define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
44 #define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
45 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
46 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
47 #define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
48 #define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
49 #define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
50 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
51 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
52 #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)
53 
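/* Operations for the TSY (thread synchronization) unit, used to implement
 * barriers and semaphores.
 */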
54 #define V3D_TSY_SET_QUORUM          0
55 #define V3D_TSY_INC_WAITERS         1
56 #define V3D_TSY_DEC_WAITERS         2
57 #define V3D_TSY_INC_QUORUM          3
58 #define V3D_TSY_DEC_QUORUM          4
59 #define V3D_TSY_FREE_ALL            5
60 #define V3D_TSY_RELEASE             6
61 #define V3D_TSY_ACQUIRE             7
62 #define V3D_TSY_WAIT                8
63 #define V3D_TSY_WAIT_INC            9
64 #define V3D_TSY_WAIT_CHECK          10
65 #define V3D_TSY_WAIT_INC_CHECK      11
66 #define V3D_TSY_WAIT_CV             12
67 #define V3D_TSY_INC_SEMAPHORE       13
68 #define V3D_TSY_DEC_SEMAPHORE       14
69 #define V3D_TSY_SET_QUORUM_FREE_ALL 15
70 
71 enum v3d_tmu_op_type
72 {
73         V3D_TMU_OP_TYPE_REGULAR,
74         V3D_TMU_OP_TYPE_ATOMIC,
75         V3D_TMU_OP_TYPE_CACHE
76 };
77 
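/* Classifies a TMU op. Note that some op encodings are overloaded: on writes
 * they behave as atomics, while on reads they are cache operations, so the
 * access direction is needed to tell them apart.
 */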
78 static enum v3d_tmu_op_type
79 v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
80 {
81         switch(tmu_op) {
82         case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
83         case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
84         case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
85         case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
86         case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
87                 return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
88         case V3D_TMU_OP_WRITE_UMAX:
89         case V3D_TMU_OP_WRITE_SMIN:
90         case V3D_TMU_OP_WRITE_SMAX:
91                 assert(is_write);
92                 FALLTHROUGH;
93         case V3D_TMU_OP_WRITE_AND_READ_INC:
94         case V3D_TMU_OP_WRITE_OR_READ_DEC:
95         case V3D_TMU_OP_WRITE_XOR_READ_NOT:
96                 return V3D_TMU_OP_TYPE_ATOMIC;
97         case V3D_TMU_OP_REGULAR:
98                 return V3D_TMU_OP_TYPE_REGULAR;
99 
100         default:
101                 unreachable("Unknown tmu_op\n");
102         }
103 }
104 static void
105 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
106 
107 static void
108 resize_qreg_array(struct v3d_compile *c,
109                   struct qreg **regs,
110                   uint32_t *size,
111                   uint32_t decl_size)
112 {
113         if (*size >= decl_size)
114                 return;
115 
116         uint32_t old_size = *size;
117         *size = MAX2(*size * 2, decl_size);
118         *regs = reralloc(c, *regs, struct qreg, *size);
119         if (!*regs) {
120                 fprintf(stderr, "Malloc failure\n");
121                 abort();
122         }
123 
124         for (uint32_t i = old_size; i < *size; i++)
125                 (*regs)[i] = c->undef;
126 }
127 
128 static void
129 resize_interp_array(struct v3d_compile *c,
130                     struct v3d_interp_input **regs,
131                     uint32_t *size,
132                     uint32_t decl_size)
133 {
134         if (*size >= decl_size)
135                 return;
136 
137         uint32_t old_size = *size;
138         *size = MAX2(*size * 2, decl_size);
139         *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
140         if (!*regs) {
141                 fprintf(stderr, "Malloc failure\n");
142                 abort();
143         }
144 
145         for (uint32_t i = old_size; i < *size; i++) {
146                 (*regs)[i].vp = c->undef;
147                 (*regs)[i].C = c->undef;
148         }
149 }
150 
151 void
152 vir_emit_thrsw(struct v3d_compile *c)
153 {
154         if (c->threads == 1)
155                 return;
156 
157         /* Always thread switch after each texture operation for now.
158          *
159          * We could do better by batching a bunch of texture fetches up and
160          * then doing one thread switch and collecting all their results
161          * afterward.
162          */
163         c->last_thrsw = vir_NOP(c);
164         c->last_thrsw->qpu.sig.thrsw = true;
165         c->last_thrsw_at_top_level = !c->in_control_flow;
166 
167         /* We need to lock the scoreboard before any tlb access happens. If this
168          * thread switch comes after we have emitted a tlb load, then it means
169          * that we can't lock on the last thread switch any more.
170          */
171         if (c->emitted_tlb_load)
172                 c->lock_scoreboard_on_first_thrsw = true;
173 }
174 
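/* Picks the TMU op for an atomic add: constant adds of +1/-1 can use the
 * dedicated increment/decrement ops (which take no data operand), anything
 * else falls back to the generic add op.
 */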
175 uint32_t
176 v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
177 {
178         if (nir_src_is_const(instr->src[src])) {
179                 int64_t add_val = nir_src_as_int(instr->src[src]);
180                 if (add_val == 1)
181                         return V3D_TMU_OP_WRITE_AND_READ_INC;
182                 else if (add_val == -1)
183                         return V3D_TMU_OP_WRITE_OR_READ_DEC;
184         }
185 
186         return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
187 }
188 
189 static uint32_t
190 v3d_general_tmu_op_for_atomic(nir_intrinsic_instr *instr)
191 {
192         nir_atomic_op atomic_op = nir_intrinsic_atomic_op(instr);
193         switch (atomic_op) {
194         case nir_atomic_op_iadd:
195                 return  instr->intrinsic == nir_intrinsic_ssbo_atomic ?
196                         v3d_get_op_for_atomic_add(instr, 2) :
197                         v3d_get_op_for_atomic_add(instr, 1);
198         case nir_atomic_op_imin:    return V3D_TMU_OP_WRITE_SMIN;
199         case nir_atomic_op_umin:    return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
200         case nir_atomic_op_imax:    return V3D_TMU_OP_WRITE_SMAX;
201         case nir_atomic_op_umax:    return V3D_TMU_OP_WRITE_UMAX;
202         case nir_atomic_op_iand:    return V3D_TMU_OP_WRITE_AND_READ_INC;
203         case nir_atomic_op_ior:     return V3D_TMU_OP_WRITE_OR_READ_DEC;
204         case nir_atomic_op_ixor:    return V3D_TMU_OP_WRITE_XOR_READ_NOT;
205         case nir_atomic_op_xchg:    return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
206         case nir_atomic_op_cmpxchg: return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
207         default:                    unreachable("unknown atomic op");
208         }
209 }
210 
211 static uint32_t
212 v3d_general_tmu_op(nir_intrinsic_instr *instr)
213 {
214         switch (instr->intrinsic) {
215         case nir_intrinsic_load_ssbo:
216         case nir_intrinsic_load_ubo:
217         case nir_intrinsic_load_uniform:
218         case nir_intrinsic_load_shared:
219         case nir_intrinsic_load_scratch:
220         case nir_intrinsic_load_global:
221         case nir_intrinsic_load_global_constant:
222         case nir_intrinsic_store_ssbo:
223         case nir_intrinsic_store_shared:
224         case nir_intrinsic_store_scratch:
225         case nir_intrinsic_store_global:
226                 return V3D_TMU_OP_REGULAR;
227 
228         case nir_intrinsic_ssbo_atomic:
229         case nir_intrinsic_ssbo_atomic_swap:
230         case nir_intrinsic_shared_atomic:
231         case nir_intrinsic_shared_atomic_swap:
232         case nir_intrinsic_global_atomic:
233         case nir_intrinsic_global_atomic_swap:
234                 return v3d_general_tmu_op_for_atomic(instr);
235 
236         default:
237                 unreachable("unknown intrinsic op");
238         }
239 }
240 
241 /**
242  * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
243  * would overflow the Output TMU fifo.
244  *
245  * It is not allowed to overflow the Output fifo; however, we can overflow
246  * the Input and Config fifos. Doing that makes the shader stall, but only
247  * for as long as it needs to be able to continue, so for pipelining it is
248  * better to let the QPU stall on these if needed than to try to emit TMU
249  * flushes in the driver.
250  */
251 bool
252 ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
253 {
254         if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
255                 return true;
256 
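        /* The 16-entry output fifo is shared between the threads running on
         * the QPU, so e.g. with 4 threads each thread can only have 4
         * outstanding output words.
         */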
257         return components > 0 &&
258                c->tmu.output_fifo_size + components > 16 / c->threads;
259 }
260 
261 /**
262  * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
263  * popping all TMU fifo entries.
264  */
265 void
266 ntq_flush_tmu(struct v3d_compile *c)
267 {
268         if (c->tmu.flush_count == 0)
269                 return;
270 
271         vir_emit_thrsw(c);
272 
273         bool emitted_tmuwt = false;
274         for (int i = 0; i < c->tmu.flush_count; i++) {
275                 if (c->tmu.flush[i].component_mask > 0) {
276                         nir_def *def = c->tmu.flush[i].def;
277                         assert(def);
278 
279                         for (int j = 0; j < 4; j++) {
280                                 if (c->tmu.flush[i].component_mask & (1 << j)) {
281                                         ntq_store_def(c, def, j,
282                                                       vir_MOV(c, vir_LDTMU(c)));
283                                 }
284                         }
285                 } else if (!emitted_tmuwt) {
286                         vir_TMUWT(c);
287                         emitted_tmuwt = true;
288                 }
289         }
290 
291         c->tmu.output_fifo_size = 0;
292         c->tmu.flush_count = 0;
293         _mesa_set_clear(c->tmu.outstanding_regs, NULL);
294 }
295 
296 /**
297  * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
298  * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
299  * and more specifically, the output fifo, since that can't stall.
300  */
301 void
302 ntq_add_pending_tmu_flush(struct v3d_compile *c,
303                           nir_def *def,
304                           uint32_t component_mask)
305 {
306         const uint32_t num_components = util_bitcount(component_mask);
307         assert(!ntq_tmu_fifo_overflow(c, num_components));
308 
309         if (num_components > 0) {
310                 c->tmu.output_fifo_size += num_components;
311 
312                 nir_intrinsic_instr *store = nir_store_reg_for_def(def);
313                 if (store != NULL) {
314                         nir_def *reg = store->src[1].ssa;
315                         _mesa_set_add(c->tmu.outstanding_regs, reg);
316                 }
317         }
318 
319         c->tmu.flush[c->tmu.flush_count].def = def;
320         c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
321         c->tmu.flush_count++;
322         c->tmu.total_count++;
323 
324         if (c->disable_tmu_pipelining)
325                 ntq_flush_tmu(c);
326         else if (c->tmu.flush_count > 1)
327                 c->pipelined_any_tmu = true;
328 }
329 
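/* The TMU general path below is run in two passes: MODE_COUNT only counts the
 * register writes that will be needed and resolves dependencies on
 * outstanding TMU operations, and MODE_EMIT then emits the actual writes
 * (see ntq_emit_tmu_general()).
 */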
330 enum emit_mode {
331     MODE_COUNT = 0,
332     MODE_EMIT,
333     MODE_LAST,
334 };
335 
336 /**
337  * For a TMU general store instruction:
338  *
339  * In MODE_COUNT mode, records the number of TMU writes required and flushes
340  * any outstanding TMU operations the instruction depends on, but it doesn't
341  * emit any actual register writes.
342  *
343  * In MODE_EMIT mode, emits the data register writes required by the
344  * instruction.
345  */
346 static void
347 emit_tmu_general_store_writes(struct v3d_compile *c,
348                               enum emit_mode mode,
349                               nir_intrinsic_instr *instr,
350                               uint32_t base_const_offset,
351                               uint32_t *writemask,
352                               uint32_t *const_offset,
353                               uint32_t *type_size,
354                               uint32_t *tmu_writes)
355 {
356         struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
357 
358         /* Find the first set of consecutive components that
359          * are enabled in the writemask and emit the TMUD
360          * instructions for them.
361          */
362         assert(*writemask != 0);
363         uint32_t first_component = ffs(*writemask) - 1;
364         uint32_t last_component = first_component;
365         while (*writemask & BITFIELD_BIT(last_component + 1))
366                 last_component++;
367 
368         assert(first_component <= last_component &&
369                last_component < instr->num_components);
370 
371         for (int i = first_component; i <= last_component; i++) {
372                 struct qreg data = ntq_get_src(c, instr->src[0], i);
373                 if (mode == MODE_COUNT)
374                         (*tmu_writes)++;
375                 else
376                         vir_MOV_dest(c, tmud, data);
377         }
378 
379         if (mode == MODE_EMIT) {
380                 /* Update the offset for the TMU write based on the
381                  * first component we are writing.
382                  */
383                 *type_size = nir_src_bit_size(instr->src[0]) / 8;
384                 *const_offset =
385                         base_const_offset + first_component * (*type_size);
386 
387                 /* Clear these components from the writemask */
388                 uint32_t written_mask =
389                         BITFIELD_RANGE(first_component, *tmu_writes);
390                 (*writemask) &= ~written_mask;
391         }
392 }
393 
394 /**
395  * For a TMU general atomic instruction:
396  *
397  * In MODE_COUNT mode, records the number of TMU writes required and flushes
398  * any outstanding TMU operations the instruction depends on, but it doesn't
399  * emit any actual register writes.
400  *
401  * In MODE_EMIT mode, emits the data register writes required by the
402  * instruction.
403  */
404 static void
405 emit_tmu_general_atomic_writes(struct v3d_compile *c,
406                                enum emit_mode mode,
407                                nir_intrinsic_instr *instr,
408                                uint32_t tmu_op,
409                                bool has_index,
410                                uint32_t *tmu_writes)
411 {
412         struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
413 
414         struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
415         if (mode == MODE_COUNT)
416                 (*tmu_writes)++;
417         else
418                 vir_MOV_dest(c, tmud, data);
419 
420         if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
421                 data = ntq_get_src(c, instr->src[2 + has_index], 0);
422                 if (mode == MODE_COUNT)
423                         (*tmu_writes)++;
424                 else
425                         vir_MOV_dest(c, tmud, data);
426         }
427 }
428 
429 /**
430  * For any TMU general instruction:
431  *
432  * In MODE_COUNT mode, records the number of TMU writes required to emit the
433  * address parameter and flushes any outstanding TMU operations the instruction
434  * depends on, but it doesn't emit any actual register writes.
435  *
436  * In MODE_EMIT mode, emits register writes required to emit the address.
437  */
438 static void
439 emit_tmu_general_address_write(struct v3d_compile *c,
440                                enum emit_mode mode,
441                                nir_intrinsic_instr *instr,
442                                uint32_t config,
443                                bool dynamic_src,
444                                int offset_src,
445                                struct qreg base_offset,
446                                uint32_t const_offset,
447                                uint32_t dest_components,
448                                uint32_t *tmu_writes)
449 {
450         if (mode == MODE_COUNT) {
451                 (*tmu_writes)++;
452                 if (dynamic_src)
453                         ntq_get_src(c, instr->src[offset_src], 0);
454                 return;
455         }
456 
457         if (vir_in_nonuniform_control_flow(c)) {
458                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
459                            V3D_QPU_PF_PUSHZ);
460         }
461 
462         struct qreg tmua;
463         if (config == ~0)
464                 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
465         else
466                 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
467 
468         struct qinst *tmu;
469         if (dynamic_src) {
470                 struct qreg offset = base_offset;
471                 if (const_offset != 0) {
472                         offset = vir_ADD(c, offset,
473                                          vir_uniform_ui(c, const_offset));
474                 }
475                 struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
476                 tmu = vir_ADD_dest(c, tmua, offset, data);
477         } else {
478                 if (const_offset != 0) {
479                         tmu = vir_ADD_dest(c, tmua, base_offset,
480                                            vir_uniform_ui(c, const_offset));
481                 } else {
482                         tmu = vir_MOV_dest(c, tmua, base_offset);
483                 }
484         }
485 
486         if (config != ~0) {
487                 tmu->uniform =
488                         vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
489         }
490 
491         if (vir_in_nonuniform_control_flow(c))
492                 vir_set_cond(tmu, V3D_QPU_COND_IFA);
493 
494         tmu->ldtmu_count = dest_components;
495 }
496 
497 /**
498  * Implements indirect uniform loads and SSBO accesses through the TMU general
499  * memory access interface.
500  */
501 static void
502 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
503                      bool is_shared_or_scratch, bool is_global)
504 {
505         uint32_t tmu_op = v3d_general_tmu_op(instr);
506 
507         /* If we were able to replace an atomic_add with an inc/dec, then we
508          * can (and need to) do things slightly differently, like not loading
509          * the amount to add/sub, as that is implicit.
510          */
511         bool atomic_add_replaced =
512                 (instr->intrinsic == nir_intrinsic_ssbo_atomic ||
513                  instr->intrinsic == nir_intrinsic_shared_atomic ||
514                  instr->intrinsic == nir_intrinsic_global_atomic) &&
515                 nir_intrinsic_atomic_op(instr) == nir_atomic_op_iadd &&
516                  (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
517                   tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC);
518 
519         bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
520                          instr->intrinsic == nir_intrinsic_store_scratch ||
521                          instr->intrinsic == nir_intrinsic_store_shared ||
522                          instr->intrinsic == nir_intrinsic_store_global);
523 
524         bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
525                         instr->intrinsic == nir_intrinsic_load_ubo ||
526                         instr->intrinsic == nir_intrinsic_load_ssbo ||
527                         instr->intrinsic == nir_intrinsic_load_scratch ||
528                         instr->intrinsic == nir_intrinsic_load_shared ||
529                         instr->intrinsic == nir_intrinsic_load_global ||
530                         instr->intrinsic == nir_intrinsic_load_global_constant);
531 
532         if (!is_load)
533                 c->tmu_dirty_rcl = true;
534 
535         if (is_global)
536                 c->has_global_address = true;
537 
538         bool has_index = !is_shared_or_scratch && !is_global;
539 
540         int offset_src;
541         if (instr->intrinsic == nir_intrinsic_load_uniform) {
542                 offset_src = 0;
543         } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
544                    instr->intrinsic == nir_intrinsic_load_ubo ||
545                    instr->intrinsic == nir_intrinsic_load_scratch ||
546                    instr->intrinsic == nir_intrinsic_load_shared ||
547                    instr->intrinsic == nir_intrinsic_load_global ||
548                    instr->intrinsic == nir_intrinsic_load_global_constant ||
549                    atomic_add_replaced) {
550                 offset_src = 0 + has_index;
551         } else if (is_store) {
552                 offset_src = 1 + has_index;
553         } else {
554                 offset_src = 0 + has_index;
555         }
556 
557         bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
558         uint32_t const_offset = 0;
559         if (!dynamic_src)
560                 const_offset = nir_src_as_uint(instr->src[offset_src]);
561 
562         struct qreg base_offset;
563         if (instr->intrinsic == nir_intrinsic_load_uniform) {
564                 const_offset += nir_intrinsic_base(instr);
565                 base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
566                                           v3d_unit_data_create(0, const_offset));
567                 const_offset = 0;
568         } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
569                 /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 (0
570                  * is gallium's constant buffer 0 in GL and push constants
571                  * in Vulkan).
572                  */
573                 uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
574                 base_offset =
575                         vir_uniform(c, QUNIFORM_UBO_ADDR,
576                                     v3d_unit_data_create(index, const_offset));
577                 const_offset = 0;
578         } else if (is_shared_or_scratch) {
579                 /* Shared and scratch variables have no buffer index, and all
580                  * start from a common base that we set up at the start of
581                  * dispatch.
582                  */
583                 if (instr->intrinsic == nir_intrinsic_load_scratch ||
584                     instr->intrinsic == nir_intrinsic_store_scratch) {
585                         base_offset = c->spill_base;
586                 } else {
587                         base_offset = c->cs_shared_offset;
588                         const_offset += nir_intrinsic_base(instr);
589                 }
590         } else if (is_global) {
591                 /* Global load/store intrinsics use global addresses, so the
592                  * offset is the target address and we don't need to add it
593                  * to a base offset.
594                  */
595                 base_offset = vir_uniform_ui(c, 0);
596         } else {
597                 uint32_t idx = is_store ? 1 : 0;
598                 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
599                                           nir_src_comp_as_uint(instr->src[idx], 0));
600         }
601 
602         /* We are ready to emit TMU register writes now, but before we actually
603          * emit them we need to flush outstanding TMU operations if any of our
604          * writes reads from the result of an outstanding TMU operation before
605          * we start the TMU sequence for this operation, since otherwise the
606          * flush could happen in the middle of the TMU sequence we are about to
607          * emit, which is illegal. To do this we run this logic twice: the
608          * first pass counts the required register writes and flushes pending
609          * TMU requests if necessary due to a dependency, and the second pass
610          * emits the actual TMU writes.
611          */
612         const uint32_t dest_components = nir_intrinsic_dest_components(instr);
613         uint32_t base_const_offset = const_offset;
614         uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
615         uint32_t tmu_writes = 0;
616         for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
617                 assert(mode == MODE_COUNT || tmu_writes > 0);
618 
619                 uint32_t type_size = 4;
620 
621                 if (is_store) {
622                         emit_tmu_general_store_writes(c, mode, instr,
623                                                       base_const_offset,
624                                                       &writemask,
625                                                       &const_offset,
626                                                       &type_size,
627                                                       &tmu_writes);
628                 } else if (!is_load && !atomic_add_replaced) {
629                         emit_tmu_general_atomic_writes(c, mode, instr,
630                                                        tmu_op, has_index,
631                                                        &tmu_writes);
632                 } else if (is_load) {
633                         type_size = instr->def.bit_size / 8;
634                 }
635 
636                 /* For atomics we use 32bit, except for CMPXCHG, which needs
637                  * to use VEC2. For the rest of the cases we use the number of
638                  * TMUD writes we did to decide the type. For cache operations
639                  * the type is ignored.
640                  */
641                 uint32_t config = 0;
642                 if (mode == MODE_EMIT) {
643                         uint32_t num_components;
644                         if (is_load || atomic_add_replaced) {
645                                 num_components = instr->num_components;
646                         } else {
647                                 assert(tmu_writes > 0);
648                                 num_components = tmu_writes - 1;
649                         }
650                         bool is_atomic =
651                                 v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
652                                 V3D_TMU_OP_TYPE_ATOMIC;
653 
654                         /* Only load per-quad if we can be certain that all
655                          * lanes in the quad are active. Notice that demoted
656                          * invocations, unlike terminated ones, are still
657                          * active: we want to skip memory writes for them but
658                          * loads should still work.
659                          */
660                         uint32_t perquad =
661                                 is_load && !vir_in_nonuniform_control_flow(c) &&
662                                 ((c->s->info.stage == MESA_SHADER_FRAGMENT &&
663                                   c->s->info.fs.needs_quad_helper_invocations &&
664                                   !c->emitted_discard) ||
665                                  c->s->info.uses_wide_subgroup_intrinsics) ?
666                                 GENERAL_TMU_LOOKUP_PER_QUAD :
667                                 GENERAL_TMU_LOOKUP_PER_PIXEL;
668                         config = 0xffffff00 | tmu_op << 3 | perquad;
669 
670                         if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
671                                 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
672                         } else if (is_atomic || num_components == 1) {
673                                 switch (type_size) {
674                                 case 4:
675                                         config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
676                                         break;
677                                 case 2:
678                                         config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
679                                         break;
680                                 case 1:
681                                         config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
682                                         break;
683                                 default:
684                                         unreachable("Unsupported bitsize");
685                                 }
686                         } else {
687                                 assert(type_size == 4);
688                                 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
689                                           num_components - 2;
690                         }
691                 }
692 
693                 emit_tmu_general_address_write(c, mode, instr, config,
694                                                dynamic_src, offset_src,
695                                                base_offset, const_offset,
696                                                dest_components, &tmu_writes);
697 
698                 assert(tmu_writes > 0);
699                 if (mode == MODE_COUNT) {
700                         /* Make sure we won't exceed the 16-entry TMU
701                          * fifo if each thread is storing at the same
702                          * time.
703                          */
704                         while (tmu_writes > 16 / c->threads)
705                                 c->threads /= 2;
706 
707                         /* If pipelining this TMU operation would
708                          * overflow TMU fifos, we need to flush.
709                          */
710                         if (ntq_tmu_fifo_overflow(c, dest_components))
711                                 ntq_flush_tmu(c);
712                 } else {
713                         /* Delay emission of the thread switch and
714                          * LDTMU/TMUWT until we really need to do it to
715                          * improve pipelining.
716                          */
717                         const uint32_t component_mask =
718                                 (1 << dest_components) - 1;
719                         ntq_add_pending_tmu_flush(c, &instr->def,
720                                                   component_mask);
721                 }
722         }
723 
724         /* nir_lower_wrmasks should've ensured that any writemask on a store
725          * operation only has consecutive bits set, in which case we should've
726          * processed the full writemask above.
727          */
728         assert(writemask == 0);
729 }
730 
731 static struct qreg *
732 ntq_init_ssa_def(struct v3d_compile *c, nir_def *def)
733 {
734         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
735                                           def->num_components);
736         _mesa_hash_table_insert(c->def_ht, def, qregs);
737         return qregs;
738 }
739 
740 static bool
741 is_ld_signal(const struct v3d_qpu_sig *sig)
742 {
743         return (sig->ldunif ||
744                 sig->ldunifa ||
745                 sig->ldunifrf ||
746                 sig->ldunifarf ||
747                 sig->ldtmu ||
748                 sig->ldvary ||
749                 sig->ldvpm ||
750                 sig->ldtlb ||
751                 sig->ldtlbu);
752 }
753 
754 static inline bool
755 is_ldunif_signal(const struct v3d_qpu_sig *sig)
756 {
757         return sig->ldunif || sig->ldunifrf;
758 }
759 
760 /**
761  * This function is responsible for getting VIR results into the associated
762  * storage for a NIR instruction.
763  *
764  * If it's a NIR SSA def, then we just set the associated hash table entry to
765  * the new result.
766  *
767  * If it's a NIR reg, then we need to update the existing qreg assigned to the
768  * NIR destination with the incoming value.  To do that without introducing
769  * new MOVs, we require that the incoming qreg either be a uniform, or be
770  * SSA-defined by the previous VIR instruction in the block and rewritable by
771  * this function.  That lets us sneak ahead and insert the SF flag beforehand
772  * (knowing that the previous instruction doesn't depend on flags) and rewrite
773  * its destination to be the NIR reg's destination.
774  */
775 void
776 ntq_store_def(struct v3d_compile *c, nir_def *def, int chan,
777               struct qreg result)
778 {
779         struct qinst *last_inst = NULL;
780         if (!list_is_empty(&c->cur_block->instructions))
781                 last_inst = (struct qinst *)c->cur_block->instructions.prev;
782 
783         bool is_reused_uniform =
784                 is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
785                 last_inst != c->defs[result.index];
786 
787         assert(result.file == QFILE_TEMP && last_inst &&
788                (last_inst == c->defs[result.index] || is_reused_uniform));
789 
790         nir_intrinsic_instr *store = nir_store_reg_for_def(def);
791         if (store == NULL) {
792                 assert(chan < def->num_components);
793 
794                 struct qreg *qregs;
795                 struct hash_entry *entry =
796                         _mesa_hash_table_search(c->def_ht, def);
797 
798                 if (entry)
799                         qregs = entry->data;
800                 else
801                         qregs = ntq_init_ssa_def(c, def);
802 
803                 qregs[chan] = result;
804         } else {
805                 nir_def *reg = store->src[1].ssa;
806                 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
807                 assert(nir_intrinsic_base(store) == 0);
808                 assert(nir_intrinsic_num_array_elems(decl) == 0);
809                 struct hash_entry *entry =
810                         _mesa_hash_table_search(c->def_ht, reg);
811                 struct qreg *qregs = entry->data;
812 
813                 /* If the previous instruction can't be predicated for
814                  * the store into the nir_register, then emit a MOV
815                  * that can be.
816                  */
817                 if (is_reused_uniform ||
818                     (vir_in_nonuniform_control_flow(c) &&
819                      is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
820                         result = vir_MOV(c, result);
821                         last_inst = c->defs[result.index];
822                 }
823 
824                 /* We know they're both temps, so just rewrite index. */
825                 c->defs[last_inst->dst.index] = NULL;
826                 last_inst->dst.index = qregs[chan].index;
827 
828                 /* If we're in control flow, then make this update of the reg
829                  * conditional on the execution mask.
830                  */
831                 if (vir_in_nonuniform_control_flow(c)) {
832                         last_inst->dst.index = qregs[chan].index;
833 
834                         /* Set the flags to the current exec mask.
835                          */
836                         c->cursor = vir_before_inst(last_inst);
837                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
838                                    V3D_QPU_PF_PUSHZ);
839                         c->cursor = vir_after_inst(last_inst);
840 
841                         vir_set_cond(last_inst, V3D_QPU_COND_IFA);
842                 }
843         }
844 }
845 
846 /**
847  * This looks up the qreg associated with a particular ssa/reg used as a source
848  * in any instruction.
849  *
850  * It is expected that the definition for any NIR value read as a source has
851  * been emitted by a previous instruction. However, in the case of TMU
852  * operations we may have postponed emission of the thread switch and LDTMUs
853  * required to read the TMU results until the results are actually used, in
854  * order to improve pipelining. That would lead to us not finding the value
855  * here (for SSA defs) or finding it in the list of registers awaiting a TMU
856  * flush (for registers), in which case we need to flush outstanding TMU
857  * operations to read the correct value.
858  */
859 struct qreg
860 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
861 {
862         struct hash_entry *entry;
863 
864         nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
865         if (load == NULL) {
866                 assert(i < src.ssa->num_components);
867 
868                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
869                 if (!entry) {
870                         ntq_flush_tmu(c);
871                         entry = _mesa_hash_table_search(c->def_ht, src.ssa);
872                 }
873         } else {
874                 nir_def *reg = load->src[0].ssa;
875                 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
876                 assert(nir_intrinsic_base(load) == 0);
877                 assert(nir_intrinsic_num_array_elems(decl) == 0);
878                 assert(i < nir_intrinsic_num_components(decl));
879 
880                 if (_mesa_set_search(c->tmu.outstanding_regs, reg))
881                         ntq_flush_tmu(c);
882                 entry = _mesa_hash_table_search(c->def_ht, reg);
883         }
884         assert(entry);
885 
886         struct qreg *qregs = entry->data;
887         return qregs[i];
888 }
889 
890 static struct qreg
891 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
892                 unsigned src)
893 {
894         struct qreg r = ntq_get_src(c, instr->src[src].src,
895                                     instr->src[src].swizzle[0]);
896 
897         return r;
898 };
899 
900 static struct qreg
901 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
902 {
903         return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
904 }
905 
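/* Returns max(size >> level, 1), i.e. the dimension of the given mip level. */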
906 static void
907 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
908 {
909         unsigned unit = instr->texture_index;
910         int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
911         int dest_size = nir_tex_instr_dest_size(instr);
912 
913         struct qreg lod = c->undef;
914         if (lod_index != -1)
915                 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
916 
917         for (int i = 0; i < dest_size; i++) {
918                 assert(i < 3);
919                 enum quniform_contents contents;
920 
921                 if (instr->is_array && i == dest_size - 1)
922                         contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
923                 else
924                         contents = QUNIFORM_TEXTURE_WIDTH + i;
925 
926                 struct qreg size = vir_uniform(c, contents, unit);
927 
928                 switch (instr->sampler_dim) {
929                 case GLSL_SAMPLER_DIM_1D:
930                 case GLSL_SAMPLER_DIM_2D:
931                 case GLSL_SAMPLER_DIM_MS:
932                 case GLSL_SAMPLER_DIM_3D:
933                 case GLSL_SAMPLER_DIM_CUBE:
934                 case GLSL_SAMPLER_DIM_BUF:
935                 case GLSL_SAMPLER_DIM_EXTERNAL:
936                         /* Don't minify the array size. */
937                         if (!(instr->is_array && i == dest_size - 1)) {
938                                 size = ntq_minify(c, size, lod);
939                         }
940                         break;
941 
942                 case GLSL_SAMPLER_DIM_RECT:
943                         /* There's no LOD field for rects */
944                         break;
945 
946                 default:
947                         unreachable("Bad sampler type");
948                 }
949 
950                 ntq_store_def(c, &instr->def, i, size);
951         }
952 }
953 
954 static void
955 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
956 {
957         unsigned unit = instr->texture_index;
958 
959         /* Since each texture sampling op requires uploading uniforms to
960          * reference the texture, there's no HW support for texture size
961          * queries; we just upload uniforms containing the size instead.
962          */
963         switch (instr->op) {
964         case nir_texop_query_levels:
965                 ntq_store_def(c, &instr->def, 0,
966                               vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
967                 return;
968         case nir_texop_texture_samples:
969                 ntq_store_def(c, &instr->def, 0,
970                               vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
971                 return;
972         case nir_texop_txs:
973                 ntq_emit_txs(c, instr);
974                 return;
975         default:
976                 break;
977         }
978 
979         v3d_vir_emit_tex(c, instr);
980 }
981 
982 static struct qreg
983 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
984 {
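        /* The SFU SIN input appears to be in units of pi (hence the 1/pi
         * scale), so we reduce the argument to [-0.5, 0.5] periods by
         * subtracting the rounded period count, and then flip the sign for
         * odd periods by XORing in the sign bit (FTOIN(periods) shifted left
         * by -1, which wraps to 31).
         */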
985         struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
986         if (is_cos)
987                 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
988 
989         struct qreg periods = vir_FROUND(c, input);
990         struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
991         return vir_XOR(c, sin_output, vir_SHL(c,
992                                               vir_FTOIN(c, periods),
993                                               vir_uniform_ui(c, -1)));
994 }
995 
996 static struct qreg
997 ntq_fsign(struct v3d_compile *c, struct qreg src)
998 {
999         struct qreg t = vir_get_temp(c);
1000 
1001         vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
1002         vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
1003         vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
1004         vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
1005         vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
1006         return vir_MOV(c, t);
1007 }
1008 
1009 static void
1010 emit_fragcoord_input(struct v3d_compile *c, int attr)
1011 {
1012         c->inputs[attr * 4 + 0] = vir_FXCD(c);
1013         c->inputs[attr * 4 + 1] = vir_FYCD(c);
1014         c->inputs[attr * 4 + 2] = c->payload_z;
1015         c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
1016 }
1017 
1018 static struct qreg
1019 emit_smooth_varying(struct v3d_compile *c,
1020                     struct qreg vary, struct qreg w, struct qreg c_reg)
1021 {
1022         return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
1023 }
1024 
1025 static struct qreg
1026 emit_noperspective_varying(struct v3d_compile *c,
1027                            struct qreg vary, struct qreg c_reg)
1028 {
1029         return vir_FADD(c, vir_MOV(c, vary), c_reg);
1030 }
1031 
1032 static struct qreg
1033 emit_flat_varying(struct v3d_compile *c,
1034                   struct qreg vary, struct qreg c_reg)
1035 {
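        /* Consume the ldvary result even though flat shading only uses the C
         * coefficient.
         */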
1036         vir_MOV_dest(c, c->undef, vary);
1037         return vir_MOV(c, c_reg);
1038 }
1039 
1040 static struct qreg
1041 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
1042                       int8_t input_idx, uint8_t swizzle, int array_index)
1043 {
1044         struct qreg c_reg; /* C coefficient */
1045 
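        /* The ldvary signal implicitly provides the C coefficient in r5 on
         * devices with accumulators, and in rf0 otherwise, which is why we
         * read it from there below.
         */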
1046         if (c->devinfo->has_accumulators)
1047                 c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
1048         else
1049                 c_reg = vir_reg(QFILE_REG, 0);
1050 
1051         struct qinst *ldvary = NULL;
1052         struct qreg vary;
1053         ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
1054                               c->undef, c->undef);
1055         ldvary->qpu.sig.ldvary = true;
1056         vary = vir_emit_def(c, ldvary);
1057 
1058         /* Store the input value before interpolation so we can implement
1059          * GLSL's interpolateAt functions if the shader uses them.
1060          */
1061         if (input_idx >= 0) {
1062                 assert(var);
1063                 c->interp[input_idx].vp = vary;
1064                 c->interp[input_idx].C = vir_MOV(c, c_reg);
1065                 c->interp[input_idx].mode = var->data.interpolation;
1066         }
1067 
1068         /* For gl_PointCoord input or distance along a line, we'll be called
1069          * with no nir_variable, and we don't count toward VPM size so we
1070          * don't track an input slot.
1071          */
1072         if (!var) {
1073                 assert(input_idx < 0);
1074                 return emit_smooth_varying(c, vary, c->payload_w, c_reg);
1075         }
1076 
1077         int i = c->num_inputs++;
1078         c->input_slots[i] =
1079                 v3d_slot_from_slot_and_component(var->data.location +
1080                                                  array_index, swizzle);
1081 
1082         struct qreg result;
1083         switch (var->data.interpolation) {
1084         case INTERP_MODE_NONE:
1085         case INTERP_MODE_SMOOTH:
1086                 if (var->data.centroid) {
1087                         BITSET_SET(c->centroid_flags, i);
1088                         result = emit_smooth_varying(c, vary,
1089                                                      c->payload_w_centroid, c_reg);
1090                 } else {
1091                         result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
1092                 }
1093                 break;
1094 
1095         case INTERP_MODE_NOPERSPECTIVE:
1096                 BITSET_SET(c->noperspective_flags, i);
1097                 result = emit_noperspective_varying(c, vary, c_reg);
1098                 break;
1099 
1100         case INTERP_MODE_FLAT:
1101                 BITSET_SET(c->flat_shade_flags, i);
1102                 result = emit_flat_varying(c, vary, c_reg);
1103                 break;
1104 
1105         default:
1106                 unreachable("Bad interp mode");
1107         }
1108 
1109         if (input_idx >= 0)
1110                 c->inputs[input_idx] = result;
1111         return result;
1112 }
1113 
1114 static void
1115 emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
1116                     int array_index, unsigned nelem)
1117 {
1118         for (int i = 0; i < nelem ; i++) {
1119                 int chan = var->data.location_frac + i;
1120                 int input_idx = (base_attr + array_index) * 4 + chan;
1121                 emit_fragment_varying(c, var, input_idx, chan, array_index);
1122         }
1123 }
1124 
1125 static void
1126 emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
1127                             int array_index)
1128 {
1129         /* Compact variables are scalar arrays where each set of 4 elements
1130          * consumes a single location.
1131          */
1132         int loc_offset = array_index / 4;
1133         int chan = var->data.location_frac + array_index % 4;
1134         int input_idx = (attr + loc_offset) * 4  + chan;
1135         emit_fragment_varying(c, var, input_idx, chan, loc_offset);
1136 }
1137 
1138 static void
1139 add_output(struct v3d_compile *c,
1140            uint32_t decl_offset,
1141            uint8_t slot,
1142            uint8_t swizzle)
1143 {
1144         uint32_t old_array_size = c->outputs_array_size;
1145         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
1146                           decl_offset + 1);
1147 
1148         if (old_array_size != c->outputs_array_size) {
1149                 c->output_slots = reralloc(c,
1150                                            c->output_slots,
1151                                            struct v3d_varying_slot,
1152                                            c->outputs_array_size);
1153         }
1154 
1155         c->output_slots[decl_offset] =
1156                 v3d_slot_from_slot_and_component(slot, swizzle);
1157 }
1158 
1159 /**
1160  * If compare_instr is a valid comparison instruction, emits the comparison
1161  * and returns true, setting *out_cond to the condition code under which
1162  * the comparison result is true. Returns false otherwise.
1163  */
1164 static bool
1165 ntq_emit_comparison(struct v3d_compile *c,
1166                     nir_alu_instr *compare_instr,
1167                     enum v3d_qpu_cond *out_cond)
1168 {
1169         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
1170         struct qreg src1;
1171         if (nir_op_infos[compare_instr->op].num_inputs > 1)
1172                 src1 = ntq_get_alu_src(c, compare_instr, 1);
1173         bool cond_invert = false;
1174         struct qreg nop = vir_nop_reg();
1175 
1176         switch (compare_instr->op) {
1177         case nir_op_feq32:
1178         case nir_op_seq:
1179                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1180                 break;
1181         case nir_op_ieq32:
1182                 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1183                 break;
1184 
1185         case nir_op_fneu32:
1186         case nir_op_sne:
1187                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1188                 cond_invert = true;
1189                 break;
1190         case nir_op_ine32:
1191                 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1192                 cond_invert = true;
1193                 break;
1194 
1195         case nir_op_fge32:
1196         case nir_op_sge:
1197                 vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1198                 break;
1199         case nir_op_ige32:
1200                 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1201                 cond_invert = true;
1202                 break;
1203         case nir_op_uge32:
1204                 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1205                 cond_invert = true;
1206                 break;
1207 
1208         case nir_op_slt:
1209         case nir_op_flt32:
1210                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
1211                 break;
1212         case nir_op_ilt32:
1213                 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1214                 break;
1215         case nir_op_ult32:
1216                 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1217                 break;
1218 
1219         default:
1220                 return false;
1221         }
1222 
1223         *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
1224 
1225         return true;
1226 }
1227 
1228 /* Finds an ALU instruction that generates our src value that could
1229  * (potentially) be greedily emitted in the consuming instruction.
1230  */
1231 static struct nir_alu_instr *
1232 ntq_get_alu_parent(nir_src src)
1233 {
1234         if (src.ssa->parent_instr->type != nir_instr_type_alu)
1235                 return NULL;
1236         nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
1237         if (!instr)
1238                 return NULL;
1239 
1240         /* If the ALU instr's srcs are non-SSA, then we would have to avoid
1241          * moving emission of the ALU instr down past another write of the
1242          * src.
1243          */
1244         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1245                 if (nir_load_reg_for_def(instr->src[i].src.ssa))
1246                         return NULL;
1247         }
1248 
1249         return instr;
1250 }
1251 
1252 /* Turns a NIR bool into a condition code to predicate on. */
1253 static enum v3d_qpu_cond
1254 ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
1255 {
1256         struct qreg qsrc = ntq_get_src(c, src, 0);
1257         /* skip if we already have src in the flags */
1258         if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
1259                 return c->flags_cond;
1260 
1261         nir_alu_instr *compare = ntq_get_alu_parent(src);
1262         if (!compare)
1263                 goto out;
1264 
1265         enum v3d_qpu_cond cond;
1266         if (ntq_emit_comparison(c, compare, &cond))
1267                 return cond;
1268 
1269 out:
1270 
1271         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
1272                    V3D_QPU_PF_PUSHZ);
1273         return V3D_QPU_COND_IFNA;
1274 }
1275 
1276 static struct qreg
1277 ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1278 {
1279         struct qreg result =
1280                 vir_MOV(c, vir_SEL(c, cond,
1281                                    vir_uniform_ui(c, ~0),
1282                                    vir_uniform_ui(c, 0)));
1283         c->flags_temp = result.index;
1284         c->flags_cond = cond;
1285         return result;
1286 }
1287 
1288 static struct qreg
1289 ntq_emit_cond_to_int(struct v3d_compile *c, enum v3d_qpu_cond cond)
1290 {
1291         struct qreg result =
1292                 vir_MOV(c, vir_SEL(c, cond,
1293                                    vir_uniform_ui(c, 1),
1294                                    vir_uniform_ui(c, 0)));
1295         c->flags_temp = result.index;
1296         c->flags_cond = cond;
1297         return result;
1298 }
1299 
1300 static struct qreg
1301 f2f16_rtz(struct v3d_compile *c, struct qreg f32)
1302 {
1303    /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
1304     * method and seems to be using RTE by default, so we need to implement
1305     * RTZ rounding in software.
1306     */
1307    struct qreg rf16 = vir_FMOV(c, f32);
1308    vir_set_pack(c->defs[rf16.index], V3D_QPU_PACK_L);
1309 
1310    struct qreg rf32 = vir_FMOV(c, rf16);
1311    vir_set_unpack(c->defs[rf32.index], 0, V3D_QPU_UNPACK_L);
1312 
1313    struct qreg f32_abs = vir_FMOV(c, f32);
1314    vir_set_unpack(c->defs[f32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1315 
1316    struct qreg rf32_abs = vir_FMOV(c, rf32);
1317    vir_set_unpack(c->defs[rf32_abs.index], 0, V3D_QPU_UNPACK_ABS);
1318 
1319    vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), f32_abs, rf32_abs),
1320               V3D_QPU_PF_PUSHN);
1321    return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
1322                   vir_SUB(c, rf16, vir_uniform_ui(c, 1)), rf16));
1323 }
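
/* Illustrative sketch of the RTZ-from-RTE trick above in plain host C, not
 * driver code: f32_to_f16_rte() and f16_to_f32() are assumed, hypothetical
 * conversion helpers, and fabsf() comes from <math.h>.  The idea is the same
 * as the VIR sequence: round with the hardware's default RTE, round-trip
 * back to f32, and if the magnitude grew, step the f16 bit pattern one unit
 * back toward zero.
 */
#if 0
static uint16_t
f32_to_f16_rtz_sketch(float f32)
{
        uint16_t h = f32_to_f16_rte(f32); /* hypothetical, like the packed FMOV */
        float r = f16_to_f32(h);          /* hypothetical, like the unpacked FMOV */
        if (fabsf(r) > fabsf(f32))        /* like the FCMP of the ABS values + PUSHN */
                h--;                      /* like the conditional SUB of 1 */
        return h;
}
#endif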
1324 
1325 /**
1326  * Takes the result value of a signed integer width conversion from a smaller
1327  * type to a larger type and if needed, it applies sign extension to it.
1328  */
1329 static struct qreg
1330 sign_extend(struct v3d_compile *c,
1331             struct qreg value,
1332             uint32_t src_bit_size,
1333             uint32_t dst_bit_size)
1334 {
1335         assert(src_bit_size < dst_bit_size);
1336 
1337         struct qreg tmp = vir_MOV(c, value);
1338 
1339         /* Do we need to sign-extend? */
1340         uint32_t sign_mask = 1 << (src_bit_size - 1);
1341         struct qinst *sign_check =
1342                 vir_AND_dest(c, vir_nop_reg(),
1343                              tmp, vir_uniform_ui(c, sign_mask));
1344         vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);
1345 
1346         /* If so, fill in leading sign bits */
1347         uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
1348                                ((1ull << dst_bit_size) - 1);
1349         struct qinst *extend_inst =
1350                 vir_OR_dest(c, tmp, tmp,
1351                             vir_uniform_ui(c, extend_bits));
1352         vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);
1353 
1354         return tmp;
1355 }
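
/* For an 8-bit to 32-bit extension the masks above are sign_mask = 0x80 and
 * extend_bits = 0xffffff00, so the flag-predicated sequence is equivalent to
 * this plain C sketch (illustration only, not used by the compiler):
 */
#if 0
static uint32_t
sign_extend_sketch(uint32_t value, uint32_t src_bit_size, uint32_t dst_bit_size)
{
        uint32_t sign_mask = 1u << (src_bit_size - 1);
        uint32_t extend_bits = ~((1u << src_bit_size) - 1) &
                               (uint32_t)((1ull << dst_bit_size) - 1);
        return (value & sign_mask) ? (value | extend_bits) : value;
}
#endif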
1356 
1357 static void
1358 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1359 {
1360         /* Vectors are special in that they have non-scalarized writemasks,
1361          * and just take the first swizzle channel for each argument in order
1362          * into each writemask channel.
1363          */
1364         if (instr->op == nir_op_vec2 ||
1365             instr->op == nir_op_vec3 ||
1366             instr->op == nir_op_vec4) {
1367                 struct qreg srcs[4];
1368                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1369                         srcs[i] = ntq_get_src(c, instr->src[i].src,
1370                                               instr->src[i].swizzle[0]);
1371                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1372                         ntq_store_def(c, &instr->def, i,
1373                                       vir_MOV(c, srcs[i]));
1374                 return;
1375         }
1376 
1377         /* General case: We can just grab the one used channel per src. */
1378         struct qreg src[nir_op_infos[instr->op].num_inputs];
1379         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1380                 src[i] = ntq_get_alu_src(c, instr, i);
1381         }
1382 
1383         struct qreg result;
1384 
1385         switch (instr->op) {
1386         case nir_op_mov:
1387                 result = vir_MOV(c, src[0]);
1388                 break;
1389 
1390         case nir_op_fneg:
1391                 result = vir_XOR(c, src[0], vir_uniform_ui(c, UINT32_C(1) << 31));
1392                 break;
1393         case nir_op_ineg:
1394                 result = vir_NEG(c, src[0]);
1395                 break;
1396 
1397         case nir_op_fmul:
1398                 result = vir_FMUL(c, src[0], src[1]);
1399                 break;
1400         case nir_op_fadd:
1401                 result = vir_FADD(c, src[0], src[1]);
1402                 break;
1403         case nir_op_fsub:
1404                 result = vir_FSUB(c, src[0], src[1]);
1405                 break;
1406         case nir_op_fmin:
1407                 result = vir_FMIN(c, src[0], src[1]);
1408                 break;
1409         case nir_op_fmax:
1410                 result = vir_FMAX(c, src[0], src[1]);
1411                 break;
1412 
1413         case nir_op_f2i32: {
1414                 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1415                 if (src0_alu && src0_alu->op == nir_op_fround_even) {
1416                         result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1417                 } else {
1418                         result = vir_FTOIZ(c, src[0]);
1419                 }
1420                 if (nir_src_bit_size(instr->src[0].src) == 16)
1421                         vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1422                 break;
1423         }
1424 
1425         case nir_op_f2u32:
1426                 result = vir_FTOUZ(c, src[0]);
1427                 if (nir_src_bit_size(instr->src[0].src) == 16)
1428                         vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1429                 break;
1430         case nir_op_i2f32: {
1431                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1432                 assert(bit_size <= 32);
1433                 result = src[0];
1434                 if (bit_size < 32) {
1435                         uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1436                         result = vir_AND(c, result, vir_uniform_ui(c, mask));
1437                         result = sign_extend(c, result, bit_size, 32);
1438                 }
1439                 result = vir_ITOF(c, result);
1440                 break;
1441         }
1442         case nir_op_u2f32: {
1443                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1444                 assert(bit_size <= 32);
1445                 result = src[0];
1446                 if (bit_size < 32) {
1447                         uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1448                         result = vir_AND(c, result, vir_uniform_ui(c, mask));
1449                 }
1450                 result = vir_UTOF(c, result);
1451                 break;
1452         }
1453         case nir_op_b2f16:
1454                 result = vir_AND(c, src[0], vir_uniform_ui(c, 0x3c00));
1455                 break;
1456         case nir_op_b2f32:
1457                 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1458                 break;
1459         case nir_op_b2i8:
1460         case nir_op_b2i16:
1461         case nir_op_b2i32:
1462                 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1463                 break;
1464 
1465         case nir_op_i2f16: {
1466                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1467                 assert(bit_size <= 32);
                     result = src[0];
1468                 if (bit_size < 32) {
1469                         uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1470                         result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1471                         result = sign_extend(c, result, bit_size, 32);
1472                 }
1473                 result = vir_ITOF(c, result);
1474                 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1475                 break;
1476         }
1477 
1478         case nir_op_u2f16: {
1479                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1480                 assert(bit_size <= 32);
                     result = src[0];
1481                 if (bit_size < 32) {
1482                         uint32_t mask = bit_size == 16 ? 0xffff : 0xff;
1483                         result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1484                 }
1485                 result = vir_UTOF(c, result);
1486                 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1487                 break;
1488         }
1489 
1490         case nir_op_f2f16:
1491         case nir_op_f2f16_rtne:
1492                 assert(nir_src_bit_size(instr->src[0].src) == 32);
1493                 result = vir_FMOV(c, src[0]);
1494                 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1495                 break;
1496 
1497         case nir_op_f2f16_rtz:
1498                 assert(nir_src_bit_size(instr->src[0].src) == 32);
1499                 result = f2f16_rtz(c, src[0]);
1500                 break;
1501 
1502         case nir_op_f2f32:
1503                 assert(nir_src_bit_size(instr->src[0].src) == 16);
1504                 result = vir_FMOV(c, src[0]);
1505                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1506                 break;
1507 
1508         case nir_op_i2i16: {
1509                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1510                 assert(bit_size == 32 || bit_size == 8);
1511                 if (bit_size == 32) {
1512                         /* We don't have integer pack/unpack methods for
1513                          * converting between 16-bit and 32-bit, so we implement
1514                          * the conversion manually by truncating the src.
1515                          */
1516                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1517                 } else {
1518                         struct qreg tmp = vir_AND(c, src[0],
1519                                                   vir_uniform_ui(c, 0xff));
1520                         result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
1521                 }
1522                 break;
1523         }
1524 
1525         case nir_op_u2u16: {
1526                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1527                 assert(bit_size == 32 || bit_size == 8);
1528 
1529                 /* We don't have integer pack/unpack methods for converting
1530                  * between 16-bit and 32-bit, so we implement the conversion
1531                  * manually by truncating the src. For the 8-bit case, we
1532                  * want to make sure we don't copy garbage from any of the
1533                  * 24 most significant bits.
1534                  */
1535                 if (bit_size == 32)
1536                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1537                 else
1538                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1539                 break;
1540         }
1541 
1542         case nir_op_i2i8:
1543         case nir_op_u2u8:
1544                 assert(nir_src_bit_size(instr->src[0].src) == 32 ||
1545                        nir_src_bit_size(instr->src[0].src) == 16);
1546                 /* We don't have integer pack/unpack methods for converting
1547                  * between 8-bit and 32-bit, so we implement the conversion
1548                  * manually by truncating the src.
1549                  */
1550                 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1551                 break;
1552 
1553         case nir_op_u2u32: {
1554                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1555                 assert(bit_size == 16 || bit_size == 8);
1556 
1557                 /* We don't have a native 8-bit/16-bit MOV, so we copy all 32 bits
1558                  * from the src but make sure to clear any garbage that may be
1559                  * present in the unused upper bits.
1560                  */
1561                 uint32_t mask = (1 << bit_size) - 1;
1562                 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1563                 break;
1564         }
1565 
1566         case nir_op_i2i32: {
1567                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1568                 assert(bit_size == 16 || bit_size == 8);
1569 
1570                 uint32_t mask = (1 << bit_size) - 1;
1571                 struct qreg tmp = vir_AND(c, src[0],
1572                                           vir_uniform_ui(c, mask));
1573 
1574                 result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
1575                 break;
1576         }
1577 
1578         case nir_op_iadd:
1579                 result = vir_ADD(c, src[0], src[1]);
1580                 break;
1581         case nir_op_ushr:
1582                 result = vir_SHR(c, src[0], src[1]);
1583                 break;
1584         case nir_op_isub:
1585                 result = vir_SUB(c, src[0], src[1]);
1586                 break;
1587         case nir_op_ishr:
1588                 result = vir_ASR(c, src[0], src[1]);
1589                 break;
1590         case nir_op_ishl:
1591                 result = vir_SHL(c, src[0], src[1]);
1592                 break;
1593         case nir_op_imin:
1594                 result = vir_MIN(c, src[0], src[1]);
1595                 break;
1596         case nir_op_umin:
1597                 result = vir_UMIN(c, src[0], src[1]);
1598                 break;
1599         case nir_op_imax:
1600                 result = vir_MAX(c, src[0], src[1]);
1601                 break;
1602         case nir_op_umax:
1603                 result = vir_UMAX(c, src[0], src[1]);
1604                 break;
1605         case nir_op_iand:
1606                 result = vir_AND(c, src[0], src[1]);
1607                 break;
1608         case nir_op_ior:
1609                 result = vir_OR(c, src[0], src[1]);
1610                 break;
1611         case nir_op_ixor:
1612                 result = vir_XOR(c, src[0], src[1]);
1613                 break;
1614         case nir_op_inot:
1615                 result = vir_NOT(c, src[0]);
1616                 break;
1617 
1618         case nir_op_uclz:
1619                 result = vir_CLZ(c, src[0]);
1620                 break;
1621 
1622         case nir_op_imul:
1623                 result = vir_UMUL(c, src[0], src[1]);
1624                 break;
1625 
1626         case nir_op_seq:
1627         case nir_op_sne:
1628         case nir_op_sge:
1629         case nir_op_slt: {
1630                 enum v3d_qpu_cond cond;
1631                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1632                 assert(ok);
1633                 result = vir_MOV(c, vir_SEL(c, cond,
1634                                             vir_uniform_f(c, 1.0),
1635                                             vir_uniform_f(c, 0.0)));
1636                 c->flags_temp = result.index;
1637                 c->flags_cond = cond;
1638                 break;
1639         }
1640 
1641         case nir_op_feq32:
1642         case nir_op_fneu32:
1643         case nir_op_fge32:
1644         case nir_op_flt32:
1645         case nir_op_ieq32:
1646         case nir_op_ine32:
1647         case nir_op_ige32:
1648         case nir_op_uge32:
1649         case nir_op_ilt32:
1650         case nir_op_ult32: {
1651                 enum v3d_qpu_cond cond;
1652                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1653                 assert(ok);
1654                 result = ntq_emit_cond_to_bool(c, cond);
1655                 break;
1656         }
1657 
1658         case nir_op_b32csel:
1659                 result = vir_MOV(c,
1660                                  vir_SEL(c,
1661                                          ntq_emit_bool_to_cond(c, instr->src[0].src),
1662                                          src[1], src[2]));
1663                 break;
1664 
1665         case nir_op_fcsel:
1666                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1667                            V3D_QPU_PF_PUSHZ);
1668                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1669                                             src[1], src[2]));
1670                 break;
1671 
1672         case nir_op_frcp:
1673                 result = vir_RECIP(c, src[0]);
1674                 break;
1675         case nir_op_frsq:
1676                 result = vir_RSQRT(c, src[0]);
1677                 break;
1678         case nir_op_fexp2:
1679                 result = vir_EXP(c, src[0]);
1680                 break;
1681         case nir_op_flog2:
1682                 result = vir_LOG(c, src[0]);
1683                 break;
1684 
1685         case nir_op_fceil:
1686                 result = vir_FCEIL(c, src[0]);
1687                 break;
1688         case nir_op_ffloor:
1689                 result = vir_FFLOOR(c, src[0]);
1690                 break;
1691         case nir_op_fround_even:
1692                 result = vir_FROUND(c, src[0]);
1693                 break;
1694         case nir_op_ftrunc:
1695                 result = vir_FTRUNC(c, src[0]);
1696                 break;
1697 
1698         case nir_op_fsin:
1699                 result = ntq_fsincos(c, src[0], false);
1700                 break;
1701         case nir_op_fcos:
1702                 result = ntq_fsincos(c, src[0], true);
1703                 break;
1704 
1705         case nir_op_fsign:
1706                 result = ntq_fsign(c, src[0]);
1707                 break;
1708 
1709         case nir_op_fabs: {
1710                 result = vir_FMOV(c, src[0]);
1711                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1712                 break;
1713         }
1714 
1715         case nir_op_iabs:
1716                 result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1717                 break;
1718 
1719         case nir_op_uadd_carry:
1720                 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1721                            V3D_QPU_PF_PUSHC);
1722                 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1723                 break;
1724 
1725         case nir_op_usub_borrow:
1726                 vir_set_pf(c, vir_SUB_dest(c, vir_nop_reg(), src[0], src[1]),
1727                            V3D_QPU_PF_PUSHC);
1728                 result = ntq_emit_cond_to_int(c, V3D_QPU_COND_IFA);
1729                 break;
1730 
1731         case nir_op_pack_half_2x16_split:
1732                 result = vir_VFPACK(c, src[0], src[1]);
1733                 break;
1734 
1735         case nir_op_pack_2x32_to_2x16_v3d:
1736                 result = vir_VPACK(c, src[0], src[1]);
1737                 break;
1738 
1739         case nir_op_pack_32_to_r11g11b10_v3d:
1740                 result = vir_V11FPACK(c, src[0], src[1]);
1741                 break;
1742 
1743         case nir_op_pack_uint_32_to_r10g10b10a2_v3d:
1744                 result = vir_V10PACK(c, src[0], src[1]);
1745                 break;
1746 
1747         case nir_op_pack_4x16_to_4x8_v3d:
1748                 result = vir_V8PACK(c, src[0], src[1]);
1749                 break;
1750 
1751         case nir_op_unpack_half_2x16_split_x:
1752                 result = vir_FMOV(c, src[0]);
1753                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1754                 break;
1755 
1756         case nir_op_unpack_half_2x16_split_y:
1757                 result = vir_FMOV(c, src[0]);
1758                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1759                 break;
1760 
1761         case nir_op_pack_2x16_to_unorm_2x8_v3d:
1762                 result = vir_VFTOUNORM8(c, src[0]);
1763                 break;
1764 
1765         case nir_op_pack_2x16_to_snorm_2x8_v3d:
1766                 result = vir_VFTOSNORM8(c, src[0]);
1767                 break;
1768 
1769         case nir_op_pack_2x16_to_unorm_2x10_v3d:
1770                 result = vir_VFTOUNORM10LO(c, src[0]);
1771                 break;
1772 
1773         case nir_op_pack_2x16_to_unorm_10_2_v3d:
1774                 result = vir_VFTOUNORM10HI(c, src[0]);
1775                 break;
1776 
1777         case nir_op_f2unorm_16_v3d:
1778                 result = vir_FTOUNORM16(c, src[0]);
1779                 break;
1780 
1781         case nir_op_f2snorm_16_v3d:
1782                 result = vir_FTOSNORM16(c, src[0]);
1783                 break;
1784 
1785         case nir_op_fsat:
1786                 assert(c->devinfo->ver >= 71);
1787                 result = vir_FMOV(c, src[0]);
1788                 vir_set_unpack(c->defs[result.index], 0, V3D71_QPU_UNPACK_SAT);
1789                 break;
1790 
1791         default:
1792                 fprintf(stderr, "unknown NIR ALU inst: ");
1793                 nir_print_instr(&instr->instr, stderr);
1794                 fprintf(stderr, "\n");
1795                 abort();
1796         }
1797 
1798         ntq_store_def(c, &instr->def, 0, result);
1799 }
1800 
1801 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1802  * specifier.  They come from a register that's preloaded with 0xffffffff
1803  * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1804  * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1805  */
1806 #define TLB_TYPE_F16_COLOR         (3 << 6)
1807 #define TLB_TYPE_I32_COLOR         (1 << 6)
1808 #define TLB_TYPE_F32_COLOR         (0 << 6)
1809 #define TLB_RENDER_TARGET_SHIFT    3 /* Reversed!  7 = RT 0, 0 = RT 7. */
1810 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1811 #define TLB_SAMPLE_MODE_PER_PIXEL  (1 << 2)
1812 #define TLB_F16_SWAP_HI_LO         (1 << 1)
1813 #define TLB_VEC_SIZE_4_F16         (1 << 0)
1814 #define TLB_VEC_SIZE_2_F16         (0 << 0)
1815 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
1816 
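/* Worked example of the fields above, matching the "0xff gets you normal
 * vec4 f16 RT0 writes" note: TLB_TYPE_F16_COLOR (0xc0) | (7 - 0) << 3 for
 * RT 0 (0x38) | TLB_SAMPLE_MODE_PER_PIXEL (0x04) | TLB_F16_SWAP_HI_LO
 * (0x02) | TLB_VEC_SIZE_4_F16 (0x01) == 0xff.
 */
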
1817 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1818  * flag is set.
1819  */
1820 #define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
1821 #define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
1822 #define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */
1823 #define TLB_V42_DEPTH_TYPE_INVARIANT   (0 << 3) /* Unmodified sideband input used */
1824 #define TLB_V42_DEPTH_TYPE_PER_PIXEL   (1 << 3) /* QPU result used */
1825 
1826 /* Stencil is a single 32-bit write. */
1827 #define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))
1828 
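/* Emits the TLB color write for render target 'rt': builds the 8-bit config
 * specifier described above and then, for each sample, writes either raw
 * 32-bit components or VFPACKed f16 pairs, with the first write going to
 * TLBU so that it carries the config as its associated uniform.
 */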
1829 static void
1830 vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1831 {
1832         if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1833                 return;
1834 
1835         struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1836         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1837 
1838         nir_variable *var = c->output_color_var[rt];
1839         int num_components = glsl_get_vector_elements(var->type);
1840         uint32_t conf = 0xffffff00;
1841         struct qinst *inst;
1842 
1843         conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1844                                             TLB_SAMPLE_MODE_PER_PIXEL;
1845         conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1846 
1847         if (c->fs_key->swap_color_rb & (1 << rt))
1848                 num_components = MAX2(num_components, 3);
1849         assert(num_components != 0);
1850 
1851         enum glsl_base_type type = glsl_get_base_type(var->type);
1852         bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1853         bool is_32b_tlb_format = is_int_format ||
1854                                  (c->fs_key->f32_color_rb & (1 << rt));
1855 
1856         if (is_int_format) {
1857                 /* The F32 vs I32 distinction was dropped in 4.2. */
1858                 if (c->devinfo->ver < 42)
1859                         conf |= TLB_TYPE_I32_COLOR;
1860                 else
1861                         conf |= TLB_TYPE_F32_COLOR;
1862                 conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1863         } else {
1864                 if (c->fs_key->f32_color_rb & (1 << rt)) {
1865                         conf |= TLB_TYPE_F32_COLOR;
1866                         conf |= ((num_components - 1) <<
1867                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
1868                 } else {
1869                         conf |= TLB_TYPE_F16_COLOR;
1870                         conf |= TLB_F16_SWAP_HI_LO;
1871                         if (num_components >= 3)
1872                                 conf |= TLB_VEC_SIZE_4_F16;
1873                         else
1874                                 conf |= TLB_VEC_SIZE_2_F16;
1875                 }
1876         }
1877 
1878         int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1879         for (int i = 0; i < num_samples; i++) {
1880                 struct qreg *color = c->msaa_per_sample_output ?
1881                         &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1882                         &c->outputs[var->data.driver_location * 4];
1883 
1884                 struct qreg r = color[0];
1885                 struct qreg g = color[1];
1886                 struct qreg b = color[2];
1887                 struct qreg a = color[3];
1888 
1889                 if (c->fs_key->swap_color_rb & (1 << rt))  {
1890                         r = color[2];
1891                         b = color[0];
1892                 }
1893 
1894                 if (c->fs_key->sample_alpha_to_one)
1895                         a = vir_uniform_f(c, 1.0);
1896 
1897                 if (is_32b_tlb_format) {
1898                         if (i == 0) {
1899                                 inst = vir_MOV_dest(c, tlbu_reg, r);
1900                                 inst->uniform =
1901                                         vir_get_uniform_index(c,
1902                                                               QUNIFORM_CONSTANT,
1903                                                               conf);
1904                         } else {
1905                                 vir_MOV_dest(c, tlb_reg, r);
1906                         }
1907 
1908                         if (num_components >= 2)
1909                                 vir_MOV_dest(c, tlb_reg, g);
1910                         if (num_components >= 3)
1911                                 vir_MOV_dest(c, tlb_reg, b);
1912                         if (num_components >= 4)
1913                                 vir_MOV_dest(c, tlb_reg, a);
1914                 } else {
1915                         inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1916                         if (conf != ~0 && i == 0) {
1917                                 inst->dst = tlbu_reg;
1918                                 inst->uniform =
1919                                         vir_get_uniform_index(c,
1920                                                               QUNIFORM_CONSTANT,
1921                                                               conf);
1922                         }
1923 
1924                         if (num_components >= 3)
1925                                 vir_VFPACK_dest(c, tlb_reg, b, a);
1926                 }
1927         }
1928 }
1929 
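/* Fragment shader epilogue: applies the output sample mask and
 * alpha-to-coverage via SETMSF, promotes the shader to early fragment tests
 * when nothing prevents it, emits the Z/stencil TLB write when one is
 * required, and finally the per-render-target color TLB writes.
 */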
1930 static void
1931 emit_frag_end(struct v3d_compile *c)
1932 {
1933         if (c->output_sample_mask_index != -1) {
1934                 vir_SETMSF_dest(c, vir_nop_reg(),
1935                                 vir_AND(c,
1936                                         vir_MSF(c),
1937                                         c->outputs[c->output_sample_mask_index]));
1938         }
1939 
1940         bool has_any_tlb_color_write = false;
1941         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1942                 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1943                         has_any_tlb_color_write = true;
1944         }
1945 
1946         if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1947                 struct nir_variable *var = c->output_color_var[0];
1948                 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1949 
1950                 vir_SETMSF_dest(c, vir_nop_reg(),
1951                                 vir_AND(c,
1952                                         vir_MSF(c),
1953                                         vir_FTOC(c, color[3])));
1954         }
1955 
1956         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1957 
1958         /* If the shader has no non-TLB side effects and doesn't write Z
1959          * we can promote it to enabling early_fragment_tests even
1960          * if the user didn't.
1961          */
1962         if (c->output_position_index == -1 &&
1963             !(c->s->info.num_images || c->s->info.num_ssbos) &&
1964             !c->s->info.fs.uses_discard &&
1965             !c->fs_key->sample_alpha_to_coverage &&
1966             c->output_sample_mask_index == -1 &&
1967             has_any_tlb_color_write) {
1968                 c->s->info.fs.early_fragment_tests = true;
1969         }
1970 
1971         /* By default, Z buffer writes are implicit using the Z values produced
1972          * from FEP (Z value produced from rasterization). When this is not
1973          * desirable (shader writes Z explicitly, has discards, etc) we need
1974          * to let the hardware know by setting c->writes_z to true, in which
1975          * case we always need to write a Z value from the QPU, even if it is
1976          * just the passthrough Z value produced from FEP.
1977          *
1978          * Also, from the V3D 4.2 spec:
1979          *
1980          * "If a shader performs a Z read the “Fragment shader does Z writes”
1981          *  bit in the shader record must be enabled to ensure deterministic
1982          *  results"
1983          *
1984          * So if c->reads_z is set we always need to write Z, even if it is
1985          * a passthrough from the Z value produced from FEP.
1986          */
1987         if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
1988                 c->writes_z = true;
1989                 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1990                 struct qinst *inst;
1991 
1992                 if (c->output_position_index != -1) {
1993                         /* Shader writes to gl_FragDepth, use that */
1994                         inst = vir_MOV_dest(c, tlbu_reg,
1995                                             c->outputs[c->output_position_index]);
1996 
1997                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1998                                           TLB_SAMPLE_MODE_PER_PIXEL);
1999                 } else {
2000                         /* Shader doesn't write to gl_FragDepth, take Z from
2001                          * FEP.
2002                          */
2003                         c->writes_z_from_fep = true;
2004                         inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());
2005 
2006                         /* The spec says the PER_PIXEL flag is ignored for
2007                          * invariant writes, but the simulator demands it.
2008                          */
2009                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
2010                                           TLB_SAMPLE_MODE_PER_PIXEL);
2011 
2012                         /* Since (single-threaded) fragment shaders always need
2013                          * a TLB write, if we don't have any we emit a
2014                          * passthrough Z and flag ourselves as potentially discarding,
2015                          * so that we can use Z as the required TLB write.
2016                          */
2017                         if (!has_any_tlb_color_write)
2018                                 c->s->info.fs.uses_discard = true;
2019                 }
2020 
2021                 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
2022                                                       tlb_specifier |
2023                                                       0xffffff00);
2024                 inst->is_tlb_z_write = true;
2025         }
2026 
2027         /* XXX: Performance improvement: Merge Z write and color writes TLB
2028          * uniform setup
2029          */
2030         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
2031                 vir_emit_tlb_color_write(c, rt);
2032 }
2033 
2034 static inline void
2035 vir_VPM_WRITE_indirect(struct v3d_compile *c,
2036                        struct qreg val,
2037                        struct qreg vpm_index,
2038                        bool uniform_vpm_index)
2039 {
2040         if (uniform_vpm_index)
2041                 vir_STVPMV(c, vpm_index, val);
2042         else
2043                 vir_STVPMD(c, vpm_index, val);
2044 }
2045 
2046 static void
2047 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
2048 {
2049         vir_VPM_WRITE_indirect(c, val,
2050                                vir_uniform_ui(c, vpm_index), true);
2051 }
2052 
2053 static void
2054 emit_vert_end(struct v3d_compile *c)
2055 {
2056         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2057          */
2058         if (c->devinfo->ver == 42)
2059                 vir_VPMWT(c);
2060 }
2061 
2062 static void
2063 emit_geom_end(struct v3d_compile *c)
2064 {
2065         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2066          */
2067         if (c->devinfo->ver == 42)
2068                 vir_VPMWT(c);
2069 }
2070 
2071 static bool
2072 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
2073                        unsigned bit_size,
2074                        unsigned num_components,
2075                        nir_intrinsic_instr *low,
2076                        nir_intrinsic_instr *high,
2077                        void *data)
2078 {
2079         /* TMU general access only supports 32-bit vectors */
2080         if (bit_size > 32)
2081                 return false;
2082 
2083         if ((bit_size == 8 || bit_size == 16) && num_components > 1)
2084                 return false;
2085 
2086         if (align_mul % 4 != 0 || align_offset % 4 != 0)
2087                 return false;
2088 
2089         /* Vector accesses wrap at 16-byte boundaries so we can't vectorize
2090          * if the resulting vector crosses a 16-byte boundary.
2091          */
2092         assert(util_is_power_of_two_nonzero(align_mul));
2093         align_mul = MIN2(align_mul, 16);
2094         align_offset &= 0xf;
2095         if (16 - align_mul + align_offset + num_components * 4 > 16)
2096                 return false;
2097 
2098         return true;
2099 }
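
/* Worked example of the boundary check above: align_mul = 16,
 * align_offset = 8 and two 32-bit components gives 16 - 16 + 8 + 2 * 4 = 16,
 * which still fits a 16-byte window and is allowed, while align_offset = 12
 * gives 20 > 16 and is rejected because the vector could wrap across a
 * 16-byte boundary.
 */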
2100 
2101 void
2102 v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
2103 {
2104         bool progress;
2105         unsigned lower_flrp =
2106                 (s->options->lower_flrp16 ? 16 : 0) |
2107                 (s->options->lower_flrp32 ? 32 : 0) |
2108                 (s->options->lower_flrp64 ? 64 : 0);
2109 
2110         do {
2111                 progress = false;
2112 
2113                 NIR_PASS(progress, s, nir_split_array_vars, nir_var_function_temp);
2114                 NIR_PASS(progress, s, nir_shrink_vec_array_vars, nir_var_function_temp);
2115                 NIR_PASS(progress, s, nir_opt_deref);
2116 
2117                 NIR_PASS(progress, s, nir_lower_vars_to_ssa);
2118                 if (!s->info.var_copies_lowered) {
2119                         /* Only run this pass if nir_lower_var_copies was not called
2120                          * yet. That would lower away any copy_deref instructions and we
2121                          * don't want to introduce any more.
2122                          */
2123                         NIR_PASS(progress, s, nir_opt_find_array_copies);
2124                 }
2125 
2126                 NIR_PASS(progress, s, nir_opt_copy_prop_vars);
2127                 NIR_PASS(progress, s, nir_opt_dead_write_vars);
2128                 NIR_PASS(progress, s, nir_opt_combine_stores, nir_var_all);
2129 
2130                 NIR_PASS(progress, s, nir_remove_dead_variables,
2131                          (nir_variable_mode)(nir_var_function_temp |
2132                                              nir_var_shader_temp |
2133                                              nir_var_mem_shared),
2134                          NULL);
2135 
2136                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2137                 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
2138                 NIR_PASS(progress, s, nir_copy_prop);
2139                 NIR_PASS(progress, s, nir_opt_remove_phis);
2140                 NIR_PASS(progress, s, nir_opt_dce);
2141                 NIR_PASS(progress, s, nir_opt_dead_cf);
2142                 NIR_PASS(progress, s, nir_opt_cse);
2143                 /* before peephole_select as it can generate 64 bit bcsels */
2144                 NIR_PASS(progress, s, nir_lower_64bit_phis);
2145                 NIR_PASS(progress, s, nir_opt_peephole_select, 0, false, false);
2146                 NIR_PASS(progress, s, nir_opt_peephole_select, 24, true, true);
2147                 NIR_PASS(progress, s, nir_opt_algebraic);
2148                 NIR_PASS(progress, s, nir_opt_constant_folding);
2149 
2150                 NIR_PASS(progress, s, nir_opt_intrinsics);
2151                 NIR_PASS(progress, s, nir_opt_idiv_const, 32);
2152                 NIR_PASS(progress, s, nir_lower_alu);
2153 
2154                 if (nir_opt_loop(s)) {
2155                    progress = true;
2156                    NIR_PASS(progress, s, nir_copy_prop);
2157                    NIR_PASS(progress, s, nir_opt_dce);
2158                 }
2159 
2160                 NIR_PASS(progress, s, nir_opt_conditional_discard);
2161 
2162                 NIR_PASS(progress, s, nir_opt_remove_phis);
2163                 NIR_PASS(progress, s, nir_opt_if, false);
2164                 if (c && !c->disable_gcm) {
2165                         bool local_progress = false;
2166                         NIR_PASS(local_progress, s, nir_opt_gcm, false);
2167                         c->gcm_progress |= local_progress;
2168                         progress |= local_progress;
2169                 }
2170 
2171                 /* Note that vectorization may undo the load/store scalarization
2172                  * pass we run for non 32-bit TMU general load/store by
2173                  * converting, for example, 2 consecutive 16-bit loads into a
2174                  * single 32-bit load. This is fine (and desirable) as long as
2175                  * the resulting 32-bit load meets 32-bit alignment requirements,
2176                  * which mem_vectorize_callback() should be enforcing.
2177                  */
2178                 nir_load_store_vectorize_options vectorize_opts = {
2179                         .modes = nir_var_mem_ssbo | nir_var_mem_ubo |
2180                                  nir_var_mem_push_const | nir_var_mem_shared |
2181                                  nir_var_mem_global,
2182                         .callback = mem_vectorize_callback,
2183                         .robust_modes = 0,
2184                 };
2185                 bool vectorize_progress = false;
2186 
2187 
2188                 /* This requires that we have called
2189                  * nir_lower_vars_to_explicit_types / nir_lower_explicit_io
2190                  * first, which we may not have done yet if we call here too
2191                  * early during NIR pre-processing. We can detect this because
2192                  * in that case we won't have a compile object.
2193                  */
2194                 if (c) {
2195                         NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize,
2196                                  &vectorize_opts);
2197                         if (vectorize_progress) {
2198                                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2199                                 NIR_PASS(progress, s, nir_lower_pack);
2200                                 progress = true;
2201                         }
2202                 }
2203 
2204                 if (lower_flrp != 0) {
2205                         bool lower_flrp_progress = false;
2206 
2207                         NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
2208                                  lower_flrp,
2209                                  false /* always_precise */);
2210                         if (lower_flrp_progress) {
2211                                 NIR_PASS(progress, s, nir_opt_constant_folding);
2212                                 progress = true;
2213                         }
2214 
2215                         /* Nothing should rematerialize any flrps, so we only
2216                          * need to do this lowering once.
2217                          */
2218                         lower_flrp = 0;
2219                 }
2220 
2221                 NIR_PASS(progress, s, nir_opt_undef);
2222                 NIR_PASS(progress, s, nir_lower_undef_to_zero);
2223 
2224                 if (c && !c->disable_loop_unrolling &&
2225                     s->options->max_unroll_iterations > 0) {
2226                        bool local_progress = false;
2227                        NIR_PASS(local_progress, s, nir_opt_loop_unroll);
2228                        c->unrolled_any_loops |= local_progress;
2229                        progress |= local_progress;
2230                 }
2231         } while (progress);
2232 
2233         /* This needs to be outside of the optimization loop, otherwise it
2234          * fights with opt_algebraic optimizing the conversion lowering.
2235          */
2236         NIR_PASS(progress, s, v3d_nir_lower_algebraic);
2237         NIR_PASS(progress, s, nir_opt_cse);
2238 
2239         nir_move_options sink_opts =
2240                 nir_move_const_undef | nir_move_comparisons | nir_move_copies |
2241                 nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform;
2242         NIR_PASS(progress, s, nir_opt_sink, sink_opts);
2243 }
2244 
2245 static int
2246 driver_location_compare(const nir_variable *a, const nir_variable *b)
2247 {
2248         return a->data.driver_location == b->data.driver_location ?
2249                a->data.location_frac - b->data.location_frac :
2250                a->data.driver_location - b->data.driver_location;
2251 }
2252 
2253 static struct qreg
2254 ntq_emit_vpm_read(struct v3d_compile *c, uint32_t num_components)
2255 {
2256         return vir_LDVPMV_IN(c,
2257                              vir_uniform_ui(c, num_components));
2258 }
2259 
2260 static void
2261 ntq_setup_vs_inputs(struct v3d_compile *c)
2262 {
2263         /* Figure out how many components of each vertex attribute the shader
2264          * uses.  Each variable should have been split to individual
2265          * components and unused ones DCEed.  The vertex fetcher will load
2266          * from the start of the attribute to the number of components we
2267          * declare we need in c->vattr_sizes[].
2268          *
2269          * BGRA vertex attributes are a bit special: since we implement these
2270          * as RGBA swapping R/B components we always need at least 3 components
2271          * if component 0 is read.
2272          */
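        /* For example, a vec3 input with location_frac 1 makes its
         * c->vattr_sizes[] entry at least 4 (the fetch covers components 0..3),
         * and a BGRA attribute that reads component 0 bumps the entry to at
         * least 3 even if the shader reads fewer components.
         */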
2273         nir_foreach_shader_in_variable(var, c->s) {
2274                 /* No VS attribute array support. */
2275                 assert(MAX2(glsl_get_length(var->type), 1) == 1);
2276 
2277                 unsigned loc = var->data.driver_location;
2278                 int start_component = var->data.location_frac;
2279                 int num_components = glsl_get_components(var->type);
2280 
2281                 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
2282                                            start_component + num_components);
2283 
2284                 /* Handle BGRA inputs */
2285                 if (start_component == 0 &&
2286                     c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
2287                         c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
2288                 }
2289         }
2290 
2291         uint32_t vpm_components = 0;
2292         bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
2293                                     SYSTEM_VALUE_INSTANCE_ID) ||
2294                         BITSET_TEST(c->s->info.system_values_read,
2295                                     SYSTEM_VALUE_INSTANCE_INDEX);
2296         bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
2297                                      SYSTEM_VALUE_BASE_INSTANCE);
2298         bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
2299                                     SYSTEM_VALUE_VERTEX_ID) ||
2300                         BITSET_TEST(c->s->info.system_values_read,
2301                                     SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
2302 
2303         if (uses_iid)
2304                 c->iid = ntq_emit_vpm_read(c, vpm_components++);
2305 
2306         if (uses_biid)
2307                 c->biid = ntq_emit_vpm_read(c, vpm_components++);
2308 
2309         if (uses_vid)
2310                 c->vid = ntq_emit_vpm_read(c, vpm_components++);
2311 
2312         /* The actual loads will happen directly in nir_intrinsic_load_input
2313          */
2314         return;
2315 }
2316 
2317 static bool
2318 program_reads_point_coord(struct v3d_compile *c)
2319 {
2320         nir_foreach_shader_in_variable(var, c->s) {
2321                 if (util_varying_is_point_coord(var->data.location,
2322                                                 c->fs_key->point_sprite_mask)) {
2323                         return true;
2324                 }
2325         }
2326 
2327         return false;
2328 }
2329 
2330 static void
2331 ntq_setup_gs_inputs(struct v3d_compile *c)
2332 {
2333         nir_sort_variables_with_modes(c->s, driver_location_compare,
2334                                       nir_var_shader_in);
2335 
2336         nir_foreach_shader_in_variable(var, c->s) {
2337                 /* All GS inputs are arrays with as many entries as vertices
2338                  * in the input primitive, but here we only care about the
2339                  * per-vertex input type.
2340                  */
2341                 assert(glsl_type_is_array(var->type));
2342                 const struct glsl_type *type = glsl_get_array_element(var->type);
2343                 unsigned var_len = glsl_count_vec4_slots(type, false, false);
2344                 unsigned loc = var->data.driver_location;
2345 
2346                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2347                                   (loc + var_len) * 4);
2348 
2349                 if (var->data.compact) {
2350                         for (unsigned j = 0; j < var_len; j++) {
2351                                 unsigned input_idx = c->num_inputs++;
2352                                 unsigned loc_frac = var->data.location_frac + j;
2353                                 unsigned loc = var->data.location + loc_frac / 4;
2354                                 unsigned comp = loc_frac % 4;
2355                                 c->input_slots[input_idx] =
2356                                         v3d_slot_from_slot_and_component(loc, comp);
2357                         }
2358                         continue;
2359                 }
2360 
2361                 for (unsigned j = 0; j < var_len; j++) {
2362                         unsigned num_elements =
2363                                 glsl_type_is_struct(glsl_without_array(type)) ?
2364                                 4 : glsl_get_vector_elements(type);
2365                         for (unsigned k = 0; k < num_elements; k++) {
2366                                 unsigned chan = var->data.location_frac + k;
2367                                 unsigned input_idx = c->num_inputs++;
2368                                 struct v3d_varying_slot slot =
2369                                         v3d_slot_from_slot_and_component(var->data.location + j, chan);
2370                                 c->input_slots[input_idx] = slot;
2371                         }
2372                 }
2373         }
2374 }
2375 
2376 
2377 static void
2378 ntq_setup_fs_inputs(struct v3d_compile *c)
2379 {
2380         nir_sort_variables_with_modes(c->s, driver_location_compare,
2381                                       nir_var_shader_in);
2382 
2383         nir_foreach_shader_in_variable(var, c->s) {
2384                 unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2385                 unsigned loc = var->data.driver_location;
2386 
2387                 uint32_t inputs_array_size = c->inputs_array_size;
2388                 uint32_t inputs_array_required_size = (loc + var_len) * 4;
2389                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2390                                   inputs_array_required_size);
2391                 resize_interp_array(c, &c->interp, &inputs_array_size,
2392                                     inputs_array_required_size);
2393 
2394                 if (var->data.location == VARYING_SLOT_POS) {
2395                         emit_fragcoord_input(c, loc);
2396                 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2397                            !c->fs_key->has_gs) {
2398                         /* If the fragment shader reads gl_PrimitiveID and we
2399                          * don't have a geometry shader in the pipeline to write
2400                          * it then we program the hardware to inject it as
2401                          * an implicit varying. Take it from there.
2402                          */
2403                         c->inputs[loc * 4] = c->primitive_id;
2404                 } else if (util_varying_is_point_coord(var->data.location,
2405                                                        c->fs_key->point_sprite_mask)) {
2406                         c->inputs[loc * 4 + 0] = c->point_x;
2407                         c->inputs[loc * 4 + 1] = c->point_y;
2408                 } else if (var->data.compact) {
2409                         for (int j = 0; j < var_len; j++)
2410                                 emit_compact_fragment_input(c, loc, var, j);
2411                 } else if (glsl_type_is_struct(glsl_without_array(var->type))) {
2412                         for (int j = 0; j < var_len; j++) {
2413                            emit_fragment_input(c, loc, var, j, 4);
2414                         }
2415                 } else {
2416                         for (int j = 0; j < var_len; j++) {
2417                                 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2418                         }
2419                 }
2420         }
2421 }
2422 
2423 static void
2424 ntq_setup_outputs(struct v3d_compile *c)
2425 {
2426         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2427                 return;
2428 
2429         nir_foreach_shader_out_variable(var, c->s) {
2430                 assert(glsl_type_is_vector_or_scalar(var->type));
2431                 unsigned loc = var->data.driver_location * 4;
2432 
2433                 for (int i = 0; i < 4 - var->data.location_frac; i++) {
2434                         add_output(c, loc + var->data.location_frac + i,
2435                                    var->data.location,
2436                                    var->data.location_frac + i);
2437                 }
2438 
2439                 switch (var->data.location) {
2440                 case FRAG_RESULT_COLOR:
2441                         for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
2442                                 c->output_color_var[i] = var;
2443                         break;
2444                 case FRAG_RESULT_DATA0:
2445                 case FRAG_RESULT_DATA1:
2446                 case FRAG_RESULT_DATA2:
2447                 case FRAG_RESULT_DATA3:
2448                 case FRAG_RESULT_DATA4:
2449                 case FRAG_RESULT_DATA5:
2450                 case FRAG_RESULT_DATA6:
2451                 case FRAG_RESULT_DATA7:
2452                         c->output_color_var[var->data.location -
2453                                             FRAG_RESULT_DATA0] = var;
2454                         break;
2455                 case FRAG_RESULT_DEPTH:
2456                         c->output_position_index = loc;
2457                         break;
2458                 case FRAG_RESULT_SAMPLE_MASK:
2459                         c->output_sample_mask_index = loc;
2460                         break;
2461                 }
2462         }
2463 }
2464 
2465 /**
2466  * Sets up the mapping from nir_register to struct qreg *.
2467  *
2468  * Each nir_register gets a struct qreg per 32-bit component being stored.
2469  */
2470 static void
2471 ntq_setup_registers(struct v3d_compile *c, nir_function_impl *impl)
2472 {
2473         nir_foreach_reg_decl(decl, impl) {
2474                 unsigned num_components = nir_intrinsic_num_components(decl);
2475                 unsigned array_len = nir_intrinsic_num_array_elems(decl);
2476                 array_len = MAX2(array_len, 1);
2477                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2478                                                   array_len * num_components);
2479 
2480                 nir_def *nir_reg = &decl->def;
2481                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2482 
2483                 for (int i = 0; i < array_len * num_components; i++)
2484                         qregs[i] = vir_get_temp(c);
2485         }
2486 }
2487 
2488 static void
2489 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2490 {
2491         /* XXX perf: Experiment with using immediate loads to avoid having
2492          * these end up in the uniform stream.  Watch out for breaking the
2493          * small immediates optimization in the process!
2494          */
2495         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2496         for (int i = 0; i < instr->def.num_components; i++)
2497                 qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2498 
2499         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2500 }
2501 
2502 static void
2503 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2504 {
2505         unsigned image_index = nir_src_as_uint(instr->src[0]);
2506         bool is_array = nir_intrinsic_image_array(instr);
2507 
2508         assert(nir_src_as_uint(instr->src[1]) == 0);
2509 
2510         ntq_store_def(c, &instr->def, 0,
2511                        vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2512         if (instr->num_components > 1) {
2513                 ntq_store_def(c, &instr->def, 1,
2514                               vir_uniform(c,
2515                                           instr->num_components == 2 && is_array ?
2516                                                   QUNIFORM_IMAGE_ARRAY_SIZE :
2517                                                   QUNIFORM_IMAGE_HEIGHT,
2518                                           image_index));
2519         }
2520         if (instr->num_components > 2) {
2521                 ntq_store_def(c, &instr->def, 2,
2522                               vir_uniform(c,
2523                                           is_array ?
2524                                           QUNIFORM_IMAGE_ARRAY_SIZE :
2525                                           QUNIFORM_IMAGE_DEPTH,
2526                                           image_index));
2527         }
2528 }
2529 
2530 static void
2531 vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2532 {
2533         assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2534 
2535         int rt = nir_src_as_uint(instr->src[0]);
2536         assert(rt < V3D_MAX_DRAW_BUFFERS);
2537 
2538         int sample_index = nir_intrinsic_base(instr);
2539         assert(sample_index < V3D_MAX_SAMPLES);
2540 
2541         int component = nir_intrinsic_component(instr);
2542         assert(component < 4);
2543 
2544         /* We need to emit our TLB reads after we have acquired the scoreboard
2545          * lock, or the GPU will hang. Usually, we do our scoreboard locking on
2546          * the last thread switch to improve parallelism; however, that is only
2547          * guaranteed to happen before the tlb color writes.
2548          *
2549          * To fix that, we make sure we always emit a thread switch before the
2550          * first tlb color read. If that happens to be the last thread switch
2551          * we emit, then everything is fine, but otherwise, if any code after
2552          * this point needs to emit additional thread switches, then we will
2553          * switch the strategy to locking the scoreboard on the first thread
2554          * switch instead -- see vir_emit_thrsw().
2555          */
2556         if (!c->emitted_tlb_load) {
2557                 if (!c->last_thrsw_at_top_level)
2558                         vir_emit_thrsw(c);
2559 
2560                 c->emitted_tlb_load = true;
2561         }
2562 
2563         struct qreg *color_reads_for_sample =
2564                 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2565 
2566         if (color_reads_for_sample[component].file == QFILE_NULL) {
2567                 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2568                 int num_components =
2569                         util_format_get_nr_components(rt_format);
2570 
2571                 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2572                 if (swap_rb)
2573                         num_components = MAX2(num_components, 3);
2574 
2575                 nir_variable *var = c->output_color_var[rt];
2576                 enum glsl_base_type type = glsl_get_base_type(var->type);
2577 
2578                 bool is_int_format = type == GLSL_TYPE_INT ||
2579                                      type == GLSL_TYPE_UINT;
2580 
2581                 bool is_32b_tlb_format = is_int_format ||
2582                                          (c->fs_key->f32_color_rb & (1 << rt));
2583 
2584                 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2585 
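                /* Build the TLB read config for this render target: sample
                 * mode, render target index and data format. The word starts
                 * at 0xffffff00; a final value of 0xffffffff would mean no
                 * config write is needed at all (see the conf != 0xffffffff
                 * checks below).
                 */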
2586                 uint32_t conf = 0xffffff00;
2587                 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2588                                           TLB_SAMPLE_MODE_PER_PIXEL;
2589                 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
2590 
2591                 if (is_32b_tlb_format) {
2592                         /* The F32 vs I32 distinction was dropped in 4.2. */
2593                         conf |= (c->devinfo->ver < 42 && is_int_format) ?
2594                                 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2595 
2596                         conf |= ((num_components - 1) <<
2597                                  TLB_VEC_SIZE_MINUS_1_SHIFT);
2598                 } else {
2599                         conf |= TLB_TYPE_F16_COLOR;
2600                         conf |= TLB_F16_SWAP_HI_LO;
2601 
2602                         if (num_components >= 3)
2603                                 conf |= TLB_VEC_SIZE_4_F16;
2604                         else
2605                                 conf |= TLB_VEC_SIZE_2_F16;
2606                 }
2607 
2608 
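                /* Read every sample now and cache the results in
                 * c->color_reads, so later reads of this render target can
                 * reuse them without touching the TLB again.
                 */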
2609                 for (int i = 0; i < num_samples; i++) {
2610                         struct qreg r, g, b, a;
2611                         if (is_32b_tlb_format) {
2612                                 r = conf != 0xffffffff && i == 0 ?
2613                                         vir_TLBU_COLOR_READ(c, conf) :
2614                                         vir_TLB_COLOR_READ(c);
2615                                 if (num_components >= 2)
2616                                         g = vir_TLB_COLOR_READ(c);
2617                                 if (num_components >= 3)
2618                                         b = vir_TLB_COLOR_READ(c);
2619                                 if (num_components >= 4)
2620                                         a = vir_TLB_COLOR_READ(c);
2621                         } else {
2622                                 struct qreg rg = conf != 0xffffffff && i == 0 ?
2623                                         vir_TLBU_COLOR_READ(c, conf) :
2624                                         vir_TLB_COLOR_READ(c);
2625                                 r = vir_FMOV(c, rg);
2626                                 vir_set_unpack(c->defs[r.index], 0,
2627                                                V3D_QPU_UNPACK_L);
2628                                 g = vir_FMOV(c, rg);
2629                                 vir_set_unpack(c->defs[g.index], 0,
2630                                                V3D_QPU_UNPACK_H);
2631 
2632                                 if (num_components > 2) {
2633                                     struct qreg ba = vir_TLB_COLOR_READ(c);
2634                                     b = vir_FMOV(c, ba);
2635                                     vir_set_unpack(c->defs[b.index], 0,
2636                                                    V3D_QPU_UNPACK_L);
2637                                     a = vir_FMOV(c, ba);
2638                                     vir_set_unpack(c->defs[a.index], 0,
2639                                                    V3D_QPU_UNPACK_H);
2640                                 }
2641                         }
2642 
2643                         struct qreg *color_reads =
2644                                 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2645 
2646                         color_reads[0] = swap_rb ? b : r;
2647                         if (num_components >= 2)
2648                                 color_reads[1] = g;
2649                         if (num_components >= 3)
2650                                 color_reads[2] = swap_rb ? r : b;
2651                         if (num_components >= 4)
2652                                 color_reads[3] = a;
2653                 }
2654         }
2655 
2656         assert(color_reads_for_sample[component].file != QFILE_NULL);
2657         ntq_store_def(c, &instr->def, 0,
2658                       vir_MOV(c, color_reads_for_sample[component]));
2659 }
2660 
2661 static bool
2662 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr);
2663 
2664 static bool
2665 try_emit_uniform(struct v3d_compile *c,
2666                  int offset,
2667                  int num_components,
2668                  nir_def *def,
2669                  enum quniform_contents contents)
2670 {
2671         /* Even though ldunif is strictly 32-bit we can still use it
2672          * to load scalar 8-bit/16-bit uniforms so long as their offset
2673          * is 32-bit aligned. In this case, ldunif would still load
2674          * 32-bit into the destination with the 8-bit/16-bit uniform
2675          * data in the LSB and garbage in the MSB, but that is fine
2676          * because we should only be accessing the valid bits of the
2677          * destination.
2678          *
2679          * FIXME: if in the future we improve our register allocator to
2680          * pack 2 16-bit variables in the MSB and LSB of the same
2681          * register then this optimization would not be valid as is,
2682          * since the load clobbers the MSB.
2683          */
2684         if (offset % 4 != 0)
2685                 return false;
2686 
2687         /* We need dwords */
2688         offset = offset / 4;
2689 
2690         for (int i = 0; i < num_components; i++) {
2691                 ntq_store_def(c, def, i, vir_uniform(c, contents, offset + i));
2692         }
2693 
2694         return true;
2695 }
2696 
2697 static void
2698 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2699 {
2700         /* We scalarize general TMU access for anything that is not 32-bit. */
2701         assert(instr->def.bit_size == 32 ||
2702                instr->num_components == 1);
2703 
2704         /* Try to emit ldunif if possible, otherwise fallback to general TMU */
2705         if (nir_src_is_const(instr->src[0])) {
2706                 int offset = (nir_intrinsic_base(instr) +
2707                              nir_src_as_uint(instr->src[0]));
2708 
2709                 if (try_emit_uniform(c, offset, instr->num_components,
2710                                      &instr->def, QUNIFORM_UNIFORM)) {
2711                         return;
2712                 }
2713         }
2714 
2715         if (!ntq_emit_load_unifa(c, instr)) {
2716                 ntq_emit_tmu_general(c, instr, false, false);
2717                 c->has_general_tmu_load = true;
2718         }
2719 }
2720 
2721 static bool
2722 ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
2723 {
2724         if (c->compiler->max_inline_uniform_buffers <= 0)
2725                 return false;
2726 
2727         /* Regular UBOs start after inline UBOs */
2728         uint32_t index = nir_src_as_uint(instr->src[0]);
2729         if (index >= c->compiler->max_inline_uniform_buffers)
2730                 return false;
2731 
2732         /* We scalarize general TMU access for anything that is not 32-bit */
2733         assert(instr->def.bit_size == 32 ||
2734                instr->num_components == 1);
2735 
2736         if (nir_src_is_const(instr->src[1])) {
2737                 int offset = nir_src_as_uint(instr->src[1]);
2738                 if (try_emit_uniform(c, offset, instr->num_components,
2739                                      &instr->def,
2740                                      QUNIFORM_INLINE_UBO_0 + index)) {
2741                         return true;
2742                 }
2743         }
2744 
2745         /* Fallback to regular UBO load */
2746         return false;
2747 }
2748 
2749 static void
2750 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2751 {
2752         /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2753          *
2754          * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even
2755          * if we don't support non-uniform offsets because we also set the
2756          * lower_all_io_to_temps option in the NIR compiler. This ensures that
2757          * any indirect indexing on in/out variables is turned into indirect
2758          * indexing on temporary variables instead, that we handle by lowering
2759          * to scratch. If we implement non-uniform offset here we might be able
2760          * to avoid the temp and scratch lowering, which involves copying from
2761          * the input to the temp variable, possibly making code more optimal.
2762          */
2763         unsigned offset =
2764                 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2765 
2766         if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
2767                /* Emit the LDVPM directly now, rather than at the top
2768                 * of the shader like we did for V3D 3.x (which needs
2769                 * vpmsetup when not just taking the next offset).
2770                 *
2771                 * Note that delaying like this may introduce stalls,
2772                 * as LDVPMV takes a minimum of 1 instruction but may
2773                 * be slower if the VPM unit is busy with another QPU.
2774                 */
2775                int index = 0;
2776                if (BITSET_TEST(c->s->info.system_values_read,
2777                                SYSTEM_VALUE_INSTANCE_ID)) {
2778                       index++;
2779                }
2780                if (BITSET_TEST(c->s->info.system_values_read,
2781                                SYSTEM_VALUE_BASE_INSTANCE)) {
2782                       index++;
2783                }
2784                if (BITSET_TEST(c->s->info.system_values_read,
2785                                SYSTEM_VALUE_VERTEX_ID)) {
2786                       index++;
2787                }
2788 
2789                for (int i = 0; i < offset; i++) {
2790                       /* GFXH-1602: if any builtins (vid, iid, etc) are read then
2791                        * attribute 0 must be active (size > 0). When we hit this,
2792                        * the driver is expected to program attribute 0 to have a
2793                        * size of 1, so here we need to add that.
2794                        */
2795                       if (i == 0 && c->vs_key->is_coord &&
2796                           c->vattr_sizes[i] == 0 && index > 0) {
2797                          index++;
2798                       } else {
2799                          index += c->vattr_sizes[i];
2800                       }
2801                }
2802 
2803                index += nir_intrinsic_component(instr);
2804                for (int i = 0; i < instr->num_components; i++) {
2805                       struct qreg vpm_offset = vir_uniform_ui(c, index++);
2806                       ntq_store_def(c, &instr->def, i,
2807                                     vir_LDVPMV_IN(c, vpm_offset));
2808                 }
2809         } else {
2810                 for (int i = 0; i < instr->num_components; i++) {
2811                         int comp = nir_intrinsic_component(instr) + i;
2812                         struct qreg input = c->inputs[offset * 4 + comp];
2813                         ntq_store_def(c, &instr->def, i, vir_MOV(c, input));
2814 
2815                         if (c->s->info.stage == MESA_SHADER_FRAGMENT &&
2816                             input.file == c->payload_z.file &&
2817                             input.index == c->payload_z.index) {
2818                                 c->reads_z = true;
2819                         }
2820                 }
2821         }
2822 }
2823 
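/* Records the components of a per-sample TLB color write in c->sample_colors,
 * which holds 4 components per (render target, sample) pair; the actual TLB
 * writes are emitted at the end of the program.
 */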
2824 static void
2825 ntq_emit_per_sample_color_write(struct v3d_compile *c,
2826                                 nir_intrinsic_instr *instr)
2827 {
2828         assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2829 
2830         unsigned rt = nir_src_as_uint(instr->src[1]);
2831         assert(rt < V3D_MAX_DRAW_BUFFERS);
2832 
2833         unsigned sample_idx = nir_intrinsic_base(instr);
2834         assert(sample_idx < V3D_MAX_SAMPLES);
2835 
2836         unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2837         for (int i = 0; i < instr->num_components; i++) {
2838                 c->sample_colors[offset + i] =
2839                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2840         }
2841 }
2842 
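/* Fragment shader color writes just stash the output values in c->outputs;
 * the TLB writes themselves are emitted once at the end of the program.
 */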
2843 static void
2844 ntq_emit_color_write(struct v3d_compile *c,
2845                      nir_intrinsic_instr *instr)
2846 {
2847         unsigned offset = (nir_intrinsic_base(instr) +
2848                            nir_src_as_uint(instr->src[1])) * 4 +
2849                           nir_intrinsic_component(instr);
2850         for (int i = 0; i < instr->num_components; i++) {
2851                 c->outputs[offset + i] =
2852                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2853         }
2854 }
2855 
2856 static void
2857 emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2858 {
2859         assert(instr->num_components == 1);
2860 
2861         struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2862 
2863         uint32_t base_offset = nir_intrinsic_base(instr);
2864 
2865         if (base_offset)
2866                 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2867 
2868         /* Usually, for VS or FS, we only emit outputs once at program end so
2869          * our VPM writes are never in non-uniform control flow, but this
2870          * is not true for GS, where we are emitting multiple vertices.
2871          */
2872         if (vir_in_nonuniform_control_flow(c)) {
2873                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2874                            V3D_QPU_PF_PUSHZ);
2875         }
2876 
2877         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2878 
2879         /* The offset isn’t necessarily dynamically uniform for a geometry
2880          * shader. This can happen if the shader sometimes doesn’t emit one of
2881          * the vertices. In that case subsequent vertices will be written to
2882          * different offsets in the VPM and we need to use the scatter write
2883          * instruction to have a different offset for each lane.
2884          */
2885         bool is_uniform_offset =
2886                 !vir_in_nonuniform_control_flow(c) &&
2887                 !nir_src_is_divergent(instr->src[1]);
2888         vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2889 
2890         if (vir_in_nonuniform_control_flow(c)) {
2891                 struct qinst *last_inst =
2892                         (struct qinst *)c->cur_block->instructions.prev;
2893                 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2894         }
2895 }
2896 
2897 static void
2898 emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2899 {
2900         assert(c->s->info.stage == MESA_SHADER_VERTEX);
2901         assert(instr->num_components == 1);
2902 
2903         uint32_t base = nir_intrinsic_base(instr);
2904         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2905 
2906         if (nir_src_is_const(instr->src[1])) {
2907                 vir_VPM_WRITE(c, val,
2908                               base + nir_src_as_uint(instr->src[1]));
2909         } else {
2910                 struct qreg offset = vir_ADD(c,
2911                                              ntq_get_src(c, instr->src[1], 1),
2912                                              vir_uniform_ui(c, base));
2913                 bool is_uniform_offset =
2914                         !vir_in_nonuniform_control_flow(c) &&
2915                         !nir_src_is_divergent(instr->src[1]);
2916                 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2917         }
2918 }
2919 
2920 static void
2921 ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2922 {
2923         if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2924                ntq_emit_color_write(c, instr);
2925         else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2926                emit_store_output_gs(c, instr);
2927         else
2928                emit_store_output_vs(c, instr);
2929 }
2930 
2931 /**
2932  * This implementation is based on v3d_sample_{x,y}_offset() from
2933  * v3d_sample_offset.h.
2934  */
2935 static void
2936 ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2937                       struct qreg *sx, struct qreg *sy)
2938 {
2939         sample_idx = vir_ITOF(c, sample_idx);
2940 
2941         struct qreg offset_x =
2942                 vir_FADD(c, vir_uniform_f(c, -0.125f),
2943                             vir_FMUL(c, sample_idx,
2944                                         vir_uniform_f(c, 0.5f)));
2945         vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2946                                     vir_uniform_f(c, 2.0f), sample_idx),
2947                    V3D_QPU_PF_PUSHC);
2948         offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2949                               vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2950                               offset_x);
2951 
2952         struct qreg offset_y =
2953                    vir_FADD(c, vir_uniform_f(c, -0.375f),
2954                                vir_FMUL(c, sample_idx,
2955                                            vir_uniform_f(c, 0.25f)));
2956         *sx = offset_x;
2957         *sy = offset_y;
2958 }
2959 
2960 /**
2961  * This implementation is based on get_centroid_offset() from fep.c.
2962  */
2963 static void
2964 ntq_get_barycentric_centroid(struct v3d_compile *c,
2965                              struct qreg *out_x,
2966                              struct qreg *out_y)
2967 {
2968         struct qreg sample_mask;
2969         if (c->output_sample_mask_index != -1)
2970                 sample_mask = c->outputs[c->output_sample_mask_index];
2971         else
2972                 sample_mask = vir_MSF(c);
2973 
2974         struct qreg i0 = vir_uniform_ui(c, 0);
2975         struct qreg i1 = vir_uniform_ui(c, 1);
2976         struct qreg i2 = vir_uniform_ui(c, 2);
2977         struct qreg i3 = vir_uniform_ui(c, 3);
2978         struct qreg i4 = vir_uniform_ui(c, 4);
2979         struct qreg i8 = vir_uniform_ui(c, 8);
2980 
2981         /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
2982         struct qreg F = vir_uniform_ui(c, 0);
2983         struct qreg T = vir_uniform_ui(c, ~0);
2984         struct qreg s0 = vir_AND(c, sample_mask, i1);
2985         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2986         s0 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2987         struct qreg s1 = vir_AND(c, sample_mask, i2);
2988         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2989         s1 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2990         struct qreg s2 = vir_AND(c, sample_mask, i4);
2991         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2992         s2 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2993         struct qreg s3 = vir_AND(c, sample_mask, i8);
2994         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
2995         s3 = vir_SEL(c, V3D_QPU_COND_IFNA, T, F);
2996 
2997         /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
2998         struct qreg sample_idx = i3;
2999         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
3000         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
3001         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
3002         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
3003         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
3004         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
3005 
3006         /* Get offset at selected sample index */
3007         struct qreg offset_x, offset_y;
3008         ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3009 
3010         /* Select pixel center [offset=(0,0)] if two opposing samples (or none)
3011          * are selected.
3012          */
3013         struct qreg s0_and_s3 = vir_AND(c, s0, s3);
3014         struct qreg s1_and_s2 = vir_AND(c, s1, s2);
3015 
3016         struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
3017         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
3018         use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
3019         use_center = vir_OR(c, use_center, s0_and_s3);
3020         use_center = vir_OR(c, use_center, s1_and_s2);
3021 
3022         struct qreg zero = vir_uniform_f(c, 0.0f);
3023         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
3024         offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
3025         offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
3026 
3027         *out_x = offset_x;
3028         *out_y = offset_y;
3029 }
3030 
3031 static struct qreg
3032 ntq_emit_load_interpolated_input(struct v3d_compile *c,
3033                                  struct qreg p,
3034                                  struct qreg C,
3035                                  struct qreg offset_x,
3036                                  struct qreg offset_y,
3037                                  unsigned mode)
3038 {
3039         if (mode == INTERP_MODE_FLAT)
3040                 return C;
3041 
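        /* Adjust the varying to the requested offset from the pixel center
         * using its screen-space derivatives (FDX/FDY), then add back the
         * constant term C. For perspective-correct modes the W payload is
         * adjusted the same way and multiplied in before adding C.
         */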
3042         struct qreg sample_offset_x =
3043                 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3044         struct qreg sample_offset_y =
3045                 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3046 
3047         struct qreg scaleX =
3048                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
3049                             offset_x);
3050         struct qreg scaleY =
3051                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
3052                             offset_y);
3053 
3054         struct qreg pInterp =
3055                 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
3056                                            vir_FMUL(c, vir_FDY(c, p), scaleY)));
3057 
3058         if (mode == INTERP_MODE_NOPERSPECTIVE)
3059                 return vir_FADD(c, pInterp, C);
3060 
3061         struct qreg w = c->payload_w;
3062         struct qreg wInterp =
3063                 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
3064                                            vir_FMUL(c, vir_FDY(c, w), scaleY)));
3065 
3066         return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
3067 }
3068 
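/* Emits a single ldunifa. Passing a NULL result still emits the instruction,
 * which is useful to advance the implicit unifa address; either way the
 * 4-byte auto-increment is tracked in c->current_unifa_offset.
 */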
3069 static void
3070 emit_ldunifa(struct v3d_compile *c, struct qreg *result)
3071 {
3072         struct qinst *ldunifa =
3073                 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
3074         ldunifa->qpu.sig.ldunifa = true;
3075         if (result)
3076                 *result = vir_emit_def(c, ldunifa);
3077         else
3078                 vir_emit_nondef(c, ldunifa);
3079         c->current_unifa_offset += 4;
3080 }
3081 
3082 /* Checks if the value of a nir src is derived from a nir register */
3083 static bool
3084 nir_src_derived_from_reg(nir_src src)
3085 {
3086         nir_def *def = src.ssa;
3087         if (nir_load_reg_for_def(def))
3088                 return true;
3089 
3090         nir_instr *parent = def->parent_instr;
3091         switch (parent->type) {
3092         case nir_instr_type_alu: {
3093                 nir_alu_instr *alu = nir_instr_as_alu(parent);
3094                 int num_srcs = nir_op_infos[alu->op].num_inputs;
3095                 for (int i = 0; i < num_srcs; i++) {
3096                         if (nir_src_derived_from_reg(alu->src[i].src))
3097                                 return true;
3098                 }
3099                 return false;
3100         }
3101         case nir_instr_type_intrinsic: {
3102                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
3103                 int num_srcs = nir_intrinsic_infos[intr->intrinsic].num_srcs;
3104                 for (int i = 0; i < num_srcs; i++) {
3105                         if (nir_src_derived_from_reg(intr->src[i]))
3106                                 return true;
3107                 }
3108                 return false;
3109         }
3110         case nir_instr_type_load_const:
3111         case nir_instr_type_undef:
3112                 return false;
3113         default:
3114                 /* By default we assume it may come from a register, the above
3115                  * cases should be able to handle the majority of situations
3116                  * though.
3117                  */
3118                 return true;
3119         };
3120 }
3121 
3122 static bool
3123 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
3124 {
3125         assert(instr->intrinsic == nir_intrinsic_load_ubo ||
3126                instr->intrinsic == nir_intrinsic_load_ssbo ||
3127                instr->intrinsic == nir_intrinsic_load_uniform);
3128 
3129         bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
3130         bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
3131         bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo;
3132 
3133         /* Every ldunifa auto-increments the unifa address by 4 bytes, so our
3134          * current unifa offset is 4 bytes ahead of the offset of the last load.
3135          */
3136         static const int32_t max_unifa_skip_dist =
3137                 MAX_UNIFA_SKIP_DISTANCE - 4;
3138 
3139         /* We can only use unifa if the offset is uniform */
3140         nir_src offset = is_uniform ? instr->src[0] : instr->src[1];
3141         if (nir_src_is_divergent(offset))
3142                 return false;
3143 
3144         /* Emitting loads from unifa may not be safe under non-uniform control
3145          * flow. It seems the address that is used to write to the unifa
3146          * register is taken from the first lane and if that lane is disabled
3147          * by control flow then the value we read may be bogus and lead to
3148          * invalid memory accesses on follow-up ldunifa instructions. However,
3149          * ntq_store_def only emits conditional writes for nir registers, so
3150          * as long as we can be certain that the offset isn't derived from a
3151          * load_reg we should be fine.
3152          *
3153          * The following CTS test can be used to trigger the problem, which
3154          * causes a GMP violation in the sim without this check:
3155          * dEQP-VK.subgroups.ballot_broadcast.graphics.subgroupbroadcastfirst_int
3156          */
3157         if (vir_in_nonuniform_control_flow(c) &&
3158             nir_src_derived_from_reg(offset)) {
3159                 return false;
3160         }
3161 
3162         /* We can only use unifa with SSBOs if they are read-only. Otherwise
3163          * ldunifa won't see the shader writes to that address (possibly
3164          * because ldunifa doesn't read from the L2T cache).
3165          */
3166         if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
3167                 return false;
3168 
3169         /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
3170          * that may have been written to scratch using the TMU.
3171          */
3172         bool dynamic_src = !nir_src_is_const(offset);
3173         if (is_uniform && dynamic_src && c->s->scratch_size > 0)
3174                 return false;
3175 
3176         uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
3177         if (is_uniform)
3178                 const_offset += nir_intrinsic_base(instr);
3179 
3180         /* ldunifa is a 32-bit load instruction so we can only use it with
3181          * 32-bit aligned addresses. We always produce 32-bit aligned addresses
3182          * except for types smaller than 32-bit, so in these cases we can only
3183          * use ldunifa if we can verify alignment, which we can only do for
3184          * loads with a constant offset.
3185          */
3186         uint32_t bit_size = instr->def.bit_size;
3187         uint32_t value_skips = 0;
3188         if (bit_size < 32) {
3189                 if (dynamic_src) {
3190                         return false;
3191                 } else if (const_offset % 4 != 0) {
3192                         /* If we are loading from an unaligned offset, fix
3193                          * alignment and skip over unused elements in result.
3194                          */
3195                         value_skips = (const_offset % 4) / (bit_size / 8);
3196                         const_offset &= ~0x3;
3197                 }
3198         }
3199 
3200         assert((bit_size == 32 && value_skips == 0) ||
3201                (bit_size == 16 && value_skips <= 1) ||
3202                (bit_size == 8  && value_skips <= 3));
3203 
3204         /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
3205          * constants.
3206          */
3207         uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);
3208 
3209         /* QUNIFORM_UBO_ADDR takes a UBO index shifted up by 1 since we use
3210          * index 0 for Gallium's constant buffer (GL) or push constants
3211          * (Vulkan).
3212          */
3213         if (is_ubo)
3214                 index++;
3215 
3216         /* We can only keep track of the last unifa address we used with
3217          * constant offset loads. If the new load targets the same buffer and
3218          * is close enough to the previous load, we can skip the unifa register
3219          * write by emitting dummy ldunifa instructions to update the unifa
3220          * address.
3221          */
3222         bool skip_unifa = false;
3223         uint32_t ldunifa_skips = 0;
3224         if (dynamic_src) {
3225                 c->current_unifa_block = NULL;
3226         } else if (c->cur_block == c->current_unifa_block &&
3227                    c->current_unifa_is_ubo == !is_ssbo &&
3228                    c->current_unifa_index == index &&
3229                    c->current_unifa_offset <= const_offset &&
3230                    c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
3231                 skip_unifa = true;
3232                 ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
3233         } else {
3234                 c->current_unifa_block = c->cur_block;
3235                 c->current_unifa_is_ubo = !is_ssbo;
3236                 c->current_unifa_index = index;
3237                 c->current_unifa_offset = const_offset;
3238         }
3239 
3240         if (!skip_unifa) {
3241                 struct qreg base_offset = !is_ssbo ?
3242                         vir_uniform(c, QUNIFORM_UBO_ADDR,
3243                                     v3d_unit_data_create(index, const_offset)) :
3244                         vir_uniform(c, QUNIFORM_SSBO_OFFSET, index);
3245 
3246                 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
3247                 if (!dynamic_src) {
3248                         if (!is_ssbo) {
3249                                 /* Avoid the extra MOV to UNIFA by making
3250                                  * ldunif load directly into it. We can't
3251                                  * do this if we have not actually emitted
3252                                  * ldunif and are instead reusing a previous
3253                                  * one.
3254                                  */
3255                                 struct qinst *inst =
3256                                         (struct qinst *)c->cur_block->instructions.prev;
3257                                 if (inst == c->defs[base_offset.index]) {
3258                                    inst->dst = unifa;
3259                                    c->defs[base_offset.index] = NULL;
3260                                 } else {
3261                                    vir_MOV_dest(c, unifa, base_offset);
3262                                 }
3263                         } else {
3264                                 vir_ADD_dest(c, unifa, base_offset,
3265                                              vir_uniform_ui(c, const_offset));
3266                         }
3267                 } else {
3268                         vir_ADD_dest(c, unifa, base_offset,
3269                                      ntq_get_src(c, offset, 0));
3270                 }
3271         } else {
3272                 for (int i = 0; i < ldunifa_skips; i++)
3273                         emit_ldunifa(c, NULL);
3274         }
3275 
3276         uint32_t num_components = nir_intrinsic_dest_components(instr);
3277         for (uint32_t i = 0; i < num_components; ) {
3278                 struct qreg data;
3279                 emit_ldunifa(c, &data);
3280 
3281                 if (bit_size == 32) {
3282                         assert(value_skips == 0);
3283                         ntq_store_def(c, &instr->def, i, vir_MOV(c, data));
3284                         i++;
3285                 } else {
3286                         assert((bit_size == 16 && value_skips <= 1) ||
3287                                (bit_size ==  8 && value_skips <= 3));
3288 
3289                         /* If we have any values to skip, shift to the first
3290                          * valid value in the ldunifa result.
3291                          */
3292                         if (value_skips > 0) {
3293                                 data = vir_SHR(c, data,
3294                                                vir_uniform_ui(c, bit_size *
3295                                                                  value_skips));
3296                         }
3297 
3298                         /* Check how many valid components we have, discounting
3299                          * the components we need to skip.
3300                          */
3301                         uint32_t valid_count = (32 / bit_size) - value_skips;
3302                         assert((bit_size == 16 && valid_count <= 2) ||
3303                                (bit_size ==  8 && valid_count <= 4));
3304                         assert(valid_count > 0);
3305 
3306                         /* Process the valid components */
3307                         do {
3308                                 struct qreg tmp;
3309                                 uint32_t mask = (1 << bit_size) - 1;
3310                                 tmp = vir_AND(c, vir_MOV(c, data),
3311                                               vir_uniform_ui(c, mask));
3312                                 ntq_store_def(c, &instr->def, i,
3313                                               vir_MOV(c, tmp));
3314                                 i++;
3315                                 valid_count--;
3316 
3317                                 /* Shift to next component */
3318                                 if (i < num_components && valid_count > 0) {
3319                                         data = vir_SHR(c, data,
3320                                                        vir_uniform_ui(c, bit_size));
3321                                 }
3322                         } while (i < num_components && valid_count > 0);
3323                 }
3324         }
3325 
3326         return true;
3327 }
3328 
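/* The local invocation index lives in the top local_invocation_index_bits
 * bits of cs_payload[1]; shift it down so it ends up in the low bits.
 */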
3329 static inline struct qreg
3330 emit_load_local_invocation_index(struct v3d_compile *c)
3331 {
3332         return vir_SHR(c, c->cs_payload[1],
3333                        vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
3334 }
3335 
3336 /* For the purposes of reduction operations (ballot, alleq, allfeq, bcastf) in
3337  * fragment shaders a lane is considered active if any sample flags are set
3338  * for *any* lane in the same quad; however, we still need to ensure that
3339  * terminated lanes (OpTerminate) are not included. Further, we also need to
3340  * disable lanes that may be disabled because of non-uniform control
3341  * flow.
3342  */
3343 static enum v3d_qpu_cond
3344 setup_subgroup_control_flow_condition(struct v3d_compile *c)
3345 {
3346         assert(c->s->info.stage == MESA_SHADER_FRAGMENT ||
3347                c->s->info.stage == MESA_SHADER_COMPUTE);
3348 
3349         enum v3d_qpu_cond cond = V3D_QPU_COND_NONE;
3350 
3351          * not included. We can identify these lanes by comparing the initial
3352          * not included. We can identify these lanes by comparing the inital
3353          * sample mask with the current. This fixes:
3354          * dEQP-VK.spirv_assembly.instruction.terminate_invocation.terminate.subgroup_*
3355          */
3356         if (c->s->info.stage == MESA_SHADER_FRAGMENT && c->emitted_discard) {
3357                 vir_set_pf(c, vir_AND_dest(c, vir_nop_reg(), c->start_msf,
3358                                            vir_NOT(c, vir_XOR(c, c->start_msf,
3359                                                               vir_MSF(c)))),
3360                            V3D_QPU_PF_PUSHZ);
3361                 cond = V3D_QPU_COND_IFNA;
3362         }
3363 
3364         /* If we are in non-uniform control-flow update the condition to
3365          * also limit lanes to those in the current execution mask.
3366          */
3367         if (vir_in_nonuniform_control_flow(c)) {
3368                 if (cond == V3D_QPU_COND_IFNA) {
3369                         vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3370                                    V3D_QPU_UF_NORNZ);
3371                 } else {
3372                         assert(cond == V3D_QPU_COND_NONE);
3373                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3374                                    V3D_QPU_PF_PUSHZ);
3375                 }
3376                 cond = V3D_QPU_COND_IFA;
3377         }
3378 
3379         return cond;
3380 }
3381 
3382 static void
3383 emit_compute_barrier(struct v3d_compile *c)
3384 {
3385         /* Ensure we flag the use of the control barrier. NIR's
3386          * gather info pass usually takes care of this, but that
3387          * requires that we call that pass after any other pass that
3388          * may emit a control barrier, so this is safer.
3389          */
3390         c->s->info.uses_control_barrier = true;
3391 
3392         /* Emit a TSY op to get all invocations in the workgroup
3393          * (actually supergroup) to block until the last
3394          * invocation reaches the TSY op.
3395          */
3396         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB));
3397 }
3398 
3399 static void
3400 emit_barrier(struct v3d_compile *c)
3401 {
3402         struct qreg eidx = vir_EIDX(c);
3403 
3404         /* The config for the TSY op should be set up like this:
3405          * - Lane 0: Quorum
3406          * - Lane 2: TSO id
3407          * - Lane 3: TSY opcode
3408          */
3409 
3410         /* Lane 0: we want to synchronize across one subgroup. Here we write to
3411          * all lanes unconditionally and will overwrite other lanes below.
3412          */
3413         struct qreg tsy_conf = vir_uniform_ui(c, 1);
3414 
3415          * Lane 2: TSO id. We choose a general purpose TSO (id=0..63) using the
3416          * current QPU index and thread index to ensure we get a unique one for
3417          * this group of invocations in this core.
3418          */
3419         struct qreg tso_id =
3420                 vir_AND(c, vir_TIDX(c), vir_uniform_ui(c, 0x0000003f));
3421         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 2)),
3422                    V3D_QPU_PF_PUSHZ);
3423         vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tso_id);
3424 
3425         /* Lane 3: TSY opcode (set_quorum_wait_inc_check) */
3426         struct qreg tsy_op = vir_uniform_ui(c, 16);
3427         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), eidx, vir_uniform_ui(c, 3)),
3428                    V3D_QPU_PF_PUSHZ);
3429         vir_MOV_cond(c, V3D_QPU_COND_IFA, tsy_conf, tsy_op);
3430 
3431         /* Emit TSY sync */
3432         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_SYNCB), tsy_conf);
3433 }
3434 
3435 static void
3436 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
3437 {
3438         switch (instr->intrinsic) {
3439         case nir_intrinsic_decl_reg:
3440         case nir_intrinsic_load_reg:
3441         case nir_intrinsic_store_reg:
3442                 break; /* Ignore these */
3443 
3444         case nir_intrinsic_load_uniform:
3445                 ntq_emit_load_uniform(c, instr);
3446                 break;
3447 
3448         case nir_intrinsic_load_global:
3449         case nir_intrinsic_load_global_constant:
3450                 ntq_emit_tmu_general(c, instr, false, true);
3451                 c->has_general_tmu_load = true;
3452                 break;
3453 
3454         case nir_intrinsic_load_ubo:
3455                 if (ntq_emit_inline_ubo_load(c, instr))
3456                         break;
3457                 FALLTHROUGH;
3458         case nir_intrinsic_load_ssbo:
3459                 if (!ntq_emit_load_unifa(c, instr)) {
3460                         ntq_emit_tmu_general(c, instr, false, false);
3461                         c->has_general_tmu_load = true;
3462                 }
3463                 break;
3464 
3465         case nir_intrinsic_store_ssbo:
3466         case nir_intrinsic_ssbo_atomic:
3467         case nir_intrinsic_ssbo_atomic_swap:
3468                 ntq_emit_tmu_general(c, instr, false, false);
3469                 break;
3470 
3471         case nir_intrinsic_store_global:
3472         case nir_intrinsic_global_atomic:
3473         case nir_intrinsic_global_atomic_swap:
3474                 ntq_emit_tmu_general(c, instr, false, true);
3475                 break;
3476 
3477         case nir_intrinsic_shared_atomic:
3478         case nir_intrinsic_shared_atomic_swap:
3479         case nir_intrinsic_store_shared:
3480         case nir_intrinsic_store_scratch:
3481                 ntq_emit_tmu_general(c, instr, true, false);
3482                 break;
3483 
3484         case nir_intrinsic_load_scratch:
3485         case nir_intrinsic_load_shared:
3486                 ntq_emit_tmu_general(c, instr, true, false);
3487                 c->has_general_tmu_load = true;
3488                 break;
3489 
3490         case nir_intrinsic_image_store:
3491         case nir_intrinsic_image_atomic:
3492         case nir_intrinsic_image_atomic_swap:
3493                 v3d_vir_emit_image_load_store(c, instr);
3494                 break;
3495 
3496         case nir_intrinsic_image_load:
3497                 v3d_vir_emit_image_load_store(c, instr);
3498                 /* Not really a general TMU load, but we only use this flag
3499                  * for NIR scheduling and we do schedule these under the same
3500                  * policy as general TMU.
3501                  */
3502                 c->has_general_tmu_load = true;
3503                 break;
3504 
3505         case nir_intrinsic_get_ssbo_size:
3506                 ntq_store_def(c, &instr->def, 0,
3507                               vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
3508                                           nir_src_comp_as_uint(instr->src[0], 0)));
3509                 break;
3510 
3511         case nir_intrinsic_get_ubo_size:
3512                 ntq_store_def(c, &instr->def, 0,
3513                               vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
3514                                           nir_src_comp_as_uint(instr->src[0], 0)));
3515                 break;
3516 
3517         case nir_intrinsic_load_user_clip_plane:
3518                 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
3519                         ntq_store_def(c, &instr->def, i,
3520                                       vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
3521                                                   nir_intrinsic_ucp_id(instr) *
3522                                                   4 + i));
3523                 }
3524                 break;
3525 
3526         case nir_intrinsic_load_viewport_x_scale:
3527                 ntq_store_def(c, &instr->def, 0,
3528                               vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
3529                 break;
3530 
3531         case nir_intrinsic_load_viewport_y_scale:
3532                 ntq_store_def(c, &instr->def, 0,
3533                               vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
3534                 break;
3535 
3536         case nir_intrinsic_load_viewport_z_scale:
3537                 ntq_store_def(c, &instr->def, 0,
3538                               vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
3539                 break;
3540 
3541         case nir_intrinsic_load_viewport_z_offset:
3542                 ntq_store_def(c, &instr->def, 0,
3543                               vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
3544                 break;
3545 
3546         case nir_intrinsic_load_line_coord:
3547                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->line_x));
3548                 break;
3549 
3550         case nir_intrinsic_load_line_width:
3551                 ntq_store_def(c, &instr->def, 0,
3552                               vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
3553                 break;
3554 
3555         case nir_intrinsic_load_aa_line_width:
3556                 ntq_store_def(c, &instr->def, 0,
3557                               vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
3558                 break;
3559 
3560         case nir_intrinsic_load_sample_mask_in:
3561                 ntq_store_def(c, &instr->def, 0, vir_MSF(c));
3562                 break;
3563 
3564         case nir_intrinsic_load_helper_invocation:
3565                 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
3566                 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3567                 ntq_store_def(c, &instr->def, 0, qdest);
3568                 break;
3569 
3570         case nir_intrinsic_load_front_face:
3571                 /* The register contains 0 (front) or 1 (back), and we need to
3572                  * turn it into a NIR bool where true means front.
3573                  */
3574                 ntq_store_def(c, &instr->def, 0,
3575                               vir_ADD(c,
3576                                       vir_uniform_ui(c, -1),
3577                                       vir_REVF(c)));
3578                 break;
3579 
3580         case nir_intrinsic_load_base_instance:
3581                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->biid));
3582                 break;
3583 
3584         case nir_intrinsic_load_instance_id:
3585                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->iid));
3586                 break;
3587 
3588         case nir_intrinsic_load_vertex_id:
3589                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->vid));
3590                 break;
3591 
3592         case nir_intrinsic_load_draw_id:
3593                 ntq_store_def(c, &instr->def, 0, vir_uniform(c, QUNIFORM_DRAW_ID, 0));
3594                 break;
3595 
3596         case nir_intrinsic_load_tlb_color_brcm:
3597                 vir_emit_tlb_color_read(c, instr);
3598                 break;
3599 
3600         case nir_intrinsic_load_fep_w_v3d:
3601                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, c->payload_w));
3602                 break;
3603 
3604         case nir_intrinsic_load_input:
3605                 ntq_emit_load_input(c, instr);
3606                 break;
3607 
3608         case nir_intrinsic_store_tlb_sample_color_v3d:
3609                 ntq_emit_per_sample_color_write(c, instr);
3610                 break;
3611 
3612         case nir_intrinsic_store_output:
3613                 ntq_emit_store_output(c, instr);
3614                 break;
3615 
3616         case nir_intrinsic_image_size:
3617                 ntq_emit_image_size(c, instr);
3618                 break;
3619 
3620         /* FIXME: the Vulkan and SPIR-V specs specify that OpTerminate (which
3621          * is intended to match the semantics of GLSL's discard) should
3622          * terminate the invocation immediately. Our implementation doesn't
3623          * do that. What we do is actually a demote by removing the invocations
3624          * from the sample mask. Maybe we could be more strict and force an
3625          * early termination by emitting a (maybe conditional) jump to the
3626          * end section of the fragment shader for affected invocations.
3627          */
3628         case nir_intrinsic_terminate:
3629                 c->emitted_discard = true;
3630                 FALLTHROUGH;
3631         case nir_intrinsic_demote:
3632                 ntq_flush_tmu(c);
3633 
3634                 if (vir_in_nonuniform_control_flow(c)) {
3635                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3636                                    V3D_QPU_PF_PUSHZ);
3637                         vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3638                                                      vir_uniform_ui(c, 0)),
3639                                 V3D_QPU_COND_IFA);
3640                 } else {
3641                         vir_SETMSF_dest(c, vir_nop_reg(),
3642                                         vir_uniform_ui(c, 0));
3643                 }
3644                 break;
3645 
3646         case nir_intrinsic_terminate_if:
3647                 c->emitted_discard = true;
3648                 FALLTHROUGH;
3649         case nir_intrinsic_demote_if: {
3650                 ntq_flush_tmu(c);
3651 
3652                 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
3653 
3654                 if (vir_in_nonuniform_control_flow(c)) {
3655                         struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
3656                                                                c->execute);
3657                         if (cond == V3D_QPU_COND_IFA) {
3658                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
3659                         } else {
3660                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3661                                 cond = V3D_QPU_COND_IFA;
3662                         }
3663                 }
3664 
3665                 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3666                                              vir_uniform_ui(c, 0)), cond);
3667                 break;
3668         }
3669 
3670         case nir_intrinsic_barrier:
3671                 /* Ensure that the TMU operations before the barrier are flushed
3672                  * before the ones after the barrier.
3673                  */
3674                 ntq_flush_tmu(c);
3675 
3676                 if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
3677                         if (c->s->info.stage == MESA_SHADER_COMPUTE)
3678                                 emit_compute_barrier(c);
3679                         else
3680                                 emit_barrier(c);
3681 
3682                         /* The blocking of a TSY op only happens at the next
3683                          * thread switch. No texturing may be outstanding at the
3684                          * time of a TSY blocking operation.
3685                          */
3686                         vir_emit_thrsw(c);
3687                 }
3688                 break;
3689 
3690         case nir_intrinsic_load_num_workgroups:
3691                 for (int i = 0; i < 3; i++) {
3692                         ntq_store_def(c, &instr->def, i,
3693                                       vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3694                                                   i));
3695                 }
3696                 break;
3697 
3698         case nir_intrinsic_load_workgroup_id: {
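                /* The workgroup id arrives packed in the compute payload
                 * registers: X and Y in the low and high halves of
                 * cs_payload[0] and Z in the low half of cs_payload[1], hence
                 * the AND/SHR unpacking below.
                 */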
3699                 struct qreg x = vir_AND(c, c->cs_payload[0],
3700                                          vir_uniform_ui(c, 0xffff));
3701                 ntq_store_def(c, &instr->def, 0, x);
3702 
3703                 struct qreg y = vir_SHR(c, c->cs_payload[0],
3704                                          vir_uniform_ui(c, 16));
3705                 ntq_store_def(c, &instr->def, 1, y);
3706 
3707                 struct qreg z = vir_AND(c, c->cs_payload[1],
3708                                          vir_uniform_ui(c, 0xffff));
3709                 ntq_store_def(c, &instr->def, 2, z);
3710                 break;
3711         }
3712 
3713         case nir_intrinsic_load_base_workgroup_id: {
3714                 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0);
3715                 ntq_store_def(c, &instr->def, 0, x);
3716 
3717                 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1);
3718                 ntq_store_def(c, &instr->def, 1, y);
3719 
3720                 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2);
3721                 ntq_store_def(c, &instr->def, 2, z);
3722                 break;
3723         }
3724 
3725         case nir_intrinsic_load_workgroup_size: {
3726                 struct qreg x = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 0);
3727                 ntq_store_def(c, &instr->def, 0, x);
3728 
3729                 struct qreg y = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 1);
3730                 ntq_store_def(c, &instr->def, 1, y);
3731 
3732                 struct qreg z = vir_uniform(c, QUNIFORM_WORK_GROUP_SIZE, 2);
3733                 ntq_store_def(c, &instr->def, 2, z);
3734                 break;
3735         }
3736 
3737         case nir_intrinsic_load_local_invocation_index:
3738                 ntq_store_def(c, &instr->def, 0,
3739                               emit_load_local_invocation_index(c));
3740                 break;
3741 
3742         case nir_intrinsic_load_subgroup_id: {
3743                 /* This is basically the batch index, i.e. the Local
3744                  * Invocation Index divided by the SIMD width.
3745                  */
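                /* For example, assuming a SIMD width (V3D_CHANNELS) of 16,
                 * divide_shift is 4 and local invocation index 35 maps to
                 * subgroup id 35 >> 4 = 2.
                 */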
3746                 STATIC_ASSERT(IS_POT(V3D_CHANNELS) && V3D_CHANNELS > 0);
3747                 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
3748                 struct qreg lii = emit_load_local_invocation_index(c);
3749                 ntq_store_def(c, &instr->def, 0,
3750                               vir_SHR(c, lii,
3751                                       vir_uniform_ui(c, divide_shift)));
3752                 break;
3753         }
3754 
3755         case nir_intrinsic_load_per_vertex_input: {
3756                 /* The vertex shader writes all its used outputs into
3757                  * consecutive VPM offsets, so if any output component is
3758                  * unused, its VPM offset is used by the next used
3759                  * component. This means that we can't assume that each
3760                  * location uses 4 consecutive scalar offsets in the VPM,
3761                  * so we need to compute the VPM offset for each input by
3762                  * walking the inputs and finding the one that matches
3763                  * our location and component.
3764                  *
3765                  * col = vertex index, row = varying index
3766                  */
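                /* As a hypothetical example, if the VS only wrote VAR0.x and
                 * VAR0.z, those two components would occupy rows 0 and 1 of
                 * the VPM, so reading VAR0.z here must resolve to row 1
                 * rather than to the "component 2" offset a fully packed
                 * vec4 would have.
                 */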
3767                 assert(nir_src_is_const(instr->src[1]));
3768                 uint32_t location =
3769                         nir_intrinsic_io_semantics(instr).location +
3770                         nir_src_as_uint(instr->src[1]);
3771                 uint32_t component = nir_intrinsic_component(instr);
3772 
3773                 int32_t row_idx = -1;
3774                 for (int i = 0; i < c->num_inputs; i++) {
3775                         struct v3d_varying_slot slot = c->input_slots[i];
3776                         if (v3d_slot_get_slot(slot) == location &&
3777                             v3d_slot_get_component(slot) == component) {
3778                                 row_idx = i;
3779                                 break;
3780                         }
3781                 }
3782 
3783                 assert(row_idx != -1);
3784 
3785                 struct qreg col = ntq_get_src(c, instr->src[0], 0);
3786                 for (int i = 0; i < instr->num_components; i++) {
3787                         struct qreg row = vir_uniform_ui(c, row_idx++);
3788                         ntq_store_def(c, &instr->def, i,
3789                                       vir_LDVPMG_IN(c, row, col));
3790                 }
3791                 break;
3792         }
3793 
3794         case nir_intrinsic_emit_vertex:
3795         case nir_intrinsic_end_primitive:
3796                 unreachable("Should have been lowered in v3d_nir_lower_io");
3797                 break;
3798 
3799         case nir_intrinsic_load_primitive_id: {
3800                 /* gl_PrimitiveIdIn is written by the GBG in the first word
3801                  * of the VPM output header. According to the docs, we should
3802                  * read this using ldvpm(v,d)_in (see Table 71).
3803                  */
3804                 assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3805                 ntq_store_def(c, &instr->def, 0,
3806                               vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3807                 break;
3808         }
3809 
3810         case nir_intrinsic_load_invocation_id:
3811                 ntq_store_def(c, &instr->def, 0, vir_IID(c));
3812                 break;
3813 
3814         case nir_intrinsic_load_fb_layers_v3d:
3815                 ntq_store_def(c, &instr->def, 0,
3816                               vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3817                 break;
3818 
3819         case nir_intrinsic_load_sample_id:
3820                 ntq_store_def(c, &instr->def, 0, vir_SAMPID(c));
3821                 break;
3822 
3823         case nir_intrinsic_load_sample_pos:
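                /* The fragment X/Y coordinates from FXCD/FYCD presumably
                 * include the sample offset here, so subtracting the integer
                 * pixel coordinates (XCD/YCD) leaves the sample position
                 * within the pixel.
                 */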
3824                 ntq_store_def(c, &instr->def, 0,
3825                               vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3826                 ntq_store_def(c, &instr->def, 1,
3827                               vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3828                 break;
3829 
3830         case nir_intrinsic_load_barycentric_at_offset:
3831                 ntq_store_def(c, &instr->def, 0,
3832                               vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3833                 ntq_store_def(c, &instr->def, 1,
3834                               vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3835                 break;
3836 
3837         case nir_intrinsic_load_barycentric_pixel:
3838                 ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3839                 ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3840                 break;
3841 
3842         case nir_intrinsic_load_barycentric_at_sample: {
3843                 if (!c->fs_key->msaa) {
3844                         ntq_store_def(c, &instr->def, 0, vir_uniform_f(c, 0.0f));
3845                         ntq_store_def(c, &instr->def, 1, vir_uniform_f(c, 0.0f));
3846                         return;
3847                 }
3848 
3849                 struct qreg offset_x, offset_y;
3850                 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3851                 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3852 
3853                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3854                 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3855                 break;
3856         }
3857 
3858         case nir_intrinsic_load_barycentric_sample: {
3859                 struct qreg offset_x =
3860                         vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3861                 struct qreg offset_y =
3862                         vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3863 
3864                 ntq_store_def(c, &instr->def, 0,
3865                              vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3866                 ntq_store_def(c, &instr->def, 1,
3867                               vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3868                 break;
3869         }
3870 
3871         case nir_intrinsic_load_barycentric_centroid: {
3872                 struct qreg offset_x, offset_y;
3873                 ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3874                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, offset_x));
3875                 ntq_store_def(c, &instr->def, 1, vir_MOV(c, offset_y));
3876                 break;
3877         }
3878 
3879         case nir_intrinsic_load_interpolated_input: {
3880                 assert(nir_src_is_const(instr->src[1]));
3881                 const uint32_t offset = nir_src_as_uint(instr->src[1]);
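                /* Inputs are indexed as scalars, four slots per location, so
                 * e.g. component 1 of driver location (base + offset) == 2
                 * resolves to input index 2 * 4 + 1 = 9.
                 */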
3882 
3883                 for (int i = 0; i < instr->num_components; i++) {
3884                         const uint32_t input_idx =
3885                                 (nir_intrinsic_base(instr) + offset) * 4 +
3886                                 nir_intrinsic_component(instr) + i;
3887 
3888                         /* If we are not in MSAA or if we are not interpolating
3889                          * a user varying, just return the pre-computed
3890                          * interpolated input.
3891                          */
3892                         if (!c->fs_key->msaa ||
3893                             c->interp[input_idx].vp.file == QFILE_NULL) {
3894                                 ntq_store_def(c, &instr->def, i,
3895                                               vir_MOV(c, c->inputs[input_idx]));
3896                                 continue;
3897                         }
3898 
3899                         /* Otherwise compute interpolation at the specified
3900                          * offset.
3901                          */
3902                         struct qreg p = c->interp[input_idx].vp;
3903                         struct qreg C = c->interp[input_idx].C;
3904                         unsigned interp_mode =  c->interp[input_idx].mode;
3905 
3906                         struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3907                         struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3908 
3909                         struct qreg result =
3910                               ntq_emit_load_interpolated_input(c, p, C,
3911                                                                offset_x, offset_y,
3912                                                                interp_mode);
3913                         ntq_store_def(c, &instr->def, i, result);
3914                 }
3915                 break;
3916         }
3917 
3918         case nir_intrinsic_load_subgroup_size:
3919                 ntq_store_def(c, &instr->def, 0,
3920                               vir_uniform_ui(c, V3D_CHANNELS));
3921                 break;
3922 
3923         case nir_intrinsic_load_subgroup_invocation:
3924                 ntq_store_def(c, &instr->def, 0, vir_EIDX(c));
3925                 break;
3926 
3927         case nir_intrinsic_ddx:
3928         case nir_intrinsic_ddx_coarse:
3929         case nir_intrinsic_ddx_fine: {
3930                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3931                 ntq_store_def(c, &instr->def, 0, vir_FDX(c, value));
3932                 break;
3933         }
3934 
3935         case nir_intrinsic_ddy:
3936         case nir_intrinsic_ddy_coarse:
3937         case nir_intrinsic_ddy_fine: {
3938                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3939                 ntq_store_def(c, &instr->def, 0, vir_FDY(c, value));
3940                 break;
3941         }
3942 
3943         case nir_intrinsic_elect: {
3944                 struct qreg first;
3945                 if (vir_in_nonuniform_control_flow(c)) {
3946                         /* Sets A=1 for lanes enabled in the execution mask */
3947                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3948                                    V3D_QPU_PF_PUSHZ);
3949                         /* Updates A ANDing with lanes enabled in MSF */
3950                         vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()),
3951                                    V3D_QPU_UF_ANDNZ);
3952                         first = vir_FLAFIRST(c);
3953                 } else {
3954                         /* Sets A=1 for inactive lanes */
3955                         vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()),
3956                                    V3D_QPU_PF_PUSHZ);
3957                         first = vir_FLNAFIRST(c);
3958                 }
3959 
3960                 /* Produce a boolean result */
3961                 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3962                                            first, vir_uniform_ui(c, 1)),
3963                                            V3D_QPU_PF_PUSHZ);
3964                 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3965                 ntq_store_def(c, &instr->def, 0, result);
3966                 break;
3967         }
3968 
3969         case nir_intrinsic_ballot: {
3970                 assert(c->devinfo->ver >= 71);
3971                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3972                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3973                 struct qreg res = vir_get_temp(c);
3974                 vir_set_cond(vir_BALLOT_dest(c, res, value), cond);
3975                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3976                 break;
3977         }
3978 
3979         case nir_intrinsic_read_invocation: {
3980                 assert(c->devinfo->ver >= 71);
3981                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3982                 struct qreg index = ntq_get_src(c, instr->src[1], 0);
3983                 struct qreg res = vir_SHUFFLE(c, value, index);
3984                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3985                 break;
3986         }
3987 
3988         case nir_intrinsic_read_first_invocation: {
3989                 assert(c->devinfo->ver >= 71);
3990                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
3991                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
3992                 struct qreg res = vir_get_temp(c);
3993                 vir_set_cond(vir_BCASTF_dest(c, res, value), cond);
3994                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
3995                 break;
3996         }
3997 
3998         case nir_intrinsic_shuffle: {
3999                 assert(c->devinfo->ver >= 71);
4000                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4001                 struct qreg indices = ntq_get_src(c, instr->src[1], 0);
4002                 struct qreg res = vir_SHUFFLE(c, value, indices);
4003                 ntq_store_def(c, &instr->def, 0, vir_MOV(c, res));
4004                 break;
4005         }
4006 
4007         case nir_intrinsic_vote_feq:
4008         case nir_intrinsic_vote_ieq: {
4009                 assert(c->devinfo->ver >= 71);
4010                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4011                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4012                 struct qreg res = vir_get_temp(c);
4013                 vir_set_cond(instr->intrinsic == nir_intrinsic_vote_ieq ?
4014                              vir_ALLEQ_dest(c, res, value) :
4015                              vir_ALLFEQ_dest(c, res, value),
4016                              cond);
4017 
4018                 /* Produce boolean result */
4019                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4020                            V3D_QPU_PF_PUSHZ);
4021                 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
4022                 ntq_store_def(c, &instr->def, 0, result);
4023                 break;
4024         }
4025 
4026         case nir_intrinsic_vote_all: {
4027                 assert(c->devinfo->ver >= 71);
4028                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4029                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4030                 struct qreg res = vir_get_temp(c);
4031                 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
4032 
4033                 /* We want to check if 'all lanes are equal (alleq != 0) and
4034                  * their value is True (value != 0)'.
4035                  *
4036                  * The first MOV.PUSHZ generates the predicate for 'alleq == 0'.
4037                  * The second MOV.NORZ generates the predicate for
4038                  * '!(alleq == 0) & !(value == 0)'.
4039                  */
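                /* For example, if every active lane holds true, alleq and
                 * value are both non-zero, so the A flag gets set and the
                 * result is true; if lanes disagree (alleq == 0) or the
                 * common value is false (value == 0), the A flag stays clear
                 * and the result is false.
                 */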
4040                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4041                            V3D_QPU_PF_PUSHZ);
4042                 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
4043                            V3D_QPU_UF_NORZ);
4044                 struct qreg result =
4045                         ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
4046                 ntq_store_def(c, &instr->def, 0, result);
4047                 break;
4048         }
4049 
4050         case nir_intrinsic_vote_any: {
4051                 assert(c->devinfo->ver >= 71);
4052                 struct qreg value = ntq_get_src(c, instr->src[0], 0);
4053                 enum v3d_qpu_cond cond = setup_subgroup_control_flow_condition(c);
4054                 struct qreg res = vir_get_temp(c);
4055                 vir_set_cond(vir_ALLEQ_dest(c, res, value), cond);
4056 
4057                 /* We want to check 'not (all lanes are equal (alleq != 0)
4058                  * and their value is False (value == 0))'.
4059                  *
4060                  * The first MOV.PUSHZ generates the predicate for 'alleq == 0'.
4061                  * The second MOV.NORNZ generates the predicate for
4062                  * '!(alleq == 0) & (value == 0)'.
4063                  * The IFNA condition negates that predicate when evaluated:
4064                  * '!(!(alleq == 0) & (value == 0))'.
4065                  */
4066                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), res),
4067                            V3D_QPU_PF_PUSHZ);
4068                 vir_set_uf(c, vir_MOV_dest(c, vir_nop_reg(), value),
4069                            V3D_QPU_UF_NORNZ);
4070                 struct qreg result =
4071                         ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFNA);
4072                 ntq_store_def(c, &instr->def, 0, result);
4073                 break;
4074         }
4075 
4076         case nir_intrinsic_load_num_subgroups:
4077                 unreachable("Should have been lowered");
4078                 break;
4079 
4080         case nir_intrinsic_load_view_index:
4081                 ntq_store_def(c, &instr->def, 0,
4082                               vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
4083                 break;
4084 
4085         default:
4086                 fprintf(stderr, "Unknown intrinsic: ");
4087                 nir_print_instr(&instr->instr, stderr);
4088                 fprintf(stderr, "\n");
4089                 abort();
4090         }
4091 }
4092 
4093 /* Clears (activates) the execute flags for any channels whose jump target
4094  * matches this block.
4095  *
4096  * XXX perf: Could we be using flpush/flpop somehow for our execution channel
4097  * enabling?
4098  *
4099  */
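/* In this scheme c->execute holds, per lane, the index of the block where the
 * lane should resume execution, with 0 meaning "currently active". XORing it
 * with the current block index and pushing Z therefore sets the A flag exactly
 * for the lanes that were waiting on this block, and the conditional MOV below
 * re-activates them by writing 0 back to c->execute.
 */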
4100 static void
4101 ntq_activate_execute_for_block(struct v3d_compile *c)
4102 {
4103         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4104                                 c->execute, vir_uniform_ui(c, c->cur_block->index)),
4105                    V3D_QPU_PF_PUSHZ);
4106 
4107         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4108 }
4109 
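/* Heuristic used to decide whether it is worth branching around a block: a
 * block is considered cheap if it contains nothing but register-access
 * intrinsics and at most two ALU/undef/load_const instructions, in which case
 * predicated execution is likely cheaper than a branch.
 */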
4110 static bool
4111 is_cheap_block(nir_block *block)
4112 {
4113         int32_t cost = 3;
4114         nir_foreach_instr(instr, block) {
4115                 switch (instr->type) {
4116                 case nir_instr_type_alu:
4117                 case nir_instr_type_undef:
4118                 case nir_instr_type_load_const:
4119                         if (--cost <= 0)
4120                                 return false;
4121                 break;
4122                 case nir_instr_type_intrinsic: {
4123                         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
4124                         switch (intr->intrinsic) {
4125                         case nir_intrinsic_decl_reg:
4126                         case nir_intrinsic_load_reg:
4127                         case nir_intrinsic_store_reg:
4128                                 continue;
4129                         default:
4130                                 return false;
4131                         }
4132                 }
4133                 default:
4134                         return false;
4135                 }
4136         }
4137         return true;
4138 }
4139 
4140 static void
4141 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
4142 {
4143         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4144         bool empty_else_block =
4145                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4146                  exec_list_is_empty(&nir_else_block->instr_list));
4147 
4148         struct qblock *then_block = vir_new_block(c);
4149         struct qblock *after_block = vir_new_block(c);
4150         struct qblock *else_block;
4151         if (empty_else_block)
4152                 else_block = after_block;
4153         else
4154                 else_block = vir_new_block(c);
4155 
4156         /* Check if this if statement is really just a conditional jump with
4157          * the form:
4158          *
4159          * if (cond) {
4160          *    break/continue;
4161          * } else {
4162          * }
4163          *
4164          * In which case we can skip the jump to ELSE that we would otherwise
4165          * emit before the THEN block and just emit the break/continue directly.
4166          */
4167         nir_jump_instr *conditional_jump = NULL;
4168         if (empty_else_block) {
4169                 nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
4170                 struct nir_instr *inst = nir_block_first_instr(nir_then_block);
4171                 if (inst && inst->type == nir_instr_type_jump)
4172                         conditional_jump = nir_instr_as_jump(inst);
4173         }
4174 
4175         /* Set up the flags for the IF condition (taking the THEN branch). */
4176         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4177 
4178         if (!conditional_jump) {
4179                 /* Jump to ELSE. */
4180                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4181                            V3D_QPU_BRANCH_COND_ANYNA :
4182                            V3D_QPU_BRANCH_COND_ANYA);
4183                 /* Pixels that were not dispatched or have been discarded
4184                  * should not contribute to the ANYA/ANYNA condition.
4185                  */
4186                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4187 
4188                 vir_link_blocks(c->cur_block, else_block);
4189                 vir_link_blocks(c->cur_block, then_block);
4190 
4191                 /* Process the THEN block. */
4192                 vir_set_emit_block(c, then_block);
4193                 ntq_emit_cf_list(c, &if_stmt->then_list);
4194 
4195                 if (!empty_else_block) {
4196                         /* At the end of the THEN block, jump to ENDIF, unless
4197                          * the block ended in a break or continue.
4198                          */
4199                         if (!c->cur_block->branch_emitted) {
4200                                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4201                                 vir_link_blocks(c->cur_block, after_block);
4202                         }
4203 
4204                         /* Emit the else block. */
4205                         vir_set_emit_block(c, else_block);
4206                         ntq_emit_cf_list(c, &if_stmt->else_list);
4207                 }
4208         } else {
4209                 /* Emit the conditional jump directly.
4210                  *
4211                  * Use ALL with breaks and ANY with continues to ensure that
4212                  * we always break and never continue when all lanes have been
4213                  * disabled (for example because of discards) to prevent
4214                  * infinite loops.
4215                  */
4216                 assert(conditional_jump &&
4217                        (conditional_jump->type == nir_jump_continue ||
4218                         conditional_jump->type == nir_jump_break));
4219 
4220                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
4221                            (conditional_jump->type == nir_jump_break ?
4222                             V3D_QPU_BRANCH_COND_ALLA :
4223                             V3D_QPU_BRANCH_COND_ANYA) :
4224                            (conditional_jump->type == nir_jump_break ?
4225                             V3D_QPU_BRANCH_COND_ALLNA :
4226                             V3D_QPU_BRANCH_COND_ANYNA));
4227                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4228 
4229                 vir_link_blocks(c->cur_block,
4230                                 conditional_jump->type == nir_jump_break ?
4231                                         c->loop_break_block :
4232                                         c->loop_cont_block);
4233         }
4234 
4235         vir_link_blocks(c->cur_block, after_block);
4236 
4237         vir_set_emit_block(c, after_block);
4238 }
4239 
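/* Emits an if statement whose condition may diverge across lanes. Roughly, a
 * construct like
 *
 *      if (divergent_cond) { THEN } else { ELSE }
 *
 * is handled by tagging the lanes that take the ELSE path with the ELSE block
 * index in c->execute (lanes taking THEN stay at 0), then re-tagging the
 * still-active lanes with the ENDIF block index at the end of THEN. The
 * ALLNA/ALLA branches emitted around the blocks are just an optimization to
 * skip code no lane needs to run.
 */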
4240 static void
4241 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
4242 {
4243         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
4244         bool empty_else_block =
4245                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
4246                  exec_list_is_empty(&nir_else_block->instr_list));
4247 
4248         struct qblock *then_block = vir_new_block(c);
4249         struct qblock *after_block = vir_new_block(c);
4250         struct qblock *else_block;
4251         if (empty_else_block)
4252                 else_block = after_block;
4253         else
4254                 else_block = vir_new_block(c);
4255 
4256         bool was_uniform_control_flow = false;
4257         if (!vir_in_nonuniform_control_flow(c)) {
4258                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4259                 was_uniform_control_flow = true;
4260         }
4261 
4262         /* Set up the flags for the IF condition (taking the THEN branch). */
4263         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
4264 
4265         /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
4266          * was previously active (execute Z) for updating the exec flags.
4267          */
4268         if (was_uniform_control_flow) {
4269                 cond = v3d_qpu_cond_invert(cond);
4270         } else {
4271                 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
4272                 if (cond == V3D_QPU_COND_IFA) {
4273                         vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
4274                 } else {
4275                         vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
4276                         cond = V3D_QPU_COND_IFA;
4277                 }
4278         }
4279 
4280         vir_MOV_cond(c, cond,
4281                      c->execute,
4282                      vir_uniform_ui(c, else_block->index));
4283 
4284         /* Set the flags for taking the THEN block */
4285         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4286                    V3D_QPU_PF_PUSHZ);
4287 
4288         /* Jump to ELSE if nothing is active for THEN (unless THEN block is
4289          * so small it won't pay off), otherwise fall through.
4290          */
4291         bool is_cheap = exec_list_is_singular(&if_stmt->then_list) &&
4292                         is_cheap_block(nir_if_first_then_block(if_stmt));
4293         if (!is_cheap) {
4294                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
4295                 vir_link_blocks(c->cur_block, else_block);
4296         }
4297         vir_link_blocks(c->cur_block, then_block);
4298 
4299         /* Process the THEN block.
4300          *
4301          * Notice we don't call ntq_activate_execute_for_block here on purpose:
4302          * c->execute is already set up to be 0 for lanes that must take the
4303          * THEN block.
4304          */
4305         vir_set_emit_block(c, then_block);
4306         ntq_emit_cf_list(c, &if_stmt->then_list);
4307 
4308         if (!empty_else_block) {
4309                 /* Handle the end of the THEN block.  First, all currently
4310                  * active channels update their execute flags to point to
4311                  * ENDIF
4312                  */
4313                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4314                            V3D_QPU_PF_PUSHZ);
4315                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4316                              vir_uniform_ui(c, after_block->index));
4317 
4318                 /* If everything points at ENDIF, then jump there immediately
4319                  * (unless ELSE block is so small it won't pay off).
4320                  */
4321                 bool is_cheap = exec_list_is_singular(&if_stmt->else_list) &&
4322                                 is_cheap_block(nir_else_block);
4323                 if (!is_cheap) {
4324                         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
4325                                                    c->execute,
4326                                                    vir_uniform_ui(c, after_block->index)),
4327                                    V3D_QPU_PF_PUSHZ);
4328                         vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
4329                         vir_link_blocks(c->cur_block, after_block);
4330                 }
4331                 vir_link_blocks(c->cur_block, else_block);
4332 
4333                 vir_set_emit_block(c, else_block);
4334                 ntq_activate_execute_for_block(c);
4335                 ntq_emit_cf_list(c, &if_stmt->else_list);
4336         }
4337 
4338         vir_link_blocks(c->cur_block, after_block);
4339 
4340         vir_set_emit_block(c, after_block);
4341         if (was_uniform_control_flow)
4342                 c->execute = c->undef;
4343         else
4344                 ntq_activate_execute_for_block(c);
4345 }
4346 
4347 static void
4348 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
4349 {
4350         bool was_in_control_flow = c->in_control_flow;
4351         c->in_control_flow = true;
4352         if (!vir_in_nonuniform_control_flow(c) &&
4353             !nir_src_is_divergent(nif->condition)) {
4354                 ntq_emit_uniform_if(c, nif);
4355         } else {
4356                 ntq_emit_nonuniform_if(c, nif);
4357         }
4358         c->in_control_flow = was_in_control_flow;
4359 }
4360 
4361 static void
4362 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
4363 {
4364         switch (jump->type) {
4365         case nir_jump_break:
4366                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4367                            V3D_QPU_PF_PUSHZ);
4368                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4369                              vir_uniform_ui(c, c->loop_break_block->index));
4370                 break;
4371 
4372         case nir_jump_continue:
4373                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4374                            V3D_QPU_PF_PUSHZ);
4375                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4376                              vir_uniform_ui(c, c->loop_cont_block->index));
4377                 break;
4378 
4379         case nir_jump_return:
4380                 unreachable("All returns should be lowered\n");
4381                 break;
4382 
4383         case nir_jump_halt:
4384         case nir_jump_goto:
4385         case nir_jump_goto_if:
4386                 unreachable("not supported\n");
4387                 break;
4388         }
4389 }
4390 
4391 static void
4392 ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
4393 {
4394         switch (jump->type) {
4395         case nir_jump_break:
4396                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4397                 vir_link_blocks(c->cur_block, c->loop_break_block);
4398                 c->cur_block->branch_emitted = true;
4399                 break;
4400         case nir_jump_continue:
4401                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4402                 vir_link_blocks(c->cur_block, c->loop_cont_block);
4403                 c->cur_block->branch_emitted = true;
4404                 break;
4405 
4406         case nir_jump_return:
4407                 unreachable("All returns should be lowered\n");
4408                 break;
4409 
4410         case nir_jump_halt:
4411         case nir_jump_goto:
4412         case nir_jump_goto_if:
4413                 unreachable("not supported\n");
4414                 break;
4415         }
4416 }
4417 
4418 static void
4419 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
4420 {
4421         switch (instr->type) {
4422         case nir_instr_type_alu:
4423                 ntq_emit_alu(c, nir_instr_as_alu(instr));
4424                 break;
4425 
4426         case nir_instr_type_intrinsic:
4427                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
4428                 break;
4429 
4430         case nir_instr_type_load_const:
4431                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
4432                 break;
4433 
4434         case nir_instr_type_undef:
4435                 unreachable("Should've been lowered by nir_lower_undef_to_zero");
4436                 break;
4437 
4438         case nir_instr_type_tex:
4439                 ntq_emit_tex(c, nir_instr_as_tex(instr));
4440                 break;
4441 
4442         case nir_instr_type_jump:
4443                 /* Always flush TMU before jumping to another block, for the
4444                  * same reasons as in ntq_emit_block.
4445                  */
4446                 ntq_flush_tmu(c);
4447                 if (vir_in_nonuniform_control_flow(c))
4448                         ntq_emit_jump(c, nir_instr_as_jump(instr));
4449                 else
4450                         ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
4451                 break;
4452 
4453         default:
4454                 fprintf(stderr, "Unknown NIR instr type: ");
4455                 nir_print_instr(instr, stderr);
4456                 fprintf(stderr, "\n");
4457                 abort();
4458         }
4459 }
4460 
4461 static void
4462 ntq_emit_block(struct v3d_compile *c, nir_block *block)
4463 {
4464         nir_foreach_instr(instr, block) {
4465                 ntq_emit_instr(c, instr);
4466         }
4467 
4468         /* Always process pending TMU operations in the same block they were
4469          * emitted: we can't emit TMU operations in a block and then emit a
4470          * thread switch and LDTMU/TMUWT for them in another block, possibly
4471          * under control flow.
4472          */
4473         ntq_flush_tmu(c);
4474 }
4475 
4476 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
4477 
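/* Emits a loop whose trip count may diverge across lanes: break/continue set
 * c->execute to the break/continue block index for the affected lanes, the
 * bottom of the loop re-activates the lanes waiting on the continue block,
 * and an ANYA branch jumps back to the top while any lane is still active.
 */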
4478 static void
4479 ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
4480 {
4481         bool was_uniform_control_flow = false;
4482         if (!vir_in_nonuniform_control_flow(c)) {
4483                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4484                 was_uniform_control_flow = true;
4485         }
4486 
4487         c->loop_cont_block = vir_new_block(c);
4488         c->loop_break_block = vir_new_block(c);
4489 
4490         vir_link_blocks(c->cur_block, c->loop_cont_block);
4491         vir_set_emit_block(c, c->loop_cont_block);
4492         ntq_activate_execute_for_block(c);
4493 
4494         ntq_emit_cf_list(c, &loop->body);
4495 
4496         /* Re-enable any previous continues now, so our ANYA check below
4497          * works.
4498          *
4499          * XXX: Use the .ORZ flags update, instead.
4500          */
4501         vir_set_pf(c, vir_XOR_dest(c,
4502                                 vir_nop_reg(),
4503                                 c->execute,
4504                                 vir_uniform_ui(c, c->loop_cont_block->index)),
4505                    V3D_QPU_PF_PUSHZ);
4506         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4507 
4508         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
4509 
4510         struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
4511         /* Pixels that were not dispatched or have been discarded should not
4512          * contribute to looping again.
4513          */
4514         branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4515         vir_link_blocks(c->cur_block, c->loop_cont_block);
4516         vir_link_blocks(c->cur_block, c->loop_break_block);
4517 
4518         vir_set_emit_block(c, c->loop_break_block);
4519         if (was_uniform_control_flow)
4520                 c->execute = c->undef;
4521         else
4522                 ntq_activate_execute_for_block(c);
4523 }
4524 
4525 static void
4526 ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
4527 {
4528         c->loop_cont_block = vir_new_block(c);
4529         c->loop_break_block = vir_new_block(c);
4530 
4531         vir_link_blocks(c->cur_block, c->loop_cont_block);
4532         vir_set_emit_block(c, c->loop_cont_block);
4533 
4534         ntq_emit_cf_list(c, &loop->body);
4535 
4536         if (!c->cur_block->branch_emitted) {
4537                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4538                 vir_link_blocks(c->cur_block, c->loop_cont_block);
4539         }
4540 
4541         vir_set_emit_block(c, c->loop_break_block);
4542 }
4543 
4544 static void
4545 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
4546 {
4547         assert(!nir_loop_has_continue_construct(loop));
4548 
4549         /* Disable flags optimization for loop conditions. The problem here is
4550          * that we can have code like this:
4551          *
4552          *  // block_0
4553          *  vec1 32 con ssa_9 = ine32 ssa_8, ssa_2
4554          *  loop {
4555          *     // block_1
4556          *     if ssa_9 {
4557          *
4558          * In this example we emit flags to compute ssa_9 and the optimization
4559          * will skip regenerating them again for the loop condition in the
4560          * loop continue block (block_1). However, this is not safe after the
4561          * first iteration because the loop body can stomp the flags if it has
4562          * any conditionals.
4563          */
4564         c->flags_temp = -1;
4565 
4566         bool was_in_control_flow = c->in_control_flow;
4567         c->in_control_flow = true;
4568 
4569         struct qblock *save_loop_cont_block = c->loop_cont_block;
4570         struct qblock *save_loop_break_block = c->loop_break_block;
4571 
4572         if (vir_in_nonuniform_control_flow(c) || loop->divergent) {
4573                 ntq_emit_nonuniform_loop(c, loop);
4574         } else {
4575                 ntq_emit_uniform_loop(c, loop);
4576         }
4577 
4578         c->loop_break_block = save_loop_break_block;
4579         c->loop_cont_block = save_loop_cont_block;
4580 
4581         c->loops++;
4582 
4583         c->in_control_flow = was_in_control_flow;
4584 }
4585 
4586 static void
4587 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
4588 {
4589         fprintf(stderr, "FUNCTIONS not handled.\n");
4590         abort();
4591 }
4592 
4593 static void
4594 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
4595 {
4596         foreach_list_typed(nir_cf_node, node, node, list) {
4597                 switch (node->type) {
4598                 case nir_cf_node_block:
4599                         ntq_emit_block(c, nir_cf_node_as_block(node));
4600                         break;
4601 
4602                 case nir_cf_node_if:
4603                         ntq_emit_if(c, nir_cf_node_as_if(node));
4604                         break;
4605 
4606                 case nir_cf_node_loop:
4607                         ntq_emit_loop(c, nir_cf_node_as_loop(node));
4608                         break;
4609 
4610                 case nir_cf_node_function:
4611                         ntq_emit_function(c, nir_cf_node_as_function(node));
4612                         break;
4613 
4614                 default:
4615                         fprintf(stderr, "Unknown NIR node type\n");
4616                         abort();
4617                 }
4618         }
4619 }
4620 
4621 static void
4622 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
4623 {
4624         ntq_setup_registers(c, impl);
4625         ntq_emit_cf_list(c, &impl->body);
4626 }
4627 
4628 static bool
4629 vir_inst_reads_reg(struct qinst *inst, struct qreg r)
4630 {
4631    for (int i = 0; i < vir_get_nsrc(inst); i++) {
4632            if (inst->src[i].file == r.file && inst->src[i].index == r.index)
4633                    return true;
4634    }
4635    return false;
4636 }
4637 
4638 static void
4639 sched_flags_in_block(struct v3d_compile *c, struct qblock *block)
4640 {
4641         struct qinst *flags_inst = NULL;
4642         list_for_each_entry_safe_rev(struct qinst, inst, &block->instructions, link) {
4643                 /* Check for cases that would prevent us from moving a flags
4644                  * instruction any earlier than this instruction:
4645                  *
4646                  * - The flags instruction reads the result of this instr.
4647                  * - The instruction reads or writes flags.
4648                  */
4649                 if (flags_inst) {
4650                         if (vir_inst_reads_reg(flags_inst, inst->dst) ||
4651                             v3d_qpu_writes_flags(&inst->qpu) ||
4652                             v3d_qpu_reads_flags(&inst->qpu)) {
4653                                 list_move_to(&flags_inst->link, &inst->link);
4654                                 flags_inst = NULL;
4655                         }
4656                 }
4657 
4658                 /* Skip if this instruction does more than just write flags */
4659                 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
4660                     inst->dst.file != QFILE_NULL ||
4661                     !v3d_qpu_writes_flags(&inst->qpu)) {
4662                         continue;
4663                 }
4664 
4665                 /* If we already had a flags_inst we should've moved it after
4666                  * this instruction in the if (flags_inst) above.
4667                  */
4668                 assert(!flags_inst);
4669                 flags_inst = inst;
4670         }
4671 
4672         /* If we reached the beginning of the block and we still have a flags
4673          * instruction selected we can put it at the top of the block.
4674          */
4675         if (flags_inst) {
4676                 list_move_to(&flags_inst->link, &block->instructions);
4677                 flags_inst = NULL;
4678         }
4679 }
4680 
4681 /**
4682  * The purpose of this pass is to move instructions that are only concerned
4683  * with producing flags as early as possible within their block, to hopefully
4684  * reduce the live ranges of their source arguments.
4685  */
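/* In practice sched_flags_in_block() walks each block bottom-up and hoists a
 * flags-only instruction upward until it would cross an instruction that
 * defines one of its sources or that itself reads or writes flags; if nothing
 * blocks it, the instruction ends up at the top of the block.
 */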
4686 static void
4687 sched_flags(struct v3d_compile *c)
4688 {
4689         vir_for_each_block(block, c)
4690                 sched_flags_in_block(c, block);
4691 }
4692 
4693 static void
4694 nir_to_vir(struct v3d_compile *c)
4695 {
4696         switch (c->s->info.stage) {
4697         case MESA_SHADER_FRAGMENT:
4698                 c->start_msf = vir_MSF(c);
4699                 if (c->devinfo->ver < 71)
4700                         c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
4701                 else
4702                         c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
4703 
4704                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
4705                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
4706 
4707                 /* V3D 4.x can disable implicit varyings if they are not used */
4708                 c->fs_uses_primitive_id =
4709                         nir_find_variable_with_location(c->s, nir_var_shader_in,
4710                                                         VARYING_SLOT_PRIMITIVE_ID);
4711                 if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
4712                        c->primitive_id =
4713                                emit_fragment_varying(c, NULL, -1, 0, 0);
4714                 }
4715 
4716                 if (c->fs_key->is_points && program_reads_point_coord(c)) {
4717                         c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4718                         c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
4719                         c->uses_implicit_point_line_varyings = true;
4720                 } else if (c->fs_key->is_lines &&
4721                            (BITSET_TEST(c->s->info.system_values_read,
4722                                         SYSTEM_VALUE_LINE_COORD))) {
4723                         c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4724                         c->uses_implicit_point_line_varyings = true;
4725                 }
4726                 break;
4727         case MESA_SHADER_COMPUTE:
4728                 /* Set up the TSO for barriers, assuming we do some. */
4729                 if (c->devinfo->ver < 42) {
4730                         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
4731                                                       V3D_QPU_WADDR_SYNC));
4732                 }
4733 
4734                 if (c->devinfo->ver == 42) {
4735                         c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
4736                         c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4737                 } else if (c->devinfo->ver >= 71) {
4738                         c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
4739                         c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4740                 }
4741 
4742                 /* Set up the division between gl_LocalInvocationIndex and
4743                  * wg_in_mem in the payload reg.
4744                  */
4745                 int wg_size = (c->s->info.workgroup_size[0] *
4746                                c->s->info.workgroup_size[1] *
4747                                c->s->info.workgroup_size[2]);
4748                 c->local_invocation_index_bits =
4749                         ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
4750                 assert(c->local_invocation_index_bits <= 8);
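                /* For example, a workgroup of 100 invocations rounds up to
                 * 128, so local_invocation_index_bits is 7 and the remaining
                 * wg_bits = 16 - 7 = 9 bits are left for wg_in_mem below.
                 */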
4751 
4752                 if (c->s->info.shared_size || c->s->info.cs.has_variable_shared_mem) {
4753                         struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
4754                                                         vir_uniform_ui(c, 16));
4755                         if (c->s->info.workgroup_size[0] != 1 ||
4756                             c->s->info.workgroup_size[1] != 1 ||
4757                             c->s->info.workgroup_size[2] != 1) {
4758                                 int wg_bits = (16 -
4759                                                c->local_invocation_index_bits);
4760                                 int wg_mask = (1 << wg_bits) - 1;
4761                                 wg_in_mem = vir_AND(c, wg_in_mem,
4762                                                     vir_uniform_ui(c, wg_mask));
4763                         }
4764 
4765                         struct qreg shared_per_wg;
4766                         if (c->s->info.cs.has_variable_shared_mem) {
4767                                 shared_per_wg = vir_uniform(c, QUNIFORM_SHARED_SIZE, 0);
4768                         } else {
4769                                 shared_per_wg = vir_uniform_ui(c, c->s->info.shared_size);
4770                         }
4771 
4772                         c->cs_shared_offset =
4773                                 vir_ADD(c,
4774                                         vir_uniform(c, QUNIFORM_SHARED_OFFSET,0),
4775                                         vir_UMUL(c, wg_in_mem, shared_per_wg));
4776                 }
4777                 break;
4778         default:
4779                 break;
4780         }
4781 
4782         if (c->s->scratch_size) {
4783                 v3d_setup_spill_base(c);
4784                 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
4785         }
4786 
4787         switch (c->s->info.stage) {
4788         case MESA_SHADER_VERTEX:
4789                 ntq_setup_vs_inputs(c);
4790                 break;
4791         case MESA_SHADER_GEOMETRY:
4792                 ntq_setup_gs_inputs(c);
4793                 break;
4794         case MESA_SHADER_FRAGMENT:
4795                 ntq_setup_fs_inputs(c);
4796                 break;
4797         case MESA_SHADER_COMPUTE:
4798                 break;
4799         default:
4800                 unreachable("unsupported shader stage");
4801         }
4802 
4803         ntq_setup_outputs(c);
4804 
4805         /* Find the main function and emit the body. */
4806         nir_foreach_function(function, c->s) {
4807                 assert(function->is_entrypoint);
4808                 assert(function->impl);
4809                 ntq_emit_impl(c, function->impl);
4810         }
4811 }
4812 
4813 /**
4814  * When demoting a shader down to single-threaded, removes the THRSW
4815  * instructions (one will still be inserted at v3d_vir_to_qpu() for the
4816  * program end).
4817  */
4818 static void
4819 vir_remove_thrsw(struct v3d_compile *c)
4820 {
4821         vir_for_each_block(block, c) {
4822                 vir_for_each_inst_safe(inst, block) {
4823                         if (inst->qpu.sig.thrsw)
4824                                 vir_remove_instruction(c, inst);
4825                 }
4826         }
4827 
4828         c->last_thrsw = NULL;
4829 }
4830 
4831 /**
4832  * This makes sure we have a top-level last thread switch signalling the
4833  * start of the last thread section, adding a new thrsw
4834  * instruction if needed. We don't allow spilling in the last thread section, so
4835  * if we need to do any spills that inject additional thread switches later on,
4836  * we ensure this thread switch will still be the last thread switch in the
4837  * program, which makes last thread switch signalling a lot easier when we have
4838  * spilling. If in the end we don't need to spill to compile the program and we
4839  * injected a new thread switch instruction here only for that, we will
4840  * eventually restore the previous last thread switch and remove the one we
4841  * added here.
4842  */
4843 static void
4844 vir_emit_last_thrsw(struct v3d_compile *c,
4845                     struct qinst **restore_last_thrsw,
4846                     bool *restore_scoreboard_lock)
4847 {
4848         *restore_last_thrsw = c->last_thrsw;
4849 
4850         /* If we're threaded and the last THRSW was in conditional code, then
4851          * we need to emit another one so that we can flag it as the last
4852          * thrsw.
4853          */
4854         if (c->last_thrsw && !c->last_thrsw_at_top_level)
4855                 vir_emit_thrsw(c);
4856 
4857         /* If we're threaded, then we need to mark the last THRSW instruction
4858          * so we can emit a pair of them at QPU emit time.
4859          *
4860          * For V3D 4.x, we can spawn the non-fragment shaders already in the
4861          * post-last-THRSW state, so we can skip this.
4862          */
4863         if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT)
4864                 vir_emit_thrsw(c);
4865 
4866         /* If we have not inserted a last thread switch yet, do it now to ensure
4867          * any potential spilling we do happens before this. If we don't spill
4868          * in the end, we will restore the previous one.
4869          */
4870         if (*restore_last_thrsw == c->last_thrsw) {
4871                 if (*restore_last_thrsw)
4872                         (*restore_last_thrsw)->is_last_thrsw = false;
4873                 *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw;
4874                 vir_emit_thrsw(c);
4875         } else {
4876                 *restore_last_thrsw = c->last_thrsw;
4877         }
4878 
4879         assert(c->last_thrsw);
4880         c->last_thrsw->is_last_thrsw = true;
4881 }
4882 
4883 static void
4884 vir_restore_last_thrsw(struct v3d_compile *c,
4885                        struct qinst *thrsw,
4886                        bool scoreboard_lock)
4887 {
4888         assert(c->last_thrsw);
4889         vir_remove_instruction(c, c->last_thrsw);
4890         c->last_thrsw = thrsw;
4891         if (c->last_thrsw)
4892                 c->last_thrsw->is_last_thrsw = true;
4893         c->lock_scoreboard_on_first_thrsw = scoreboard_lock;
4894 }
4895 
4896 /* There's a flag in the shader for "center W is needed for reasons other than
4897  * non-centroid varyings", so we just walk the program after VIR optimization
4898  * to see if it's used.  It should be harmless to set even if we only use
4899  * center W for varyings.
4900  */
4901 static void
4902 vir_check_payload_w(struct v3d_compile *c)
4903 {
4904         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
4905                 return;
4906 
4907         vir_for_each_inst_inorder(inst, c) {
4908                 for (int i = 0; i < vir_get_nsrc(inst); i++) {
4909                         if (inst->src[i].file == c->payload_w.file &&
4910                             inst->src[i].index == c->payload_w.index) {
4911                                 c->uses_center_w = true;
4912                                 return;
4913                         }
4914                 }
4915         }
4916 }
4917 
4918 void
4919 v3d_nir_to_vir(struct v3d_compile *c)
4920 {
4921         if (V3D_DBG(NIR) ||
4922             v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4923                 fprintf(stderr, "%s prog %d/%d NIR:\n",
4924                         vir_get_stage_name(c),
4925                         c->program_id, c->variant_id);
4926                 nir_print_shader(c->s, stderr);
4927         }
4928 
4929         nir_to_vir(c);
4930 
4931         bool restore_scoreboard_lock = false;
4932         struct qinst *restore_last_thrsw;
4933 
4934         /* Emit the last THRSW before STVPM and TLB writes. */
4935         vir_emit_last_thrsw(c,
4936                             &restore_last_thrsw,
4937                             &restore_scoreboard_lock);
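        /* If we end up not spilling, this artificially injected THRSW is
         * removed again below via vir_restore_last_thrsw().
         */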
4938 
4939 
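        /* Emit the stage-specific epilogue: TLB writes for fragment shaders,
         * VPM stores for vertex and geometry shaders; compute shaders need
         * no epilogue here.
         */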
4940         switch (c->s->info.stage) {
4941         case MESA_SHADER_FRAGMENT:
4942                 emit_frag_end(c);
4943                 break;
4944         case MESA_SHADER_GEOMETRY:
4945                 emit_geom_end(c);
4946                 break;
4947         case MESA_SHADER_VERTEX:
4948                 emit_vert_end(c);
4949                 break;
4950         case MESA_SHADER_COMPUTE:
4951                 break;
4952         default:
4953                 unreachable("bad stage");
4954         }
4955 
4956         if (V3D_DBG(VIR) ||
4957             v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4958                 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
4959                         vir_get_stage_name(c),
4960                         c->program_id, c->variant_id);
4961                 vir_dump(c);
4962                 fprintf(stderr, "\n");
4963         }
4964 
4965         vir_optimize(c);
4966         sched_flags(c);
4967 
4968         vir_check_payload_w(c);
4969 
4970         /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
4971          * We used that on that platform to pipeline TMU writes and reduce the
4972          * number of thread switches, as well as try (mostly successfully) to
4973          * reduce maximum register pressure to allow more threads.  We should
4974          * do something of that sort for V3D -- either instruction scheduling
4975          * here, or delay the THRSW and LDTMUs from our texture
4976          * instructions until the results are needed.
4977          */
4978 
4979         if (V3D_DBG(VIR) ||
4980             v3d_debug_flag_for_shader_stage(c->s->info.stage)) {
4981                 fprintf(stderr, "%s prog %d/%d VIR:\n",
4982                         vir_get_stage_name(c),
4983                         c->program_id, c->variant_id);
4984                 vir_dump(c);
4985                 fprintf(stderr, "\n");
4986         }
4987 
4988         /* Attempt to allocate registers for the temporaries.  If we fail,
4989          * reduce thread count and try again.
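         * Each failed attempt resets the spill/fill counters and halves the
         * thread count; if allocation still fails at the minimum thread
         * count, compilation is marked as failed.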
4990          */
4991         int min_threads = 2;
4992         struct qpu_reg *temp_registers;
4993         while (true) {
4994                 temp_registers = v3d_register_allocate(c);
4995                 if (temp_registers) {
4996                         assert(c->spills + c->fills <= c->max_tmu_spills);
4997                         break;
4998                 }
4999 
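                /* When we are already down at min_threads and RA debugging is
                 * enabled, dump the VIR and shader-db stats to help diagnose
                 * the allocation failure.
                 */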
5000                 if (c->threads == min_threads &&
5001                     V3D_DBG(RA)) {
5002                         fprintf(stderr,
5003                                 "Failed to register allocate using %s\n",
5004                                 c->fallback_scheduler ? "the fallback scheduler" :
5005                                 "the normal scheduler");
5006 
5007                         vir_dump(c);
5008 
5009                         char *shaderdb;
5010                         int ret = v3d_shaderdb_dump(c, &shaderdb);
5011                         if (ret > 0) {
5012                                 fprintf(stderr, "%s\n", shaderdb);
5013                                 free(shaderdb);
5014                         }
5015                 }
5016 
5017                 if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
5018                         if (V3D_DBG(PERF)) {
5019                                 fprintf(stderr,
5020                                         "Failed to register allocate %s "
5021                                         "prog %d/%d at %d threads.\n",
5022                                         vir_get_stage_name(c),
5023                                         c->program_id, c->variant_id, c->threads);
5024                         }
5025                         c->compilation_result =
5026                                 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
5027                         return;
5028                 }
5029 
5030                 c->spills = 0;
5031                 c->fills = 0;
5032                 c->threads /= 2;
5033 
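                /* A single-threaded program has no other thread to switch to,
                 * so drop the thread switches we emitted.
                 */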
5034                 if (c->threads == 1)
5035                         vir_remove_thrsw(c);
5036         }
5037 
5038         /* If we didn't spill, then remove the last thread switch we injected
5039          * artificially (if any) and restore the previous one.
5040          */
5041         if (!c->spills && c->last_thrsw != restore_last_thrsw)
5042                 vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);
5043 
5044         if (c->spills &&
5045             (V3D_DBG(VIR) ||
5046              v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
5047                 fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
5048                         vir_get_stage_name(c),
5049                         c->program_id, c->variant_id);
5050                 vir_dump(c);
5051                 fprintf(stderr, "\n");
5052         }
5053 
5054         v3d_vir_to_qpu(c, temp_registers);
5055 }
5056