1 /*
2 * Copyright (c) 2014 Scott Mansell
3 * Copyright © 2014 Broadcom
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #include <inttypes.h>
26 #include "util/format/u_format.h"
27 #include "util/crc32.h"
28 #include "util/u_helpers.h"
29 #include "util/u_math.h"
30 #include "util/u_memory.h"
31 #include "util/ralloc.h"
32 #include "util/hash_table.h"
33 #include "tgsi/tgsi_dump.h"
34 #include "compiler/glsl_types.h"
35 #include "compiler/nir/nir.h"
36 #include "compiler/nir/nir_builder.h"
37 #include "nir/tgsi_to_nir.h"
38 #include "vc4_context.h"
39 #include "vc4_qpu.h"
40 #include "vc4_qir.h"
41
42 static struct qreg
43 ntq_get_src(struct vc4_compile *c, nir_src src, int i);
44 static void
45 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
46
47 static struct vc4_compiled_shader *
48 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
49 struct vc4_key *key);
50
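/* Counts how many vec4 slots a GLSL type occupies (the type-size callback
 * shape used when lowering I/O); the "bindless" flag is unused on vc4.
 */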
51 static int
52 type_size(const struct glsl_type *type, bool bindless)
53 {
54 return glsl_count_attribute_slots(type, false);
55 }
56
57 static void
58 resize_qreg_array(struct vc4_compile *c,
59 struct qreg **regs,
60 uint32_t *size,
61 uint32_t decl_size)
62 {
63 if (*size >= decl_size)
64 return;
65
66 uint32_t old_size = *size;
67 *size = MAX2(*size * 2, decl_size);
68 *regs = reralloc(c, *regs, struct qreg, *size);
69 if (!*regs) {
70 fprintf(stderr, "Malloc failure\n");
71 abort();
72 }
73
74 for (uint32_t i = old_size; i < *size; i++)
75 (*regs)[i] = c->undef;
76 }
77
78 static void
79 ntq_emit_thrsw(struct vc4_compile *c)
80 {
81 if (!c->fs_threaded)
82 return;
83
84 /* Always thread switch after each texture operation for now.
85 *
86 * We could do better by batching a bunch of texture fetches up and
87 * then doing one thread switch and collecting all their results
88 * afterward.
89 */
90 qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef,
91 c->undef, c->undef));
92 c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
93 }
94
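/* Loads an indirectly-addressed uniform by doing a raw memory fetch through
 * the TMU: clamp the offset to the declared range, add the UBO 0 base
 * address, and read the result back like a texture sample.
 */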
95 static struct qreg
96 indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
97 {
98 struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
99
100 /* Clamp to [0, array size). Note that MIN/MAX are signed. */
101 uint32_t range = nir_intrinsic_range(intr);
102 indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
103 indirect_offset = qir_MIN_NOIMM(c, indirect_offset,
104 qir_uniform_ui(c, range - 4));
105
106 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
107 indirect_offset,
108 qir_uniform(c, QUNIFORM_UBO0_ADDR,
109 nir_intrinsic_base(intr)));
110
111 c->num_texture_samples++;
112
113 ntq_emit_thrsw(c);
114
115 return qir_TEX_RESULT(c);
116 }
117
118 static struct qreg
119 vc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
120 {
121 ASSERTED int buffer_index = nir_src_as_uint(intr->src[0]);
122 assert(buffer_index == 1);
123 assert(c->stage == QSTAGE_FRAG);
124
125 struct qreg offset = ntq_get_src(c, intr->src[1], 0);
126
127 /* Clamp to [0, array size). Note that MIN/MAX are signed. */
128 offset = qir_MAX(c, offset, qir_uniform_ui(c, 0));
129 offset = qir_MIN_NOIMM(c, offset,
130 qir_uniform_ui(c, c->fs_key->ubo_1_size - 4));
131
132 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
133 offset,
134 qir_uniform(c, QUNIFORM_UBO1_ADDR, 0));
135
136 c->num_texture_samples++;
137
138 ntq_emit_thrsw(c);
139
140 return qir_TEX_RESULT(c);
141 }
142
143 nir_def *
144 vc4_nir_get_swizzled_channel(nir_builder *b, nir_def **srcs, int swiz)
145 {
146 switch (swiz) {
147 default:
148 case PIPE_SWIZZLE_NONE:
149 fprintf(stderr, "warning: unknown swizzle\n");
150 FALLTHROUGH;
151 case PIPE_SWIZZLE_0:
152 return nir_imm_float(b, 0.0);
153 case PIPE_SWIZZLE_1:
154 return nir_imm_float(b, 1.0);
155 case PIPE_SWIZZLE_X:
156 case PIPE_SWIZZLE_Y:
157 case PIPE_SWIZZLE_Z:
158 case PIPE_SWIZZLE_W:
159 return srcs[swiz];
160 }
161 }
162
163 static struct qreg *
164 ntq_init_ssa_def(struct vc4_compile *c, nir_def *def)
165 {
166 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
167 def->num_components);
168 _mesa_hash_table_insert(c->def_ht, def, qregs);
169 return qregs;
170 }
171
172 /**
173 * This function is responsible for getting QIR results into the associated
174 * storage for a NIR instruction.
175 *
176 * If it's a NIR SSA def, then we just set the associated hash table entry to
177 * the new result.
178 *
179 * If it's a NIR reg, then we need to update the existing qreg assigned to the
180 * NIR destination with the incoming value. To do that without introducing
181 * new MOVs, we require that the incoming qreg either be a uniform, or be
182 * SSA-defined by the previous QIR instruction in the block and rewritable by
183 * this function. That lets us sneak ahead and insert the SF flag beforehand
184 * (knowing that the previous instruction doesn't depend on flags) and rewrite
185 * its destination to be the NIR reg's destination.
186 */
187 static void
188 ntq_store_def(struct vc4_compile *c, nir_def *def, int chan,
189 struct qreg result)
190 {
191 struct qinst *last_inst = NULL;
192 if (!list_is_empty(&c->cur_block->instructions))
193 last_inst = (struct qinst *)c->cur_block->instructions.prev;
194
195 assert(result.file == QFILE_UNIF ||
196 (result.file == QFILE_TEMP &&
197 last_inst && last_inst == c->defs[result.index]));
198
199 nir_intrinsic_instr *store = nir_store_reg_for_def(def);
200 if (store == NULL) {
201 assert(chan < def->num_components);
202
203 struct qreg *qregs;
204 struct hash_entry *entry =
205 _mesa_hash_table_search(c->def_ht, def);
206
207 if (entry)
208 qregs = entry->data;
209 else
210 qregs = ntq_init_ssa_def(c, def);
211
212 qregs[chan] = result;
213 } else {
214 nir_def *reg = store->src[1].ssa;
215 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
216 assert(nir_intrinsic_base(store) == 0);
217 assert(nir_intrinsic_num_array_elems(decl) == 0);
218 struct hash_entry *entry =
219 _mesa_hash_table_search(c->def_ht, reg);
220 struct qreg *qregs = entry->data;
221
222 /* Insert a MOV if the source wasn't an SSA def in the
223 * previous instruction.
224 */
225 if (result.file == QFILE_UNIF) {
226 result = qir_MOV(c, result);
227 last_inst = c->defs[result.index];
228 }
229
230 /* We know they're both temps, so just rewrite index. */
231 c->defs[last_inst->dst.index] = NULL;
232 last_inst->dst.index = qregs[chan].index;
233
234 /* If we're in control flow, then make this update of the reg
235 * conditional on the execution mask.
236 */
237 if (c->execute.file != QFILE_NULL) {
238 last_inst->dst.index = qregs[chan].index;
239
240 /* Set the flags to the current exec mask. To insert
241 * the SF, we temporarily remove our SSA instruction.
242 */
243 list_del(&last_inst->link);
244 qir_SF(c, c->execute);
245 list_addtail(&last_inst->link,
246 &c->cur_block->instructions);
247
248 last_inst->cond = QPU_COND_ZS;
249 last_inst->cond_is_exec_mask = true;
250 }
251 }
252 }
253
254 static struct qreg
255 ntq_get_src(struct vc4_compile *c, nir_src src, int i)
256 {
257 struct hash_entry *entry;
258
259 nir_intrinsic_instr *load = nir_load_reg_for_def(src.ssa);
260 if (load == NULL) {
261 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
262 assert(i < src.ssa->num_components);
263 } else {
264 nir_def *reg = load->src[0].ssa;
265 ASSERTED nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
266 assert(nir_intrinsic_base(load) == 0);
267 assert(nir_intrinsic_num_array_elems(decl) == 0);
268 entry = _mesa_hash_table_search(c->def_ht, reg);
269 assert(i < nir_intrinsic_num_components(decl));
270 }
271
272 struct qreg *qregs = entry->data;
273 return qregs[i];
274 }
275
276 static struct qreg
277 ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr,
278 unsigned src)
279 {
280 struct qreg r = ntq_get_src(c, instr->src[src].src,
281 instr->src[src].swizzle[0]);
282
283 return r;
284 }
285
286 static inline struct qreg
287 qir_SAT(struct vc4_compile *c, struct qreg val)
288 {
289 return qir_FMAX(c,
290 qir_FMIN(c, val, qir_uniform_f(c, 1.0)),
291 qir_uniform_f(c, 0.0));
292 }
293
294 static struct qreg
295 ntq_rcp(struct vc4_compile *c, struct qreg x)
296 {
297 struct qreg r = qir_RCP(c, x);
298
299 /* Apply a Newton-Raphson step to improve the accuracy. */
300 r = qir_FMUL(c, r, qir_FSUB(c,
301 qir_uniform_f(c, 2.0),
302 qir_FMUL(c, x, r)));
303
304 return r;
305 }
306
307 static struct qreg
308 ntq_rsq(struct vc4_compile *c, struct qreg x)
309 {
310 struct qreg r = qir_RSQ(c, x);
311
312 /* Apply a Newton-Raphson step to improve the accuracy. */
313 r = qir_FMUL(c, r, qir_FSUB(c,
314 qir_uniform_f(c, 1.5),
315 qir_FMUL(c,
316 qir_uniform_f(c, 0.5),
317 qir_FMUL(c, x,
318 qir_FMUL(c, r, r)))));
319
320 return r;
321 }
322
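/* 32-bit integer multiply built out of 24-bit partial products, since the
 * QPU multiplier (MUL24) only handles 24-bit operands.  lolo plus the two
 * cross terms shifted up by 24 gives the low 32 bits of the product; the
 * hi*hi term would only affect bits 48 and above, so it is dropped.
 */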
323 static struct qreg
324 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1)
325 {
326 struct qreg src0_hi = qir_SHR(c, src0,
327 qir_uniform_ui(c, 24));
328 struct qreg src1_hi = qir_SHR(c, src1,
329 qir_uniform_ui(c, 24));
330
331 struct qreg hilo = qir_MUL24(c, src0_hi, src1);
332 struct qreg lohi = qir_MUL24(c, src0, src1_hi);
333 struct qreg lolo = qir_MUL24(c, src0, src1);
334
335 return qir_ADD(c, lolo, qir_SHL(c,
336 qir_ADD(c, hilo, lohi),
337 qir_uniform_ui(c, 24)));
338 }
339
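/* Converts a raw depth/stencil texture sample to a float depth in [0, 1]:
 * the 24-bit depth value lives in the upper bits, so shift it down and
 * rescale by 1/0xffffff.
 */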
340 static struct qreg
341 ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src)
342 {
343 struct qreg depthf = qir_ITOF(c, qir_SHR(c, src,
344 qir_uniform_ui(c, 8)));
345 return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff));
346 }
347
348 /**
349 * Emits a lowered TXF_MS from an MSAA texture.
350 *
351 * The addressing math has been lowered in NIR, and now we just need to read
352 * it like a UBO.
353 */
354 static void
355 ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
356 {
357 uint32_t tile_width = 32;
358 uint32_t tile_height = 32;
359 uint32_t tile_size = (tile_height * tile_width *
360 VC4_MAX_SAMPLES * sizeof(uint32_t));
361
362 unsigned unit = instr->texture_index;
363 uint32_t w = align(c->key->tex[unit].msaa_width, tile_width);
364 uint32_t w_tiles = w / tile_width;
365 uint32_t h = align(c->key->tex[unit].msaa_height, tile_height);
366 uint32_t h_tiles = h / tile_height;
367 uint32_t size = w_tiles * h_tiles * tile_size;
368
369 struct qreg addr;
370 assert(instr->num_srcs == 1);
371 assert(instr->src[0].src_type == nir_tex_src_coord);
372 addr = ntq_get_src(c, instr->src[0].src, 0);
373
374 /* Perform the clamping required by kernel validation. */
375 addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
376 addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4));
377
378 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
379 addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
380
381 ntq_emit_thrsw(c);
382
383 struct qreg tex = qir_TEX_RESULT(c);
384 c->num_texture_samples++;
385
386 enum pipe_format format = c->key->tex[unit].format;
387 if (util_format_is_depth_or_stencil(format)) {
388 struct qreg scaled = ntq_scale_depth_texture(c, tex);
389 for (int i = 0; i < 4; i++)
390 ntq_store_def(c, &instr->def, i, qir_MOV(c, scaled));
391 } else {
392 for (int i = 0; i < 4; i++)
393 ntq_store_def(c, &instr->def, i,
394 qir_UNPACK_8_F(c, tex, i));
395 }
396 }
397
398 static void
399 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
400 {
401 struct qreg s, t, r, lod, compare;
402 bool is_txb = false, is_txl = false;
403 unsigned unit = instr->texture_index;
404
405 if (instr->op == nir_texop_txf) {
406 ntq_emit_txf(c, instr);
407 return;
408 }
409
410 for (unsigned i = 0; i < instr->num_srcs; i++) {
411 switch (instr->src[i].src_type) {
412 case nir_tex_src_coord:
413 s = ntq_get_src(c, instr->src[i].src, 0);
414 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
415 t = qir_uniform_f(c, 0.5);
416 else
417 t = ntq_get_src(c, instr->src[i].src, 1);
418 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
419 r = ntq_get_src(c, instr->src[i].src, 2);
420 break;
421 case nir_tex_src_bias:
422 lod = ntq_get_src(c, instr->src[i].src, 0);
423 is_txb = true;
424 break;
425 case nir_tex_src_lod:
426 lod = ntq_get_src(c, instr->src[i].src, 0);
427 is_txl = true;
428 break;
429 case nir_tex_src_comparator:
430 compare = ntq_get_src(c, instr->src[i].src, 0);
431 break;
432 default:
433 unreachable("unknown texture source");
434 }
435 }
436
437 if (c->stage != QSTAGE_FRAG && !is_txl) {
438 /* From the GLSL 1.20 spec:
439 *
440 * "If it is mip-mapped and running on the vertex shader,
441 * then the base texture is used."
442 */
443 is_txl = true;
444 lod = qir_uniform_ui(c, 0);
445 }
446
447 if (c->key->tex[unit].force_first_level) {
448 lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit);
449 is_txl = true;
450 is_txb = false;
451 }
452
453 struct qreg texture_u[] = {
454 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit),
455 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit),
456 qir_uniform(c, QUNIFORM_CONSTANT, 0),
457 qir_uniform(c, QUNIFORM_CONSTANT, 0),
458 };
459 uint32_t next_texture_u = 0;
460
461 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) {
462 texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2,
463 unit | (is_txl << 16));
464 }
465
466 struct qinst *tmu;
467 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
468 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
469 tmu->src[qir_get_tex_uniform_src(tmu)] =
470 texture_u[next_texture_u++];
471 } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
472 c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
473 c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
474 c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
475 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
476 qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
477 unit));
478 tmu->src[qir_get_tex_uniform_src(tmu)] =
479 texture_u[next_texture_u++];
480 }
481
482 if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
483 s = qir_SAT(c, s);
484 }
485
486 if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
487 t = qir_SAT(c, t);
488 }
489
490 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
491 tmu->src[qir_get_tex_uniform_src(tmu)] =
492 texture_u[next_texture_u++];
493
494 if (is_txl || is_txb) {
495 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
496 tmu->src[qir_get_tex_uniform_src(tmu)] =
497 texture_u[next_texture_u++];
498 }
499
500 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
501 tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];
502
503 c->num_texture_samples++;
504
505 ntq_emit_thrsw(c);
506
507 struct qreg tex = qir_TEX_RESULT(c);
508
509 enum pipe_format format = c->key->tex[unit].format;
510
511 if (util_format_is_depth_or_stencil(format)) {
512 struct qreg normalized = ntq_scale_depth_texture(c, tex);
513 struct qreg depth_output;
514
515 struct qreg u0 = qir_uniform_f(c, 0.0f);
516 struct qreg u1 = qir_uniform_f(c, 1.0f);
517 if (c->key->tex[unit].compare_mode) {
518 /* From the GL_ARB_shadow spec:
519 *
520 * "Let Dt (D subscript t) be the depth texture
521 * value, in the range [0, 1]. Let R be the
522 * interpolated texture coordinate clamped to the
523 * range [0, 1]."
524 */
525 compare = qir_SAT(c, compare);
526
527 switch (c->key->tex[unit].compare_func) {
528 case PIPE_FUNC_NEVER:
529 depth_output = qir_uniform_f(c, 0.0f);
530 break;
531 case PIPE_FUNC_ALWAYS:
532 depth_output = u1;
533 break;
534 case PIPE_FUNC_EQUAL:
535 qir_SF(c, qir_FSUB(c, compare, normalized));
536 depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0);
537 break;
538 case PIPE_FUNC_NOTEQUAL:
539 qir_SF(c, qir_FSUB(c, compare, normalized));
540 depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0);
541 break;
542 case PIPE_FUNC_GREATER:
543 qir_SF(c, qir_FSUB(c, compare, normalized));
544 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
545 break;
546 case PIPE_FUNC_GEQUAL:
547 qir_SF(c, qir_FSUB(c, normalized, compare));
548 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
549 break;
550 case PIPE_FUNC_LESS:
551 qir_SF(c, qir_FSUB(c, compare, normalized));
552 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0);
553 break;
554 case PIPE_FUNC_LEQUAL:
555 qir_SF(c, qir_FSUB(c, normalized, compare));
556 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0);
557 break;
558 }
559 } else {
560 depth_output = normalized;
561 }
562
563 for (int i = 0; i < 4; i++)
564 ntq_store_def(c, &instr->def, i,
565 qir_MOV(c, depth_output));
566 } else {
567 for (int i = 0; i < 4; i++)
568 ntq_store_def(c, &instr->def, i,
569 qir_UNPACK_8_F(c, tex, i));
570 }
571 }
572
573 /**
574 * Computes x - floor(x), which is tricky because our FTOI truncates (rounds
575 * to zero).
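* For example, for src == -0.25 the truncation gives 0 and the difference is
* -0.25, so the conditional FADD below adds 1.0 to land back in [0, 1).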
576 */
577 static struct qreg
578 ntq_ffract(struct vc4_compile *c, struct qreg src)
579 {
580 struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src));
581 struct qreg diff = qir_FSUB(c, src, trunc);
582 qir_SF(c, diff);
583
584 qir_FADD_dest(c, diff,
585 diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
586
587 return qir_MOV(c, diff);
588 }
589
590 /**
591 * Computes floor(x), which is tricky because our FTOI truncates (rounds to
592 * zero).
593 */
594 static struct qreg
595 ntq_ffloor(struct vc4_compile *c, struct qreg src)
596 {
597 struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
598
599 /* This will be < 0 if we truncated and the truncation was of a value
600 * that was < 0 in the first place.
601 */
602 qir_SF(c, qir_FSUB(c, src, result));
603
604 struct qinst *sub = qir_FSUB_dest(c, result,
605 result, qir_uniform_f(c, 1.0));
606 sub->cond = QPU_COND_NS;
607
608 return qir_MOV(c, result);
609 }
610
611 /**
612 * Computes ceil(x), which is tricky because our FTOI truncates (rounds to
613 * zero).
614 */
615 static struct qreg
616 ntq_fceil(struct vc4_compile *c, struct qreg src)
617 {
618 struct qreg result = qir_ITOF(c, qir_FTOI(c, src));
619
620 /* This will be < 0 if we truncated and the truncation was of a value
621 * that was > 0 in the first place.
622 */
623 qir_SF(c, qir_FSUB(c, result, src));
624
625 qir_FADD_dest(c, result,
626 result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
627
628 return qir_MOV(c, result);
629 }
630
631 static struct qreg
632 ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x)
633 {
634 /* Since we're using a Taylor approximation, we want to have a small
635 * number of coefficients and take advantage of sin/cos repeating
636 * every 2pi. We keep our x as close to 0 as we can, since the series
637 * will be less accurate as |x| increases. (Also, be careful about
638 * phase-shifting the input to exploit sin/cos identities, because
639 * getting exact values at x == 0 is very important for SDL
640 * rendering.)
641 */
642 struct qreg scaled_x =
643 qir_FMUL(c, x,
644 qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
645 /* Note: FTOI truncates toward 0. */
646 struct qreg x_frac = qir_FSUB(c, scaled_x,
647 qir_ITOF(c, qir_FTOI(c, scaled_x)));
648 /* Map [0.5, 1] to [-0.5, 0] */
649 qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5)));
650 qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC;
651 /* Map [-1, -0.5] to [0, 0.5] */
652 qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5)));
653 qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
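/* x_frac now holds the angle as a fraction of a full turn, wrapped to
 * roughly [-0.5, 0.5]; the sin/cos polynomials below fold the 2*pi scale
 * back into their coefficients.
 */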
654
655 return x_frac;
656 }
657
658 static struct qreg
659 ntq_fsin(struct vc4_compile *c, struct qreg src)
660 {
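/* Taylor series coefficients of sin(2*pi*t), where t is the wrapped
 * fraction of a turn produced by ntq_shrink_sincos_input_range().
 */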
661 float coeff[] = {
662 2.0 * M_PI,
663 -pow(2.0 * M_PI, 3) / (3 * 2 * 1),
664 pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
665 -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
666 pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
667 };
668
669 struct qreg x = ntq_shrink_sincos_input_range(c, src);
670 struct qreg x2 = qir_FMUL(c, x, x);
671 struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
672 for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
673 x = qir_FMUL(c, x, x2);
674 sum = qir_FADD(c,
675 sum,
676 qir_FMUL(c,
677 x,
678 qir_uniform_f(c, coeff[i])));
679 }
680 return sum;
681 }
682
683 static struct qreg
684 ntq_fcos(struct vc4_compile *c, struct qreg src)
685 {
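/* Taylor series coefficients of cos(2*pi*t) for the same wrapped input. */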
686 float coeff[] = {
687 1.0f,
688 -pow(2.0 * M_PI, 2) / (2 * 1),
689 pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
690 -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
691 pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
692 -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
693 };
694
695 struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);
696 struct qreg sum = qir_uniform_f(c, coeff[0]);
697 struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
698 struct qreg x = x2; /* Current even power of x: x^2, x^4, ..., x^10 */
699 for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
700 if (i != 1)
701 x = qir_FMUL(c, x, x2);
702
703 sum = qir_FADD(c, qir_FMUL(c,
704 x,
705 qir_uniform_f(c, coeff[i])),
706 sum);
707 }
708 return sum;
709 }
710
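/* Returns -1.0, 0.0, or 1.0 for src: start from 0.0, overwrite with 1.0 if
 * the value is non-zero, then with -1.0 if it is negative.
 */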
711 static struct qreg
712 ntq_fsign(struct vc4_compile *c, struct qreg src)
713 {
714 struct qreg t = qir_get_temp(c);
715
716 qir_SF(c, src);
717 qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
718 qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
719 qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
720 return qir_MOV(c, t);
721 }
722
723 static void
724 emit_vertex_input(struct vc4_compile *c, int attr)
725 {
726 enum pipe_format format = c->vs_key->attr_formats[attr];
727 uint32_t attr_size = util_format_get_blocksize(format);
728
729 c->vattr_sizes[attr] = align(attr_size, 4);
730 for (int i = 0; i < align(attr_size, 4) / 4; i++) {
731 c->inputs[attr * 4 + i] =
732 qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
733 c->num_inputs++;
734 }
735 }
736
737 static void
738 emit_fragcoord_input(struct vc4_compile *c, int attr)
739 {
740 c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
741 c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
742 c->inputs[attr * 4 + 2] =
743 qir_FMUL(c,
744 qir_ITOF(c, qir_FRAG_Z(c)),
745 qir_uniform_f(c, 1.0 / 0xffffff));
746 c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
747 }
748
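/* Allocates the next varying slot and emits its interpolation code: the raw
 * varying read is multiplied by W, and VARY_ADD_C then adds in the C
 * coefficient that the hardware provides alongside the read.
 */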
749 static struct qreg
750 emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,
751 uint8_t swizzle)
752 {
753 uint32_t i = c->num_input_slots++;
754 struct qreg vary = {
755 QFILE_VARY,
756 i
757 };
758
759 if (c->num_input_slots >= c->input_slots_array_size) {
760 c->input_slots_array_size =
761 MAX2(4, c->input_slots_array_size * 2);
762
763 c->input_slots = reralloc(c, c->input_slots,
764 struct vc4_varying_slot,
765 c->input_slots_array_size);
766 }
767
768 c->input_slots[i].slot = slot;
769 c->input_slots[i].swizzle = swizzle;
770
771 return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
772 }
773
774 static void
775 emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)
776 {
777 for (int i = 0; i < 4; i++) {
778 c->inputs[attr * 4 + i] =
779 emit_fragment_varying(c, slot, i);
780 c->num_inputs++;
781 }
782 }
783
784 static void
785 add_output(struct vc4_compile *c,
786 uint32_t decl_offset,
787 uint8_t slot,
788 uint8_t swizzle)
789 {
790 uint32_t old_array_size = c->outputs_array_size;
791 resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
792 decl_offset + 1);
793
794 if (old_array_size != c->outputs_array_size) {
795 c->output_slots = reralloc(c,
796 c->output_slots,
797 struct vc4_varying_slot,
798 c->outputs_array_size);
799 }
800
801 c->output_slots[decl_offset].slot = slot;
802 c->output_slots[decl_offset].swizzle = swizzle;
803 }
804
805 static bool
806 ntq_src_is_only_ssa_def_user(nir_src *src)
807 {
808 return list_is_singular(&src->ssa->uses) &&
809 nir_load_reg_for_def(src->ssa) == NULL;
810 }
811
812 /**
813 * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
814 * bit set.
815 *
816 * However, as an optimization, it tries to find the instructions generating
817 * the sources to be packed and just emit the pack flag there, if possible.
818 */
819 static void
820 ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
821 {
822 struct qreg result = qir_get_temp(c);
823 struct nir_alu_instr *vec4 = NULL;
824
825 /* If packing from a vec4 op (as expected), identify it so that we can
826 * peek back at what generated its sources.
827 */
828 if (instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
829 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
830 nir_op_vec4) {
831 vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
832 }
833
834 /* If the pack is replicating the same channel 4 times, use the 8888
835 * pack flag. This is common for blending using the alpha
836 * channel.
837 */
838 if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
839 instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
840 instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
841 struct qreg rep = ntq_get_src(c,
842 instr->src[0].src,
843 instr->src[0].swizzle[0]);
844 ntq_store_def(c, &instr->def, 0, qir_PACK_8888_F(c, rep));
845 return;
846 }
847
848 for (int i = 0; i < 4; i++) {
849 int swiz = instr->src[0].swizzle[i];
850 struct qreg src;
851 if (vec4) {
852 src = ntq_get_src(c, vec4->src[swiz].src,
853 vec4->src[swiz].swizzle[0]);
854 } else {
855 src = ntq_get_src(c, instr->src[0].src, swiz);
856 }
857
858 if (vec4 &&
859 ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
860 src.file == QFILE_TEMP &&
861 c->defs[src.index] &&
862 qir_is_mul(c->defs[src.index]) &&
863 !c->defs[src.index]->dst.pack) {
864 struct qinst *rewrite = c->defs[src.index];
865 c->defs[src.index] = NULL;
866 rewrite->dst = result;
867 rewrite->dst.pack = QPU_PACK_MUL_8A + i;
868 continue;
869 }
870
871 qir_PACK_8_F(c, result, src, i);
872 }
873
874 ntq_store_def(c, &instr->def, 0, qir_MOV(c, result));
875 }
876
877 /** Handles sign-extended bitfield extracts for 16 bits. */
878 static struct qreg
879 ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
880 struct qreg bits)
881 {
882 assert(bits.file == QFILE_UNIF &&
883 c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
884 c->uniform_data[bits.index] == 16);
885
886 assert(offset.file == QFILE_UNIF &&
887 c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
888 int offset_bit = c->uniform_data[offset.index];
889 assert(offset_bit % 16 == 0);
890
891 return qir_UNPACK_16_I(c, base, offset_bit / 16);
892 }
893
894 /** Handles unsigned bitfield extracts for 8 bits. */
895 static struct qreg
896 ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
897 struct qreg bits)
898 {
899 assert(bits.file == QFILE_UNIF &&
900 c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
901 c->uniform_data[bits.index] == 8);
902
903 assert(offset.file == QFILE_UNIF &&
904 c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
905 int offset_bit = c->uniform_data[offset.index];
906 assert(offset_bit % 8 == 0);
907
908 return qir_UNPACK_8_I(c, base, offset_bit / 8);
909 }
910
911 /**
912 * If compare_instr is a supported comparison instruction, emits the
913 * comparison, writes sel_instr's result value (selected according to the
914 * comparison outcome) to *dest, and returns true; returns false otherwise.
915 */
916 static bool
917 ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
918 nir_alu_instr *compare_instr,
919 nir_alu_instr *sel_instr)
920 {
921 enum qpu_cond cond;
922
923 switch (compare_instr->op) {
924 case nir_op_feq32:
925 case nir_op_ieq32:
926 case nir_op_seq:
927 cond = QPU_COND_ZS;
928 break;
929 case nir_op_fneu32:
930 case nir_op_ine32:
931 case nir_op_sne:
932 cond = QPU_COND_ZC;
933 break;
934 case nir_op_fge32:
935 case nir_op_ige32:
936 case nir_op_uge32:
937 case nir_op_sge:
938 cond = QPU_COND_NC;
939 break;
940 case nir_op_flt32:
941 case nir_op_ilt32:
942 case nir_op_slt:
943 cond = QPU_COND_NS;
944 break;
945 default:
946 return false;
947 }
948
949 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
950 struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
951
952 unsigned unsized_type =
953 nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
954 if (unsized_type == nir_type_float)
955 qir_SF(c, qir_FSUB(c, src0, src1));
956 else
957 qir_SF(c, qir_SUB(c, src0, src1));
958
959 switch (sel_instr->op) {
960 case nir_op_seq:
961 case nir_op_sne:
962 case nir_op_sge:
963 case nir_op_slt:
964 *dest = qir_SEL(c, cond,
965 qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
966 break;
967
968 case nir_op_b32csel:
969 *dest = qir_SEL(c, cond,
970 ntq_get_alu_src(c, sel_instr, 1),
971 ntq_get_alu_src(c, sel_instr, 2));
972 break;
973
974 default:
975 *dest = qir_SEL(c, cond,
976 qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
977 break;
978 }
979
980 /* Make the temporary for nir_store_def(). */
981 *dest = qir_MOV(c, *dest);
982
983 return true;
984 }
985
986 /**
987 * Attempts to fold a comparison generating a boolean result into the
988 * condition code for selecting between two values, instead of comparing the
989 * boolean result against 0 to generate the condition code.
990 */
991 static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
992 struct qreg *src)
993 {
994 if (nir_load_reg_for_def(instr->src[0].src.ssa))
995 goto out;
996 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
997 goto out;
998 nir_alu_instr *compare =
999 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
1000 if (!compare)
1001 goto out;
1002
1003 struct qreg dest;
1004 if (ntq_emit_comparison(c, &dest, compare, instr))
1005 return dest;
1006
1007 out:
1008 qir_SF(c, src[0]);
1009 return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
1010 }
1011
1012 static struct qreg
1013 ntq_fddx(struct vc4_compile *c, struct qreg src)
1014 {
1015 /* Make sure that we have a bare temp to use for MUL rotation, so it
1016 * can be allocated to an accumulator.
1017 */
1018 if (src.pack || src.file != QFILE_TEMP)
1019 src = qir_MOV(c, src);
1020
1021 struct qreg from_left = qir_ROT_MUL(c, src, 1);
1022 struct qreg from_right = qir_ROT_MUL(c, src, 15);
1023
1024 /* Distinguish left/right pixels of the quad. */
1025 qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
1026 qir_uniform_ui(c, 1)));
1027
1028 return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1029 qir_FSUB(c, from_right, src),
1030 qir_FSUB(c, src, from_left)));
1031 }
1032
1033 static struct qreg
1034 ntq_fddy(struct vc4_compile *c, struct qreg src)
1035 {
1036 if (src.pack || src.file != QFILE_TEMP)
1037 src = qir_MOV(c, src);
1038
1039 struct qreg from_bottom = qir_ROT_MUL(c, src, 2);
1040 struct qreg from_top = qir_ROT_MUL(c, src, 14);
1041
1042 /* Distinguish top/bottom pixels of the quad. */
1043 qir_SF(c, qir_AND(c,
1044 qir_reg(QFILE_QPU_ELEMENT, 0),
1045 qir_uniform_ui(c, 2)));
1046
1047 return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
1048 qir_FSUB(c, from_top, src),
1049 qir_FSUB(c, src, from_bottom)));
1050 }
1051
1052 static struct qreg
1053 ntq_emit_cond_to_int(struct vc4_compile *c, enum qpu_cond cond)
1054 {
1055 return qir_MOV(c, qir_SEL(c, cond,
1056 qir_uniform_ui(c, 1),
1057 qir_uniform_ui(c, 0)));
1058 }
1059
1060 static void
1061 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
1062 {
1063 /* Vectors are special in that they have non-scalarized writemasks,
1064 * and just take the first swizzle channel for each argument in order
1065 * into each writemask channel.
1066 */
1067 if (instr->op == nir_op_vec2 ||
1068 instr->op == nir_op_vec3 ||
1069 instr->op == nir_op_vec4) {
1070 struct qreg srcs[4];
1071 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1072 srcs[i] = ntq_get_src(c, instr->src[i].src,
1073 instr->src[i].swizzle[0]);
1074 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1075 ntq_store_def(c, &instr->def, i,
1076 qir_MOV(c, srcs[i]));
1077 return;
1078 }
1079
1080 if (instr->op == nir_op_pack_unorm_4x8) {
1081 ntq_emit_pack_unorm_4x8(c, instr);
1082 return;
1083 }
1084
1085 if (instr->op == nir_op_unpack_unorm_4x8) {
1086 struct qreg src = ntq_get_src(c, instr->src[0].src,
1087 instr->src[0].swizzle[0]);
1088 unsigned count = instr->def.num_components;
1089 for (int i = 0; i < count; i++) {
1090 ntq_store_def(c, &instr->def, i,
1091 qir_UNPACK_8_F(c, src, i));
1092 }
1093 return;
1094 }
1095
1096 /* General case: We can just grab the one used channel per src. */
1097 struct qreg src[nir_op_infos[instr->op].num_inputs];
1098 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1099 src[i] = ntq_get_alu_src(c, instr, i);
1100 }
1101
1102 struct qreg result;
1103
1104 switch (instr->op) {
1105 case nir_op_mov:
1106 result = qir_MOV(c, src[0]);
1107 break;
1108 case nir_op_fmul:
1109 result = qir_FMUL(c, src[0], src[1]);
1110 break;
1111 case nir_op_fadd:
1112 result = qir_FADD(c, src[0], src[1]);
1113 break;
1114 case nir_op_fsub:
1115 result = qir_FSUB(c, src[0], src[1]);
1116 break;
1117 case nir_op_fmin:
1118 result = qir_FMIN(c, src[0], src[1]);
1119 break;
1120 case nir_op_fmax:
1121 result = qir_FMAX(c, src[0], src[1]);
1122 break;
1123
1124 case nir_op_f2i32:
1125 case nir_op_f2u32:
1126 result = qir_FTOI(c, src[0]);
1127 break;
1128 case nir_op_i2f32:
1129 case nir_op_u2f32:
1130 result = qir_ITOF(c, src[0]);
1131 break;
1132 case nir_op_b2f32:
1133 result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
1134 break;
1135 case nir_op_b2i32:
1136 result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
1137 break;
1138
1139 case nir_op_iadd:
1140 result = qir_ADD(c, src[0], src[1]);
1141 break;
1142 case nir_op_ushr:
1143 result = qir_SHR(c, src[0], src[1]);
1144 break;
1145 case nir_op_isub:
1146 result = qir_SUB(c, src[0], src[1]);
1147 break;
1148 case nir_op_ishr:
1149 result = qir_ASR(c, src[0], src[1]);
1150 break;
1151 case nir_op_ishl:
1152 result = qir_SHL(c, src[0], src[1]);
1153 break;
1154 case nir_op_imin:
1155 result = qir_MIN(c, src[0], src[1]);
1156 break;
1157 case nir_op_imax:
1158 result = qir_MAX(c, src[0], src[1]);
1159 break;
1160 case nir_op_iand:
1161 result = qir_AND(c, src[0], src[1]);
1162 break;
1163 case nir_op_ior:
1164 result = qir_OR(c, src[0], src[1]);
1165 break;
1166 case nir_op_ixor:
1167 result = qir_XOR(c, src[0], src[1]);
1168 break;
1169 case nir_op_inot:
1170 result = qir_NOT(c, src[0]);
1171 break;
1172
1173 case nir_op_imul:
1174 result = ntq_umul(c, src[0], src[1]);
1175 break;
1176
1177 case nir_op_seq:
1178 case nir_op_sne:
1179 case nir_op_sge:
1180 case nir_op_slt:
1181 case nir_op_feq32:
1182 case nir_op_fneu32:
1183 case nir_op_fge32:
1184 case nir_op_flt32:
1185 case nir_op_ieq32:
1186 case nir_op_ine32:
1187 case nir_op_ige32:
1188 case nir_op_uge32:
1189 case nir_op_ilt32:
1190 if (!ntq_emit_comparison(c, &result, instr, instr)) {
1191 fprintf(stderr, "Bad comparison instruction\n");
1192 }
1193 break;
1194
1195 case nir_op_b32csel:
1196 result = ntq_emit_bcsel(c, instr, src);
1197 break;
1198 case nir_op_fcsel:
1199 qir_SF(c, src[0]);
1200 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
1201 break;
1202
1203 case nir_op_frcp:
1204 result = ntq_rcp(c, src[0]);
1205 break;
1206 case nir_op_frsq:
1207 result = ntq_rsq(c, src[0]);
1208 break;
1209 case nir_op_fexp2:
1210 result = qir_EXP2(c, src[0]);
1211 break;
1212 case nir_op_flog2:
1213 result = qir_LOG2(c, src[0]);
1214 break;
1215
1216 case nir_op_ftrunc:
1217 result = qir_ITOF(c, qir_FTOI(c, src[0]));
1218 break;
1219 case nir_op_fceil:
1220 result = ntq_fceil(c, src[0]);
1221 break;
1222 case nir_op_ffract:
1223 result = ntq_ffract(c, src[0]);
1224 break;
1225 case nir_op_ffloor:
1226 result = ntq_ffloor(c, src[0]);
1227 break;
1228
1229 case nir_op_fsin:
1230 result = ntq_fsin(c, src[0]);
1231 break;
1232 case nir_op_fcos:
1233 result = ntq_fcos(c, src[0]);
1234 break;
1235
1236 case nir_op_fsign:
1237 result = ntq_fsign(c, src[0]);
1238 break;
1239
1240 case nir_op_fabs:
1241 result = qir_FMAXABS(c, src[0], src[0]);
1242 break;
1243 case nir_op_iabs:
1244 result = qir_MAX(c, src[0],
1245 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
1246 break;
1247
1248 case nir_op_ibitfield_extract:
1249 result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
1250 break;
1251
1252 case nir_op_ubitfield_extract:
1253 result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
1254 break;
1255
1256 case nir_op_usadd_4x8_vc4:
1257 result = qir_V8ADDS(c, src[0], src[1]);
1258 break;
1259
1260 case nir_op_ussub_4x8_vc4:
1261 result = qir_V8SUBS(c, src[0], src[1]);
1262 break;
1263
1264 case nir_op_umin_4x8_vc4:
1265 result = qir_V8MIN(c, src[0], src[1]);
1266 break;
1267
1268 case nir_op_umax_4x8_vc4:
1269 result = qir_V8MAX(c, src[0], src[1]);
1270 break;
1271
1272 case nir_op_umul_unorm_4x8_vc4:
1273 result = qir_V8MULD(c, src[0], src[1]);
1274 break;
1275
1276 case nir_op_uadd_carry:
1277 qir_SF(c, qir_ADD(c, src[0], src[1]));
1278 result = ntq_emit_cond_to_int(c, QPU_COND_CS);
1279 break;
1280
1281 case nir_op_usub_borrow:
1282 qir_SF(c, qir_SUB(c, src[0], src[1]));
1283 result = ntq_emit_cond_to_int(c, QPU_COND_CS);
1284 break;
1285
1286 default:
1287 fprintf(stderr, "unknown NIR ALU inst: ");
1288 nir_print_instr(&instr->instr, stderr);
1289 fprintf(stderr, "\n");
1290 abort();
1291 }
1292
1293 ntq_store_def(c, &instr->def, 0, result);
1294 }
1295
1296 static void
1297 emit_frag_end(struct vc4_compile *c)
1298 {
1299 struct qreg color;
1300 if (c->output_color_index != -1) {
1301 color = c->outputs[c->output_color_index];
1302 } else {
1303 color = qir_uniform_ui(c, 0);
1304 }
1305
1306 uint32_t discard_cond = QPU_COND_ALWAYS;
1307 if (c->s->info.fs.uses_discard) {
1308 qir_SF(c, c->discard);
1309 discard_cond = QPU_COND_ZS;
1310 }
1311
1312 if (c->fs_key->stencil_enabled) {
1313 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1314 qir_uniform(c, QUNIFORM_STENCIL, 0));
1315 if (c->fs_key->stencil_twoside) {
1316 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1317 qir_uniform(c, QUNIFORM_STENCIL, 1));
1318 }
1319 if (c->fs_key->stencil_full_writemasks) {
1320 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
1321 qir_uniform(c, QUNIFORM_STENCIL, 2));
1322 }
1323 }
1324
1325 if (c->output_sample_mask_index != -1) {
1326 qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
1327 }
1328
1329 if (c->fs_key->depth_enabled) {
1330 if (c->output_position_index != -1) {
1331 qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1332 qir_FMUL(c,
1333 c->outputs[c->output_position_index],
1334 qir_uniform_f(c, 0xffffff)))->cond = discard_cond;
1335 } else {
1336 qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
1337 qir_FRAG_Z(c))->cond = discard_cond;
1338 }
1339 }
1340
1341 if (!c->msaa_per_sample_output) {
1342 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),
1343 color)->cond = discard_cond;
1344 } else {
1345 for (int i = 0; i < VC4_MAX_SAMPLES; i++) {
1346 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),
1347 c->sample_colors[i])->cond = discard_cond;
1348 }
1349 }
1350 }
1351
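/* Writes the screen-space X/Y (position scaled by the viewport scale and
 * 1/W) to the VPM as two 16-bit values packed into one 32-bit word using
 * the 16A/16B pack modes.
 */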
1352 static void
1353 emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
1354 {
1355 struct qreg packed = qir_get_temp(c);
1356
1357 for (int i = 0; i < 2; i++) {
1358 struct qreg scale =
1359 qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
1360
1361 struct qreg packed_chan = packed;
1362 packed_chan.pack = QPU_PACK_A_16A + i;
1363
1364 qir_FTOI_dest(c, packed_chan,
1365 qir_FMUL(c,
1366 qir_FMUL(c,
1367 c->outputs[c->output_position_index + i],
1368 scale),
1369 rcp_w));
1370 }
1371
1372 qir_VPM_WRITE(c, packed);
1373 }
1374
1375 static void
1376 emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)
1377 {
1378 struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1379 struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1380
1381 qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,
1382 c->outputs[c->output_position_index + 2],
1383 zscale),
1384 rcp_w),
1385 zoffset));
1386 }
1387
1388 static void
1389 emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)
1390 {
1391 qir_VPM_WRITE(c, rcp_w);
1392 }
1393
1394 static void
1395 emit_point_size_write(struct vc4_compile *c)
1396 {
1397 struct qreg point_size;
1398
1399 if (c->output_point_size_index != -1)
1400 point_size = c->outputs[c->output_point_size_index];
1401 else
1402 point_size = qir_uniform_f(c, 1.0);
1403
1404 qir_VPM_WRITE(c, point_size);
1405 }
1406
1407 /**
1408 * Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.
1409 *
1410 * The simulator insists that there be at least one vertex attribute, so
1411 * vc4_draw.c will emit one if it wouldn't have otherwise. The simulator also
1412 * insists that all vertex attributes loaded get read by the VS/CS, so we have
1413 * to consume it here.
1414 */
1415 static void
1416 emit_stub_vpm_read(struct vc4_compile *c)
1417 {
1418 if (c->num_inputs)
1419 return;
1420
1421 c->vattr_sizes[0] = 4;
1422 (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
1423 c->num_inputs++;
1424 }
1425
1426 static void
1427 emit_vert_end(struct vc4_compile *c,
1428 struct vc4_varying_slot *fs_inputs,
1429 uint32_t num_fs_inputs)
1430 {
1431 struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1432
1433 emit_stub_vpm_read(c);
1434
1435 emit_scaled_viewport_write(c, rcp_w);
1436 emit_zs_write(c, rcp_w);
1437 emit_rcp_wc_write(c, rcp_w);
1438 if (c->vs_key->per_vertex_point_size)
1439 emit_point_size_write(c);
1440
1441 for (int i = 0; i < num_fs_inputs; i++) {
1442 struct vc4_varying_slot *input = &fs_inputs[i];
1443 int j;
1444
1445 for (j = 0; j < c->num_outputs; j++) {
1446 struct vc4_varying_slot *output =
1447 &c->output_slots[j];
1448
1449 if (input->slot == output->slot &&
1450 input->swizzle == output->swizzle) {
1451 qir_VPM_WRITE(c, c->outputs[j]);
1452 break;
1453 }
1454 }
1455 /* Emit padding if we didn't find a declared VS output for
1456 * this FS input.
1457 */
1458 if (j == c->num_outputs)
1459 qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));
1460 }
1461 }
1462
1463 static void
1464 emit_coord_end(struct vc4_compile *c)
1465 {
1466 struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);
1467
1468 emit_stub_vpm_read(c);
1469
1470 for (int i = 0; i < 4; i++)
1471 qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);
1472
1473 emit_scaled_viewport_write(c, rcp_w);
1474 emit_zs_write(c, rcp_w);
1475 emit_rcp_wc_write(c, rcp_w);
1476 if (c->vs_key->per_vertex_point_size)
1477 emit_point_size_write(c);
1478 }
1479
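/* Standard NIR optimization loop, repeated until no pass reports progress. */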
1480 static void
1481 vc4_optimize_nir(struct nir_shader *s)
1482 {
1483 bool progress;
1484 unsigned lower_flrp =
1485 (s->options->lower_flrp16 ? 16 : 0) |
1486 (s->options->lower_flrp32 ? 32 : 0) |
1487 (s->options->lower_flrp64 ? 64 : 0);
1488
1489 do {
1490 progress = false;
1491
1492 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1493 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1494 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
1495 NIR_PASS(progress, s, nir_copy_prop);
1496 NIR_PASS(progress, s, nir_opt_remove_phis);
1497 NIR_PASS(progress, s, nir_opt_dce);
1498 NIR_PASS(progress, s, nir_opt_dead_cf);
1499 NIR_PASS(progress, s, nir_opt_cse);
1500 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1501 NIR_PASS(progress, s, nir_opt_algebraic);
1502 NIR_PASS(progress, s, nir_opt_constant_folding);
1503 if (lower_flrp != 0) {
1504 bool lower_flrp_progress = false;
1505
1506 NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1507 lower_flrp,
1508 false /* always_precise */);
1509 if (lower_flrp_progress) {
1510 NIR_PASS(progress, s, nir_opt_constant_folding);
1511 progress = true;
1512 }
1513
1514 /* Nothing should rematerialize any flrps, so we only
1515 * need to do this lowering once.
1516 */
1517 lower_flrp = 0;
1518 }
1519
1520 NIR_PASS(progress, s, nir_opt_undef);
1521 NIR_PASS(progress, s, nir_opt_loop_unroll);
1522 } while (progress);
1523 }
1524
1525 static int
1526 driver_location_compare(const void *in_a, const void *in_b)
1527 {
1528 const nir_variable *const *a = in_a;
1529 const nir_variable *const *b = in_b;
1530
1531 return (*a)->data.driver_location - (*b)->data.driver_location;
1532 }
1533
1534 static void
1535 ntq_setup_inputs(struct vc4_compile *c)
1536 {
1537 unsigned num_entries = 0;
1538 nir_foreach_shader_in_variable(var, c->s)
1539 num_entries++;
1540
1541 if (num_entries == 0)
1542 return;
1543
1544 nir_variable *vars[num_entries];
1545
1546 unsigned i = 0;
1547 nir_foreach_shader_in_variable(var, c->s)
1548 vars[i++] = var;
1549
1550 /* Sort the variables so that we emit the input setup in
1551 * driver_location order. This is required for VPM reads, whose data
1552 * is fetched into the VPM in driver_location (TGSI register index)
1553 * order.
1554 */
1555 qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1556
1557 for (unsigned i = 0; i < num_entries; i++) {
1558 nir_variable *var = vars[i];
1559 assert(glsl_type_is_vector_or_scalar(var->type));
1560 unsigned loc = var->data.driver_location;
1561
1562 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1563 (loc + 1) * 4);
1564
1565 if (c->stage == QSTAGE_FRAG) {
1566 if (var->data.location == VARYING_SLOT_POS) {
1567 emit_fragcoord_input(c, loc);
1568 } else if (util_varying_is_point_coord(var->data.location,
1569 c->fs_key->point_sprite_mask)) {
1570 c->inputs[loc * 4 + 0] = c->point_x;
1571 c->inputs[loc * 4 + 1] = c->point_y;
1572 } else {
1573 emit_fragment_input(c, loc, var->data.location);
1574 }
1575 } else {
1576 emit_vertex_input(c, loc);
1577 }
1578 }
1579 }
1580
1581 static void
1582 ntq_setup_outputs(struct vc4_compile *c)
1583 {
1584 nir_foreach_shader_out_variable(var, c->s) {
1585 assert(glsl_type_is_vector_or_scalar(var->type));
1586 unsigned loc = var->data.driver_location * 4;
1587
1588 for (int i = 0; i < 4; i++)
1589 add_output(c, loc + i, var->data.location, i);
1590
1591 if (c->stage == QSTAGE_FRAG) {
1592 switch (var->data.location) {
1593 case FRAG_RESULT_COLOR:
1594 case FRAG_RESULT_DATA0:
1595 c->output_color_index = loc;
1596 break;
1597 case FRAG_RESULT_DEPTH:
1598 c->output_position_index = loc;
1599 break;
1600 case FRAG_RESULT_SAMPLE_MASK:
1601 c->output_sample_mask_index = loc;
1602 break;
1603 }
1604 } else {
1605 switch (var->data.location) {
1606 case VARYING_SLOT_POS:
1607 c->output_position_index = loc;
1608 break;
1609 case VARYING_SLOT_PSIZ:
1610 c->output_point_size_index = loc;
1611 break;
1612 }
1613 }
1614 }
1615 }
1616
1617 /**
1618 * Sets up the mapping from NIR register declarations (decl_reg) to struct qreg *.
1619 *
1620 * Each register gets a struct qreg per 32-bit component being stored.
1621 */
1622 static void
1623 ntq_setup_registers(struct vc4_compile *c, nir_function_impl *impl)
1624 {
1625 nir_foreach_reg_decl(decl, impl) {
1626 unsigned num_components = nir_intrinsic_num_components(decl);
1627 unsigned array_len = nir_intrinsic_num_array_elems(decl);
1628 array_len = MAX2(array_len, 1);
1629 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1630 array_len * num_components);
1631
1632 nir_def *nir_reg = &decl->def;
1633 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1634
1635 for (int i = 0; i < array_len * num_components; i++)
1636 qregs[i] = qir_get_temp(c);
1637 }
1638 }
1639
1640 static void
1641 ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
1642 {
1643 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1644 for (int i = 0; i < instr->def.num_components; i++)
1645 qregs[i] = qir_uniform_ui(c, instr->value[i].u32);
1646
1647 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1648 }
1649
1650 static void
1651 ntq_emit_ssa_undef(struct vc4_compile *c, nir_undef_instr *instr)
1652 {
1653 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1654
1655 /* QIR needs there to be *some* value, so pick 0 (same as for
1656 * ntq_setup_registers()).
1657 */
1658 for (int i = 0; i < instr->def.num_components; i++)
1659 qregs[i] = qir_uniform_ui(c, 0);
1660 }
1661
1662 static void
1663 ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)
1664 {
1665 assert(nir_src_as_uint(instr->src[0]) == 0);
1666
1667 /* Reads of the per-sample color need to be done in
1668 * order.
1669 */
1670 int sample_index = nir_intrinsic_base(instr);
1671 for (int i = 0; i <= sample_index; i++) {
1672 if (c->color_reads[i].file == QFILE_NULL) {
1673 c->color_reads[i] =
1674 qir_TLB_COLOR_READ(c);
1675 }
1676 }
1677 ntq_store_def(c, &instr->def, 0,
1678 qir_MOV(c, c->color_reads[sample_index]));
1679 }
1680
1681 static void
1682 ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
1683 {
1684 assert(instr->num_components == 1);
1685 assert(nir_src_is_const(instr->src[0]) &&
1686 "vc4 doesn't support indirect inputs");
1687
1688 uint32_t offset = nir_intrinsic_base(instr) +
1689 nir_src_as_uint(instr->src[0]);
1690 int comp = nir_intrinsic_component(instr);
1691 ntq_store_def(c, &instr->def, 0,
1692 qir_MOV(c, c->inputs[offset * 4 + comp]));
1693 }
1694
1695 static void
1696 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
1697 {
1698 unsigned offset;
1699
1700 switch (instr->intrinsic) {
1701 case nir_intrinsic_decl_reg:
1702 case nir_intrinsic_load_reg:
1703 case nir_intrinsic_store_reg:
1704 break; /* Ignore these */
1705
1706 case nir_intrinsic_load_uniform:
1707 assert(instr->num_components == 1);
1708 if (nir_src_is_const(instr->src[0])) {
1709 offset = nir_intrinsic_base(instr) +
1710 nir_src_as_uint(instr->src[0]);
1711 assert(offset % 4 == 0);
1712 /* Uniforms are indexed in dwords, so convert the byte offset. */
1713 offset = offset / 4;
1714 ntq_store_def(c, &instr->def, 0,
1715 qir_uniform(c, QUNIFORM_UNIFORM,
1716 offset));
1717 } else {
1718 ntq_store_def(c, &instr->def, 0,
1719 indirect_uniform_load(c, instr));
1720 }
1721 break;
1722
1723 case nir_intrinsic_load_ubo:
1724 assert(instr->num_components == 1);
1725 ntq_store_def(c, &instr->def, 0, vc4_ubo_load(c, instr));
1726 break;
1727
1728 case nir_intrinsic_load_user_clip_plane:
1729 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
1730 ntq_store_def(c, &instr->def, i,
1731 qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1732 nir_intrinsic_ucp_id(instr) *
1733 4 + i));
1734 }
1735 break;
1736
1737 case nir_intrinsic_load_blend_const_color_r_float:
1738 case nir_intrinsic_load_blend_const_color_g_float:
1739 case nir_intrinsic_load_blend_const_color_b_float:
1740 case nir_intrinsic_load_blend_const_color_a_float:
1741 ntq_store_def(c, &instr->def, 0,
1742 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X +
1743 (instr->intrinsic -
1744 nir_intrinsic_load_blend_const_color_r_float),
1745 0));
1746 break;
1747
1748 case nir_intrinsic_load_blend_const_color_rgba8888_unorm:
1749 ntq_store_def(c, &instr->def, 0,
1750 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA,
1751 0));
1752 break;
1753
1754 case nir_intrinsic_load_blend_const_color_aaaa8888_unorm:
1755 ntq_store_def(c, &instr->def, 0,
1756 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA,
1757 0));
1758 break;
1759
1760 case nir_intrinsic_load_sample_mask_in:
1761 ntq_store_def(c, &instr->def, 0,
1762 qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1763 break;
1764
1765 case nir_intrinsic_load_front_face:
1766 /* The register contains 0 (front) or 1 (back), and we need to
1767 * turn it into a NIR bool where true means front.
1768 */
1769 ntq_store_def(c, &instr->def, 0,
1770 qir_ADD(c,
1771 qir_uniform_ui(c, -1),
1772 qir_reg(QFILE_FRAG_REV_FLAG, 0)));
1773 break;
1774
1775 case nir_intrinsic_load_input:
1776 ntq_emit_load_input(c, instr);
1777 break;
1778
1779 case nir_intrinsic_load_tlb_color_brcm:
1780 ntq_emit_color_read(c, instr);
1781 break;
1782
1783 case nir_intrinsic_store_output:
1784 assert(nir_src_is_const(instr->src[1]) &&
1785 "vc4 doesn't support indirect outputs");
1786 offset = nir_intrinsic_base(instr) +
1787 nir_src_as_uint(instr->src[1]);
1788
1789 /* MSAA color outputs are the only case where we have an
1790 * output that's not lowered to being a store of a single 32
1791 * bit value.
1792 */
1793 if (c->stage == QSTAGE_FRAG && instr->num_components == 4) {
1794 assert(offset == c->output_color_index);
1795 for (int i = 0; i < 4; i++) {
1796 c->sample_colors[i] =
1797 qir_MOV(c, ntq_get_src(c, instr->src[0],
1798 i));
1799 }
1800 } else {
1801 offset = offset * 4 + nir_intrinsic_component(instr);
1802 assert(instr->num_components == 1);
1803 c->outputs[offset] =
1804 qir_MOV(c, ntq_get_src(c, instr->src[0], 0));
1805 c->num_outputs = MAX2(c->num_outputs, offset + 1);
1806 }
1807 break;
1808
1809 case nir_intrinsic_terminate:
1810 if (c->execute.file != QFILE_NULL) {
1811 qir_SF(c, c->execute);
1812 qir_MOV_cond(c, QPU_COND_ZS, c->discard,
1813 qir_uniform_ui(c, ~0));
1814 } else {
1815 qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0));
1816 }
1817 break;
1818
1819 case nir_intrinsic_terminate_if: {
1820 /* true (~0) if we're discarding */
1821 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1822
1823 if (c->execute.file != QFILE_NULL) {
1824 /* execute == 0 means the channel is active. Invert
1825 * the condition so that we can use zero as "executing
1826 * and discarding."
1827 */
1828 qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond)));
1829 qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond);
1830 } else {
1831 qir_OR_dest(c, c->discard, c->discard,
1832 ntq_get_src(c, instr->src[0], 0));
1833 }
1834
1835 break;
1836 }
1837
1838 case nir_intrinsic_load_texture_scale: {
1839 assert(nir_src_is_const(instr->src[0]));
1840 int sampler = nir_src_as_int(instr->src[0]);
1841
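/* Rectangle textures are sampled with unnormalized coordinates in GL,
 * so the driver provides 1/width and 1/height scale factors as
 * uniforms for the coordinate normalization done at the sample site.
 */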
1842 ntq_store_def(c, &instr->def, 0,
1843 qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, sampler));
1844 ntq_store_def(c, &instr->def, 1,
1845 qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, sampler));
1846 break;
1847 }
1848
1849 case nir_intrinsic_ddx:
1850 case nir_intrinsic_ddx_coarse:
1851 case nir_intrinsic_ddx_fine:
1852 ntq_store_def(c, &instr->def, 0,
1853 ntq_fddx(c, ntq_get_src(c, instr->src[0], 0)));
1854 break;
1855
1856 case nir_intrinsic_ddy:
1857 case nir_intrinsic_ddy_coarse:
1858 case nir_intrinsic_ddy_fine:
1859 ntq_store_def(c, &instr->def, 0,
1860 ntq_fddy(c, ntq_get_src(c, instr->src[0], 0)));
1861 break;
1862
1863 default:
1864 fprintf(stderr, "Unknown intrinsic: ");
1865 nir_print_instr(&instr->instr, stderr);
1866 fprintf(stderr, "\n");
1867 break;
1868 }
1869 }
1870
1871 /* Clears (activates) the execute flags for any channels whose jump target
1872 * matches this block.
1873 */
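/* Convention for the predication below: c->execute is 0 for channels
 * running the current block, and otherwise holds the index of the
 * block that channel is waiting to resume at.  Subtracting the current
 * block's index therefore sets Z exactly for the channels that should
 * wake up here.
 */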
1874 static void
1875 ntq_activate_execute_for_block(struct vc4_compile *c)
1876 {
1877 qir_SF(c, qir_SUB(c,
1878 c->execute,
1879 qir_uniform_ui(c, c->cur_block->index)));
1880 qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0));
1881 }
1882
1883 static void
1884 ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt)
1885 {
1886 if (!c->vc4->screen->has_control_flow) {
1887 fprintf(stderr,
1888 "IF statement support requires updated kernel.\n");
1889 return;
1890 }
1891
1892 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1893 bool empty_else_block =
1894 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1895 exec_list_is_empty(&nir_else_block->instr_list));
1896
1897 struct qblock *then_block = qir_new_block(c);
1898 struct qblock *after_block = qir_new_block(c);
1899 struct qblock *else_block;
1900 if (empty_else_block)
1901 else_block = after_block;
1902 else
1903 else_block = qir_new_block(c);
1904
1905 bool was_top_level = false;
1906 if (c->execute.file == QFILE_NULL) {
1907 c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
1908 was_top_level = true;
1909 }
1910
1911 /* Set ZS for executing (execute == 0) and jumping (if->condition ==
1912 * 0) channels, and then update execute flags for those to point to
1913 * the ELSE block.
1914 */
1915 qir_SF(c, qir_OR(c,
1916 c->execute,
1917 ntq_get_src(c, if_stmt->condition, 0)));
1918 qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1919 qir_uniform_ui(c, else_block->index));
1920
1921 /* Jump to ELSE if nothing is active for THEN, otherwise fall
1922 * through.
1923 */
1924 qir_SF(c, c->execute);
1925 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC);
1926 qir_link_blocks(c->cur_block, else_block);
1927 qir_link_blocks(c->cur_block, then_block);
1928
1929 /* Process the THEN block. */
1930 qir_set_emit_block(c, then_block);
1931 ntq_emit_cf_list(c, &if_stmt->then_list);
1932
1933 if (!empty_else_block) {
1934 /* Handle the end of the THEN block. First, all currently
1935 * active channels update their execute flags to point to
1936 * ENDIF
1937 */
1938 qir_SF(c, c->execute);
1939 qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1940 qir_uniform_ui(c, after_block->index));
1941
1942 /* If everything points at ENDIF, then jump there immediately. */
1943 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index)));
1944 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
1945 qir_link_blocks(c->cur_block, after_block);
1946 qir_link_blocks(c->cur_block, else_block);
1947
1948 qir_set_emit_block(c, else_block);
1949 ntq_activate_execute_for_block(c);
1950 ntq_emit_cf_list(c, &if_stmt->else_list);
1951 }
1952
1953 qir_link_blocks(c->cur_block, after_block);
1954
1955 qir_set_emit_block(c, after_block);
1956 if (was_top_level) {
1957 c->execute = c->undef;
1958 c->last_top_block = c->cur_block;
1959 } else {
1960 ntq_activate_execute_for_block(c);
1961 }
1962 }
1963
1964 static void
1965 ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump)
1966 {
1967 struct qblock *jump_block;
1968 switch (jump->type) {
1969 case nir_jump_break:
1970 jump_block = c->loop_break_block;
1971 break;
1972 case nir_jump_continue:
1973 jump_block = c->loop_cont_block;
1974 break;
1975 default:
1976 unreachable("Unsupported jump type\n");
1977 }
1978
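/* Park every currently active channel (execute == 0) at the jump
 * target block; channels that were already waiting elsewhere keep
 * their existing target.
 */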
1979 qir_SF(c, c->execute);
1980 qir_MOV_cond(c, QPU_COND_ZS, c->execute,
1981 qir_uniform_ui(c, jump_block->index));
1982
1983 /* Jump to the destination block if everyone has taken the jump. */
1984 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index)));
1985 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS);
1986 struct qblock *new_block = qir_new_block(c);
1987 qir_link_blocks(c->cur_block, jump_block);
1988 qir_link_blocks(c->cur_block, new_block);
1989 qir_set_emit_block(c, new_block);
1990 }
1991
1992 static void
1993 ntq_emit_instr(struct vc4_compile *c, nir_instr *instr)
1994 {
1995 switch (instr->type) {
1996 case nir_instr_type_alu:
1997 ntq_emit_alu(c, nir_instr_as_alu(instr));
1998 break;
1999
2000 case nir_instr_type_intrinsic:
2001 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2002 break;
2003
2004 case nir_instr_type_load_const:
2005 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
2006 break;
2007
2008 case nir_instr_type_undef:
2009 ntq_emit_ssa_undef(c, nir_instr_as_undef(instr));
2010 break;
2011
2012 case nir_instr_type_tex:
2013 ntq_emit_tex(c, nir_instr_as_tex(instr));
2014 break;
2015
2016 case nir_instr_type_jump:
2017 ntq_emit_jump(c, nir_instr_as_jump(instr));
2018 break;
2019
2020 default:
2021 fprintf(stderr, "Unknown NIR instr type: ");
2022 nir_print_instr(instr, stderr);
2023 fprintf(stderr, "\n");
2024 abort();
2025 }
2026 }
2027
2028 static void
2029 ntq_emit_block(struct vc4_compile *c, nir_block *block)
2030 {
2031 nir_foreach_instr(instr, block) {
2032 ntq_emit_instr(c, instr);
2033 }
2034 }
2035
2036 static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
2037
2038 static void
2039 ntq_emit_loop(struct vc4_compile *c, nir_loop *loop)
2040 {
2041 assert(!nir_loop_has_continue_construct(loop));
2042 if (!c->vc4->screen->has_control_flow) {
2043 fprintf(stderr,
2044 "loop support requires updated kernel.\n");
2045 ntq_emit_cf_list(c, &loop->body);
2046 return;
2047 }
2048
2049 bool was_top_level = false;
2050 if (c->execute.file == QFILE_NULL) {
2051 c->execute = qir_MOV(c, qir_uniform_ui(c, 0));
2052 was_top_level = true;
2053 }
2054
2055 struct qblock *save_loop_cont_block = c->loop_cont_block;
2056 struct qblock *save_loop_break_block = c->loop_break_block;
2057
2058 c->loop_cont_block = qir_new_block(c);
2059 c->loop_break_block = qir_new_block(c);
2060
2061 qir_link_blocks(c->cur_block, c->loop_cont_block);
2062 qir_set_emit_block(c, c->loop_cont_block);
2063 ntq_activate_execute_for_block(c);
2064
2065 ntq_emit_cf_list(c, &loop->body);
2066
2067 /* If anything had explicitly continued, or is here at the end of the
2068 * loop, then we need to loop again. SF updates are masked by the
2069 * instruction's condition, so we can do the OR of the two conditions
2070 * within SF.
2071 */
2072 qir_SF(c, c->execute);
2073 struct qinst *cont_check =
2074 qir_SUB_dest(c,
2075 c->undef,
2076 c->execute,
2077 qir_uniform_ui(c, c->loop_cont_block->index));
2078 cont_check->cond = QPU_COND_ZC;
2079 cont_check->sf = true;
2080
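/* At this point Z is set for channels that are either still active
 * (execute == 0, from the first SF) or parked on the continue block
 * (the conditional SUB above only updated flags where Z was clear).
 * Branch back to the loop head if any such channel exists.
 */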
2081 qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS);
2082 qir_link_blocks(c->cur_block, c->loop_cont_block);
2083 qir_link_blocks(c->cur_block, c->loop_break_block);
2084
2085 qir_set_emit_block(c, c->loop_break_block);
2086 if (was_top_level) {
2087 c->execute = c->undef;
2088 c->last_top_block = c->cur_block;
2089 } else {
2090 ntq_activate_execute_for_block(c);
2091 }
2092
2093 c->loop_break_block = save_loop_break_block;
2094 c->loop_cont_block = save_loop_cont_block;
2095 }
2096
2097 static void
2098 ntq_emit_function(struct vc4_compile *c, nir_function_impl *func)
2099 {
2100 fprintf(stderr, "FUNCTIONS not handled.\n");
2101 abort();
2102 }
2103
2104 static void
2105 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
2106 {
2107 foreach_list_typed(nir_cf_node, node, node, list) {
2108 switch (node->type) {
2109 case nir_cf_node_block:
2110 ntq_emit_block(c, nir_cf_node_as_block(node));
2111 break;
2112
2113 case nir_cf_node_if:
2114 ntq_emit_if(c, nir_cf_node_as_if(node));
2115 break;
2116
2117 case nir_cf_node_loop:
2118 ntq_emit_loop(c, nir_cf_node_as_loop(node));
2119 break;
2120
2121 case nir_cf_node_function:
2122 ntq_emit_function(c, nir_cf_node_as_function(node));
2123 break;
2124
2125 default:
2126 fprintf(stderr, "Unknown NIR node type\n");
2127 abort();
2128 }
2129 }
2130 }
2131
2132 static void
2133 ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl)
2134 {
2135 ntq_setup_registers(c, impl);
2136 ntq_emit_cf_list(c, &impl->body);
2137 }
2138
2139 static void
2140 nir_to_qir(struct vc4_compile *c)
2141 {
2142 if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard)
2143 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
2144
2145 ntq_setup_inputs(c);
2146 ntq_setup_outputs(c);
2147
2148 /* Find the main function and emit the body. */
2149 nir_foreach_function(function, c->s) {
2150 assert(strcmp(function->name, "main") == 0);
2151 assert(function->impl);
2152 ntq_emit_impl(c, function->impl);
2153 }
2154 }
2155
2156 static const nir_shader_compiler_options nir_options = {
2157 .lower_all_io_to_temps = true,
2158 .lower_extract_byte = true,
2159 .lower_extract_word = true,
2160 .lower_insert_byte = true,
2161 .lower_insert_word = true,
2162 .lower_fdiv = true,
2163 .lower_ffma16 = true,
2164 .lower_ffma32 = true,
2165 .lower_ffma64 = true,
2166 .lower_flrp32 = true,
2167 .lower_fmod = true,
2168 .lower_fpow = true,
2169 .lower_fsat = true,
2170 .lower_fsqrt = true,
2171 .lower_ldexp = true,
2172 .lower_fneg = true,
2173 .lower_ineg = true,
2174 .lower_to_scalar = true,
2175 .lower_umax = true,
2176 .lower_umin = true,
2177 .lower_isign = true,
2178 .has_fsub = true,
2179 .has_isub = true,
2180 .has_texture_scaling = true,
2181 .lower_mul_high = true,
2182 .max_unroll_iterations = 32,
2183 .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp),
2184 .has_ddx_intrinsics = true,
2185 .scalarize_ddx = true,
2186 };
2187
2188 const void *
2189 vc4_screen_get_compiler_options(struct pipe_screen *pscreen,
2190 enum pipe_shader_ir ir,
2191 enum pipe_shader_type shader)
2192 {
2193 return &nir_options;
2194 }
2195
2196 static int
2197 count_nir_instrs(nir_shader *nir)
2198 {
2199 int count = 0;
2200 nir_foreach_function_impl(impl, nir) {
2201 nir_foreach_block(block, impl) {
2202 nir_foreach_instr(instr, block)
2203 count++;
2204 }
2205 }
2206 return count;
2207 }
2208
2209 static struct vc4_compile *
2210 vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
2211 struct vc4_key *key, bool fs_threaded)
2212 {
2213 struct vc4_compile *c = qir_compile_init();
2214
2215 c->vc4 = vc4;
2216 c->stage = stage;
2217 c->shader_state = &key->shader_state->base;
2218 c->program_id = key->shader_state->program_id;
2219 c->variant_id =
2220 p_atomic_inc_return(&key->shader_state->compiled_variant_count);
2221 c->fs_threaded = fs_threaded;
2222
2223 c->key = key;
2224 switch (stage) {
2225 case QSTAGE_FRAG:
2226 c->fs_key = (struct vc4_fs_key *)key;
2227 if (c->fs_key->is_points) {
2228 c->point_x = emit_fragment_varying(c, ~0, 0);
2229 c->point_y = emit_fragment_varying(c, ~0, 0);
2230 } else if (c->fs_key->is_lines) {
2231 c->line_x = emit_fragment_varying(c, ~0, 0);
2232 }
2233 break;
2234 case QSTAGE_VERT:
2235 c->vs_key = (struct vc4_vs_key *)key;
2236 break;
2237 case QSTAGE_COORD:
2238 c->vs_key = (struct vc4_vs_key *)key;
2239 break;
2240 }
2241
2242 c->s = nir_shader_clone(c, key->shader_state->base.ir.nir);
2243
2244 if (stage == QSTAGE_FRAG) {
2245 NIR_PASS_V(c->s, vc4_nir_lower_blend, c);
2246 }
2247
2248 struct nir_lower_tex_options tex_options = {
2249 .lower_txp = ~0,
2250
2251 /* Apply swizzles to all samplers. */
2252 .swizzle_result = ~0,
2253 .lower_invalid_implicit_lod = true,
2254 };
2255
2256 /* Lower the format swizzle and ARB_texture_swizzle-style swizzle.
2257 * The format swizzling applies before sRGB decode, and
2258 * ARB_texture_swizzle is the last thing before returning the sample.
2259 */
2260 for (int i = 0; i < ARRAY_SIZE(key->tex); i++) {
2261 enum pipe_format format = c->key->tex[i].format;
2262
2263 if (!format)
2264 continue;
2265
2266 const uint8_t *format_swizzle = vc4_get_format_swizzle(format);
2267
2268 for (int j = 0; j < 4; j++) {
2269 uint8_t arb_swiz = c->key->tex[i].swizzle[j];
2270
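/* Swizzle values 0-3 select a channel and get routed through the
 * format's channel ordering; values above 3 (SWIZZLE_0/SWIZZLE_1)
 * pass through unchanged.
 */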
2271 if (arb_swiz <= 3) {
2272 tex_options.swizzles[i][j] =
2273 format_swizzle[arb_swiz];
2274 } else {
2275 tex_options.swizzles[i][j] = arb_swiz;
2276 }
2277 }
2278
2279 if (util_format_is_srgb(format))
2280 tex_options.lower_srgb |= (1 << i);
2281 }
2282
2283 NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
2284
2285 if (c->key->ucp_enables) {
2286 if (stage == QSTAGE_FRAG) {
2287 NIR_PASS_V(c->s, nir_lower_clip_fs,
2288 c->key->ucp_enables, false);
2289 } else {
2290 NIR_PASS_V(c->s, nir_lower_clip_vs,
2291 c->key->ucp_enables, false, false, NULL);
2292 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
2293 nir_var_shader_out, NULL, NULL);
2294 }
2295 }
2296
2297 /* FS input scalarizing must happen after nir_lower_two_sided_color,
2298 * which only handles a vec4 at a time. Similarly, VS output
2299 * scalarizing must happen after nir_lower_clip_vs.
2300 */
2301 if (c->stage == QSTAGE_FRAG)
2302 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
2303 else
2304 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
2305
2306 NIR_PASS_V(c->s, vc4_nir_lower_io, c);
2307 NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
2308 nir_lower_idiv_options idiv_options = {
2309 .allow_fp16 = true,
2310 };
2311 NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
2312 NIR_PASS(_, c->s, nir_lower_alu);
2313
2314 vc4_optimize_nir(c->s);
2315
2316 /* Do late algebraic optimization to turn add(a, neg(b)) back into
2317 * subs, then the mandatory cleanup after algebraic. Note that it may
2318 * produce fnegs, and if so then we need to keep running to squash
2319 * fneg(fneg(a)).
2320 */
2321 bool more_late_algebraic = true;
2322 while (more_late_algebraic) {
2323 more_late_algebraic = false;
2324 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
2325 NIR_PASS_V(c->s, nir_opt_constant_folding);
2326 NIR_PASS_V(c->s, nir_copy_prop);
2327 NIR_PASS_V(c->s, nir_opt_dce);
2328 NIR_PASS_V(c->s, nir_opt_cse);
2329 }
2330
2331 NIR_PASS_V(c->s, nir_lower_bool_to_int32);
2332
2333 NIR_PASS_V(c->s, nir_convert_from_ssa, true);
2334 NIR_PASS_V(c->s, nir_trivialize_registers);
2335
2336 if (VC4_DBG(NIR)) {
2337 fprintf(stderr, "%s prog %d/%d NIR:\n",
2338 qir_get_stage_name(c->stage),
2339 c->program_id, c->variant_id);
2340 nir_print_shader(c->s, stderr);
2341 }
2342
2343 nir_to_qir(c);
2344
2345 switch (stage) {
2346 case QSTAGE_FRAG:
2347 /* FS threading requires that the thread execute
2348 * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
2349 * (with no other THRSW afterwards, obviously). If we didn't
2350 * fetch a texture at a top level block, this wouldn't be
2351 * true.
2352 */
2353 if (c->fs_threaded && !c->last_thrsw_at_top_level) {
2354 c->failed = true;
2355 return c;
2356 }
2357
2358 emit_frag_end(c);
2359 break;
2360 case QSTAGE_VERT:
2361 emit_vert_end(c,
2362 c->vs_key->fs_inputs->input_slots,
2363 c->vs_key->fs_inputs->num_inputs);
2364 break;
2365 case QSTAGE_COORD:
2366 emit_coord_end(c);
2367 break;
2368 }
2369
2370 if (VC4_DBG(QIR)) {
2371 fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n",
2372 qir_get_stage_name(c->stage),
2373 c->program_id, c->variant_id);
2374 qir_dump(c);
2375 fprintf(stderr, "\n");
2376 }
2377
2378 qir_optimize(c);
2379 qir_lower_uniforms(c);
2380
2381 qir_schedule_instructions(c);
2382 qir_emit_uniform_stream_resets(c);
2383
2384 if (VC4_DBG(QIR)) {
2385 fprintf(stderr, "%s prog %d/%d QIR:\n",
2386 qir_get_stage_name(c->stage),
2387 c->program_id, c->variant_id);
2388 qir_dump(c);
2389 fprintf(stderr, "\n");
2390 }
2391
2392 qir_reorder_uniforms(c);
2393 vc4_generate_code(vc4, c);
2394
2395 ralloc_free(c->s);
2396
2397 return c;
2398 }
2399
2400 static void
2401 vc4_setup_shared_precompile_key(struct vc4_uncompiled_shader *uncompiled,
2402 struct vc4_key *key)
2403 {
2404 nir_shader *s = uncompiled->base.ir.nir;
2405
2406 for (int i = 0; i < s->info.num_textures; i++) {
2407 key->tex[i].format = PIPE_FORMAT_R8G8B8A8_UNORM;
2408 key->tex[i].swizzle[0] = PIPE_SWIZZLE_X;
2409 key->tex[i].swizzle[1] = PIPE_SWIZZLE_Y;
2410 key->tex[i].swizzle[2] = PIPE_SWIZZLE_Z;
2411 key->tex[i].swizzle[3] = PIPE_SWIZZLE_W;
2412 }
2413 }
2414
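/* A varying slot is tracked as a single packed byte, (slot << 2) +
 * component, which is why the assume() below keeps the shifted slot in
 * range.
 */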
2415 static inline struct vc4_varying_slot
2416 vc4_slot_from_slot_and_component(uint8_t slot, uint8_t component)
2417 {
2418 assume(slot < 255 / 4);
2419 return (struct vc4_varying_slot){ (slot << 2) + component };
2420 }
2421
2422 static void
2423 precompile_all_fs_inputs(nir_shader *s,
2424 struct vc4_fs_inputs *fs_inputs)
2425 {
2426 /* Assume all VS outputs will actually be used by the FS and output
2427 * them (the two sides have to match exactly). */
2428 nir_foreach_shader_out_variable(var, s) {
2429 const int array_len =
2430 glsl_type_is_vector_or_scalar(var->type) ?
2431 1 : glsl_get_length(var->type);
2432 for (int j = 0; j < array_len; j++) {
2433 const int slot = var->data.location + j;
2434 const int num_components =
2435 glsl_get_components(var->type);
2436 for (int i = 0; i < num_components; i++) {
2437 const int swiz = var->data.location_frac + i;
2438 fs_inputs->input_slots[fs_inputs->num_inputs++] =
2439 vc4_slot_from_slot_and_component(slot,
2440 swiz);
2441 }
2442 }
2443 }
2444 }
2445
2446 /**
2447 * Precompiles a shader variant at shader state creation time if
2448 * VC4_DEBUG=shaderdb is set.
2449 */
2450 static void
2451 vc4_shader_precompile(struct vc4_context *vc4,
2452 struct vc4_uncompiled_shader *so)
2453 {
2454 nir_shader *s = so->base.ir.nir;
2455
2456 if (s->info.stage == MESA_SHADER_FRAGMENT) {
2457 struct vc4_fs_key key = {
2458 .base.shader_state = so,
2459 .depth_enabled = true,
2460 .logicop_func = PIPE_LOGICOP_COPY,
2461 .color_format = PIPE_FORMAT_R8G8B8A8_UNORM,
2462 .blend = {
2463 .blend_enable = false,
2464 .colormask = PIPE_MASK_RGBA,
2465 },
2466 };
2467
2468 vc4_setup_shared_precompile_key(so, &key.base);
2469 vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key.base);
2470 } else {
2471 assert(s->info.stage == MESA_SHADER_VERTEX);
2472 struct vc4_varying_slot input_slots[64] = {};
2473 struct vc4_fs_inputs fs_inputs = {
2474 .input_slots = input_slots,
2475 .num_inputs = 0,
2476 };
2477 struct vc4_vs_key key = {
2478 .base.shader_state = so,
2479 .fs_inputs = &fs_inputs,
2480 };
2481
2482 vc4_setup_shared_precompile_key(so, &key.base);
2483 precompile_all_fs_inputs(s, &fs_inputs);
2484 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key.base);
2485
2486 /* Compile VS bin shader: only position (XXX: include TF) */
2487 key.is_coord = true;
2488 fs_inputs.num_inputs = 0;
2489 precompile_all_fs_inputs(s, &fs_inputs);
2490 for (int i = 0; i < 4; i++) {
2491 fs_inputs.input_slots[fs_inputs.num_inputs++] =
2492 vc4_slot_from_slot_and_component(VARYING_SLOT_POS,
2493 i);
2494 }
2495 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key.base);
2496 }
2497 }
2498
2499 static void *
2500 vc4_shader_state_create(struct pipe_context *pctx,
2501 const struct pipe_shader_state *cso)
2502 {
2503 struct vc4_context *vc4 = vc4_context(pctx);
2504 struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader);
2505 if (!so)
2506 return NULL;
2507
2508 so->program_id = vc4->next_uncompiled_program_id++;
2509
2510 nir_shader *s;
2511
2512 if (cso->type == PIPE_SHADER_IR_NIR) {
2513 /* The backend takes ownership of the NIR shader on state
2514 * creation.
2515 */
2516 s = cso->ir.nir;
2517 } else {
2518 assert(cso->type == PIPE_SHADER_IR_TGSI);
2519
2520 if (VC4_DBG(TGSI)) {
2521 fprintf(stderr, "prog %d TGSI:\n",
2522 so->program_id);
2523 tgsi_dump(cso->tokens, 0);
2524 fprintf(stderr, "\n");
2525 }
2526 s = tgsi_to_nir(cso->tokens, pctx->screen, false);
2527 }
2528
2529 if (s->info.stage == MESA_SHADER_VERTEX)
2530 NIR_PASS_V(s, nir_lower_point_size, 1.0f, 0.0f);
2531
2532 NIR_PASS_V(s, nir_lower_io,
2533 nir_var_shader_in | nir_var_shader_out | nir_var_uniform,
2534 type_size, (nir_lower_io_options)0);
2535
2536 NIR_PASS_V(s, nir_normalize_cubemap_coords);
2537
2538 NIR_PASS_V(s, nir_lower_load_const_to_scalar);
2539
2540 vc4_optimize_nir(s);
2541
2542 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
2543
2544 /* Garbage collect dead instructions */
2545 nir_sweep(s);
2546
2547 so->base.type = PIPE_SHADER_IR_NIR;
2548 so->base.ir.nir = s;
2549
2550 if (VC4_DBG(NIR)) {
2551 fprintf(stderr, "%s prog %d NIR:\n",
2552 gl_shader_stage_name(s->info.stage),
2553 so->program_id);
2554 nir_print_shader(s, stderr);
2555 fprintf(stderr, "\n");
2556 }
2557
2558 if (VC4_DBG(SHADERDB)) {
2559 vc4_shader_precompile(vc4, so);
2560 }
2561
2562 return so;
2563 }
2564
2565 static void
2566 copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
2567 struct vc4_compile *c)
2568 {
2569 int count = c->num_uniforms;
2570 struct vc4_shader_uniform_info *uinfo = &shader->uniforms;
2571
2572 uinfo->count = count;
2573 uinfo->data = ralloc_array(shader, uint32_t, count);
2574 memcpy(uinfo->data, c->uniform_data,
2575 count * sizeof(*uinfo->data));
2576 uinfo->contents = ralloc_array(shader, enum quniform_contents, count);
2577 memcpy(uinfo->contents, c->uniform_contents,
2578 count * sizeof(*uinfo->contents));
2579 uinfo->num_texture_samples = c->num_texture_samples;
2580
2581 vc4_set_shader_uniform_dirty_flags(shader);
2582 }
2583
2584 static void
2585 vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c,
2586 struct vc4_compiled_shader *shader)
2587 {
2588 struct vc4_fs_inputs inputs;
2589
2590 memset(&inputs, 0, sizeof(inputs));
2591 if (c->num_input_slots > 0) {
2592 inputs.input_slots = ralloc_array(shader,
2593 struct vc4_varying_slot,
2594 c->num_input_slots);
2595
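/* Compute liveness by scanning the generated QIR for QFILE_VARY
 * reads, so varyings that ended up dead after optimization don't
 * occupy FS input slots.
 */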
2596 bool input_live[c->num_input_slots];
2597
2598 memset(input_live, 0, sizeof(input_live));
2599 qir_for_each_inst_inorder(inst, c) {
2600 for (int i = 0; i < qir_get_nsrc(inst); i++) {
2601 if (inst->src[i].file == QFILE_VARY)
2602 input_live[inst->src[i].index] = true;
2603 }
2604 }
2605
2606 for (int i = 0; i < c->num_input_slots; i++) {
2607 struct vc4_varying_slot *slot = &c->input_slots[i];
2608
2609 if (!input_live[i])
2610 continue;
2611
2612 /* Skip non-VS-output inputs. */
2613 if (slot->slot == (uint8_t)~0)
2614 continue;
2615
2616 if (slot->slot == VARYING_SLOT_COL0 ||
2617 slot->slot == VARYING_SLOT_COL1 ||
2618 slot->slot == VARYING_SLOT_BFC0 ||
2619 slot->slot == VARYING_SLOT_BFC1) {
2620 shader->color_inputs |= (1 << inputs.num_inputs);
2621 }
2622
2623 inputs.input_slots[inputs.num_inputs] = *slot;
2624 inputs.num_inputs++;
2625 }
2626 }
2627 shader->num_inputs = inputs.num_inputs;
2628
2629 /* Add our set of inputs to the set of all inputs seen. This way, we
2630 * can have a single pointer that identifies an FS inputs set,
2631 * allowing VS to avoid recompiling when the FS is recompiled (or a
2632 * new one is bound using separate shader objects) but the inputs
2633 * don't change.
2634 */
2635 struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs);
2636 if (entry) {
2637 shader->fs_inputs = entry->key;
2638 ralloc_free(inputs.input_slots);
2639 } else {
2640 struct vc4_fs_inputs *alloc_inputs;
2641
2642 alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs);
2643 memcpy(alloc_inputs, &inputs, sizeof(inputs));
2644 ralloc_steal(alloc_inputs, inputs.input_slots);
2645 _mesa_set_add(vc4->fs_inputs_set, alloc_inputs);
2646
2647 shader->fs_inputs = alloc_inputs;
2648 }
2649 }
2650
2651 static struct vc4_compiled_shader *
2652 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
2653 struct vc4_key *key)
2654 {
2655 struct hash_table *ht;
2656 uint32_t key_size;
2657 bool try_threading;
2658
2659 if (stage == QSTAGE_FRAG) {
2660 ht = vc4->fs_cache;
2661 key_size = sizeof(struct vc4_fs_key);
2662 try_threading = vc4->screen->has_threaded_fs;
2663 } else {
2664 ht = vc4->vs_cache;
2665 key_size = sizeof(struct vc4_vs_key);
2666 try_threading = false;
2667 }
2668
2669 struct vc4_compiled_shader *shader;
2670 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
2671 if (entry)
2672 return entry->data;
2673
2674 struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading);
2675 /* If the FS failed to compile threaded, fall back to single threaded. */
2676 if (try_threading && c->failed) {
2677 qir_compile_destroy(c);
2678 c = vc4_shader_ntq(vc4, stage, key, false);
2679 }
2680
2681 shader = rzalloc(NULL, struct vc4_compiled_shader);
2682
2683 shader->program_id = vc4->next_compiled_program_id++;
2684 if (stage == QSTAGE_FRAG) {
2685 vc4_setup_compiled_fs_inputs(vc4, c, shader);
2686
2687 /* Note: the temporary clone in c->s has been freed. */
2688 nir_shader *orig_shader = key->shader_state->base.ir.nir;
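/* A shader that writes gl_FragDepth can't use the TLB's early-Z
 * test, since the depth value isn't known until the shader has run.
 */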
2689 if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
2690 shader->disable_early_z = true;
2691 } else {
2692 shader->num_inputs = c->num_inputs;
2693
2694 shader->vattr_offsets[0] = 0;
2695 for (int i = 0; i < 8; i++) {
2696 shader->vattr_offsets[i + 1] =
2697 shader->vattr_offsets[i] + c->vattr_sizes[i];
2698
2699 if (c->vattr_sizes[i])
2700 shader->vattrs_live |= (1 << i);
2701 }
2702 }
2703
2704 shader->failed = c->failed;
2705 if (c->failed) {
2706 shader->failed = true;
2707 } else {
2708 copy_uniform_state_to_shader(shader, c);
2709 shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts,
2710 c->qpu_inst_count *
2711 sizeof(uint64_t));
2712 }
2713
2714 shader->fs_threaded = c->fs_threaded;
2715
2716 qir_compile_destroy(c);
2717
2718 struct vc4_key *dup_key;
2719 dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */
2720 memcpy(dup_key, key, key_size);
2721 _mesa_hash_table_insert(ht, dup_key, shader);
2722
2723 return shader;
2724 }
2725
2726 static void
2727 vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
2728 struct vc4_texture_stateobj *texstate)
2729 {
2730 for (int i = 0; i < texstate->num_textures; i++) {
2731 struct pipe_sampler_view *sampler = texstate->textures[i];
2732 struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler);
2733 struct pipe_sampler_state *sampler_state =
2734 texstate->samplers[i];
2735
2736 if (!sampler)
2737 continue;
2738
2739 key->tex[i].format = sampler->format;
2740 key->tex[i].swizzle[0] = sampler->swizzle_r;
2741 key->tex[i].swizzle[1] = sampler->swizzle_g;
2742 key->tex[i].swizzle[2] = sampler->swizzle_b;
2743 key->tex[i].swizzle[3] = sampler->swizzle_a;
2744
2745 if (sampler->texture->nr_samples > 1) {
2746 key->tex[i].msaa_width = sampler->texture->width0;
2747 key->tex[i].msaa_height = sampler->texture->height0;
2748 } else if (sampler_state) {
2749 key->tex[i].compare_mode = sampler_state->compare_mode;
2750 key->tex[i].compare_func = sampler_state->compare_func;
2751 key->tex[i].wrap_s = sampler_state->wrap_s;
2752 key->tex[i].wrap_t = sampler_state->wrap_t;
2753 key->tex[i].force_first_level =
2754 vc4_sampler->force_first_level;
2755 }
2756 }
2757
2758 key->ucp_enables = vc4->rasterizer->base.clip_plane_enable;
2759 }
2760
2761 static void
2762 vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode)
2763 {
2764 struct vc4_job *job = vc4->job;
2765 struct vc4_fs_key local_key;
2766 struct vc4_fs_key *key = &local_key;
2767
2768 if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
2769 VC4_DIRTY_BLEND |
2770 VC4_DIRTY_FRAMEBUFFER |
2771 VC4_DIRTY_ZSA |
2772 VC4_DIRTY_RASTERIZER |
2773 VC4_DIRTY_SAMPLE_MASK |
2774 VC4_DIRTY_FRAGTEX |
2775 VC4_DIRTY_UNCOMPILED_FS |
2776 VC4_DIRTY_UBO_1_SIZE))) {
2777 return;
2778 }
2779
2780 memset(key, 0, sizeof(*key));
2781 vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex);
2782 key->base.shader_state = vc4->prog.bind_fs;
2783 key->is_points = (prim_mode == MESA_PRIM_POINTS);
2784 key->is_lines = (prim_mode >= MESA_PRIM_LINES &&
2785 prim_mode <= MESA_PRIM_LINE_STRIP);
2786 key->blend = vc4->blend->rt[0];
2787 if (vc4->blend->logicop_enable) {
2788 key->logicop_func = vc4->blend->logicop_func;
2789 } else {
2790 key->logicop_func = PIPE_LOGICOP_COPY;
2791 }
2792 if (job->msaa) {
2793 key->msaa = vc4->rasterizer->base.multisample;
2794 key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1);
2795 key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage;
2796 key->sample_alpha_to_one = vc4->blend->alpha_to_one;
2797 }
2798
2799 if (vc4->framebuffer.cbufs[0])
2800 key->color_format = vc4->framebuffer.cbufs[0]->format;
2801
2802 key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0;
2803 key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0;
2804 key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0;
2805 key->depth_enabled = (vc4->zsa->base.depth_enabled ||
2806 key->stencil_enabled);
2807
2808 if (key->is_points) {
2809 key->point_sprite_mask =
2810 vc4->rasterizer->base.sprite_coord_enable;
2811 key->point_coord_upper_left =
2812 (vc4->rasterizer->base.sprite_coord_mode ==
2813 PIPE_SPRITE_COORD_UPPER_LEFT);
2814 }
2815
2816 key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size;
2817
2818 struct vc4_compiled_shader *old_fs = vc4->prog.fs;
2819 vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base);
2820 if (vc4->prog.fs == old_fs)
2821 return;
2822
2823 vc4->dirty |= VC4_DIRTY_COMPILED_FS;
2824
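/* The flat-shade flag setup and the compiled VS (whose key includes
 * the FS input layout) both depend on the new FS, so flag them when
 * the relevant parts changed.
 */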
2825 if (vc4->rasterizer->base.flatshade &&
2826 (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) {
2827 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS;
2828 }
2829
2830 if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs)
2831 vc4->dirty |= VC4_DIRTY_FS_INPUTS;
2832 }
2833
2834 static void
2835 vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode)
2836 {
2837 struct vc4_vs_key local_key;
2838 struct vc4_vs_key *key = &local_key;
2839
2840 if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE |
2841 VC4_DIRTY_RASTERIZER |
2842 VC4_DIRTY_VERTTEX |
2843 VC4_DIRTY_VTXSTATE |
2844 VC4_DIRTY_UNCOMPILED_VS |
2845 VC4_DIRTY_FS_INPUTS))) {
2846 return;
2847 }
2848
2849 memset(key, 0, sizeof(*key));
2850 vc4_setup_shared_key(vc4, &key->base, &vc4->verttex);
2851 key->base.shader_state = vc4->prog.bind_vs;
2852 key->fs_inputs = vc4->prog.fs->fs_inputs;
2853
2854 for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++)
2855 key->attr_formats[i] = vc4->vtx->pipe[i].src_format;
2856
2857 key->per_vertex_point_size =
2858 (prim_mode == MESA_PRIM_POINTS &&
2859 vc4->rasterizer->base.point_size_per_vertex);
2860
2861 struct vc4_compiled_shader *vs =
2862 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base);
2863 if (vs != vc4->prog.vs) {
2864 vc4->prog.vs = vs;
2865 vc4->dirty |= VC4_DIRTY_COMPILED_VS;
2866 }
2867
2868 key->is_coord = true;
2869 /* Coord shaders don't care what the FS inputs are. */
2870 key->fs_inputs = NULL;
2871 struct vc4_compiled_shader *cs =
2872 vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base);
2873 if (cs != vc4->prog.cs) {
2874 vc4->prog.cs = cs;
2875 vc4->dirty |= VC4_DIRTY_COMPILED_CS;
2876 }
2877 }
2878
2879 bool
2880 vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode)
2881 {
2882 vc4_update_compiled_fs(vc4, prim_mode);
2883 vc4_update_compiled_vs(vc4, prim_mode);
2884
2885 return !(vc4->prog.cs->failed ||
2886 vc4->prog.vs->failed ||
2887 vc4->prog.fs->failed);
2888 }
2889
2890 static uint32_t
2891 fs_cache_hash(const void *key)
2892 {
2893 return _mesa_hash_data(key, sizeof(struct vc4_fs_key));
2894 }
2895
2896 static uint32_t
2897 vs_cache_hash(const void *key)
2898 {
2899 return _mesa_hash_data(key, sizeof(struct vc4_vs_key));
2900 }
2901
2902 static bool
2903 fs_cache_compare(const void *key1, const void *key2)
2904 {
2905 return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0;
2906 }
2907
2908 static bool
2909 vs_cache_compare(const void *key1, const void *key2)
2910 {
2911 return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0;
2912 }
2913
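/* The fs_inputs_set is keyed on the slot contents rather than the
 * pointer, so identical FS input layouts collapse to a single entry
 * and the VS key can compare fs_inputs by pointer.
 */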
2914 static uint32_t
2915 fs_inputs_hash(const void *key)
2916 {
2917 const struct vc4_fs_inputs *inputs = key;
2918
2919 return _mesa_hash_data(inputs->input_slots,
2920 sizeof(*inputs->input_slots) *
2921 inputs->num_inputs);
2922 }
2923
2924 static bool
2925 fs_inputs_compare(const void *key1, const void *key2)
2926 {
2927 const struct vc4_fs_inputs *inputs1 = key1;
2928 const struct vc4_fs_inputs *inputs2 = key2;
2929
2930 if (inputs1->num_inputs == inputs2->num_inputs) {
2931 if (inputs1->num_inputs == 0) {
2932 return true;
2933 } else {
2934 return memcmp(inputs1->input_slots,
2935 inputs2->input_slots,
2936 sizeof(*inputs1->input_slots) *
2937 inputs1->num_inputs) == 0;
2938 }
2939 }
2940
2941 return false;
2942 }
2943
2944 static void
2945 delete_from_cache_if_matches(struct hash_table *ht,
2946 struct vc4_compiled_shader **last_compile,
2947 struct hash_entry *entry,
2948 struct vc4_uncompiled_shader *so)
2949 {
2950 const struct vc4_key *key = entry->key;
2951
2952 if (key->shader_state == so) {
2953 struct vc4_compiled_shader *shader = entry->data;
2954 _mesa_hash_table_remove(ht, entry);
2955 vc4_bo_unreference(&shader->bo);
2956
2957 if (shader == *last_compile)
2958 *last_compile = NULL;
2959
2960 ralloc_free(shader);
2961 }
2962 }
2963
2964 static void
2965 vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso)
2966 {
2967 struct vc4_context *vc4 = vc4_context(pctx);
2968 struct vc4_uncompiled_shader *so = hwcso;
2969
2970 hash_table_foreach(vc4->fs_cache, entry) {
2971 delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs,
2972 entry, so);
2973 }
2974 hash_table_foreach(vc4->vs_cache, entry) {
2975 delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs,
2976 entry, so);
2977 }
2978
2979 ralloc_free(so->base.ir.nir);
2980 free(so);
2981 }
2982
2983 static void
2984 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso)
2985 {
2986 struct vc4_context *vc4 = vc4_context(pctx);
2987 vc4->prog.bind_fs = hwcso;
2988 vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS;
2989 }
2990
2991 static void
2992 vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso)
2993 {
2994 struct vc4_context *vc4 = vc4_context(pctx);
2995 vc4->prog.bind_vs = hwcso;
2996 vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS;
2997 }
2998
2999 void
3000 vc4_program_init(struct pipe_context *pctx)
3001 {
3002 struct vc4_context *vc4 = vc4_context(pctx);
3003
3004 pctx->create_vs_state = vc4_shader_state_create;
3005 pctx->delete_vs_state = vc4_shader_state_delete;
3006
3007 pctx->create_fs_state = vc4_shader_state_create;
3008 pctx->delete_fs_state = vc4_shader_state_delete;
3009
3010 pctx->bind_fs_state = vc4_fp_state_bind;
3011 pctx->bind_vs_state = vc4_vp_state_bind;
3012
3013 vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
3014 fs_cache_compare);
3015 vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
3016 vs_cache_compare);
3017 vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash,
3018 fs_inputs_compare);
3019 }
3020
3021 void
3022 vc4_program_fini(struct pipe_context *pctx)
3023 {
3024 struct vc4_context *vc4 = vc4_context(pctx);
3025
3026 hash_table_foreach(vc4->fs_cache, entry) {
3027 struct vc4_compiled_shader *shader = entry->data;
3028 vc4_bo_unreference(&shader->bo);
3029 ralloc_free(shader);
3030 _mesa_hash_table_remove(vc4->fs_cache, entry);
3031 }
3032
3033 hash_table_foreach(vc4->vs_cache, entry) {
3034 struct vc4_compiled_shader *shader = entry->data;
3035 vc4_bo_unreference(&shader->bo);
3036 ralloc_free(shader);
3037 _mesa_hash_table_remove(vc4->vs_cache, entry);
3038 }
3039 }
3040