1 /*
2 * Copyright © 2019 Google LLC
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "tu_shader.h"
7
8 #include "spirv/nir_spirv.h"
9 #include "util/mesa-sha1.h"
10 #include "nir/nir_xfb_info.h"
11 #include "vk_nir.h"
12 #include "vk_nir_convert_ycbcr.h"
13 #include "vk_pipeline.h"
14 #include "vk_util.h"
15
16 #include "ir3/ir3_compiler.h"
17 #include "ir3/ir3_nir.h"
18
19 #include "tu_device.h"
20 #include "tu_descriptor_set.h"
21 #include "tu_lrz.h"
22 #include "tu_pipeline.h"
23 #include "tu_rmv.h"
24
25 #include <initializer_list>
26
27 nir_shader *
28 tu_spirv_to_nir(struct tu_device *dev,
29 void *mem_ctx,
30 VkPipelineCreateFlags2KHR pipeline_flags,
31 const VkPipelineShaderStageCreateInfo *stage_info,
32 gl_shader_stage stage)
33 {
34 /* TODO these are made-up */
35 const struct spirv_to_nir_options spirv_options = {
36 /* ViewID is a sysval in geometry stages and an input in the FS */
37 .view_index_is_input = stage == MESA_SHADER_FRAGMENT,
38
39 /* Use 16-bit math for RelaxedPrecision ALU ops */
40 .mediump_16bit_alu = true,
41
42 .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
43 .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
44
45 /* Accessed via stg/ldg */
46 .phys_ssbo_addr_format = nir_address_format_64bit_global,
47
48 /* Accessed via the const register file */
49 .push_const_addr_format = nir_address_format_logical,
50
51 /* Accessed via ldl/stl */
52 .shared_addr_format = nir_address_format_32bit_offset,
53
54 /* Accessed via stg/ldg (not used with Vulkan?) */
55 .global_addr_format = nir_address_format_64bit_global,
56 };
57
58 const nir_shader_compiler_options *nir_options =
59 ir3_get_compiler_options(dev->compiler);
60
61 nir_shader *nir;
62 VkResult result =
63 vk_pipeline_shader_stage_to_nir(&dev->vk, pipeline_flags, stage_info,
64 &spirv_options, nir_options,
65 mem_ctx, &nir);
66 if (result != VK_SUCCESS)
67 return NULL;
68
69 /* ir3 uses num_ubos and num_ssbos to track the number of *bindful*
70 * UBOs/SSBOs, but spirv_to_nir sets them to the total number of objects
71 * which is useless for us, so reset them here.
72 */
73 nir->info.num_ubos = 0;
74 nir->info.num_ssbos = 0;
75
76 if (TU_DEBUG(NIR)) {
77 fprintf(stderr, "translated nir:\n");
78 nir_print_shader(nir, stderr);
79 }
80
81 const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
82 .point_coord = true,
83 };
84 NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
85
86 NIR_PASS_V(nir, nir_lower_global_vars_to_local);
87
88 /* Older glslang missing bf6efd0316d8 ("SPV: Fix #2293: keep relaxed
89 * precision on arg passed to relaxed param") will pass function args through
90 * a highp temporary, so we need the nir_opt_find_array_copies() and a copy
91 * prop before we lower mediump vars, or you'll be unable to optimize out
92 * array copies after lowering. We do this before splitting copies, since
93 * that works against nir_opt_find_array_copies().
94 */
95 NIR_PASS_V(nir, nir_opt_find_array_copies);
96 NIR_PASS_V(nir, nir_opt_copy_prop_vars);
97 NIR_PASS_V(nir, nir_opt_dce);
98
99 NIR_PASS_V(nir, nir_split_var_copies);
100 NIR_PASS_V(nir, nir_lower_var_copies);
101
102 NIR_PASS_V(nir, nir_lower_mediump_vars, nir_var_function_temp | nir_var_shader_temp | nir_var_mem_shared);
103 NIR_PASS_V(nir, nir_opt_copy_prop_vars);
104 NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
105
106 NIR_PASS_V(nir, nir_lower_system_values);
107 NIR_PASS_V(nir, nir_lower_is_helper_invocation);
108
109 ir3_optimize_loop(dev->compiler, nir);
110
111 NIR_PASS_V(nir, nir_opt_conditional_discard);
112
113 return nir;
114 }
115
116 static void
117 lower_load_push_constant(struct tu_device *dev,
118 nir_builder *b,
119 nir_intrinsic_instr *instr,
120 struct tu_shader *shader,
121 const struct tu_pipeline_layout *layout)
122 {
123 uint32_t base = nir_intrinsic_base(instr);
124 assert(base % 4 == 0);
125
126 if (tu6_shared_constants_enable(layout, dev->compiler)) {
127 /* All stages share the same range. We could potentially add
128 * push_constant_offset to layout and apply it, but this is good for
129 * now.
130 */
131 base += dev->compiler->shared_consts_base_offset * 4;
132 } else {
133 assert(base >= shader->const_state.push_consts.lo * 4);
134 base -= shader->const_state.push_consts.lo * 4;
135 }
136
137 nir_def *load =
138 nir_load_const_ir3(b, instr->num_components, instr->def.bit_size,
139 nir_ushr_imm(b, instr->src[0].ssa, 2), .base = base);
140
141 nir_def_replace(&instr->def, load);
142 }
143
144 static void
145 lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
146 nir_intrinsic_instr *instr,
147 struct tu_shader *shader,
148 const struct tu_pipeline_layout *layout)
149 {
150 struct ir3_compiler *compiler = dev->compiler;
151 nir_def *vulkan_idx = instr->src[0].ssa;
152
153 unsigned set = nir_intrinsic_desc_set(instr);
154 unsigned binding = nir_intrinsic_binding(instr);
155 struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
156 struct tu_descriptor_set_binding_layout *binding_layout =
157 &set_layout->binding[binding];
158 nir_def *base;
159
160 if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
161 return;
162
163 shader->active_desc_sets |= 1u << set;
164
165 switch (binding_layout->type) {
166 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
167 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
168 int offset = 0;
169 for (unsigned i = 0; i < set; i++) {
170 if (shader->dynamic_descriptor_sizes[i] >= 0) {
171 offset += shader->dynamic_descriptor_sizes[i];
172 } else {
173 offset = -1;
174 break;
175 }
176 }
177
178 if (offset < 0) {
179 /* With independent sets, we don't know
180 * layout->set[set].dynamic_offset_start until after link time which
181 * with fast linking means after the shader is compiled. We have to
182 * get it from the const file instead.
183 */
184 base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
185 nir_def *dynamic_offset_start;
186 if (compiler->load_shader_consts_via_preamble) {
187 dynamic_offset_start =
188 ir3_load_driver_ubo(b, 1, &shader->const_state.dynamic_offsets_ubo, set);
189 } else {
190 dynamic_offset_start = nir_load_const_ir3(
191 b, 1, 32, nir_imm_int(b, 0),
192 .base = shader->const_state.dynamic_offset_loc + set);
193 }
194 base = nir_iadd(b, base, dynamic_offset_start);
195 } else {
196 base = nir_imm_int(b, (offset +
197 binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
198 }
199 assert(dev->physical_device->reserved_set_idx >= 0);
200 set = dev->physical_device->reserved_set_idx;
201 break;
202 }
203 default:
204 base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
205 break;
206 }
207
208 unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
209 assert(util_is_power_of_two_nonzero(stride));
210 nir_def *shift = nir_imm_int(b, util_logbase2(stride));
211
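   /* The value built below is a vec3 "descriptor pointer" consumed by
    * vulkan_resource_reindex / load_vulkan_descriptor further down:
    *   .x = descriptor set index
    *   .y = descriptor index within the set (base + vulkan_idx << shift)
    *   .z = shift, i.e. log2 of the binding's descriptor stride
    */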
212 nir_def *def = nir_vec3(b, nir_imm_int(b, set),
213 nir_iadd(b, base,
214 nir_ishl(b, vulkan_idx, shift)),
215 shift);
216
217 nir_def_replace(&instr->def, def);
218 }
219
220 static void
221 lower_vulkan_resource_reindex(nir_builder *b, nir_intrinsic_instr *instr)
222 {
223 nir_def *old_index = instr->src[0].ssa;
224 nir_def *delta = instr->src[1].ssa;
225 nir_def *shift = nir_channel(b, old_index, 2);
226
227 nir_def *new_index =
228 nir_vec3(b, nir_channel(b, old_index, 0),
229 nir_iadd(b, nir_channel(b, old_index, 1),
230 nir_ishl(b, delta, shift)),
231 shift);
232
233 nir_def_replace(&instr->def, new_index);
234 }
235
236 static void
237 lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
238 {
239 nir_def *old_index = intrin->src[0].ssa;
240 /* Loading the descriptor happens as part of the load/store instruction so
241 * this is a no-op. We just need to turn the shift into an offset of 0.
242 */
243 nir_def *new_index =
244 nir_vec3(b, nir_channel(b, old_index, 0),
245 nir_channel(b, old_index, 1),
246 nir_imm_int(b, 0));
247 nir_def_replace(&intrin->def, new_index);
248 }
249
250 static bool
251 lower_ssbo_ubo_intrinsic(struct tu_device *dev,
252 nir_builder *b, nir_intrinsic_instr *intrin)
253 {
254 const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
255
256 /* The bindless base is part of the instruction, which means that part of
257 * the "pointer" has to be constant. We solve this in the same way the blob
258 * does, by generating a bunch of if-statements. In the usual case where
259 * the descriptor set is constant we can skip that, though.
260 */
261
262 unsigned buffer_src;
263 if (intrin->intrinsic == nir_intrinsic_store_ssbo) {
264 /* This has the value first */
265 buffer_src = 1;
266 } else {
267 buffer_src = 0;
268 }
269
270 /* Don't lower non-bindless UBO loads of driver params */
271 if (intrin->src[buffer_src].ssa->num_components == 1)
272 return false;
273
274 nir_scalar scalar_idx = nir_scalar_resolved(intrin->src[buffer_src].ssa, 0);
275 nir_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
276
277 if (intrin->intrinsic == nir_intrinsic_load_ubo &&
278 dev->instance->allow_oob_indirect_ubo_loads) {
279 nir_scalar offset = nir_scalar_resolved(intrin->src[1].ssa, 0);
280 if (!nir_scalar_is_const(offset)) {
281 nir_intrinsic_set_range(intrin, ~0);
282 }
283 }
284
285 /* Descriptor index has to be adjusted in the following cases:
286 * - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
287 * loads -- next-index descriptor will be able to do that;
288 * - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
289 * storage accesses of that size.
290 */
291 if ((dev->physical_device->info->a6xx.storage_16bit &&
292 !dev->physical_device->info->a6xx.has_isam_v &&
293 intrin->intrinsic == nir_intrinsic_load_ssbo &&
294 (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
295 intrin->def.bit_size > 16) ||
296 (dev->physical_device->info->a7xx.storage_8bit &&
297 ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
298 (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
299 descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
300 }
301
302 nir_def *results[MAX_SETS] = { NULL };
303
304 if (nir_scalar_is_const(scalar_idx)) {
305 nir_def *bindless =
306 nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_scalar_as_uint(scalar_idx));
307 nir_src_rewrite(&intrin->src[buffer_src], bindless);
308 return true;
309 }
310
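   /* Non-constant set index: emit one branch per possible descriptor set
    * and merge the results with phis, roughly:
    *
    *   if (base_idx == 0)      r0 = op(bindless<set 0>, ...);
    *   else if (base_idx == 1) r1 = op(bindless<set 1>, ...);
    *   ...
    *   result = phi(r0, r1, ...);
    */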
311 nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
312 for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) {
313 /* if (base_idx == i) { ... */
314 nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
315
316 nir_def *bindless =
317 nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = i);
318
319 nir_intrinsic_instr *copy =
320 nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
321
322 copy->num_components = intrin->num_components;
323
324 for (unsigned src = 0; src < info->num_srcs; src++) {
325 if (src == buffer_src)
326 copy->src[src] = nir_src_for_ssa(bindless);
327 else
328 copy->src[src] = nir_src_for_ssa(intrin->src[src].ssa);
329 }
330
331 for (unsigned idx = 0; idx < info->num_indices; idx++) {
332 copy->const_index[idx] = intrin->const_index[idx];
333 }
334
335 if (info->has_dest) {
336          nir_def_init(&copy->instr, &copy->def,
337 intrin->def.num_components,
338 intrin->def.bit_size);
339          results[i] = &copy->def;
340 }
341
342       nir_builder_instr_insert(b, &copy->instr);
343
344 /* } else { ... */
345 nir_push_else(b, nif);
346 }
347
348 nir_def *result =
349 nir_undef(b, intrin->def.num_components, intrin->def.bit_size);
350 for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) {
351 nir_pop_if(b, NULL);
352 if (info->has_dest)
353 result = nir_if_phi(b, results[i], result);
354 }
355
356 if (info->has_dest)
357 nir_def_rewrite_uses(&intrin->def, result);
358 nir_instr_remove(&intrin->instr);
359 return true;
360 }
361
362 static nir_def *
363 build_bindless(struct tu_device *dev, nir_builder *b,
364 nir_deref_instr *deref, bool is_sampler,
365 struct tu_shader *shader,
366 const struct tu_pipeline_layout *layout)
367 {
368 nir_variable *var = nir_deref_instr_get_variable(deref);
369
370 unsigned set = var->data.descriptor_set;
371 unsigned binding = var->data.binding;
372 const struct tu_descriptor_set_binding_layout *bind_layout =
373 &layout->set[set].layout->binding[binding];
374
375    /* input attachments use the non-bindless workaround */
376 if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT &&
377 !TU_DEBUG(DYNAMIC)) {
378 const struct glsl_type *glsl_type = glsl_without_array(var->type);
379 uint32_t idx = var->data.index * 2;
380
381 BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx, (idx + bind_layout->array_size * 2) - 1);
382
383 /* D24S8 workaround: stencil of D24S8 will be sampled as uint */
384 if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT)
385 idx += 1;
386
387 if (deref->deref_type == nir_deref_type_var)
388 return nir_imm_int(b, idx);
389
390 nir_def *arr_index = deref->arr.index.ssa;
391 return nir_iadd_imm(b, nir_imul_imm(b, arr_index, 2), idx);
392 }
393
394 shader->active_desc_sets |= 1u << set;
395
396 nir_def *desc_offset;
397 unsigned descriptor_stride;
398 unsigned offset = 0;
399 /* Samplers come second in combined image/sampler descriptors, see
400 * write_combined_image_sampler_descriptor().
401 */
402 if (is_sampler && bind_layout->type ==
403 VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
404 offset = 1;
405 }
406 desc_offset =
407 nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
408 offset);
409 descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
410
411 if (deref->deref_type != nir_deref_type_var) {
412 assert(deref->deref_type == nir_deref_type_array);
413
414 nir_def *arr_index = deref->arr.index.ssa;
415 desc_offset = nir_iadd(b, desc_offset,
416 nir_imul_imm(b, arr_index, descriptor_stride));
417 }
418
419 return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
420 }
421
422 static void
423 lower_image_deref(struct tu_device *dev, nir_builder *b,
424 nir_intrinsic_instr *instr, struct tu_shader *shader,
425 const struct tu_pipeline_layout *layout)
426 {
427 nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
428 nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout);
429 nir_rewrite_image_intrinsic(instr, bindless, true);
430 }
431
432 static bool
433 lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
434 struct tu_device *dev,
435 struct tu_shader *shader,
436 const struct tu_pipeline_layout *layout)
437 {
438 switch (instr->intrinsic) {
439 case nir_intrinsic_load_push_constant:
440 lower_load_push_constant(dev, b, instr, shader, layout);
441 return true;
442
443 case nir_intrinsic_load_vulkan_descriptor:
444 lower_load_vulkan_descriptor(b, instr);
445 return true;
446
447 case nir_intrinsic_vulkan_resource_index:
448 lower_vulkan_resource_index(dev, b, instr, shader, layout);
449 return true;
450 case nir_intrinsic_vulkan_resource_reindex:
451 lower_vulkan_resource_reindex(b, instr);
452 return true;
453
454 case nir_intrinsic_load_ubo:
455 case nir_intrinsic_load_ssbo:
456 case nir_intrinsic_store_ssbo:
457 case nir_intrinsic_ssbo_atomic:
458 case nir_intrinsic_ssbo_atomic_swap:
459 case nir_intrinsic_get_ssbo_size:
460 return lower_ssbo_ubo_intrinsic(dev, b, instr);
461
462 case nir_intrinsic_image_deref_load:
463 case nir_intrinsic_image_deref_store:
464 case nir_intrinsic_image_deref_atomic:
465 case nir_intrinsic_image_deref_atomic_swap:
466 case nir_intrinsic_image_deref_size:
467 case nir_intrinsic_image_deref_samples:
468 lower_image_deref(dev, b, instr, shader, layout);
469 return true;
470
471 case nir_intrinsic_load_frag_size_ir3:
472 case nir_intrinsic_load_frag_offset_ir3: {
473 if (!dev->compiler->load_shader_consts_via_preamble)
474 return false;
475
476 enum ir3_driver_param param =
477 instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
478 IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
479
480 unsigned offset = param - IR3_DP_FS_DYNAMIC;
481
482 nir_def *view = instr->src[0].ssa;
483 nir_def *result =
484 ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
485 offset, view, nir_intrinsic_range(instr));
486
487 nir_def_replace(&instr->def, result);
488 return true;
489 }
490 case nir_intrinsic_load_frag_invocation_count: {
491 if (!dev->compiler->load_shader_consts_via_preamble)
492 return false;
493
494 nir_def *result =
495 ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
496 IR3_DP_FS_FRAG_INVOCATION_COUNT -
497 IR3_DP_FS_DYNAMIC);
498
499 nir_def_replace(&instr->def, result);
500 return true;
501 }
502
503 default:
504 return false;
505 }
506 }
507
508 static void
509 lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
510 nir_builder *builder,
511 nir_tex_instr *tex)
512 {
513 int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
514 assert(deref_src_idx >= 0);
515 nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
516
517 nir_variable *var = nir_deref_instr_get_variable(deref);
518 const struct tu_descriptor_set_layout *set_layout =
519 layout->set[var->data.descriptor_set].layout;
520 const struct tu_descriptor_set_binding_layout *binding =
521 &set_layout->binding[var->data.binding];
522 const struct vk_ycbcr_conversion_state *ycbcr_samplers =
523 tu_immutable_ycbcr_samplers(set_layout, binding);
524
525 if (!ycbcr_samplers)
526 return;
527
528 /* For the following instructions, we don't apply any change */
529 if (tex->op == nir_texop_txs ||
530 tex->op == nir_texop_query_levels ||
531 tex->op == nir_texop_lod)
532 return;
533
534 assert(tex->texture_index == 0);
535 unsigned array_index = 0;
536 if (deref->deref_type != nir_deref_type_var) {
537 assert(deref->deref_type == nir_deref_type_array);
538 if (!nir_src_is_const(deref->arr.index))
539 return;
540 array_index = nir_src_as_uint(deref->arr.index);
541 array_index = MIN2(array_index, binding->array_size - 1);
542 }
543 const struct vk_ycbcr_conversion_state *ycbcr_sampler = ycbcr_samplers + array_index;
544
545 if (ycbcr_sampler->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
546 return;
547
548 /* Skip if not actually a YCbCr format. CtsGraphics, for example, tries to create
549 * YcbcrConversions for RGB formats.
550 */
551 if (!vk_format_get_ycbcr_info(ycbcr_sampler->format))
552 return;
553
554 builder->cursor = nir_after_instr(&tex->instr);
555
556 uint8_t bits = vk_format_get_component_bits(ycbcr_sampler->format,
557 UTIL_FORMAT_COLORSPACE_RGB,
558 PIPE_SWIZZLE_X);
559 uint32_t bpcs[3] = {bits, bits, bits}; /* TODO: use right bpc for each channel ? */
560 nir_def *result = nir_convert_ycbcr_to_rgb(builder,
561 ycbcr_sampler->ycbcr_model,
562 ycbcr_sampler->ycbcr_range,
563 &tex->def,
564 bpcs);
565 nir_def_rewrite_uses_after(&tex->def, result,
566 result->parent_instr);
567
568 builder->cursor = nir_before_instr(&tex->instr);
569 }
570
571 static bool
572 lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
573 struct tu_shader *shader, const struct tu_pipeline_layout *layout)
574 {
575 lower_tex_ycbcr(layout, b, tex);
576
577 int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
578 if (sampler_src_idx >= 0) {
579 nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
580 nir_def *bindless = build_bindless(dev, b, deref, true, shader, layout);
581 nir_src_rewrite(&tex->src[sampler_src_idx].src, bindless);
582 tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
583 }
584
585 int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
586 if (tex_src_idx >= 0) {
587 nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
588 nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout);
589 nir_src_rewrite(&tex->src[tex_src_idx].src, bindless);
590 tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
591
592 /* for the input attachment case: */
593 if (bindless->parent_instr->type != nir_instr_type_intrinsic)
594 tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
595 }
596
597 return true;
598 }
599
600 struct lower_instr_params {
601 struct tu_device *dev;
602 struct tu_shader *shader;
603 const struct tu_pipeline_layout *layout;
604 };
605
606 static bool
607 lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
608 {
609 struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
610 b->cursor = nir_before_instr(instr);
611 switch (instr->type) {
612 case nir_instr_type_tex:
613 return lower_tex(b, nir_instr_as_tex(instr), params->dev, params->shader, params->layout);
614 case nir_instr_type_intrinsic:
615 return lower_intrinsic(b, nir_instr_as_intrinsic(instr), params->dev, params->shader, params->layout);
616 default:
617 return false;
618 }
619 }
620
621 /* Since we always push inline uniforms into constant memory, lower loads of
622 * them to load_uniform which turns into constant memory loads.
623 */
624 static bool
625 lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
626 {
627 if (intrin->intrinsic != nir_intrinsic_load_ubo)
628 return false;
629
630 struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
631 struct tu_shader *shader = params->shader;
632 const struct tu_pipeline_layout *layout = params->layout;
633
634 nir_binding binding = nir_chase_binding(intrin->src[0]);
635
636 if (!binding.success)
637 return false;
638
639 struct tu_descriptor_set_layout *set_layout = layout->set[binding.desc_set].layout;
640 struct tu_descriptor_set_binding_layout *binding_layout =
641 &set_layout->binding[binding.binding];
642
643 if (binding_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
644 return false;
645
646 /* lookup the const offset of the inline UBO */
647 struct tu_const_state *const_state = &shader->const_state;
648
649 unsigned base = UINT_MAX;
650 unsigned range;
651 bool use_load = false;
652 bool use_ldg_k =
653 params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
654
655 for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
656 if (const_state->ubos[i].base == binding.desc_set &&
657 const_state->ubos[i].offset == binding_layout->offset) {
658 range = const_state->ubos[i].size_vec4 * 4;
659 if (use_ldg_k) {
660 base = i * 2;
661 } else {
662 use_load = const_state->ubos[i].push_address;
663 base = const_state->ubos[i].const_offset_vec4 * 4;
664 }
665 break;
666 }
667 }
668
669 if (base == UINT_MAX) {
670 /* Assume we're loading out-of-bounds from a 0-sized inline uniform
671 * filtered out below.
672 */
673 nir_def_rewrite_uses(&intrin->def,
674 nir_undef(b, intrin->num_components,
675 intrin->def.bit_size));
676 return true;
677 }
678
679 nir_def *offset = intrin->src[1].ssa;
680
681 b->cursor = nir_before_instr(&intrin->instr);
682 nir_def *val;
683
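   /* Three cases from here on: with ldg.k the driver-internal
    * inline_uniforms_ubo holds the block's 64-bit base address; with
    * push_address the address was pushed into the const file; otherwise the
    * block contents themselves live in the const file and a plain const
    * load suffices.
    */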
684 if (use_load || use_ldg_k) {
685 nir_def *base_addr;
686 if (use_ldg_k) {
687 base_addr = ir3_load_driver_ubo(b, 2,
688                                      &params->shader->const_state.inline_uniforms_ubo,
689 base);
690 } else {
691 base_addr =
692 nir_load_const_ir3(b, 2, 32, nir_imm_int(b, 0), .base = base);
693 }
694 val = nir_load_global_ir3(b, intrin->num_components,
695 intrin->def.bit_size,
696 base_addr, nir_ishr_imm(b, offset, 2),
697 .access =
698 (enum gl_access_qualifier)(
699 (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) |
700 ACCESS_CAN_SPECULATE),
701 .align_mul = 16,
702 .align_offset = 0,
703 .range_base = 0,
704 .range = range);
705 } else {
706 val =
707 nir_load_const_ir3(b, intrin->num_components, intrin->def.bit_size,
708 nir_ishr_imm(b, offset, 2), .base = base);
709 }
710
711 nir_def_replace(&intrin->def, val);
712 return true;
713 }
714
715 /* Figure out the range of push constants that we're actually going to push to
716 * the shader, and tell the backend to reserve this range when pushing UBO
717 * constants.
718 */
719
720 static void
721 gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
722 {
723 uint32_t min = UINT32_MAX, max = 0;
724 nir_foreach_function_impl(impl, shader) {
725 nir_foreach_block(block, impl) {
726 nir_foreach_instr_safe(instr, block) {
727 if (instr->type != nir_instr_type_intrinsic)
728 continue;
729
730 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
731 if (intrin->intrinsic != nir_intrinsic_load_push_constant)
732 continue;
733
734 uint32_t base = nir_intrinsic_base(intrin);
735 uint32_t range = nir_intrinsic_range(intrin);
736 min = MIN2(min, base);
737 max = MAX2(max, base + range);
738 break;
739 }
740 }
741 }
742
743 if (min >= max) {
744 tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
745 return;
746 }
747
748 /* CP_LOAD_STATE OFFSET and NUM_UNIT for SHARED_CONSTS are in units of
749 * dwords while loading regular consts is in units of vec4's.
750 * So we unify the unit here as dwords for tu_push_constant_range, then
751 * we should consider correct unit when emitting.
752 *
753 * Note there's an alignment requirement of 16 dwords on OFFSET. Expand
754 * the range and change units accordingly.
755 */
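   /* For example (illustrative numbers): loads covering bytes [8, 40) give
    * min = 8 and max = 40, so lo = (8 / 4) / 4 * 4 = 0 and
    * dwords = align(40, 16) / 4 - 0 = 12.
    */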
756 tu_shader->const_state.push_consts.lo = (min / 4) / 4 * 4;
757 tu_shader->const_state.push_consts.dwords =
758 align(max, 16) / 4 - tu_shader->const_state.push_consts.lo;
759 }
760
761 static bool
762 shader_uses_push_consts(nir_shader *shader)
763 {
764 nir_foreach_function_impl (impl, shader) {
765 nir_foreach_block (block, impl) {
766 nir_foreach_instr_safe (instr, block) {
767 if (instr->type != nir_instr_type_intrinsic)
768 continue;
769
770 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
771 if (intrin->intrinsic == nir_intrinsic_load_push_constant)
772 return true;
773 }
774 }
775 }
776 return false;
777 }
778
779 static bool
780 tu_lower_io(nir_shader *shader, struct tu_device *dev,
781 struct tu_shader *tu_shader,
782 const struct tu_pipeline_layout *layout,
783 unsigned *reserved_consts_vec4_out)
784 {
785 tu_shader->const_state.push_consts = (struct tu_push_constant_range) {
786 .lo = 0,
787 .dwords = layout->push_constant_size / 4,
788 .type = tu_push_consts_type(layout, dev->compiler),
789 };
790
791 if (tu_shader->const_state.push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
792 gather_push_constants(shader, tu_shader);
793 } else if (tu_shader->const_state.push_consts.type ==
794 IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
795 /* Disable pushing constants for this stage if none were loaded in the
796 * shader. If all stages don't load their declared push constants, as
797 * is often the case under zink, then we could additionally skip
798 * emitting REG_A7XX_HLSQ_SHARED_CONSTS_IMM entirely.
799 */
800 if (!shader_uses_push_consts(shader))
801 tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
802 }
803
804 struct tu_const_state *const_state = &tu_shader->const_state;
805 unsigned reserved_consts_vec4 =
806 align(DIV_ROUND_UP(const_state->push_consts.dwords, 4),
807 dev->compiler->const_upload_unit);
808
809 bool unknown_dynamic_size = false;
810 bool unknown_dynamic_offset = false;
811 for (unsigned i = 0; i < layout->num_sets; i++) {
812 if (tu_shader->dynamic_descriptor_sizes[i] == -1) {
813 unknown_dynamic_size = true;
814 } else if (unknown_dynamic_size &&
815 tu_shader->dynamic_descriptor_sizes[i] > 0) {
816 /* If there is an unknown size followed by a known size, then we may
817 * need to dynamically determine the offset when linking.
818 */
819 unknown_dynamic_offset = true;
820 }
821 }
822
823 if (unknown_dynamic_offset) {
824 const_state->dynamic_offset_loc = reserved_consts_vec4 * 4;
825 assert(dev->physical_device->reserved_set_idx >= 0);
826 reserved_consts_vec4 += DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4);
827 } else {
828 const_state->dynamic_offset_loc = UINT32_MAX;
829 }
830
831 /* Reserve space for inline uniforms, so we can always load them from
832 * constants and not setup a UBO descriptor for them.
833 */
834 bool use_ldg_k =
835 dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
836 for (unsigned set = 0; set < layout->num_sets; set++) {
837 const struct tu_descriptor_set_layout *desc_layout =
838 layout->set[set].layout;
839
840 if (!desc_layout || !desc_layout->has_inline_uniforms)
841 continue;
842
843 for (unsigned b = 0; b < desc_layout->binding_count; b++) {
844 const struct tu_descriptor_set_binding_layout *binding =
845 &desc_layout->binding[b];
846
847 if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
848 continue;
849 if (!(binding->shader_stages &
850 mesa_to_vk_shader_stage(shader->info.stage)))
851 continue;
852
853 /* If we don't know the size at compile time due to a variable
854 * descriptor count, then with descriptor buffers we cannot know
855 * how much space the real inline uniform has. In this case we fall
856 * back to pushing the address and using ldg, which is slower than
857 * setting up a descriptor but setting up our own descriptor with
858 * descriptor_buffer is also painful and has to be done on the GPU
859        * and doesn't avoid the UBO getting pushed anyway and faulting if an
860 * out-of-bounds access is hidden behind an if and not dynamically
861 * executed. Given the small max size, there shouldn't be much reason
862 * to use variable size anyway.
863 */
864 bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
865 b == desc_layout->binding_count - 1;
866
867 if (push_address) {
868 perf_debug(dev,
869 "falling back to ldg for variable-sized inline "
870 "uniform block");
871 }
872
873 assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
874 unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
875 const_state->ubos[const_state->num_inline_ubos++] = (struct tu_inline_ubo) {
876 .base = set,
877 .offset = binding->offset,
878 .push_address = push_address,
879 .const_offset_vec4 = reserved_consts_vec4,
880 .size_vec4 = size_vec4,
881 };
882
883 if (!use_ldg_k)
884 reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
885 }
886 }
887
888 *reserved_consts_vec4_out = reserved_consts_vec4;
889
890 struct lower_instr_params params = {
891 .dev = dev,
892 .shader = tu_shader,
893 .layout = layout,
894 };
895
896 bool progress = false;
897 if (const_state->num_inline_ubos) {
898 progress |= nir_shader_intrinsics_pass(shader, lower_inline_ubo,
899 nir_metadata_none,
900                                             &params);
901 }
902
903 progress |= nir_shader_instructions_pass(shader,
904 lower_instr,
905 nir_metadata_none,
906                                             &params);
907
908 /* Remove now-unused variables so that when we gather the shader info later
909 * they won't be counted.
910 */
911
912 if (progress)
913 nir_opt_dce(shader);
914
915 progress |=
916 nir_remove_dead_variables(shader,
917 nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo,
918 NULL);
919
920 return progress;
921 }
922
923 struct lower_fdm_options {
924 unsigned num_views;
925 bool adjust_fragcoord;
926 bool multiview;
927 };
928
929 static bool
930 lower_fdm_filter(const nir_instr *instr, const void *data)
931 {
932 const struct lower_fdm_options *options =
933 (const struct lower_fdm_options *)data;
934
935 if (instr->type != nir_instr_type_intrinsic)
936 return false;
937
938 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
939 return intrin->intrinsic == nir_intrinsic_load_frag_size ||
940 (intrin->intrinsic == nir_intrinsic_load_frag_coord &&
941 options->adjust_fragcoord);
942 }
943
944 static nir_def *
945 lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
946 {
947 const struct lower_fdm_options *options =
948 (const struct lower_fdm_options *)data;
949
950 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
951
952 nir_def *view;
953 if (options->multiview) {
954 nir_variable *view_var =
955 nir_find_variable_with_location(b->shader, nir_var_shader_in,
956 VARYING_SLOT_VIEW_INDEX);
957
958 if (view_var == NULL) {
959 view_var = nir_variable_create(b->shader, nir_var_shader_in,
960 glsl_int_type(), NULL);
961 view_var->data.location = VARYING_SLOT_VIEW_INDEX;
962 view_var->data.interpolation = INTERP_MODE_FLAT;
963 view_var->data.driver_location = b->shader->num_inputs++;
964 }
965
966 view = nir_load_var(b, view_var);
967 } else {
968 view = nir_imm_int(b, 0);
969 }
970
971 nir_def *frag_size =
972 nir_load_frag_size_ir3(b, view, .range = options->num_views);
973
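   /* For load_frag_coord, xy is reconstructed from the unscaled coordinate
    * as (unscaled.xy - frag_offset) * frag_size; z/w are passed through
    * unchanged in the vec4 built below.
    */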
974 if (intrin->intrinsic == nir_intrinsic_load_frag_coord) {
975 nir_def *frag_offset =
976 nir_load_frag_offset_ir3(b, view, .range = options->num_views);
977 nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
978 nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
979 xy = nir_fmul(b, nir_fsub(b, xy, frag_offset), nir_i2f32(b, frag_size));
980 return nir_vec4(b,
981 nir_channel(b, xy, 0),
982 nir_channel(b, xy, 1),
983 nir_channel(b, unscaled_coord, 2),
984 nir_channel(b, unscaled_coord, 3));
985 }
986
987 assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
988 return frag_size;
989 }
990
991 static bool
992 tu_nir_lower_fdm(nir_shader *shader, const struct lower_fdm_options *options)
993 {
994 return nir_shader_lower_instructions(shader, lower_fdm_filter,
995 lower_fdm_instr, (void *)options);
996 }
997
998 static void
999 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
1000 {
1001 assert(glsl_type_is_vector_or_scalar(type));
1002
1003 unsigned comp_size =
1004 glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
1005 unsigned length = glsl_get_vector_elements(type);
1006 *size = comp_size * length;
1007 *align = comp_size;
1008 }
1009
1010 static void
1011 tu_gather_xfb_info(nir_shader *nir, struct ir3_stream_output_info *info)
1012 {
1013 nir_shader_gather_xfb_info(nir);
1014
1015 if (!nir->xfb_info)
1016 return;
1017
1018 nir_xfb_info *xfb = nir->xfb_info;
1019
1020 uint8_t output_map[VARYING_SLOT_TESS_MAX];
1021 memset(output_map, 0, sizeof(output_map));
1022
1023 nir_foreach_shader_out_variable(var, nir) {
1024 unsigned slots = nir_variable_count_slots(var, var->type);
1025 for (unsigned i = 0; i < slots; i++)
1026 output_map[var->data.location + i] = var->data.driver_location + i;
1027 }
1028
1029 assert(xfb->output_count <= IR3_MAX_SO_OUTPUTS);
1030 info->num_outputs = xfb->output_count;
1031
1032 for (int i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1033 info->stride[i] = xfb->buffers[i].stride / 4;
1034 info->buffer_to_stream[i] = xfb->buffer_to_stream[i];
1035 }
1036
1037 info->streams_written = xfb->streams_written;
1038
1039 for (int i = 0; i < xfb->output_count; i++) {
1040 info->output[i].register_index = output_map[xfb->outputs[i].location];
1041 info->output[i].start_component = xfb->outputs[i].component_offset;
1042 info->output[i].num_components =
1043 util_bitcount(xfb->outputs[i].component_mask);
1044 info->output[i].output_buffer = xfb->outputs[i].buffer;
1045 info->output[i].dst_offset = xfb->outputs[i].offset / 4;
1046 info->output[i].stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
1047 }
1048 }
1049
1050 static uint32_t
1051 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
1052 {
1053 const struct ir3_const_state *const_state = ir3_const_state(xs);
1054 uint32_t base = const_state->offsets.immediate;
1055 int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
1056
1057 /* truncate size to avoid writing constants that shader
1058 * does not use:
1059 */
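   /* e.g. (illustrative numbers) base = 10 vec4, 20 immediate dwords
    * (5 vec4), constlen = 12 vec4: size = MIN2(5 + 10, 12) - 10 = 2 vec4,
    * so 8 dwords get emitted.
    */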
1060 size = MIN2(size + base, xs->constlen) - base;
1061
1062 return MAX2(size, 0) * 4;
1063 }
1064
1065 /* We allocate fixed-length substreams for shader state, however some
1066  * parts of the state may have unbounded length. Their additional space
1067 * requirements should be calculated here.
1068 */
1069 static uint32_t
1070 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
1071 {
1072 const struct ir3_const_state *const_state = ir3_const_state(xs);
1073
1074 uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
1075
1076 /* Variable number of UBO upload ranges. */
1077 size += 4 * const_state->ubo_state.num_enabled;
1078
1079 /* Variable number of dwords for the primitive map */
1080 size += xs->input_size;
1081
1082 size += xs->constant_data_size / 4;
1083
1084 return size;
1085 }
1086
1087 static const struct xs_config {
1088 uint16_t reg_sp_xs_config;
1089 uint16_t reg_sp_xs_instrlen;
1090 uint16_t reg_sp_xs_first_exec_offset;
1091 uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
1092 uint16_t reg_sp_xs_vgpr_config;
1093 } xs_config[] = {
1094 [MESA_SHADER_VERTEX] = {
1095 REG_A6XX_SP_VS_CONFIG,
1096 REG_A6XX_SP_VS_INSTRLEN,
1097 REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
1098 REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
1099 REG_A7XX_SP_VS_VGPR_CONFIG,
1100 },
1101 [MESA_SHADER_TESS_CTRL] = {
1102 REG_A6XX_SP_HS_CONFIG,
1103 REG_A6XX_SP_HS_INSTRLEN,
1104 REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
1105 REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
1106 REG_A7XX_SP_HS_VGPR_CONFIG,
1107 },
1108 [MESA_SHADER_TESS_EVAL] = {
1109 REG_A6XX_SP_DS_CONFIG,
1110 REG_A6XX_SP_DS_INSTRLEN,
1111 REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
1112 REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
1113 REG_A7XX_SP_DS_VGPR_CONFIG,
1114 },
1115 [MESA_SHADER_GEOMETRY] = {
1116 REG_A6XX_SP_GS_CONFIG,
1117 REG_A6XX_SP_GS_INSTRLEN,
1118 REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
1119 REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
1120 REG_A7XX_SP_GS_VGPR_CONFIG,
1121 },
1122 [MESA_SHADER_FRAGMENT] = {
1123 REG_A6XX_SP_FS_CONFIG,
1124 REG_A6XX_SP_FS_INSTRLEN,
1125 REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
1126 REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
1127 REG_A7XX_SP_FS_VGPR_CONFIG,
1128 },
1129 [MESA_SHADER_COMPUTE] = {
1130 REG_A6XX_SP_CS_CONFIG,
1131 REG_A6XX_SP_CS_INSTRLEN,
1132 REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
1133 REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
1134 REG_A7XX_SP_CS_VGPR_CONFIG,
1135 },
1136 };
1137
1138 void
1139 tu6_emit_xs(struct tu_cs *cs,
1140 gl_shader_stage stage, /* xs->type, but xs may be NULL */
1141 const struct ir3_shader_variant *xs,
1142 const struct tu_pvtmem_config *pvtmem,
1143 uint64_t binary_iova)
1144 {
1145 const struct xs_config *cfg = &xs_config[stage];
1146
1147 if (!xs) {
1148 /* shader stage disabled */
1149 return;
1150 }
1151
1152 enum a6xx_threadsize thrsz =
1153 xs->info.double_threadsize ? THREAD128 : THREAD64;
1154 switch (stage) {
1155 case MESA_SHADER_VERTEX:
1156 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
1157 .halfregfootprint = xs->info.max_half_reg + 1,
1158 .fullregfootprint = xs->info.max_reg + 1,
1159 .branchstack = ir3_shader_branchstack_hw(xs),
1160 .mergedregs = xs->mergedregs,
1161 .earlypreamble = xs->early_preamble,
1162 ));
1163 break;
1164 case MESA_SHADER_TESS_CTRL:
1165 tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
1166 .halfregfootprint = xs->info.max_half_reg + 1,
1167 .fullregfootprint = xs->info.max_reg + 1,
1168 .branchstack = ir3_shader_branchstack_hw(xs),
1169 .earlypreamble = xs->early_preamble,
1170 ));
1171 break;
1172 case MESA_SHADER_TESS_EVAL:
1173 tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
1174 .halfregfootprint = xs->info.max_half_reg + 1,
1175 .fullregfootprint = xs->info.max_reg + 1,
1176 .branchstack = ir3_shader_branchstack_hw(xs),
1177 .earlypreamble = xs->early_preamble,
1178 ));
1179 break;
1180 case MESA_SHADER_GEOMETRY:
1181 tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
1182 .halfregfootprint = xs->info.max_half_reg + 1,
1183 .fullregfootprint = xs->info.max_reg + 1,
1184 .branchstack = ir3_shader_branchstack_hw(xs),
1185 .earlypreamble = xs->early_preamble,
1186 ));
1187 break;
1188 case MESA_SHADER_FRAGMENT:
1189 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
1190 .halfregfootprint = xs->info.max_half_reg + 1,
1191 .fullregfootprint = xs->info.max_reg + 1,
1192 .branchstack = ir3_shader_branchstack_hw(xs),
1193 .threadsize = thrsz,
1194 .varying = xs->total_in != 0,
1195 .lodpixmask = xs->need_full_quad,
1196 /* unknown bit, seems unnecessary */
1197 .unk24 = true,
1198 .pixlodenable = xs->need_pixlod,
1199 .earlypreamble = xs->early_preamble,
1200 .mergedregs = xs->mergedregs,
1201 ));
1202 break;
1203 case MESA_SHADER_COMPUTE:
1204 thrsz = cs->device->physical_device->info->a6xx
1205 .supports_double_threadsize ? thrsz : THREAD128;
1206 tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
1207 .halfregfootprint = xs->info.max_half_reg + 1,
1208 .fullregfootprint = xs->info.max_reg + 1,
1209 .branchstack = ir3_shader_branchstack_hw(xs),
1210 .threadsize = thrsz,
1211 .earlypreamble = xs->early_preamble,
1212 .mergedregs = xs->mergedregs,
1213 ));
1214 break;
1215 default:
1216 unreachable("bad shader stage");
1217 }
1218
1219 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
1220 tu_cs_emit(cs, xs->instrlen);
1221
1222 /* emit program binary & private memory layout
1223 * binary_iova should be aligned to 1 instrlen unit (128 bytes)
1224 */
1225
1226 assert((binary_iova & 0x7f) == 0);
1227 assert((pvtmem->iova & 0x1f) == 0);
1228
1229 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
1230 tu_cs_emit(cs, 0);
1231 tu_cs_emit_qw(cs, binary_iova);
1232 tu_cs_emit(cs,
1233 A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
1234 tu_cs_emit_qw(cs, pvtmem->iova);
1235 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
1236 COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
1237
1238 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
1239 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
1240
1241 if (cs->device->physical_device->info->chip >= A7XX) {
1242 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vgpr_config, 1);
1243 tu_cs_emit(cs, 0);
1244 }
1245
1246 if (cs->device->physical_device->info->chip == A6XX) {
1247 uint32_t shader_preload_size =
1248 MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
1249
1250 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1251 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1252 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
1253 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1254 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1255 CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
1256 tu_cs_emit_qw(cs, binary_iova);
1257 }
1258
1259 /* emit immediates */
1260
1261 const struct ir3_const_state *const_state = ir3_const_state(xs);
1262 uint32_t base = const_state->offsets.immediate;
1263 unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
1264
1265 if (immediate_size > 0) {
1266 assert(!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
1267 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
1268 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1269 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1270 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1271 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1272 CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
1273 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1274 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1275
1276 tu_cs_emit_array(cs, const_state->immediates, immediate_size);
1277 }
1278
1279 if (const_state->consts_ubo.idx != -1) {
1280 uint64_t iova = binary_iova + xs->info.constant_data_offset;
1281 uint32_t offset = const_state->consts_ubo.idx;
1282
1283 /* Upload UBO state for the constant data. */
1284 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1285 tu_cs_emit(cs,
1286 CP_LOAD_STATE6_0_DST_OFF(offset) |
1287 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
1288 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1289 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1290 CP_LOAD_STATE6_0_NUM_UNIT(1));
1291 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1292 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1293 int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
1294 tu_cs_emit_qw(cs,
1295 iova |
1296 (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
1297
1298 /* Upload the constant data to the const file if needed. */
1299 const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
1300
1301 if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1302 for (int i = 0; i < ubo_state->num_enabled; i++) {
1303 if (ubo_state->range[i].ubo.block != offset ||
1304 ubo_state->range[i].ubo.bindless) {
1305 continue;
1306 }
1307
1308 uint32_t start = ubo_state->range[i].start;
1309 uint32_t end = ubo_state->range[i].end;
1310 uint32_t size = MIN2(end - start,
1311 (16 * xs->constlen) - ubo_state->range[i].offset);
1312
1313 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1314 tu_cs_emit(cs,
1315 CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
1316 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1317 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1318 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1319 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
1320 tu_cs_emit_qw(cs, iova + start);
1321 }
1322 }
1323 }
1324
1325 /* emit statically-known FS driver param */
1326 if (stage == MESA_SHADER_FRAGMENT && const_state->driver_params_ubo.size > 0) {
1327 uint32_t data[4] = {xs->info.double_threadsize ? 128 : 64, 0, 0, 0};
1328 uint32_t size = ARRAY_SIZE(data);
1329
1330 /* A7XX TODO: Emit data via sub_cs instead of NOP */
1331 uint64_t iova = tu_cs_emit_data_nop(cs, data, size, 4);
1332 uint32_t base = const_state->driver_params_ubo.idx;
1333
1334 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1335 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1336 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
1337 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1338 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1339 CP_LOAD_STATE6_0_NUM_UNIT(1));
1340 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1341 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1342 int size_vec4s = DIV_ROUND_UP(size, 4);
1343 tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
1344 } else if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
1345 uint32_t base = const_state->offsets.driver_param;
1346 int32_t size = DIV_ROUND_UP(MAX2(const_state->num_driver_params, 4), 4);
1347 size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
1348
1349 if (size > 0) {
1350 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + 4);
1351 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1352 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1353 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1354 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1355 CP_LOAD_STATE6_0_NUM_UNIT(size));
1356 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1357 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1358
1359 tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
1360 tu_cs_emit(cs, 0);
1361 tu_cs_emit(cs, 0);
1362 tu_cs_emit(cs, 0);
1363 }
1364 }
1365 }
1366
1367 template <chip CHIP>
1368 static void
1369 tu6_emit_cs_config(struct tu_cs *cs,
1370 const struct ir3_shader_variant *v,
1371 const struct tu_pvtmem_config *pvtmem,
1372 uint64_t binary_iova)
1373 {
1374 bool shared_consts_enable =
1375 ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1376 tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1377
1378 tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1379 .cs_state = true,
1380 .cs_ibo = true,
1381 .cs_shared_const = shared_consts_enable));
1382
1383 tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_COMPUTE, v);
1384 tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
1385
1386 uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
1387 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
1388 tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
1389 A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
1390
1391 if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_lpac) {
1392 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
1393 tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
1394 A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
1395 }
1396
1397 uint32_t local_invocation_id =
1398 ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
1399 uint32_t work_group_id =
1400 ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
1401
1402 /*
1403 * Devices that do not support double threadsize take the threadsize from
1404 * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
1405 * which is always set to THREAD128.
1406 */
1407 enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
1408 enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
1409 .supports_double_threadsize ? thrsz : THREAD128;
1410 if (CHIP == A6XX) {
1411 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
1412 tu_cs_emit(cs,
1413 A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1414 A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1415 A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1416 A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1417 tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1418 A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
1419 if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
1420 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1421 tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
1422 }
1423
1424 if (cs->device->physical_device->info->a6xx.has_lpac) {
1425 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
1426 tu_cs_emit(cs,
1427 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1428 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1429 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1430 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1431 tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1432 A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
1433 }
1434 } else {
1435 unsigned tile_height = (v->local_size[1] % 8 == 0) ? 3
1436 : (v->local_size[1] % 4 == 0) ? 5
1437 : (v->local_size[1] % 2 == 0) ? 9
1438 : 17;
1439 tu_cs_emit_regs(
1440 cs, HLSQ_CS_CNTL_1(CHIP,
1441 .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs,
1442 .workgrouprastorderzfirsten = true,
1443 .wgtilewidth = 4, .wgtileheight = tile_height));
1444
1445 tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
1446
1447 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 1);
1448 tu_cs_emit(cs, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1449 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1450 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1451 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1452
1453 tu_cs_emit_regs(cs,
1454 SP_CS_CNTL_1(CHIP,
1455 .linearlocalidregid = regid(63, 0),
1456 .threadsize = thrsz_cs,
1457 .workitemrastorder =
1458 v->cs.force_linear_dispatch ?
1459 WORKITEMRASTORDER_LINEAR :
1460 WORKITEMRASTORDER_TILED, ));
1461
1462 tu_cs_emit_regs(
1463 cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1,
1464 .localsizey = v->local_size[1] - 1,
1465 .localsizez = v->local_size[2] - 1, ));
1466
1467 tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000
1468 }
1469 }
1470
1471 #define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
1472
1473 static void
1474 tu6_emit_vfd_dest(struct tu_cs *cs,
1475 const struct ir3_shader_variant *vs)
1476 {
1477 int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
1478 uint32_t attr_count = 0;
1479
1480 for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++)
1481 input_for_attr[i] = -1;
1482
1483 for (unsigned i = 0; i < vs->inputs_count; i++) {
1484 if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0))
1485 continue;
1486
1487 assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0);
1488 unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0;
1489 input_for_attr[loc] = i;
1490 attr_count = MAX2(attr_count, loc + 1);
1491 }
1492
1493 tu_cs_emit_regs(cs,
1494 A6XX_VFD_CONTROL_0(
1495 .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
1496 .decode_cnt = attr_count));
1497
1498 if (attr_count)
1499 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
1500
1501 for (unsigned i = 0; i < attr_count; i++) {
1502 if (input_for_attr[i] >= 0) {
1503 unsigned input_idx = input_for_attr[i];
1504 tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1505 .writemask = vs->inputs[input_idx].compmask,
1506 .regid = vs->inputs[input_idx].regid).value);
1507 } else {
1508 tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1509 .writemask = 0,
1510 .regid = regid(63, 0)).value);
1511 }
1512 }
1513 }
1514
1515 static enum a6xx_tex_prefetch_cmd
1516 tu6_tex_opc_to_prefetch_cmd(opc_t tex_opc)
1517 {
1518 switch (tex_opc) {
1519 case OPC_SAM:
1520 return TEX_PREFETCH_SAM;
1521 default:
1522 unreachable("Unknown tex opc for prefeth cmd");
1523 }
1524 }
1525
1526 template <chip CHIP>
1527 static void
1528 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1529 {
1530 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1531 uint32_t ij_regid[IJ_COUNT];
1532 uint32_t smask_in_regid;
1533
1534 bool sample_shading = fs->per_samp | fs->key.sample_shading;
1535 bool enable_varyings = fs->total_in > 0;
1536
1537 samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1538 smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1539 face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1540 coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1541 zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1542 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1543 ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1544
1545 if (fs->num_sampler_prefetch > 0) {
1546 /* It seems like ij_pix is *required* to be r0.x */
1547 assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
1548 ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1549 }
1550
1551 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1552 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1553 COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
1554 COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
1555 COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
1556 A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
1557 COND(fs->prefetch_end_of_quad,
1558 A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
1559 for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1560 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1561 tu_cs_emit(
1562 cs, SP_FS_PREFETCH_CMD(
1563 CHIP, i, .src = prefetch->src, .samp_id = prefetch->samp_id,
1564 .tex_id = prefetch->tex_id, .dst = prefetch->dst,
1565 .wrmask = prefetch->wrmask, .half = prefetch->half_precision,
1566 .bindless = prefetch->bindless,
1567 .cmd = tu6_tex_opc_to_prefetch_cmd(prefetch->tex_opc), ).value);
1568 }
1569
1570 if (fs->num_sampler_prefetch > 0) {
1571 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1572 for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1573 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1574 tu_cs_emit(cs,
1575 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1576 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1577 }
1578 }
1579
1580 tu_cs_emit_regs(cs,
1581 HLSQ_CONTROL_1_REG(CHIP,
1582 .primallocthreshold =
1583 cs->device->physical_device->info->a6xx.prim_alloc_threshold),
1584 HLSQ_CONTROL_2_REG(CHIP, .faceregid = face_regid,
1585 .sampleid = samp_id_regid,
1586 .samplemask = smask_in_regid,
1587 .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW]),
1588 HLSQ_CONTROL_3_REG(CHIP, .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
1589 .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
1590 .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
1591 .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID]),
1592 HLSQ_CONTROL_4_REG(CHIP, .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
1593 .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
1594 .xycoordregid = coord_regid,
1595 .zwcoordregid = zwcoord_regid),
1596 HLSQ_CONTROL_5_REG(CHIP, .dword = 0xfcfc), );
1597
1598 if (CHIP >= A7XX) {
1599 uint32_t sysval_regs = 0;
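/* A7XX seems to want the total number of registers consumed by FS sysvals:
 * each valid barycentric pair takes two regs (center_rhw only one),
 * face/sample-id/sample-mask take one each, and the xy/zw frag coord
 * pairs take two each.
 */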
1600 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
1601 if (VALIDREG(ij_regid[i])) {
1602 if (i == IJ_PERSP_CENTER_RHW)
1603 sysval_regs += 1;
1604 else
1605 sysval_regs += 2;
1606 }
1607 }
1608
1609 for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid }) {
1610 if (VALIDREG(sysval))
1611 sysval_regs += 1;
1612 }
1613
1614 for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
1615 if (VALIDREG(sysval))
1616 sysval_regs += 2;
1617 }
1618
1619 tu_cs_emit_regs(cs, A7XX_HLSQ_UNKNOWN_A9AE(.sysval_regs_count = sysval_regs,
1620 .unk8 = 1,
1621 .unk9 = 1));
1622 }
1623
1624 enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1625 tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = thrsz, .varyings = enable_varyings));
1626
1627 bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1628 bool need_size_persamp = false;
1629 if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
1630 if (sample_shading)
1631 need_size_persamp = true;
1632 else
1633 need_size = true;
1634 }
1635
1636 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1637 tu_cs_emit(cs,
1638 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1639 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1640 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1641 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1642 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1643 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1644 COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1645 COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1646 COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1647
1648 tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1649 tu_cs_emit(cs,
1650 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1651 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1652 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1653 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1654 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1655 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1656 COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1657 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1658 COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1659 COND(fs->fragcoord_compmask != 0,
1660 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1661 tu_cs_emit(cs,
1662 A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1663 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1664 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1665 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1666 CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
1667 COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE) |
1668 COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
1669
1670 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1671 tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1672
1673 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1674 tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1675 A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1676 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1677
1678 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1679 tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1680
1681 uint32_t varmask[4] = { 0 };
1682
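/* VPC_VAR_DISABLE is a per-component disable mask, so collect the varying
 * components actually consumed by the FS and emit the complement.
 */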
1683 for (int i = ir3_next_varying(fs, -1); i < fs->inputs_count;
1684 i = ir3_next_varying(fs, i)) {
1685 if (fs->inputs[i].inloc >= fs->total_in)
1686 continue;
1687
1688 unsigned loc = fs->inputs[i].inloc;
1689 for (int j = 0; j < util_last_bit(fs->inputs[i].compmask); j++) {
1690 uint8_t comploc = loc + j;
1691 varmask[comploc / 32] |= 1 << (comploc % 32);
1692 }
1693 }
1694
1695 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
1696 tu_cs_emit(cs, ~varmask[0]);
1697 tu_cs_emit(cs, ~varmask[1]);
1698 tu_cs_emit(cs, ~varmask[2]);
1699 tu_cs_emit(cs, ~varmask[3]);
1700
1701 unsigned primid_loc = ir3_find_input_loc(fs, VARYING_SLOT_PRIMITIVE_ID);
1702 unsigned viewid_loc = ir3_find_input_loc(fs, VARYING_SLOT_VIEW_INDEX);
1703
1704 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1705 tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
1706 COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1707 A6XX_VPC_CNTL_0_PRIMIDLOC(primid_loc) |
1708 A6XX_VPC_CNTL_0_VIEWIDLOC(viewid_loc));
1709 }
1710
1711 static void
1712 tu6_emit_fs_outputs(struct tu_cs *cs,
1713 const struct ir3_shader_variant *fs)
1714 {
1715 uint32_t smask_regid, posz_regid, stencilref_regid;
1716
1717 posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1718 smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1719 stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1720
1721 int output_reg_count = 0;
1722 uint32_t fragdata_regid[8];
1723
1724 assert(!fs->color0_mrt);
1725 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1726 fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1727 if (VALIDREG(fragdata_regid[i]))
1728 output_reg_count = i + 1;
1729 }
1730
1731 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
1732 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1733 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1734 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1735 COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1736
1737 /* There is no point in enabling a component that is not written by the
1738 * shader. Per the VK spec this is UB, however a few apps depend on the
1739 * attachment not being changed if the FS doesn't have a corresponding output.
1740 */
1741 uint32_t fs_render_components = 0;
1742
1743 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
1744 for (uint32_t i = 0; i < output_reg_count; i++) {
1745 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1746 (COND(fragdata_regid[i] & HALF_REG_ID,
1747 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1748
1749 if (VALIDREG(fragdata_regid[i])) {
1750 fs_render_components |= 0xf << (i * 4);
1751 }
1752 }
1753
1754 tu_cs_emit_regs(cs,
1755 A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1756
1757 tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 1);
1758 tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1759 COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1760 COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1761 COND(fs->dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1762
1763 tu_cs_emit_regs(cs,
1764 A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1765 }
1766
1767 template <chip CHIP>
1768 void
1769 tu6_emit_vs(struct tu_cs *cs,
1770 const struct ir3_shader_variant *vs,
1771 uint32_t view_mask)
1772 {
1773 bool multi_pos_output = vs->multi_pos_output;
1774
1775 uint32_t multiview_views = util_logbase2(view_mask) + 1;
1776 uint32_t multiview_cntl = view_mask ?
1777 A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1778 A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1779 COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1780 : 0;
1781
1782 /* Copy what the blob does here. This will emit an extra 0x3f
1783 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1784 * this is working around yet.
1785 */
1786 if (cs->device->physical_device->info->a6xx.has_cp_reg_write) {
1787 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1788 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1789 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1790 } else {
1791 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1792 }
1793 tu_cs_emit(cs, multiview_cntl);
1794
1795 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1796 tu_cs_emit(cs, multiview_cntl);
1797
1798 if (multiview_cntl &&
1799 cs->device->physical_device->info->a6xx.supports_multiview_mask) {
1800 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1801 tu_cs_emit(cs, view_mask);
1802 }
1803
1804 if (CHIP >= A7XX) {
1805 tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_CNTL, 1);
1806 tu_cs_emit(cs, multiview_cntl);
1807
1808 tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_MASK, 1);
1809 tu_cs_emit(cs, view_mask);
1810 }
1811
1812 tu6_emit_vfd_dest(cs, vs);
1813
1814 const uint32_t vertexid_regid =
1815 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
1816 const uint32_t instanceid_regid =
1817 ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
1818
1819 /* Note: we currently don't support multiview with tess or GS. If we did,
1820 * and the HW actually works, then we'd have to somehow share this across
1821 * stages. Note that the blob doesn't support this either.
1822 */
1823 const uint32_t viewid_regid =
1824 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
1825
1826 const uint32_t vs_primitiveid_regid =
1827 ir3_find_sysval_regid(vs, SYSTEM_VALUE_PRIMITIVE_ID);
1828
1829 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 1);
1830 tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
1831 A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
1832 A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
1833 A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
1834 }
1835 TU_GENX(tu6_emit_vs);
1836
1837 template <chip CHIP>
1838 void
1839 tu6_emit_hs(struct tu_cs *cs,
1840 const struct ir3_shader_variant *hs)
1841 {
1842 const uint32_t hs_rel_patch_regid =
1843 ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1844 const uint32_t hs_invocation_regid =
1845 ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3);
1846
1847 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_2, 1);
1848 tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
1849 A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
1850
1851 if (hs) {
1852 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1853 tu_cs_emit(cs, hs->tess.tcs_vertices_out);
1854 }
1855 }
1856 TU_GENX(tu6_emit_hs);
1857
1858 template <chip CHIP>
1859 void
1860 tu6_emit_ds(struct tu_cs *cs,
1861 const struct ir3_shader_variant *ds)
1862 {
1863 const uint32_t ds_rel_patch_regid =
1864 ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1865 const uint32_t tess_coord_x_regid =
1866 ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD);
1867 const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
1868 tess_coord_x_regid + 1 :
1869 regid(63, 0);
1870 const uint32_t ds_primitiveid_regid =
1871 ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID);
1872
1873 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_3, 2);
1874 tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
1875 A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
1876 A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
1877 A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
1878 tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
1879 }
1880 TU_GENX(tu6_emit_ds);
1881
1882 static enum a6xx_tess_output
1883 primitive_to_tess(enum mesa_prim primitive) {
1884 switch (primitive) {
1885 case MESA_PRIM_POINTS:
1886 return TESS_POINTS;
1887 case MESA_PRIM_LINE_STRIP:
1888 return TESS_LINES;
1889 case MESA_PRIM_TRIANGLE_STRIP:
1890 return TESS_CW_TRIS;
1891 default:
1892 unreachable("");
1893 }
1894 }
1895
1896 template <chip CHIP>
1897 void
1898 tu6_emit_gs(struct tu_cs *cs,
1899 const struct ir3_shader_variant *gs)
1900 {
1901 const uint32_t gsheader_regid =
1902 ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3);
1903
1904 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_5, 1);
1905 tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
1906 0xfc00);
1907
1908 if (gs) {
1909 uint32_t vertices_out, invocations;
1910
1911 vertices_out = gs->gs.vertices_out - 1;
1912 enum a6xx_tess_output output = primitive_to_tess((enum mesa_prim) gs->gs.output_primitive);
1913 invocations = gs->gs.invocations - 1;
1914
1915 uint32_t primitive_cntl =
1916 A6XX_PC_PRIMITIVE_CNTL_5(.gs_vertices_out = vertices_out,
1917 .gs_invocations = invocations,
1918 .gs_output = output,).value;
1919
1920 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
1921 tu_cs_emit(cs, primitive_cntl);
1922
1923 if (CHIP >= A7XX) {
1924 tu_cs_emit_pkt4(cs, REG_A7XX_VPC_PRIMITIVE_CNTL_5, 1);
1925 tu_cs_emit(cs, primitive_cntl);
1926 } else {
1927 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
1928 tu_cs_emit(cs, 0xff);
1929 }
1930 }
1931 }
1932 TU_GENX(tu6_emit_gs);
1933
1934 template <chip CHIP>
1935 void
1936 tu6_emit_fs(struct tu_cs *cs,
1937 const struct ir3_shader_variant *fs)
1938 {
1939 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_6, 1);
1940 tu_cs_emit(cs, COND(fs && fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN));
1941
1942 tu_cs_emit_regs(cs, A6XX_PC_PS_CNTL(.primitiveiden = fs && fs->reads_primid));
1943
1944 if (CHIP >= A7XX) {
1945 tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
1946 tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
1947 }
1948
1949 if (fs) {
1950 tu6_emit_fs_inputs<CHIP>(cs, fs);
1951 tu6_emit_fs_outputs(cs, fs);
1952 } else {
1953 /* TODO: check if these can be skipped if fs is disabled */
1954 struct ir3_shader_variant dummy_variant = {};
1955 tu6_emit_fs_inputs<CHIP>(cs, &dummy_variant);
1956 tu6_emit_fs_outputs(cs, &dummy_variant);
1957 }
1958 }
1959 TU_GENX(tu6_emit_fs);
1960
1961 template <chip CHIP>
1962 static void
1963 tu6_emit_variant(struct tu_cs *cs,
1964 gl_shader_stage stage,
1965 const struct ir3_shader_variant *xs,
1966 struct tu_pvtmem_config *pvtmem_config,
1967 uint32_t view_mask,
1968 uint64_t binary_iova)
1969 {
1970 if (stage == MESA_SHADER_COMPUTE) {
1971 tu6_emit_cs_config<CHIP>(cs, xs, pvtmem_config, binary_iova);
1972 return;
1973 }
1974
1975 tu6_emit_xs(cs, stage, xs, pvtmem_config, binary_iova);
1976
1977 switch (stage) {
1978 case MESA_SHADER_VERTEX:
1979 tu6_emit_vs<CHIP>(cs, xs, view_mask);
1980 break;
1981 case MESA_SHADER_TESS_CTRL:
1982 tu6_emit_hs<CHIP>(cs, xs);
1983 break;
1984 case MESA_SHADER_TESS_EVAL:
1985 tu6_emit_ds<CHIP>(cs, xs);
1986 break;
1987 case MESA_SHADER_GEOMETRY:
1988 tu6_emit_gs<CHIP>(cs, xs);
1989 break;
1990 case MESA_SHADER_FRAGMENT:
1991 tu6_emit_fs<CHIP>(cs, xs);
1992 break;
1993 default:
1994 unreachable("unknown shader stage");
1995 }
1996 }
1997
1998 static VkResult
1999 tu_setup_pvtmem(struct tu_device *dev,
2000 struct tu_shader *shader,
2001 struct tu_pvtmem_config *config,
2002 uint32_t pvtmem_bytes,
2003 bool per_wave)
2004 {
2005 if (!pvtmem_bytes) {
2006 memset(config, 0, sizeof(*config));
2007 return VK_SUCCESS;
2008 }
2009
2010 /* Allocating private memory BOs on a per-pipeline basis has a substantial
2011 * memory footprint, and it isn't necessary: the same BO can be shared by
2012 * multiple pipelines as long as they use the same private memory layout
2013 * (sizes and per-wave/per-fiber). Otherwise another active pipeline using
2014 * the same BO with a differing private memory layout could overwrite it,
2015 * resulting in memory corruption.
2016 *
2017 * To avoid this, we create private memory BOs at the device level with an
2018 * associated private memory layout, then dynamically grow them when
2019 * needed and reuse them across pipelines. Growth is done in powers of
2020 * two so that we can avoid frequent reallocation of the private memory
2021 * BOs.
2022 */
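/* Rough sizing sketch (illustrative numbers only, not taken from any real
 * GPU info): a request of 1000 bytes per fiber is aligned up to 512 and
 * rounded to the next power of two, giving per_fiber_size = 1024; with
 * e.g. 128 fibers_per_sp that is 128 KiB per SP (already 4 KiB aligned),
 * and with e.g. 4 SPs the BO ends up 512 KiB total.
 */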
2023
2024 struct tu_pvtmem_bo *pvtmem_bo =
2025 per_wave ? &dev->wave_pvtmem_bo : &dev->fiber_pvtmem_bo;
2026 mtx_lock(&pvtmem_bo->mtx);
2027
2028 if (pvtmem_bo->per_fiber_size < pvtmem_bytes) {
2029 if (pvtmem_bo->bo)
2030 tu_bo_finish(dev, pvtmem_bo->bo);
2031
2032 pvtmem_bo->per_fiber_size =
2033 util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
2034 pvtmem_bo->per_sp_size =
2035 ALIGN(pvtmem_bo->per_fiber_size *
2036 dev->physical_device->info->fibers_per_sp,
2037 1 << 12);
2038 uint32_t total_size =
2039 dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
2040
2041 VkResult result = tu_bo_init_new(dev, NULL, &pvtmem_bo->bo, total_size,
2042 TU_BO_ALLOC_INTERNAL_RESOURCE, "pvtmem");
2043 if (result != VK_SUCCESS) {
2044 mtx_unlock(&pvtmem_bo->mtx);
2045 return result;
2046 }
2047 }
2048
2049 config->per_wave = per_wave;
2050 config->per_fiber_size = pvtmem_bo->per_fiber_size;
2051 config->per_sp_size = pvtmem_bo->per_sp_size;
2052
2053 shader->pvtmem_bo = tu_bo_get_ref(pvtmem_bo->bo);
2054 config->iova = shader->pvtmem_bo->iova;
2055
2056 mtx_unlock(&pvtmem_bo->mtx);
2057
2058 return VK_SUCCESS;
2059 }
2060
2061 static uint64_t
2062 tu_upload_variant(struct tu_cs *cs,
2063 const struct ir3_shader_variant *variant)
2064 {
2065 struct tu_cs_memory memory;
2066
2067 if (!variant)
2068 return 0;
2069
2070 /* This expects to get enough alignment because shaders are allocated first
2071 * and the total size is always aligned correctly.
2072 * Note: an assert in tu6_emit_xs_config validates the alignment.
2073 */
2074 tu_cs_alloc(cs, variant->info.size / 4, 1, &memory);
2075
2076 memcpy(memory.map, variant->bin, variant->info.size);
2077 return memory.iova;
2078 }
2079
2080 static VkResult
2081 tu_upload_shader(struct tu_device *dev,
2082 struct tu_shader *shader)
2083 {
2084 const struct ir3_shader_variant *v = shader->variant;
2085 const struct ir3_shader_variant *binning = v ? v->binning : NULL;
2086 const struct ir3_shader_variant *safe_const = shader->safe_const_variant;
2087
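/* Transform feedback is emitted during the binning pass, so when the VS has
 * stream output we reuse the full variant for binning rather than the
 * stripped-down binning variant.
 */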
2088 if (v->type == MESA_SHADER_VERTEX && v->stream_output.num_outputs != 0)
2089 binning = v;
2090
2091 uint32_t size = 0;
2092 if (v->type == MESA_SHADER_VERTEX)
2093 size += TU6_EMIT_VFD_DEST_MAX_DWORDS;
2094
2095 const unsigned xs_size = 128;
2096 const unsigned vpc_size = 32 + (v->stream_output.num_outputs != 0 ? 256 : 0);
2097
2098 size += xs_size + tu_xs_get_additional_cs_size_dwords(v);
2099 size += v->info.size / 4;
2100 if (binning) {
2101 size += xs_size + tu_xs_get_additional_cs_size_dwords(binning);
2102 size += binning->info.size / 4;
2103 }
2104
2105 if (safe_const) {
2106 size += xs_size + tu_xs_get_additional_cs_size_dwords(safe_const);
2107 size += safe_const->info.size / 4;
2108 }
2109
2110 /* We emit an empty VPC including streamout state in the binning draw state */
2111 if (binning || v->type == MESA_SHADER_GEOMETRY) {
2112 size += vpc_size;
2113 }
2114
2115 pthread_mutex_lock(&dev->pipeline_mutex);
2116 VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2117 size * 4, 128);
2118 pthread_mutex_unlock(&dev->pipeline_mutex);
2119
2120 if (result != VK_SUCCESS)
2121 return result;
2122
2123 uint32_t pvtmem_size = v->pvtmem_size;
2124 bool per_wave = v->pvtmem_per_wave;
2125
2126 if (v->binning) {
2127 pvtmem_size = MAX2(pvtmem_size, shader->variant->binning->pvtmem_size);
2128 if (!shader->variant->binning->pvtmem_per_wave)
2129 per_wave = false;
2130 }
2131
2132 if (shader->safe_const_variant) {
2133 pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->pvtmem_size);
2134 if (!shader->safe_const_variant->pvtmem_per_wave)
2135 per_wave = false;
2136
2137 if (shader->safe_const_variant->binning) {
2138 pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->binning->pvtmem_size);
2139 if (!shader->safe_const_variant->binning->pvtmem_per_wave)
2140 per_wave = false;
2141 }
2142 }
2143
2144 struct tu_pvtmem_config pvtmem_config;
2145
2146 result = tu_setup_pvtmem(dev, shader, &pvtmem_config, pvtmem_size, per_wave);
2147 if (result != VK_SUCCESS) {
2148 pthread_mutex_lock(&dev->pipeline_mutex);
2149 tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
2150 pthread_mutex_unlock(&dev->pipeline_mutex);
2151 return result;
2152 }
2153
2154 TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2155 tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2156
2157 uint64_t iova = tu_upload_variant(&shader->cs, v);
2158 uint64_t binning_iova = tu_upload_variant(&shader->cs, binning);
2159 uint64_t safe_const_iova = tu_upload_variant(&shader->cs, safe_const);
2160
2161 struct tu_cs sub_cs;
2162 tu_cs_begin_sub_stream(&shader->cs, xs_size +
2163 tu_xs_get_additional_cs_size_dwords(v), &sub_cs);
2164 TU_CALLX(dev, tu6_emit_variant)(
2165 &sub_cs, shader->variant->type, shader->variant, &pvtmem_config,
2166 shader->view_mask, iova);
2167 shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2168
2169 if (safe_const) {
2170 tu_cs_begin_sub_stream(&shader->cs, xs_size +
2171 tu_xs_get_additional_cs_size_dwords(safe_const), &sub_cs);
2172 TU_CALLX(dev, tu6_emit_variant)(
2173 &sub_cs, v->type, safe_const, &pvtmem_config, shader->view_mask,
2174 safe_const_iova);
2175 shader->safe_const_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2176 }
2177
2178 if (binning) {
2179 tu_cs_begin_sub_stream(&shader->cs, xs_size + vpc_size +
2180 tu_xs_get_additional_cs_size_dwords(binning), &sub_cs);
2181 TU_CALLX(dev, tu6_emit_variant)(
2182 &sub_cs, v->type, binning, &pvtmem_config, shader->view_mask,
2183 binning_iova);
2184 /* emit an empty VPC */
2185 TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, binning, NULL, NULL, NULL, NULL);
2186 shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2187 }
2188
2189 /* We don't support binning variants for GS, so the same draw state is used
2190 * when binning and when drawing, but the VPC draw state is not executed
2191 * when binning so we still need to generate an appropriate VPC config for
2192 * binning.
2193 */
2194 if (v->type == MESA_SHADER_GEOMETRY) {
2195 tu_cs_begin_sub_stream(&shader->cs, vpc_size, &sub_cs);
2196 TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, NULL, NULL, NULL, v, NULL);
2197 shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2198 }
2199
2200 return VK_SUCCESS;
2201 }
2202
2203 static bool
2204 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2205 struct blob *blob);
2206
2207 static struct vk_pipeline_cache_object *
2208 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2209 const void *key_data,
2210 size_t key_size,
2211 struct blob_reader *blob);
2212
2213 static void
2214 tu_shader_pipeline_cache_object_destroy(struct vk_device *vk_device,
2215 struct vk_pipeline_cache_object *object)
2216 {
2217 struct tu_device *device = container_of(vk_device, struct tu_device, vk);
2218 struct tu_shader *shader =
2219 container_of(object, struct tu_shader, base);
2220
2221 vk_pipeline_cache_object_finish(&shader->base);
2222 tu_shader_destroy(device, shader);
2223 }
2224
2225 const struct vk_pipeline_cache_object_ops tu_shader_ops = {
2226 .serialize = tu_shader_serialize,
2227 .deserialize = tu_shader_deserialize,
2228 .destroy = tu_shader_pipeline_cache_object_destroy,
2229 };
2230
2231 static struct tu_shader *
2232 tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
2233 {
2234 VK_MULTIALLOC(ma);
2235 VK_MULTIALLOC_DECL(&ma, struct tu_shader, shader, 1);
2236 VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
2237
2238 if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
2239 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2240 return NULL;
2241
2242 memcpy(obj_key_data, key_data, key_size);
2243
2244 vk_pipeline_cache_object_init(&dev->vk, &shader->base,
2245 &tu_shader_ops, obj_key_data, key_size);
2246
2247 shader->const_state.fdm_ubo.idx = -1;
2248 shader->const_state.dynamic_offsets_ubo.idx = -1;
2249 shader->const_state.inline_uniforms_ubo.idx = -1;
2250
2251 return shader;
2252 }
2253
2254 static bool
2255 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2256 struct blob *blob)
2257 {
2258 struct tu_shader *shader =
2259 container_of(object, struct tu_shader, base);
2260
2261 blob_write_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2262 blob_write_bytes(blob, &shader->dynamic_descriptor_sizes,
2263 sizeof(shader->dynamic_descriptor_sizes));
2264 blob_write_uint32(blob, shader->view_mask);
2265 blob_write_uint8(blob, shader->active_desc_sets);
2266
2267 ir3_store_variant(blob, shader->variant);
2268
2269 if (shader->safe_const_variant) {
2270 blob_write_uint8(blob, 1);
2271 ir3_store_variant(blob, shader->safe_const_variant);
2272 } else {
2273 blob_write_uint8(blob, 0);
2274 }
2275
2276
2277
2278 switch (shader->variant->type) {
2279 case MESA_SHADER_TESS_EVAL:
2280 blob_write_bytes(blob, &shader->tes, sizeof(shader->tes));
2281 break;
2282 case MESA_SHADER_FRAGMENT:
2283 blob_write_bytes(blob, &shader->fs, sizeof(shader->fs));
2284 break;
2285 default:
2286 break;
2287 }
2288
2289 return true;
2290 }
2291
2292 static struct vk_pipeline_cache_object *
2293 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2294 const void *key_data,
2295 size_t key_size,
2296 struct blob_reader *blob)
2297 {
2298 struct tu_device *dev =
2299 container_of(cache->base.device, struct tu_device, vk);
2300 struct tu_shader *shader =
2301 tu_shader_init(dev, key_data, key_size);
2302
2303 if (!shader)
2304 return NULL;
2305
2306 blob_copy_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2307 blob_copy_bytes(blob, &shader->dynamic_descriptor_sizes,
2308 sizeof(shader->dynamic_descriptor_sizes));
2309 shader->view_mask = blob_read_uint32(blob);
2310 shader->active_desc_sets = blob_read_uint8(blob);
2311
2312 shader->variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2313
2314 bool has_safe_const = blob_read_uint8(blob);
2315 if (has_safe_const)
2316 shader->safe_const_variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2317
2318 switch (shader->variant->type) {
2319 case MESA_SHADER_TESS_EVAL:
2320 blob_copy_bytes(blob, &shader->tes, sizeof(shader->tes));
2321 break;
2322 case MESA_SHADER_FRAGMENT:
2323 blob_copy_bytes(blob, &shader->fs, sizeof(shader->fs));
2324 break;
2325 default:
2326 break;
2327 }
2328
2329 VkResult result = tu_upload_shader(dev, shader);
2330 if (result != VK_SUCCESS) {
2331 vk_free(&dev->vk.alloc, shader);
2332 return NULL;
2333 }
2334
2335 return &shader->base;
2336 }
2337
2338 VkResult
2339 tu_shader_create(struct tu_device *dev,
2340 struct tu_shader **shader_out,
2341 nir_shader *nir,
2342 const struct tu_shader_key *key,
2343 const struct ir3_shader_key *ir3_key,
2344 const void *key_data,
2345 size_t key_size,
2346 struct tu_pipeline_layout *layout,
2347 bool executable_info)
2348 {
2349 struct tu_shader *shader = tu_shader_init(dev, key_data, key_size);
2350
2351 if (!shader)
2352 return VK_ERROR_OUT_OF_HOST_MEMORY;
2353
2354 const nir_opt_access_options access_options = {
2355 .is_vulkan = true,
2356 };
2357 NIR_PASS_V(nir, nir_opt_access, &access_options);
2358
2359 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2360 const nir_input_attachment_options att_options = {
2361 .use_fragcoord_sysval = true,
2362 .use_layer_id_sysval = false,
2363 /* When using multiview rendering, we must use
2364 * gl_ViewIndex as the layer id to pass to the texture
2365 * sampling function. gl_Layer doesn't work when
2366 * multiview is enabled.
2367 */
2368 .use_view_id_for_layer = key->multiview_mask != 0,
2369 .unscaled_input_attachment_ir3 = key->unscaled_input_fragcoord,
2370 };
2371 NIR_PASS_V(nir, nir_lower_input_attachments, &att_options);
2372 }
2373
2374 /* This has to happen before lower_input_attachments, because we have to
2375 * lower input attachment coordinates except if unscaled.
2376 */
2377 const struct lower_fdm_options fdm_options = {
2378 .num_views = MAX2(util_last_bit(key->multiview_mask), 1),
2379 .adjust_fragcoord = key->fragment_density_map,
2380 };
2381 NIR_PASS_V(nir, tu_nir_lower_fdm, &fdm_options);
2382
2383
2384 /* This needs to happen before multiview lowering which rewrites store
2385 * instructions of the position variable, so that we can just rewrite one
2386 * store at the end instead of having to rewrite every store specified by
2387 * the user.
2388 */
2389 ir3_nir_lower_io_to_temporaries(nir);
2390
2391 if (nir->info.stage == MESA_SHADER_VERTEX && key->multiview_mask) {
2392 tu_nir_lower_multiview(nir, key->multiview_mask, dev);
2393 }
2394
2395 if (nir->info.stage == MESA_SHADER_FRAGMENT && key->force_sample_interp) {
2396 nir_foreach_shader_in_variable(var, nir) {
2397 if (!var->data.centroid)
2398 var->data.sample = true;
2399 }
2400 }
2401
2402 NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
2403 nir_address_format_32bit_offset);
2404
2405 NIR_PASS_V(nir, nir_lower_explicit_io,
2406 nir_var_mem_ubo | nir_var_mem_ssbo,
2407 nir_address_format_vec2_index_32bit_offset);
2408
2409 NIR_PASS_V(nir, nir_lower_explicit_io,
2410 nir_var_mem_global,
2411 nir_address_format_64bit_global);
2412
2413 if (nir->info.stage == MESA_SHADER_COMPUTE) {
2414 if (!nir->info.shared_memory_explicit_layout) {
2415 NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
2416 nir_var_mem_shared, shared_type_info);
2417 }
2418 NIR_PASS_V(nir, nir_lower_explicit_io,
2419 nir_var_mem_shared,
2420 nir_address_format_32bit_offset);
2421
2422 if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
2423 const unsigned chunk_size = 16; /* max single store size */
2424 /* Shared memory is allocated in 1024b chunks in HW, but the zero-init
2425 * extension only requires us to initialize the memory that is allocated
2426 * to the shader at the API level, and it's up to the user to ensure
2427 * that accesses stay within those bounds.
2428 */
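/* For instance (illustrative numbers only): a shader declaring 100 bytes of
 * shared memory gets shared_size aligned up to 112, and the zero-init pass
 * then emits stores of at most chunk_size = 16 bytes to clear it.
 */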
2429 const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2430 NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
2431 }
2432
2433 const struct nir_lower_compute_system_values_options compute_sysval_options = {
2434 .has_base_workgroup_id = true,
2435 };
2436 NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_sysval_options);
2437 }
2438
2439 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
2440 nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
2441
2442 /* Gather information for transform feedback. This should be called after:
2443 * - nir_split_per_member_structs.
2444 * - nir_remove_dead_variables with varyings, so that we could align
2445 * stream outputs correctly.
2446 * - nir_assign_io_var_locations - to have valid driver_location
2447 */
2448 struct ir3_stream_output_info so_info = {};
2449 if (nir->info.stage == MESA_SHADER_VERTEX ||
2450 nir->info.stage == MESA_SHADER_TESS_EVAL ||
2451 nir->info.stage == MESA_SHADER_GEOMETRY)
2452 tu_gather_xfb_info(nir, &so_info);
2453
2454 for (unsigned i = 0; i < layout->num_sets; i++) {
2455 if (layout->set[i].layout) {
2456 shader->dynamic_descriptor_sizes[i] =
2457 layout->set[i].layout->dynamic_offset_size;
2458 } else {
2459 shader->dynamic_descriptor_sizes[i] = -1;
2460 }
2461 }
2462
2463 unsigned reserved_consts_vec4 = 0;
2464 NIR_PASS_V(nir, tu_lower_io, dev, shader, layout, &reserved_consts_vec4);
2465
2466 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
2467
2468 ir3_finalize_nir(dev->compiler, nir);
2469
2470 const struct ir3_shader_options options = {
2471 .num_reserved_user_consts = reserved_consts_vec4,
2472 .api_wavesize = key->api_wavesize,
2473 .real_wavesize = key->real_wavesize,
2474 .push_consts_type = shader->const_state.push_consts.type,
2475 .push_consts_base = shader->const_state.push_consts.lo,
2476 .push_consts_dwords = shader->const_state.push_consts.dwords,
2477 };
2478
2479 struct ir3_shader *ir3_shader =
2480 ir3_shader_from_nir(dev->compiler, nir, &options, &so_info);
2481
2482 shader->variant =
2483 ir3_shader_create_variant(ir3_shader, ir3_key, executable_info);
2484
2485 if (ir3_exceeds_safe_constlen(shader->variant)) {
2486 struct ir3_shader_key safe_constlen_key = *ir3_key;
2487 safe_constlen_key.safe_constlen = true;
2488 shader->safe_const_variant =
2489 ir3_shader_create_variant(ir3_shader, &safe_constlen_key,
2490 executable_info);
2491 }
2492
2493 ir3_shader_destroy(ir3_shader);
2494
2495 shader->view_mask = key->multiview_mask;
2496
2497 switch (shader->variant->type) {
2498 case MESA_SHADER_TESS_EVAL: {
2499 const struct ir3_shader_variant *tes = shader->variant;
2500 if (tes->tess.point_mode) {
2501 shader->tes.tess_output_lower_left =
2502 shader->tes.tess_output_upper_left = TESS_POINTS;
2503 } else if (tes->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) {
2504 shader->tes.tess_output_lower_left =
2505 shader->tes.tess_output_upper_left = TESS_LINES;
2506 } else if (tes->tess.ccw) {
2507 /* Tessellation orientation in HW is specified relative to a lower-left
2508 * origin, so the winding needs to be swapped when the origin is upper-left.
2509 */
2510 shader->tes.tess_output_lower_left = TESS_CCW_TRIS;
2511 shader->tes.tess_output_upper_left = TESS_CW_TRIS;
2512 } else {
2513 shader->tes.tess_output_lower_left = TESS_CW_TRIS;
2514 shader->tes.tess_output_upper_left = TESS_CCW_TRIS;
2515 }
2516
2517 switch (tes->tess.spacing) {
2518 case TESS_SPACING_EQUAL:
2519 shader->tes.tess_spacing = TESS_EQUAL;
2520 break;
2521 case TESS_SPACING_FRACTIONAL_ODD:
2522 shader->tes.tess_spacing = TESS_FRACTIONAL_ODD;
2523 break;
2524 case TESS_SPACING_FRACTIONAL_EVEN:
2525 shader->tes.tess_spacing = TESS_FRACTIONAL_EVEN;
2526 break;
2527 case TESS_SPACING_UNSPECIFIED:
2528 default:
2529 unreachable("invalid tess spacing");
2530 }
2531
2532 break;
2533 }
2534 case MESA_SHADER_FRAGMENT: {
2535 const struct ir3_shader_variant *fs = shader->variant;
2536 shader->fs.per_samp = fs->per_samp || ir3_key->sample_shading;
2537 shader->fs.has_fdm = key->fragment_density_map;
2538 if (fs->has_kill)
2539 shader->fs.lrz.status |= TU_LRZ_FORCE_DISABLE_WRITE;
2540 if (fs->no_earlyz || (fs->writes_pos && !fs->fs.early_fragment_tests))
2541 shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2542 /* FDM isn't compatible with LRZ, because the LRZ image uses the original
2543 * resolution and we would need to use the low resolution.
2544 *
2545 * TODO: Use a patchpoint to only disable LRZ for scaled bins.
2546 */
2547 if (key->fragment_density_map)
2548 shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2549 if (!fs->fs.early_fragment_tests &&
2550 (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref || fs->writes_smask)) {
2551 shader->fs.lrz.force_late_z = true;
2552 }
2553 break;
2554 }
2555 default:
2556 break;
2557 }
2558
2559 VkResult result = tu_upload_shader(dev, shader);
2560 if (result != VK_SUCCESS) {
2561 vk_free(&dev->vk.alloc, shader);
2562 return result;
2563 }
2564
2565 *shader_out = shader;
2566 return VK_SUCCESS;
2567 }
2568
2569 static void
2570 tu_link_shaders(nir_shader **shaders, unsigned shaders_count)
2571 {
2572 nir_shader *consumer = NULL;
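/* Walk the stages from last to first: each iteration links the current stage
 * (producer) with the closest enabled later stage (consumer), then the
 * producer becomes the consumer for the next iteration.
 */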
2573 for (gl_shader_stage stage = (gl_shader_stage) (shaders_count - 1);
2574 stage >= MESA_SHADER_VERTEX; stage = (gl_shader_stage) (stage - 1)) {
2575 if (!shaders[stage])
2576 continue;
2577
2578 nir_shader *producer = shaders[stage];
2579 if (!consumer) {
2580 consumer = producer;
2581 continue;
2582 }
2583
2584 if (nir_link_opt_varyings(producer, consumer)) {
2585 NIR_PASS_V(consumer, nir_opt_constant_folding);
2586 NIR_PASS_V(consumer, nir_opt_algebraic);
2587 NIR_PASS_V(consumer, nir_opt_dce);
2588 }
2589
2590 const nir_remove_dead_variables_options out_var_opts = {
2591 .can_remove_var = nir_vk_is_not_xfb_output,
2592 };
2593 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, &out_var_opts);
2594
2595 NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
2596
2597 bool progress = nir_remove_unused_varyings(producer, consumer);
2598
2599 nir_compact_varyings(producer, consumer, true);
2600 if (progress) {
2601 if (nir_lower_global_vars_to_local(producer)) {
2602 /* Remove dead writes, which can remove input loads */
2603 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
2604 NIR_PASS_V(producer, nir_opt_dce);
2605 }
2606 nir_lower_global_vars_to_local(consumer);
2607 }
2608
2609 consumer = producer;
2610 }
2611
2612 /* Gather info after linking so that we can fill out the ir3 shader key.
2613 */
2614 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2615 stage <= MESA_SHADER_FRAGMENT; stage = (gl_shader_stage) (stage + 1)) {
2616 if (shaders[stage])
2617 nir_shader_gather_info(shaders[stage],
2618 nir_shader_get_entrypoint(shaders[stage]));
2619 }
2620 }
2621
2622 static uint32_t
2623 tu6_get_tessmode(const struct nir_shader *shader)
2624 {
2625 enum tess_primitive_mode primitive_mode = shader->info.tess._primitive_mode;
2626 switch (primitive_mode) {
2627 case TESS_PRIMITIVE_ISOLINES:
2628 return IR3_TESS_ISOLINES;
2629 case TESS_PRIMITIVE_TRIANGLES:
2630 return IR3_TESS_TRIANGLES;
2631 case TESS_PRIMITIVE_QUADS:
2632 return IR3_TESS_QUADS;
2633 case TESS_PRIMITIVE_UNSPECIFIED:
2634 return IR3_TESS_NONE;
2635 default:
2636 unreachable("bad tessmode");
2637 }
2638 }
2639
2640 VkResult
2641 tu_compile_shaders(struct tu_device *device,
2642 VkPipelineCreateFlags2KHR pipeline_flags,
2643 const VkPipelineShaderStageCreateInfo **stage_infos,
2644 nir_shader **nir,
2645 const struct tu_shader_key *keys,
2646 struct tu_pipeline_layout *layout,
2647 const unsigned char *pipeline_sha1,
2648 struct tu_shader **shaders,
2649 char **nir_initial_disasm,
2650 void *nir_initial_disasm_mem_ctx,
2651 nir_shader **nir_out,
2652 VkPipelineCreationFeedback *stage_feedbacks)
2653 {
2654 struct ir3_shader_key ir3_key = {};
2655 VkResult result = VK_SUCCESS;
2656 void *mem_ctx = ralloc_context(NULL);
2657
2658 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2659 stage = (gl_shader_stage) (stage + 1)) {
2660 const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2661 if (!stage_info)
2662 continue;
2663
2664 int64_t stage_start = os_time_get_nano();
2665
2666 nir[stage] = tu_spirv_to_nir(device, mem_ctx, pipeline_flags,
2667 stage_info, stage);
2668 if (!nir[stage]) {
2669 result = VK_ERROR_OUT_OF_HOST_MEMORY;
2670 goto fail;
2671 }
2672
2673 stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2674 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2675 }
2676
2677 if (nir[MESA_SHADER_GEOMETRY])
2678 ir3_key.has_gs = true;
2679
2680 ir3_key.sample_shading = keys[MESA_SHADER_FRAGMENT].force_sample_interp;
2681
2682 if (nir_initial_disasm) {
2683 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2684 stage < MESA_SHADER_STAGES;
2685 stage = (gl_shader_stage) (stage + 1)) {
2686 if (!nir[stage])
2687 continue;
2688
2689 nir_initial_disasm[stage] =
2690 nir_shader_as_str(nir[stage], nir_initial_disasm_mem_ctx);
2691 }
2692 }
2693
2694 tu_link_shaders(nir, MESA_SHADER_STAGES);
2695
2696 if (nir_out) {
2697 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2698 stage < MESA_SHADER_STAGES; stage = (gl_shader_stage) (stage + 1)) {
2699 if (!nir[stage])
2700 continue;
2701
2702 nir_out[stage] = nir_shader_clone(NULL, nir[stage]);
2703 }
2704 }
2705
2706 /* With pipelines, tessellation modes can be set on either shader, for
2707 * compatibility with HLSL and GLSL, and the driver is supposed to merge
2708 * them. Shader objects require modes to be set on at least the TES except
2709 * for OutputVertices which has to be set at least on the TCS. Make sure
2710 * all modes are set on the TES when compiling together multiple shaders,
2711 * and then from this point on we will use the modes in the TES (and output
2712 * vertices on the TCS).
2713 */
2714 if (nir[MESA_SHADER_TESS_EVAL]) {
2715 nir_shader *tcs = nir[MESA_SHADER_TESS_CTRL];
2716 nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
2717
2718 if (tes->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
2719 tes->info.tess._primitive_mode = tcs->info.tess._primitive_mode;
2720
2721 tes->info.tess.point_mode |= tcs->info.tess.point_mode;
2722 tes->info.tess.ccw |= tcs->info.tess.ccw;
2723
2724 if (tes->info.tess.spacing == TESS_SPACING_UNSPECIFIED) {
2725 tes->info.tess.spacing = tcs->info.tess.spacing;
2726 }
2727
2728 if (tcs->info.tess.tcs_vertices_out == 0)
2729 tcs->info.tess.tcs_vertices_out = tes->info.tess.tcs_vertices_out;
2730
2731 ir3_key.tessellation = tu6_get_tessmode(tes);
2732 }
2733
2734 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2735 stage = (gl_shader_stage) (stage + 1)) {
2736 if (!nir[stage])
2737 continue;
2738
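/* The TCS has to store PrimitiveID whenever any later stage consumes it:
 * the FS reads it as a varying input, while GS/TES read it as a sysval.
 */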
2739 if (stage > MESA_SHADER_TESS_CTRL) {
2740 if (stage == MESA_SHADER_FRAGMENT) {
2741 ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2742 (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2743 } else {
2744 ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2745 BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2746 }
2747 }
2748 }
2749
2750 /* In the tess-but-not-FS case we don't know whether the FS will read
2751 * PrimID so we need to unconditionally store it.
2752 */
2753 if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT])
2754 ir3_key.tcs_store_primid = true;
2755
2756 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2757 stage = (gl_shader_stage) (stage + 1)) {
2758 if (!nir[stage] || shaders[stage])
2759 continue;
2760
2761 int64_t stage_start = os_time_get_nano();
2762
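/* The per-shader cache key is the 20-byte pipeline SHA1 plus one byte
 * identifying the stage.
 */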
2763 unsigned char shader_sha1[21];
2764 memcpy(shader_sha1, pipeline_sha1, 20);
2765 shader_sha1[20] = (unsigned char) stage;
2766
2767 result = tu_shader_create(device,
2768 &shaders[stage], nir[stage], &keys[stage],
2769 &ir3_key, shader_sha1, sizeof(shader_sha1),
2770 layout, !!nir_initial_disasm);
2771 if (result != VK_SUCCESS) {
2772 goto fail;
2773 }
2774
2775 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2776 }
2777
2778 ralloc_free(mem_ctx);
2779
2780 return VK_SUCCESS;
2781
2782 fail:
2783 ralloc_free(mem_ctx);
2784
2785 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2786 stage = (gl_shader_stage) (stage + 1)) {
2787 if (shaders[stage]) {
2788 tu_shader_destroy(device, shaders[stage]);
2789 }
2790 if (nir_out && nir_out[stage]) {
2791 ralloc_free(nir_out[stage]);
2792 }
2793 }
2794
2795 return result;
2796 }
2797
2798 void
2799 tu_shader_key_subgroup_size(struct tu_shader_key *key,
2800 bool allow_varying_subgroup_size,
2801 bool require_full_subgroups,
2802 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info,
2803 struct tu_device *dev)
2804 {
2805 enum ir3_wavesize_option api_wavesize, real_wavesize;
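/* Decision summary: without double threadsize support everything is
 * single-wave; with varying subgroup size allowed both API and real wavesize
 * may be single-or-double; otherwise the API wavesize follows the required
 * subgroup size (or defaults to double, the exposed subgroupSize), and the
 * real wavesize is only pinned to it when full subgroups are required.
 */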
2806 if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
2807 api_wavesize = IR3_SINGLE_ONLY;
2808 real_wavesize = IR3_SINGLE_ONLY;
2809 } else {
2810 if (allow_varying_subgroup_size) {
2811 api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2812 } else {
2813 if (subgroup_info) {
2814 if (subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
2815 api_wavesize = IR3_SINGLE_ONLY;
2816 } else {
2817 assert(subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
2818 api_wavesize = IR3_DOUBLE_ONLY;
2819 }
2820 } else {
2821 /* Match the exposed subgroupSize. */
2822 api_wavesize = IR3_DOUBLE_ONLY;
2823 }
2824
2825 if (require_full_subgroups)
2826 real_wavesize = api_wavesize;
2827 else if (api_wavesize == IR3_SINGLE_ONLY)
2828 real_wavesize = IR3_SINGLE_ONLY;
2829 else
2830 real_wavesize = IR3_SINGLE_OR_DOUBLE;
2831 }
2832 }
2833
2834 key->api_wavesize = api_wavesize;
2835 key->real_wavesize = real_wavesize;
2836 }
2837
2838 static VkResult
2839 tu_empty_shader_create(struct tu_device *dev,
2840 struct tu_shader **shader_out,
2841 gl_shader_stage stage)
2842 {
2843 struct tu_shader *shader = tu_shader_init(dev, NULL, 0);
2844
2845 if (!shader)
2846 return VK_ERROR_OUT_OF_HOST_MEMORY;
2847
2848 pthread_mutex_lock(&dev->pipeline_mutex);
2849 VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2850 32 * 4, 128);
2851 pthread_mutex_unlock(&dev->pipeline_mutex);
2852
2853 if (result != VK_SUCCESS) {
2854 vk_free(&dev->vk.alloc, shader);
2855 return result;
2856 }
2857
2858 TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2859 tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2860
2861 struct tu_pvtmem_config pvtmem_config = { };
2862
2863 struct tu_cs sub_cs;
2864 tu_cs_begin_sub_stream(&shader->cs, 32, &sub_cs);
2865 TU_CALLX(dev, tu6_emit_variant)(&sub_cs, stage, NULL, &pvtmem_config, 0, 0);
2866 shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2867
2868 *shader_out = shader;
2869 return VK_SUCCESS;
2870 }
2871
2872 static VkResult
2873 tu_empty_fs_create(struct tu_device *dev, struct tu_shader **shader,
2874 bool fragment_density_map)
2875 {
2876 struct ir3_shader_key key = {};
2877 const struct ir3_shader_options options = {};
2878 struct ir3_stream_output_info so_info = {};
2879 const nir_shader_compiler_options *nir_options =
2880 ir3_get_compiler_options(dev->compiler);
2881 nir_builder fs_b;
2882
2883 fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options,
2884 "noop_fs");
2885
2886 *shader = tu_shader_init(dev, NULL, 0);
2887 if (!*shader)
2888 return VK_ERROR_OUT_OF_HOST_MEMORY;
2889
2890 (*shader)->fs.has_fdm = fragment_density_map;
2891 if (fragment_density_map)
2892 (*shader)->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2893
2894 for (unsigned i = 0; i < MAX_SETS; i++)
2895 (*shader)->dynamic_descriptor_sizes[i] = -1;
2896
2897 struct ir3_shader *ir3_shader =
2898 ir3_shader_from_nir(dev->compiler, fs_b.shader, &options, &so_info);
2899 (*shader)->variant = ir3_shader_create_variant(ir3_shader, &key, false);
2900 ir3_shader_destroy(ir3_shader);
2901
2902 return tu_upload_shader(dev, *shader);
2903 }
2904
2905 VkResult
2906 tu_init_empty_shaders(struct tu_device *dev)
2907 {
2908 VkResult result;
2909
2910 result = tu_empty_shader_create(dev, &dev->empty_tcs, MESA_SHADER_TESS_CTRL);
2911 if (result != VK_SUCCESS)
2912 goto out;
2913
2914 result = tu_empty_shader_create(dev, &dev->empty_tes, MESA_SHADER_TESS_EVAL);
2915 if (result != VK_SUCCESS)
2916 goto out;
2917
2918 result = tu_empty_shader_create(dev, &dev->empty_gs, MESA_SHADER_GEOMETRY);
2919 if (result != VK_SUCCESS)
2920 goto out;
2921
2922 result = tu_empty_fs_create(dev, &dev->empty_fs, false);
2923 if (result != VK_SUCCESS)
2924 goto out;
2925
2926 result = tu_empty_fs_create(dev, &dev->empty_fs_fdm, true);
2927 if (result != VK_SUCCESS)
2928 goto out;
2929
2930 return VK_SUCCESS;
2931
2932 out:
2933 if (dev->empty_tcs)
2934 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
2935 if (dev->empty_tes)
2936 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
2937 if (dev->empty_gs)
2938 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
2939 if (dev->empty_fs)
2940 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
2941 if (dev->empty_fs_fdm)
2942 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
2943 return result;
2944 }
2945
2946 void
2947 tu_destroy_empty_shaders(struct tu_device *dev)
2948 {
2949 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
2950 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
2951 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
2952 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
2953 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
2954 }
2955
2956 void
2957 tu_shader_destroy(struct tu_device *dev,
2958 struct tu_shader *shader)
2959 {
2960 tu_cs_finish(&shader->cs);
2961 TU_RMV(resource_destroy, dev, &shader->bo);
2962
2963 pthread_mutex_lock(&dev->pipeline_mutex);
2964 tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
2965 pthread_mutex_unlock(&dev->pipeline_mutex);
2966
2967 if (shader->pvtmem_bo)
2968 tu_bo_finish(dev, shader->pvtmem_bo);
2969
2970 if (shader->variant)
2971 ralloc_free((void *)shader->variant);
2972 if (shader->safe_const_variant)
2973 ralloc_free((void *)shader->safe_const_variant);
2974
2975 vk_free(&dev->vk.alloc, shader);
2976 }
2977