1 /*
2  * Copyright © 2019 Google LLC
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "tu_shader.h"
7 
8 #include "spirv/nir_spirv.h"
9 #include "util/mesa-sha1.h"
10 #include "nir/nir_xfb_info.h"
11 #include "vk_nir.h"
12 #include "vk_nir_convert_ycbcr.h"
13 #include "vk_pipeline.h"
14 #include "vk_util.h"
15 
16 #include "ir3/ir3_compiler.h"
17 #include "ir3/ir3_nir.h"
18 
19 #include "tu_device.h"
20 #include "tu_descriptor_set.h"
21 #include "tu_lrz.h"
22 #include "tu_pipeline.h"
23 #include "tu_rmv.h"
24 
25 #include <initializer_list>
26 
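/* Translate a pipeline stage's SPIR-V into NIR via the shared
 * vk_pipeline_shader_stage_to_nir() helper, then run the first round of
 * turnip/ir3 lowering and optimization passes on the result.
 */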
27 nir_shader *
28 tu_spirv_to_nir(struct tu_device *dev,
29                 void *mem_ctx,
30                 VkPipelineCreateFlags2KHR pipeline_flags,
31                 const VkPipelineShaderStageCreateInfo *stage_info,
32                 gl_shader_stage stage)
33 {
34    /* TODO these are made-up */
35    const struct spirv_to_nir_options spirv_options = {
36       /* ViewID is a sysval in geometry stages and an input in the FS */
37       .view_index_is_input = stage == MESA_SHADER_FRAGMENT,
38 
39       /* Use 16-bit math for RelaxedPrecision ALU ops */
40       .mediump_16bit_alu = true,
41 
42       .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
43       .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
44 
45       /* Accessed via stg/ldg */
46       .phys_ssbo_addr_format = nir_address_format_64bit_global,
47 
48       /* Accessed via the const register file */
49       .push_const_addr_format = nir_address_format_logical,
50 
51       /* Accessed via ldl/stl */
52       .shared_addr_format = nir_address_format_32bit_offset,
53 
54       /* Accessed via stg/ldg (not used with Vulkan?) */
55       .global_addr_format = nir_address_format_64bit_global,
56    };
57 
58    const nir_shader_compiler_options *nir_options =
59       ir3_get_compiler_options(dev->compiler);
60 
61    nir_shader *nir;
62    VkResult result =
63       vk_pipeline_shader_stage_to_nir(&dev->vk, pipeline_flags, stage_info,
64                                       &spirv_options, nir_options,
65                                       mem_ctx, &nir);
66    if (result != VK_SUCCESS)
67       return NULL;
68 
69    /* ir3 uses num_ubos and num_ssbos to track the number of *bindful*
70     * UBOs/SSBOs, but spirv_to_nir sets them to the total number of objects
71     * which is useless for us, so reset them here.
72     */
73    nir->info.num_ubos = 0;
74    nir->info.num_ssbos = 0;
75 
76    if (TU_DEBUG(NIR)) {
77       fprintf(stderr, "translated nir:\n");
78       nir_print_shader(nir, stderr);
79    }
80 
81    const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
82       .point_coord = true,
83    };
84    NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
85 
86    NIR_PASS_V(nir, nir_lower_global_vars_to_local);
87 
88    /* Older glslang missing bf6efd0316d8 ("SPV: Fix #2293: keep relaxed
89     * precision on arg passed to relaxed param") will pass function args through
90     * a highp temporary, so we need the nir_opt_find_array_copies() and a copy
91     * prop before we lower mediump vars, or you'll be unable to optimize out
92     * array copies after lowering.  We do this before splitting copies, since
93     * that works against nir_opt_find_array_copies().
94     */
95    NIR_PASS_V(nir, nir_opt_find_array_copies);
96    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
97    NIR_PASS_V(nir, nir_opt_dce);
98 
99    NIR_PASS_V(nir, nir_split_var_copies);
100    NIR_PASS_V(nir, nir_lower_var_copies);
101 
102    NIR_PASS_V(nir, nir_lower_mediump_vars, nir_var_function_temp | nir_var_shader_temp | nir_var_mem_shared);
103    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
104    NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
105 
106    NIR_PASS_V(nir, nir_lower_system_values);
107    NIR_PASS_V(nir, nir_lower_is_helper_invocation);
108 
109    ir3_optimize_loop(dev->compiler, nir);
110 
111    NIR_PASS_V(nir, nir_opt_conditional_discard);
112 
113    return nir;
114 }
115 
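/* Rewrite load_push_constant into a load_const_ir3 from the const file,
 * rebasing the offset against either the shared-consts base or this shader's
 * pushed range and converting it from bytes to dwords.
 */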
116 static void
117 lower_load_push_constant(struct tu_device *dev,
118                          nir_builder *b,
119                          nir_intrinsic_instr *instr,
120                          struct tu_shader *shader,
121                          const struct tu_pipeline_layout *layout)
122 {
123    uint32_t base = nir_intrinsic_base(instr);
124    assert(base % 4 == 0);
125 
126    if (tu6_shared_constants_enable(layout, dev->compiler)) {
127       /* All stages share the same range.  We could potentially add
128        * push_constant_offset to layout and apply it, but this is good for
129        * now.
130        */
131       base += dev->compiler->shared_consts_base_offset * 4;
132    } else {
133       assert(base >= shader->const_state.push_consts.lo * 4);
134       base -= shader->const_state.push_consts.lo * 4;
135    }
136 
137    nir_def *load =
138       nir_load_const_ir3(b, instr->num_components, instr->def.bit_size,
139                          nir_ushr_imm(b, instr->src[0].ssa, 2), .base = base);
140 
141    nir_def_replace(&instr->def, load);
142 }
143 
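/* Lower vulkan_resource_index to a (descriptor set, descriptor offset, stride
 * shift) vec3 that the bindless lowering below consumes. Dynamic UBO/SSBO
 * descriptors are redirected to the reserved descriptor set, with their base
 * taken from the const file or a driver UBO when independent sets make it
 * unknown at compile time.
 */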
144 static void
145 lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
146                             nir_intrinsic_instr *instr,
147                             struct tu_shader *shader,
148                             const struct tu_pipeline_layout *layout)
149 {
150    struct ir3_compiler *compiler = dev->compiler;
151    nir_def *vulkan_idx = instr->src[0].ssa;
152 
153    unsigned set = nir_intrinsic_desc_set(instr);
154    unsigned binding = nir_intrinsic_binding(instr);
155    struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
156    struct tu_descriptor_set_binding_layout *binding_layout =
157       &set_layout->binding[binding];
158    nir_def *base;
159 
160    if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
161       return;
162 
163    shader->active_desc_sets |= 1u << set;
164 
165    switch (binding_layout->type) {
166    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
167    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
168       int offset = 0;
169       for (unsigned i = 0; i < set; i++) {
170          if (shader->dynamic_descriptor_sizes[i] >= 0) {
171             offset += shader->dynamic_descriptor_sizes[i];
172          } else {
173             offset = -1;
174             break;
175          }
176       }
177 
178       if (offset < 0) {
179          /* With independent sets, we don't know
180           * layout->set[set].dynamic_offset_start until after link time, which
181           * with fast linking means after the shader is compiled. We have to
182           * get it from the const file instead.
183           */
184          base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
185          nir_def *dynamic_offset_start;
186          if (compiler->load_shader_consts_via_preamble) {
187             dynamic_offset_start =
188                ir3_load_driver_ubo(b, 1, &shader->const_state.dynamic_offsets_ubo, set);
189          } else {
190             dynamic_offset_start = nir_load_const_ir3(
191                b, 1, 32, nir_imm_int(b, 0),
192                .base = shader->const_state.dynamic_offset_loc + set);
193          }
194          base = nir_iadd(b, base, dynamic_offset_start);
195       } else {
196          base = nir_imm_int(b, (offset +
197             binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
198       }
199       assert(dev->physical_device->reserved_set_idx >= 0);
200       set = dev->physical_device->reserved_set_idx;
201       break;
202    }
203    default:
204       base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
205       break;
206    }
207 
208    unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
209    assert(util_is_power_of_two_nonzero(stride));
210    nir_def *shift = nir_imm_int(b, util_logbase2(stride));
211 
212    nir_def *def = nir_vec3(b, nir_imm_int(b, set),
213                                nir_iadd(b, base,
214                                         nir_ishl(b, vulkan_idx, shift)),
215                                shift);
216 
217    nir_def_replace(&instr->def, def);
218 }
219 
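/* Reindexing only needs to add the delta, scaled by the stride shift, to the
 * descriptor-offset channel of the vec3 produced above.
 */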
220 static void
221 lower_vulkan_resource_reindex(nir_builder *b, nir_intrinsic_instr *instr)
222 {
223    nir_def *old_index = instr->src[0].ssa;
224    nir_def *delta = instr->src[1].ssa;
225    nir_def *shift = nir_channel(b, old_index, 2);
226 
227    nir_def *new_index =
228       nir_vec3(b, nir_channel(b, old_index, 0),
229                nir_iadd(b, nir_channel(b, old_index, 1),
230                         nir_ishl(b, delta, shift)),
231                shift);
232 
233    nir_def_replace(&instr->def, new_index);
234 }
235 
236 static void
237 lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
238 {
239    nir_def *old_index = intrin->src[0].ssa;
240    /* Loading the descriptor happens as part of the load/store instruction so
241     * this is a no-op. We just need to turn the shift into an offset of 0.
242     */
243    nir_def *new_index =
244       nir_vec3(b, nir_channel(b, old_index, 0),
245                nir_channel(b, old_index, 1),
246                nir_imm_int(b, 0));
247    nir_def_replace(&intrin->def, new_index);
248 }
249 
250 static bool
251 lower_ssbo_ubo_intrinsic(struct tu_device *dev,
252                          nir_builder *b, nir_intrinsic_instr *intrin)
253 {
254    const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
255 
256    /* The bindless base is part of the instruction, which means that part of
257     * the "pointer" has to be constant. We solve this in the same way the blob
258     * does, by generating a bunch of if-statements. In the usual case where
259     * the descriptor set is constant we can skip that, though.
260     */
261 
262    unsigned buffer_src;
263    if (intrin->intrinsic == nir_intrinsic_store_ssbo) {
264       /* This has the value first */
265       buffer_src = 1;
266    } else {
267       buffer_src = 0;
268    }
269 
270    /* Don't lower non-bindless UBO loads of driver params */
271    if (intrin->src[buffer_src].ssa->num_components == 1)
272       return false;
273 
274    nir_scalar scalar_idx = nir_scalar_resolved(intrin->src[buffer_src].ssa, 0);
275    nir_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
276 
277    if (intrin->intrinsic == nir_intrinsic_load_ubo &&
278        dev->instance->allow_oob_indirect_ubo_loads) {
279       nir_scalar offset = nir_scalar_resolved(intrin->src[1].ssa, 0);
280       if (!nir_scalar_is_const(offset)) {
281          nir_intrinsic_set_range(intrin, ~0);
282       }
283    }
284 
285    /* Descriptor index has to be adjusted in the following cases:
286     *  - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
287     *    loads -- next-index descriptor will be able to do that;
288     *  - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
289     *    storage accesses of that size.
290     */
291    if ((dev->physical_device->info->a6xx.storage_16bit &&
292         !dev->physical_device->info->a6xx.has_isam_v &&
293         intrin->intrinsic == nir_intrinsic_load_ssbo &&
294         (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
295         intrin->def.bit_size > 16) ||
296        (dev->physical_device->info->a7xx.storage_8bit &&
297         ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
298          (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
299       descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
300    }
301 
302    nir_def *results[MAX_SETS] = { NULL };
303 
304    if (nir_scalar_is_const(scalar_idx)) {
305       nir_def *bindless =
306          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_scalar_as_uint(scalar_idx));
307       nir_src_rewrite(&intrin->src[buffer_src], bindless);
308       return true;
309    }
310 
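   /* Non-constant descriptor set: emit one branch per possible set, each with
    * its own copy of the intrinsic using that set's bindless base, and merge
    * the per-branch results with phis when popping the ifs below.
    */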
311    nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
312    for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) {
313       /* if (base_idx == i) { ... */
314       nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
315 
316       nir_def *bindless =
317          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = i);
318 
319       nir_intrinsic_instr *copy =
320          nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
321 
322       copy->num_components = intrin->num_components;
323 
324       for (unsigned src = 0; src < info->num_srcs; src++) {
325          if (src == buffer_src)
326             copy->src[src] = nir_src_for_ssa(bindless);
327          else
328             copy->src[src] = nir_src_for_ssa(intrin->src[src].ssa);
329       }
330 
331       for (unsigned idx = 0; idx < info->num_indices; idx++) {
332          copy->const_index[idx] = intrin->const_index[idx];
333       }
334 
335       if (info->has_dest) {
336          nir_def_init(&copy->instr, &copy->def,
337                       intrin->def.num_components,
338                       intrin->def.bit_size);
339          results[i] = &copy->def;
340       }
341 
342       nir_builder_instr_insert(b, &copy->instr);
343 
344       /* } else { ... */
345       nir_push_else(b, nif);
346    }
347 
348    nir_def *result =
349       nir_undef(b, intrin->def.num_components, intrin->def.bit_size);
350    for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) {
351       nir_pop_if(b, NULL);
352       if (info->has_dest)
353          result = nir_if_phi(b, results[i], result);
354    }
355 
356    if (info->has_dest)
357       nir_def_rewrite_uses(&intrin->def, result);
358    nir_instr_remove(&intrin->instr);
359    return true;
360 }
361 
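/* Build a bindless_resource_ir3 handle for a texture/sampler/image deref.
 * Input attachments instead yield a plain texture index, which lower_tex()
 * detects and turns into a texture offset.
 */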
362 static nir_def *
363 build_bindless(struct tu_device *dev, nir_builder *b,
364                nir_deref_instr *deref, bool is_sampler,
365                struct tu_shader *shader,
366                const struct tu_pipeline_layout *layout)
367 {
368    nir_variable *var = nir_deref_instr_get_variable(deref);
369 
370    unsigned set = var->data.descriptor_set;
371    unsigned binding = var->data.binding;
372    const struct tu_descriptor_set_binding_layout *bind_layout =
373       &layout->set[set].layout->binding[binding];
374 
375    /* input attachments use the non-bindless workaround */
376    if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT &&
377        !TU_DEBUG(DYNAMIC)) {
378       const struct glsl_type *glsl_type = glsl_without_array(var->type);
379       uint32_t idx = var->data.index * 2;
380 
381       BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx, (idx + bind_layout->array_size * 2) - 1);
382 
383       /* D24S8 workaround: stencil of D24S8 will be sampled as uint */
384       if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT)
385          idx += 1;
386 
387       if (deref->deref_type == nir_deref_type_var)
388          return nir_imm_int(b, idx);
389 
390       nir_def *arr_index = deref->arr.index.ssa;
391       return nir_iadd_imm(b, nir_imul_imm(b, arr_index, 2), idx);
392    }
393 
394    shader->active_desc_sets |= 1u << set;
395 
396    nir_def *desc_offset;
397    unsigned descriptor_stride;
398    unsigned offset = 0;
399    /* Samplers come second in combined image/sampler descriptors, see
400     * write_combined_image_sampler_descriptor().
401     */
402    if (is_sampler && bind_layout->type ==
403          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
404       offset = 1;
405    }
406    desc_offset =
407       nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
408                   offset);
409    descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
410 
411    if (deref->deref_type != nir_deref_type_var) {
412       assert(deref->deref_type == nir_deref_type_array);
413 
414       nir_def *arr_index = deref->arr.index.ssa;
415       desc_offset = nir_iadd(b, desc_offset,
416                              nir_imul_imm(b, arr_index, descriptor_stride));
417    }
418 
419    return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
420 }
421 
422 static void
423 lower_image_deref(struct tu_device *dev, nir_builder *b,
424                   nir_intrinsic_instr *instr, struct tu_shader *shader,
425                   const struct tu_pipeline_layout *layout)
426 {
427    nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
428    nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout);
429    nir_rewrite_image_intrinsic(instr, bindless, true);
430 }
431 
432 static bool
433 lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
434                 struct tu_device *dev,
435                 struct tu_shader *shader,
436                 const struct tu_pipeline_layout *layout)
437 {
438    switch (instr->intrinsic) {
439    case nir_intrinsic_load_push_constant:
440       lower_load_push_constant(dev, b, instr, shader, layout);
441       return true;
442 
443    case nir_intrinsic_load_vulkan_descriptor:
444       lower_load_vulkan_descriptor(b, instr);
445       return true;
446 
447    case nir_intrinsic_vulkan_resource_index:
448       lower_vulkan_resource_index(dev, b, instr, shader, layout);
449       return true;
450    case nir_intrinsic_vulkan_resource_reindex:
451       lower_vulkan_resource_reindex(b, instr);
452       return true;
453 
454    case nir_intrinsic_load_ubo:
455    case nir_intrinsic_load_ssbo:
456    case nir_intrinsic_store_ssbo:
457    case nir_intrinsic_ssbo_atomic:
458    case nir_intrinsic_ssbo_atomic_swap:
459    case nir_intrinsic_get_ssbo_size:
460       return lower_ssbo_ubo_intrinsic(dev, b, instr);
461 
462    case nir_intrinsic_image_deref_load:
463    case nir_intrinsic_image_deref_store:
464    case nir_intrinsic_image_deref_atomic:
465    case nir_intrinsic_image_deref_atomic_swap:
466    case nir_intrinsic_image_deref_size:
467    case nir_intrinsic_image_deref_samples:
468       lower_image_deref(dev, b, instr, shader, layout);
469       return true;
470 
471    case nir_intrinsic_load_frag_size_ir3:
472    case nir_intrinsic_load_frag_offset_ir3: {
473       if (!dev->compiler->load_shader_consts_via_preamble)
474          return false;
475 
476       enum ir3_driver_param param =
477          instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
478          IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
479 
480       unsigned offset = param - IR3_DP_FS_DYNAMIC;
481 
482       nir_def *view = instr->src[0].ssa;
483       nir_def *result =
484          ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
485                                       offset, view, nir_intrinsic_range(instr));
486 
487       nir_def_replace(&instr->def, result);
488       return true;
489    }
490    case nir_intrinsic_load_frag_invocation_count: {
491       if (!dev->compiler->load_shader_consts_via_preamble)
492          return false;
493 
494       nir_def *result =
495          ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
496                              IR3_DP_FS_FRAG_INVOCATION_COUNT -
497                              IR3_DP_FS_DYNAMIC);
498 
499       nir_def_replace(&instr->def, result);
500       return true;
501    }
502 
503    default:
504       return false;
505    }
506 }
507 
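/* For immutable YCbCr samplers, insert a nir_convert_ycbcr_to_rgb after the
 * texture instruction and rewrite its users to consume the converted result.
 */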
508 static void
509 lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
510                 nir_builder *builder,
511                 nir_tex_instr *tex)
512 {
513    int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
514    assert(deref_src_idx >= 0);
515    nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
516 
517    nir_variable *var = nir_deref_instr_get_variable(deref);
518    const struct tu_descriptor_set_layout *set_layout =
519       layout->set[var->data.descriptor_set].layout;
520    const struct tu_descriptor_set_binding_layout *binding =
521       &set_layout->binding[var->data.binding];
522    const struct vk_ycbcr_conversion_state *ycbcr_samplers =
523       tu_immutable_ycbcr_samplers(set_layout, binding);
524 
525    if (!ycbcr_samplers)
526       return;
527 
528    /* For the following instructions, we don't apply any change */
529    if (tex->op == nir_texop_txs ||
530        tex->op == nir_texop_query_levels ||
531        tex->op == nir_texop_lod)
532       return;
533 
534    assert(tex->texture_index == 0);
535    unsigned array_index = 0;
536    if (deref->deref_type != nir_deref_type_var) {
537       assert(deref->deref_type == nir_deref_type_array);
538       if (!nir_src_is_const(deref->arr.index))
539          return;
540       array_index = nir_src_as_uint(deref->arr.index);
541       array_index = MIN2(array_index, binding->array_size - 1);
542    }
543    const struct vk_ycbcr_conversion_state *ycbcr_sampler = ycbcr_samplers + array_index;
544 
545    if (ycbcr_sampler->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
546       return;
547 
548    /* Skip if not actually a YCbCr format.  CtsGraphics, for example, tries to create
549     * YcbcrConversions for RGB formats.
550     */
551    if (!vk_format_get_ycbcr_info(ycbcr_sampler->format))
552       return;
553 
554    builder->cursor = nir_after_instr(&tex->instr);
555 
556    uint8_t bits = vk_format_get_component_bits(ycbcr_sampler->format,
557                                                UTIL_FORMAT_COLORSPACE_RGB,
558                                                PIPE_SWIZZLE_X);
559    uint32_t bpcs[3] = {bits, bits, bits}; /* TODO: use the right bpc for each channel? */
560    nir_def *result = nir_convert_ycbcr_to_rgb(builder,
561                                               ycbcr_sampler->ycbcr_model,
562                                               ycbcr_sampler->ycbcr_range,
563                                               &tex->def,
564                                               bpcs);
565    nir_def_rewrite_uses_after(&tex->def, result,
566                               result->parent_instr);
567 
568    builder->cursor = nir_before_instr(&tex->instr);
569 }
570 
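/* Rewrite texture/sampler derefs into bindless handles (or a plain texture
 * offset for the non-bindless input-attachment path) and apply any YCbCr
 * conversion.
 */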
571 static bool
572 lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
573           struct tu_shader *shader, const struct tu_pipeline_layout *layout)
574 {
575    lower_tex_ycbcr(layout, b, tex);
576 
577    int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
578    if (sampler_src_idx >= 0) {
579       nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
580       nir_def *bindless = build_bindless(dev, b, deref, true, shader, layout);
581       nir_src_rewrite(&tex->src[sampler_src_idx].src, bindless);
582       tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
583    }
584 
585    int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
586    if (tex_src_idx >= 0) {
587       nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
588       nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout);
589       nir_src_rewrite(&tex->src[tex_src_idx].src, bindless);
590       tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
591 
592       /* for the input attachment case: */
593       if (bindless->parent_instr->type != nir_instr_type_intrinsic)
594          tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
595    }
596 
597    return true;
598 }
599 
600 struct lower_instr_params {
601    struct tu_device *dev;
602    struct tu_shader *shader;
603    const struct tu_pipeline_layout *layout;
604 };
605 
606 static bool
607 lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
608 {
609    struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
610    b->cursor = nir_before_instr(instr);
611    switch (instr->type) {
612    case nir_instr_type_tex:
613       return lower_tex(b, nir_instr_as_tex(instr), params->dev, params->shader, params->layout);
614    case nir_instr_type_intrinsic:
615       return lower_intrinsic(b, nir_instr_as_intrinsic(instr), params->dev, params->shader, params->layout);
616    default:
617       return false;
618    }
619 }
620 
621 /* Since we always push inline uniforms into constant memory, lower loads of
622  * them to load_uniform which turns into constant memory loads.
623  */
624 static bool
625 lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
626 {
627    if (intrin->intrinsic != nir_intrinsic_load_ubo)
628       return false;
629 
630    struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
631    struct tu_shader *shader = params->shader;
632    const struct tu_pipeline_layout *layout = params->layout;
633 
634    nir_binding binding = nir_chase_binding(intrin->src[0]);
635 
636    if (!binding.success)
637       return false;
638 
639    struct tu_descriptor_set_layout *set_layout = layout->set[binding.desc_set].layout;
640    struct tu_descriptor_set_binding_layout *binding_layout =
641       &set_layout->binding[binding.binding];
642 
643    if (binding_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
644       return false;
645 
646    /* lookup the const offset of the inline UBO */
647    struct tu_const_state *const_state = &shader->const_state;
648 
649    unsigned base = UINT_MAX;
650    unsigned range;
651    bool use_load = false;
652    bool use_ldg_k =
653       params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
654 
655    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
656       if (const_state->ubos[i].base == binding.desc_set &&
657           const_state->ubos[i].offset == binding_layout->offset) {
658          range = const_state->ubos[i].size_vec4 * 4;
659          if (use_ldg_k) {
660             base = i * 2;
661          } else {
662             use_load = const_state->ubos[i].push_address;
663             base = const_state->ubos[i].const_offset_vec4 * 4;
664          }
665          break;
666       }
667    }
668 
669    if (base == UINT_MAX) {
670       /* Assume we're loading out-of-bounds from a 0-sized inline uniform
671        * filtered out below.
672        */
673       nir_def_rewrite_uses(&intrin->def,
674                                nir_undef(b, intrin->num_components,
675                                              intrin->def.bit_size));
676       return true;
677    }
678 
679    nir_def *offset = intrin->src[1].ssa;
680 
681    b->cursor = nir_before_instr(&intrin->instr);
682    nir_def *val;
683 
684    if (use_load || use_ldg_k) {
685       nir_def *base_addr;
686       if (use_ldg_k) {
687          base_addr = ir3_load_driver_ubo(b, 2,
688                                          &params->shader->const_state.inline_uniforms_ubo,
689                                          base);
690       } else {
691          base_addr =
692             nir_load_const_ir3(b, 2, 32, nir_imm_int(b, 0), .base = base);
693       }
694       val = nir_load_global_ir3(b, intrin->num_components,
695                                 intrin->def.bit_size,
696                                 base_addr, nir_ishr_imm(b, offset, 2),
697                                 .access =
698                                  (enum gl_access_qualifier)(
699                                     (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) |
700                                     ACCESS_CAN_SPECULATE),
701                                 .align_mul = 16,
702                                 .align_offset = 0,
703                                 .range_base = 0,
704                                 .range = range);
705    } else {
706       val =
707          nir_load_const_ir3(b, intrin->num_components, intrin->def.bit_size,
708                             nir_ishr_imm(b, offset, 2), .base = base);
709    }
710 
711    nir_def_replace(&intrin->def, val);
712    return true;
713 }
714 
715 /* Figure out the range of push constants that we're actually going to push to
716  * the shader, and tell the backend to reserve this range when pushing UBO
717  * constants.
718  */
719 
720 static void
721 gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
722 {
723    uint32_t min = UINT32_MAX, max = 0;
724    nir_foreach_function_impl(impl, shader) {
725       nir_foreach_block(block, impl) {
726          nir_foreach_instr_safe(instr, block) {
727             if (instr->type != nir_instr_type_intrinsic)
728                continue;
729 
730             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
731             if (intrin->intrinsic != nir_intrinsic_load_push_constant)
732                continue;
733 
734             uint32_t base = nir_intrinsic_base(intrin);
735             uint32_t range = nir_intrinsic_range(intrin);
736             min = MIN2(min, base);
737             max = MAX2(max, base + range);
738             break;
739          }
740       }
741    }
742 
743    if (min >= max) {
744       tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
745       return;
746    }
747 
748    /* CP_LOAD_STATE OFFSET and NUM_UNIT for SHARED_CONSTS are in units of
749     * dwords while loading regular consts is in units of vec4's.
750     * So we unify the unit to dwords for tu_push_constant_range here, and
751     * we must use the correct unit when emitting.
752     *
753     * Note there's an alignment requirement of 16 dwords on OFFSET. Expand
754     * the range and change units accordingly.
755     */
756    tu_shader->const_state.push_consts.lo = (min / 4) / 4 * 4;
757    tu_shader->const_state.push_consts.dwords =
758       align(max, 16) / 4 - tu_shader->const_state.push_consts.lo;
759 }
760 
761 static bool
762 shader_uses_push_consts(nir_shader *shader)
763 {
764    nir_foreach_function_impl (impl, shader) {
765       nir_foreach_block (block, impl) {
766          nir_foreach_instr_safe (instr, block) {
767             if (instr->type != nir_instr_type_intrinsic)
768                continue;
769 
770             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
771             if (intrin->intrinsic == nir_intrinsic_load_push_constant)
772                return true;
773          }
774       }
775    }
776    return false;
777 }
778 
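/* Main descriptor/push-constant lowering entry point: lay out the push
 * constant range, dynamic-descriptor offsets and inline uniform blocks in
 * const space, then run the inline-UBO and descriptor lowering passes above.
 */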
779 static bool
780 tu_lower_io(nir_shader *shader, struct tu_device *dev,
781             struct tu_shader *tu_shader,
782             const struct tu_pipeline_layout *layout,
783             unsigned *reserved_consts_vec4_out)
784 {
785    tu_shader->const_state.push_consts = (struct tu_push_constant_range) {
786       .lo = 0,
787       .dwords = layout->push_constant_size / 4,
788       .type = tu_push_consts_type(layout, dev->compiler),
789    };
790 
791    if (tu_shader->const_state.push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
792       gather_push_constants(shader, tu_shader);
793    } else if (tu_shader->const_state.push_consts.type ==
794             IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
795       /* Disable pushing constants for this stage if none were loaded in the
796        * shader.  If no stage loads its declared push constants, as
797        * is often the case under zink, then we could additionally skip
798        * emitting REG_A7XX_HLSQ_SHARED_CONSTS_IMM entirely.
799        */
800       if (!shader_uses_push_consts(shader))
801          tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
802    }
803 
804    struct tu_const_state *const_state = &tu_shader->const_state;
805    unsigned reserved_consts_vec4 =
806       align(DIV_ROUND_UP(const_state->push_consts.dwords, 4),
807             dev->compiler->const_upload_unit);
808 
809    bool unknown_dynamic_size = false;
810    bool unknown_dynamic_offset = false;
811    for (unsigned i = 0; i < layout->num_sets; i++) {
812       if (tu_shader->dynamic_descriptor_sizes[i] == -1) {
813          unknown_dynamic_size = true;
814       } else if (unknown_dynamic_size &&
815                  tu_shader->dynamic_descriptor_sizes[i] > 0) {
816          /* If there is an unknown size followed by a known size, then we may
817           * need to dynamically determine the offset when linking.
818           */
819          unknown_dynamic_offset = true;
820       }
821    }
822 
823    if (unknown_dynamic_offset) {
824       const_state->dynamic_offset_loc = reserved_consts_vec4 * 4;
825       assert(dev->physical_device->reserved_set_idx >= 0);
826       reserved_consts_vec4 += DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4);
827    } else {
828       const_state->dynamic_offset_loc = UINT32_MAX;
829    }
830 
831    /* Reserve space for inline uniforms, so we can always load them from
832     * constants and not set up a UBO descriptor for them.
833     */
834    bool use_ldg_k =
835       dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
836    for (unsigned set = 0; set < layout->num_sets; set++) {
837       const struct tu_descriptor_set_layout *desc_layout =
838          layout->set[set].layout;
839 
840       if (!desc_layout || !desc_layout->has_inline_uniforms)
841          continue;
842 
843       for (unsigned b = 0; b < desc_layout->binding_count; b++) {
844          const struct tu_descriptor_set_binding_layout *binding =
845             &desc_layout->binding[b];
846 
847          if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
848             continue;
849          if (!(binding->shader_stages &
850                mesa_to_vk_shader_stage(shader->info.stage)))
851             continue;
852 
853          /* If we don't know the size at compile time due to a variable
854           * descriptor count, then with descriptor buffers we cannot know
855           * how much space the real inline uniform has. In this case we fall
856           * back to pushing the address and using ldg, which is slower than
857           * setting up a descriptor. Setting up our own descriptor with
858           * descriptor_buffer would also be painful: it has to be done on the
859           * GPU, and it doesn't avoid the UBO getting pushed anyway and
860           * faulting if an out-of-bounds access is hidden behind an if and not
861           * dynamically executed. Given the small max size, there shouldn't be
862           * much reason to use a variable size anyway.
863           */
864          bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
865             b == desc_layout->binding_count - 1;
866 
867          if (push_address) {
868             perf_debug(dev,
869                        "falling back to ldg for variable-sized inline "
870                        "uniform block");
871          }
872 
873          assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
874          unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
875          const_state->ubos[const_state->num_inline_ubos++] = (struct tu_inline_ubo) {
876             .base = set,
877             .offset = binding->offset,
878             .push_address = push_address,
879             .const_offset_vec4 = reserved_consts_vec4,
880             .size_vec4 = size_vec4,
881          };
882 
883          if (!use_ldg_k)
884             reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
885       }
886    }
887 
888    *reserved_consts_vec4_out = reserved_consts_vec4;
889 
890    struct lower_instr_params params = {
891       .dev = dev,
892       .shader = tu_shader,
893       .layout = layout,
894    };
895 
896    bool progress = false;
897    if (const_state->num_inline_ubos) {
898       progress |= nir_shader_intrinsics_pass(shader, lower_inline_ubo,
899                                                nir_metadata_none,
900                                                &params);
901    }
902 
903    progress |= nir_shader_instructions_pass(shader,
904                                             lower_instr,
905                                             nir_metadata_none,
906                                             &params);
907 
908    /* Remove now-unused variables so that when we gather the shader info later
909     * they won't be counted.
910     */
911 
912    if (progress)
913       nir_opt_dce(shader);
914 
915    progress |=
916       nir_remove_dead_variables(shader,
917                                 nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo,
918                                 NULL);
919 
920    return progress;
921 }
922 
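/* Fragment density map (FDM) lowering: rewrite load_frag_size, and optionally
 * load_frag_coord, in terms of the per-view load_frag_size_ir3 and
 * load_frag_offset_ir3 sysvals (indexed by ViewIndex when multiview is
 * enabled).
 */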
923 struct lower_fdm_options {
924    unsigned num_views;
925    bool adjust_fragcoord;
926    bool multiview;
927 };
928 
929 static bool
930 lower_fdm_filter(const nir_instr *instr, const void *data)
931 {
932    const struct lower_fdm_options *options =
933       (const struct lower_fdm_options *)data;
934 
935    if (instr->type != nir_instr_type_intrinsic)
936       return false;
937 
938    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
939    return intrin->intrinsic == nir_intrinsic_load_frag_size ||
940       (intrin->intrinsic == nir_intrinsic_load_frag_coord &&
941        options->adjust_fragcoord);
942 }
943 
944 static nir_def *
945 lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
946 {
947    const struct lower_fdm_options *options =
948       (const struct lower_fdm_options *)data;
949 
950    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
951 
952    nir_def *view;
953    if (options->multiview) {
954       nir_variable *view_var =
955          nir_find_variable_with_location(b->shader, nir_var_shader_in,
956                                          VARYING_SLOT_VIEW_INDEX);
957 
958       if (view_var == NULL) {
959          view_var = nir_variable_create(b->shader, nir_var_shader_in,
960                                         glsl_int_type(), NULL);
961          view_var->data.location = VARYING_SLOT_VIEW_INDEX;
962          view_var->data.interpolation = INTERP_MODE_FLAT;
963          view_var->data.driver_location = b->shader->num_inputs++;
964       }
965 
966       view = nir_load_var(b, view_var);
967    } else {
968       view = nir_imm_int(b, 0);
969    }
970 
971    nir_def *frag_size =
972       nir_load_frag_size_ir3(b, view, .range = options->num_views);
973 
974    if (intrin->intrinsic == nir_intrinsic_load_frag_coord) {
975       nir_def *frag_offset =
976          nir_load_frag_offset_ir3(b, view, .range = options->num_views);
977       nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
978       nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
979       xy = nir_fmul(b, nir_fsub(b, xy, frag_offset), nir_i2f32(b, frag_size));
980       return nir_vec4(b,
981                       nir_channel(b, xy, 0),
982                       nir_channel(b, xy, 1),
983                       nir_channel(b, unscaled_coord, 2),
984                       nir_channel(b, unscaled_coord, 3));
985    }
986 
987    assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
988    return frag_size;
989 }
990 
991 static bool
992 tu_nir_lower_fdm(nir_shader *shader, const struct lower_fdm_options *options)
993 {
994    return nir_shader_lower_instructions(shader, lower_fdm_filter,
995                                         lower_fdm_instr, (void *)options);
996 }
997 
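/* Size/align callback for vector/scalar types, with booleans stored as 32-bit
 * values; presumably passed to a NIR explicit-type lowering pass for shared
 * memory elsewhere in this file.
 */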
998 static void
999 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
1000 {
1001    assert(glsl_type_is_vector_or_scalar(type));
1002 
1003    unsigned comp_size =
1004       glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
1005    unsigned length = glsl_get_vector_elements(type);
1006    *size = comp_size * length;
1007    *align = comp_size;
1008 }
1009 
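/* Convert NIR transform-feedback info into ir3_stream_output_info, remapping
 * varying slots to driver locations via the shader's output variables.
 */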
1010 static void
1011 tu_gather_xfb_info(nir_shader *nir, struct ir3_stream_output_info *info)
1012 {
1013    nir_shader_gather_xfb_info(nir);
1014 
1015    if (!nir->xfb_info)
1016       return;
1017 
1018    nir_xfb_info *xfb = nir->xfb_info;
1019 
1020    uint8_t output_map[VARYING_SLOT_TESS_MAX];
1021    memset(output_map, 0, sizeof(output_map));
1022 
1023    nir_foreach_shader_out_variable(var, nir) {
1024       unsigned slots = nir_variable_count_slots(var, var->type);
1025       for (unsigned i = 0; i < slots; i++)
1026          output_map[var->data.location + i] = var->data.driver_location + i;
1027    }
1028 
1029    assert(xfb->output_count <= IR3_MAX_SO_OUTPUTS);
1030    info->num_outputs = xfb->output_count;
1031 
1032    for (int i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1033       info->stride[i] = xfb->buffers[i].stride / 4;
1034       info->buffer_to_stream[i] = xfb->buffer_to_stream[i];
1035    }
1036 
1037    info->streams_written = xfb->streams_written;
1038 
1039    for (int i = 0; i < xfb->output_count; i++) {
1040       info->output[i].register_index = output_map[xfb->outputs[i].location];
1041       info->output[i].start_component = xfb->outputs[i].component_offset;
1042       info->output[i].num_components =
1043                            util_bitcount(xfb->outputs[i].component_mask);
1044       info->output[i].output_buffer  = xfb->outputs[i].buffer;
1045       info->output[i].dst_offset = xfb->outputs[i].offset / 4;
1046       info->output[i].stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
1047    }
1048 }
1049 
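/* Size, in dwords, of the CP_LOAD_STATE payload for the variant's immediates,
 * clamped so that it never extends past the variant's constlen.
 */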
1050 static uint32_t
1051 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
1052 {
1053    const struct ir3_const_state *const_state = ir3_const_state(xs);
1054    uint32_t base = const_state->offsets.immediate;
1055    int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
1056 
1057    /* truncate size to avoid writing constants that the shader
1058     * does not use:
1059     */
1060    size = MIN2(size + base, xs->constlen) - base;
1061 
1062    return MAX2(size, 0) * 4;
1063 }
1064 
1065 /* We allocate fixed-length substreams for shader state; however, some
1066  * parts of the state may have unbounded length. Their additional space
1067  * requirements should be calculated here.
1068  */
1069 static uint32_t
1070 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
1071 {
1072    const struct ir3_const_state *const_state = ir3_const_state(xs);
1073 
1074    uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
1075 
1076    /* Variable number of UBO upload ranges. */
1077    size += 4 * const_state->ubo_state.num_enabled;
1078 
1079    /* Variable number of dwords for the primitive map */
1080    size += xs->input_size;
1081 
1082    size += xs->constant_data_size / 4;
1083 
1084    return size;
1085 }
1086 
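/* Per-stage register offsets used by tu6_emit_xs(), indexed by
 * gl_shader_stage.
 */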
1087 static const struct xs_config {
1088    uint16_t reg_sp_xs_config;
1089    uint16_t reg_sp_xs_instrlen;
1090    uint16_t reg_sp_xs_first_exec_offset;
1091    uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
1092    uint16_t reg_sp_xs_vgpr_config;
1093 } xs_config[] = {
1094    [MESA_SHADER_VERTEX] = {
1095       REG_A6XX_SP_VS_CONFIG,
1096       REG_A6XX_SP_VS_INSTRLEN,
1097       REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
1098       REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
1099       REG_A7XX_SP_VS_VGPR_CONFIG,
1100    },
1101    [MESA_SHADER_TESS_CTRL] = {
1102       REG_A6XX_SP_HS_CONFIG,
1103       REG_A6XX_SP_HS_INSTRLEN,
1104       REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
1105       REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
1106       REG_A7XX_SP_HS_VGPR_CONFIG,
1107    },
1108    [MESA_SHADER_TESS_EVAL] = {
1109       REG_A6XX_SP_DS_CONFIG,
1110       REG_A6XX_SP_DS_INSTRLEN,
1111       REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
1112       REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
1113       REG_A7XX_SP_DS_VGPR_CONFIG,
1114    },
1115    [MESA_SHADER_GEOMETRY] = {
1116       REG_A6XX_SP_GS_CONFIG,
1117       REG_A6XX_SP_GS_INSTRLEN,
1118       REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
1119       REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
1120       REG_A7XX_SP_GS_VGPR_CONFIG,
1121    },
1122    [MESA_SHADER_FRAGMENT] = {
1123       REG_A6XX_SP_FS_CONFIG,
1124       REG_A6XX_SP_FS_INSTRLEN,
1125       REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
1126       REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
1127       REG_A7XX_SP_FS_VGPR_CONFIG,
1128    },
1129    [MESA_SHADER_COMPUTE] = {
1130       REG_A6XX_SP_CS_CONFIG,
1131       REG_A6XX_SP_CS_INSTRLEN,
1132       REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
1133       REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
1134       REG_A7XX_SP_CS_VGPR_CONFIG,
1135    },
1136 };
1137 
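/* Emit the per-stage shader state: SP_xS_CTRL_REG0, instruction length, the
 * binary and private-memory addresses, an instruction-cache preload on a6xx,
 * the immediates, and the UBO descriptor for inline constant data.
 */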
1138 void
1139 tu6_emit_xs(struct tu_cs *cs,
1140             gl_shader_stage stage, /* xs->type, but xs may be NULL */
1141             const struct ir3_shader_variant *xs,
1142             const struct tu_pvtmem_config *pvtmem,
1143             uint64_t binary_iova)
1144 {
1145    const struct xs_config *cfg = &xs_config[stage];
1146 
1147    if (!xs) {
1148       /* shader stage disabled */
1149       return;
1150    }
1151 
1152    enum a6xx_threadsize thrsz =
1153       xs->info.double_threadsize ? THREAD128 : THREAD64;
1154    switch (stage) {
1155    case MESA_SHADER_VERTEX:
1156       tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
1157                .halfregfootprint = xs->info.max_half_reg + 1,
1158                .fullregfootprint = xs->info.max_reg + 1,
1159                .branchstack = ir3_shader_branchstack_hw(xs),
1160                .mergedregs = xs->mergedregs,
1161                .earlypreamble = xs->early_preamble,
1162       ));
1163       break;
1164    case MESA_SHADER_TESS_CTRL:
1165       tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
1166                .halfregfootprint = xs->info.max_half_reg + 1,
1167                .fullregfootprint = xs->info.max_reg + 1,
1168                .branchstack = ir3_shader_branchstack_hw(xs),
1169                .earlypreamble = xs->early_preamble,
1170       ));
1171       break;
1172    case MESA_SHADER_TESS_EVAL:
1173       tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
1174                .halfregfootprint = xs->info.max_half_reg + 1,
1175                .fullregfootprint = xs->info.max_reg + 1,
1176                .branchstack = ir3_shader_branchstack_hw(xs),
1177                .earlypreamble = xs->early_preamble,
1178       ));
1179       break;
1180    case MESA_SHADER_GEOMETRY:
1181       tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
1182                .halfregfootprint = xs->info.max_half_reg + 1,
1183                .fullregfootprint = xs->info.max_reg + 1,
1184                .branchstack = ir3_shader_branchstack_hw(xs),
1185                .earlypreamble = xs->early_preamble,
1186       ));
1187       break;
1188    case MESA_SHADER_FRAGMENT:
1189       tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
1190                .halfregfootprint = xs->info.max_half_reg + 1,
1191                .fullregfootprint = xs->info.max_reg + 1,
1192                .branchstack = ir3_shader_branchstack_hw(xs),
1193                .threadsize = thrsz,
1194                .varying = xs->total_in != 0,
1195                .lodpixmask = xs->need_full_quad,
1196                /* unknown bit, seems unnecessary */
1197                .unk24 = true,
1198                .pixlodenable = xs->need_pixlod,
1199                .earlypreamble = xs->early_preamble,
1200                .mergedregs = xs->mergedregs,
1201       ));
1202       break;
1203    case MESA_SHADER_COMPUTE:
1204       thrsz = cs->device->physical_device->info->a6xx
1205             .supports_double_threadsize ? thrsz : THREAD128;
1206       tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
1207                .halfregfootprint = xs->info.max_half_reg + 1,
1208                .fullregfootprint = xs->info.max_reg + 1,
1209                .branchstack = ir3_shader_branchstack_hw(xs),
1210                .threadsize = thrsz,
1211                .earlypreamble = xs->early_preamble,
1212                .mergedregs = xs->mergedregs,
1213       ));
1214       break;
1215    default:
1216       unreachable("bad shader stage");
1217    }
1218 
1219    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
1220    tu_cs_emit(cs, xs->instrlen);
1221 
1222    /* emit program binary & private memory layout
1223     * binary_iova should be aligned to 1 instrlen unit (128 bytes)
1224     */
1225 
1226    assert((binary_iova & 0x7f) == 0);
1227    assert((pvtmem->iova & 0x1f) == 0);
1228 
1229    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
1230    tu_cs_emit(cs, 0);
1231    tu_cs_emit_qw(cs, binary_iova);
1232    tu_cs_emit(cs,
1233               A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
1234    tu_cs_emit_qw(cs, pvtmem->iova);
1235    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
1236                   COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
1237 
1238    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
1239    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
1240 
1241    if (cs->device->physical_device->info->chip >= A7XX) {
1242       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vgpr_config, 1);
1243       tu_cs_emit(cs, 0);
1244    }
1245 
1246    if (cs->device->physical_device->info->chip == A6XX) {
1247       uint32_t shader_preload_size =
1248          MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
1249 
1250       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1251       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1252                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
1253                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1254                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1255                      CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
1256       tu_cs_emit_qw(cs, binary_iova);
1257    }
1258 
1259    /* emit immediates */
1260 
1261    const struct ir3_const_state *const_state = ir3_const_state(xs);
1262    uint32_t base = const_state->offsets.immediate;
1263    unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
1264 
1265    if (immediate_size > 0) {
1266       assert(!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
1267       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
1268       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1269                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1270                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1271                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1272                  CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
1273       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1274       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1275 
1276       tu_cs_emit_array(cs, const_state->immediates, immediate_size);
1277    }
1278 
1279    if (const_state->consts_ubo.idx != -1) {
1280       uint64_t iova = binary_iova + xs->info.constant_data_offset;
1281       uint32_t offset = const_state->consts_ubo.idx;
1282 
1283       /* Upload UBO state for the constant data. */
1284       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1285       tu_cs_emit(cs,
1286                  CP_LOAD_STATE6_0_DST_OFF(offset) |
1287                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
1288                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1289                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1290                  CP_LOAD_STATE6_0_NUM_UNIT(1));
1291       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1292       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1293       int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
1294       tu_cs_emit_qw(cs,
1295                     iova |
1296                     (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
1297 
1298       /* Upload the constant data to the const file if needed. */
1299       const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
1300 
1301       if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1302          for (int i = 0; i < ubo_state->num_enabled; i++) {
1303             if (ubo_state->range[i].ubo.block != offset ||
1304                 ubo_state->range[i].ubo.bindless) {
1305                continue;
1306             }
1307 
1308             uint32_t start = ubo_state->range[i].start;
1309             uint32_t end = ubo_state->range[i].end;
1310             uint32_t size = MIN2(end - start,
1311                                  (16 * xs->constlen) - ubo_state->range[i].offset);
1312 
1313             tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1314             tu_cs_emit(cs,
1315                      CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
1316                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1317                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1318                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1319                      CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
1320             tu_cs_emit_qw(cs, iova + start);
1321          }
1322       }
1323    }
1324 
1325    /* emit statically-known FS driver param */
1326    if (stage == MESA_SHADER_FRAGMENT && const_state->driver_params_ubo.size > 0) {
1327       uint32_t data[4] = {xs->info.double_threadsize ? 128 : 64, 0, 0, 0};
1328       uint32_t size = ARRAY_SIZE(data);
1329 
1330       /* A7XX TODO: Emit data via sub_cs instead of NOP */
1331       uint64_t iova = tu_cs_emit_data_nop(cs, data, size, 4);
1332       uint32_t base = const_state->driver_params_ubo.idx;
1333 
1334       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1335       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1336                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
1337                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1338                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1339                  CP_LOAD_STATE6_0_NUM_UNIT(1));
1340       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1341       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1342       int size_vec4s = DIV_ROUND_UP(size, 4);
1343       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
1344    } else if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
1345       uint32_t base = const_state->offsets.driver_param;
1346       int32_t size = DIV_ROUND_UP(MAX2(const_state->num_driver_params, 4), 4);
1347       size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
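      /* size here is in vec4 units: e.g. 6 driver params round up to 2 vec4s,
       * and the MIN2/MAX2 clamp drops the upload entirely when the driver
       * param range starts at or past the variant's constlen.
       */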
1348 
1349       if (size > 0) {
1350          tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + 4);
1351          tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1352                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1353                     CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1354                     CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1355                     CP_LOAD_STATE6_0_NUM_UNIT(size));
1356          tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1357          tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1358 
1359          tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
1360          tu_cs_emit(cs, 0);
1361          tu_cs_emit(cs, 0);
1362          tu_cs_emit(cs, 0);
1363       }
1364    }
1365 }
1366 
1367 template <chip CHIP>
1368 static void
1369 tu6_emit_cs_config(struct tu_cs *cs,
1370                    const struct ir3_shader_variant *v,
1371                    const struct tu_pvtmem_config *pvtmem,
1372                    uint64_t binary_iova)
1373 {
1374    bool shared_consts_enable =
1375       ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1376    tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1377 
1378    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1379          .cs_state = true,
1380          .cs_ibo = true,
1381          .cs_shared_const = shared_consts_enable));
1382 
1383    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_COMPUTE, v);
1384    tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
1385 
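   /* The SHARED_SIZE field appears to be in 1 KB granules minus one, with a
    * minimum of 1 (so at least 2 KB is always reserved): e.g. a 4 KB shared
    * allocation gives MAX2((4096 - 1) / 1024, 1) == 3.
    */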
1386    uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
1387    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
1388    tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
1389                   A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
1390 
1391    if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_lpac) {
1392       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
1393       tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
1394                      A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
1395    }
1396 
1397    uint32_t local_invocation_id =
1398       ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
1399    uint32_t work_group_id =
1400       ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
1401 
1402    /*
1403     * Devices that do not support double threadsize take the threadsize from
1404     * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
1405     * which is always set to THREAD128.
1406     */
1407    enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
1408    enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
1409       .supports_double_threadsize ? thrsz : THREAD128;
1410    if (CHIP == A6XX) {
1411       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
1412       tu_cs_emit(cs,
1413                  A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1414                  A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1415                  A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1416                  A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1417       tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1418                      A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
1419       if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
1420          tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1421          tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
1422       }
1423 
1424       if (cs->device->physical_device->info->a6xx.has_lpac) {
1425          tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
1426          tu_cs_emit(cs,
1427                     A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1428                     A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1429                     A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1430                     A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1431          tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1432                   A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
1433       }
1434    } else {
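      /* The A7XX workgroup raster tile is 4 invocations wide; the tile height
       * picked below presumably just needs to cover the workgroup reasonably,
       * using taller tiles as local_size[1] becomes less divisible (the exact
       * HW constraint isn't documented here).
       */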
1435       unsigned tile_height = (v->local_size[1] % 8 == 0)   ? 3
1436                              : (v->local_size[1] % 4 == 0) ? 5
1437                              : (v->local_size[1] % 2 == 0) ? 9
1438                                                            : 17;
1439       tu_cs_emit_regs(
1440          cs, HLSQ_CS_CNTL_1(CHIP,
1441                    .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs,
1442                    .workgrouprastorderzfirsten = true,
1443                    .wgtilewidth = 4, .wgtileheight = tile_height));
1444 
1445       tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
1446 
1447       tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 1);
1448       tu_cs_emit(cs, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1449                         A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1450                         A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1451                         A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1452 
1453       tu_cs_emit_regs(cs,
1454                       SP_CS_CNTL_1(CHIP,
1455                         .linearlocalidregid = regid(63, 0),
1456                         .threadsize = thrsz_cs,
1457                         .workitemrastorder =
1458                            v->cs.force_linear_dispatch ?
1459                            WORKITEMRASTORDER_LINEAR :
1460                            WORKITEMRASTORDER_TILED, ));
1461 
1462       tu_cs_emit_regs(
1463          cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1,
1464                                      .localsizey = v->local_size[1] - 1,
1465                                      .localsizez = v->local_size[2] - 1, ));
1466 
1467       tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes this is 0x08000000
1468    }
1469 }
1470 
1471 #define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
1472 
1473 static void
1474 tu6_emit_vfd_dest(struct tu_cs *cs,
1475                   const struct ir3_shader_variant *vs)
1476 {
1477    int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
1478    uint32_t attr_count = 0;
1479 
1480    for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++)
1481       input_for_attr[i] = -1;
1482 
1483    for (unsigned i = 0; i < vs->inputs_count; i++) {
1484       if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0))
1485          continue;
1486 
1487       assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0);
1488       unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0;
1489       input_for_attr[loc] = i;
1490       attr_count = MAX2(attr_count, loc + 1);
1491    }
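   /* attr_count is the highest used attribute location plus one, so any gaps
    * get a disabled destination below: e.g. a VS reading only locations 0 and
    * 2 emits three VFD_DEST_CNTL entries with slot 1 set to regid(63, 0).
    */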
1492 
1493    tu_cs_emit_regs(cs,
1494                    A6XX_VFD_CONTROL_0(
1495                      .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
1496                      .decode_cnt = attr_count));
1497 
1498    if (attr_count)
1499       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
1500 
1501    for (unsigned i = 0; i < attr_count; i++) {
1502       if (input_for_attr[i] >= 0) {
1503             unsigned input_idx = input_for_attr[i];
1504             tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1505                              .writemask = vs->inputs[input_idx].compmask,
1506                              .regid = vs->inputs[input_idx].regid).value);
1507       } else {
1508             tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1509                              .writemask = 0,
1510                              .regid = regid(63, 0)).value);
1511       }
1512    }
1513 }
1514 
1515 static enum a6xx_tex_prefetch_cmd
1516 tu6_tex_opc_to_prefetch_cmd(opc_t tex_opc)
1517 {
1518    switch (tex_opc) {
1519    case OPC_SAM:
1520       return TEX_PREFETCH_SAM;
1521    default:
1522       unreachable("Unknown tex opc for prefetch cmd");
1523    }
1524 }
1525 
1526 template <chip CHIP>
1527 static void
1528 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1529 {
1530    uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1531    uint32_t ij_regid[IJ_COUNT];
1532    uint32_t smask_in_regid;
1533 
1534    bool sample_shading = fs->per_samp | fs->key.sample_shading;
1535    bool enable_varyings = fs->total_in > 0;
1536 
1537    samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1538    smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1539    face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1540    coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1541    zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1542    for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1543       ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1544 
1545    if (fs->num_sampler_prefetch > 0) {
1546       /* It seems like ij_pix is *required* to be r0.x */
1547       assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
1548              ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1549    }
1550 
1551    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1552    tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1553                      COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
1554                      COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
1555                      COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
1556                           A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
1557                      COND(fs->prefetch_end_of_quad,
1558                           A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
1559    for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1560       const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1561       tu_cs_emit(
1562          cs, SP_FS_PREFETCH_CMD(
1563                 CHIP, i, .src = prefetch->src, .samp_id = prefetch->samp_id,
1564                 .tex_id = prefetch->tex_id, .dst = prefetch->dst,
1565                 .wrmask = prefetch->wrmask, .half = prefetch->half_precision,
1566                 .bindless = prefetch->bindless,
1567                 .cmd = tu6_tex_opc_to_prefetch_cmd(prefetch->tex_opc), ).value);
1568    }
1569 
1570    if (fs->num_sampler_prefetch > 0) {
1571       tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1572       for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1573          const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1574          tu_cs_emit(cs,
1575                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1576                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1577       }
1578    }
1579 
1580    tu_cs_emit_regs(cs,
1581       HLSQ_CONTROL_1_REG(CHIP,
1582          .primallocthreshold =
1583             cs->device->physical_device->info->a6xx.prim_alloc_threshold),
1584       HLSQ_CONTROL_2_REG(CHIP, .faceregid = face_regid,
1585                          .sampleid = samp_id_regid,
1586                          .samplemask = smask_in_regid,
1587                          .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW]),
1588       HLSQ_CONTROL_3_REG(CHIP, .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
1589                          .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
1590                          .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
1591                          .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID]),
1592       HLSQ_CONTROL_4_REG(CHIP, .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
1593                          .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
1594                          .xycoordregid = coord_regid,
1595                          .zwcoordregid = zwcoord_regid),
1596       HLSQ_CONTROL_5_REG(CHIP, .dword = 0xfcfc), );
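   /* 0xfc is the encoding of regid(63, 0), i.e. "no register", so the 0xfcfc
    * dword above presumably leaves both regid fields of HLSQ_CONTROL_5_REG
    * unused.
    */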
1597 
1598    if (CHIP >= A7XX) {
1599       uint32_t sysval_regs = 0;
1600       for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
1601          if (VALIDREG(ij_regid[i])) {
1602             if (i == IJ_PERSP_CENTER_RHW)
1603                sysval_regs += 1;
1604             else
1605                sysval_regs += 2;
1606          }
1607       }
1608 
1609       for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid }) {
1610          if (VALIDREG(sysval))
1611             sysval_regs += 1;
1612       }
1613 
1614       for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
1615          if (VALIDREG(sysval))
1616             sysval_regs += 2;
1617       }
1618 
1619       tu_cs_emit_regs(cs, A7XX_HLSQ_UNKNOWN_A9AE(.sysval_regs_count = sysval_regs,
1620                                                  .unk8 = 1,
1621                                                  .unk9 = 1));
1622    }
1623 
1624    enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1625    tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = thrsz, .varyings = enable_varyings));
1626 
1627    bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1628    bool need_size_persamp = false;
1629    if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
1630       if (sample_shading)
1631          need_size_persamp = true;
1632       else
1633          need_size = true;
1634    }
1635 
1636    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1637    tu_cs_emit(cs,
1638          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1639          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1640          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1641          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1642          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1643          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1644          COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1645          COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1646          COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1647 
1648    tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1649    tu_cs_emit(cs,
1650          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1651          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1652          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1653          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1654          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1655          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1656          COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1657          COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1658          COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1659          COND(fs->fragcoord_compmask != 0,
1660                            A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1661    tu_cs_emit(cs,
1662          A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1663             sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1664          CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1665          CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1666          CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
1667          COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE)  |
1668          COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
1669 
1670    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1671    tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1672 
1673    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1674    tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1675               A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1676                  sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1677 
1678    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1679    tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1680 
1681    uint32_t varmask[4] = { 0 };
1682 
1683    for (int i = ir3_next_varying(fs, -1); i < fs->inputs_count;
1684         i = ir3_next_varying(fs, i)) {
1685       if (fs->inputs[i].inloc >= fs->total_in)
1686          continue;
1687 
1688       unsigned loc = fs->inputs[i].inloc;
1689       for (int j = 0; j < util_last_bit(fs->inputs[i].compmask); j++) {
1690          uint8_t comploc = loc + j;
1691          varmask[comploc / 32] |= 1 << (comploc % 32);
1692       }
1693    }
1694 
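   /* VPC_VAR_DISABLE is a per-component disable mask covering 4 * 32 = 128
    * varying components, so emit the complement of the components the FS
    * actually reads.
    */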
1695    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
1696    tu_cs_emit(cs, ~varmask[0]);
1697    tu_cs_emit(cs, ~varmask[1]);
1698    tu_cs_emit(cs, ~varmask[2]);
1699    tu_cs_emit(cs, ~varmask[3]);
1700 
1701    unsigned primid_loc = ir3_find_input_loc(fs, VARYING_SLOT_PRIMITIVE_ID);
1702    unsigned viewid_loc = ir3_find_input_loc(fs, VARYING_SLOT_VIEW_INDEX);
1703 
1704    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1705    tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
1706                   COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1707                   A6XX_VPC_CNTL_0_PRIMIDLOC(primid_loc) |
1708                   A6XX_VPC_CNTL_0_VIEWIDLOC(viewid_loc));
1709 }
1710 
1711 static void
1712 tu6_emit_fs_outputs(struct tu_cs *cs,
1713                     const struct ir3_shader_variant *fs)
1714 {
1715    uint32_t smask_regid, posz_regid, stencilref_regid;
1716 
1717    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1718    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1719    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1720 
1721    int output_reg_count = 0;
1722    uint32_t fragdata_regid[8];
1723 
1724    assert(!fs->color0_mrt);
1725    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1726       fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1727       if (VALIDREG(fragdata_regid[i]))
1728          output_reg_count = i + 1;
1729    }
1730 
1731    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
1732    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1733                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1734                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1735                   COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1736 
1737    /* There is no point in enabling a component which is not written by the
1738     * shader. Per the Vulkan spec this is UB, however a few apps depend on
1739     * the attachment not being changed if the FS has no corresponding output.
1740     */
1741    uint32_t fs_render_components = 0;
1742 
1743    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
1744    for (uint32_t i = 0; i < output_reg_count; i++) {
1745       tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1746                      (COND(fragdata_regid[i] & HALF_REG_ID,
1747                            A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1748 
1749       if (VALIDREG(fragdata_regid[i])) {
1750          fs_render_components |= 0xf << (i * 4);
1751       }
1752    }
1753 
1754    tu_cs_emit_regs(cs,
1755                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1756 
1757    tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 1);
1758    tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1759                   COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1760                   COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1761                   COND(fs->dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1762 
1763    tu_cs_emit_regs(cs,
1764                    A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1765 }
1766 
1767 template <chip CHIP>
1768 void
1769 tu6_emit_vs(struct tu_cs *cs,
1770             const struct ir3_shader_variant *vs,
1771             uint32_t view_mask)
1772 {
1773    bool multi_pos_output = vs->multi_pos_output;
1774 
1775    uint32_t multiview_views = util_logbase2(view_mask) + 1;
1776    uint32_t multiview_cntl = view_mask ?
1777       A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1778       A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1779       COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1780       : 0;
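   /* multiview_views is the index of the highest enabled view plus one, not a
    * popcount: e.g. view_mask == 0b101 programs VIEWS(3) even though only two
    * views are enabled.
    */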
1781 
1782    /* Copy what the blob does here. This will emit an extra 0x3f
1783     * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1784     * this is working around yet.
1785     */
1786    if (cs->device->physical_device->info->a6xx.has_cp_reg_write) {
1787       tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1788       tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1789       tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1790    } else {
1791       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1792    }
1793    tu_cs_emit(cs, multiview_cntl);
1794 
1795    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1796    tu_cs_emit(cs, multiview_cntl);
1797 
1798    if (multiview_cntl &&
1799        cs->device->physical_device->info->a6xx.supports_multiview_mask) {
1800       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1801       tu_cs_emit(cs, view_mask);
1802    }
1803 
1804    if (CHIP >= A7XX) {
1805       tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_CNTL, 1);
1806       tu_cs_emit(cs, multiview_cntl);
1807 
1808       tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_MASK, 1);
1809       tu_cs_emit(cs, view_mask);
1810    }
1811 
1812    tu6_emit_vfd_dest(cs, vs);
1813 
1814    const uint32_t vertexid_regid =
1815          ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
1816    const uint32_t instanceid_regid =
1817          ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
1818 
1819    /* Note: we currently don't support multiview with tess or GS. If we did,
1820     * and the HW actually works, then we'd have to somehow share this across
1821     * stages. Note that the blob doesn't support this either.
1822     */
1823    const uint32_t viewid_regid =
1824       ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
1825 
1826    const uint32_t vs_primitiveid_regid =
1827       ir3_find_sysval_regid(vs, SYSTEM_VALUE_PRIMITIVE_ID);
1828 
1829    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 1);
1830    tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
1831                   A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
1832                   A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
1833                   A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
1834 }
1835 TU_GENX(tu6_emit_vs);
1836 
1837 template <chip CHIP>
1838 void
1839 tu6_emit_hs(struct tu_cs *cs,
1840             const struct ir3_shader_variant *hs)
1841 {
1842    const uint32_t hs_rel_patch_regid =
1843          ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1844    const uint32_t hs_invocation_regid =
1845          ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3);
1846 
1847    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_2, 1);
1848    tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
1849                   A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
1850 
1851    if (hs) {
1852       tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1853       tu_cs_emit(cs, hs->tess.tcs_vertices_out);
1854    }
1855 }
1856 TU_GENX(tu6_emit_hs);
1857 
1858 template <chip CHIP>
1859 void
1860 tu6_emit_ds(struct tu_cs *cs,
1861             const struct ir3_shader_variant *ds)
1862 {
1863    const uint32_t ds_rel_patch_regid =
1864          ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1865    const uint32_t tess_coord_x_regid =
1866          ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD);
1867    const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
1868          tess_coord_x_regid + 1 :
1869          regid(63, 0);
1870    const uint32_t ds_primitiveid_regid =
1871          ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID);
1872 
1873    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_3, 2);
1874    tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
1875                   A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
1876                   A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
1877                   A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
1878    tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
1879 }
1880 TU_GENX(tu6_emit_ds);
1881 
1882 static enum a6xx_tess_output
1883 primitive_to_tess(enum mesa_prim primitive) {
1884    switch (primitive) {
1885    case MESA_PRIM_POINTS:
1886       return TESS_POINTS;
1887    case MESA_PRIM_LINE_STRIP:
1888       return TESS_LINES;
1889    case MESA_PRIM_TRIANGLE_STRIP:
1890       return TESS_CW_TRIS;
1891    default:
1892       unreachable("");
1893    }
1894 }
1895 
1896 template <chip CHIP>
1897 void
1898 tu6_emit_gs(struct tu_cs *cs,
1899             const struct ir3_shader_variant *gs)
1900 {
1901    const uint32_t gsheader_regid =
1902          ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3);
1903 
1904    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_5, 1);
1905    tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
1906                   0xfc00);
1907 
1908    if (gs) {
1909       uint32_t vertices_out, invocations;
1910 
1911       vertices_out = gs->gs.vertices_out - 1;
1912       enum a6xx_tess_output output = primitive_to_tess((enum mesa_prim) gs->gs.output_primitive);
1913       invocations = gs->gs.invocations - 1;
1914 
1915       uint32_t primitive_cntl =
1916          A6XX_PC_PRIMITIVE_CNTL_5(.gs_vertices_out = vertices_out,
1917                                   .gs_invocations = invocations,
1918                                   .gs_output = output,).value;
1919 
1920       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
1921       tu_cs_emit(cs, primitive_cntl);
1922 
1923       if (CHIP >= A7XX) {
1924          tu_cs_emit_pkt4(cs, REG_A7XX_VPC_PRIMITIVE_CNTL_5, 1);
1925          tu_cs_emit(cs, primitive_cntl);
1926       } else {
1927          tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
1928          tu_cs_emit(cs, 0xff);
1929       }
1930    }
1931 }
1932 TU_GENX(tu6_emit_gs);
1933 
1934 template <chip CHIP>
1935 void
1936 tu6_emit_fs(struct tu_cs *cs,
1937             const struct ir3_shader_variant *fs)
1938 {
1939    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_6, 1);
1940    tu_cs_emit(cs, COND(fs && fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN));
1941 
1942    tu_cs_emit_regs(cs, A6XX_PC_PS_CNTL(.primitiveiden = fs && fs->reads_primid));
1943 
1944    if (CHIP >= A7XX) {
1945       tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
1946       tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
1947    }
1948 
1949    if (fs) {
1950       tu6_emit_fs_inputs<CHIP>(cs, fs);
1951       tu6_emit_fs_outputs(cs, fs);
1952    } else {
1953       /* TODO: check if these can be skipped if fs is disabled */
1954       struct ir3_shader_variant dummy_variant = {};
1955       tu6_emit_fs_inputs<CHIP>(cs, &dummy_variant);
1956       tu6_emit_fs_outputs(cs, &dummy_variant);
1957    }
1958 }
1959 TU_GENX(tu6_emit_fs);
1960 
1961 template <chip CHIP>
1962 static void
1963 tu6_emit_variant(struct tu_cs *cs,
1964                  gl_shader_stage stage,
1965                  const struct ir3_shader_variant *xs,
1966                  struct tu_pvtmem_config *pvtmem_config,
1967                  uint32_t view_mask,
1968                  uint64_t binary_iova)
1969 {
1970    if (stage == MESA_SHADER_COMPUTE) {
1971       tu6_emit_cs_config<CHIP>(cs, xs, pvtmem_config, binary_iova);
1972       return;
1973    }
1974 
1975    tu6_emit_xs(cs, stage, xs, pvtmem_config, binary_iova);
1976 
1977    switch (stage) {
1978    case MESA_SHADER_VERTEX:
1979       tu6_emit_vs<CHIP>(cs, xs, view_mask);
1980       break;
1981    case MESA_SHADER_TESS_CTRL:
1982       tu6_emit_hs<CHIP>(cs, xs);
1983       break;
1984    case MESA_SHADER_TESS_EVAL:
1985       tu6_emit_ds<CHIP>(cs, xs);
1986       break;
1987    case MESA_SHADER_GEOMETRY:
1988       tu6_emit_gs<CHIP>(cs, xs);
1989       break;
1990    case MESA_SHADER_FRAGMENT:
1991       tu6_emit_fs<CHIP>(cs, xs);
1992       break;
1993    default:
1994       unreachable("unknown shader stage");
1995    }
1996 }
1997 
1998 static VkResult
1999 tu_setup_pvtmem(struct tu_device *dev,
2000                 struct tu_shader *shader,
2001                 struct tu_pvtmem_config *config,
2002                 uint32_t pvtmem_bytes,
2003                 bool per_wave)
2004 {
2005    if (!pvtmem_bytes) {
2006       memset(config, 0, sizeof(*config));
2007       return VK_SUCCESS;
2008    }
2009 
2010    /* Allocating private memory BOs on a per-pipeline basis has a
2011     * substantial memory footprint and isn't required, as the same BO can
2012     * be utilized by multiple pipelines as long as they share the same
2013     * private memory layout (sizes and per-wave/per-fiber). Otherwise,
2014     * other active pipelines using the same BO with a differing private
2015     * memory layout could overwrite it, resulting in memory corruption.
2016     *
2017     * To avoid this, we create private memory BOs at the device level with
2018     * an associated private memory layout, then dynamically grow them when
2019     * needed and reuse them across pipelines. Growth is done in powers of
2020     * two so that we can avoid frequent reallocation of the private memory
2021     * BOs.
2022     */
2023 
2024    struct tu_pvtmem_bo *pvtmem_bo =
2025       per_wave ? &dev->wave_pvtmem_bo : &dev->fiber_pvtmem_bo;
2026    mtx_lock(&pvtmem_bo->mtx);
2027 
2028    if (pvtmem_bo->per_fiber_size < pvtmem_bytes) {
2029       if (pvtmem_bo->bo)
2030          tu_bo_finish(dev, pvtmem_bo->bo);
2031 
2032       pvtmem_bo->per_fiber_size =
2033          util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
2034       pvtmem_bo->per_sp_size =
2035          ALIGN(pvtmem_bo->per_fiber_size *
2036                   dev->physical_device->info->fibers_per_sp,
2037                1 << 12);
2038       uint32_t total_size =
2039          dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
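      /* Worked example (sizes are illustrative): pvtmem_bytes == 3000 aligns
       * to 3072 and rounds up to a 4096-byte per-fiber size; the per-SP size
       * is then 4096 * fibers_per_sp aligned to 4 KB, and the BO covers
       * num_sp_cores such slices.
       */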
2040 
2041       VkResult result = tu_bo_init_new(dev, NULL, &pvtmem_bo->bo, total_size,
2042                                        TU_BO_ALLOC_INTERNAL_RESOURCE, "pvtmem");
2043       if (result != VK_SUCCESS) {
2044          mtx_unlock(&pvtmem_bo->mtx);
2045          return result;
2046       }
2047    }
2048 
2049    config->per_wave = per_wave;
2050    config->per_fiber_size = pvtmem_bo->per_fiber_size;
2051    config->per_sp_size = pvtmem_bo->per_sp_size;
2052 
2053    shader->pvtmem_bo = tu_bo_get_ref(pvtmem_bo->bo);
2054    config->iova = shader->pvtmem_bo->iova;
2055 
2056    mtx_unlock(&pvtmem_bo->mtx);
2057 
2058    return VK_SUCCESS;
2059 }
2060 
2061 static uint64_t
2062 tu_upload_variant(struct tu_cs *cs,
2063                   const struct ir3_shader_variant *variant)
2064 {
2065    struct tu_cs_memory memory;
2066 
2067    if (!variant)
2068       return 0;
2069 
2070    /* This expects to get enough alignment because shaders are allocated
2071     * first and the total size is always aligned correctly.
2072     * Note: an assert in tu6_emit_xs_config validates the alignment.
2073     */
2074    tu_cs_alloc(cs, variant->info.size / 4, 1, &memory);
2075 
2076    memcpy(memory.map, variant->bin, variant->info.size);
2077    return memory.iova;
2078 }
2079 
2080 static VkResult
2081 tu_upload_shader(struct tu_device *dev,
2082                  struct tu_shader *shader)
2083 {
2084    const struct ir3_shader_variant *v = shader->variant;
2085    const struct ir3_shader_variant *binning = v ? v->binning : NULL;
2086    const struct ir3_shader_variant *safe_const = shader->safe_const_variant;
2087 
2088    if (v->type == MESA_SHADER_VERTEX && v->stream_output.num_outputs != 0)
2089       binning = v;
2090 
2091    uint32_t size = 0;
2092    if (v->type == MESA_SHADER_VERTEX)
2093       size += TU6_EMIT_VFD_DEST_MAX_DWORDS;
2094 
2095    const unsigned xs_size = 128;
2096    const unsigned vpc_size = 32 + (v->stream_output.num_outputs != 0 ? 256 : 0);
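   /* xs_size and vpc_size appear to be conservative dword estimates for the
    * per-stage config state and the VPC/streamout state; the suballocation
    * below only has to be large enough, so any slack is wasted but harmless.
    */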
2097 
2098    size += xs_size + tu_xs_get_additional_cs_size_dwords(v);
2099    size += v->info.size / 4;
2100    if (binning) {
2101       size += xs_size + tu_xs_get_additional_cs_size_dwords(binning);
2102       size += binning->info.size / 4;
2103    }
2104 
2105    if (safe_const) {
2106       size += xs_size + tu_xs_get_additional_cs_size_dwords(safe_const);
2107       size += safe_const->info.size / 4;
2108    }
2109 
2110    /* We emit an empty VPC including streamout state in the binning draw state */
2111    if (binning || v->type == MESA_SHADER_GEOMETRY) {
2112       size += vpc_size;
2113    }
2114 
2115    pthread_mutex_lock(&dev->pipeline_mutex);
2116    VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2117                                           size * 4, 128);
2118    pthread_mutex_unlock(&dev->pipeline_mutex);
2119 
2120    if (result != VK_SUCCESS)
2121       return result;
2122 
2123    uint32_t pvtmem_size = v->pvtmem_size;
2124    bool per_wave = v->pvtmem_per_wave;
2125 
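   /* The private memory config has to cover the worst case across the main,
    * binning and safe-const variants, and the per-wave layout can only be
    * kept if every variant can use it.
    */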
2126    if (v->binning) {
2127       pvtmem_size = MAX2(pvtmem_size, shader->variant->binning->pvtmem_size);
2128       if (!shader->variant->binning->pvtmem_per_wave)
2129          per_wave = false;
2130    }
2131 
2132    if (shader->safe_const_variant) {
2133       pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->pvtmem_size);
2134       if (!shader->safe_const_variant->pvtmem_per_wave)
2135          per_wave = false;
2136 
2137       if (shader->safe_const_variant->binning) {
2138          pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->binning->pvtmem_size);
2139          if (!shader->safe_const_variant->binning->pvtmem_per_wave)
2140             per_wave = false;
2141       }
2142    }
2143 
2144    struct tu_pvtmem_config pvtmem_config;
2145 
2146    result = tu_setup_pvtmem(dev, shader, &pvtmem_config, pvtmem_size, per_wave);
2147    if (result != VK_SUCCESS) {
2148       pthread_mutex_lock(&dev->pipeline_mutex);
2149       tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
2150       pthread_mutex_unlock(&dev->pipeline_mutex);
2151       return result;
2152    }
2153 
2154    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2155    tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2156 
2157    uint64_t iova = tu_upload_variant(&shader->cs, v);
2158    uint64_t binning_iova = tu_upload_variant(&shader->cs, binning);
2159    uint64_t safe_const_iova = tu_upload_variant(&shader->cs, safe_const);
2160 
2161    struct tu_cs sub_cs;
2162    tu_cs_begin_sub_stream(&shader->cs, xs_size +
2163                           tu_xs_get_additional_cs_size_dwords(v), &sub_cs);
2164    TU_CALLX(dev, tu6_emit_variant)(
2165       &sub_cs, shader->variant->type, shader->variant, &pvtmem_config,
2166       shader->view_mask, iova);
2167    shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2168 
2169    if (safe_const) {
2170       tu_cs_begin_sub_stream(&shader->cs, xs_size +
2171                              tu_xs_get_additional_cs_size_dwords(safe_const), &sub_cs);
2172       TU_CALLX(dev, tu6_emit_variant)(
2173          &sub_cs, v->type, safe_const, &pvtmem_config, shader->view_mask,
2174          safe_const_iova);
2175       shader->safe_const_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2176    }
2177 
2178    if (binning) {
2179       tu_cs_begin_sub_stream(&shader->cs, xs_size + vpc_size +
2180                              tu_xs_get_additional_cs_size_dwords(binning), &sub_cs);
2181       TU_CALLX(dev, tu6_emit_variant)(
2182          &sub_cs, v->type, binning, &pvtmem_config, shader->view_mask,
2183          binning_iova);
2184       /* emit an empty VPC */
2185       TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, binning, NULL, NULL, NULL, NULL);
2186       shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2187    }
2188 
2189    /* We don't support binning variants for GS, so the same draw state is used
2190     * when binning and when drawing. However, the VPC draw state is not
2191     * executed when binning, so we still need to generate an appropriate VPC
2192     * config for binning.
2193     */
2194    if (v->type == MESA_SHADER_GEOMETRY) {
2195       tu_cs_begin_sub_stream(&shader->cs, vpc_size, &sub_cs);
2196       TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, NULL, NULL, NULL, v, NULL);
2197       shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2198    }
2199 
2200    return VK_SUCCESS;
2201 }
2202 
2203 static bool
2204 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2205                     struct blob *blob);
2206 
2207 static struct vk_pipeline_cache_object *
2208 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2209                       const void *key_data,
2210                       size_t key_size,
2211                       struct blob_reader *blob);
2212 
2213 static void
2214 tu_shader_pipeline_cache_object_destroy(struct vk_device *vk_device,
2215                                         struct vk_pipeline_cache_object *object)
2216 {
2217    struct tu_device *device = container_of(vk_device, struct tu_device, vk);
2218    struct tu_shader *shader =
2219       container_of(object, struct tu_shader, base);
2220 
2221    vk_pipeline_cache_object_finish(&shader->base);
2222    tu_shader_destroy(device, shader);
2223 }
2224 
2225 const struct vk_pipeline_cache_object_ops tu_shader_ops = {
2226    .serialize = tu_shader_serialize,
2227    .deserialize = tu_shader_deserialize,
2228    .destroy = tu_shader_pipeline_cache_object_destroy,
2229 };
2230 
2231 static struct tu_shader *
2232 tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
2233 {
2234    VK_MULTIALLOC(ma);
2235    VK_MULTIALLOC_DECL(&ma, struct tu_shader, shader, 1);
2236    VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
2237 
2238    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
2239                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2240       return NULL;
2241 
2242    memcpy(obj_key_data, key_data, key_size);
2243 
2244    vk_pipeline_cache_object_init(&dev->vk, &shader->base,
2245                                  &tu_shader_ops, obj_key_data, key_size);
2246 
2247    shader->const_state.fdm_ubo.idx = -1;
2248    shader->const_state.dynamic_offsets_ubo.idx = -1;
2249    shader->const_state.inline_uniforms_ubo.idx = -1;
2250 
2251    return shader;
2252 }
2253 
2254 static bool
2255 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2256                     struct blob *blob)
2257 {
2258    struct tu_shader *shader =
2259       container_of(object, struct tu_shader, base);
2260 
2261    blob_write_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2262    blob_write_bytes(blob, &shader->dynamic_descriptor_sizes,
2263                     sizeof(shader->dynamic_descriptor_sizes));
2264    blob_write_uint32(blob, shader->view_mask);
2265    blob_write_uint8(blob, shader->active_desc_sets);
2266 
2267    ir3_store_variant(blob, shader->variant);
2268 
2269    if (shader->safe_const_variant) {
2270       blob_write_uint8(blob, 1);
2271       ir3_store_variant(blob, shader->safe_const_variant);
2272    } else {
2273       blob_write_uint8(blob, 0);
2274    }
2275 
2276 
2277 
2278    switch (shader->variant->type) {
2279    case MESA_SHADER_TESS_EVAL:
2280       blob_write_bytes(blob, &shader->tes, sizeof(shader->tes));
2281       break;
2282    case MESA_SHADER_FRAGMENT:
2283       blob_write_bytes(blob, &shader->fs, sizeof(shader->fs));
2284       break;
2285    default:
2286       break;
2287    }
2288 
2289    return true;
2290 }
2291 
2292 static struct vk_pipeline_cache_object *
2293 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2294                       const void *key_data,
2295                       size_t key_size,
2296                       struct blob_reader *blob)
2297 {
2298    struct tu_device *dev =
2299       container_of(cache->base.device, struct tu_device, vk);
2300    struct tu_shader *shader =
2301       tu_shader_init(dev, key_data, key_size);
2302 
2303    if (!shader)
2304       return NULL;
2305 
2306    blob_copy_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2307    blob_copy_bytes(blob, &shader->dynamic_descriptor_sizes,
2308                    sizeof(shader->dynamic_descriptor_sizes));
2309    shader->view_mask = blob_read_uint32(blob);
2310    shader->active_desc_sets = blob_read_uint8(blob);
2311 
2312    shader->variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2313 
2314    bool has_safe_const = blob_read_uint8(blob);
2315    if (has_safe_const)
2316       shader->safe_const_variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2317 
2318    switch (shader->variant->type) {
2319    case MESA_SHADER_TESS_EVAL:
2320       blob_copy_bytes(blob, &shader->tes, sizeof(shader->tes));
2321       break;
2322    case MESA_SHADER_FRAGMENT:
2323       blob_copy_bytes(blob, &shader->fs, sizeof(shader->fs));
2324       break;
2325    default:
2326       break;
2327    }
2328 
2329    VkResult result = tu_upload_shader(dev, shader);
2330    if (result != VK_SUCCESS) {
2331       vk_free(&dev->vk.alloc, shader);
2332       return NULL;
2333    }
2334 
2335    return &shader->base;
2336 }
2337 
2338 VkResult
2339 tu_shader_create(struct tu_device *dev,
2340                  struct tu_shader **shader_out,
2341                  nir_shader *nir,
2342                  const struct tu_shader_key *key,
2343                  const struct ir3_shader_key *ir3_key,
2344                  const void *key_data,
2345                  size_t key_size,
2346                  struct tu_pipeline_layout *layout,
2347                  bool executable_info)
2348 {
2349    struct tu_shader *shader = tu_shader_init(dev, key_data, key_size);
2350 
2351    if (!shader)
2352       return VK_ERROR_OUT_OF_HOST_MEMORY;
2353 
2354    const nir_opt_access_options access_options = {
2355       .is_vulkan = true,
2356    };
2357    NIR_PASS_V(nir, nir_opt_access, &access_options);
2358 
2359    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2360       const nir_input_attachment_options att_options = {
2361          .use_fragcoord_sysval = true,
2362          .use_layer_id_sysval = false,
2363          /* When using multiview rendering, we must use
2364           * gl_ViewIndex as the layer id to pass to the texture
2365           * sampling function. gl_Layer doesn't work when
2366           * multiview is enabled.
2367           */
2368          .use_view_id_for_layer = key->multiview_mask != 0,
2369          .unscaled_input_attachment_ir3 = key->unscaled_input_fragcoord,
2370       };
2371       NIR_PASS_V(nir, nir_lower_input_attachments, &att_options);
2372    }
2373 
2374    /* This has to happen before lower_input_attachments, because we have to
2375     * lower input attachment coordinates except if unscaled.
2376     */
2377    const struct lower_fdm_options fdm_options = {
2378       .num_views = MAX2(util_last_bit(key->multiview_mask), 1),
2379       .adjust_fragcoord = key->fragment_density_map,
2380    };
2381    NIR_PASS_V(nir, tu_nir_lower_fdm, &fdm_options);
2382 
2383 
2384    /* This needs to happen before multiview lowering which rewrites store
2385     * instructions of the position variable, so that we can just rewrite one
2386     * store at the end instead of having to rewrite every store specified by
2387     * the user.
2388     */
2389    ir3_nir_lower_io_to_temporaries(nir);
2390 
2391    if (nir->info.stage == MESA_SHADER_VERTEX && key->multiview_mask) {
2392       tu_nir_lower_multiview(nir, key->multiview_mask, dev);
2393    }
2394 
2395    if (nir->info.stage == MESA_SHADER_FRAGMENT && key->force_sample_interp) {
2396       nir_foreach_shader_in_variable(var, nir) {
2397          if (!var->data.centroid)
2398             var->data.sample = true;
2399       }
2400    }
2401 
2402    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
2403               nir_address_format_32bit_offset);
2404 
2405    NIR_PASS_V(nir, nir_lower_explicit_io,
2406               nir_var_mem_ubo | nir_var_mem_ssbo,
2407               nir_address_format_vec2_index_32bit_offset);
2408 
2409    NIR_PASS_V(nir, nir_lower_explicit_io,
2410               nir_var_mem_global,
2411               nir_address_format_64bit_global);
2412 
2413    if (nir->info.stage == MESA_SHADER_COMPUTE) {
2414       if (!nir->info.shared_memory_explicit_layout) {
2415          NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
2416                     nir_var_mem_shared, shared_type_info);
2417       }
2418       NIR_PASS_V(nir, nir_lower_explicit_io,
2419                  nir_var_mem_shared,
2420                  nir_address_format_32bit_offset);
2421 
2422       if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
2423          const unsigned chunk_size = 16; /* max single store size */
2424          /* Shared memory is allocated in 1024b chunks in HW, but the zero-init
2425           * extension only requires us to initialize the memory that the shader
2426           * is allocated at the API level, and it's up to the user to ensure
2427           * that accesses are limited to those bounds.
2428           */
2429          const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2430          NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
2431       }
2432 
2433       const struct nir_lower_compute_system_values_options compute_sysval_options = {
2434          .has_base_workgroup_id = true,
2435       };
2436       NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_sysval_options);
2437    }
2438 
2439    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
2440    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
2441 
2442    /* Gather information for transform feedback. This should be called after:
2443     * - nir_split_per_member_structs.
2444     * - nir_remove_dead_variables with varyings, so that we can align
2445     *   stream outputs correctly.
2446     * - nir_assign_io_var_locations - to have a valid driver_location.
2447     */
2448    struct ir3_stream_output_info so_info = {};
2449    if (nir->info.stage == MESA_SHADER_VERTEX ||
2450          nir->info.stage == MESA_SHADER_TESS_EVAL ||
2451          nir->info.stage == MESA_SHADER_GEOMETRY)
2452       tu_gather_xfb_info(nir, &so_info);
2453 
2454    for (unsigned i = 0; i < layout->num_sets; i++) {
2455       if (layout->set[i].layout) {
2456          shader->dynamic_descriptor_sizes[i] =
2457             layout->set[i].layout->dynamic_offset_size;
2458       } else {
2459          shader->dynamic_descriptor_sizes[i] = -1;
2460       }
2461    }
2462 
2463    unsigned reserved_consts_vec4 = 0;
2464    NIR_PASS_V(nir, tu_lower_io, dev, shader, layout, &reserved_consts_vec4);
2465 
2466    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
2467 
2468    ir3_finalize_nir(dev->compiler, nir);
2469 
2470    const struct ir3_shader_options options = {
2471       .num_reserved_user_consts = reserved_consts_vec4,
2472       .api_wavesize = key->api_wavesize,
2473       .real_wavesize = key->real_wavesize,
2474       .push_consts_type = shader->const_state.push_consts.type,
2475       .push_consts_base = shader->const_state.push_consts.lo,
2476       .push_consts_dwords = shader->const_state.push_consts.dwords,
2477    };
2478 
2479    struct ir3_shader *ir3_shader =
2480       ir3_shader_from_nir(dev->compiler, nir, &options, &so_info);
2481 
2482    shader->variant =
2483       ir3_shader_create_variant(ir3_shader, ir3_key, executable_info);
2484 
2485    if (ir3_exceeds_safe_constlen(shader->variant)) {
2486       struct ir3_shader_key safe_constlen_key = *ir3_key;
2487       safe_constlen_key.safe_constlen = true;
2488       shader->safe_const_variant =
2489          ir3_shader_create_variant(ir3_shader, &safe_constlen_key,
2490                                    executable_info);
2491    }
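   /* ir3_exceeds_safe_constlen() flags variants whose constlen exceeds the
    * limit that is safe regardless of the other stages; in that case we also
    * keep a variant compiled with safe_constlen so that later pipeline linking
    * can fall back to it.
    */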
2492 
2493    ir3_shader_destroy(ir3_shader);
2494 
2495    shader->view_mask = key->multiview_mask;
2496 
2497    switch (shader->variant->type) {
2498    case MESA_SHADER_TESS_EVAL: {
2499       const struct ir3_shader_variant *tes = shader->variant;
2500       if (tes->tess.point_mode) {
2501          shader->tes.tess_output_lower_left =
2502             shader->tes.tess_output_upper_left = TESS_POINTS;
2503       } else if (tes->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) {
2504          shader->tes.tess_output_lower_left =
2505             shader->tes.tess_output_upper_left = TESS_LINES;
2506       } else if (tes->tess.ccw) {
2507          /* Tessellation orientation in HW is specified with a lower-left
2508           * origin, so we need to swap the windings if the origin is upper-left.
2509           */
2510          shader->tes.tess_output_lower_left = TESS_CCW_TRIS;
2511          shader->tes.tess_output_upper_left = TESS_CW_TRIS;
2512       } else {
2513          shader->tes.tess_output_lower_left = TESS_CW_TRIS;
2514          shader->tes.tess_output_upper_left = TESS_CCW_TRIS;
2515       }
2516 
2517       switch (tes->tess.spacing) {
2518       case TESS_SPACING_EQUAL:
2519          shader->tes.tess_spacing = TESS_EQUAL;
2520          break;
2521       case TESS_SPACING_FRACTIONAL_ODD:
2522          shader->tes.tess_spacing = TESS_FRACTIONAL_ODD;
2523          break;
2524       case TESS_SPACING_FRACTIONAL_EVEN:
2525          shader->tes.tess_spacing = TESS_FRACTIONAL_EVEN;
2526          break;
2527       case TESS_SPACING_UNSPECIFIED:
2528       default:
2529          unreachable("invalid tess spacing");
2530       }
2531 
2532       break;
2533    }
2534    case MESA_SHADER_FRAGMENT: {
2535       const struct ir3_shader_variant *fs = shader->variant;
2536       shader->fs.per_samp = fs->per_samp || ir3_key->sample_shading;
2537       shader->fs.has_fdm = key->fragment_density_map;
2538       if (fs->has_kill)
2539          shader->fs.lrz.status |= TU_LRZ_FORCE_DISABLE_WRITE;
2540       if (fs->no_earlyz || (fs->writes_pos && !fs->fs.early_fragment_tests))
2541          shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2542       /* FDM isn't compatible with LRZ, because the LRZ image uses the original
2543        * resolution and we would need to use the low resolution.
2544        *
2545        * TODO: Use a patchpoint to only disable LRZ for scaled bins.
2546        */
2547       if (key->fragment_density_map)
2548          shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2549       if (!fs->fs.early_fragment_tests &&
2550           (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref || fs->writes_smask)) {
2551          shader->fs.lrz.force_late_z = true;
2552       }
2553       break;
2554    }
2555    default:
2556       break;
2557    }
2558 
2559    VkResult result = tu_upload_shader(dev, shader);
2560    if (result != VK_SUCCESS) {
2561       vk_free(&dev->vk.alloc, shader);
2562       return result;
2563    }
2564 
2565    *shader_out = shader;
2566    return VK_SUCCESS;
2567 }
2568 
2569 static void
2570 tu_link_shaders(nir_shader **shaders, unsigned shaders_count)
2571 {
2572    nir_shader *consumer = NULL;
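   /* Walk the stages backwards so that each producer is linked against the
    * nearest enabled consumer seen so far (the FS first, then whatever
    * geometry stage feeds it, and so on).
    */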
2573    for (gl_shader_stage stage = (gl_shader_stage) (shaders_count - 1);
2574         stage >= MESA_SHADER_VERTEX; stage = (gl_shader_stage) (stage - 1)) {
2575       if (!shaders[stage])
2576          continue;
2577 
2578       nir_shader *producer = shaders[stage];
2579       if (!consumer) {
2580          consumer = producer;
2581          continue;
2582       }
2583 
2584       if (nir_link_opt_varyings(producer, consumer)) {
2585          NIR_PASS_V(consumer, nir_opt_constant_folding);
2586          NIR_PASS_V(consumer, nir_opt_algebraic);
2587          NIR_PASS_V(consumer, nir_opt_dce);
2588       }
2589 
2590       const nir_remove_dead_variables_options out_var_opts = {
2591          .can_remove_var = nir_vk_is_not_xfb_output,
2592       };
2593       NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, &out_var_opts);
2594 
2595       NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
2596 
2597       bool progress = nir_remove_unused_varyings(producer, consumer);
2598 
2599       nir_compact_varyings(producer, consumer, true);
2600       if (progress) {
2601          if (nir_lower_global_vars_to_local(producer)) {
2602             /* Remove dead writes, which can remove input loads */
2603             NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
2604             NIR_PASS_V(producer, nir_opt_dce);
2605          }
2606          nir_lower_global_vars_to_local(consumer);
2607       }
2608 
2609       consumer = producer;
2610    }
2611 
2612    /* Gather info after linking so that we can fill out the ir3 shader key.
2613     */
2614    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2615         stage <= MESA_SHADER_FRAGMENT; stage = (gl_shader_stage) (stage + 1)) {
2616       if (shaders[stage])
2617          nir_shader_gather_info(shaders[stage],
2618                                 nir_shader_get_entrypoint(shaders[stage]));
2619    }
2620 }
2621 
2622 static uint32_t
2623 tu6_get_tessmode(const struct nir_shader *shader)
2624 {
2625    enum tess_primitive_mode primitive_mode = shader->info.tess._primitive_mode;
2626    switch (primitive_mode) {
2627    case TESS_PRIMITIVE_ISOLINES:
2628       return IR3_TESS_ISOLINES;
2629    case TESS_PRIMITIVE_TRIANGLES:
2630       return IR3_TESS_TRIANGLES;
2631    case TESS_PRIMITIVE_QUADS:
2632       return IR3_TESS_QUADS;
2633    case TESS_PRIMITIVE_UNSPECIFIED:
2634       return IR3_TESS_NONE;
2635    default:
2636       unreachable("bad tessmode");
2637    }
2638 }
2639 
2640 VkResult
2641 tu_compile_shaders(struct tu_device *device,
2642                    VkPipelineCreateFlags2KHR pipeline_flags,
2643                    const VkPipelineShaderStageCreateInfo **stage_infos,
2644                    nir_shader **nir,
2645                    const struct tu_shader_key *keys,
2646                    struct tu_pipeline_layout *layout,
2647                    const unsigned char *pipeline_sha1,
2648                    struct tu_shader **shaders,
2649                    char **nir_initial_disasm,
2650                    void *nir_initial_disasm_mem_ctx,
2651                    nir_shader **nir_out,
2652                    VkPipelineCreationFeedback *stage_feedbacks)
2653 {
2654    struct ir3_shader_key ir3_key = {};
2655    VkResult result = VK_SUCCESS;
2656    void *mem_ctx = ralloc_context(NULL);
2657 
2658    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2659         stage = (gl_shader_stage) (stage + 1)) {
2660       const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2661       if (!stage_info)
2662          continue;
2663 
2664       int64_t stage_start = os_time_get_nano();
2665 
2666       nir[stage] = tu_spirv_to_nir(device, mem_ctx, pipeline_flags,
2667                                    stage_info, stage);
2668       if (!nir[stage]) {
2669          result = VK_ERROR_OUT_OF_HOST_MEMORY;
2670          goto fail;
2671       }
2672 
2673       stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2674       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2675    }
2676 
2677    if (nir[MESA_SHADER_GEOMETRY])
2678       ir3_key.has_gs = true;
2679 
2680    ir3_key.sample_shading = keys[MESA_SHADER_FRAGMENT].force_sample_interp;
2681 
2682    if (nir_initial_disasm) {
2683       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2684            stage < MESA_SHADER_STAGES;
2685            stage = (gl_shader_stage) (stage + 1)) {
2686          if (!nir[stage])
2687             continue;
2688 
2689          nir_initial_disasm[stage] =
2690             nir_shader_as_str(nir[stage], nir_initial_disasm_mem_ctx);
2691       }
2692    }
2693 
2694    tu_link_shaders(nir, MESA_SHADER_STAGES);
2695 
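   /* If the caller wants to keep the NIR around, hand back clones taken
    * after linking but before any backend-specific lowering.
    */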
2696    if (nir_out) {
2697       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2698            stage < MESA_SHADER_STAGES; stage = (gl_shader_stage) (stage + 1)) {
2699          if (!nir[stage])
2700             continue;
2701 
2702          nir_out[stage] = nir_shader_clone(NULL, nir[stage]);
2703       }
2704    }
2705 
2706    /* With pipelines, tessellation modes can be set on either shader, for
2707     * compatibility with HLSL and GLSL, and the driver is supposed to merge
2708     * them. Shader objects require modes to be set on at least the TES, except
2709     * for OutputVertices, which has to be set on at least the TCS. Make sure
2710     * all modes are set on the TES when compiling together multiple shaders,
2711     * and then from this point on we will use the modes in the TES (and output
2712     * vertices on the TCS).
2713     */
2714    if (nir[MESA_SHADER_TESS_EVAL]) {
2715       nir_shader *tcs = nir[MESA_SHADER_TESS_CTRL];
2716       nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
2717 
2718       if (tes->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
2719          tes->info.tess._primitive_mode = tcs->info.tess._primitive_mode;
2720 
2721       tes->info.tess.point_mode |= tcs->info.tess.point_mode;
2722       tes->info.tess.ccw |= tcs->info.tess.ccw;
2723 
2724       if (tes->info.tess.spacing == TESS_SPACING_UNSPECIFIED) {
2725          tes->info.tess.spacing = tcs->info.tess.spacing;
2726       }
2727 
2728       if (tcs->info.tess.tcs_vertices_out == 0)
2729          tcs->info.tess.tcs_vertices_out = tes->info.tess.tcs_vertices_out;
2730 
2731       ir3_key.tessellation = tu6_get_tessmode(tes);
2732    }
2733 
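   /* The TCS only needs to store PrimitiveID if a later stage consumes it:
    * the FS reads it as a varying (inputs_read), while TES/GS read it as a
    * system value.
    */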
2734    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2735         stage = (gl_shader_stage) (stage + 1)) {
2736       if (!nir[stage])
2737          continue;
2738 
2739       if (stage > MESA_SHADER_TESS_CTRL) {
2740          if (stage == MESA_SHADER_FRAGMENT) {
2741             ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2742                (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2743          } else {
2744             ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2745                BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2746          }
2747       }
2748    }
2749 
2750    /* In the tess-but-not-FS case we don't know whether the FS will read
2751     * PrimID so we need to unconditionally store it.
2752     */
2753    if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT])
2754       ir3_key.tcs_store_primid = true;
2755 
2756    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2757         stage = (gl_shader_stage) (stage + 1)) {
2758       if (!nir[stage] || shaders[stage])
2759          continue;
2760 
2761       int64_t stage_start = os_time_get_nano();
2762 
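      /* Per-stage cache key: the 20-byte pipeline SHA1 with the stage index
       * appended as a 21st byte, so each stage gets a distinct key.
       */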
2763       unsigned char shader_sha1[21];
2764       memcpy(shader_sha1, pipeline_sha1, 20);
2765       shader_sha1[20] = (unsigned char) stage;
2766 
2767       result = tu_shader_create(device,
2768                                 &shaders[stage], nir[stage], &keys[stage],
2769                                 &ir3_key, shader_sha1, sizeof(shader_sha1),
2770                                 layout, !!nir_initial_disasm);
2771       if (result != VK_SUCCESS) {
2772          goto fail;
2773       }
2774 
2775       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2776    }
2777 
2778    ralloc_free(mem_ctx);
2779 
2780    return VK_SUCCESS;
2781 
2782 fail:
2783    ralloc_free(mem_ctx);
2784 
2785    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2786         stage = (gl_shader_stage) (stage + 1)) {
2787       if (shaders[stage]) {
2788          tu_shader_destroy(device, shaders[stage]);
2789       }
2790       if (nir_out && nir_out[stage]) {
2791          ralloc_free(nir_out[stage]);
2792       }
2793    }
2794 
2795    return result;
2796 }
2797 
2798 void
2799 tu_shader_key_subgroup_size(struct tu_shader_key *key,
2800                             bool allow_varying_subgroup_size,
2801                             bool require_full_subgroups,
2802                             const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info,
2803                             struct tu_device *dev)
2804 {
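   /* Pick both the wave size exposed to the API and the wave size(s) the
    * shader is actually allowed to execute at.  On HW without double
    * threadsize there is no choice; otherwise allow_varying_subgroup_size,
    * require_full_subgroups and any required subgroup size narrow the
    * options.
    */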
2805    enum ir3_wavesize_option api_wavesize, real_wavesize;
2806    if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
2807       api_wavesize = IR3_SINGLE_ONLY;
2808       real_wavesize = IR3_SINGLE_ONLY;
2809    } else {
2810       if (allow_varying_subgroup_size) {
2811          api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2812       } else {
2813          if (subgroup_info) {
2814             if (subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
2815                api_wavesize = IR3_SINGLE_ONLY;
2816             } else {
2817                assert(subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
2818                api_wavesize = IR3_DOUBLE_ONLY;
2819             }
2820          } else {
2821             /* Match the exposed subgroupSize. */
2822             api_wavesize = IR3_DOUBLE_ONLY;
2823          }
2824 
2825          if (require_full_subgroups)
2826             real_wavesize = api_wavesize;
2827          else if (api_wavesize == IR3_SINGLE_ONLY)
2828             real_wavesize = IR3_SINGLE_ONLY;
2829          else
2830             real_wavesize = IR3_SINGLE_OR_DOUBLE;
2831       }
2832    }
2833 
2834    key->api_wavesize = api_wavesize;
2835    key->real_wavesize = real_wavesize;
2836 }
2837 
2838 static VkResult
2839 tu_empty_shader_create(struct tu_device *dev,
2840                        struct tu_shader **shader_out,
2841                        gl_shader_stage stage)
2842 {
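   /* Build a placeholder shader for a stage with no user code: suballocate a
    * small BO and emit the register state for a NULL variant of the given
    * stage.
    */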
2843    struct tu_shader *shader = tu_shader_init(dev, NULL, 0);
2844 
2845    if (!shader)
2846       return VK_ERROR_OUT_OF_HOST_MEMORY;
2847 
2848    pthread_mutex_lock(&dev->pipeline_mutex);
2849    VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2850                                           32 * 4, 128);
2851    pthread_mutex_unlock(&dev->pipeline_mutex);
2852 
2853    if (result != VK_SUCCESS) {
2854       vk_free(&dev->vk.alloc, shader);
2855       return result;
2856    }
2857 
2858    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2859    tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2860 
2861    struct tu_pvtmem_config pvtmem_config = { };
2862 
2863    struct tu_cs sub_cs;
2864    tu_cs_begin_sub_stream(&shader->cs, 32, &sub_cs);
2865    TU_CALLX(dev, tu6_emit_variant)(&sub_cs, stage, NULL, &pvtmem_config, 0, 0);
2866    shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2867 
2868    *shader_out = shader;
2869    return VK_SUCCESS;
2870 }
2871 
2872 static VkResult
2873 tu_empty_fs_create(struct tu_device *dev, struct tu_shader **shader,
2874                    bool fragment_density_map)
2875 {
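   /* Build a no-op fragment shader: compile an empty NIR shader directly
    * through ir3, bypassing the usual tu_shader_create() path, and force LRZ
    * off when it will be used with a fragment density map.
    */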
2876    struct ir3_shader_key key = {};
2877    const struct ir3_shader_options options = {};
2878    struct ir3_stream_output_info so_info = {};
2879    const nir_shader_compiler_options *nir_options =
2880       ir3_get_compiler_options(dev->compiler);
2881    nir_builder fs_b;
2882 
2883    fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options,
2884                                          "noop_fs");
2885 
2886    *shader = tu_shader_init(dev, NULL, 0);
2887    if (!*shader)
2888       return VK_ERROR_OUT_OF_HOST_MEMORY;
2889 
2890    (*shader)->fs.has_fdm = fragment_density_map;
2891    if (fragment_density_map)
2892       (*shader)->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2893 
2894    for (unsigned i = 0; i < MAX_SETS; i++)
2895       (*shader)->dynamic_descriptor_sizes[i] = -1;
2896 
2897    struct ir3_shader *ir3_shader =
2898       ir3_shader_from_nir(dev->compiler, fs_b.shader, &options, &so_info);
2899    (*shader)->variant = ir3_shader_create_variant(ir3_shader, &key, false);
2900    ir3_shader_destroy(ir3_shader);
2901 
2902    return tu_upload_shader(dev, *shader);
2903 }
2904 
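/* Create the shared empty TCS/TES/GS shaders and the two no-op fragment
 * shaders (with and without FDM) that live on the device; on failure, unref
 * whatever was created before bailing out.
 */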
2905 VkResult
2906 tu_init_empty_shaders(struct tu_device *dev)
2907 {
2908    VkResult result;
2909 
2910    result = tu_empty_shader_create(dev, &dev->empty_tcs, MESA_SHADER_TESS_CTRL);
2911    if (result != VK_SUCCESS)
2912       goto out;
2913 
2914    result = tu_empty_shader_create(dev, &dev->empty_tes, MESA_SHADER_TESS_EVAL);
2915    if (result != VK_SUCCESS)
2916       goto out;
2917 
2918    result = tu_empty_shader_create(dev, &dev->empty_gs, MESA_SHADER_GEOMETRY);
2919    if (result != VK_SUCCESS)
2920       goto out;
2921 
2922    result = tu_empty_fs_create(dev, &dev->empty_fs, false);
2923    if (result != VK_SUCCESS)
2924       goto out;
2925 
2926    result = tu_empty_fs_create(dev, &dev->empty_fs_fdm, true);
2927    if (result != VK_SUCCESS)
2928       goto out;
2929 
2930    return VK_SUCCESS;
2931 
2932 out:
2933    if (dev->empty_tcs)
2934       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
2935    if (dev->empty_tes)
2936       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
2937    if (dev->empty_gs)
2938       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
2939    if (dev->empty_fs)
2940       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
2941    if (dev->empty_fs_fdm)
2942       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
2943    return result;
2944 }
2945 
2946 void
2947 tu_destroy_empty_shaders(struct tu_device *dev)
2948 {
2949    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
2950    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
2951    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
2952    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
2953    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
2954 }
2955 
2956 void
2957 tu_shader_destroy(struct tu_device *dev,
2958                   struct tu_shader *shader)
2959 {
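   /* Release everything the shader owns: its draw-state suballocation, the
    * private memory BO if one was allocated, and the ir3 variants.
    */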
2960    tu_cs_finish(&shader->cs);
2961    TU_RMV(resource_destroy, dev, &shader->bo);
2962 
2963    pthread_mutex_lock(&dev->pipeline_mutex);
2964    tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
2965    pthread_mutex_unlock(&dev->pipeline_mutex);
2966 
2967    if (shader->pvtmem_bo)
2968       tu_bo_finish(dev, shader->pvtmem_bo);
2969 
2970    if (shader->variant)
2971       ralloc_free((void *)shader->variant);
2972    if (shader->safe_const_variant)
2973       ralloc_free((void *)shader->safe_const_variant);
2974 
2975    vk_free(&dev->vk.alloc, shader);
2976 }
2977