1 /*
2  * Copyright © 2021 Collabora Ltd.
3  *
4  * Derived from tu_shader.c which is:
5  * Copyright © 2019 Google LLC
6  *
7  * Also derived from anv_pipeline.c which is
8  * Copyright © 2015 Intel Corporation
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a
11  * copy of this software and associated documentation files (the "Software"),
12  * to deal in the Software without restriction, including without limitation
13  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14  * and/or sell copies of the Software, and to permit persons to whom the
15  * Software is furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice (including the next
18  * paragraph) shall be included in all copies or substantial portions of the
19  * Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  */
29 
30 #include "genxml/gen_macros.h"
31 
32 #include "panvk_cmd_buffer.h"
33 #include "panvk_device.h"
34 #include "panvk_instance.h"
35 #include "panvk_mempool.h"
36 #include "panvk_physical_device.h"
37 #include "panvk_shader.h"
38 
39 #include "spirv/nir_spirv.h"
40 #include "util/memstream.h"
41 #include "util/mesa-sha1.h"
42 #include "util/u_dynarray.h"
43 #include "nir_builder.h"
44 #include "nir_conversion_builder.h"
45 #include "nir_deref.h"
46 
47 #include "vk_graphics_state.h"
48 #include "vk_shader_module.h"
49 
50 #include "compiler/bifrost_nir.h"
51 #include "util/pan_lower_framebuffer.h"
52 #include "pan_shader.h"
53 
54 #include "vk_log.h"
55 #include "vk_pipeline.h"
56 #include "vk_pipeline_layout.h"
57 #include "vk_shader.h"
58 #include "vk_util.h"
59 
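/* Sysvals are passed through the push constant buffer: the first 256 bytes
 * are reserved for the API push constants and the sysval struct is appended
 * right after, hence the fixed 256-byte bias on the offset below.
 */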
60 static nir_def *
61 load_sysval_from_push_const(nir_builder *b, unsigned offset, unsigned bit_size,
62                             unsigned num_comps)
63 {
64    return nir_load_push_constant(
65       b, num_comps, bit_size, nir_imm_int(b, 0),
66       /* Push constants are placed first, and then come the sysvals. */
67       .base = offset + 256, .range = num_comps * bit_size / 8);
68 }
69 
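/* Lower sysval-producing intrinsics (workgroup info, viewport, base
 * vertex/instance, blend constants, ...) into push constant loads pointing
 * at the matching field of panvk_{compute,graphics}_sysvals.
 */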
70 static bool
71 panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
72 {
73    if (instr->type != nir_instr_type_intrinsic)
74       return false;
75 
76    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
77    unsigned num_comps = intr->def.num_components;
78    unsigned bit_size = intr->def.bit_size;
79    nir_def *val = NULL;
80    b->cursor = nir_before_instr(instr);
81 
82 #define SYSVAL(ptype, name) offsetof(struct panvk_##ptype##_sysvals, name)
83    switch (intr->intrinsic) {
84    case nir_intrinsic_load_base_workgroup_id:
85       val = load_sysval_from_push_const(b, SYSVAL(compute, base), bit_size,
86                                         num_comps);
87       break;
88    case nir_intrinsic_load_num_workgroups:
89       val = load_sysval_from_push_const(b, SYSVAL(compute, num_work_groups),
90                                         bit_size, num_comps);
91       break;
92    case nir_intrinsic_load_workgroup_size:
93       val = load_sysval_from_push_const(b, SYSVAL(compute, local_group_size),
94                                         bit_size, num_comps);
95       break;
96    case nir_intrinsic_load_viewport_scale:
97       val = load_sysval_from_push_const(b, SYSVAL(graphics, viewport.scale),
98                                         bit_size, num_comps);
99       break;
100    case nir_intrinsic_load_viewport_offset:
101       val = load_sysval_from_push_const(b, SYSVAL(graphics, viewport.offset),
102                                         bit_size, num_comps);
103       break;
104    case nir_intrinsic_load_first_vertex:
105       val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.first_vertex),
106                                         bit_size, num_comps);
107       break;
108    case nir_intrinsic_load_base_vertex:
109       val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.base_vertex),
110                                         bit_size, num_comps);
111       break;
112    case nir_intrinsic_load_base_instance:
113       val = load_sysval_from_push_const(b, SYSVAL(graphics, vs.base_instance),
114                                         bit_size, num_comps);
115       break;
116    case nir_intrinsic_load_blend_const_color_rgba:
117       val = load_sysval_from_push_const(b, SYSVAL(graphics, blend.constants),
118                                         bit_size, num_comps);
119       break;
120    case nir_intrinsic_load_multisampled_pan:
121       val = load_sysval_from_push_const(b, SYSVAL(graphics, fs.multisampled),
122                                         bit_size, num_comps);
123       break;
124 
125 #if PAN_ARCH <= 7
126    case nir_intrinsic_load_layer_id:
127       assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
128       val = load_sysval_from_push_const(b, SYSVAL(graphics, layer_id), bit_size,
129                                         num_comps);
130       break;
131 #endif
132 
133    default:
134       return false;
135    }
136 #undef SYSVAL
137 
138    b->cursor = nir_after_instr(instr);
139    nir_def_rewrite_uses(&intr->def, val);
140    return true;
141 }
142 
143 #if PAN_ARCH <= 7
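/* Instruction callback backing lower_layer_writes(): gl_Layer copies are
 * dropped, and gl_Position is replaced with an all-infinity position when
 * the written layer doesn't match the layer_id sysval, so primitives
 * targeting other layers get discarded.
 */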
144 static bool
145 lower_gl_pos_layer_writes(nir_builder *b, nir_instr *instr, void *data)
146 {
147    if (instr->type != nir_instr_type_intrinsic)
148       return false;
149 
150    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
151 
152    if (intr->intrinsic != nir_intrinsic_copy_deref)
153       return false;
154 
155    nir_variable *dst_var = nir_intrinsic_get_var(intr, 0);
156    nir_variable *src_var = nir_intrinsic_get_var(intr, 1);
157 
158    if (!dst_var || dst_var->data.mode != nir_var_shader_out || !src_var ||
159        src_var->data.mode != nir_var_shader_temp)
160       return false;
161 
162    if (dst_var->data.location == VARYING_SLOT_LAYER) {
163       /* We don't really write the layer, we just make sure primitives are
164        * discarded if gl_Layer doesn't match the layer passed to the draw.
165        */
166       b->cursor = nir_instr_remove(instr);
167       return true;
168    }
169 
170    if (dst_var->data.location == VARYING_SLOT_POS) {
171       nir_variable *temp_layer_var = data;
172       nir_variable *temp_pos_var = src_var;
173 
174       b->cursor = nir_before_instr(instr);
175       nir_def *layer = nir_load_var(b, temp_layer_var);
176       nir_def *pos = nir_load_var(b, temp_pos_var);
177       nir_def *inf_pos = nir_imm_vec4(b, INFINITY, INFINITY, INFINITY, 1.0f);
178       nir_def *ref_layer = load_sysval_from_push_const(
179          b, offsetof(struct panvk_graphics_sysvals, layer_id), 32, 1);
180 
181       nir_store_var(b, temp_pos_var,
182                     nir_bcsel(b, nir_ieq(b, layer, ref_layer), pos, inf_pos),
183                     0xf);
184       return true;
185    }
186 
187    return false;
188 }
189 
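/* Layered rendering emulation for v7 and older: instead of emitting the
 * layer, primitives are culled against the layer_id passed through the
 * sysvals (see lower_gl_pos_layer_writes()).
 */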
190 static bool
191 lower_layer_writes(nir_shader *nir)
192 {
193    if (nir->info.stage == MESA_SHADER_FRAGMENT)
194       return false;
195 
196    nir_variable *temp_layer_var = NULL;
197    bool has_layer_var = false;
198 
199    nir_foreach_variable_with_modes(var, nir,
200                                    nir_var_shader_out | nir_var_shader_temp) {
201       if (var->data.mode == nir_var_shader_out &&
202           var->data.location == VARYING_SLOT_LAYER)
203          has_layer_var = true;
204 
205       if (var->data.mode == nir_var_shader_temp &&
206           var->data.location == VARYING_SLOT_LAYER)
207          temp_layer_var = var;
208    }
209 
210    if (!has_layer_var)
211       return false;
212 
213    assert(temp_layer_var);
214 
215    return nir_shader_instructions_pass(
216       nir, lower_gl_pos_layer_writes,
217       nir_metadata_block_index | nir_metadata_dominance, temp_layer_var);
218 }
219 #endif
220 
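/* Size/alignment callback used when lowering shared memory to explicit
 * types: booleans take four bytes and vec3 gets vec4 alignment.
 */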
221 static void
222 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
223 {
224    assert(glsl_type_is_vector_or_scalar(type));
225 
226    uint32_t comp_size =
227       glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
228    unsigned length = glsl_get_vector_elements(type);
229    *size = comp_size * length, *align = comp_size * (length == 3 ? 4 : length);
230 }
231 
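/* UBO/SSBO address formats depend on the architecture and on the requested
 * robustness: Bifrost (v7-) uses index+offset formats (bounded global for
 * robust SSBO accesses), while Valhall (v9+) keeps a vec2 index so the
 * descriptor lowering can track set/index/array-size until
 * valhall_pack_buf_idx() packs it back into a single index.
 */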
232 static inline nir_address_format
233 panvk_buffer_ubo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
234 {
235    switch (robustness) {
236    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
237    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
238    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
239       return PAN_ARCH <= 7 ? nir_address_format_32bit_index_offset
240                            : nir_address_format_vec2_index_32bit_offset;
241    default:
242       unreachable("Invalid robust buffer access behavior");
243    }
244 }
245 
246 static inline nir_address_format
247 panvk_buffer_ssbo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
248 {
249    switch (robustness) {
250    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
251       return PAN_ARCH <= 7 ? nir_address_format_64bit_global_32bit_offset
252                            : nir_address_format_vec2_index_32bit_offset;
253    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
254    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
255       return PAN_ARCH <= 7 ? nir_address_format_64bit_bounded_global
256                            : nir_address_format_vec2_index_32bit_offset;
257    default:
258       unreachable("Invalid robust buffer access behavior");
259    }
260 }
261 
262 static const nir_shader_compiler_options *
263 panvk_get_nir_options(UNUSED struct vk_physical_device *vk_pdev,
264                       UNUSED gl_shader_stage stage,
265                       UNUSED const struct vk_pipeline_robustness_state *rs)
266 {
267    return GENX(pan_shader_get_compiler_options)();
268 }
269 
270 static struct spirv_to_nir_options
271 panvk_get_spirv_options(UNUSED struct vk_physical_device *vk_pdev,
272                         UNUSED gl_shader_stage stage,
273                         const struct vk_pipeline_robustness_state *rs)
274 {
275    return (struct spirv_to_nir_options){
276       .ubo_addr_format = panvk_buffer_ubo_addr_format(rs->uniform_buffers),
277       .ssbo_addr_format = panvk_buffer_ssbo_addr_format(rs->storage_buffers),
278       .phys_ssbo_addr_format = nir_address_format_64bit_global,
279    };
280 }
281 
282 static void
283 panvk_preprocess_nir(UNUSED struct vk_physical_device *vk_pdev, nir_shader *nir)
284 {
285    /* Regroup output variables that share the same location */
286    if (nir->info.stage == MESA_SHADER_FRAGMENT)
287       NIR_PASS_V(nir, nir_lower_io_to_vector, nir_var_shader_out);
288 
289    NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir),
290               true, true);
291 
292 #if PAN_ARCH <= 7
293    /* This needs to be done just after the io_to_temporaries pass, because we
294     * rely on in/out temporaries to collect the final layer_id value. */
295    NIR_PASS_V(nir, lower_layer_writes);
296 #endif
297 
298    NIR_PASS_V(nir, nir_lower_indirect_derefs,
299               nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
300 
301    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
302    NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
303    NIR_PASS_V(nir, nir_opt_loop);
304 
305    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
306       struct nir_input_attachment_options lower_input_attach_opts = {
307          .use_fragcoord_sysval = true,
308          .use_layer_id_sysval = true,
309       };
310 
311       NIR_PASS_V(nir, nir_lower_input_attachments, &lower_input_attach_opts);
312    }
313 
314    /* Do texture lowering here.  Yes, it's a duplication of the texture
315     * lowering in bifrost_compile.  However, we need to lower texture stuff
316     * now, before we call panvk_per_arch(nir_lower_descriptors)() because some
317     * of the texture lowering generates nir_texop_txs which we handle as part
318     * of descriptor lowering.
319     *
320     * TODO: We really should be doing this in common code, not duplicated in
321     * panvk.  In order to do that, we need to rework the panfrost compile
322     * flow to look more like the Intel flow:
323     *
324     *  1. Compile SPIR-V to NIR and maybe do a tiny bit of lowering that needs
325     *     to be done really early.
326     *
327     *  2. pan_preprocess_nir: Does common lowering and runs the optimization
328     *     loop.  Nothing here should be API-specific.
329     *
330     *  3. Do additional lowering in panvk
331     *
332     *  4. pan_postprocess_nir: Does final lowering and runs the optimization
333     *     loop again.  This can happen as part of the final compile.
334     *
335     * This would give us a better place to do panvk-specific lowering.
336     */
337    nir_lower_tex_options lower_tex_options = {
338       .lower_txs_lod = true,
339       .lower_txp = ~0,
340       .lower_tg4_broadcom_swizzle = true,
341       .lower_txd = true,
342       .lower_invalid_implicit_lod = true,
343    };
344    NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
345    NIR_PASS_V(nir, nir_lower_system_values);
346 
347    nir_lower_compute_system_values_options options = {
348       .has_base_workgroup_id = true,
349    };
350 
351    NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
352 
353    if (nir->info.stage == MESA_SHADER_FRAGMENT)
354       NIR_PASS_V(nir, nir_lower_wpos_center);
355 
356    NIR_PASS_V(nir, nir_split_var_copies);
357    NIR_PASS_V(nir, nir_lower_var_copies);
358 }
359 
360 static void
361 panvk_hash_graphics_state(struct vk_physical_device *device,
362                           const struct vk_graphics_pipeline_state *state,
363                           VkShaderStageFlags stages, blake3_hash blake3_out)
364 {
365    struct mesa_blake3 blake3_ctx;
366    _mesa_blake3_init(&blake3_ctx);
367 
368    /* We don't need to do anything here yet */
369 
370    _mesa_blake3_final(&blake3_ctx, blake3_out);
371 }
372 
373 #if PAN_ARCH >= 9
374 static bool
375 valhall_pack_buf_idx(nir_builder *b, nir_instr *instr, UNUSED void *data)
376 {
377    if (instr->type != nir_instr_type_intrinsic)
378       return false;
379 
380    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
381    unsigned index_src;
382 
383    switch (intrin->intrinsic) {
384    case nir_intrinsic_load_ubo:
385    case nir_intrinsic_load_ssbo:
386    case nir_intrinsic_ssbo_atomic:
387    case nir_intrinsic_ssbo_atomic_swap:
388       index_src = 0;
389       break;
390 
391    case nir_intrinsic_store_ssbo:
392       index_src = 1;
393       break;
394 
395    default:
396       return false;
397    }
398 
399    nir_def *index = intrin->src[index_src].ssa;
400 
401    /* The descriptor lowering pass can add UBO loads, and those already have the
402     * right index format. */
403    if (index->num_components == 1)
404       return false;
405 
406    b->cursor = nir_before_instr(&intrin->instr);
407 
408    /* The valhall backend expects nir_address_format_32bit_index_offset,
409     * but we use nir_address_format_vec2_index_32bit_offset so we can keep
410     * the array size, set and index around without losing information while
411     * walking the descriptor deref chain (needed to do a bounds check on the
412     * array index when we reach the end of the chain).
413     * Turn it back into nir_address_format_32bit_index_offset once IOs have
414     * been lowered. */
415    nir_def *packed_index =
416       nir_iadd(b, nir_channel(b, index, 0), nir_channel(b, index, 1));
417    nir_src_rewrite(&intrin->src[index_src], packed_index);
418    return true;
419 }
420 #endif
421 
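/* panvk-specific lowering: descriptor accesses, explicit IO for
 * UBO/SSBO/push-constant/global/shared memory, IO variable locations, and
 * finally the sysval-to-push-constant lowering.
 */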
422 static void
423 panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
424                 uint32_t set_layout_count,
425                 struct vk_descriptor_set_layout *const *set_layouts,
426                 const struct vk_pipeline_robustness_state *rs,
427                 const struct panfrost_compile_inputs *compile_input,
428                 struct panvk_shader *shader)
429 {
430    struct panvk_instance *instance =
431       to_panvk_instance(dev->vk.physical->instance);
432    gl_shader_stage stage = nir->info.stage;
433 
434    NIR_PASS_V(nir, panvk_per_arch(nir_lower_descriptors), dev, set_layout_count,
435               set_layouts, shader);
436 
437    NIR_PASS_V(nir, nir_split_var_copies);
438    NIR_PASS_V(nir, nir_lower_var_copies);
439 
440    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
441               panvk_buffer_ubo_addr_format(rs->uniform_buffers));
442    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
443               panvk_buffer_ssbo_addr_format(rs->storage_buffers));
444    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
445               nir_address_format_32bit_offset);
446    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_global,
447               nir_address_format_64bit_global);
448 
449 #if PAN_ARCH >= 9
450    NIR_PASS_V(nir, nir_shader_instructions_pass, valhall_pack_buf_idx,
451               nir_metadata_block_index | nir_metadata_dominance, NULL);
452 #endif
453 
454    if (gl_shader_stage_uses_workgroup(stage)) {
455       if (!nir->info.shared_memory_explicit_layout) {
456          NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
457                     shared_type_info);
458       }
459 
460       NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared,
461                  nir_address_format_32bit_offset);
462    }
463 
464    if (stage == MESA_SHADER_VERTEX) {
465       /* We need the driver_location to match the vertex attribute location,
466        * so we can use the attribute layout described by
467        * vk_vertex_input_state where there are holes in the attribute locations.
468        */
469       nir_foreach_shader_in_variable(var, nir) {
470          assert(var->data.location >= VERT_ATTRIB_GENERIC0 &&
471                 var->data.location <= VERT_ATTRIB_GENERIC15);
472          var->data.driver_location = var->data.location - VERT_ATTRIB_GENERIC0;
473       }
474    } else {
475       nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
476                                   stage);
477    }
478 
479    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
480                                stage);
481 
482    /* Needed to turn shader_temp into function_temp since the backend only
483     * handles the latter for now.
484     */
485    NIR_PASS_V(nir, nir_lower_global_vars_to_local);
486 
487    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
488    if (unlikely(instance->debug_flags & PANVK_DEBUG_NIR)) {
489       fprintf(stderr, "translated nir:\n");
490       nir_print_shader(nir, stderr);
491    }
492 
493    pan_shader_preprocess(nir, compile_input->gpu_id);
494 
495    if (stage == MESA_SHADER_VERTEX)
496       NIR_PASS_V(nir, pan_lower_image_index, MAX_VS_ATTRIBS);
497 
498    NIR_PASS_V(nir, nir_shader_instructions_pass, panvk_lower_sysvals,
499               nir_metadata_control_flow, NULL);
500 }
501 
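/* Run the backend compiler on the lowered NIR, keep a host copy of the
 * binary, optionally capture the NIR/assembly strings for
 * VK_KHR_pipeline_executable_properties, and patch the descriptor/attribute
 * counts expected by the Bifrost RSD.
 */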
502 static VkResult
503 panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
504                   VkShaderCreateFlagsEXT shader_flags,
505                   struct panfrost_compile_inputs *compile_input,
506                   struct panvk_shader *shader)
507 {
508    const bool dump_asm =
509       shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA;
510 
511    struct util_dynarray binary;
512    util_dynarray_init(&binary, NULL);
513    GENX(pan_shader_compile)(nir, compile_input, &binary, &shader->info);
514 
515    void *bin_ptr = util_dynarray_element(&binary, uint8_t, 0);
516    unsigned bin_size = util_dynarray_num_elements(&binary, uint8_t);
517 
518    shader->bin_size = 0;
519    shader->bin_ptr = NULL;
520 
521    if (bin_size) {
522       void *data = malloc(bin_size);
523 
524       if (data == NULL)
525          return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
526 
527       memcpy(data, bin_ptr, bin_size);
528       shader->bin_size = bin_size;
529       shader->bin_ptr = data;
530    }
531    util_dynarray_fini(&binary);
532 
533    if (dump_asm) {
534       shader->nir_str = nir_shader_as_str(nir, NULL);
535 
536       char *data = NULL;
537       size_t disasm_size = 0;
538 
539       if (shader->bin_size) {
540          struct u_memstream mem;
541          if (u_memstream_open(&mem, &data, &disasm_size)) {
542             FILE *const stream = u_memstream_get(&mem);
543             pan_shader_disassemble(stream, shader->bin_ptr, shader->bin_size,
544                                    compile_input->gpu_id, false);
545             u_memstream_close(&mem);
546          }
547       }
548 
549       char *asm_str = malloc(disasm_size + 1);
550       memcpy(asm_str, data, disasm_size);
551       asm_str[disasm_size] = '\0';
552       free(data);
553 
554       shader->asm_str = asm_str;
555    }
556 
557 #if PAN_ARCH <= 7
558    /* Patch the descriptor count */
559    shader->info.ubo_count =
560       shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_UBO] +
561       shader->desc_info.dyn_ubos.count;
562    shader->info.texture_count =
563       shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_TEXTURE];
564    shader->info.sampler_count =
565       shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_SAMPLER];
566 
567    /* Dummy sampler. */
568    if (!shader->info.sampler_count && shader->info.texture_count)
569       shader->info.sampler_count++;
570 
571    if (nir->info.stage == MESA_SHADER_VERTEX) {
572       /* We leave holes in the attribute locations, but pan_shader.c assumes the
573        * opposite. Patch attribute_count accordingly, so
574        * pan_shader_prepare_rsd() does what we expect.
575        */
576       uint32_t gen_attribs =
577          (shader->info.attributes_read & VERT_BIT_GENERIC_ALL) >>
578          VERT_ATTRIB_GENERIC0;
579 
580       shader->info.attribute_count = util_last_bit(gen_attribs);
581 
582       /* NULL IDVS shaders are not allowed. */
583       if (!bin_size)
584          shader->info.vs.idvs = false;
585    }
586 
587    /* Image attributes start at MAX_VS_ATTRIBS in the VS attribute table,
588     * and at zero in other stages.
589     */
590    if (shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] > 0)
591       shader->info.attribute_count =
592          shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] +
593          (nir->info.stage == MESA_SHADER_VERTEX ? MAX_VS_ATTRIBS : 0);
594 #endif
595 
596    shader->local_size.x = nir->info.workgroup_size[0];
597    shader->local_size.y = nir->info.workgroup_size[1];
598    shader->local_size.z = nir->info.workgroup_size[2];
599 
600    return VK_SUCCESS;
601 }
602 
603 #if PAN_ARCH >= 9
604 static enum mali_flush_to_zero_mode
605 shader_ftz_mode(struct panvk_shader *shader)
606 {
607    if (shader->info.ftz_fp32) {
608       if (shader->info.ftz_fp16)
609          return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
610       else
611          return MALI_FLUSH_TO_ZERO_MODE_DX11;
612    } else {
613       /* We don't have a "flush FP16, preserve FP32" mode, but APIs
614        * should not be able to generate that.
615        */
616       assert(!shader->info.ftz_fp16 && !shader->info.ftz_fp32);
617       return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
618    }
619 }
620 #endif
621 
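/* Upload the binary to the executable pool and pre-bake the static hardware
 * descriptors: a RENDERER_STATE on v7- (fragment shaders are skipped since
 * their RSD also depends on dynamic state), and SHADER_PROGRAM descriptors
 * on v9+, with separate position (points/triangles) and varying programs
 * for IDVS vertex shaders.
 */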
622 static VkResult
623 panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
624                     const VkAllocationCallbacks *pAllocator)
625 {
626    shader->code_mem = (struct panvk_priv_mem){0};
627 
628 #if PAN_ARCH <= 7
629    shader->rsd = (struct panvk_priv_mem){0};
630 #else
631    shader->spd = (struct panvk_priv_mem){0};
632 #endif
633 
634    if (!shader->bin_size)
635       return VK_SUCCESS;
636 
637    shader->code_mem = panvk_pool_upload_aligned(
638       &dev->mempools.exec, shader->bin_ptr, shader->bin_size, 128);
639 
640 #if PAN_ARCH <= 7
641    if (shader->info.stage == MESA_SHADER_FRAGMENT)
642       return VK_SUCCESS;
643 
644    shader->rsd = panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE);
645 
646    pan_pack(panvk_priv_mem_host_addr(shader->rsd), RENDERER_STATE, cfg) {
647       pan_shader_prepare_rsd(&shader->info, panvk_shader_get_dev_addr(shader),
648                              &cfg);
649    }
650 #else
651    if (shader->info.stage != MESA_SHADER_VERTEX) {
652       shader->spd = panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
653 
654       pan_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM, cfg) {
655          cfg.stage = pan_shader_stage(&shader->info);
656 
657          if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
658             cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
659          else if (cfg.stage == MALI_SHADER_STAGE_VERTEX)
660             cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
661 
662          cfg.register_allocation =
663             pan_register_allocation(shader->info.work_reg_count);
664          cfg.binary = panvk_shader_get_dev_addr(shader);
665          cfg.preload.r48_r63 = (shader->info.preload >> 48);
666          cfg.flush_to_zero_mode = shader_ftz_mode(shader);
667 
668          if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
669             cfg.requires_helper_threads = shader->info.contains_barrier;
670       }
671    } else {
672       shader->spds.pos_points =
673          panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
674       pan_pack(panvk_priv_mem_host_addr(shader->spds.pos_points),
675                SHADER_PROGRAM, cfg) {
676          cfg.stage = pan_shader_stage(&shader->info);
677          cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
678          cfg.register_allocation =
679             pan_register_allocation(shader->info.work_reg_count);
680          cfg.binary = panvk_shader_get_dev_addr(shader);
681          cfg.preload.r48_r63 = (shader->info.preload >> 48);
682          cfg.flush_to_zero_mode = shader_ftz_mode(shader);
683       }
684 
685       shader->spds.pos_triangles =
686          panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
687       pan_pack(panvk_priv_mem_host_addr(shader->spds.pos_triangles),
688                SHADER_PROGRAM, cfg) {
689          cfg.stage = pan_shader_stage(&shader->info);
690          cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
691          cfg.register_allocation =
692             pan_register_allocation(shader->info.work_reg_count);
693          cfg.binary =
694             panvk_shader_get_dev_addr(shader) + shader->info.vs.no_psiz_offset;
695          cfg.preload.r48_r63 = (shader->info.preload >> 48);
696          cfg.flush_to_zero_mode = shader_ftz_mode(shader);
697       }
698 
699       if (shader->info.vs.secondary_enable) {
700          shader->spds.var =
701             panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
702          pan_pack(panvk_priv_mem_host_addr(shader->spds.var), SHADER_PROGRAM,
703                   cfg) {
704             unsigned work_count = shader->info.vs.secondary_work_reg_count;
705 
706             cfg.stage = pan_shader_stage(&shader->info);
707             cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
708             cfg.register_allocation = pan_register_allocation(work_count);
709             cfg.binary = panvk_shader_get_dev_addr(shader) +
710                          shader->info.vs.secondary_offset;
711             cfg.preload.r48_r63 = (shader->info.vs.secondary_preload >> 48);
712             cfg.flush_to_zero_mode = shader_ftz_mode(shader);
713          }
714       }
715    }
716 #endif
717 
718    return VK_SUCCESS;
719 }
720 
721 static void
722 panvk_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
723                      const VkAllocationCallbacks *pAllocator)
724 {
725    struct panvk_device *dev = to_panvk_device(vk_dev);
726    struct panvk_shader *shader =
727       container_of(vk_shader, struct panvk_shader, vk);
728 
729    free((void *)shader->asm_str);
730    ralloc_free((void *)shader->nir_str);
731 
732    panvk_pool_free_mem(&dev->mempools.exec, shader->code_mem);
733 
734 #if PAN_ARCH <= 7
735    panvk_pool_free_mem(&dev->mempools.exec, shader->rsd);
736    panvk_pool_free_mem(&dev->mempools.exec, shader->desc_info.others.map);
737 #else
738    panvk_pool_free_mem(&dev->mempools.exec, shader->spd);
739 #endif
740 
741    free((void *)shader->bin_ptr);
742    vk_shader_free(&dev->vk, pAllocator, &shader->vk);
743 }
744 
745 static const struct vk_shader_ops panvk_shader_ops;
746 
747 static VkResult
748 panvk_compile_shader(struct panvk_device *dev,
749                      struct vk_shader_compile_info *info,
750                      const struct vk_graphics_pipeline_state *state,
751                      const VkAllocationCallbacks *pAllocator,
752                      struct vk_shader **shader_out)
753 {
754    struct panvk_physical_device *phys_dev =
755       to_panvk_physical_device(dev->vk.physical);
756 
757    struct panvk_shader *shader;
758    VkResult result;
759 
760    /* We consume the NIR, regardless of success or failure */
761    nir_shader *nir = info->nir;
762 
763    shader = vk_shader_zalloc(&dev->vk, &panvk_shader_ops, info->stage,
764                              pAllocator, sizeof(*shader));
765    if (shader == NULL)
766       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
767 
768    struct panfrost_compile_inputs inputs = {
769       .gpu_id = phys_dev->kmod.props.gpu_prod_id,
770       .no_ubo_to_push = true,
771    };
772 
773    panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,
774                    info->robustness, &inputs, shader);
775 
776    result = panvk_compile_nir(dev, nir, info->flags, &inputs, shader);
777 
778    if (result != VK_SUCCESS) {
779       panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
780       return result;
781    }
782 
783    result = panvk_shader_upload(dev, shader, pAllocator);
784 
785    if (result != VK_SUCCESS) {
786       panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
787       return result;
788    }
789 
790    *shader_out = &shader->vk;
791 
792    return result;
793 }
794 
795 static VkResult
796 panvk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
797                       struct vk_shader_compile_info *infos,
798                       const struct vk_graphics_pipeline_state *state,
799                       const VkAllocationCallbacks *pAllocator,
800                       struct vk_shader **shaders_out)
801 {
802    struct panvk_device *dev = to_panvk_device(vk_dev);
803    VkResult result;
804    uint32_t i;
805 
806    for (i = 0; i < shader_count; i++) {
807       result = panvk_compile_shader(dev, &infos[i], state, pAllocator,
808                                     &shaders_out[i]);
809 
810       /* Clean up NIR for the current shader */
811       ralloc_free(infos[i].nir);
812 
813       if (result != VK_SUCCESS)
814          goto err_cleanup;
815    }
816 
817    /* TODO: If we get multiple shaders here, we can perform part of the link
818     * logic at compile time. */
819 
820    return VK_SUCCESS;
821 
822 err_cleanup:
823    /* Clean up all the shaders before this point */
824    for (uint32_t j = 0; j < i; j++)
825       panvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator);
826 
827    /* Clean up all the NIR after this point */
828    for (uint32_t j = i + 1; j < shader_count; j++)
829       ralloc_free(infos[j].nir);
830 
831    /* Memset the output array */
832    memset(shaders_out, 0, shader_count * sizeof(*shaders_out));
833 
834    return result;
835 }
836 
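/* Read back the descriptor metadata written by shader_desc_info_serialize().
 * Both functions must be kept in sync.
 */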
837 static VkResult
838 shader_desc_info_deserialize(struct blob_reader *blob,
839                              struct panvk_shader *shader)
840 {
841    shader->desc_info.used_set_mask = blob_read_uint32(blob);
842 
843 #if PAN_ARCH <= 7
844    shader->desc_info.dyn_ubos.count = blob_read_uint32(blob);
845    blob_copy_bytes(blob, shader->desc_info.dyn_ubos.map,
846                    shader->desc_info.dyn_ubos.count);
847    shader->desc_info.dyn_ssbos.count = blob_read_uint32(blob);
848    blob_copy_bytes(blob, shader->desc_info.dyn_ssbos.map,
849                    shader->desc_info.dyn_ssbos.count);
850 
851    uint32_t others_count = 0;
852    for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
853       shader->desc_info.others.count[i] = blob_read_uint32(blob);
854       others_count += shader->desc_info.others.count[i];
855    }
856 
857    if (others_count) {
858       struct panvk_device *dev = to_panvk_device(shader->vk.base.device);
859       struct panvk_pool_alloc_info alloc_info = {
860          .size = others_count * sizeof(uint32_t),
861          .alignment = sizeof(uint32_t),
862       };
863       shader->desc_info.others.map =
864          panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
865       uint32_t *copy_table =
866          panvk_priv_mem_host_addr(shader->desc_info.others.map);
867 
868       if (!copy_table)
869          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
870 
871       blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table));
872    }
873 #else
874    shader->desc_info.dyn_bufs.count = blob_read_uint32(blob);
875    blob_copy_bytes(blob, shader->desc_info.dyn_bufs.map,
876                    shader->desc_info.dyn_bufs.count);
877 #endif
878 
879    return VK_SUCCESS;
880 }
881 
882 static VkResult
883 panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob,
884                          uint32_t binary_version,
885                          const VkAllocationCallbacks *pAllocator,
886                          struct vk_shader **shader_out)
887 {
888    struct panvk_device *device = to_panvk_device(vk_dev);
889    struct panvk_shader *shader;
890    VkResult result;
891 
892    struct pan_shader_info info;
893    blob_copy_bytes(blob, &info, sizeof(info));
894 
895    struct pan_compute_dim local_size;
896    blob_copy_bytes(blob, &local_size, sizeof(local_size));
897 
898    const uint32_t bin_size = blob_read_uint32(blob);
899 
900    if (blob->overrun)
901       return vk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
902 
903    shader = vk_shader_zalloc(vk_dev, &panvk_shader_ops, info.stage, pAllocator,
904                              sizeof(*shader));
905    if (shader == NULL)
906       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
907 
908    shader->info = info;
909    shader->local_size = local_size;
910    shader->bin_size = bin_size;
911 
912    shader->bin_ptr = malloc(bin_size);
913    if (shader->bin_ptr == NULL) {
914       panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
915       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
916    }
917 
918    blob_copy_bytes(blob, (void *)shader->bin_ptr, shader->bin_size);
919 
920    result = shader_desc_info_deserialize(blob, shader);
921 
922    if (result != VK_SUCCESS) {
923       panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
924       return vk_error(device, result);
925    }
926 
927    if (blob->overrun) {
928       panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
929       return vk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
930    }
931 
932    result = panvk_shader_upload(device, shader, pAllocator);
933 
934    if (result != VK_SUCCESS) {
935       panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
936       return result;
937    }
938 
939    *shader_out = &shader->vk;
940 
941    return result;
942 }
943 
944 static void
945 shader_desc_info_serialize(struct blob *blob, const struct panvk_shader *shader)
946 {
947    blob_write_uint32(blob, shader->desc_info.used_set_mask);
948 
949 #if PAN_ARCH <= 7
950    blob_write_uint32(blob, shader->desc_info.dyn_ubos.count);
951    blob_write_bytes(blob, shader->desc_info.dyn_ubos.map,
952                     sizeof(*shader->desc_info.dyn_ubos.map) *
953                        shader->desc_info.dyn_ubos.count);
954    blob_write_uint32(blob, shader->desc_info.dyn_ssbos.count);
955    blob_write_bytes(blob, shader->desc_info.dyn_ssbos.map,
956                     sizeof(*shader->desc_info.dyn_ssbos.map) *
957                        shader->desc_info.dyn_ssbos.count);
958 
959    unsigned others_count = 0;
960    for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
961       blob_write_uint32(blob, shader->desc_info.others.count[i]);
962       others_count += shader->desc_info.others.count[i];
963    }
964 
965    blob_write_bytes(blob,
966                     panvk_priv_mem_host_addr(shader->desc_info.others.map),
967                     sizeof(uint32_t) * others_count);
968 #else
969    blob_write_uint32(blob, shader->desc_info.dyn_bufs.count);
970    blob_write_bytes(blob, shader->desc_info.dyn_bufs.map,
971                     sizeof(*shader->desc_info.dyn_bufs.map) *
972                        shader->desc_info.dyn_bufs.count);
973 #endif
974 }
975 
976 static bool
977 panvk_shader_serialize(struct vk_device *vk_dev,
978                        const struct vk_shader *vk_shader, struct blob *blob)
979 {
980    struct panvk_shader *shader =
981       container_of(vk_shader, struct panvk_shader, vk);
982 
983    /**
984     * We can't currently cache assembly
985     * TODO: Implement serialization with assembly
986     **/
987    if (shader->nir_str != NULL || shader->asm_str != NULL)
988       return false;
989 
990    blob_write_bytes(blob, &shader->info, sizeof(shader->info));
991    blob_write_bytes(blob, &shader->local_size, sizeof(shader->local_size));
992    blob_write_uint32(blob, shader->bin_size);
993    blob_write_bytes(blob, shader->bin_ptr, shader->bin_size);
994    shader_desc_info_serialize(blob, shader);
995 
996    return !blob->out_of_memory;
997 }
998 
999 #define WRITE_STR(field, ...)                                                  \
1000    ({                                                                          \
1001       memset(field, 0, sizeof(field));                                         \
1002       UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__);              \
1003       assert(i > 0 && i < sizeof(field));                                      \
1004    })
1005 
1006 static VkResult
1007 panvk_shader_get_executable_properties(
1008    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1009    uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties)
1010 {
1011    UNUSED struct panvk_shader *shader =
1012       container_of(vk_shader, struct panvk_shader, vk);
1013 
1014    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties,
1015                           executable_count);
1016 
1017    vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props)
1018    {
1019       props->stages = mesa_to_vk_shader_stage(shader->info.stage);
1020       props->subgroupSize = 8;
1021       WRITE_STR(props->name, "%s",
1022                 _mesa_shader_stage_to_string(shader->info.stage));
1023       WRITE_STR(props->description, "%s shader",
1024                 _mesa_shader_stage_to_string(shader->info.stage));
1025    }
1026 
1027    return vk_outarray_status(&out);
1028 }
1029 
1030 static VkResult
1031 panvk_shader_get_executable_statistics(
1032    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1033    uint32_t executable_index, uint32_t *statistic_count,
1034    VkPipelineExecutableStatisticKHR *statistics)
1035 {
1036    UNUSED struct panvk_shader *shader =
1037       container_of(vk_shader, struct panvk_shader, vk);
1038 
1039    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
1040                           statistic_count);
1041 
1042    assert(executable_index == 0);
1043 
1044    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
1045    {
1046       WRITE_STR(stat->name, "Code Size");
1047       WRITE_STR(stat->description,
1048                 "Size of the compiled shader binary, in bytes");
1049       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1050       stat->value.u64 = shader->bin_size;
1051    }
1052 
1053    /* TODO: more executable statistics (VK_KHR_pipeline_executable_properties) */
1054 
1055    return vk_outarray_status(&out);
1056 }
1057 
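/* Copy an internal representation string, following the usual Vulkan
 * convention: when pData is NULL only the required size is reported,
 * otherwise the string is truncated to dataSize and false is returned if it
 * didn't fit.
 */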
1058 static bool
1059 write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir,
1060               const char *data)
1061 {
1062    ir->isText = VK_TRUE;
1063 
1064    size_t data_len = strlen(data) + 1;
1065 
1066    if (ir->pData == NULL) {
1067       ir->dataSize = data_len;
1068       return true;
1069    }
1070 
1071    strncpy(ir->pData, data, ir->dataSize);
1072    if (ir->dataSize < data_len)
1073       return false;
1074 
1075    ir->dataSize = data_len;
1076    return true;
1077 }
1078 
1079 static VkResult
1080 panvk_shader_get_executable_internal_representations(
1081    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1082    uint32_t executable_index, uint32_t *internal_representation_count,
1083    VkPipelineExecutableInternalRepresentationKHR *internal_representations)
1084 {
1085    UNUSED struct panvk_shader *shader =
1086       container_of(vk_shader, struct panvk_shader, vk);
1087    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
1088                           internal_representations,
1089                           internal_representation_count);
1090    bool incomplete_text = false;
1091 
1092    if (shader->nir_str != NULL) {
1093       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
1094                                &out, ir)
1095       {
1096          WRITE_STR(ir->name, "NIR shader");
1097          WRITE_STR(ir->description,
1098                    "NIR shader before sending to the back-end compiler");
1099          if (!write_ir_text(ir, shader->nir_str))
1100             incomplete_text = true;
1101       }
1102    }
1103 
1104    if (shader->asm_str != NULL) {
1105       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
1106                                &out, ir)
1107       {
1108          WRITE_STR(ir->name, "Assembly");
1109          WRITE_STR(ir->description, "Final Assembly");
1110          if (!write_ir_text(ir, shader->asm_str))
1111             incomplete_text = true;
1112       }
1113    }
1114 
1115    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
1116 }
1117 
1118 static mali_pixel_format
1119 get_varying_format(gl_shader_stage stage, gl_varying_slot loc,
1120                    enum pipe_format pfmt)
1121 {
1122    switch (loc) {
1123    case VARYING_SLOT_PNTC:
1124    case VARYING_SLOT_PSIZ:
1125 #if PAN_ARCH <= 6
1126       return (MALI_R16F << 12) | panfrost_get_default_swizzle(1);
1127 #else
1128       return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000;
1129 #endif
1130    case VARYING_SLOT_POS:
1131 #if PAN_ARCH <= 6
1132       return (MALI_SNAP_4 << 12) | panfrost_get_default_swizzle(4);
1133 #else
1134       return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA;
1135 #endif
1136    default:
1137       assert(pfmt != PIPE_FORMAT_NONE);
1138       return GENX(panfrost_format_from_pipe_format)(pfmt)->hw;
1139    }
1140 }
1141 
1142 struct varyings_info {
1143    enum pipe_format fmts[VARYING_SLOT_MAX];
1144    BITSET_DECLARE(active, VARYING_SLOT_MAX);
1145 };
1146 
1147 static void
1148 collect_varyings_info(const struct pan_shader_varying *varyings,
1149                       unsigned varying_count, struct varyings_info *info)
1150 {
1151    for (unsigned i = 0; i < varying_count; i++) {
1152       gl_varying_slot loc = varyings[i].location;
1153 
1154       if (varyings[i].format == PIPE_FORMAT_NONE)
1155          continue;
1156 
1157       info->fmts[loc] = varyings[i].format;
1158       BITSET_SET(info->active, loc);
1159    }
1160 }
1161 
1162 static inline enum panvk_varying_buf_id
1163 varying_buf_id(gl_varying_slot loc)
1164 {
1165    switch (loc) {
1166    case VARYING_SLOT_POS:
1167       return PANVK_VARY_BUF_POSITION;
1168    case VARYING_SLOT_PSIZ:
1169       return PANVK_VARY_BUF_PSIZ;
1170    default:
1171       return PANVK_VARY_BUF_GENERAL;
1172    }
1173 }
1174 
1175 static mali_pixel_format
1176 varying_format(gl_varying_slot loc, enum pipe_format pfmt)
1177 {
1178    switch (loc) {
1179    case VARYING_SLOT_PNTC:
1180    case VARYING_SLOT_PSIZ:
1181 #if PAN_ARCH <= 6
1182       return (MALI_R16F << 12) | panfrost_get_default_swizzle(1);
1183 #else
1184       return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000;
1185 #endif
1186    case VARYING_SLOT_POS:
1187 #if PAN_ARCH <= 6
1188       return (MALI_SNAP_4 << 12) | panfrost_get_default_swizzle(4);
1189 #else
1190       return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA;
1191 #endif
1192    default:
1193       return GENX(panfrost_format_from_pipe_format)(pfmt)->hw;
1194    }
1195 }
1196 
1197 static VkResult
1198 emit_varying_attrs(struct panvk_pool *desc_pool,
1199                    const struct pan_shader_varying *varyings,
1200                    unsigned varying_count, const struct varyings_info *info,
1201                    unsigned *buf_offsets, struct panvk_priv_mem *mem)
1202 {
1203    unsigned attr_count = BITSET_COUNT(info->active);
1204 
1205    *mem = panvk_pool_alloc_desc_array(desc_pool, attr_count, ATTRIBUTE);
1206 
1207    if (attr_count && !panvk_priv_mem_dev_addr(*mem))
1208       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1209 
1210    struct mali_attribute_packed *attrs = panvk_priv_mem_host_addr(*mem);
1211    unsigned attr_idx = 0;
1212 
1213    for (unsigned i = 0; i < varying_count; i++) {
1214       pan_pack(&attrs[attr_idx++], ATTRIBUTE, cfg) {
1215          gl_varying_slot loc = varyings[i].location;
1216          enum pipe_format pfmt = varyings[i].format != PIPE_FORMAT_NONE
1217                                     ? info->fmts[loc]
1218                                     : PIPE_FORMAT_NONE;
1219 
1220          if (pfmt == PIPE_FORMAT_NONE) {
1221 #if PAN_ARCH >= 7
1222             cfg.format = (MALI_CONSTANT << 12) | MALI_RGB_COMPONENT_ORDER_0000;
1223 #else
1224             cfg.format = (MALI_CONSTANT << 12) | PAN_V6_SWIZZLE(0, 0, 0, 0);
1225 #endif
1226          } else {
1227             cfg.buffer_index = varying_buf_id(loc);
1228             cfg.offset = buf_offsets[loc];
1229             cfg.format = varying_format(loc, info->fmts[loc]);
1230          }
1231          cfg.offset_enable = false;
1232       }
1233    }
1234 
1235    return VK_SUCCESS;
1236 }
1237 
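/* Link VS outputs with FS inputs: on v9+ only the general varying buffer
 * stride matters, while on v7- we emit one ATTRIBUTE descriptor per varying
 * and compute per-buffer strides/offsets, with gl_Position and gl_PointSize
 * going to their dedicated buffers.
 */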
1238 VkResult
1239 panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
1240                              const struct panvk_shader *vs,
1241                              const struct panvk_shader *fs,
1242                              struct panvk_shader_link *link)
1243 {
1244    BITSET_DECLARE(active_attrs, VARYING_SLOT_MAX) = {0};
1245    unsigned buf_strides[PANVK_VARY_BUF_MAX] = {0};
1246    unsigned buf_offsets[VARYING_SLOT_MAX] = {0};
1247    struct varyings_info out_vars = {0};
1248    struct varyings_info in_vars = {0};
1249    unsigned loc;
1250 
1251    assert(vs);
1252    assert(vs->info.stage == MESA_SHADER_VERTEX);
1253 
1254    if (PAN_ARCH >= 9) {
1255       /* No need to calculate varying stride if there's no fragment shader. */
1256       if (fs)
1257          link->buf_strides[PANVK_VARY_BUF_GENERAL] =
1258             MAX2(fs->info.varyings.input_count, vs->info.varyings.output_count);
1259       return VK_SUCCESS;
1260    }
1261 
1262    collect_varyings_info(vs->info.varyings.output,
1263                          vs->info.varyings.output_count, &out_vars);
1264 
1265    if (fs) {
1266       assert(fs->info.stage == MESA_SHADER_FRAGMENT);
1267       collect_varyings_info(fs->info.varyings.input,
1268                             fs->info.varyings.input_count, &in_vars);
1269    }
1270 
1271    BITSET_OR(active_attrs, in_vars.active, out_vars.active);
1272 
1273    /* Handle the position and point size buffers explicitly, as they are
1274     * passed through separate buffer pointers to the tiler job.
1275     */
1276    if (BITSET_TEST(out_vars.active, VARYING_SLOT_POS)) {
1277       buf_strides[PANVK_VARY_BUF_POSITION] = sizeof(float) * 4;
1278       BITSET_CLEAR(active_attrs, VARYING_SLOT_POS);
1279    }
1280 
1281    if (BITSET_TEST(out_vars.active, VARYING_SLOT_PSIZ)) {
1282       buf_strides[PANVK_VARY_BUF_PSIZ] = sizeof(uint16_t);
1283       BITSET_CLEAR(active_attrs, VARYING_SLOT_PSIZ);
1284    }
1285 
1286    BITSET_FOREACH_SET(loc, active_attrs, VARYING_SLOT_MAX) {
1287       /* We expect the VS to write to all inputs read by the FS, and the
1288        * FS to read all inputs written by the VS. If that's not the
1289        * case, we keep PIPE_FORMAT_NONE to reflect the fact we should use a
1290        * sink attribute (writes are discarded, reads return zeros).
1291        */
1292       if (in_vars.fmts[loc] == PIPE_FORMAT_NONE ||
1293           out_vars.fmts[loc] == PIPE_FORMAT_NONE) {
1294          in_vars.fmts[loc] = PIPE_FORMAT_NONE;
1295          out_vars.fmts[loc] = PIPE_FORMAT_NONE;
1296          continue;
1297       }
1298 
1299       unsigned out_size = util_format_get_blocksize(out_vars.fmts[loc]);
1300       unsigned buf_idx = varying_buf_id(loc);
1301 
1302       /* Always trust the FS input format, so we can:
1303        * - discard components that are never read
1304        * - use float types for interpolated fragment shader inputs
1305        * - use fp16 for floats with mediump
1306        * - make sure components that are not written by the VS are set to zero
1307        */
1308       out_vars.fmts[loc] = in_vars.fmts[loc];
1309 
1310       /* Special buffers are handled explicitly before this loop, everything
1311        * else should be laid out in the general varying buffer.
1312        */
1313       assert(buf_idx == PANVK_VARY_BUF_GENERAL);
1314 
1315       /* Keep things aligned on a 32-bit boundary. */
1316       buf_offsets[loc] = buf_strides[buf_idx];
1317       buf_strides[buf_idx] += ALIGN_POT(out_size, 4);
1318    }
1319 
1320    VkResult result = emit_varying_attrs(
1321       desc_pool, vs->info.varyings.output, vs->info.varyings.output_count,
1322       &out_vars, buf_offsets, &link->vs.attribs);
1323    if (result != VK_SUCCESS)
1324       return result;
1325 
1326    if (fs) {
1327       result = emit_varying_attrs(desc_pool, fs->info.varyings.input,
1328                                   fs->info.varyings.input_count, &in_vars,
1329                                   buf_offsets, &link->fs.attribs);
1330       if (result != VK_SUCCESS)
1331          return result;
1332    }
1333 
1334    memcpy(link->buf_strides, buf_strides, sizeof(link->buf_strides));
1335    return VK_SUCCESS;
1336 }
1337 
1338 static const struct vk_shader_ops panvk_shader_ops = {
1339    .destroy = panvk_shader_destroy,
1340    .serialize = panvk_shader_serialize,
1341    .get_executable_properties = panvk_shader_get_executable_properties,
1342    .get_executable_statistics = panvk_shader_get_executable_statistics,
1343    .get_executable_internal_representations =
1344       panvk_shader_get_executable_internal_representations,
1345 };
1346 
1347 static void
1348 panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const gl_shader_stage stage,
1349                       struct panvk_shader *shader)
1350 {
1351    switch (stage) {
1352    case MESA_SHADER_COMPUTE:
1353       cmd->state.compute.shader = shader;
1354       memset(&cmd->state.compute.cs.desc, 0,
1355              sizeof(cmd->state.compute.cs.desc));
1356       break;
1357    case MESA_SHADER_VERTEX:
1358       cmd->state.gfx.vs.shader = shader;
1359       cmd->state.gfx.linked = false;
1360       memset(&cmd->state.gfx.vs.desc, 0, sizeof(cmd->state.gfx.vs.desc));
1361       break;
1362    case MESA_SHADER_FRAGMENT:
1363       cmd->state.gfx.fs.shader = shader;
1364       cmd->state.gfx.linked = false;
1365 #if PAN_ARCH <= 7
1366       cmd->state.gfx.fs.rsd = 0;
1367 #endif
1368       memset(&cmd->state.gfx.fs.desc, 0, sizeof(cmd->state.gfx.fs.desc));
1369       break;
1370    default:
1371       assert(!"Unsupported stage");
1372       break;
1373    }
1374 }
1375 
1376 static void
1377 panvk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
1378                        const gl_shader_stage *stages,
1379                        struct vk_shader **const shaders)
1380 {
1381    struct panvk_cmd_buffer *cmd =
1382       container_of(vk_cmd, struct panvk_cmd_buffer, vk);
1383 
1384    for (uint32_t i = 0; i < stage_count; i++) {
1385       struct panvk_shader *shader =
1386          container_of(shaders[i], struct panvk_shader, vk);
1387 
1388       panvk_cmd_bind_shader(cmd, stages[i], shader);
1389    }
1390 }
1391 
1392 const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
1393    .get_nir_options = panvk_get_nir_options,
1394    .get_spirv_options = panvk_get_spirv_options,
1395    .preprocess_nir = panvk_preprocess_nir,
1396    .hash_graphics_state = panvk_hash_graphics_state,
1397    .compile = panvk_compile_shaders,
1398    .deserialize = panvk_deserialize_shader,
1399    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
1400    .cmd_bind_shaders = panvk_cmd_bind_shaders,
1401 };
1402