/*
 * Copyright © 2022 Imagination Technologies Ltd.
 *
 * based in part on v3dv driver which is:
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#include "compiler/shader_enums.h"
#include "hwdef/rogue_hw_utils.h"
#include "nir/nir.h"
#include "pvr_bo.h"
#include "pvr_csb.h"
#include "pvr_csb_enum_helpers.h"
#include "pvr_hardcode.h"
#include "pvr_pds.h"
#include "pvr_private.h"
#include "pvr_robustness.h"
#include "pvr_shader.h"
#include "pvr_types.h"
#include "rogue/rogue.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "vk_alloc.h"
#include "vk_format.h"
#include "vk_graphics_state.h"
#include "vk_log.h"
#include "vk_object.h"
#include "vk_pipeline_cache.h"
#include "vk_render_pass.h"
#include "vk_util.h"

/*****************************************************************************
   PDS functions
*****************************************************************************/

/* If allocator == NULL, the internal one will be used. */
static VkResult pvr_pds_coeff_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   const uint32_t *fpu_iterators,
   uint32_t fpu_iterators_count,
   const uint32_t *destinations,
   struct pvr_pds_upload *const pds_upload_out,
   uint32_t *const pds_temps_count_out)
{
   struct pvr_pds_coeff_loading_program program = {
      .num_fpu_iterators = fpu_iterators_count,
   };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   assert(fpu_iterators_count < PVR_MAXIMUM_ITERATIONS);

   /* Get the size of the program and then allocate that much memory. */
   pvr_pds_coefficient_loading(&program, NULL, PDS_GENERATE_SIZES);

   if (!program.code_size) {
      pds_upload_out->pvr_bo = NULL;
      pds_upload_out->code_size = 0;
      pds_upload_out->data_size = 0;
      *pds_temps_count_out = 0;

      return VK_SUCCESS;
   }

   staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: Should we save pointers when we redesign the PDS gen API? */
   typed_memcpy(program.FPU_iterators,
                fpu_iterators,
                program.num_fpu_iterators);

   typed_memcpy(program.destination, destinations, program.num_fpu_iterators);

   /* Generate the program into the staging_buffer. */
   pvr_pds_coefficient_loading(&program,
                               staging_buffer,
                               PDS_GENERATE_CODEDATA_SEGMENTS);

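   /* The staging buffer holds the data segment at offset 0 followed by the
    * code segment at offset program.data_size (in dwords), which is why the
    * upload below sources data from &staging_buffer[0] and code from
    * &staging_buffer[program.data_size].
    */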
   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[0],
                               program.data_size,
                               16,
                               &staging_buffer[program.data_size],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   *pds_temps_count_out = program.temps_used;

   return VK_SUCCESS;
}

/* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
/* If allocator == NULL, the internal one will be used. */
VkResult pvr_pds_fragment_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   const struct pvr_suballoc_bo *fragment_shader_bo,
   uint32_t fragment_temp_count,
   enum rogue_msaa_mode msaa_mode,
   bool has_phase_rate_change,
   struct pvr_pds_upload *const pds_upload_out)
{
   const enum PVRX(PDSINST_DOUTU_SAMPLE_RATE)
      sample_rate = pvr_pdsinst_doutu_sample_rate_from_rogue(msaa_mode);
   struct pvr_pds_kickusc_program program = { 0 };
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   /* FIXME: Should it be passing in the USC offset rather than address here?
    */
   /* Note this is not strictly required to be done before calculating the
    * staging_buffer_size in this particular case. It can also be done after
    * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
    */
   pvr_pds_setup_doutu(&program.usc_task_control,
                       fragment_shader_bo->dev_addr.addr,
                       fragment_temp_count,
                       sample_rate,
                       has_phase_rate_change);

   pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);

   staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_kick_usc(&program,
                    staging_buffer,
                    0,
                    false,
                    PDS_GENERATE_CODEDATA_SEGMENTS);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[0],
                               program.data_size,
                               16,
                               &staging_buffer[program.data_size],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
   const struct pvr_device_info *dev_info,
   bool robust_buffer_access)
{
   /* FIXME: Use more local variables to improve formatting. */

   /* Maximum memory allocation needed for const map entries in
    * pvr_pds_generate_vertex_primary_program().
    * When robustBufferAccess is disabled, it must be >= 410.
    * When robustBufferAccess is enabled, it must be >= 570.
    *
    * 1. Size of entry for base instance
    *        (pvr_const_map_entry_base_instance)
    *
    * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
    *     if (!robustBufferAccess)
    *         size of vertex attribute entry
    *             (pvr_const_map_entry_vertex_attribute_address) +
    *     else
    *         size of robust vertex attribute entry
    *             (pvr_const_map_entry_robust_vertex_attribute_address) +
    *         size of entry for max attribute index
    *             (pvr_const_map_entry_vertex_attribute_max_index) +
    *     fi
    *     size of Unified Store burst entry
    *         (pvr_const_map_entry_literal32) +
    *     size of entry for vertex stride
    *         (pvr_const_map_entry_literal32) +
    *     size of entries for DDMAD control word
    *         (num_ddmad_literals * pvr_const_map_entry_literal32))
    *
    * 3. Size of entry for DOUTW vertex/instance control word
    *     (pvr_const_map_entry_literal32)
    *
    * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
    */

   const size_t attribute_size =
      (!robust_buffer_access)
         ? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
         : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
              sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);

   /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word
    * and is increased by one DWORD to contain the data for the DDMADT's
    * out-of-bounds check.
    */
   const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
      1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);

   return (sizeof(struct pvr_const_map_entry_base_instance) +
           PVR_MAX_VERTEX_INPUT_BINDINGS *
              (attribute_size +
               (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
                  sizeof(struct pvr_const_map_entry_literal32)) +
           sizeof(struct pvr_const_map_entry_literal32) +
           sizeof(struct pvr_const_map_entry_doutu_address));
}
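
/* Illustrative only (not from the driver): the documented lower bounds above
 * could be sanity checked by a caller with, e.g.,
 *
 *    assert(pvr_pds_get_max_vertex_program_const_map_size_in_bytes(dev_info,
 *                                                                  false) >=
 *           410);
 */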

/* This is a const pointer to an array of pvr_pds_vertex_dma structs.
 * The array being pointed to is of PVR_MAX_VERTEX_ATTRIB_DMAS size.
 */
typedef struct pvr_pds_vertex_dma (
      *const
         pvr_pds_attrib_dma_descriptions_array_ptr)[PVR_MAX_VERTEX_ATTRIB_DMAS];
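
/* Illustrative usage (mirroring the hard-coded-shader path in
 * pvr_pds_vertex_attrib_programs_create_and_upload() below): the caller
 * declares the backing array and passes its address, so the callee can fill
 * it in place and report how many elements it used:
 *
 *    struct pvr_pds_vertex_dma dmas[PVR_MAX_VERTEX_ATTRIB_DMAS];
 *    uint32_t dma_count;
 *
 *    pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
 *                                                vs_data,
 *                                                &dmas,
 *                                                &dma_count);
 */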

/* dma_descriptions_out_ptr is a pointer to the array used as output.
 * The whole array might not be filled so dma_count_out indicates how many
 * elements were used.
 */
static void pvr_pds_vertex_attrib_init_dma_descriptions(
   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
   const struct rogue_vs_build_data *vs_data,
   pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
   uint32_t *const dma_count_out)
{
   struct pvr_pds_vertex_dma *const dma_descriptions =
      *dma_descriptions_out_ptr;
   uint32_t dma_count = 0;

   if (!vertex_input_state) {
      *dma_count_out = 0;
      return;
   }

   for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
        i++) {
      const VkVertexInputAttributeDescription *const attrib_desc =
         &vertex_input_state->pVertexAttributeDescriptions[i];
      const VkVertexInputBindingDescription *binding_desc = NULL;
      struct pvr_pds_vertex_dma *const dma_desc = &dma_descriptions[dma_count];
      size_t location = attrib_desc->location;

      assert(location < vs_data->inputs.num_input_vars);

      /* Find the matching binding description. */
      for (uint32_t j = 0;
           j < vertex_input_state->vertexBindingDescriptionCount;
           j++) {
         const VkVertexInputBindingDescription *const current_binding_desc =
            &vertex_input_state->pVertexBindingDescriptions[j];

         if (current_binding_desc->binding == attrib_desc->binding) {
            binding_desc = current_binding_desc;
            break;
         }
      }

      /* From the Vulkan 1.2.195 spec for
       * VkPipelineVertexInputStateCreateInfo:
       *
       *    "For every binding specified by each element of
       *    pVertexAttributeDescriptions, a
       *    VkVertexInputBindingDescription must exist in
       *    pVertexBindingDescriptions with the same value of binding"
       */
      assert(binding_desc);

      dma_desc->offset = attrib_desc->offset;
      dma_desc->stride = binding_desc->stride;

      dma_desc->flags = 0;

      if (binding_desc->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
         dma_desc->flags |= PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;

      dma_desc->size_in_dwords = vs_data->inputs.components[location];
      /* TODO: This will be different when other types are supported.
       * Store in vs_data with base and components?
       */
      /* TODO: Use attrib_desc->format. */
      dma_desc->component_size_in_bytes = ROGUE_REG_SIZE_BYTES;
      dma_desc->destination = vs_data->inputs.base[location];
      dma_desc->binding_index = attrib_desc->binding;
      dma_desc->divisor = 1;

      dma_desc->robustness_buffer_offset =
         pvr_get_robustness_buffer_format_offset(attrib_desc->format);

      ++dma_count;
   }

   *dma_count_out = dma_count;
}

static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_pds_vertex_primary_program_input *const input,
   struct pvr_pds_attrib_program *const program_out)
{
   const size_t const_entries_size_in_bytes =
      pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
         &device->pdevice->dev_info,
         device->vk.enabled_features.robustBufferAccess);
   struct pvr_pds_upload *const program = &program_out->program;
   struct pvr_pds_info *const info = &program_out->info;
   struct pvr_const_map_entry *new_entries;
   ASSERTED uint32_t code_size_in_dwords;
   size_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   memset(info, 0, sizeof(*info));

   info->entries = vk_alloc2(&device->vk.alloc,
                             allocator,
                             const_entries_size_in_bytes,
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!info->entries) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_out;
   }

   info->entries_size_in_bytes = const_entries_size_in_bytes;

   pvr_pds_generate_vertex_primary_program(
      input,
      NULL,
      info,
      device->vk.enabled_features.robustBufferAccess,
      &device->pdevice->dev_info);

   code_size_in_dwords = info->code_size_in_dwords;
   staging_buffer_size = PVR_DW_TO_BYTES(info->code_size_in_dwords);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_entries;
   }

   /* This also fills in info->entries. */
   pvr_pds_generate_vertex_primary_program(
      input,
      staging_buffer,
      info,
      device->vk.enabled_features.robustBufferAccess,
      &device->pdevice->dev_info);

   assert(info->code_size_in_dwords <= code_size_in_dwords);

   /* FIXME: Add a vk_realloc2()? */
   new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
                            info->entries,
                            info->entries_written_size_in_bytes,
                            8,
                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!new_entries) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_staging_buffer;
   }

   info->entries = new_entries;
   info->entries_size_in_bytes = info->entries_written_size_in_bytes;

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               staging_buffer,
                               info->code_size_in_dwords,
                               16,
                               16,
                               program);
   if (result != VK_SUCCESS)
      goto err_free_staging_buffer;

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;

err_free_staging_buffer:
   vk_free2(&device->vk.alloc, allocator, staging_buffer);

err_free_entries:
   vk_free2(&device->vk.alloc, allocator, info->entries);

err_out:
   return result;
}

static inline void pvr_pds_vertex_attrib_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_pds_attrib_program *const program)
{
   pvr_bo_suballoc_free(program->program.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, program->info.entries);
}

/* This is a const pointer to an array of pvr_pds_attrib_program structs.
 * The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size.
 */
typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
   [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];

/* Indicates that the special variable is unused and has not been allocated a
 * register.
 */
#define PVR_VERTEX_SPECIAL_VAR_UNUSED (-1)

/* Each special variable gets allocated its own vtxin reg if used. */
struct pvr_vertex_special_vars {
   /* VertexIndex built-in. */
   int16_t vertex_id_offset;
   /* InstanceIndex built-in. */
   int16_t instance_id_offset;
};
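
/* Illustrative layout (hypothetical values): a shader that only reads
 * InstanceIndex might receive
 *
 *    struct pvr_vertex_special_vars special_vars = {
 *       .vertex_id_offset = PVR_VERTEX_SPECIAL_VAR_UNUSED,
 *       .instance_id_offset = 4,
 *    };
 *
 * in which case only the instance id flag and register get set up below.
 */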

/* Generates and uploads a PDS program for DMAing vertex attribs into USC
 * vertex inputs. This will bake the code segment and create a template of the
 * data segment for the command buffer to fill in.
 */
/* If allocator == NULL, the internal one will be used.
 *
 * programs_out_ptr is a pointer to the array where the outputs will be placed.
 */
static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *const allocator,
   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
   uint32_t usc_temp_count,
   const struct rogue_vs_build_data *vs_data,

   /* Needed for the new path. */
   /* TODO: Remove some of the above once the compiler is hooked up. */
   const struct pvr_pds_vertex_dma
      dma_descriptions[static const PVR_MAX_VERTEX_ATTRIB_DMAS],
   uint32_t dma_count,
   const struct pvr_vertex_special_vars *special_vars_layout,

   pvr_pds_attrib_programs_array_ptr programs_out_ptr)
{
   struct pvr_pds_vertex_dma dma_descriptions_old[PVR_MAX_VERTEX_ATTRIB_DMAS];

   struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
   struct pvr_pds_vertex_primary_program_input input = { 0 };
   VkResult result;

   const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);

   if (old_path) {
      pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
                                                  vs_data,
                                                  &dma_descriptions_old,
                                                  &input.dma_count);

      input.dma_list = dma_descriptions_old;
   } else {
      input.dma_list = dma_descriptions;
      input.dma_count = dma_count;

      if (special_vars_layout->vertex_id_offset !=
          PVR_VERTEX_SPECIAL_VAR_UNUSED) {
         /* Gets filled by the HW and copied into the appropriate reg. */
         input.flags |= PVR_PDS_VERTEX_FLAGS_VERTEX_ID_REQUIRED;
         input.vertex_id_register = special_vars_layout->vertex_id_offset;
      }

      if (special_vars_layout->instance_id_offset !=
          PVR_VERTEX_SPECIAL_VAR_UNUSED) {
         /* Gets filled by the HW and copied into the appropriate reg. */
         input.flags |= PVR_PDS_VERTEX_FLAGS_INSTANCE_ID_REQUIRED;
         input.instance_id_register = special_vars_layout->instance_id_offset;
      }
   }

   pvr_pds_setup_doutu(&input.usc_task_control,
                       0,
                       usc_temp_count,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   /* Note: programs_out_ptr is a pointer to an array so this is fine. See the
    * typedef.
    */
   for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
      uint32_t extra_flags;

      switch (i) {
      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
         extra_flags = 0;
         break;

      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
         extra_flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
         break;

      case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
         extra_flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
         break;

      default:
         unreachable("Invalid vertex attrib program type.");
      }

      input.flags |= extra_flags;

      result =
         pvr_pds_vertex_attrib_program_create_and_upload(device,
                                                         allocator,
                                                         &input,
                                                         &programs_out[i]);
      if (result != VK_SUCCESS) {
         for (uint32_t j = 0; j < i; j++) {
            pvr_pds_vertex_attrib_program_destroy(device,
                                                  allocator,
                                                  &programs_out[j]);
         }

         return result;
      }

      input.flags &= ~extra_flags;
   }

   return VK_SUCCESS;
}
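
/* The loop above bakes one program per variant. Per the enum names (the
 * selection itself presumably lives in the command buffer code), BASIC would
 * serve plain draws, BASE_INSTANCE draws with a non-zero firstInstance, and
 * DRAW_INDIRECT indirect draws.
 */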

size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
{
   /* Maximum memory allocation needed for const map entries in
    * pvr_pds_generate_descriptor_upload_program().
    * It must be >= 688 bytes. This size is calculated as the sum of:
    *
    *  1. Max. number of descriptor sets (8) * (
    *         size of descriptor entry
    *             (pvr_const_map_entry_descriptor_set) +
    *         size of Common Store burst entry
    *             (pvr_const_map_entry_literal32))
    *
    *  2. Max. number of PDS program buffers (24) * (
    *         size of the largest buffer structure
    *             (pvr_const_map_entry_constant_buffer) +
    *         size of Common Store burst entry
    *             (pvr_const_map_entry_literal32)
    *
    *  3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
    *
    *  4. Max. number of PDS address literals (8) * (
    *         size of entry
    *             (pvr_const_map_entry_descriptor_set_addrs_table)
    *
    *  5. Max. number of address literals with single buffer entry to DOUTD
    *         size of entry
    *             (pvr_pds_const_map_entry_addr_literal_buffer) +
    *         8 * size of entry (pvr_pds_const_map_entry_addr_literal)
    */

   /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
    * say that it should be 8.
    * Figure out a define for this, or is the comment wrong?
    */
   return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
                sizeof(struct pvr_const_map_entry_literal32)) +
           PVR_PDS_MAX_BUFFERS *
              (sizeof(struct pvr_const_map_entry_constant_buffer) +
               sizeof(struct pvr_const_map_entry_literal32)) +
           sizeof(struct pvr_const_map_entry_doutu_address) +
           sizeof(struct pvr_pds_const_map_entry_addr_literal_buffer) +
           8 * sizeof(struct pvr_pds_const_map_entry_addr_literal));
}

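/* Illustrative only (not from the driver): the documented >= 688 byte lower
 * bound above could be sanity checked with
 *
 *    assert(pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes() >=
 *           688);
 */
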
/* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
 * structs.
 */
typedef struct pvr_pds_buffer (
      *const pvr_pds_descriptor_program_buffer_array_ptr)[PVR_PDS_MAX_BUFFERS];

/**
 * \brief Setup buffers for the PDS descriptor program.
 *
 * Sets up buffers required by the PDS gen API based on compiler info.
 *
 * For compile time static constants that need DMAing it uploads them and
 * returns the upload in \p static_consts_pvr_bo_out.
 */
static VkResult pvr_pds_descriptor_program_setup_buffers(
   struct pvr_device *device,
   bool robust_buffer_access,
   const struct rogue_compile_time_consts_data *compile_time_consts_data,
   const struct rogue_ubo_data *ubo_data,
   pvr_pds_descriptor_program_buffer_array_ptr buffers_out_ptr,
   uint32_t *const buffer_count_out,
   struct pvr_suballoc_bo **const static_consts_pvr_bo_out)
{
   struct pvr_pds_buffer *const buffers = *buffers_out_ptr;
   uint32_t buffer_count = 0;

   for (size_t i = 0; i < ubo_data->num_ubo_entries; i++) {
      struct pvr_pds_buffer *current_buffer = &buffers[buffer_count];

      /* This is fine since buffers_out_ptr is a pointer to an array. */
      assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));

      current_buffer->type = PVR_BUFFER_TYPE_UBO;
      current_buffer->size_in_dwords = ubo_data->size[i];
      current_buffer->destination = ubo_data->dest[i];

      current_buffer->buffer_id = buffer_count;
      current_buffer->desc_set = ubo_data->desc_set[i];
      current_buffer->binding = ubo_data->binding[i];
      /* TODO: Is this always the case?
       * E.g. can multiple UBOs have the same base buffer?
       */
      current_buffer->source_offset = 0;

      buffer_count++;
   }

   if (compile_time_consts_data->static_consts.num > 0) {
      VkResult result;

      assert(compile_time_consts_data->static_consts.num <=
             ARRAY_SIZE(compile_time_consts_data->static_consts.value));

      /* This is fine since buffers_out_ptr is a pointer to an array. */
      assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));

      /* TODO: Is it possible to have multiple static consts buffers where the
       * destination is not adjoining? If so we need to handle that.
       * Currently we're only setting up a single buffer.
       */
      buffers[buffer_count++] = (struct pvr_pds_buffer){
         .type = PVR_BUFFER_TYPE_COMPILE_TIME,
         .size_in_dwords = compile_time_consts_data->static_consts.num,
         .destination = compile_time_consts_data->static_consts.dest,
      };

      result = pvr_gpu_upload(device,
                              device->heaps.general_heap,
                              compile_time_consts_data->static_consts.value,
                              compile_time_consts_data->static_consts.num *
                                 ROGUE_REG_SIZE_BYTES,
                              ROGUE_REG_SIZE_BYTES,
                              static_consts_pvr_bo_out);
      if (result != VK_SUCCESS)
         return result;
   } else {
      *static_consts_pvr_bo_out = NULL;
   }

   *buffer_count_out = buffer_count;

   return VK_SUCCESS;
}

static VkResult pvr_pds_descriptor_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const struct rogue_compile_time_consts_data *const compile_time_consts_data,
   const struct rogue_ubo_data *const ubo_data,
   const struct pvr_explicit_constant_usage *const explicit_const_usage,
   const struct pvr_pipeline_layout *const layout,
   enum pvr_stage_allocation stage,
   const struct pvr_sh_reg_layout *sh_reg_layout,
   struct pvr_stage_allocation_descriptor_state *const descriptor_state)
{
   const size_t const_entries_size_in_bytes =
      pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
   struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
   struct pvr_pds_descriptor_program_input program = { 0 };
   struct pvr_const_map_entry *new_entries;
   ASSERTED uint32_t code_size_in_dwords;
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);

   assert(stage != PVR_STAGE_ALLOCATION_COUNT);

   *pds_info = (struct pvr_pds_info){ 0 };

   if (old_path) {
      result = pvr_pds_descriptor_program_setup_buffers(
         device,
         device->vk.enabled_features.robustBufferAccess,
         compile_time_consts_data,
         ubo_data,
         &program.buffers,
         &program.buffer_count,
         &descriptor_state->static_consts);
      if (result != VK_SUCCESS)
         return result;

      if (layout->per_stage_reg_info[stage].primary_dynamic_size_in_dwords)
         assert(!"Unimplemented");

      for (uint32_t set_num = 0; set_num < layout->set_count; set_num++) {
         const struct pvr_descriptor_set_layout_mem_layout *const reg_layout =
            &layout->register_layout_in_dwords_per_stage[stage][set_num];
         const uint32_t start_offset = explicit_const_usage->start_offset;

         /* TODO: Use compiler usage info to optimize this? */

         /* Only dma primaries if they are actually required. */
         if (reg_layout->primary_size) {
            program.descriptor_sets[program.descriptor_set_count++] =
               (struct pvr_pds_descriptor_set){
                  .descriptor_set = set_num,
                  .size_in_dwords = reg_layout->primary_size,
                  .destination = reg_layout->primary_offset + start_offset,
                  .primary = true,
               };
         }

         /* Only dma secondaries if they are actually required. */
         if (!reg_layout->secondary_size)
            continue;

         program.descriptor_sets[program.descriptor_set_count++] =
            (struct pvr_pds_descriptor_set){
               .descriptor_set = set_num,
               .size_in_dwords = reg_layout->secondary_size,
               .destination = reg_layout->secondary_offset + start_offset,
            };
      }
   } else {
      uint32_t addr_literals = 0;

      if (sh_reg_layout->descriptor_set_addrs_table.present) {
         program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
            .type = PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
            .destination = sh_reg_layout->descriptor_set_addrs_table.offset,
         };
         addr_literals++;
      }

      if (sh_reg_layout->push_consts.present) {
         program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
            .type = PVR_PDS_ADDR_LITERAL_PUSH_CONSTS,
            .destination = sh_reg_layout->push_consts.offset,
         };
         addr_literals++;
      }

      if (sh_reg_layout->blend_consts.present) {
         program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
            .type = PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS,
            .destination = sh_reg_layout->blend_consts.offset,
         };
         addr_literals++;
      }

      program.addr_literal_count = addr_literals;
   }

   pds_info->entries = vk_alloc2(&device->vk.alloc,
                                 allocator,
                                 const_entries_size_in_bytes,
                                 8,
                                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!pds_info->entries) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_static_consts;
   }

   pds_info->entries_size_in_bytes = const_entries_size_in_bytes;

   pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);

   code_size_in_dwords = pds_info->code_size_in_dwords;
   staging_buffer_size = PVR_DW_TO_BYTES(pds_info->code_size_in_dwords);

   if (!staging_buffer_size) {
      vk_free2(&device->vk.alloc, allocator, pds_info->entries);

      *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };

      return VK_SUCCESS;
   }

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_entries;
   }

   pvr_pds_generate_descriptor_upload_program(&program,
                                              staging_buffer,
                                              pds_info);

   assert(pds_info->code_size_in_dwords <= code_size_in_dwords);

   /* FIXME: use vk_realloc2()? */
   new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
                            pds_info->entries,
                            pds_info->entries_written_size_in_bytes,
                            8,
                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!new_entries) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto err_free_staging_buffer;
   }

   pds_info->entries = new_entries;
   pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               staging_buffer,
                               pds_info->code_size_in_dwords,
                               16,
                               16,
                               &descriptor_state->pds_code);
   if (result != VK_SUCCESS)
      goto err_free_staging_buffer;

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;

err_free_staging_buffer:
   vk_free2(&device->vk.alloc, allocator, staging_buffer);

err_free_entries:
   vk_free2(&device->vk.alloc, allocator, pds_info->entries);

err_free_static_consts:
   pvr_bo_suballoc_free(descriptor_state->static_consts);

   return result;
}

static void pvr_pds_descriptor_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_stage_allocation_descriptor_state *const descriptor_state)
{
   if (!descriptor_state)
      return;

   pvr_bo_suballoc_free(descriptor_state->pds_code.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
   pvr_bo_suballoc_free(descriptor_state->static_consts);
}

static void pvr_pds_compute_program_setup(
   const struct pvr_device_info *dev_info,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   bool add_base_workgroup,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_compute_shader_program *const program)
{
   pvr_pds_compute_shader_program_init(program);
   program->local_input_regs[0] = local_input_regs[0];
   program->local_input_regs[1] = local_input_regs[1];
   program->local_input_regs[2] = local_input_regs[2];
   program->work_group_input_regs[0] = work_group_input_regs[0];
   program->work_group_input_regs[1] = work_group_input_regs[1];
   program->work_group_input_regs[2] = work_group_input_regs[2];
   program->barrier_coefficient = barrier_coefficient;
   program->add_base_workgroup = add_base_workgroup;
   program->flattened_work_groups = true;
   program->kick_usc = true;

   STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);
   STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);
   STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
                 PVR_WORKGROUP_DIMENSIONS);

   pvr_pds_setup_doutu(&program->usc_task_control,
                       usc_shader_dev_addr.addr,
                       usc_temps,
                       PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
                       false);

   pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
}

/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
 */
static VkResult pvr_pds_compute_program_create_and_upload(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_upload *const pds_upload_out,
   struct pvr_pds_info *const pds_info_out)
{
   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_compute_shader_program program;
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   pvr_pds_compute_program_setup(dev_info,
                                 local_input_regs,
                                 work_group_input_regs,
                                 barrier_coefficient,
                                 false,
                                 usc_temps,
                                 usc_shader_dev_addr,
                                 &program);

   /* FIXME: According to pvr_device_init_compute_pds_program() the code size
    * is in bytes. Investigate this.
    */
   staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: pvr_pds_compute_shader doesn't implement
    * PDS_GENERATE_CODEDATA_SEGMENTS.
    */
   pvr_pds_compute_shader(&program,
                          &staging_buffer[0],
                          PDS_GENERATE_CODE_SEGMENT,
                          dev_info);

   pvr_pds_compute_shader(&program,
                          &staging_buffer[program.code_size],
                          PDS_GENERATE_DATA_SEGMENT,
                          dev_info);

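   /* Note the staging layout here is the reverse of the coefficient and
    * fragment programs above: the code segment sits at the start of the
    * buffer and the data segment follows at offset program.code_size, hence
    * the swapped pointers passed to the upload below.
    */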
   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[program.code_size],
                               program.data_size,
                               16,
                               &staging_buffer[0],
                               program.code_size,
                               16,
                               16,
                               pds_upload_out);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, staging_buffer);
      return result;
   }

   *pds_info_out = (struct pvr_pds_info){
      .temps_required = program.highest_temp,
      .code_size_in_dwords = program.code_size,
      .data_size_in_dwords = program.data_size,
   };

   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return VK_SUCCESS;
}

static void pvr_pds_compute_program_destroy(
   struct pvr_device *const device,
   const struct VkAllocationCallbacks *const allocator,
   struct pvr_pds_upload *const pds_program,
   struct pvr_pds_info *const pds_info)
{
   /* We don't allocate an entries buffer so we don't need to free it. */
   pvr_bo_suballoc_free(pds_program->pvr_bo);
}

/* This only uploads the code segment. The data segment will need to be patched
 * with the base workgroup before uploading.
 */
static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
   uint32_t barrier_coefficient,
   uint32_t usc_temps,
   pvr_dev_addr_t usc_shader_dev_addr,
   struct pvr_pds_base_workgroup_program *program_out)
{
   struct pvr_device_info *dev_info = &device->pdevice->dev_info;
   struct pvr_pds_compute_shader_program program;
   uint32_t buffer_size;
   uint32_t *buffer;
   VkResult result;

   pvr_pds_compute_program_setup(dev_info,
                                 local_input_regs,
                                 work_group_input_regs,
                                 barrier_coefficient,
                                 true,
                                 usc_temps,
                                 usc_shader_dev_addr,
                                 &program);

   /* FIXME: According to pvr_device_init_compute_pds_program() the code size
    * is in bytes. Investigate this.
    */
   buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size));

   buffer = vk_alloc2(&device->vk.alloc,
                      allocator,
                      buffer_size,
                      8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_compute_shader(&program,
                          &buffer[0],
                          PDS_GENERATE_CODE_SEGMENT,
                          dev_info);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               NULL,
                               0,
                               0,
                               buffer,
                               program.code_size,
                               16,
                               16,
                               &program_out->code_upload);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, buffer);
      return result;
   }

   pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);

   program_out->data_section = buffer;

   /* We'll need to patch the base workgroup in the PDS data section before
    * dispatch so we save the offsets at which to patch. We only need to save
    * the offset for the first workgroup id since the workgroup ids are stored
    * contiguously in the data segment.
    */
   program_out->base_workgroup_data_patching_offset =
      program.base_workgroup_constant_offset_in_dwords[0];

   program_out->info = (struct pvr_pds_info){
      .temps_required = program.highest_temp,
      .code_size_in_dwords = program.code_size,
      .data_size_in_dwords = program.data_size,
   };

   return VK_SUCCESS;
}
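
/* Illustrative dispatch-time patching sketch (assumed usage; the actual
 * patching lives in the command buffer code): since the workgroup ids are
 * contiguous in the data section, a caller would write
 *
 *    uint32_t *const data = program->data_section;
 *    const uint32_t offset = program->base_workgroup_data_patching_offset;
 *
 *    for (uint32_t i = 0; i < PVR_WORKGROUP_DIMENSIONS; i++)
 *       data[offset + i] = base_workgroup[i];
 *
 * before uploading the data segment.
 */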

static void pvr_pds_compute_base_workgroup_variant_program_finish(
   struct pvr_device *device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_pds_base_workgroup_program *const state)
{
   pvr_bo_suballoc_free(state->code_upload.pvr_bo);
   vk_free2(&device->vk.alloc, allocator, state->data_section);
}

/******************************************************************************
   Generic pipeline functions
 ******************************************************************************/

static void pvr_pipeline_init(struct pvr_device *device,
                              enum pvr_pipeline_type type,
                              struct pvr_pipeline *const pipeline)
{
   assert(!pipeline->layout);

   vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);

   pipeline->type = type;
}

static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
{
   vk_object_base_finish(&pipeline->base);
}

/* How many shared regs it takes to store a pvr_dev_addr_t.
 * Each shared reg is 32 bits.
 */
#define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
   DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
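
/* For example, assuming pvr_dev_addr_t wraps a 64-bit device address, this
 * works out to DIV_ROUND_UP(8, 4) = 2 shared registers per address.
 */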

/**
 * \brief Allocates shared registers.
 *
 * \return How many sh regs are required.
 */
static uint32_t
pvr_pipeline_alloc_shareds(const struct pvr_device *device,
                           const struct pvr_pipeline_layout *layout,
                           enum pvr_stage_allocation stage,
                           struct pvr_sh_reg_layout *const sh_reg_layout_out)
{
   ASSERTED const uint64_t reserved_shared_size =
      device->pdevice->dev_runtime_info.reserved_shared_size;
   ASSERTED const uint64_t max_coeff =
      device->pdevice->dev_runtime_info.max_coeffs;

   struct pvr_sh_reg_layout reg_layout = { 0 };
   uint32_t next_free_sh_reg = 0;

   reg_layout.descriptor_set_addrs_table.present =
      !!(layout->shader_stage_mask & BITFIELD_BIT(stage));

   if (reg_layout.descriptor_set_addrs_table.present) {
      reg_layout.descriptor_set_addrs_table.offset = next_free_sh_reg;
      next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
   }

   reg_layout.push_consts.present =
      !!(layout->push_constants_shader_stages & BITFIELD_BIT(stage));

   if (reg_layout.push_consts.present) {
      reg_layout.push_consts.offset = next_free_sh_reg;
      next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
   }

   *sh_reg_layout_out = reg_layout;

   /* FIXME: We might need to take more things into consideration.
    * See pvr_calc_fscommon_size_and_tiles_in_flight().
    */
   assert(next_free_sh_reg <= reserved_shared_size - max_coeff);

   return next_free_sh_reg;
}
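
/* Illustrative result (assuming both entries are present for the stage): the
 * descriptor set address table lands in sh regs [0, 1], the push constants
 * address in [2, 3], and the function returns 4.
 */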

/******************************************************************************
   Compute pipeline functions
 ******************************************************************************/

/* Compiles and uploads shaders and PDS programs. */
static VkResult pvr_compute_pipeline_compile(
   struct pvr_device *const device,
   struct vk_pipeline_cache *cache,
   const VkComputePipelineCreateInfo *pCreateInfo,
   const VkAllocationCallbacks *const allocator,
   struct pvr_compute_pipeline *const compute_pipeline)
{
   struct pvr_pipeline_layout *layout = compute_pipeline->base.layout;
   struct pvr_sh_reg_layout *sh_reg_layout =
      &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_COMPUTE];
   struct rogue_compile_time_consts_data compile_time_consts_data;
   uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
   struct pvr_explicit_constant_usage explicit_const_usage;
   uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
   struct rogue_ubo_data ubo_data;
   uint32_t barrier_coefficient;
   uint32_t usc_temps;
   VkResult result;

   if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info)) {
      struct pvr_hard_code_compute_build_info build_info;

      result = pvr_hard_code_compute_pipeline(device,
                                              &compute_pipeline->shader_state,
                                              &build_info);
      if (result != VK_SUCCESS)
         return result;

      ubo_data = build_info.ubo_data;
      compile_time_consts_data = build_info.compile_time_consts_data;

      /* We make sure that the compiler's unused reg value is compatible with
       * the pds api.
       */
      STATIC_ASSERT(ROGUE_REG_UNUSED == PVR_PDS_COMPUTE_INPUT_REG_UNUSED);

      barrier_coefficient = build_info.barrier_reg;

      /* TODO: Maybe change the pds api to use pointers so we avoid the copy. */
      local_input_regs[0] = build_info.local_invocation_regs[0];
      local_input_regs[1] = build_info.local_invocation_regs[1];
      /* This is not a mistake. We want to assign element 1 to 2. */
      local_input_regs[2] = build_info.local_invocation_regs[1];

      STATIC_ASSERT(
         __same_type(work_group_input_regs, build_info.work_group_regs));
      typed_memcpy(work_group_input_regs,
                   build_info.work_group_regs,
                   PVR_WORKGROUP_DIMENSIONS);

      usc_temps = build_info.usc_temps;

      explicit_const_usage = build_info.explicit_conts_usage;
   } else {
      uint32_t sh_count;

      sh_count = pvr_pipeline_alloc_shareds(device,
                                            layout,
                                            PVR_STAGE_ALLOCATION_COMPUTE,
                                            sh_reg_layout);

      compute_pipeline->shader_state.const_shared_reg_count = sh_count;

      /* FIXME: Compile and upload the shader. */
      /* FIXME: Initialize the shader state and setup build info. */
      abort();
   }

   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      &compile_time_consts_data,
      &ubo_data,
      &explicit_const_usage,
      layout,
      PVR_STAGE_ALLOCATION_COMPUTE,
      sh_reg_layout,
      &compute_pipeline->descriptor_state);
   if (result != VK_SUCCESS)
      goto err_free_shader;

   result = pvr_pds_compute_program_create_and_upload(
      device,
      allocator,
      local_input_regs,
      work_group_input_regs,
      barrier_coefficient,
      usc_temps,
      compute_pipeline->shader_state.bo->dev_addr,
      &compute_pipeline->primary_program,
      &compute_pipeline->primary_program_info);
   if (result != VK_SUCCESS)
      goto err_free_descriptor_program;

   /* If the workgroup ID is required, then we require the base workgroup
    * variant of the PDS compute program as well.
    */
   compute_pipeline->flags.base_workgroup =
      work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
      work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
      work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED;
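
   /* The base workgroup variant is the program that lets a non-zero workgroup
    * base (i.e. vkCmdDispatchBase()-style dispatches) be patched in at record
    * time, so it's only needed when at least one workgroup ID register is
    * actually live in the shader.
    */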

   if (compute_pipeline->flags.base_workgroup) {
      result = pvr_pds_compute_base_workgroup_variant_program_init(
         device,
         allocator,
         local_input_regs,
         work_group_input_regs,
         barrier_coefficient,
         usc_temps,
         compute_pipeline->shader_state.bo->dev_addr,
         &compute_pipeline->primary_base_workgroup_variant_program);
      if (result != VK_SUCCESS)
         goto err_destroy_compute_program;
   }

   return VK_SUCCESS;

err_destroy_compute_program:
   pvr_pds_compute_program_destroy(device,
                                   allocator,
                                   &compute_pipeline->primary_program,
                                   &compute_pipeline->primary_program_info);

err_free_descriptor_program:
   pvr_pds_descriptor_program_destroy(device,
                                      allocator,
                                      &compute_pipeline->descriptor_state);

err_free_shader:
   pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);

   return result;
}

static VkResult
pvr_compute_pipeline_init(struct pvr_device *device,
                          struct vk_pipeline_cache *cache,
                          const VkComputePipelineCreateInfo *pCreateInfo,
                          const VkAllocationCallbacks *allocator,
                          struct pvr_compute_pipeline *compute_pipeline)
{
   VkResult result;

   pvr_pipeline_init(device,
                     PVR_PIPELINE_TYPE_COMPUTE,
                     &compute_pipeline->base);

   compute_pipeline->base.layout =
      pvr_pipeline_layout_from_handle(pCreateInfo->layout);

   result = pvr_compute_pipeline_compile(device,
                                         cache,
                                         pCreateInfo,
                                         allocator,
                                         compute_pipeline);
   if (result != VK_SUCCESS) {
      pvr_pipeline_finish(&compute_pipeline->base);
      return result;
   }

   return VK_SUCCESS;
}

static VkResult
pvr_compute_pipeline_create(struct pvr_device *device,
                            struct vk_pipeline_cache *cache,
                            const VkComputePipelineCreateInfo *pCreateInfo,
                            const VkAllocationCallbacks *allocator,
                            VkPipeline *const pipeline_out)
{
   struct pvr_compute_pipeline *compute_pipeline;
   VkResult result;

   compute_pipeline = vk_zalloc2(&device->vk.alloc,
                                 allocator,
                                 sizeof(*compute_pipeline),
                                 8,
                                 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!compute_pipeline)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Compiles and uploads shaders and PDS programs. */
   result = pvr_compute_pipeline_init(device,
                                      cache,
                                      pCreateInfo,
                                      allocator,
                                      compute_pipeline);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, compute_pipeline);
      return result;
   }

   *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);

   return VK_SUCCESS;
}

static void pvr_compute_pipeline_destroy(
   struct pvr_device *const device,
   const VkAllocationCallbacks *const allocator,
   struct pvr_compute_pipeline *const compute_pipeline)
{
   if (compute_pipeline->flags.base_workgroup) {
      pvr_pds_compute_base_workgroup_variant_program_finish(
         device,
         allocator,
         &compute_pipeline->primary_base_workgroup_variant_program);
   }

   pvr_pds_compute_program_destroy(device,
                                   allocator,
                                   &compute_pipeline->primary_program,
                                   &compute_pipeline->primary_program_info);
   pvr_pds_descriptor_program_destroy(device,
                                      allocator,
                                      &compute_pipeline->descriptor_state);
   pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);

   pvr_pipeline_finish(&compute_pipeline->base);

   vk_free2(&device->vk.alloc, allocator, compute_pipeline);
}

VkResult
pvr_CreateComputePipelines(VkDevice _device,
                           VkPipelineCache pipelineCache,
                           uint32_t createInfoCount,
                           const VkComputePipelineCreateInfo *pCreateInfos,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipelines)
{
   VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;

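   /* A failure for one createInfo must not prevent the remaining ones from
    * being attempted: per the spec, failed entries are set to VK_NULL_HANDLE
    * and one of the failing VkResults is returned.
    */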
   for (uint32_t i = 0; i < createInfoCount; i++) {
      const VkResult local_result =
         pvr_compute_pipeline_create(device,
                                     cache,
                                     &pCreateInfos[i],
                                     pAllocator,
                                     &pPipelines[i]);
      if (local_result != VK_SUCCESS) {
         result = local_result;
         pPipelines[i] = VK_NULL_HANDLE;
      }
   }

   return result;
}

/******************************************************************************
   Graphics pipeline functions
 ******************************************************************************/

static void
pvr_graphics_pipeline_destroy(struct pvr_device *const device,
                              const VkAllocationCallbacks *const allocator,
                              struct pvr_graphics_pipeline *const gfx_pipeline)
{
   const uint32_t num_vertex_attrib_programs =
      ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);

   pvr_pds_descriptor_program_destroy(
      device,
      allocator,
      &gfx_pipeline->shader_state.fragment.descriptor_state);

   pvr_pds_descriptor_program_destroy(
      device,
      allocator,
      &gfx_pipeline->shader_state.vertex.descriptor_state);

   for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
      struct pvr_pds_attrib_program *const attrib_program =
         &gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];

      pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
   }

   pvr_bo_suballoc_free(
      gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
   pvr_bo_suballoc_free(
      gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);

   pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
   pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);

   pvr_pipeline_finish(&gfx_pipeline->base);

   vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
}

static void
pvr_vertex_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
                      const struct rogue_common_build_data *common_data,
                      uint32_t vtxin_regs_used,
                      const struct rogue_vs_build_data *vs_data)
{
   struct pvr_vertex_shader_state *vertex_state =
      &gfx_pipeline->shader_state.vertex;

   /* TODO: Hard coding these for now. These should be populated based on the
    * information returned by the compiler.
    */
   vertex_state->stage_state.const_shared_reg_count = common_data->shareds;
   vertex_state->stage_state.const_shared_reg_offset = 0;
   vertex_state->stage_state.coefficient_size = common_data->coeffs;
   vertex_state->stage_state.uses_atomic_ops = false;
   vertex_state->stage_state.uses_texture_rw = false;
   vertex_state->stage_state.uses_barrier = false;
   vertex_state->stage_state.has_side_effects = false;
   vertex_state->stage_state.empty_program = false;

   /* This ends up unused since we'll use the temp_usage for the PDS program we
    * end up selecting, and the descriptor PDS program doesn't use any temps.
    * Let's set it to ~0 in case it ever gets used.
    */
   vertex_state->stage_state.pds_temps_count = ~0;

   vertex_state->vertex_input_size = vtxin_regs_used;
   vertex_state->vertex_output_size =
      vs_data->num_vertex_outputs * ROGUE_REG_SIZE_BYTES;
   vertex_state->user_clip_planes_mask = 0;
   vertex_state->entry_offset = 0;

   /* TODO: The number of varyings should be checked against the fragment
    * shader inputs and assigned in the place where that happens.
    * There will also be an opportunity to cull unused fs inputs/vs outputs.
    */
   pvr_csb_pack (&gfx_pipeline->shader_state.vertex.varying[0],
                 TA_STATE_VARYING0,
                 varying0) {
      varying0.f32_linear = vs_data->num_varyings;
      varying0.f32_flat = 0;
      varying0.f32_npc = 0;
   }

   pvr_csb_pack (&gfx_pipeline->shader_state.vertex.varying[1],
                 TA_STATE_VARYING1,
                 varying1) {
      varying1.f16_linear = 0;
      varying1.f16_flat = 0;
      varying1.f16_npc = 0;
   }
}

static void
pvr_fragment_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
                        const struct rogue_common_build_data *common_data)
{
   struct pvr_fragment_shader_state *fragment_state =
      &gfx_pipeline->shader_state.fragment;

   /* TODO: Hard coding these for now. These should be populated based on the
    * information returned by the compiler.
    */
   fragment_state->stage_state.const_shared_reg_count = 0;
   fragment_state->stage_state.const_shared_reg_offset = 0;
   fragment_state->stage_state.coefficient_size = common_data->coeffs;
   fragment_state->stage_state.uses_atomic_ops = false;
   fragment_state->stage_state.uses_texture_rw = false;
   fragment_state->stage_state.uses_barrier = false;
   fragment_state->stage_state.has_side_effects = false;
   fragment_state->stage_state.empty_program = false;

   fragment_state->pass_type = PVRX(TA_PASSTYPE_OPAQUE);
   fragment_state->entry_offset = 0;

   /* We can't initialize this yet since we still need to generate the PDS
    * programs, so set it to ~0 to make sure it gets set up later on.
    */
   fragment_state->stage_state.pds_temps_count = ~0;
}

static bool pvr_blend_factor_requires_consts(VkBlendFactor factor)
{
   switch (factor) {
   case VK_BLEND_FACTOR_CONSTANT_COLOR:
   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
   case VK_BLEND_FACTOR_CONSTANT_ALPHA:
   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
      return true;

   default:
      return false;
   }
}
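
/* For reference, with the (standard Vulkan) VK_BLEND_OP_ADD blend op the
 * blender computes
 *    color = src_factor * src_color + dst_factor * dst_color
 * so any of the factors above pulls the blendConstants[4] state into the
 * blend equation; that's what the check is detecting.
 */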

/**
 * \brief Indicates whether dynamic blend constants are needed.
 *
 * If the user has specified the blend constants to be dynamic, they might not
 * necessarily be using them. This function checks whether they are actually
 * used, to determine whether we need to upload them later on for the shader
 * to access them.
 */
static bool pvr_graphics_pipeline_requires_dynamic_blend_consts(
   const struct pvr_graphics_pipeline *gfx_pipeline)
{
   const struct vk_dynamic_graphics_state *const state =
      &gfx_pipeline->dynamic_state;

   if (BITSET_TEST(state->set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
      return false;

   for (uint32_t i = 0; i < state->cb.attachment_count; i++) {
      const struct vk_color_blend_attachment_state *attachment =
         &state->cb.attachments[i];

      const bool has_color_write =
         attachment->write_mask &
         (VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
          VK_COLOR_COMPONENT_B_BIT);
      const bool has_alpha_write = attachment->write_mask &
                                   VK_COLOR_COMPONENT_A_BIT;

      if (!attachment->blend_enable || attachment->write_mask == 0)
         continue;

      if (has_color_write) {
         const uint8_t src_color_blend_factor =
            attachment->src_color_blend_factor;
         const uint8_t dst_color_blend_factor =
            attachment->dst_color_blend_factor;

         if (pvr_blend_factor_requires_consts(src_color_blend_factor) ||
             pvr_blend_factor_requires_consts(dst_color_blend_factor)) {
            return true;
         }
      }

      if (has_alpha_write) {
         const uint8_t src_alpha_blend_factor =
            attachment->src_alpha_blend_factor;
         const uint8_t dst_alpha_blend_factor =
            attachment->dst_alpha_blend_factor;

         if (pvr_blend_factor_requires_consts(src_alpha_blend_factor) ||
             pvr_blend_factor_requires_consts(dst_alpha_blend_factor)) {
            return true;
         }
      }
   }

   return false;
}

static uint32_t pvr_graphics_pipeline_alloc_shareds(
   const struct pvr_device *device,
   const struct pvr_graphics_pipeline *gfx_pipeline,
   enum pvr_stage_allocation stage,
   struct pvr_sh_reg_layout *const sh_reg_layout_out)
{
   ASSERTED const uint64_t reserved_shared_size =
      device->pdevice->dev_runtime_info.reserved_shared_size;
   ASSERTED const uint64_t max_coeff =
      device->pdevice->dev_runtime_info.max_coeffs;

   const struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
   struct pvr_sh_reg_layout reg_layout = { 0 };
   uint32_t next_free_sh_reg = 0;

   next_free_sh_reg =
      pvr_pipeline_alloc_shareds(device, layout, stage, &reg_layout);

   reg_layout.blend_consts.present =
      (stage == PVR_STAGE_ALLOCATION_FRAGMENT &&
       pvr_graphics_pipeline_requires_dynamic_blend_consts(gfx_pipeline));
   if (reg_layout.blend_consts.present) {
      reg_layout.blend_consts.offset = next_free_sh_reg;
      next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
   }

   *sh_reg_layout_out = reg_layout;

   /* FIXME: We might need to take more things into consideration.
    * See pvr_calc_fscommon_size_and_tiles_in_flight().
    */
   assert(next_free_sh_reg <= reserved_shared_size - max_coeff);

   return next_free_sh_reg;
}
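
/* The ordering that falls out of the allocators above: the common entries
 * (descriptor set addrs table, then push constants) come first, and the
 * fragment-only blend constants address, when present, is appended last.
 * E.g. (illustrative, assuming PVR_DEV_ADDR_SIZE_IN_SH_REGS == 2) with all
 * three entries present, the blend constants address occupies sh regs [4,5].
 */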

#undef PVR_DEV_ADDR_SIZE_IN_SH_REGS

static void pvr_graphics_pipeline_alloc_vertex_inputs(
   const VkPipelineVertexInputStateCreateInfo *const vs_data,
   rogue_vertex_inputs *const vertex_input_layout_out,
   unsigned *num_vertex_input_regs_out,
   pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
   uint32_t *const dma_count_out)
{
   const VkVertexInputBindingDescription
      *sorted_bindings[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
   const VkVertexInputAttributeDescription
      *sorted_attributes[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };

   rogue_vertex_inputs build_data = {
      .num_input_vars = vs_data->vertexAttributeDescriptionCount,
   };
   uint32_t next_reg_offset = 0;

   struct pvr_pds_vertex_dma *const dma_descriptions =
      *dma_descriptions_out_ptr;
   uint32_t dma_count = 0;

   /* Vertex attributes map to the `layout(location = x)` annotation in the
    * shader where `x` is the attribute's location.
    * Vertex bindings have NO relation to the shader. They have nothing to do
    * with the `layout(set = x, binding = y)` notation. They instead indicate
    * where the data for a collection of vertex attributes comes from. The
    * application binds a VkBuffer with vkCmdBindVertexBuffers() to a specific
    * binding number and based on that we'll know which buffer to DMA the data
    * from, to fill in the collection of vertex attributes.
    */
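
   /* Illustrative example (shader and numbers made up): given
    *
    *    layout(location = 0) in vec4 a_position;
    *    layout(location = 1) in vec2 a_uv;
    *
    * both attributes could reference binding 0 with offsets 0 and 16, fed by
    * a single interleaved VkBuffer bound via vkCmdBindVertexBuffers().
    */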

   for (uint32_t i = 0; i < vs_data->vertexBindingDescriptionCount; i++) {
      const VkVertexInputBindingDescription *binding_desc =
         &vs_data->pVertexBindingDescriptions[i];

      sorted_bindings[binding_desc->binding] = binding_desc;
   }

   for (uint32_t i = 0; i < vs_data->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *attribute_desc =
         &vs_data->pVertexAttributeDescriptions[i];

      sorted_attributes[attribute_desc->location] = attribute_desc;
   }

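   /* The attribute array above was filled in sparsely, indexed by location.
    * Compact the non-NULL entries towards the start so they can be walked
    * contiguously, in increasing location order, below.
    */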
   for (uint32_t i = 0, j = 0; i < ARRAY_SIZE(sorted_attributes); i++) {
      if (sorted_attributes[i])
         sorted_attributes[j++] = sorted_attributes[i];
   }

   for (uint32_t i = 0; i < vs_data->vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription *attribute = sorted_attributes[i];
      const VkVertexInputBindingDescription *binding =
         sorted_bindings[attribute->binding];
      const struct util_format_description *fmt_description =
         vk_format_description(attribute->format);
      struct pvr_pds_vertex_dma *dma_desc = &dma_descriptions[dma_count];
      unsigned vtxin_reg_offset;

      /* Reg allocation. */

      vtxin_reg_offset = next_reg_offset;
      build_data.base[i] = vtxin_reg_offset;

      if (fmt_description->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
          fmt_description->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
          fmt_description->block.bits % 32 != 0 || !fmt_description->is_array) {
         /* For now we only support formats with 32 bit components since we
          * don't need to pack/unpack them.
          */
         /* TODO: Support any other format with VERTEX_BUFFER_BIT set that
          * doesn't have 32 bit components if we're advertising any.
          */
         assert(false);
      }

      /* TODO: Check if this is fine with the compiler. Does it want the
       * number of components, or a size in dwords, to figure out how many
       * vtxin regs are covered? For formats with 32 bit components the
       * distinction doesn't change anything.
       */
      build_data.components[i] =
         util_format_get_nr_components(fmt_description->format);

      next_reg_offset += build_data.components[i];

      /* DMA setup. */

      /* The PDS program sets up DDMADs to DMA attributes into vtxin regs.
       *
       * DDMAD -> Multiply, add, and DOUTD (i.e. DMA from that address).
       *          DMA source addr = src0 * src1 + src2
       *          DMA params = src3
       *
       * In the PDS program we set up src0 with the binding's stride and src1
       * with either the instance id or vertex id (both of which get filled by
       * the hardware). We set up src2 later on once we know which VkBuffer to
       * DMA the data from, so it's saved for later when we patch the data
       * section.
       */
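
      /* Worked example with illustrative numbers: for a per-vertex attribute
       * in a binding with stride 16, at vertex index 3, with patched base
       * address B, the DDMAD computes
       *    addr = 16 * 3 + B = B + 48
       * and the DOUTD DMAs size_in_dwords dwords from addr into the vtxin
       * regs starting at the dma_desc->destination set below.
       */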

      /* TODO: Right now we're setting up a DMA per attribute. In a case where
       * there are multiple attributes packed into a single binding with
       * adjacent locations we'd still be DMAing them separately. This is not
       * great, so the DMA setup should be smarter and could do with some
       * optimization.
       */

      *dma_desc = (struct pvr_pds_vertex_dma){ 0 };

      /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
       * this corresponds to `attribDesc.offset`.
       * The PDS program doesn't do anything with it other than save it in the
       * PDS program entry.
       */
      dma_desc->offset = attribute->offset;

      /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
       * this corresponds to `bindingDesc.stride`.
       * The PDS program will calculate the `effectiveVertexOffset` with this
       * and add it to the address provided in the patched data segment.
       */
      dma_desc->stride = binding->stride;

      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
         dma_desc->flags = PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
      else
         dma_desc->flags = 0;

      /* Size to DMA per vertex attribute. Used to set up src3 in the DDMAD. */
      assert(fmt_description->block.bits != 0); /* Likely an unsupported fmt. */
      dma_desc->size_in_dwords = fmt_description->block.bits / 32;

      /* Vtxin reg offset to start DMAing into. */
      dma_desc->destination = vtxin_reg_offset;

      /* Will be used by the driver to figure out the buffer address to patch
       * into the data section, i.e. which binding we should DMA from.
       */
      dma_desc->binding_index = attribute->binding;

      /* We don't currently support VK_EXT_vertex_attribute_divisor so no
       * repeating of instance-rate vertex attributes is needed. We should
       * always move on to the next vertex attribute.
       */
      dma_desc->divisor = 1;

      /* Will be used to generate PDS code that takes care of robust buffer
       * access, and later on by the driver to write the correct robustness
       * buffer address to DMA the fallback values from.
       */
      dma_desc->robustness_buffer_offset =
         pvr_get_robustness_buffer_format_offset(attribute->format);

      /* Used later on by the driver to figure out if the buffer is being
       * accessed out of bounds, for robust buffer access.
       */
      dma_desc->component_size_in_bytes =
         fmt_description->block.bits / fmt_description->nr_channels / 8;
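
      /* E.g. for VK_FORMAT_R32G32B32A32_SFLOAT: block.bits == 128 and
       * nr_channels == 4, giving size_in_dwords == 4 and
       * component_size_in_bytes == 4.
       */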

      dma_count++;
   }

   *vertex_input_layout_out = build_data;
   *num_vertex_input_regs_out = next_reg_offset;
   *dma_count_out = dma_count;
}

static void pvr_graphics_pipeline_alloc_vertex_special_vars(
   unsigned *num_vertex_input_regs,
   struct pvr_vertex_special_vars *special_vars_layout_out)
{
   unsigned next_free_reg = *num_vertex_input_regs;
   struct pvr_vertex_special_vars layout;

   /* We don't support VK_KHR_shader_draw_parameters or Vulkan 1.1 so no
    * BaseInstance, BaseVertex, DrawIndex.
    */

   /* TODO: The shader might not necessarily be using this so we'd just be
    * wasting regs. Get the info from the compiler about whether or not the
    * shader uses them and allocate them accordingly. For now we'll set them up
    * regardless.
    */

   layout.vertex_id_offset = (int16_t)next_free_reg;
   next_free_reg++;

   layout.instance_id_offset = (int16_t)next_free_reg;
   next_free_reg++;

   *num_vertex_input_regs = next_free_reg;
   *special_vars_layout_out = layout;
}
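
/* Making the layout concrete (illustrative numbers): if the vertex attributes
 * consumed vtxin regs 0..5, the vertex ID is allocated reg 6, the instance ID
 * reg 7, and *num_vertex_input_regs comes back as 8.
 */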

/* Compiles and uploads shaders and PDS programs. */
static VkResult
pvr_graphics_pipeline_compile(struct pvr_device *const device,
                              struct vk_pipeline_cache *cache,
                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
                              const VkAllocationCallbacks *const allocator,
                              struct pvr_graphics_pipeline *const gfx_pipeline)
{
   /* FIXME: Remove this hard coding. */
   struct pvr_explicit_constant_usage vert_explicit_const_usage = {
      .start_offset = 16,
   };
   struct pvr_explicit_constant_usage frag_explicit_const_usage = {
      .start_offset = 0,
   };
   static uint32_t hard_code_pipeline_n = 0;

   struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
   struct pvr_sh_reg_layout *sh_reg_layout_vert =
      &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
   struct pvr_sh_reg_layout *sh_reg_layout_frag =
      &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_FRAGMENT];
   const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
      pCreateInfo->pVertexInputState;
   const uint32_t cache_line_size =
      rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
   struct rogue_compiler *compiler = device->pdevice->compiler;
   struct rogue_build_ctx *ctx;
   VkResult result;

   const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);

   /* Vars needed for the new path. */
   struct pvr_pds_vertex_dma vtx_dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
   uint32_t vtx_dma_count = 0;
   rogue_vertex_inputs *vertex_input_layout;
   unsigned *vertex_input_reg_count;

   /* TODO: The compiler should be making use of this to determine where
    * specific special variables are located in the vtxin reg set.
    */
   struct pvr_vertex_special_vars special_vars_layout = { 0 };

   uint32_t sh_count[PVR_STAGE_ALLOCATION_COUNT] = { 0 };

   /* Setup shared build context. */
   ctx = rogue_build_context_create(compiler, layout);
   if (!ctx)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   vertex_input_layout = &ctx->stage_data.vs.inputs;
   vertex_input_reg_count = &ctx->stage_data.vs.num_vertex_input_regs;

   if (!old_path) {
      pvr_graphics_pipeline_alloc_vertex_inputs(vertex_input_state,
                                                vertex_input_layout,
                                                vertex_input_reg_count,
                                                &vtx_dma_descriptions,
                                                &vtx_dma_count);

      pvr_graphics_pipeline_alloc_vertex_special_vars(vertex_input_reg_count,
                                                      &special_vars_layout);

      for (enum pvr_stage_allocation pvr_stage =
              PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY;
           pvr_stage < PVR_STAGE_ALLOCATION_COMPUTE;
           ++pvr_stage)
         sh_count[pvr_stage] = pvr_pipeline_alloc_shareds(
            device,
            layout,
            pvr_stage,
            &layout->sh_reg_layout_per_stage[pvr_stage]);
   }

   /* NIR middle-end translation. */
   for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
        stage--) {
      const VkPipelineShaderStageCreateInfo *create_info;
      size_t stage_index = gfx_pipeline->stage_indices[stage];

      if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info)) {
         if (pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
             BITFIELD_BIT(stage)) {
            continue;
         }
      }

      /* Skip unused/inactive stages. */
      if (stage_index == ~0)
         continue;

      create_info = &pCreateInfo->pStages[stage_index];

      /* SPIR-V to NIR. */
      ctx->nir[stage] = pvr_spirv_to_nir(ctx, stage, create_info);
      if (!ctx->nir[stage]) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   }

   /* Pre-back-end analysis and optimization, driver data extraction. */
   /* TODO: Analyze and cull unused I/O between stages. */
   /* TODO: Allocate UBOs between stages;
    * pipeline->layout->set_{count,layout}.
    */

   /* Back-end translation. */
   for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
        stage--) {
      if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
          pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
             BITFIELD_BIT(stage)) {
         const struct pvr_device_info *const dev_info =
            &device->pdevice->dev_info;
         struct pvr_explicit_constant_usage *explicit_const_usage;

         switch (stage) {
         case MESA_SHADER_VERTEX:
            explicit_const_usage = &vert_explicit_const_usage;
            break;

         case MESA_SHADER_FRAGMENT:
            explicit_const_usage = &frag_explicit_const_usage;
            break;

         default:
            unreachable("Unsupported stage.");
         }

         pvr_hard_code_graphics_shader(dev_info,
                                       hard_code_pipeline_n,
                                       stage,
                                       &ctx->binary[stage]);

         pvr_hard_code_graphics_get_build_info(dev_info,
                                               hard_code_pipeline_n,
                                               stage,
                                               &ctx->common_data[stage],
                                               &ctx->stage_data,
                                               explicit_const_usage);

         continue;
      }

      if (!ctx->nir[stage])
         continue;

      ctx->rogue[stage] = pvr_nir_to_rogue(ctx, ctx->nir[stage]);
      if (!ctx->rogue[stage]) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      pvr_rogue_to_binary(ctx, ctx->rogue[stage], &ctx->binary[stage]);
      if (!ctx->binary[stage].size) {
         ralloc_free(ctx);
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      }
   }

   if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
       pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
          BITFIELD_BIT(MESA_SHADER_VERTEX)) {
      pvr_hard_code_graphics_vertex_state(&device->pdevice->dev_info,
                                          hard_code_pipeline_n,
                                          &gfx_pipeline->shader_state.vertex);
   } else {
      pvr_vertex_state_init(gfx_pipeline,
                            &ctx->common_data[MESA_SHADER_VERTEX],
                            *vertex_input_reg_count,
                            &ctx->stage_data.vs);

      if (!old_path) {
         struct pvr_vertex_shader_state *vertex_state =
            &gfx_pipeline->shader_state.vertex;

         /* FIXME: For now we just overwrite it but the compiler shouldn't be
          * returning the sh count since the driver is in charge of allocating
          * them.
          */
         vertex_state->stage_state.const_shared_reg_count =
            sh_count[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];

         gfx_pipeline->shader_state.vertex.vertex_input_size =
            ctx->stage_data.vs.num_vertex_input_regs;
      }
   }

   result =
      pvr_gpu_upload_usc(device,
                         util_dynarray_begin(&ctx->binary[MESA_SHADER_VERTEX]),
                         ctx->binary[MESA_SHADER_VERTEX].size,
                         cache_line_size,
                         &gfx_pipeline->shader_state.vertex.bo);
   if (result != VK_SUCCESS)
      goto err_free_build_context;

   if (ctx->nir[MESA_SHADER_FRAGMENT]) {
      struct pvr_fragment_shader_state *fragment_state =
         &gfx_pipeline->shader_state.fragment;

      if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
          pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
             BITFIELD_BIT(MESA_SHADER_FRAGMENT)) {
         pvr_hard_code_graphics_fragment_state(
            &device->pdevice->dev_info,
            hard_code_pipeline_n,
            &gfx_pipeline->shader_state.fragment);
      } else {
         pvr_fragment_state_init(gfx_pipeline,
                                 &ctx->common_data[MESA_SHADER_FRAGMENT]);

         if (!old_path) {
            /* FIXME: For now we just overwrite it but the compiler shouldn't
             * be returning the sh count since the driver is in charge of
             * allocating them.
             */
            fragment_state->stage_state.const_shared_reg_count =
               sh_count[PVR_STAGE_ALLOCATION_FRAGMENT];
         }
      }

      result = pvr_gpu_upload_usc(
         device,
         util_dynarray_begin(&ctx->binary[MESA_SHADER_FRAGMENT]),
         ctx->binary[MESA_SHADER_FRAGMENT].size,
         cache_line_size,
         &gfx_pipeline->shader_state.fragment.bo);
      if (result != VK_SUCCESS)
         goto err_free_vertex_bo;

      /* TODO: powervr has an optimization where it attempts to recompile
       * shaders. See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented
       * since in our case the optimization doesn't happen.
       */

      result = pvr_pds_coeff_program_create_and_upload(
         device,
         allocator,
         ctx->stage_data.fs.iterator_args.fpu_iterators,
         ctx->stage_data.fs.iterator_args.num_fpu_iterators,
         ctx->stage_data.fs.iterator_args.destination,
         &fragment_state->pds_coeff_program,
         &fragment_state->stage_state.pds_temps_count);
      if (result != VK_SUCCESS)
         goto err_free_fragment_bo;

      result = pvr_pds_fragment_program_create_and_upload(
         device,
         allocator,
         gfx_pipeline->shader_state.fragment.bo,
         ctx->common_data[MESA_SHADER_FRAGMENT].temps,
         ctx->stage_data.fs.msaa_mode,
         ctx->stage_data.fs.phas,
         &fragment_state->pds_fragment_program);
      if (result != VK_SUCCESS)
         goto err_free_coeff_program;

      /* FIXME: For now we pass in the same explicit_const_usage since it
       * contains all invalid entries. Fix this by hooking it up to the
       * compiler.
       */
      result = pvr_pds_descriptor_program_create_and_upload(
         device,
         allocator,
         &ctx->common_data[MESA_SHADER_FRAGMENT].compile_time_consts_data,
         &ctx->common_data[MESA_SHADER_FRAGMENT].ubo_data,
         &frag_explicit_const_usage,
         layout,
         PVR_STAGE_ALLOCATION_FRAGMENT,
         sh_reg_layout_frag,
         &fragment_state->descriptor_state);
      if (result != VK_SUCCESS)
         goto err_free_frag_program;

      /* The descriptor PDS program is expected not to use any temps; if that
       * ever changes, we need to MAX2() its temp count into
       * `fragment_state->stage_state.pds_temps_count` instead.
       */
      assert(fragment_state->descriptor_state.pds_info.temps_required == 0);
   }

   result = pvr_pds_vertex_attrib_programs_create_and_upload(
      device,
      allocator,
      vertex_input_state,
      ctx->common_data[MESA_SHADER_VERTEX].temps,
      &ctx->stage_data.vs,
      vtx_dma_descriptions,
      vtx_dma_count,
      &special_vars_layout,
      &gfx_pipeline->shader_state.vertex.pds_attrib_programs);
   if (result != VK_SUCCESS)
      goto err_free_frag_descriptor_program;

   result = pvr_pds_descriptor_program_create_and_upload(
      device,
      allocator,
      &ctx->common_data[MESA_SHADER_VERTEX].compile_time_consts_data,
      &ctx->common_data[MESA_SHADER_VERTEX].ubo_data,
      &vert_explicit_const_usage,
      layout,
      PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
      sh_reg_layout_vert,
      &gfx_pipeline->shader_state.vertex.descriptor_state);
   if (result != VK_SUCCESS)
      goto err_free_vertex_attrib_program;

   /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
    * scratch buffer for both the vertex and fragment stages.
    * Figure out the best place to do this.
    */
   /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
   /* TODO: Implement spilling with the above. */

   ralloc_free(ctx);

   hard_code_pipeline_n++;

   return VK_SUCCESS;

err_free_vertex_attrib_program:
   for (uint32_t i = 0;
        i < ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);
        i++) {
      struct pvr_pds_attrib_program *const attrib_program =
         &gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];

      pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
   }
err_free_frag_descriptor_program:
   pvr_pds_descriptor_program_destroy(
      device,
      allocator,
      &gfx_pipeline->shader_state.fragment.descriptor_state);
err_free_frag_program:
   pvr_bo_suballoc_free(
      gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
err_free_coeff_program:
   pvr_bo_suballoc_free(
      gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);
err_free_fragment_bo:
   pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
err_free_vertex_bo:
   pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);
err_free_build_context:
   ralloc_free(ctx);
   return result;
}

static struct vk_render_pass_state
pvr_create_renderpass_state(const VkGraphicsPipelineCreateInfo *const info)
{
   PVR_FROM_HANDLE(pvr_render_pass, pass, info->renderPass);
   const struct pvr_render_subpass *const subpass =
      &pass->subpasses[info->subpass];

   enum vk_rp_attachment_flags attachments = 0;

   assert(info->subpass < pass->subpass_count);

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      if (pass->attachments[subpass->color_attachments[i]].aspects)
         attachments |= MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << i;
   }

   if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
      VkImageAspectFlags ds_aspects =
         pass->attachments[subpass->depth_stencil_attachment].aspects;
      if (ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
         attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
      if (ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
         attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
   }

   return (struct vk_render_pass_state){
      .attachments = attachments,

      /* TODO: This is only needed for VK_KHR_create_renderpass2 (or core 1.2),
       * which is not currently supported.
       */
      .view_mask = 0,
   };
}

static VkResult
pvr_graphics_pipeline_init(struct pvr_device *device,
                           struct vk_pipeline_cache *cache,
                           const VkGraphicsPipelineCreateInfo *pCreateInfo,
                           const VkAllocationCallbacks *allocator,
                           struct pvr_graphics_pipeline *gfx_pipeline)
{
   struct vk_dynamic_graphics_state *const dynamic_state =
      &gfx_pipeline->dynamic_state;
   const struct vk_render_pass_state rp_state =
      pvr_create_renderpass_state(pCreateInfo);

   struct vk_graphics_pipeline_all_state all_state;
   struct vk_graphics_pipeline_state state = { 0 };

   VkResult result;

   pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);

   result = vk_graphics_pipeline_state_fill(&device->vk,
                                            &state,
                                            pCreateInfo,
                                            &rp_state,
                                            0,
                                            &all_state,
                                            NULL,
                                            0,
                                            NULL);
   if (result != VK_SUCCESS)
      goto err_pipeline_finish;

   vk_dynamic_graphics_state_init(dynamic_state);

   /* Load static state into base dynamic state holder. */
   vk_dynamic_graphics_state_fill(dynamic_state, &state);

   /* The value of ms.rasterization_samples is undefined when
    * rasterizer_discard_enable is set, but we need a specific value.
    * Fill that in here.
    */
   if (state.rs->rasterizer_discard_enable)
      dynamic_state->ms.rasterization_samples = VK_SAMPLE_COUNT_1_BIT;

   memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));

   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
      VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
      gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
      /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
       *
       *    "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
       *    or VK_SHADER_STAGE_ALL."
       *
       * So we don't handle that.
       *
       * We also don't handle the VK_SHADER_STAGE_TESSELLATION_* and
       * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
       * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
       * structure returned by the driver.
       */
      switch (pCreateInfo->pStages[i].stage) {
      case VK_SHADER_STAGE_VERTEX_BIT:
      case VK_SHADER_STAGE_FRAGMENT_BIT:
         gfx_pipeline->stage_indices[gl_stage] = i;
         break;
      default:
         unreachable("Unsupported stage.");
      }
   }

   gfx_pipeline->base.layout =
      pvr_pipeline_layout_from_handle(pCreateInfo->layout);

   /* Compiles and uploads shaders and PDS programs. */
   result = pvr_graphics_pipeline_compile(device,
                                          cache,
                                          pCreateInfo,
                                          allocator,
                                          gfx_pipeline);
   if (result != VK_SUCCESS)
      goto err_pipeline_finish;

   return VK_SUCCESS;

err_pipeline_finish:
   pvr_pipeline_finish(&gfx_pipeline->base);

   return result;
}

/* If allocator == NULL, the internal one will be used. */
static VkResult
pvr_graphics_pipeline_create(struct pvr_device *device,
                             struct vk_pipeline_cache *cache,
                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
                             const VkAllocationCallbacks *allocator,
                             VkPipeline *const pipeline_out)
{
   struct pvr_graphics_pipeline *gfx_pipeline;
   VkResult result;

   gfx_pipeline = vk_zalloc2(&device->vk.alloc,
                             allocator,
                             sizeof(*gfx_pipeline),
                             8,
                             VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!gfx_pipeline)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* Compiles and uploads shaders and PDS programs too. */
   result = pvr_graphics_pipeline_init(device,
                                       cache,
                                       pCreateInfo,
                                       allocator,
                                       gfx_pipeline);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
      return result;
   }

   *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);

   return VK_SUCCESS;
}

VkResult
pvr_CreateGraphicsPipelines(VkDevice _device,
                            VkPipelineCache pipelineCache,
                            uint32_t createInfoCount,
                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
                            const VkAllocationCallbacks *pAllocator,
                            VkPipeline *pPipelines)
{
   VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
   PVR_FROM_HANDLE(pvr_device, device, _device);
   VkResult result = VK_SUCCESS;

   for (uint32_t i = 0; i < createInfoCount; i++) {
      const VkResult local_result =
         pvr_graphics_pipeline_create(device,
                                      cache,
                                      &pCreateInfos[i],
                                      pAllocator,
                                      &pPipelines[i]);
      if (local_result != VK_SUCCESS) {
         result = local_result;
         pPipelines[i] = VK_NULL_HANDLE;
      }
   }

   return result;
}

/*****************************************************************************
   Other functions
*****************************************************************************/

void pvr_DestroyPipeline(VkDevice _device,
                         VkPipeline _pipeline,
                         const VkAllocationCallbacks *pAllocator)
{
   PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
   PVR_FROM_HANDLE(pvr_device, device, _device);

   if (!pipeline)
      return;

   switch (pipeline->type) {
   case PVR_PIPELINE_TYPE_GRAPHICS: {
      struct pvr_graphics_pipeline *const gfx_pipeline =
         to_pvr_graphics_pipeline(pipeline);

      pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
      break;
   }

   case PVR_PIPELINE_TYPE_COMPUTE: {
      struct pvr_compute_pipeline *const compute_pipeline =
         to_pvr_compute_pipeline(pipeline);

      pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
      break;
   }

   default:
      unreachable("Unknown pipeline type.");
   }
}
2469